2018.12.2 First commit.

.gitignore (vendored, new file, +6)
@@ -0,0 +1,6 @@
rule_extraction 20181014.py
__pycache__
.ipynb_checkpoints
.gitignore.bak
history
README_bk.md

1_Demo_Data_Explore.ipynb (new file, +656)
2.1_Demo_Missing_Data.ipynb (new file, +1109)
2.2_Demo_Outlier.ipynb (new file, +1582)

2.3_Demo_Rare_Values.ipynb (new file, +271)
@@ -0,0 +1,271 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"from feature_cleaning import rare_values as ra"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Variable Pclass label proportion:\n",
|
||||
"3 0.551066\n",
|
||||
"1 0.242424\n",
|
||||
"2 0.206510\n",
|
||||
"Name: Pclass, dtype: float64\n",
|
||||
"Variable SibSp label proportion:\n",
|
||||
"0 0.682379\n",
|
||||
"1 0.234568\n",
|
||||
"2 0.031425\n",
|
||||
"4 0.020202\n",
|
||||
"3 0.017957\n",
|
||||
"8 0.007856\n",
|
||||
"5 0.005612\n",
|
||||
"Name: SibSp, dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"use_cols = [\n",
|
||||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||||
" 'Survived'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# see column Pclass & SibSp's distributions\n",
|
||||
"# SibSp has values 3/8/5 that occur rarely, under 2%\n",
|
||||
"# Pclass has 3 values, but no one is under 20%\n",
|
||||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
|
||||
"for i in ['Pclass','SibSp']:\n",
|
||||
" print('Variable',i,'label proportion:')\n",
|
||||
" print(data[i].value_counts()/len(data))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Grouping into one new category\n",
|
||||
"Grouping the observations that show rare labels into a unique category ('rare')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create the encoder and fit with our data\n",
|
||||
"enc = ra.GroupingRareValues(cols=['Pclass','SibSp'],threshold=0.01).fit(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[{'col': 'Pclass', 'mapping': 3 3\n",
|
||||
"1 1\n",
|
||||
"2 2\n",
|
||||
"dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0 0\n",
|
||||
"1 1\n",
|
||||
"2 2\n",
|
||||
"4 4\n",
|
||||
"3 3\n",
|
||||
"8 rare\n",
|
||||
"5 rare\n",
|
||||
"dtype: object, 'data_type': dtype('int64')}]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# let's see the mapping\n",
|
||||
"# for SibSp, values 5 & 8 are encoded as 'rare' as they appear less than 10%\n",
|
||||
"# for Pclass, nothing changed\n",
|
||||
"print(enc.mapping)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# perform transformation\n",
|
||||
"data2 = enc.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 608\n",
|
||||
"1 209\n",
|
||||
"2 28\n",
|
||||
"4 18\n",
|
||||
"3 16\n",
|
||||
"rare 12\n",
|
||||
"Name: SibSp, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the result\n",
|
||||
"print(data2.SibSp.value_counts())"
|
||||
]
|
||||
},
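  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`GroupingRareValues` lives in this repo's `feature_cleaning/rare_values.py`, which is not shown in this commit. The next cell is a rough pandas-only sketch of the same idea, added for illustration; the helper name and details are assumptions, not the module's actual code.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch (assumption): group labels whose frequency is below a threshold into 'rare'\n",
    "def group_rare_labels(df, cols, threshold=0.01):\n",
    "    df = df.copy()\n",
    "    for col in cols:\n",
    "        freq = df[col].value_counts(normalize=True)\n",
    "        rare_labels = freq[freq < threshold].index\n",
    "        df[col] = df[col].where(~df[col].isin(rare_labels), 'rare')\n",
    "    return df\n",
    "\n",
    "# data2_sketch = group_rare_labels(data, cols=['Pclass','SibSp'], threshold=0.01)\n",
    "# data2_sketch.SibSp.value_counts()"
   ]
  },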
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Mode Imputation\n",
|
||||
"Replacing the rare label by most frequent label"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create the encoder and fit with our data\n",
|
||||
"enc = ra.ModeImputation(cols=['Pclass','SibSp'],threshold=0.01).fit(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[{'col': 'Pclass', 'mapping': 3 3\n",
|
||||
"1 1\n",
|
||||
"2 2\n",
|
||||
"dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0 0\n",
|
||||
"1 1\n",
|
||||
"2 2\n",
|
||||
"4 4\n",
|
||||
"3 3\n",
|
||||
"8 0\n",
|
||||
"5 0\n",
|
||||
"dtype: int64, 'data_type': dtype('int64')}]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# let's see the mapping\n",
|
||||
"# for SibSp, values 5 & 8 are encoded as 0, as label 0 is the most frequent label\n",
|
||||
"# for Pclass, nothing changed\n",
|
||||
"print(enc.mapping)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# perform transformation\n",
|
||||
"data3 = enc.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 620\n",
|
||||
"1 209\n",
|
||||
"2 28\n",
|
||||
"4 18\n",
|
||||
"3 16\n",
|
||||
"Name: SibSp, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the result\n",
|
||||
"print(data3.SibSp.value_counts())"
|
||||
]
|
||||
},
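  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`ModeImputation` is part of the same local module. A pandas-only sketch of the assumed behaviour, for illustration only (not the module's code):\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch (assumption): replace rare labels with the most frequent label of each column\n",
    "def impute_rare_with_mode(df, cols, threshold=0.01):\n",
    "    df = df.copy()\n",
    "    for col in cols:\n",
    "        freq = df[col].value_counts(normalize=True)\n",
    "        rare_labels = freq[freq < threshold].index\n",
    "        df[col] = df[col].where(~df[col].isin(rare_labels), df[col].mode()[0])\n",
    "    return df\n",
    "\n",
    "# data3_sketch = impute_rare_with_mode(data, cols=['Pclass','SibSp'], threshold=0.01)"
   ]
  },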
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}

3.1_Demo_Feature_Scaling.ipynb (new file, +326)
@@ -0,0 +1,326 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"#from feature_cleaning import rare_values as ra"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"use_cols = [\n",
|
||||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||||
" 'Survived'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 male 22.0 1 7.2500\n",
|
||||
"1 1 1 female 38.0 1 71.2833\n",
|
||||
"2 1 3 female 26.0 0 7.9250"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((623, 6), (268, 6))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Note that we include target variable in the X_train \n",
|
||||
"# because we need it to supervise our discretization\n",
|
||||
"# this is not the standard way of using train-test-split\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Normalization - Standardization (Z-score scaling)\n",
|
||||
"\n",
|
||||
"removes the mean and scales the data to unit variance.<br />z = (X - X.mean) / std"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_zscore\n",
|
||||
"857 1 1 male 51.0 0 26.5500 -0.122530\n",
|
||||
"52 1 1 female 49.0 1 76.7292 0.918124\n",
|
||||
"386 0 3 male 1.0 5 46.9000 0.299503\n",
|
||||
"124 0 1 male 54.0 0 77.2875 0.929702\n",
|
||||
"578 0 3 female NaN 1 14.4583 -0.373297\n",
|
||||
"549 1 2 male 8.0 1 36.7500 0.089005\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new created feature\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"ss = StandardScaler().fit(X_train[['Fare']])\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_zscore'] = ss.transform(X_train_copy[['Fare']])\n",
|
||||
"print(X_train_copy.head(6))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"5.916437306188636e-17\n",
|
||||
"1.0008035356861\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check if it is with mean=0 std=1\n",
|
||||
"print(X_train_copy['Fare_zscore'].mean())\n",
|
||||
"print(X_train_copy['Fare_zscore'].std())\n"
|
||||
]
|
||||
},
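  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check of the formula z = (X - X.mean) / std, the same values can be computed by hand with pandas (StandardScaler uses the population standard deviation, i.e. ddof=0):\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# compute the z-score manually and compare with the StandardScaler output\n",
    "manual_z = (X_train['Fare'] - X_train['Fare'].mean()) / X_train['Fare'].std(ddof=0)\n",
    "(manual_z - X_train_copy['Fare_zscore']).abs().max()"
   ]
  },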
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Min-Max scaling\n",
|
||||
"transforms features by scaling each feature to a given range. Default to [0,1].<br />X_scaled = (X - X.min / (X.max - X.min)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_minmax\n",
|
||||
"857 1 1 male 51.0 0 26.5500 0.051822\n",
|
||||
"52 1 1 female 49.0 1 76.7292 0.149765\n",
|
||||
"386 0 3 male 1.0 5 46.9000 0.091543\n",
|
||||
"124 0 1 male 54.0 0 77.2875 0.150855\n",
|
||||
"578 0 3 female NaN 1 14.4583 0.028221\n",
|
||||
"549 1 2 male 8.0 1 36.7500 0.071731\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new created feature\n",
|
||||
"from sklearn.preprocessing import MinMaxScaler\n",
|
||||
"mms = MinMaxScaler().fit(X_train[['Fare']])\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_minmax'] = mms.transform(X_train_copy[['Fare']])\n",
|
||||
"print(X_train_copy.head(6))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1.0\n",
|
||||
"0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the range of Fare_minmax\n",
|
||||
"print(X_train_copy['Fare_minmax'].max())\n",
|
||||
"print(X_train_copy['Fare_minmax'].min())"
|
||||
]
|
||||
},
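  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Likewise, the min-max formula X_scaled = (X - X.min) / (X.max - X.min) can be written out directly as a cross-check of MinMaxScaler:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# compute the min-max scaling manually and compare with the MinMaxScaler output\n",
    "fare_min, fare_max = X_train['Fare'].min(), X_train['Fare'].max()\n",
    "manual_minmax = (X_train['Fare'] - fare_min) / (fare_max - fare_min)\n",
    "(manual_minmax - X_train_copy['Fare_minmax']).abs().max()"
   ]
  },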
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"source": [
|
||||
"## Robust scaling\n",
|
||||
"removes the median and scales the data according to the quantile range (defaults to IQR)<br />X_scaled = (X - X.median) / IQR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_robust\n",
|
||||
"857 1 1 male 51.0 0 26.5500 0.492275\n",
|
||||
"52 1 1 female 49.0 1 76.7292 2.630973\n",
|
||||
"386 0 3 male 1.0 5 46.9000 1.359616\n",
|
||||
"124 0 1 male 54.0 0 77.2875 2.654768\n",
|
||||
"578 0 3 female NaN 1 14.4583 -0.023088\n",
|
||||
"549 1 2 male 8.0 1 36.7500 0.927011\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new created feature\n",
|
||||
"from sklearn.preprocessing import RobustScaler\n",
|
||||
"rs = RobustScaler().fit(X_train[['Fare']])\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_robust'] = rs.transform(X_train_copy[['Fare']])\n",
|
||||
"print(X_train_copy.head(6))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}

3.2_Demo_Discretisation.ipynb (new file, +865)
@@ -0,0 +1,865 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from feature_engineering import discretization as dc\n",
|
||||
"\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"#from feature_cleaning import rare_values as ra"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"use_cols = [\n",
|
||||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||||
" 'Survived'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 male 22.0 1 7.2500\n",
|
||||
"1 1 1 female 38.0 1 71.2833\n",
|
||||
"2 1 3 female 26.0 0 7.9250"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((623, 6), (268, 6))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Note that we include target variable in the X_train \n",
|
||||
"# because we need it to supervise our discretization\n",
|
||||
"# this is not the standard way of using train-test-split\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Equal width binning\n",
|
||||
"divides the scope of possible values into N bins of the same width"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.preprocessing import KBinsDiscretizer\n",
|
||||
"enc_equal_width = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='uniform').fit(X_train[['Fare']])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([array([ 0. , 170.7764, 341.5528, 512.3292])], dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# equal width for every bins\n",
|
||||
"enc_equal_width.bin_edges_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.0 610\n",
|
||||
"1.0 11\n",
|
||||
"2.0 2\n",
|
||||
"Name: 0, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result = enc_equal_width.transform(X_train[['Fare']])\n",
|
||||
"pd.DataFrame(result)[0].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_equal_width\n",
|
||||
"857 1 1 male 51.0 0 26.5500 0.0\n",
|
||||
"52 1 1 female 49.0 1 76.7292 0.0\n",
|
||||
"386 0 3 male 1.0 5 46.9000 0.0\n",
|
||||
"124 0 1 male 54.0 0 77.2875 0.0\n",
|
||||
"578 0 3 female NaN 1 14.4583 0.0\n",
|
||||
"549 1 2 male 8.0 1 36.7500 0.0\n",
|
||||
"118 0 1 male 24.0 0 247.5208 1.0\n",
|
||||
"12 0 3 male 20.0 0 8.0500 0.0\n",
|
||||
"157 0 3 male 30.0 0 8.0500 0.0\n",
|
||||
"127 1 3 male 24.0 0 7.1417 0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new discretized variable\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_equal_width'] = enc_equal_width.transform(X_train[['Fare']])\n",
|
||||
"print(X_train_copy.head(10))"
|
||||
]
|
||||
},
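  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For comparison only: `pd.cut` builds the same kind of equal-width bins directly from the min and max of the column.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# equal-width binning with plain pandas; the bin ids should line up with the KBinsDiscretizer result\n",
    "fare_equal_width_pdcut = pd.cut(X_train['Fare'], bins=3, labels=False)\n",
    "fare_equal_width_pdcut.value_counts()"
   ]
  },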
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Equal frequency binning\n",
|
||||
"divides the scope of possible values of the variable into N bins, \n",
|
||||
"where each bin carries the same amount of observations"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"enc_equal_freq = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='quantile').fit(X_train[['Fare']])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([array([ 0. , 8.69303333, 26.2875 , 512.3292 ])],\n",
|
||||
" dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the bin edges\n",
|
||||
"enc_equal_freq.bin_edges_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"2.0 209\n",
|
||||
"0.0 208\n",
|
||||
"1.0 206\n",
|
||||
"Name: 0, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# equal number of case for every bins\n",
|
||||
"result = enc_equal_freq.transform(X_train[['Fare']])\n",
|
||||
"pd.DataFrame(result)[0].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_equal_freq\n",
|
||||
"857 1 1 male 51.0 0 26.5500 2.0\n",
|
||||
"52 1 1 female 49.0 1 76.7292 2.0\n",
|
||||
"386 0 3 male 1.0 5 46.9000 2.0\n",
|
||||
"124 0 1 male 54.0 0 77.2875 2.0\n",
|
||||
"578 0 3 female NaN 1 14.4583 1.0\n",
|
||||
"549 1 2 male 8.0 1 36.7500 2.0\n",
|
||||
"118 0 1 male 24.0 0 247.5208 2.0\n",
|
||||
"12 0 3 male 20.0 0 8.0500 0.0\n",
|
||||
"157 0 3 male 30.0 0 8.0500 0.0\n",
|
||||
"127 1 3 male 24.0 0 7.1417 0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new discretized variable\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_equal_freq'] = enc_equal_freq.transform(X_train[['Fare']])\n",
|
||||
"print(X_train_copy.head(10))"
|
||||
]
|
||||
},
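  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For comparison only: `pd.qcut` is the plain-pandas way to get quantile (equal-frequency) bins.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# quantile binning with plain pandas; each bin holds roughly the same number of rows\n",
    "fare_equal_freq_qcut = pd.qcut(X_train['Fare'], q=3, labels=False)\n",
    "fare_equal_freq_qcut.value_counts()"
   ]
  },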
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## K-means binning\n",
|
||||
"using k-means to partition values into clusters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"enc_kmeans = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='kmeans').fit(X_train[['Fare']])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([array([ 0. , 93.5271531 , 338.08506324, 512.3292 ])],\n",
|
||||
" dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the bin edges\n",
|
||||
"enc_kmeans.bin_edges_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.0 587\n",
|
||||
"1.0 34\n",
|
||||
"2.0 2\n",
|
||||
"Name: 0, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result = enc_kmeans.transform(X_train[['Fare']])\n",
|
||||
"pd.DataFrame(result)[0].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_kmeans\n",
|
||||
"857 1 1 male 51.0 0 26.5500 0.0\n",
|
||||
"52 1 1 female 49.0 1 76.7292 0.0\n",
|
||||
"386 0 3 male 1.0 5 46.9000 0.0\n",
|
||||
"124 0 1 male 54.0 0 77.2875 0.0\n",
|
||||
"578 0 3 female NaN 1 14.4583 0.0\n",
|
||||
"549 1 2 male 8.0 1 36.7500 0.0\n",
|
||||
"118 0 1 male 24.0 0 247.5208 1.0\n",
|
||||
"12 0 3 male 20.0 0 8.0500 0.0\n",
|
||||
"157 0 3 male 30.0 0 8.0500 0.0\n",
|
||||
"127 1 3 male 24.0 0 7.1417 0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new discretized variable\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_kmeans'] = enc_kmeans.transform(X_train[['Fare']])\n",
|
||||
"print(X_train_copy.head(10))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Discretisation with Decision Tree\n",
|
||||
"using a decision tree to identify the optimal splitting points that would determine the bins"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"enc1 = dc.DiscretizeByDecisionTree(col='Fare',max_depth=2).fit(X=X_train,y=y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n",
|
||||
" max_features=None, max_leaf_nodes=None,\n",
|
||||
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
|
||||
" min_samples_leaf=1, min_samples_split=2,\n",
|
||||
" min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n",
|
||||
" splitter='best')"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"enc1.tree_model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data1 = enc1.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_tree_discret\n",
|
||||
"0 0 3 male 22.0 1 7.2500 0.107143\n",
|
||||
"1 1 1 female 38.0 1 71.2833 0.442308\n",
|
||||
"2 1 3 female 26.0 0 7.9250 0.255319\n",
|
||||
"3 1 1 female 35.0 1 53.1000 0.442308\n",
|
||||
"4 0 3 male 35.0 0 8.0500 0.255319\n",
|
||||
"[0.10714286 0.44230769 0.25531915 0.74626866]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# see how the new column Fare_tree_discret is distributed\n",
|
||||
"# the values are corresponding to the proba of the prediction by the tree\n",
|
||||
"print(data1.head(5))\n",
|
||||
"\n",
|
||||
"# the unique value of the discretisized column\n",
|
||||
"print(data1.Fare_tree_discret.unique())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Fare Fare\n",
|
||||
"Fare_tree_discret \n",
|
||||
"0.107143 0.0000 7.5208\n",
|
||||
"0.255319 7.5500 10.5167\n",
|
||||
"0.442308 11.1333 73.5000\n",
|
||||
"0.746269 75.2500 512.3292\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# see how the bins are cut\n",
|
||||
"# because we use a tree with max-depth of 2, we have at most 2*2=4 bins generated by the tree\n",
|
||||
"col='Fare'\n",
|
||||
"bins = pd.concat([data1.groupby([col+'_tree_discret'])[col].min(),\n",
|
||||
" data1.groupby([col+'_tree_discret'])[col].max()], axis=1)\n",
|
||||
"print(bins)\n",
|
||||
"\n",
|
||||
"# all values between 0 to 7.5208 in the original variable 'Fare' \n",
|
||||
"# are given new value 0.107143 in the new column 'Fare_tree_discret'\n",
|
||||
"# and so on"
|
||||
]
|
||||
},
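  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`DiscretizeByDecisionTree` comes from this repo's `feature_engineering/discretization.py`, which is not shown in this commit. A minimal sketch of the underlying idea (an assumption about its behaviour, not the module's code): fit a shallow tree on the single feature and use the predicted probability of each leaf as the discretised value.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch (assumption): tree-based discretisation of a single numeric column\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "def tree_discretise(train_col, train_target, apply_col, max_depth=2):\n",
    "    tree = DecisionTreeClassifier(max_depth=max_depth)\n",
    "    tree.fit(train_col.to_frame(), train_target)\n",
    "    # every leaf maps to one probability value, so the probabilities act as bin labels\n",
    "    return tree.predict_proba(apply_col.to_frame())[:, 1]\n",
    "\n",
    "# data1_sketch = data.copy()\n",
    "# data1_sketch['Fare_tree_discret'] = tree_discretise(X_train['Fare'], y_train, data['Fare'])"
   ]
  },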
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Discretisation with Decision Tree with optimal depth search"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"result ROC-AUC for each depth\n",
|
||||
" depth roc_auc_mean roc_auc_std\n",
|
||||
"0 2 0.662132 0.026253\n",
|
||||
"1 3 0.647950 0.045010\n",
|
||||
"2 4 0.650984 0.035127\n",
|
||||
"3 5 0.651180 0.027663\n",
|
||||
"4 6 0.653961 0.037421\n",
|
||||
"5 7 0.643688 0.033513\n",
|
||||
"optimal_depth: [2]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# search for the best depth from range 2-7\n",
|
||||
"# we see when depth=2 we get the best roc-auc mean\n",
|
||||
"enc2 = dc.DiscretizeByDecisionTree(col='Fare',max_depth=[2,3,4,5,6,7]).fit(X=X_train,y=y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"DecisionTreeClassifier(class_weight=None, criterion='gini',\n",
|
||||
" max_depth=array([2], dtype=int64), max_features=None,\n",
|
||||
" max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
|
||||
" min_impurity_split=None, min_samples_leaf=1,\n",
|
||||
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
|
||||
" presort=False, random_state=None, splitter='best')"
|
||||
]
|
||||
},
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# using optimal depth=2 we train the model, same result as last one\n",
|
||||
"enc2.tree_model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" <th>Fare_tree_discret</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" <td>0.107143</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" <td>0.442308</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" <td>0.255319</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>53.1000</td>\n",
|
||||
" <td>0.442308</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>8.0500</td>\n",
|
||||
" <td>0.255319</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_tree_discret\n",
|
||||
"0 0 3 male 22.0 1 7.2500 0.107143\n",
|
||||
"1 1 1 female 38.0 1 71.2833 0.442308\n",
|
||||
"2 1 3 female 26.0 0 7.9250 0.255319\n",
|
||||
"3 1 1 female 35.0 1 53.1000 0.442308\n",
|
||||
"4 0 3 male 35.0 0 8.0500 0.255319"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data2 = enc2.transform(data)\n",
|
||||
"data2.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"source": [
|
||||
"## Discretisation with ChiMerge\n",
|
||||
"supervised hierarchical bottom-up (merge) method that locally exploits the chi-square criterion to decide whether two adjacent intervals are similar enough to be merged"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Interval for variable Fare\n",
|
||||
" variable interval flag_0 flag_1\n",
|
||||
"0 Fare -inf,7.875 94.0 28.0\n",
|
||||
"1 Fare 7.875,7.8792 0.0 3.0\n",
|
||||
"2 Fare 7.8792,7.8958 25.0 1.0\n",
|
||||
"3 Fare 7.8958,73.5 245.0 160.0\n",
|
||||
"4 Fare 73.5+ 17.0 50.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"enc3 = dc.ChiMerge(col='Fare',num_of_bins=5).fit(X=X_train,y='Survived')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[-0.1, 7.875, 7.8792, 7.8958, 73.5, 512.3292]"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# the bins boundary created by ChiMerge\n",
|
||||
"\n",
|
||||
"enc3.bins"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data3 = enc3.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_chimerge\n",
|
||||
"0 0 3 male 22.0 1 7.2500 (-0.101, 7.875]\n",
|
||||
"1 1 1 female 38.0 1 71.2833 (7.896, 73.5]\n",
|
||||
"2 1 3 female 26.0 0 7.9250 (7.896, 73.5]\n",
|
||||
"3 1 1 female 35.0 1 53.1000 (7.896, 73.5]\n",
|
||||
"4 0 3 male 35.0 0 8.0500 (7.896, 73.5]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(data3.head(5))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(-0.101, 7.875], (7.896, 73.5], (73.5, 512.329], (7.875, 7.879], (7.879, 7.896]]\n",
|
||||
"Categories (5, interval[float64]): [(-0.101, 7.875] < (7.875, 7.879] < (7.879, 7.896] < (7.896, 73.5] < (73.5, 512.329]]"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# all values are grouped into 5 intervals\n",
|
||||
"data3.Fare_chimerge.unique()"
|
||||
]
|
||||
},
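  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A compact sketch of the ChiMerge loop described above, added for illustration: start with one interval per distinct value and repeatedly merge the pair of adjacent intervals with the lowest chi-square statistic. The repo's `dc.ChiMerge` may differ in detail (for example it could stop on a chi-square threshold rather than a fixed number of bins).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch (assumption): bottom-up ChiMerge with a fixed target number of bins\n",
    "def chi2_stat(table):\n",
    "    # chi-square statistic of an observed-count table; empty expected cells contribute 0\n",
    "    table = np.asarray(table, dtype=float)\n",
    "    expected = table.sum(axis=1, keepdims=True) * table.sum(axis=0, keepdims=True) / table.sum()\n",
    "    return np.where(expected > 0, (table - expected) ** 2 / np.where(expected > 0, expected, 1), 0).sum()\n",
    "\n",
    "def chimerge_edges(x, y, num_of_bins=5):\n",
    "    df = pd.DataFrame({'x': x, 'y': y}).dropna()\n",
    "    edges = sorted(df['x'].unique())  # left edge of every current interval\n",
    "    while len(edges) > num_of_bins:\n",
    "        interval = pd.cut(df['x'], bins=edges + [np.inf], right=False, labels=False)\n",
    "        counts = pd.crosstab(interval, df['y']).values\n",
    "        chi2_vals = [chi2_stat(counts[i:i + 2]) for i in range(len(edges) - 1)]\n",
    "        edges.pop(int(np.argmin(chi2_vals)) + 1)  # merge the most similar adjacent pair\n",
    "    return edges\n",
    "\n",
    "# chimerge_edges(X_train['Fare'], X_train['Survived'], num_of_bins=5)"
   ]
  },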
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}

3.3_Demo_Feature_Encoding.ipynb (new file, +688)
@@ -0,0 +1,688 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"import category_encoders as ce\n",
|
||||
"from feature_engineering import encoding\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>53.1000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>8.0500</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 male 22.0 1 7.2500\n",
|
||||
"1 1 1 female 38.0 1 71.2833\n",
|
||||
"2 1 3 female 26.0 0 7.9250\n",
|
||||
"3 1 1 female 35.0 1 53.1000\n",
|
||||
"4 0 3 male 35.0 0 8.0500"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"use_cols = [\n",
|
||||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||||
" 'Survived'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
|
||||
"data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((623, 6), (268, 6))"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## One-hot encoding\n",
|
||||
"replace the categorical variable by different boolean variables (0/1) to indicate whether or not certain label is true for that observation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data1 = pd.get_dummies(data,drop_first=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" <th>Sex_male</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>53.1000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>8.0500</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Age SibSp Fare Sex_male\n",
|
||||
"0 0 3 22.0 1 7.2500 1\n",
|
||||
"1 1 1 38.0 1 71.2833 0\n",
|
||||
"2 1 3 26.0 0 7.9250 0\n",
|
||||
"3 1 1 35.0 1 53.1000 0\n",
|
||||
"4 0 3 35.0 0 8.0500 1"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data1.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Ordinal-encoding\n",
|
||||
"replace the labels by some ordinal number if ordinal is meaningful"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ord_enc = ce.OrdinalEncoder(cols=['Sex']).fit(X_train,y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 1 22.0 1 7.2500\n",
|
||||
"1 1 1 2 38.0 1 71.2833\n",
|
||||
"2 1 3 2 26.0 0 7.9250\n",
|
||||
"3 1 1 2 35.0 1 53.1000\n",
|
||||
"4 0 3 1 35.0 0 8.0500\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data4 = ord_enc.transform(data)\n",
|
||||
"print(data4.head(5))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Mean encoding\n",
|
||||
"replace the label by the mean of the target for that label. \n",
|
||||
"(the target must be 0/1 valued or continuous)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Sex\n",
|
||||
"female 0.753488\n",
|
||||
"male 0.196078\n",
|
||||
"Name: Survived, dtype: float64"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# cross check-- the mean of target group by Sex\n",
|
||||
"X_train['Survived'].groupby(data['Sex']).mean()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mean_enc = encoding.MeanEncoding(cols=['Sex']).fit(X_train,y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 0.196078 22.0 1 7.2500\n",
|
||||
"1 1 1 0.753488 38.0 1 71.2833\n",
|
||||
"2 1 3 0.753488 26.0 0 7.9250\n",
|
||||
"3 1 1 0.753488 35.0 1 53.1000\n",
|
||||
"4 0 3 0.196078 35.0 0 8.0500\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data6 = mean_enc.transform(data)\n",
|
||||
"print(data6.head(5))"
|
||||
]
|
||||
},
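  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`encoding.MeanEncoding` is another project-local helper (not shown in this commit). The core of mean encoding is a single groupby; a minimal sketch of the assumed behaviour:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: map each label of Sex to the training-set mean of the target\n",
    "mean_map = X_train.groupby('Sex')['Survived'].mean()\n",
    "data6_sketch = data.copy()\n",
    "data6_sketch['Sex'] = data6_sketch['Sex'].map(mean_map)\n",
    "data6_sketch.head(5)"
   ]
  },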
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Target-encoding\n",
|
||||
"Similar to mean encoding, but use both posterior probability and prior probability of the target"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create the encoder and fit with our data\n",
|
||||
"target_enc = ce.TargetEncoder(cols=['Sex']).fit(X_train,y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# perform transformation\n",
|
||||
"# data.Survived.groupby(data['Sex']).agg(['mean'])\n",
|
||||
"data2 = target_enc.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>0.196078</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0.753488</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>0.753488</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0.753488</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>53.1000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>0.196078</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>8.0500</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 0.196078 22.0 1 7.2500\n",
|
||||
"1 1 1 0.753488 38.0 1 71.2833\n",
|
||||
"2 1 3 0.753488 26.0 0 7.9250\n",
|
||||
"3 1 1 0.753488 35.0 1 53.1000\n",
|
||||
"4 0 3 0.196078 35.0 0 8.0500"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the result\n",
|
||||
"data2.head()"
|
||||
]
|
||||
},
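  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The blend of posterior and prior is usually done with a smoothing weight. The cell below shows one common additive-smoothing scheme for illustration only; `category_encoders.TargetEncoder` uses its own smoothing formula, so the exact numbers may differ.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: additive smoothing between the per-label mean (posterior) and the global mean (prior)\n",
    "prior = y_train.mean()\n",
    "stats = X_train.groupby('Sex')['Survived'].agg(['mean', 'count'])\n",
    "m = 10  # hypothetical smoothing weight, chosen only for illustration\n",
    "(stats['count'] * stats['mean'] + m * prior) / (stats['count'] + m)"
   ]
  },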
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## WOE-encoding\n",
|
||||
"replace the label with Weight of Evidence of each label. WOE is computed from the basic odds ratio: \n",
|
||||
"\n",
|
||||
"ln( (Proportion of Good Outcomes) / (Proportion of Bad Outcomes))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"woe_enc = ce.WOEEncoder(cols=['Sex']).fit(X_train,y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data3 = woe_enc.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>-0.950742</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1.555633</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>1.555633</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1.555633</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>53.1000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>-0.950742</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>8.0500</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 -0.950742 22.0 1 7.2500\n",
|
||||
"1 1 1 1.555633 38.0 1 71.2833\n",
|
||||
"2 1 3 1.555633 26.0 0 7.9250\n",
|
||||
"3 1 1 1.555633 35.0 1 53.1000\n",
|
||||
"4 0 3 -0.950742 35.0 0 8.0500"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data3.head(5)"
|
||||
]
|
||||
},
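  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The WOE values above can be cross-checked by hand: per label, compute the share of survivors and the share of non-survivors in the training set and take the log of their ratio. Small differences from the encoder are possible because `WOEEncoder` applies regularization.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: weight of evidence computed directly from the training data\n",
    "pos = X_train.loc[X_train['Survived'] == 1, 'Sex'].value_counts(normalize=True)\n",
    "neg = X_train.loc[X_train['Survived'] == 0, 'Sex'].value_counts(normalize=True)\n",
    "np.log(pos / neg)"
   ]
  },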
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}

3.4_Demo_Feature_Transformation.ipynb (new file, +497)

3.5_Demo_Feature_Generation.ipynb (new file, +522)
@@ -0,0 +1,522 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.metrics import roc_curve, roc_auc_score\n",
|
||||
"\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"#from feature_cleaning import rare_values as ra"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"use_cols = [\n",
|
||||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||||
" 'Survived'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 male 22.0 1 7.2500\n",
|
||||
"1 1 1 female 38.0 1 71.2833\n",
|
||||
"2 1 3 female 26.0 0 7.9250"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((623, 6), (268, 6))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Note that we include target variable in the X_train \n",
|
||||
"# because we need it to supervise our discretization\n",
|
||||
"# this is not the standard way of using train-test-split\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Polynomial Expansion\n",
|
||||
"\n",
|
||||
"generate a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Pclass SibSp Pclass^2 Pclass SibSp SibSp^2\n",
|
||||
"0 1.0 0.0 1.0 0.0 0.0\n",
|
||||
"1 1.0 1.0 1.0 1.0 1.0\n",
|
||||
"2 3.0 5.0 9.0 15.0 25.0\n",
|
||||
"3 1.0 0.0 1.0 0.0 0.0\n",
|
||||
"4 3.0 1.0 9.0 3.0 1.0\n",
|
||||
"5 2.0 1.0 4.0 2.0 1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# create polynomial combinations of feature 'Pclass','SibSp' with degree 2\n",
|
||||
"from sklearn.preprocessing import PolynomialFeatures\n",
|
||||
"pf = PolynomialFeatures(degree=2,include_bias=False).fit(X_train[['Pclass','SibSp']])\n",
|
||||
"tmp = pf.transform(X_train[['Pclass','SibSp']])\n",
|
||||
"X_train_copy = pd.DataFrame(tmp,columns=pf.get_feature_names(['Pclass','SibSp']))\n",
|
||||
"print(X_train_copy.head(6))"
|
||||
]
|
||||
},
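A note on the cell above: newer scikit-learn releases rename get_feature_names to get_feature_names_out. As a self-contained illustration of the same expansion (the toy frame below is illustrative, not the Titanic data):

import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

# toy frame standing in for X_train[['Pclass', 'SibSp']]
df = pd.DataFrame({'Pclass': [1, 3, 2], 'SibSp': [0, 5, 1]})

# degree-2 combinations: Pclass, SibSp, Pclass^2, Pclass*SibSp, SibSp^2
pf = PolynomialFeatures(degree=2, include_bias=False).fit(df)
expanded = pd.DataFrame(pf.transform(df),
                        columns=pf.get_feature_names_out(df.columns))
print(expanded)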
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"GBDT derived feature + LR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"sample's belonging node of each base tree \n",
|
||||
"' [[ 7. 7. 6. ... 4. 7. 4.]\n",
|
||||
" [ 7. 7. 6. ... 14. 7. 7.]\n",
|
||||
" [11. 11. 11. ... 4. 6. 11.]\n",
|
||||
" ...\n",
|
||||
" [10. 10. 10. ... 4. 6. 10.]\n",
|
||||
" [13. 14. 13. ... 4. 7. 13.]\n",
|
||||
" [ 7. 7. 6. ... 6. 7. 7.]]\n",
|
||||
"AUC for GBDT derived feature + LR: 0.7746130952380953\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
|
||||
"If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
|
||||
"In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
|
||||
" warnings.warn(msg, FutureWarning)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n",
|
||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"\n",
|
||||
"gbdt = GradientBoostingClassifier(n_estimators=20)\n",
|
||||
"one_hot = OneHotEncoder()\n",
|
||||
"\n",
|
||||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"\n",
|
||||
"gbdt.fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
"X_leaf_index = gbdt.apply(X_train)[:, :, 0] # apply return the node index on each tree \n",
|
||||
"print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
|
||||
"# fit one-hot encoder\n",
|
||||
"one_hot.fit(X_leaf_index) \n",
|
||||
"X_one_hot = one_hot.transform(X_leaf_index) \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||||
"lr.fit(X_one_hot,y_train)\n",
|
||||
"y_pred = lr.predict_proba(\n",
|
||||
" one_hot.transform(gbdt.apply(X_test)[:, :, 0]))[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for GBDT derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
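To spell out the leaf-encoding trick used above: apply() maps every sample to the leaf it reaches in each boosting tree, those leaf indices are one-hot encoded, and a logistic regression is fit on the resulting sparse matrix. A minimal self-contained sketch on synthetic data (dataset, sizes and hyperparameters are illustrative, not the notebook's):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=1000, n_features=8, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

gbdt = GradientBoostingClassifier(n_estimators=20).fit(X_tr, y_tr)

# apply() -> (n_samples, n_trees, 1); keep the leaf index per tree
leaves_tr = gbdt.apply(X_tr)[:, :, 0]
leaves_te = gbdt.apply(X_te)[:, :, 0]

# one-hot encode the leaf indices so each leaf becomes a binary feature
enc = OneHotEncoder(handle_unknown='ignore').fit(leaves_tr)
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(enc.transform(leaves_tr), y_tr)

proba = lr.predict_proba(enc.transform(leaves_te))[:, 1]
print("AUC for GBDT-leaf features + LR:", roc_auc_score(y_te, proba))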
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"RandomForest derived feature + LR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"sample's belonging node of each base tree \n",
|
||||
"' [[212 35 79 ... 146 60 46]\n",
|
||||
" [307 165 266 ... 136 132 44]\n",
|
||||
" [285 285 320 ... 301 294 300]\n",
|
||||
" ...\n",
|
||||
" [ 13 177 133 ... 186 169 117]\n",
|
||||
" [190 296 311 ... 282 289 297]\n",
|
||||
" [264 165 243 ... 152 110 314]]\n",
|
||||
"AUC for RandomForest derived feature + LR: 0.759672619047619\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
|
||||
"If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
|
||||
"In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
|
||||
" warnings.warn(msg, FutureWarning)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"rf = RandomForestClassifier(n_estimators=20)\n",
|
||||
"one_hot = OneHotEncoder()\n",
|
||||
"\n",
|
||||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"\n",
|
||||
"rf.fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
"X_leaf_index = rf.apply(X_train) # apply return the node index on each tree \n",
|
||||
"print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
|
||||
"# fit one-hot encoder\n",
|
||||
"one_hot.fit(X_leaf_index) \n",
|
||||
"X_one_hot = one_hot.transform(X_leaf_index) \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||||
"lr.fit(X_one_hot,y_train)\n",
|
||||
"y_pred = lr.predict_proba(\n",
|
||||
" one_hot.transform(rf.apply(X_test)))[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for RandomForest derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"GBDT derived feature + Raw feature +LR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"AUC for GBDT derived feature + Raw feature +LR: 0.7603571428571428\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from scipy.sparse import hstack\n",
|
||||
"\n",
|
||||
"X_train_ext = hstack([one_hot.transform(gbdt.apply(X_train)[:, :, 0]), X_train])\n",
|
||||
"X_test_ext = hstack([one_hot.transform(gbdt.apply(X_test)[:, :, 0]), X_test])\n",
|
||||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||||
"lr.fit(X_train_ext,y_train)\n",
|
||||
"y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for GBDT derived feature + Raw feature +LR:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"RandomForest derived feature + Raw feature +LR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"AUC for RandomForest derived feature + Raw feature + LR: 0.76\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train_ext = hstack([one_hot.transform(rf.apply(X_train)), X_train])\n",
|
||||
"X_test_ext = hstack([one_hot.transform(rf.apply(X_test)), X_test])\n",
|
||||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||||
"lr.fit(X_train_ext,y_train)\n",
|
||||
"y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for RandomForest derived feature + Raw feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"Use only Raw Feature + LR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"AUC for RandomForest derived feature + LR: 0.6988690476190476\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||||
"lr.fit(X_train,y_train)\n",
|
||||
"y_pred = lr.predict_proba(X_test)[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for RandomForest derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"\n",
|
||||
"Use only Raw Feature + GBDT"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"AUC for Raw feature + GBDT: 0.7613988095238096\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"gbdt = GradientBoostingClassifier(n_estimators=20)\n",
|
||||
"\n",
|
||||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"\n",
|
||||
"gbdt.fit(X_train, y_train)\n",
|
||||
"y_pred = gbdt.predict_proba(X_test)[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for Raw feature + GBDT:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"\n",
|
||||
"Use only Raw Feature + RF\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"AUC for Raw feature + RF: 0.7235119047619047\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"rf = RandomForestClassifier(n_estimators=20)\n",
|
||||
"\n",
|
||||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"\n",
|
||||
"rf.fit(X_train, y_train)\n",
|
||||
"y_pred = rf.predict_proba(X_test)[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for Raw feature + RF:\", roc_auc_score(y_test, y_pred))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Without tuning, we can see GBDT derived feature + LR get the best result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
696
4.1_Demo_Feature_Selection_Filter.ipynb
Normal file
@@ -0,0 +1,696 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"from feature_selection import filter_method as ft"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_breast_cancer\n",
|
||||
"data = load_breast_cancer()\n",
|
||||
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
|
||||
" columns= np.append(data['feature_names'], ['target']))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>mean radius</th>\n",
|
||||
" <th>mean texture</th>\n",
|
||||
" <th>mean perimeter</th>\n",
|
||||
" <th>mean area</th>\n",
|
||||
" <th>mean smoothness</th>\n",
|
||||
" <th>mean compactness</th>\n",
|
||||
" <th>mean concavity</th>\n",
|
||||
" <th>mean concave points</th>\n",
|
||||
" <th>mean symmetry</th>\n",
|
||||
" <th>mean fractal dimension</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>worst texture</th>\n",
|
||||
" <th>worst perimeter</th>\n",
|
||||
" <th>worst area</th>\n",
|
||||
" <th>worst smoothness</th>\n",
|
||||
" <th>worst compactness</th>\n",
|
||||
" <th>worst concavity</th>\n",
|
||||
" <th>worst concave points</th>\n",
|
||||
" <th>worst symmetry</th>\n",
|
||||
" <th>worst fractal dimension</th>\n",
|
||||
" <th>target</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>17.99</td>\n",
|
||||
" <td>10.38</td>\n",
|
||||
" <td>122.80</td>\n",
|
||||
" <td>1001.0</td>\n",
|
||||
" <td>0.11840</td>\n",
|
||||
" <td>0.27760</td>\n",
|
||||
" <td>0.3001</td>\n",
|
||||
" <td>0.14710</td>\n",
|
||||
" <td>0.2419</td>\n",
|
||||
" <td>0.07871</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>17.33</td>\n",
|
||||
" <td>184.60</td>\n",
|
||||
" <td>2019.0</td>\n",
|
||||
" <td>0.1622</td>\n",
|
||||
" <td>0.6656</td>\n",
|
||||
" <td>0.7119</td>\n",
|
||||
" <td>0.2654</td>\n",
|
||||
" <td>0.4601</td>\n",
|
||||
" <td>0.11890</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>20.57</td>\n",
|
||||
" <td>17.77</td>\n",
|
||||
" <td>132.90</td>\n",
|
||||
" <td>1326.0</td>\n",
|
||||
" <td>0.08474</td>\n",
|
||||
" <td>0.07864</td>\n",
|
||||
" <td>0.0869</td>\n",
|
||||
" <td>0.07017</td>\n",
|
||||
" <td>0.1812</td>\n",
|
||||
" <td>0.05667</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>23.41</td>\n",
|
||||
" <td>158.80</td>\n",
|
||||
" <td>1956.0</td>\n",
|
||||
" <td>0.1238</td>\n",
|
||||
" <td>0.1866</td>\n",
|
||||
" <td>0.2416</td>\n",
|
||||
" <td>0.1860</td>\n",
|
||||
" <td>0.2750</td>\n",
|
||||
" <td>0.08902</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>19.69</td>\n",
|
||||
" <td>21.25</td>\n",
|
||||
" <td>130.00</td>\n",
|
||||
" <td>1203.0</td>\n",
|
||||
" <td>0.10960</td>\n",
|
||||
" <td>0.15990</td>\n",
|
||||
" <td>0.1974</td>\n",
|
||||
" <td>0.12790</td>\n",
|
||||
" <td>0.2069</td>\n",
|
||||
" <td>0.05999</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>25.53</td>\n",
|
||||
" <td>152.50</td>\n",
|
||||
" <td>1709.0</td>\n",
|
||||
" <td>0.1444</td>\n",
|
||||
" <td>0.4245</td>\n",
|
||||
" <td>0.4504</td>\n",
|
||||
" <td>0.2430</td>\n",
|
||||
" <td>0.3613</td>\n",
|
||||
" <td>0.08758</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>11.42</td>\n",
|
||||
" <td>20.38</td>\n",
|
||||
" <td>77.58</td>\n",
|
||||
" <td>386.1</td>\n",
|
||||
" <td>0.14250</td>\n",
|
||||
" <td>0.28390</td>\n",
|
||||
" <td>0.2414</td>\n",
|
||||
" <td>0.10520</td>\n",
|
||||
" <td>0.2597</td>\n",
|
||||
" <td>0.09744</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>26.50</td>\n",
|
||||
" <td>98.87</td>\n",
|
||||
" <td>567.7</td>\n",
|
||||
" <td>0.2098</td>\n",
|
||||
" <td>0.8663</td>\n",
|
||||
" <td>0.6869</td>\n",
|
||||
" <td>0.2575</td>\n",
|
||||
" <td>0.6638</td>\n",
|
||||
" <td>0.17300</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>20.29</td>\n",
|
||||
" <td>14.34</td>\n",
|
||||
" <td>135.10</td>\n",
|
||||
" <td>1297.0</td>\n",
|
||||
" <td>0.10030</td>\n",
|
||||
" <td>0.13280</td>\n",
|
||||
" <td>0.1980</td>\n",
|
||||
" <td>0.10430</td>\n",
|
||||
" <td>0.1809</td>\n",
|
||||
" <td>0.05883</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>16.67</td>\n",
|
||||
" <td>152.20</td>\n",
|
||||
" <td>1575.0</td>\n",
|
||||
" <td>0.1374</td>\n",
|
||||
" <td>0.2050</td>\n",
|
||||
" <td>0.4000</td>\n",
|
||||
" <td>0.1625</td>\n",
|
||||
" <td>0.2364</td>\n",
|
||||
" <td>0.07678</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 31 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
|
||||
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
|
||||
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
|
||||
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
|
||||
"3 11.42 20.38 77.58 386.1 0.14250 \n",
|
||||
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
|
||||
"\n",
|
||||
" mean compactness mean concavity mean concave points mean symmetry \\\n",
|
||||
"0 0.27760 0.3001 0.14710 0.2419 \n",
|
||||
"1 0.07864 0.0869 0.07017 0.1812 \n",
|
||||
"2 0.15990 0.1974 0.12790 0.2069 \n",
|
||||
"3 0.28390 0.2414 0.10520 0.2597 \n",
|
||||
"4 0.13280 0.1980 0.10430 0.1809 \n",
|
||||
"\n",
|
||||
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
|
||||
"0 0.07871 ... 17.33 184.60 2019.0 \n",
|
||||
"1 0.05667 ... 23.41 158.80 1956.0 \n",
|
||||
"2 0.05999 ... 25.53 152.50 1709.0 \n",
|
||||
"3 0.09744 ... 26.50 98.87 567.7 \n",
|
||||
"4 0.05883 ... 16.67 152.20 1575.0 \n",
|
||||
"\n",
|
||||
" worst smoothness worst compactness worst concavity worst concave points \\\n",
|
||||
"0 0.1622 0.6656 0.7119 0.2654 \n",
|
||||
"1 0.1238 0.1866 0.2416 0.1860 \n",
|
||||
"2 0.1444 0.4245 0.4504 0.2430 \n",
|
||||
"3 0.2098 0.8663 0.6869 0.2575 \n",
|
||||
"4 0.1374 0.2050 0.4000 0.1625 \n",
|
||||
"\n",
|
||||
" worst symmetry worst fractal dimension target \n",
|
||||
"0 0.4601 0.11890 0.0 \n",
|
||||
"1 0.2750 0.08902 0.0 \n",
|
||||
"2 0.3613 0.08758 0.0 \n",
|
||||
"3 0.6638 0.17300 0.0 \n",
|
||||
"4 0.2364 0.07678 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 31 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((455, 30), (114, 30))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
|
||||
" data.target, test_size=0.2,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Variance method\n",
|
||||
"removing features that show the same value for the majority/all of the observations (constant/quasi-constant features)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 variables are found to be almost constant\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# the original dataset has no constant variable\n",
|
||||
"quasi_constant_feature = ft.constant_feature_detect(data=X_train,threshold=0.9)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1.0 0.923077\n",
|
||||
"0.0 0.068132\n",
|
||||
"2.0 0.008791\n",
|
||||
"Name: dummy, dtype: float64"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# lets create a duumy variable that help us do the demonstration\n",
|
||||
"X_train['dummy'] = np.floor(X_train['worst smoothness']*10)\n",
|
||||
"# variable dummy has> 92% of the observations show one value, 1.0\n",
|
||||
"X_train.dummy.value_counts() / np.float(len(X_train))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1 variables are found to be almost constant\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['dummy']"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"quasi_constant_feature = ft.constant_feature_detect(data=X_train,threshold=0.9)\n",
|
||||
"quasi_constant_feature"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(455, 30)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# drop that variable\n",
|
||||
"X_train.drop(labels=quasi_constant_feature,axis=1,inplace=True)\n",
|
||||
"print(X_train.shape)"
|
||||
]
|
||||
},
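constant_feature_detect comes from this repo's feature_selection package; assuming it flags columns whose single most frequent value exceeds the given share of rows, a rough pandas-only equivalent looks like this (the function name, toy frame and threshold are illustrative):

import pandas as pd

def quasi_constant_columns(df, threshold=0.9):
    """Return columns whose dominant value covers more than `threshold` of the rows."""
    flagged = []
    for col in df.columns:
        top_share = df[col].value_counts(normalize=True).iloc[0]
        if top_share > threshold:
            flagged.append(col)
    return flagged

demo = pd.DataFrame({'a': [1, 1, 1, 1, 2], 'b': [1, 2, 3, 4, 5]})
print(quasi_constant_columns(demo, threshold=0.7))  # ['a']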
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Correlation method\n",
|
||||
"remove features that are highly correlated with each other"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" feature1 feature2 corr\n",
|
||||
"0 mean perimeter mean radius 0.998185\n",
|
||||
"6 mean perimeter mean area 0.986692\n",
|
||||
"14 mean perimeter worst perimeter 0.970507\n",
|
||||
"19 mean perimeter worst radius 0.969520\n",
|
||||
"33 mean perimeter worst area 0.941920 \n",
|
||||
"\n",
|
||||
" feature1 feature2 corr\n",
|
||||
"12 perimeter error radius error 0.978323\n",
|
||||
"30 perimeter error area error 0.944995 \n",
|
||||
"\n",
|
||||
" feature1 feature2 corr\n",
|
||||
"36 mean concavity mean concave points 0.914627 \n",
|
||||
"\n",
|
||||
" feature1 feature2 corr\n",
|
||||
"38 mean texture worst texture 0.908182 \n",
|
||||
"\n",
|
||||
" feature1 feature2 corr\n",
|
||||
"40 worst concave points mean concave points 0.906312 \n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"corr = ft.corr_feature_detect(data=X_train,threshold=0.9)\n",
|
||||
"# print all the correlated feature groups!\n",
|
||||
"for i in corr:\n",
|
||||
" print(i,'\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"then we can decide which ones to remove."
|
||||
]
|
||||
},
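corr_feature_detect is also a repo helper; a plain-pandas way to list highly correlated pairs is to scan the upper triangle of the absolute correlation matrix, as in this sketch (function name and threshold are illustrative):

import numpy as np
import pandas as pd

def correlated_pairs(df, threshold=0.9):
    """List feature pairs whose absolute Pearson correlation exceeds `threshold`."""
    corr = df.corr().abs()
    # keep only the upper triangle to drop self-correlations and duplicates
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    pairs = upper.stack().reset_index()
    pairs.columns = ['feature1', 'feature2', 'corr']
    return pairs[pairs['corr'] > threshold].sort_values('corr', ascending=False)

# e.g. correlated_pairs(X_train, threshold=0.9)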
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Mutual Information Filter\n",
|
||||
"Mutual information measures how much information the presence/absence of a feature contributes to making the correct prediction on Y."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['mean concave points', 'worst perimeter', 'worst area'], dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# select the top 3 features\n",
|
||||
"mi = ft.mutual_info(X=X_train,y=y_train,select_k=3)\n",
|
||||
"print(mi)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['mean perimeter', 'mean concave points', 'worst radius',\n",
|
||||
" 'worst perimeter', 'worst area', 'worst concave points'],\n",
|
||||
" dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# select the top 20% features\n",
|
||||
"mi = ft.mutual_info(X=X_train,y=y_train,select_k=0.2)\n",
|
||||
"print(mi)"
|
||||
]
|
||||
},
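ft.mutual_info is a thin repo wrapper; assuming it behaves like scikit-learn's SelectKBest with mutual_info_classif, the direct call would be roughly the following (reusing the X_train / y_train defined above):

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# keep the 3 features with the highest estimated mutual information with the target
selector = SelectKBest(mutual_info_classif, k=3).fit(X_train, y_train)
print(X_train.columns[selector.get_support()])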
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Chi-Square Filter\n",
|
||||
"Compute chi-squared stats between each non-negative feature and class"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['mean area', 'area error', 'worst area'], dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# select the top 3 features\n",
|
||||
"chi = ft.chi_square_test(X=X_train,y=y_train,select_k=3)\n",
|
||||
"print(chi)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['mean perimeter', 'mean area', 'area error', 'worst radius',\n",
|
||||
" 'worst perimeter', 'worst area'],\n",
|
||||
" dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# select the top 20% features\n",
|
||||
"chi = ft.chi_square_test(X=X_train,y=y_train,select_k=0.2)\n",
|
||||
"print(chi)"
|
||||
]
|
||||
},
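Likewise, the chi-square filter can be reproduced directly with SelectKBest and chi2; note that chi2 requires non-negative features, which holds for this dataset (again reusing X_train / y_train from above):

from sklearn.feature_selection import SelectKBest, chi2

# keep the 3 features with the highest chi-squared statistic against the target
selector = SelectKBest(chi2, k=3).fit(X_train, y_train)
print(X_train.columns[selector.get_support()])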
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Univariate ROC-AUC or MSE\n",
|
||||
"builds one decision tree per feature, to predict the target, then make predictions and ranks the features according to the machine learning metric (roc-auc or mse)"
|
||||
]
|
||||
},
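Before the repo helper is called in the next cell, here is a minimal sketch of that idea under the same train/test split: fit a one-feature decision tree per column, score it on the held-out set, and keep the columns whose ROC-AUC exceeds a threshold (the function name and threshold are illustrative, not the repo's implementation):

import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

def univariate_tree_auc(X_train, y_train, X_test, y_test, threshold=0.8):
    """Rank features by the ROC-AUC of a single-feature decision tree."""
    scores = {}
    for col in X_train.columns:
        clf = DecisionTreeClassifier(random_state=0)
        clf.fit(X_train[[col]], y_train)
        proba = clf.predict_proba(X_test[[col]])[:, 1]
        scores[col] = roc_auc_score(y_test, proba)
    scores = pd.Series(scores).sort_values(ascending=False)
    return scores[scores > threshold]

# e.g. univariate_tree_auc(X_train, y_train, X_test, y_test, threshold=0.8)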
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"worst perimeter 0.917275\n",
|
||||
"worst area 0.895840\n",
|
||||
"worst radius 0.893458\n",
|
||||
"worst concave points 0.863131\n",
|
||||
"mean concavity 0.856939\n",
|
||||
"mean radius 0.849000\n",
|
||||
"mean area 0.839314\n",
|
||||
"worst concavity 0.831375\n",
|
||||
"mean perimeter 0.829628\n",
|
||||
"mean concave points 0.826453\n",
|
||||
"area error 0.812321\n",
|
||||
"worst compactness 0.742299\n",
|
||||
"radius error 0.740235\n",
|
||||
"mean compactness 0.734360\n",
|
||||
"perimeter error 0.680534\n",
|
||||
"worst texture 0.647666\n",
|
||||
"worst fractal dimension 0.640997\n",
|
||||
"concavity error 0.640203\n",
|
||||
"worst symmetry 0.620991\n",
|
||||
"concave points error 0.618133\n",
|
||||
"compactness error 0.607336\n",
|
||||
"mean symmetry 0.591775\n",
|
||||
"mean texture 0.573357\n",
|
||||
"texture error 0.568593\n",
|
||||
"worst smoothness 0.565100\n",
|
||||
"mean smoothness 0.557637\n",
|
||||
"fractal dimension error 0.542077\n",
|
||||
"smoothness error 0.522706\n",
|
||||
"symmetry error 0.493649\n",
|
||||
"mean fractal dimension 0.475548\n",
|
||||
"dtype: float64\n",
|
||||
"11 out of the 30 featues are kept\n",
|
||||
"mean radius 0.849000\n",
|
||||
"mean perimeter 0.829628\n",
|
||||
"mean area 0.839314\n",
|
||||
"mean concavity 0.856939\n",
|
||||
"mean concave points 0.826453\n",
|
||||
"area error 0.812321\n",
|
||||
"worst radius 0.893458\n",
|
||||
"worst perimeter 0.917275\n",
|
||||
"worst area 0.895840\n",
|
||||
"worst concavity 0.831375\n",
|
||||
"worst concave points 0.863131\n",
|
||||
"dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"uni_roc_auc = ft.univariate_roc_auc(X_train=X_train,y_train=y_train,\n",
|
||||
" X_test=X_test,y_test=y_test,threshold=0.8)\n",
|
||||
"print(uni_roc_auc)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"mean fractal dimension 0.491228\n",
|
||||
"symmetry error 0.480750\n",
|
||||
"fractal dimension error 0.456140\n",
|
||||
"smoothness error 0.449561\n",
|
||||
"texture error 0.412281\n",
|
||||
"worst smoothness 0.403265\n",
|
||||
"mean smoothness 0.399123\n",
|
||||
"mean texture 0.396930\n",
|
||||
"mean symmetry 0.363060\n",
|
||||
"compactness error 0.361842\n",
|
||||
"concave points error 0.357456\n",
|
||||
"worst fractal dimension 0.355263\n",
|
||||
"worst symmetry 0.350877\n",
|
||||
"worst texture 0.333333\n",
|
||||
"concavity error 0.333333\n",
|
||||
"perimeter error 0.300439\n",
|
||||
"mean compactness 0.258772\n",
|
||||
"worst compactness 0.254386\n",
|
||||
"radius error 0.245614\n",
|
||||
"area error 0.179825\n",
|
||||
"mean perimeter 0.166667\n",
|
||||
"mean concave points 0.166667\n",
|
||||
"worst concavity 0.162281\n",
|
||||
"mean radius 0.146930\n",
|
||||
"mean concavity 0.142544\n",
|
||||
"mean area 0.140351\n",
|
||||
"worst concave points 0.123782\n",
|
||||
"worst area 0.103070\n",
|
||||
"worst radius 0.100877\n",
|
||||
"worst perimeter 0.098684\n",
|
||||
"dtype: float64\n",
|
||||
"6 out of the 30 featues are kept\n",
|
||||
"mean fractal dimension 0.491228\n",
|
||||
"texture error 0.412281\n",
|
||||
"smoothness error 0.449561\n",
|
||||
"symmetry error 0.480750\n",
|
||||
"fractal dimension error 0.456140\n",
|
||||
"worst smoothness 0.403265\n",
|
||||
"dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"uni_mse = ft.univariate_mse(X_train=X_train,y_train=y_train,\n",
|
||||
" X_test=X_test,y_test=y_test,threshold=0.4)\n",
|
||||
"print(uni_mse)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
548
4.2_Demo_Feature_Selection_Wrapper.ipynb
Normal file
@@ -0,0 +1,548 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n",
|
||||
"from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS\n",
|
||||
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
||||
"\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"# from feature_selection import filter_method as ft"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_breast_cancer\n",
|
||||
"data = load_breast_cancer()\n",
|
||||
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
|
||||
" columns= np.append(data['feature_names'], ['target']))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>mean radius</th>\n",
|
||||
" <th>mean texture</th>\n",
|
||||
" <th>mean perimeter</th>\n",
|
||||
" <th>mean area</th>\n",
|
||||
" <th>mean smoothness</th>\n",
|
||||
" <th>mean compactness</th>\n",
|
||||
" <th>mean concavity</th>\n",
|
||||
" <th>mean concave points</th>\n",
|
||||
" <th>mean symmetry</th>\n",
|
||||
" <th>mean fractal dimension</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>worst texture</th>\n",
|
||||
" <th>worst perimeter</th>\n",
|
||||
" <th>worst area</th>\n",
|
||||
" <th>worst smoothness</th>\n",
|
||||
" <th>worst compactness</th>\n",
|
||||
" <th>worst concavity</th>\n",
|
||||
" <th>worst concave points</th>\n",
|
||||
" <th>worst symmetry</th>\n",
|
||||
" <th>worst fractal dimension</th>\n",
|
||||
" <th>target</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>17.99</td>\n",
|
||||
" <td>10.38</td>\n",
|
||||
" <td>122.80</td>\n",
|
||||
" <td>1001.0</td>\n",
|
||||
" <td>0.11840</td>\n",
|
||||
" <td>0.27760</td>\n",
|
||||
" <td>0.3001</td>\n",
|
||||
" <td>0.14710</td>\n",
|
||||
" <td>0.2419</td>\n",
|
||||
" <td>0.07871</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>17.33</td>\n",
|
||||
" <td>184.60</td>\n",
|
||||
" <td>2019.0</td>\n",
|
||||
" <td>0.1622</td>\n",
|
||||
" <td>0.6656</td>\n",
|
||||
" <td>0.7119</td>\n",
|
||||
" <td>0.2654</td>\n",
|
||||
" <td>0.4601</td>\n",
|
||||
" <td>0.11890</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>20.57</td>\n",
|
||||
" <td>17.77</td>\n",
|
||||
" <td>132.90</td>\n",
|
||||
" <td>1326.0</td>\n",
|
||||
" <td>0.08474</td>\n",
|
||||
" <td>0.07864</td>\n",
|
||||
" <td>0.0869</td>\n",
|
||||
" <td>0.07017</td>\n",
|
||||
" <td>0.1812</td>\n",
|
||||
" <td>0.05667</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>23.41</td>\n",
|
||||
" <td>158.80</td>\n",
|
||||
" <td>1956.0</td>\n",
|
||||
" <td>0.1238</td>\n",
|
||||
" <td>0.1866</td>\n",
|
||||
" <td>0.2416</td>\n",
|
||||
" <td>0.1860</td>\n",
|
||||
" <td>0.2750</td>\n",
|
||||
" <td>0.08902</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>19.69</td>\n",
|
||||
" <td>21.25</td>\n",
|
||||
" <td>130.00</td>\n",
|
||||
" <td>1203.0</td>\n",
|
||||
" <td>0.10960</td>\n",
|
||||
" <td>0.15990</td>\n",
|
||||
" <td>0.1974</td>\n",
|
||||
" <td>0.12790</td>\n",
|
||||
" <td>0.2069</td>\n",
|
||||
" <td>0.05999</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>25.53</td>\n",
|
||||
" <td>152.50</td>\n",
|
||||
" <td>1709.0</td>\n",
|
||||
" <td>0.1444</td>\n",
|
||||
" <td>0.4245</td>\n",
|
||||
" <td>0.4504</td>\n",
|
||||
" <td>0.2430</td>\n",
|
||||
" <td>0.3613</td>\n",
|
||||
" <td>0.08758</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>11.42</td>\n",
|
||||
" <td>20.38</td>\n",
|
||||
" <td>77.58</td>\n",
|
||||
" <td>386.1</td>\n",
|
||||
" <td>0.14250</td>\n",
|
||||
" <td>0.28390</td>\n",
|
||||
" <td>0.2414</td>\n",
|
||||
" <td>0.10520</td>\n",
|
||||
" <td>0.2597</td>\n",
|
||||
" <td>0.09744</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>26.50</td>\n",
|
||||
" <td>98.87</td>\n",
|
||||
" <td>567.7</td>\n",
|
||||
" <td>0.2098</td>\n",
|
||||
" <td>0.8663</td>\n",
|
||||
" <td>0.6869</td>\n",
|
||||
" <td>0.2575</td>\n",
|
||||
" <td>0.6638</td>\n",
|
||||
" <td>0.17300</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>20.29</td>\n",
|
||||
" <td>14.34</td>\n",
|
||||
" <td>135.10</td>\n",
|
||||
" <td>1297.0</td>\n",
|
||||
" <td>0.10030</td>\n",
|
||||
" <td>0.13280</td>\n",
|
||||
" <td>0.1980</td>\n",
|
||||
" <td>0.10430</td>\n",
|
||||
" <td>0.1809</td>\n",
|
||||
" <td>0.05883</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>16.67</td>\n",
|
||||
" <td>152.20</td>\n",
|
||||
" <td>1575.0</td>\n",
|
||||
" <td>0.1374</td>\n",
|
||||
" <td>0.2050</td>\n",
|
||||
" <td>0.4000</td>\n",
|
||||
" <td>0.1625</td>\n",
|
||||
" <td>0.2364</td>\n",
|
||||
" <td>0.07678</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 31 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
|
||||
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
|
||||
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
|
||||
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
|
||||
"3 11.42 20.38 77.58 386.1 0.14250 \n",
|
||||
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
|
||||
"\n",
|
||||
" mean compactness mean concavity mean concave points mean symmetry \\\n",
|
||||
"0 0.27760 0.3001 0.14710 0.2419 \n",
|
||||
"1 0.07864 0.0869 0.07017 0.1812 \n",
|
||||
"2 0.15990 0.1974 0.12790 0.2069 \n",
|
||||
"3 0.28390 0.2414 0.10520 0.2597 \n",
|
||||
"4 0.13280 0.1980 0.10430 0.1809 \n",
|
||||
"\n",
|
||||
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
|
||||
"0 0.07871 ... 17.33 184.60 2019.0 \n",
|
||||
"1 0.05667 ... 23.41 158.80 1956.0 \n",
|
||||
"2 0.05999 ... 25.53 152.50 1709.0 \n",
|
||||
"3 0.09744 ... 26.50 98.87 567.7 \n",
|
||||
"4 0.05883 ... 16.67 152.20 1575.0 \n",
|
||||
"\n",
|
||||
" worst smoothness worst compactness worst concavity worst concave points \\\n",
|
||||
"0 0.1622 0.6656 0.7119 0.2654 \n",
|
||||
"1 0.1238 0.1866 0.2416 0.1860 \n",
|
||||
"2 0.1444 0.4245 0.4504 0.2430 \n",
|
||||
"3 0.2098 0.8663 0.6869 0.2575 \n",
|
||||
"4 0.1374 0.2050 0.4000 0.1625 \n",
|
||||
"\n",
|
||||
" worst symmetry worst fractal dimension target \n",
|
||||
"0 0.4601 0.11890 0.0 \n",
|
||||
"1 0.2750 0.08902 0.0 \n",
|
||||
"2 0.3613 0.08758 0.0 \n",
|
||||
"3 0.6638 0.17300 0.0 \n",
|
||||
"4 0.2364 0.07678 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 31 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((455, 30), (114, 30))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
|
||||
" data.target, test_size=0.2,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Forward Selection\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.4s finished\n",
|
||||
"Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
|
||||
"Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
|
||||
"Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.3s finished\n",
|
||||
"Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.0s finished\n",
|
||||
"Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
|
||||
"Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
|
||||
"Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
|
||||
"Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.4s finished\n",
|
||||
"Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.1s finished\n",
|
||||
"Features: 10/10"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# step forward feature selection\n",
|
||||
"# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
|
||||
"\n",
|
||||
"sfs1 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
|
||||
" k_features=10, \n",
|
||||
" forward=True, \n",
|
||||
" floating=False, \n",
|
||||
" verbose=1,\n",
|
||||
" scoring='roc_auc',\n",
|
||||
" cv=3)\n",
|
||||
"\n",
|
||||
"sfs1 = sfs1.fit(np.array(X_train), y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index(['mean texture', 'mean perimeter', 'mean concavity',\n",
|
||||
" 'mean fractal dimension', 'area error', 'compactness error',\n",
|
||||
" 'worst perimeter', 'worst area', 'worst smoothness', 'worst symmetry'],\n",
|
||||
" dtype='object')"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"selected_feat1= X_train.columns[list(sfs1.k_feature_idx_)]\n",
|
||||
"selected_feat1"
|
||||
]
|
||||
},
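Forward selection is just a greedy loop: start from an empty set and, at each step, add the single feature that most improves the cross-validated score. A compact sketch of that loop, independent of mlxtend (model, scorer and k are illustrative):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def forward_select(X, y, k=10, cv=3):
    """Greedily pick k features by mean cross-validated ROC-AUC."""
    remaining, chosen = list(X.columns), []
    while remaining and len(chosen) < k:
        trial_scores = {}
        for col in remaining:
            model = RandomForestClassifier(n_estimators=5, n_jobs=-1, random_state=0)
            trial_scores[col] = cross_val_score(model, X[chosen + [col]], y,
                                                cv=cv, scoring='roc_auc').mean()
        best = max(trial_scores, key=trial_scores.get)
        chosen.append(best)
        remaining.remove(best)
    return chosen

# e.g. forward_select(X_train, y_train, k=10)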
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Backward Elimination"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.5s finished\n",
|
||||
"Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
|
||||
"Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
|
||||
"Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.2s finished\n",
|
||||
"Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.1s finished\n",
|
||||
"Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
|
||||
"Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
|
||||
"Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
|
||||
"Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.5s finished\n",
|
||||
"Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.2s finished\n",
|
||||
"Features: 10/10"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# step backward feature selection\n",
|
||||
"# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
|
||||
"\n",
|
||||
"sfs2 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
|
||||
" k_features=10, \n",
|
||||
" forward=False, \n",
|
||||
" floating=False, \n",
|
||||
" verbose=1,\n",
|
||||
" scoring='roc_auc',\n",
|
||||
" cv=3)\n",
|
||||
"\n",
|
||||
"sfs2 = sfs1.fit(np.array(X_train.fillna(0)), y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index(['mean area', 'mean compactness', 'texture error', 'area error',\n",
|
||||
" 'compactness error', 'concavity error', 'worst texture',\n",
|
||||
" 'worst perimeter', 'worst smoothness', 'worst concavity'],\n",
|
||||
" dtype='object')"
|
||||
]
|
||||
},
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"selected_feat2= X_train.columns[list(sfs2.k_feature_idx_)]\n",
|
||||
"selected_feat2\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that SFS and SBE return different results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Exhaustive Feature Selection"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Features: 847/847"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"efs1 = EFS(RandomForestClassifier(n_jobs=-1,n_estimators=5, random_state=0), \n",
|
||||
" min_features=1,\n",
|
||||
" max_features=6, \n",
|
||||
" scoring='roc_auc',\n",
|
||||
" print_progress=True,\n",
|
||||
" cv=2)\n",
|
||||
"\n",
|
||||
"# in order to shorter search time for the demonstration\n",
|
||||
"# we only try all possible 1,2,3,4,5,6\n",
|
||||
"# feature combinations from a dataset of 10 features\n",
|
||||
"\n",
|
||||
"efs1 = efs1.fit(np.array(X_train[X_train.columns[0:10]].fillna(0)), y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index(['mean radius', 'mean texture', 'mean area', 'mean smoothness',\n",
|
||||
" 'mean concavity'],\n",
|
||||
" dtype='object')"
|
||||
]
|
||||
},
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"selected_feat3= X_train.columns[list(efs1.best_idx_)]\n",
|
||||
"selected_feat3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
679
4.3_Demo_Feature_Selection_Embedded.ipynb
Normal file
595
4.4_Demo_Feature_Selection_Feature_Shuffling.ipynb
Normal file
@@ -0,0 +1,595 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.feature_selection import SelectFromModel\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"from feature_selection import feature_shuffle\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_breast_cancer\n",
|
||||
"data = load_breast_cancer()\n",
|
||||
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
|
||||
" columns= np.append(data['feature_names'], ['target']))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>mean radius</th>\n",
|
||||
" <th>mean texture</th>\n",
|
||||
" <th>mean perimeter</th>\n",
|
||||
" <th>mean area</th>\n",
|
||||
" <th>mean smoothness</th>\n",
|
||||
" <th>mean compactness</th>\n",
|
||||
" <th>mean concavity</th>\n",
|
||||
" <th>mean concave points</th>\n",
|
||||
" <th>mean symmetry</th>\n",
|
||||
" <th>mean fractal dimension</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>worst texture</th>\n",
|
||||
" <th>worst perimeter</th>\n",
|
||||
" <th>worst area</th>\n",
|
||||
" <th>worst smoothness</th>\n",
|
||||
" <th>worst compactness</th>\n",
|
||||
" <th>worst concavity</th>\n",
|
||||
" <th>worst concave points</th>\n",
|
||||
" <th>worst symmetry</th>\n",
|
||||
" <th>worst fractal dimension</th>\n",
|
||||
" <th>target</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>17.99</td>\n",
|
||||
" <td>10.38</td>\n",
|
||||
" <td>122.80</td>\n",
|
||||
" <td>1001.0</td>\n",
|
||||
" <td>0.11840</td>\n",
|
||||
" <td>0.27760</td>\n",
|
||||
" <td>0.3001</td>\n",
|
||||
" <td>0.14710</td>\n",
|
||||
" <td>0.2419</td>\n",
|
||||
" <td>0.07871</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>17.33</td>\n",
|
||||
" <td>184.60</td>\n",
|
||||
" <td>2019.0</td>\n",
|
||||
" <td>0.1622</td>\n",
|
||||
" <td>0.6656</td>\n",
|
||||
" <td>0.7119</td>\n",
|
||||
" <td>0.2654</td>\n",
|
||||
" <td>0.4601</td>\n",
|
||||
" <td>0.11890</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>20.57</td>\n",
|
||||
" <td>17.77</td>\n",
|
||||
" <td>132.90</td>\n",
|
||||
" <td>1326.0</td>\n",
|
||||
" <td>0.08474</td>\n",
|
||||
" <td>0.07864</td>\n",
|
||||
" <td>0.0869</td>\n",
|
||||
" <td>0.07017</td>\n",
|
||||
" <td>0.1812</td>\n",
|
||||
" <td>0.05667</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>23.41</td>\n",
|
||||
" <td>158.80</td>\n",
|
||||
" <td>1956.0</td>\n",
|
||||
" <td>0.1238</td>\n",
|
||||
" <td>0.1866</td>\n",
|
||||
" <td>0.2416</td>\n",
|
||||
" <td>0.1860</td>\n",
|
||||
" <td>0.2750</td>\n",
|
||||
" <td>0.08902</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>19.69</td>\n",
|
||||
" <td>21.25</td>\n",
|
||||
" <td>130.00</td>\n",
|
||||
" <td>1203.0</td>\n",
|
||||
" <td>0.10960</td>\n",
|
||||
" <td>0.15990</td>\n",
|
||||
" <td>0.1974</td>\n",
|
||||
" <td>0.12790</td>\n",
|
||||
" <td>0.2069</td>\n",
|
||||
" <td>0.05999</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>25.53</td>\n",
|
||||
" <td>152.50</td>\n",
|
||||
" <td>1709.0</td>\n",
|
||||
" <td>0.1444</td>\n",
|
||||
" <td>0.4245</td>\n",
|
||||
" <td>0.4504</td>\n",
|
||||
" <td>0.2430</td>\n",
|
||||
" <td>0.3613</td>\n",
|
||||
" <td>0.08758</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>11.42</td>\n",
|
||||
" <td>20.38</td>\n",
|
||||
" <td>77.58</td>\n",
|
||||
" <td>386.1</td>\n",
|
||||
" <td>0.14250</td>\n",
|
||||
" <td>0.28390</td>\n",
|
||||
" <td>0.2414</td>\n",
|
||||
" <td>0.10520</td>\n",
|
||||
" <td>0.2597</td>\n",
|
||||
" <td>0.09744</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>26.50</td>\n",
|
||||
" <td>98.87</td>\n",
|
||||
" <td>567.7</td>\n",
|
||||
" <td>0.2098</td>\n",
|
||||
" <td>0.8663</td>\n",
|
||||
" <td>0.6869</td>\n",
|
||||
" <td>0.2575</td>\n",
|
||||
" <td>0.6638</td>\n",
|
||||
" <td>0.17300</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>20.29</td>\n",
|
||||
" <td>14.34</td>\n",
|
||||
" <td>135.10</td>\n",
|
||||
" <td>1297.0</td>\n",
|
||||
" <td>0.10030</td>\n",
|
||||
" <td>0.13280</td>\n",
|
||||
" <td>0.1980</td>\n",
|
||||
" <td>0.10430</td>\n",
|
||||
" <td>0.1809</td>\n",
|
||||
" <td>0.05883</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>16.67</td>\n",
|
||||
" <td>152.20</td>\n",
|
||||
" <td>1575.0</td>\n",
|
||||
" <td>0.1374</td>\n",
|
||||
" <td>0.2050</td>\n",
|
||||
" <td>0.4000</td>\n",
|
||||
" <td>0.1625</td>\n",
|
||||
" <td>0.2364</td>\n",
|
||||
" <td>0.07678</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 31 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
|
||||
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
|
||||
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
|
||||
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
|
||||
"3 11.42 20.38 77.58 386.1 0.14250 \n",
|
||||
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
|
||||
"\n",
|
||||
" mean compactness mean concavity mean concave points mean symmetry \\\n",
|
||||
"0 0.27760 0.3001 0.14710 0.2419 \n",
|
||||
"1 0.07864 0.0869 0.07017 0.1812 \n",
|
||||
"2 0.15990 0.1974 0.12790 0.2069 \n",
|
||||
"3 0.28390 0.2414 0.10520 0.2597 \n",
|
||||
"4 0.13280 0.1980 0.10430 0.1809 \n",
|
||||
"\n",
|
||||
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
|
||||
"0 0.07871 ... 17.33 184.60 2019.0 \n",
|
||||
"1 0.05667 ... 23.41 158.80 1956.0 \n",
|
||||
"2 0.05999 ... 25.53 152.50 1709.0 \n",
|
||||
"3 0.09744 ... 26.50 98.87 567.7 \n",
|
||||
"4 0.05883 ... 16.67 152.20 1575.0 \n",
|
||||
"\n",
|
||||
" worst smoothness worst compactness worst concavity worst concave points \\\n",
|
||||
"0 0.1622 0.6656 0.7119 0.2654 \n",
|
||||
"1 0.1238 0.1866 0.2416 0.1860 \n",
|
||||
"2 0.1444 0.4245 0.4504 0.2430 \n",
|
||||
"3 0.2098 0.8663 0.6869 0.2575 \n",
|
||||
"4 0.1374 0.2050 0.4000 0.1625 \n",
|
||||
"\n",
|
||||
" worst symmetry worst fractal dimension target \n",
|
||||
"0 0.4601 0.11890 0.0 \n",
|
||||
"1 0.2750 0.08902 0.0 \n",
|
||||
"2 0.3613 0.08758 0.0 \n",
|
||||
"3 0.6638 0.17300 0.0 \n",
|
||||
"4 0.2364 0.07678 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 31 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((455, 30), (114, 30))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
|
||||
" data.target, test_size=0.2,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Shuffling\n",
|
||||
"permute the values of each feature, one at the time, and measure how much the permutation decreases the accuracy, or the roc_auc, or the mse of the machine learning model.\n",
|
||||
"If the variables are important, this is, highly predictive, a random permutation of their values will decrease dramatically any of these metrics."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"auc_drop, selected_features = feature_shuffle.feature_shuffle_rf(X_train=X_train,\n",
|
||||
" y_train=y_train,\n",
|
||||
" random_state=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>feature</th>\n",
|
||||
" <th>auc_drop</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>22</th>\n",
|
||||
" <td>worst perimeter</td>\n",
|
||||
" <td>8.359457e-05</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>27</th>\n",
|
||||
" <td>worst concave points</td>\n",
|
||||
" <td>3.134796e-05</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>23</th>\n",
|
||||
" <td>worst area</td>\n",
|
||||
" <td>1.110223e-16</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12</th>\n",
|
||||
" <td>perimeter error</td>\n",
|
||||
" <td>1.110223e-16</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>mean radius</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>16</th>\n",
|
||||
" <td>concavity error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>28</th>\n",
|
||||
" <td>worst symmetry</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>26</th>\n",
|
||||
" <td>worst concavity</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25</th>\n",
|
||||
" <td>worst compactness</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>24</th>\n",
|
||||
" <td>worst smoothness</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>21</th>\n",
|
||||
" <td>worst texture</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>20</th>\n",
|
||||
" <td>worst radius</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>19</th>\n",
|
||||
" <td>fractal dimension error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>18</th>\n",
|
||||
" <td>symmetry error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>17</th>\n",
|
||||
" <td>concave points error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>15</th>\n",
|
||||
" <td>compactness error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>mean texture</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>14</th>\n",
|
||||
" <td>smoothness error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>13</th>\n",
|
||||
" <td>area error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>texture error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10</th>\n",
|
||||
" <td>radius error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9</th>\n",
|
||||
" <td>mean fractal dimension</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8</th>\n",
|
||||
" <td>mean symmetry</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>mean concave points</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>mean concavity</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>mean compactness</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>mean smoothness</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>mean area</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>mean perimeter</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>29</th>\n",
|
||||
" <td>worst fractal dimension</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" feature auc_drop\n",
|
||||
"22 worst perimeter 8.359457e-05\n",
|
||||
"27 worst concave points 3.134796e-05\n",
|
||||
"23 worst area 1.110223e-16\n",
|
||||
"12 perimeter error 1.110223e-16\n",
|
||||
"0 mean radius 0.000000e+00\n",
|
||||
"16 concavity error 0.000000e+00\n",
|
||||
"28 worst symmetry 0.000000e+00\n",
|
||||
"26 worst concavity 0.000000e+00\n",
|
||||
"25 worst compactness 0.000000e+00\n",
|
||||
"24 worst smoothness 0.000000e+00\n",
|
||||
"21 worst texture 0.000000e+00\n",
|
||||
"20 worst radius 0.000000e+00\n",
|
||||
"19 fractal dimension error 0.000000e+00\n",
|
||||
"18 symmetry error 0.000000e+00\n",
|
||||
"17 concave points error 0.000000e+00\n",
|
||||
"15 compactness error 0.000000e+00\n",
|
||||
"1 mean texture 0.000000e+00\n",
|
||||
"14 smoothness error 0.000000e+00\n",
|
||||
"13 area error 0.000000e+00\n",
|
||||
"11 texture error 0.000000e+00\n",
|
||||
"10 radius error 0.000000e+00\n",
|
||||
"9 mean fractal dimension 0.000000e+00\n",
|
||||
"8 mean symmetry 0.000000e+00\n",
|
||||
"7 mean concave points 0.000000e+00\n",
|
||||
"6 mean concavity 0.000000e+00\n",
|
||||
"5 mean compactness 0.000000e+00\n",
|
||||
"4 mean smoothness 0.000000e+00\n",
|
||||
"3 mean area 0.000000e+00\n",
|
||||
"2 mean perimeter 0.000000e+00\n",
|
||||
"29 worst fractal dimension 0.000000e+00"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# we select features that have auc_drop > 0\n",
|
||||
"auc_drop"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"22 worst perimeter\n",
|
||||
"27 worst concave points\n",
|
||||
"23 worst area\n",
|
||||
"12 perimeter error\n",
|
||||
"Name: feature, dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"selected_features"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
884
4.5_Demo_Feature_Selection_Hybrid_method.ipynb
Normal file
@@ -0,0 +1,884 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.feature_selection import SelectFromModel\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"from sklearn.feature_selection import RFE\n",
|
||||
"from feature_selection import hybrid\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_breast_cancer\n",
|
||||
"data = load_breast_cancer()\n",
|
||||
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
|
||||
" columns= np.append(data['feature_names'], ['target']))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>mean radius</th>\n",
|
||||
" <th>mean texture</th>\n",
|
||||
" <th>mean perimeter</th>\n",
|
||||
" <th>mean area</th>\n",
|
||||
" <th>mean smoothness</th>\n",
|
||||
" <th>mean compactness</th>\n",
|
||||
" <th>mean concavity</th>\n",
|
||||
" <th>mean concave points</th>\n",
|
||||
" <th>mean symmetry</th>\n",
|
||||
" <th>mean fractal dimension</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>worst texture</th>\n",
|
||||
" <th>worst perimeter</th>\n",
|
||||
" <th>worst area</th>\n",
|
||||
" <th>worst smoothness</th>\n",
|
||||
" <th>worst compactness</th>\n",
|
||||
" <th>worst concavity</th>\n",
|
||||
" <th>worst concave points</th>\n",
|
||||
" <th>worst symmetry</th>\n",
|
||||
" <th>worst fractal dimension</th>\n",
|
||||
" <th>target</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>17.99</td>\n",
|
||||
" <td>10.38</td>\n",
|
||||
" <td>122.80</td>\n",
|
||||
" <td>1001.0</td>\n",
|
||||
" <td>0.11840</td>\n",
|
||||
" <td>0.27760</td>\n",
|
||||
" <td>0.3001</td>\n",
|
||||
" <td>0.14710</td>\n",
|
||||
" <td>0.2419</td>\n",
|
||||
" <td>0.07871</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>17.33</td>\n",
|
||||
" <td>184.60</td>\n",
|
||||
" <td>2019.0</td>\n",
|
||||
" <td>0.1622</td>\n",
|
||||
" <td>0.6656</td>\n",
|
||||
" <td>0.7119</td>\n",
|
||||
" <td>0.2654</td>\n",
|
||||
" <td>0.4601</td>\n",
|
||||
" <td>0.11890</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>20.57</td>\n",
|
||||
" <td>17.77</td>\n",
|
||||
" <td>132.90</td>\n",
|
||||
" <td>1326.0</td>\n",
|
||||
" <td>0.08474</td>\n",
|
||||
" <td>0.07864</td>\n",
|
||||
" <td>0.0869</td>\n",
|
||||
" <td>0.07017</td>\n",
|
||||
" <td>0.1812</td>\n",
|
||||
" <td>0.05667</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>23.41</td>\n",
|
||||
" <td>158.80</td>\n",
|
||||
" <td>1956.0</td>\n",
|
||||
" <td>0.1238</td>\n",
|
||||
" <td>0.1866</td>\n",
|
||||
" <td>0.2416</td>\n",
|
||||
" <td>0.1860</td>\n",
|
||||
" <td>0.2750</td>\n",
|
||||
" <td>0.08902</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>19.69</td>\n",
|
||||
" <td>21.25</td>\n",
|
||||
" <td>130.00</td>\n",
|
||||
" <td>1203.0</td>\n",
|
||||
" <td>0.10960</td>\n",
|
||||
" <td>0.15990</td>\n",
|
||||
" <td>0.1974</td>\n",
|
||||
" <td>0.12790</td>\n",
|
||||
" <td>0.2069</td>\n",
|
||||
" <td>0.05999</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>25.53</td>\n",
|
||||
" <td>152.50</td>\n",
|
||||
" <td>1709.0</td>\n",
|
||||
" <td>0.1444</td>\n",
|
||||
" <td>0.4245</td>\n",
|
||||
" <td>0.4504</td>\n",
|
||||
" <td>0.2430</td>\n",
|
||||
" <td>0.3613</td>\n",
|
||||
" <td>0.08758</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>11.42</td>\n",
|
||||
" <td>20.38</td>\n",
|
||||
" <td>77.58</td>\n",
|
||||
" <td>386.1</td>\n",
|
||||
" <td>0.14250</td>\n",
|
||||
" <td>0.28390</td>\n",
|
||||
" <td>0.2414</td>\n",
|
||||
" <td>0.10520</td>\n",
|
||||
" <td>0.2597</td>\n",
|
||||
" <td>0.09744</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>26.50</td>\n",
|
||||
" <td>98.87</td>\n",
|
||||
" <td>567.7</td>\n",
|
||||
" <td>0.2098</td>\n",
|
||||
" <td>0.8663</td>\n",
|
||||
" <td>0.6869</td>\n",
|
||||
" <td>0.2575</td>\n",
|
||||
" <td>0.6638</td>\n",
|
||||
" <td>0.17300</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>20.29</td>\n",
|
||||
" <td>14.34</td>\n",
|
||||
" <td>135.10</td>\n",
|
||||
" <td>1297.0</td>\n",
|
||||
" <td>0.10030</td>\n",
|
||||
" <td>0.13280</td>\n",
|
||||
" <td>0.1980</td>\n",
|
||||
" <td>0.10430</td>\n",
|
||||
" <td>0.1809</td>\n",
|
||||
" <td>0.05883</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>16.67</td>\n",
|
||||
" <td>152.20</td>\n",
|
||||
" <td>1575.0</td>\n",
|
||||
" <td>0.1374</td>\n",
|
||||
" <td>0.2050</td>\n",
|
||||
" <td>0.4000</td>\n",
|
||||
" <td>0.1625</td>\n",
|
||||
" <td>0.2364</td>\n",
|
||||
" <td>0.07678</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 31 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
|
||||
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
|
||||
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
|
||||
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
|
||||
"3 11.42 20.38 77.58 386.1 0.14250 \n",
|
||||
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
|
||||
"\n",
|
||||
" mean compactness mean concavity mean concave points mean symmetry \\\n",
|
||||
"0 0.27760 0.3001 0.14710 0.2419 \n",
|
||||
"1 0.07864 0.0869 0.07017 0.1812 \n",
|
||||
"2 0.15990 0.1974 0.12790 0.2069 \n",
|
||||
"3 0.28390 0.2414 0.10520 0.2597 \n",
|
||||
"4 0.13280 0.1980 0.10430 0.1809 \n",
|
||||
"\n",
|
||||
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
|
||||
"0 0.07871 ... 17.33 184.60 2019.0 \n",
|
||||
"1 0.05667 ... 23.41 158.80 1956.0 \n",
|
||||
"2 0.05999 ... 25.53 152.50 1709.0 \n",
|
||||
"3 0.09744 ... 26.50 98.87 567.7 \n",
|
||||
"4 0.05883 ... 16.67 152.20 1575.0 \n",
|
||||
"\n",
|
||||
" worst smoothness worst compactness worst concavity worst concave points \\\n",
|
||||
"0 0.1622 0.6656 0.7119 0.2654 \n",
|
||||
"1 0.1238 0.1866 0.2416 0.1860 \n",
|
||||
"2 0.1444 0.4245 0.4504 0.2430 \n",
|
||||
"3 0.2098 0.8663 0.6869 0.2575 \n",
|
||||
"4 0.1374 0.2050 0.4000 0.1625 \n",
|
||||
"\n",
|
||||
" worst symmetry worst fractal dimension target \n",
|
||||
"0 0.4601 0.11890 0.0 \n",
|
||||
"1 0.2750 0.08902 0.0 \n",
|
||||
"2 0.3613 0.08758 0.0 \n",
|
||||
"3 0.6638 0.17300 0.0 \n",
|
||||
"4 0.2364 0.07678 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 31 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((455, 30), (114, 30))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
|
||||
" data.target, test_size=0.2,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Recursive Feature Elimination \n",
|
||||
"### with Random Forests Importance\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Example 1\n",
|
||||
"This method is slightly **different from the guide**, as it use a different stopping criterion: the desired number of features to select is eventually reached."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
|
||||
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
|
||||
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
|
||||
" min_samples_leaf=1, min_samples_split=2,\n",
|
||||
" min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,\n",
|
||||
" oob_score=False, random_state=None, verbose=0,\n",
|
||||
" warm_start=False),\n",
|
||||
" n_features_to_select=10, step=1, verbose=0)"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# n_features_to_select decide the stopping criterion\n",
|
||||
"# we stop till 10 features remaining\n",
|
||||
"\n",
|
||||
"sel_ = RFE(RandomForestClassifier(n_estimators=20), n_features_to_select=10)\n",
|
||||
"sel_.fit(X_train.fillna(0), y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['mean texture', 'mean perimeter', 'mean area', 'mean concavity',\n",
|
||||
" 'mean concave points', 'worst radius', 'worst perimeter', 'worst area',\n",
|
||||
" 'worst concave points', 'worst symmetry'],\n",
|
||||
" dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"selected_feat = X_train.columns[(sel_.get_support())]\n",
|
||||
"print(selected_feat)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"source": [
|
||||
"### Example 2\n",
|
||||
"recursive feature elimination with RandomForest\n",
|
||||
"with the method same as the guide\n",
|
||||
"1. Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge, or the linear / logistic regression coefficients.\n",
|
||||
"2. Remove one feature -the least important- and build a machine learning algorithm utilizing the remaining features.\n",
|
||||
"3. Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.\n",
|
||||
"4. If the metric decreases by more of an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.\n",
|
||||
"5. Repeat steps 2-4 until all features have been removed (and therefore evaluated) and the drop in performance assessed.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"testing feature: mean radius which is feature 1 out of 30\n",
|
||||
"New Test ROC AUC=0.9941251190854239\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0026992696093999236\n",
|
||||
"keep: mean radius\n",
|
||||
"\n",
|
||||
"testing feature: mean texture which is feature 2 out of 30\n",
|
||||
"New Test ROC AUC=0.9936487773896475\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0031756113051762958\n",
|
||||
"keep: mean texture\n",
|
||||
"\n",
|
||||
"testing feature: mean perimeter which is feature 3 out of 30\n",
|
||||
"New Test ROC AUC=0.9968243886948238\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0\n",
|
||||
"remove: mean perimeter\n",
|
||||
"\n",
|
||||
"testing feature: mean area which is feature 4 out of 30\n",
|
||||
"New Test ROC AUC=0.9960304858685297\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0007939028262941017\n",
|
||||
"remove: mean area\n",
|
||||
"\n",
|
||||
"testing feature: mean smoothness which is feature 5 out of 30\n",
|
||||
"New Test ROC AUC=0.9965068275643061\n",
|
||||
"All features Test ROC AUC=0.9960304858685297\n",
|
||||
"Drop in ROC AUC=-0.0004763416957763722\n",
|
||||
"remove: mean smoothness\n",
|
||||
"\n",
|
||||
"testing feature: mean compactness which is feature 6 out of 30\n",
|
||||
"New Test ROC AUC=0.9942838996506828\n",
|
||||
"All features Test ROC AUC=0.9965068275643061\n",
|
||||
"Drop in ROC AUC=0.0022229279136233293\n",
|
||||
"keep: mean compactness\n",
|
||||
"\n",
|
||||
"testing feature: mean concavity which is feature 7 out of 30\n",
|
||||
"New Test ROC AUC=0.9957129247380121\n",
|
||||
"All features Test ROC AUC=0.9965068275643061\n",
|
||||
"Drop in ROC AUC=0.0007939028262939907\n",
|
||||
"remove: mean concavity\n",
|
||||
"\n",
|
||||
"testing feature: mean concave points which is feature 8 out of 30\n",
|
||||
"New Test ROC AUC=0.9976182915211178\n",
|
||||
"All features Test ROC AUC=0.9957129247380121\n",
|
||||
"Drop in ROC AUC=-0.0019053667831057108\n",
|
||||
"remove: mean concave points\n",
|
||||
"\n",
|
||||
"testing feature: mean symmetry which is feature 9 out of 30\n",
|
||||
"New Test ROC AUC=0.9953953636074945\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0022229279136233293\n",
|
||||
"keep: mean symmetry\n",
|
||||
"\n",
|
||||
"testing feature: mean fractal dimension which is feature 10 out of 30\n",
|
||||
"New Test ROC AUC=0.9949190219117181\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0026992696093997015\n",
|
||||
"keep: mean fractal dimension\n",
|
||||
"\n",
|
||||
"testing feature: radius error which is feature 11 out of 30\n",
|
||||
"New Test ROC AUC=0.9952365830422356\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.002381708478882194\n",
|
||||
"keep: radius error\n",
|
||||
"\n",
|
||||
"testing feature: texture error which is feature 12 out of 30\n",
|
||||
"New Test ROC AUC=0.9952365830422356\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.002381708478882194\n",
|
||||
"keep: texture error\n",
|
||||
"\n",
|
||||
"testing feature: perimeter error which is feature 13 out of 30\n",
|
||||
"New Test ROC AUC=0.9939663385201651\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.003651953000952668\n",
|
||||
"keep: perimeter error\n",
|
||||
"\n",
|
||||
"testing feature: area error which is feature 14 out of 30\n",
|
||||
"New Test ROC AUC=0.994919021911718\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0026992696093998125\n",
|
||||
"keep: area error\n",
|
||||
"\n",
|
||||
"testing feature: smoothness error which is feature 15 out of 30\n",
|
||||
"New Test ROC AUC=0.995871705303271\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.001746586217846846\n",
|
||||
"keep: smoothness error\n",
|
||||
"\n",
|
||||
"testing feature: compactness error which is feature 16 out of 30\n",
|
||||
"New Test ROC AUC=0.9958717053032708\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0017465862178469571\n",
|
||||
"keep: compactness error\n",
|
||||
"\n",
|
||||
"testing feature: concavity error which is feature 17 out of 30\n",
|
||||
"New Test ROC AUC=0.9961892664337886\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0014290250873292276\n",
|
||||
"keep: concavity error\n",
|
||||
"\n",
|
||||
"testing feature: concave points error which is feature 18 out of 30\n",
|
||||
"New Test ROC AUC=0.9961892664337885\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0014290250873293386\n",
|
||||
"keep: concave points error\n",
|
||||
"\n",
|
||||
"testing feature: symmetry error which is feature 19 out of 30\n",
|
||||
"New Test ROC AUC=0.9968243886948238\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0007939028262939907\n",
|
||||
"remove: symmetry error\n",
|
||||
"\n",
|
||||
"testing feature: fractal dimension error which is feature 20 out of 30\n",
|
||||
"New Test ROC AUC=0.9946014607812005\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0022229279136233293\n",
|
||||
"keep: fractal dimension error\n",
|
||||
"\n",
|
||||
"testing feature: worst radius which is feature 21 out of 30\n",
|
||||
"New Test ROC AUC=0.9955541441727532\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.001270244522070585\n",
|
||||
"keep: worst radius\n",
|
||||
"\n",
|
||||
"testing feature: worst texture which is feature 22 out of 30\n",
|
||||
"New Test ROC AUC=0.9958717053032708\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0009526833915529664\n",
|
||||
"remove: worst texture\n",
|
||||
"\n",
|
||||
"testing feature: worst perimeter which is feature 23 out of 30\n",
|
||||
"New Test ROC AUC=0.995871705303271\n",
|
||||
"All features Test ROC AUC=0.9958717053032708\n",
|
||||
"Drop in ROC AUC=-1.1102230246251565e-16\n",
|
||||
"remove: worst perimeter\n",
|
||||
"\n",
|
||||
"testing feature: worst area which is feature 24 out of 30\n",
|
||||
"New Test ROC AUC=0.9938075579549063\n",
|
||||
"All features Test ROC AUC=0.995871705303271\n",
|
||||
"Drop in ROC AUC=0.0020641473483646866\n",
|
||||
"keep: worst area\n",
|
||||
"\n",
|
||||
"testing feature: worst smoothness which is feature 25 out of 30\n",
|
||||
"New Test ROC AUC=0.9939663385201651\n",
|
||||
"All features Test ROC AUC=0.995871705303271\n",
|
||||
"Drop in ROC AUC=0.0019053667831058219\n",
|
||||
"keep: worst smoothness\n",
|
||||
"\n",
|
||||
"testing feature: worst compactness which is feature 26 out of 30\n",
|
||||
"New Test ROC AUC=0.9960304858685296\n",
|
||||
"All features Test ROC AUC=0.995871705303271\n",
|
||||
"Drop in ROC AUC=-0.0001587805652586427\n",
|
||||
"remove: worst compactness\n",
|
||||
"\n",
|
||||
"testing feature: worst concavity which is feature 27 out of 30\n",
|
||||
"New Test ROC AUC=0.9966656081295648\n",
|
||||
"All features Test ROC AUC=0.9960304858685296\n",
|
||||
"Drop in ROC AUC=-0.0006351222610352369\n",
|
||||
"remove: worst concavity\n",
|
||||
"\n",
|
||||
"testing feature: worst concave points which is feature 28 out of 30\n",
|
||||
"New Test ROC AUC=0.9936487773896475\n",
|
||||
"All features Test ROC AUC=0.9966656081295648\n",
|
||||
"Drop in ROC AUC=0.00301683073991732\n",
|
||||
"keep: worst concave points\n",
|
||||
"\n",
|
||||
"testing feature: worst symmetry which is feature 29 out of 30\n",
|
||||
"New Test ROC AUC=0.9976182915211178\n",
|
||||
"All features Test ROC AUC=0.9966656081295648\n",
|
||||
"Drop in ROC AUC=-0.0009526833915529664\n",
|
||||
"remove: worst symmetry\n",
|
||||
"\n",
|
||||
"testing feature: worst fractal dimension which is feature 30 out of 30\n",
|
||||
"New Test ROC AUC=0.9973007303906002\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.00031756113051761847\n",
|
||||
"remove: worst fractal dimension\n",
|
||||
"DONE!!\n",
|
||||
"total features to remove: 12\n",
|
||||
"total features to keep: 18\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# tol decide whether we should drop or keep the feature in current round\n",
|
||||
"features_to_keep = hybrid.recursive_feature_elimination_rf(X_train=X_train,\n",
|
||||
" y_train=y_train,\n",
|
||||
" X_test=X_test,\n",
|
||||
" y_test=y_test,\n",
|
||||
" tol=0.001)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['mean radius',\n",
|
||||
" 'mean texture',\n",
|
||||
" 'mean compactness',\n",
|
||||
" 'mean symmetry',\n",
|
||||
" 'mean fractal dimension',\n",
|
||||
" 'radius error',\n",
|
||||
" 'texture error',\n",
|
||||
" 'perimeter error',\n",
|
||||
" 'area error',\n",
|
||||
" 'smoothness error',\n",
|
||||
" 'compactness error',\n",
|
||||
" 'concavity error',\n",
|
||||
" 'concave points error',\n",
|
||||
" 'fractal dimension error',\n",
|
||||
" 'worst radius',\n",
|
||||
" 'worst area',\n",
|
||||
" 'worst smoothness',\n",
|
||||
" 'worst concave points']"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"features_to_keep"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Recursive Feature Addition\n",
|
||||
"### with Random Forests Importance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Example 1\n",
|
||||
"recursive feature addition with RandomForest\n",
|
||||
"with the method same as the guide\n",
|
||||
"1. Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge, or the linear / logistic regression coefficients.\n",
|
||||
"2. Build a machine learning model with only 1 feature, the most important one, and calculate the model metric for performance.\n",
|
||||
"3. Add one feature -the most important- and build a machine learning algorithm utilizing the added and any feature from previous rounds.\n",
|
||||
"4. Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.\n",
|
||||
"5. If the metric increases by more than an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.\n",
|
||||
"6. Repeat steps 2-5 until all features have been removed (and therefore evaluated) and the drop in performance assessed.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"testing feature: mean texture which is feature 1 out of 30\n",
|
||||
"New Test ROC AUC=0.9558590028580501\n",
|
||||
"All features Test ROC AUC=0.9009209272785013\n",
|
||||
"Increase in ROC AUC=0.054938075579548884\n",
|
||||
"keep: mean texture\n",
|
||||
"\n",
|
||||
"testing feature: mean perimeter which is feature 2 out of 30\n",
|
||||
"New Test ROC AUC=0.9609399809463322\n",
|
||||
"All features Test ROC AUC=0.9558590028580501\n",
|
||||
"Increase in ROC AUC=0.005080978088282007\n",
|
||||
"keep: mean perimeter\n",
|
||||
"\n",
|
||||
"testing feature: mean area which is feature 3 out of 30\n",
|
||||
"New Test ROC AUC=0.9609399809463322\n",
|
||||
"All features Test ROC AUC=0.9609399809463322\n",
|
||||
"Increase in ROC AUC=0.0\n",
|
||||
"remove: mean area\n",
|
||||
"\n",
|
||||
"testing feature: mean smoothness which is feature 4 out of 30\n",
|
||||
"New Test ROC AUC=0.9684026675134964\n",
|
||||
"All features Test ROC AUC=0.9609399809463322\n",
|
||||
"Increase in ROC AUC=0.007462686567164201\n",
|
||||
"keep: mean smoothness\n",
|
||||
"\n",
|
||||
"testing feature: mean compactness which is feature 5 out of 30\n",
|
||||
"New Test ROC AUC=0.9750714512543665\n",
|
||||
"All features Test ROC AUC=0.9684026675134964\n",
|
||||
"Increase in ROC AUC=0.006668783740870099\n",
|
||||
"keep: mean compactness\n",
|
||||
"\n",
|
||||
"testing feature: mean concavity which is feature 6 out of 30\n",
|
||||
"New Test ROC AUC=0.9933312162591298\n",
|
||||
"All features Test ROC AUC=0.9750714512543665\n",
|
||||
"Increase in ROC AUC=0.01825976500476334\n",
|
||||
"keep: mean concavity\n",
|
||||
"\n",
|
||||
"testing feature: mean concave points which is feature 7 out of 30\n",
|
||||
"New Test ROC AUC=0.9925373134328358\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0007939028262939907\n",
|
||||
"remove: mean concave points\n",
|
||||
"\n",
|
||||
"testing feature: mean symmetry which is feature 8 out of 30\n",
|
||||
"New Test ROC AUC=0.9895204826929185\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0038107335662113107\n",
|
||||
"remove: mean symmetry\n",
|
||||
"\n",
|
||||
"testing feature: mean fractal dimension which is feature 9 out of 30\n",
|
||||
"New Test ROC AUC=0.9892029215624007\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.00412829469672904\n",
|
||||
"remove: mean fractal dimension\n",
|
||||
"\n",
|
||||
"testing feature: radius error which is feature 10 out of 30\n",
|
||||
"New Test ROC AUC=0.9895204826929184\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0038107335662114217\n",
|
||||
"remove: radius error\n",
|
||||
"\n",
|
||||
"testing feature: texture error which is feature 11 out of 30\n",
|
||||
"New Test ROC AUC=0.9868212130835186\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.006510003175611234\n",
|
||||
"remove: texture error\n",
|
||||
"\n",
|
||||
"testing feature: perimeter error which is feature 12 out of 30\n",
|
||||
"New Test ROC AUC=0.9890441409971419\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.004287075261987905\n",
|
||||
"remove: perimeter error\n",
|
||||
"\n",
|
||||
"testing feature: area error which is feature 13 out of 30\n",
|
||||
"New Test ROC AUC=0.989044140997142\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.004287075261987794\n",
|
||||
"remove: area error\n",
|
||||
"\n",
|
||||
"testing feature: smoothness error which is feature 14 out of 30\n",
|
||||
"New Test ROC AUC=0.988091457605589\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.00523975865354076\n",
|
||||
"remove: smoothness error\n",
|
||||
"\n",
|
||||
"testing feature: compactness error which is feature 15 out of 30\n",
|
||||
"New Test ROC AUC=0.9895204826929184\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0038107335662114217\n",
|
||||
"remove: compactness error\n",
|
||||
"\n",
|
||||
"testing feature: concavity error which is feature 16 out of 30\n",
|
||||
"New Test ROC AUC=0.9911082883455065\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0022229279136233293\n",
|
||||
"remove: concavity error\n",
|
||||
"\n",
|
||||
"testing feature: concave points error which is feature 17 out of 30\n",
|
||||
"New Test ROC AUC=0.9906319466497301\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0026992696093997015\n",
|
||||
"remove: concave points error\n",
|
||||
"\n",
|
||||
"testing feature: symmetry error which is feature 18 out of 30\n",
|
||||
"New Test ROC AUC=0.9876151159098127\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0057161003493171325\n",
|
||||
"remove: symmetry error\n",
|
||||
"\n",
|
||||
"testing feature: fractal dimension error which is feature 19 out of 30\n",
|
||||
"New Test ROC AUC=0.9896792632581772\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.003651953000952557\n",
|
||||
"remove: fractal dimension error\n",
|
||||
"\n",
|
||||
"testing feature: worst radius which is feature 20 out of 30\n",
|
||||
"New Test ROC AUC=0.994125119085424\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=0.0007939028262942127\n",
|
||||
"remove: worst radius\n",
|
||||
"\n",
|
||||
"testing feature: worst texture which is feature 21 out of 30\n",
|
||||
"New Test ROC AUC=0.9906319466497301\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0026992696093997015\n",
|
||||
"remove: worst texture\n",
|
||||
"\n",
|
||||
"testing feature: worst perimeter which is feature 22 out of 30\n",
|
||||
"New Test ROC AUC=0.9933312162591299\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=1.1102230246251565e-16\n",
|
||||
"remove: worst perimeter\n",
|
||||
"\n",
|
||||
"testing feature: worst area which is feature 23 out of 30\n",
|
||||
"New Test ROC AUC=0.9931724356938711\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0001587805652586427\n",
|
||||
"remove: worst area\n",
|
||||
"\n",
|
||||
"testing feature: worst smoothness which is feature 24 out of 30\n",
|
||||
"New Test ROC AUC=0.9933312162591299\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=1.1102230246251565e-16\n",
|
||||
"remove: worst smoothness\n",
|
||||
"\n",
|
||||
"testing feature: worst compactness which is feature 25 out of 30\n",
|
||||
"New Test ROC AUC=0.9895204826929184\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0038107335662114217\n",
|
||||
"remove: worst compactness\n",
|
||||
"\n",
|
||||
"testing feature: worst concavity which is feature 26 out of 30\n",
|
||||
"New Test ROC AUC=0.9938075579549063\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=0.0004763416957764832\n",
|
||||
"remove: worst concavity\n",
|
||||
"\n",
|
||||
"testing feature: worst concave points which is feature 27 out of 30\n",
|
||||
"New Test ROC AUC=0.9971419498253413\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=0.0038107335662115327\n",
|
||||
"keep: worst concave points\n",
|
||||
"\n",
|
||||
"testing feature: worst symmetry which is feature 28 out of 30\n",
|
||||
"New Test ROC AUC=0.9957129247380121\n",
|
||||
"All features Test ROC AUC=0.9971419498253413\n",
|
||||
"Increase in ROC AUC=-0.0014290250873292276\n",
|
||||
"remove: worst symmetry\n",
|
||||
"\n",
|
||||
"testing feature: worst fractal dimension which is feature 29 out of 30\n",
|
||||
"New Test ROC AUC=0.9950778024769769\n",
|
||||
"All features Test ROC AUC=0.9971419498253413\n",
|
||||
"Increase in ROC AUC=-0.0020641473483644646\n",
|
||||
"remove: worst fractal dimension\n",
|
||||
"DONE!!\n",
|
||||
"total features to keep: 7\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"features_to_keep = hybrid.recursive_feature_addition_rf(X_train=X_train,\n",
|
||||
" y_train=y_train,\n",
|
||||
" X_test=X_test,\n",
|
||||
" y_test=y_test,\n",
|
||||
" tol=0.001)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['mean radius',\n",
|
||||
" 'mean texture',\n",
|
||||
" 'mean perimeter',\n",
|
||||
" 'mean smoothness',\n",
|
||||
" 'mean compactness',\n",
|
||||
" 'mean concavity',\n",
|
||||
" 'worst concave points']"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"features_to_keep"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
824
A Short Guide for Feature Engineering and Feature Selection.md
Normal file
@@ -0,0 +1,824 @@
|
||||
**Table of Contents**:
|
||||
|
||||
[TOC]
|
||||
|
||||
# A Short Guide for Feature Engineering and Feature Selection
|
||||
|
||||
Feature engineering and selection is the art/science of transforming data into its most useful form, which involves an elegant blend of domain expertise, intuition and mathematics. This guide is a concise reference for beginners, covering the most simple yet widely used techniques for feature engineering and selection. Any comments and contributions are most welcome.
|
||||
|
||||
## 0. Basic Concepts
|
||||
|
||||
### 0.1 What is Machine Learning
|
||||
|
||||
> Machine Learning is the science of getting computers to act without being explicitly programmed - [Arthur Samuel](https://simple.wikipedia.org/wiki/Machine_learning)
|
||||
|
||||
> Machine Learning is a technique of data science that helps computers learn from existing data in order to forecast future behaviors, outcomes and trends - [Microsoft](https://docs.microsoft.com/en-us/azure/machine-learning/service/overview-what-is-azure-ml)
|
||||
|
||||
> The field of Machine Learning seeks to answer the question “How can we build computer systems that automatically improve with experience, and what are the fundamental laws that govern all learning processes?“ - [Carnegie Mellon University](http://www.cs.cmu.edu/~tom/pubs/MachineLearning.pdf)
|
||||
|
||||
Narrowly speaking, in the data-mining context, machine learning (ML) is the process of letting computers learn from historical data, recognize patterns/relationships within the data, and then make predictions.
|
||||
|
||||
|
||||
|
||||
### 0.2 Methodology
|
||||
|
||||
A typical ML workflow/pipeline looks like this:
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
Source: Practical Machine Learning with Python, Springer
|
||||
|
||||
|
||||
There can be many ways to divide the tasks that make up the ML workflow into phases, but generally the basic steps are similar to those in the graph above.
|
||||
|
||||
|
||||
|
||||
### 0.3 Typical Tasks
|
||||
|
||||
| Task | Definition | Example |
|
||||
| ----------------- | --------------------------------------------- | ------------------------------------ |
|
||||
| Classification | predict what category a new instance belongs to | is the tumor malignant/benign? |
|
||||
| Regression | predict a continuous numeric value | predict house/stock prices in future |
|
||||
| Anomaly Detection | identify outliers | fraud detection |
|
||||
| Clustering | separate similar data points into groups | customer segmentation |
|
||||
|
||||
|
||||
|
||||
### 0.4 Terminology
|
||||
|
||||
- **Feature**: also known as Attribute/ Independent Variable/ Predictor/ Input Variable. It's an individual measurable property/characteristic of a phenomenon being observed [[wiki]](https://en.wikipedia.org/wiki/Feature_(machine_learning)). The age of a person, etc.
|
||||
- **Target**: also known as Dependent Variable/ Response Variable/ Output Variable. It's the variable being predicted in supervised learning.
|
||||
- **Algorithm**: the specific procedure used to implement a particular ML technique. Linear Regression, etc.
|
||||
- **Model**: the algorithm applied to a dataset, complete with its settings (its parameters). Y=4.5x+0.8, etc. We want the model that best captures the relationship between features and the target.
|
||||
- **Supervised learning**: train the model with labeled data to generate reasonable predictions for the response to new data.
|
||||
- **Unsupervised learning**: train the model with unlabeled data to find intrinsic structures/patterns within the data.
|
||||
- **Reinforcement learning**: the model is learned from a series of actions by maximizing a reward function, which can either be maximized by penalizing bad actions and/or rewarding good actions. Self-driving, etc.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 1. Data Exploration
|
||||
|
||||
### 1.1 Variables
|
||||
|
||||
**Definition**: any measurable property/characteristic of a phenomenon being observed. They are called 'variables' because the value they take may vary (and it usually does) in a population.
|
||||
|
||||
**Types of Variable**
|
||||
|
||||
| Type | Sub-type | Definition | Example |
|
||||
| ----------- | ---------- | ------------------------------------------------------------ | ------------------------------ |
|
||||
| Categorical | Nominal | Variables with values selected from a group of categories, while not having any kind of natural order. [ref](http://www-ist.massey.ac.nz/dstirlin/CAST/CAST/Hstructures/structures_c2.html) | Gender, car types |
|
||||
| | Ordinal | A categorical variable whose categories can be meaningfully ordered. [ref](http://www-ist.massey.ac.nz/dstirlin/CAST/CAST/Hstructures/structures_c2.html) | Grade of an exam |
|
||||
| Numerical | Discrete | Variables whose values are either finite or countably infinite. [wiki](https://en.wikipedia.org/wiki/Continuous_or_discrete_variable) | Number of children in a family |
|
||||
| | Continuous | Variable which can take on infinitely many, uncountable values. [wiki](https://en.wikipedia.org/wiki/Continuous_or_discrete_variable) | House prices, time passed |
|
||||
|
||||
|
||||
|
||||
### 1.2 Variable Identification
|
||||
|
||||
**Definition**: Identify the data types of each variable.
|
||||
|
||||
**Note**: In reality we may have variables of mixed type for a variety of reasons. For example, in credit scoring "Missed payment status" is a common variable that can take the values 1, 2, 3, meaning that the customer has missed 1-3 payments in their account. It can also take the value D if the customer defaulted on that account. We may have to convert data types after certain steps of data cleaning, as in the sketch below.
|
||||
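A minimal pandas sketch of this kind of type inspection and conversion; the column names and values below are made up purely for illustration:

```python
import pandas as pd

# made-up example: a mixed-type "missed payment status" column (1-3 missed payments, or 'D' for default)
df = pd.DataFrame({'age': [25, 40, 31],
                   'missed_payment_status': ['1', '3', 'D']})

print(df.dtypes)   # missed_payment_status is stored as object (string) because of the 'D'

# one possible cleaning step: flag defaults separately, then convert the rest to numeric
df['defaulted'] = (df['missed_payment_status'] == 'D').astype(int)
df['missed_payments'] = pd.to_numeric(df['missed_payment_status'], errors='coerce')

print(df.dtypes)   # missed_payments is now numeric, with NaN where the value was 'D'
```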
|
||||
|
||||
|
||||
### 1.3 Univariate Analysis
|
||||
|
||||
Descriptive statistics on one single variable.
|
||||
|
||||
| Variable | What to look |
|
||||
| ----------- | ------------------------------------------------------------ |
|
||||
| Categorical | **Shape**:<br />Histogram/ Frequency table... |
|
||||
| Numerical | **Central Tendency**:<br />Mean/ Median/ Mode<br />**Dispersion**:<br />Min/ Max/ Range/ Quantile/ IQR/ MAD/ Variance/ Standard Deviation/ <br />**Shape**:<br />Skewness/ Histogram/ Boxplot... |
|
||||
|
||||
Below are some methods that can give us the basic stats on a variable (a minimal sketch follows the list):
|
||||
|
||||
- pandas.DataFrame.describe()
|
||||
- pandas.DataFrame.dtypes
|
||||
- Barplot
|
||||
- Countplot
|
||||
- Boxplot
|
||||
- Distplot
|
||||
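A minimal sketch of these univariate checks, reusing the titanic dataset from the demo notebooks (the plotting calls are left commented out, mirroring the notebooks):

```python
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'Age', 'Fare'])

print(data.dtypes)                      # variable types
print(data.describe())                  # central tendency and dispersion of numerical variables
print(data['Pclass'].value_counts())    # frequency table for a categorical variable

# shape of the distributions
# sns.distplot(data['Age'].dropna())    # histogram / density of a numerical variable
# sns.boxplot(y=data['Fare'])           # boxplot to inspect dispersion and outliers
```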
|
||||
|
||||
|
||||
### 1.4 Bi-variate Analysis
|
||||
|
||||
Descriptive statistics between two or more variables.
|
||||
|
||||
- Scatter Plot
|
||||
|
||||
- Correlation Plot
|
||||
- Heat Map
|
||||
|
||||
**Scatter Plot** is a type of plot or mathematical diagram using Cartesian coordinates to display values for typically two variables for a set of data. If the pattern of dots slopes from lower left to upper right, it indicates a positive correlation between the variables being studied. If the pattern of dots slopes from upper left to lower right, it indicates a negative correlation. [[wiki]](https://en.wikipedia.org/wiki/Scatter_plot)
|
||||
|
||||
**Correlation plot** can be used to quickly find insights. It is used to investigate the dependence between multiple variables at the same time and to highlight the most correlated variables in a data table.
|
||||
|
||||
**Heat map** (or heatmap) is a graphical representation of data where the individual values contained in a matrix are represented as colors.
|
||||
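A minimal sketch of these bi-variate views on the same titanic dataset (plotting calls again commented out, as in the notebooks):

```python
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'Age', 'Fare', 'Survived'])

# correlation matrix between the numerical variables
corr = data.corr()
print(corr)

# scatter plot of two variables and heat map of the correlation matrix
# plt.scatter(data['Age'], data['Fare'])
# sns.heatmap(corr, annot=True, cmap='coolwarm')
# plt.show()
```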
|
||||
|
||||
|
||||
|
||||
|
||||
## 2. Feature Cleaning
|
||||
|
||||
### 2.1 Missing Values
|
||||
|
||||
**Definition**: no value is stored in a certain observation within a variable.
|
||||
|
||||
#### 2.1.1 Why Missing Data Matters
|
||||
|
||||
- certain algorithms cannot work when missing values are present
|
||||
- even for algorithms that handle missing data, without treatment the model can lead to inaccurate conclusions
|
||||
|
||||
A study on the impact of missing data on different ML algorithms can be found [here](http://core.ecu.edu/omgt/krosj/IMDSDataMining2003.pdf).
|
||||
|
||||
#### 2.1.2 Missing Mechanisms[^1]
|
||||
|
||||
It is important to understand the mechanisms by which missing fields are introduced in a dataset. Depending on the mechanism, we may choose to process the missing values differently. The mechanisms were first introduced by Rubin[^2].
|
||||
|
||||
**Missing Completely at Random**
|
||||
|
||||
A variable is missing completely at random (MCAR) if the probability of being missing is the same for all the observations. When data is MCAR, there is absolutely no relationship between the missing data and any other values, observed or missing, within the dataset. In other words, those missing data points are a random subset of the data. There is nothing systematic going on that makes some data more likely to be missing than others.
|
||||
|
||||
If values for observations are missing completely at random, then disregarding those cases would not bias the inferences made.
|
||||
|
||||
**Missing at Random**
|
||||
|
||||
Missing at Random (MAR) occurs when there is a systematic relationship between the propensity of missing values and the observed data. In other words, the probability of an observation being missing depends only on available information (other variables in the dataset), but not on the variable itself.
|
||||
|
||||
For example, if men are more likely to disclose their weight than women, weight is MAR (on variable gender). The weight information will be missing at random for those men and women that decided not to disclose their weight, but as men are more prone to disclose it, there will be more missing values for women than for men.
|
||||
|
||||
In a situation like the above, if we decide to proceed with the variable with missing values, we might benefit from including gender to control the bias in weight for the missing observations.
|
||||
|
||||
**Missing Not At Random - Depends on Unobserved Predictors**
|
||||
|
||||
Missingness depends on information that has not been recorded, and this information also predicts the missing values. E.g., if a particular treatment causes discomfort, a patient is more likely to drop out of the study (and 'discomfort' is not measured).
|
||||
|
||||
In this situation, data sample is biased if we drop those missing cases.
|
||||
|
||||
**Missing Not At Random - Depends on Missing Value Itself**
|
||||
|
||||
Missingness depends on the (potentially missing) variable itself. E.g., people with higher earnings are less likely to reveal them.
|
||||
|
||||
|
||||
|
||||
#### 2.1.3 How to Assume a Missing Mechanism
|
||||
|
||||
- By **business understanding**. In many situations we can assume the mechanism by probing into the business logic behind that variable.
|
||||
- By **statistical test**. Divide the dataset into observations with and without the missing value and perform a t-test to see if there are significant differences. If there are, we can assume that the data are not missing completely at random (a sketch follows below).
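A minimal sketch of such a test, assuming `df` is a pandas DataFrame where 'Age' has missing values and 'Fare' is fully observed (both names are illustrative):

```python
from scipy import stats

# split a fully observed column by whether 'Age' is missing
fare_when_missing = df[df['Age'].isnull()]['Fare']
fare_when_observed = df[df['Age'].notnull()]['Fare']

# two-sample t-test: a significant difference suggests the data are not MCAR
t_stat, p_value = stats.ttest_ind(fare_when_missing, fare_when_observed, equal_var=False)
print(t_stat, p_value)
```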
|
||||
|
||||
But we should keep in mind that we can hardly be 100% sure whether data are MCAR, MAR, or MNAR, because lurking predictors are, by definition, unobserved.
|
||||
|
||||
|
||||
|
||||
#### 2.1.4 How to Handle Missing Data
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------ | :----------------------------------------------------------- |
|
||||
| Listwise Deletion | excluding all cases (listwise) that have missing values | preserve distribution if MCAR | 1. may discard too much data and hurt the model<br>2. may yield biased estimates if not MCAR (as we keep a special subsample from the population) |
|
||||
| Mean/Median/Mode Imputation | replacing the NA by mean/median/most frequent values (for categorical feature) of that variable | good practice if MCAR | 1. distort distribution<br>2. distort relationship with other variables |
|
||||
| End of distribution Imputation | replacing the NA with values at the far end of the distribution of that variable, calculated as mean + 3*std | Captures the importance of missingness if there is one | 1. distort distribution<br />2. may be mistaken for an outlier if NAs are few, or mask true outliers if NAs are many<br />3. if missingness is not important, this may mask the predictive power of the original variable |
|
||||
| Random Imputation | replacing the NA by taking a random value from the pool of available observations of that variable | preserve distribution if MCAR | not recommended in business settings for its randomness (different result for same input) |
|
||||
| Arbitrary Value Imputation | replacing the NA with arbitrary values | Captures the importance of missingness if there is one | 1. distort distribution<br />2. typically used values: -9999/9999, but be aware they may be regarded as outliers |
|
||||
| Add a variable to denote NA | creating an additional variable indicating whether the data was missing for that observation | Captures the importance of missingness if there is one | expand feature space |
|
||||
|
||||
In real settings, when it's hard to determine the missing mechanism, or there is little time to study each missing variable in depth, a popular approach is to adopt:
|
||||
|
||||
- Mean/Median/Mode Imputation (depend on the distribution)
|
||||
- End of distribution Imputation
|
||||
- Add a variable to denote NA
|
||||
|
||||
simultaneously, so that we both capture the value of missingness and obtain a complete dataset.
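A minimal sketch of this combined approach, assuming `df` is a DataFrame and 'Age' is the variable with NAs (in practice the statistics should be computed on the training set only, see section 5):

```python
# indicator of missingness (see "Add a variable to denote NA")
df['Age_missing'] = df['Age'].isnull().astype(int)

# median imputation
df['Age_median'] = df['Age'].fillna(df['Age'].median())

# end of distribution imputation: mean + 3 * std
end_tail_value = df['Age'].mean() + 3 * df['Age'].std()
df['Age_end_tail'] = df['Age'].fillna(end_tail_value)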
|
||||
|
||||
**Note**: Some algorithms, such as XGBoost, incorporate missing data treatment into their model building process, so you may not need this step. However, it's important to make sure you understand how the algorithm treats missing values and to explain that to the business team.
|
||||
|
||||
|
||||
|
||||
### 2.2 Outliers
|
||||
|
||||
**Definition**: An outlier is an observation which deviates so much from the other observations as to arouse suspicions that it was generated by a different mechanism.[^3]
|
||||
|
||||
**Note**: Outliers, depending on the context, either deserve special attention or should be completely ignored. For example, an unusual transaction on a credit card is usually a sign of fraudulent activity, while a height of 1600cm for a person is very likely due to measurement error and should be filtered out or imputed with something else.
|
||||
|
||||
#### 2.2.1 Why Outlier Matters
|
||||
|
||||
The presence of outliers may:
|
||||
|
||||
- make algorithms not work properly
|
||||
- introduce noise into the dataset
|
||||
- make samples less representative
|
||||
|
||||
Some algorithms are very sensitive to outliers. For example, AdaBoost may treat outliers as "hard" cases and put tremendous weight on them, therefore producing a model with poor generalization. Any algorithm that relies on means/variances is sensitive to outliers, as those statistics are greatly influenced by extreme values.
|
||||
|
||||
On the other hand, some algorithms are more robust to outliers. For example, decision trees tend to ignore the presence of outliers when creating their branches. Typically, trees make splits by asking whether variable x >= value t, so an outlier simply falls on one side of the split and is treated the same as the remaining values, regardless of its magnitude.
|
||||
|
||||
#### 2.2.2 Outlier Detection
|
||||
|
||||
In fact, outlier analysis and anomaly detection is a huge field of research. Charu Aggarwal's book "Outlier Analysis"[^4] offers great insight into the topic. PyOD[^5] is a comprehensive Python toolkit containing many of the advanced methods in this field.
|
||||
|
||||
All the methods listed here are for univariate outlier detection. Multivariate outlier detection is beyond the scope of this guide.
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ---------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
|
||||
| Detect by arbitrary boundary | identify outliers based on arbitrary boundaries | flexible | requires business understanding |
|
||||
| Mean & Standard Deviation method[^6][^7] | outlier detection by the Mean & Standard Deviation Method | good for variables with a Gaussian distribution (68-95-99.7 rule) | sensitive to the extreme values themselves (as outliers inflate the mean and SD) |
|
||||
| IQR method[^8] | outlier detection by the Interquartile Range Rule | more robust than the Mean & SD method as it uses quantiles & IQR; resilient to extremes | can be too aggressive |
|
||||
| MAD method[^6][^7] | outlier detection by the Median and Median Absolute Deviation Method | more robust than the Mean & SD method; resilient to extremes | can be too aggressive |
|
||||
|
||||
However, beyond these methods, it's more important to keep in mind that the business context should govern how you define and react to these outliers. The meanings of your findings should be dictated by the underlying context, rather than the number itself.
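For illustration, a minimal sketch of the IQR rule on a single numeric column (`df` and the column 'Fare' are assumptions):

```python
# IQR rule: flag values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
q1, q3 = df['Fare'].quantile(0.25), df['Fare'].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

outlier_mask = (df['Fare'] < lower) | (df['Fare'] > upper)
print(df.loc[outlier_mask, 'Fare'].head())
```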
|
||||
|
||||
|
||||
|
||||
#### 2.2.3 How to Handle Outliers
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ------------------------------- | ------------------------------------------------------------ | -------------------------------- | ------------------------------------------- |
|
||||
| Mean/Median/Mode Imputation | replacing the outlier by mean/median/most frequent values of that variable | preserve distribution | lose information of outlier if there is one |
|
||||
| Discretization | transform continuous variables into discrete variables | minimize the impact from outlier | lose information of outlier if there is one |
|
||||
| Imputation with arbitrary value | impute outliers with an arbitrary value | flexible | hard to decide the value |
|
||||
| Windsorization | top-coding & bottom coding (capping the maximum of a distribution at an arbitrarily set value, vice versa). | prevent model over-fitting | distort distribution |
|
||||
| Discard outliers | drop all the observations that are outliers | / | lose information of outlier if there is one |
|
||||
|
||||
**Note**: A detailed guide to windsorization can be found [here](https://www.statisticshowto.datasciencecentral.com/winsorize/).
|
||||
|
||||
There are many strategies for dealing with outliers in data, and depending on the context and data set, any could be the right or the wrong way. It’s important to investigate the nature of the outlier before deciding.
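For example, a minimal windsorization (capping) sketch with pandas, reusing the IQR boundaries from the detection sketch above (column name and boundaries are assumptions):

```python
# cap the column at the lower/upper boundaries instead of dropping the rows
df['Fare_capped'] = df['Fare'].clip(lower=lower, upper=upper)
```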
|
||||
|
||||
|
||||
|
||||
### 2.3 Rare Values
|
||||
|
||||
**Definition**: A categorical variable in which some of the values appear only rarely.
|
||||
|
||||
**Note**: In some situations rare values, like outliers, may contain valuable information about the dataset and therefore need particular attention. For example, a rare value in a transaction variable may indicate fraud.
|
||||
|
||||
#### 2.3.1 Why Rare Value Matters
|
||||
|
||||
- Rare values in categorical variables tend to cause over-fitting, particularly in **tree based** methods.
|
||||
- A big number of infrequent labels adds noise, with little information, therefore causing over-fitting.
|
||||
- Rare labels may be present in training set, but not in test set, therefore causing over-fitting to the train set.
|
||||
- Rare labels may appear in the test set, and not in the train set. Thus, the model will not know how to handle them.
|
||||
|
||||
#### 2.3.2 How to Handle Rare Value
|
||||
|
||||
| Method | Definition |
|
||||
| ------------------------------ | ------------------------------------------------------------ |
|
||||
| Mode Imputation | Replacing the rare label by most frequent label |
|
||||
| Grouping into one new category | Grouping the observations that show rare labels into a unique category |
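As an illustration, a minimal plain-pandas sketch of the grouping approach (the column name 'SibSp' and the 1% threshold are assumptions):

```python
# labels that appear in less than 1% of observations are grouped into 'rare'
freq = df['SibSp'].value_counts(normalize=True)
rare_labels = freq[freq < 0.01].index
df['SibSp_grouped'] = df['SibSp'].where(~df['SibSp'].isin(rare_labels), 'rare')
```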
|
||||
|
||||
Depending on the situation, we may use different strategies:
|
||||
|
||||
- when **there's one predominant category (over 90%)** in the variable: observe the relationship between that variable and the target, then either discard the variable or keep it as it was. In this case, the variable is often not useful for prediction as it is quasi-constant (as we will see later in the Feature Selection part).
|
||||
- when **there's a small number of categories**: keep it as it was, since a few categories are unlikely to bring much noise.
|
||||
- when **there's high cardinality**: try the two methods above, but they do not guarantee better results than the original variable.
|
||||
|
||||
|
||||
|
||||
### 2.4 High Cardinality
|
||||
|
||||
**Definition**: The number of labels within a categorical variable is known as cardinality. A high number of labels within a variable is known as high cardinality.
|
||||
|
||||
#### 2.4.1 Why High Cardinality Matters
|
||||
|
||||
- Variables with too many labels tend to dominate over those with only a few labels, particularly in **tree based** algorithms.
|
||||
- A big number of labels within a variable may introduce noise with little if any information, therefore making the machine learning models prone to over-fit.
|
||||
- Some of the labels may only be present in the training data set, but not in the test set, therefore causing algorithms to over-fit the training set.
|
||||
- Contrarily, new labels may appear in the test set that were not present in the training set, leaving the algorithm unable to perform a calculation over the new observations.
|
||||
|
||||
#### 2.4.2 How to Handle High Cardinality
|
||||
|
||||
| Method |
|
||||
| ------------------------------------------------------ |
|
||||
| Grouping labels with business understanding |
|
||||
| Grouping labels with rare occurrence into one category |
|
||||
| Grouping labels with decision tree |
|
||||
|
||||
All these methods attempt to group some of the labels and reduce cardinality. Grouping labels with a decision tree is equivalent to the method introduced in section 3.2.2, Discretization with decision trees, which aims to merge labels into more homogeneous groups. Grouping labels with rare occurrence into one category is equivalent to the method in section 2.3.2.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 3. Feature Engineering
|
||||
|
||||
### 3.1 Feature Scaling
|
||||
|
||||
**Definition**: Feature scaling is a method used to standardize the range of independent variables or features of data. In data processing, it is also known as data normalization and is generally performed during the data preprocessing step.
|
||||
|
||||
#### 3.1.1 Why Feature Scaling Matters
|
||||
|
||||
- If the range of inputs varies, the objective functions of some algorithms will not work properly.
|
||||
|
||||
- **Gradient descent** converges much faster with feature scaling done. Gradient descent is a common optimization algorithm used in logistic regression, SVMs, neural networks etc.
|
||||
|
||||
- Algorithms that involve **distance calculation** like KNN, Clustering are also affected by the magnitude of the feature. Just consider how Euclidean distance is calculated: taking the square root of the sum of the squared differences between observations. This distance can be greatly affected by differences in scale among the variables. Variables with large variances have a larger effect on this measure than variables with small variances.
|
||||
|
||||
**Note**: Tree-based algorithms are almost the only algorithms that are not affected by the magnitude of the input, as we can easily see from how trees are built. When deciding how to make a split, tree algorithms look for decisions like "whether feature value X > 3.0" and compute the purity of the child nodes after the split, so the scale of the feature does not matter.
|
||||
|
||||
#### 3.1.2 How to Handle Feature Scaling
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
|
||||
| Normalization - Standardization (Z-score scaling) | removes the mean and scales the data to unit variance.<br />z = (X - X.mean) / std | feature is rescaled to have a standard normal distribution centered around 0 with an SD of 1 | compresses the observations into a narrow range if the variable is skewed or has outliers, thus impairing the predictive power. |
|
||||
| Min-Max scaling | transforms features by scaling each feature to a given range. Defaults to [0,1].<br />X_scaled = (X - X.min) / (X.max - X.min) | / | compresses the observations into a narrow range if the variable is skewed or has outliers, thus impairing the predictive power. |
|
||||
| Robust scaling | removes the median and scales the data according to the quantile range (defaults to IQR)<br />X_scaled = (X - X.median) / IQR | better at preserving the spread of the variable after transformation for skewed variables | / |
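A minimal sketch of the three methods with scikit-learn (X_train / X_test are assumed to be numeric and already split):

```python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

for scaler in (StandardScaler(), MinMaxScaler(), RobustScaler()):
    scaler.fit(X_train)                        # statistics learned from the train set only
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)   # the same statistics are reused on the test set
```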
|
||||
|
||||
|
||||
|
||||
A comparison of three methods when facing outliers:
|
||||
|
||||
<div align=center>
|
||||
|
||||

|
||||
|
||||
[img source](https://stackoverflow.com/questions/51841506/data-standardization-vs-normalization-vs-robust-scaler)
|
||||
|
||||
As we can see, Normalization - Standardization and Min-Max method will compress most data to a narrow range, while robust scaler does a better job at keeping the spread of the data, although it cannot **remove** the outlier from the processed result. Remember removing/imputing outliers is another topic in data cleaning and should be done beforehand.
|
||||
|
||||
Experience on how to choose feature scaling method:
|
||||
|
||||
- if your feature is not Gaussian-like, say it has a skewed distribution or outliers, Normalization - Standardization is not a good choice as it will compress most data into a narrow range.
|
||||
- However, we can transform the feature into a Gaussian-like shape and then use Normalization - Standardization. Feature transformation will be discussed in section 3.4.
|
||||
- When performing distance or covariance calculation (algorithm like Clustering, PCA and LDA), it is better to use Normalization - Standardization as it will remove the effect of scales on variance and covariance. Explanation [here](https://blog.csdn.net/zbc1090549839/article/details/44103801).
|
||||
- Min-Max scaling has the same drawbacks as Normalization - Standardization, and in addition new data may not be bounded to [0,1] as it can fall outside the original range. Some algorithms, for example certain deep learning networks, prefer input on a 0-1 scale, so this can be a good choice for them.
|
||||
|
||||
|
||||
|
||||
Below is some additional resource on this topic:
|
||||
|
||||
- A comparison of the three methods when facing skewed variables can be found [here](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py).
|
||||
- An in-depth study of feature scaling can be found [here](http://sebastianraschka.com/Articles/2014_about_feature_scaling.html).
|
||||
|
||||
|
||||
|
||||
### 3.2 Discretize
|
||||
|
||||
**Definition**: Discretization is the process of transforming continuous variables into discrete variables by creating a set of contiguous intervals that spans the range of the variable's values.
|
||||
|
||||
#### 3.2.1 Why Discretize Matters
|
||||
|
||||
- help improve model performance by grouping similar attributes with similar predictive strengths
|
||||
- enhance interpretability with grouped values
|
||||
- minimize the impact of **extreme values/seldom reversal patterns**
|
||||
- prevent possible overfitting with numerical variables
|
||||
- allow feature interaction between continuous variables (section 3.5.5)
|
||||
|
||||
|
||||
|
||||
#### 3.2.2 How to Handle Discretization
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ----------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
|
||||
| Equal width binning | divides the scope of possible values into N bins of the same width | / | sensitive to skewed distribution |
|
||||
| Equal frequency binning | divides the scope of possible values of the variable into N bins, where each bin carries the same amount of observations | may help boost the algorithm's performance | this arbitrary binning may disrupt the relationship with the target |
|
||||
| K-means binning | using k-means to partition values into clusters | / | needs hyper-parameter tuning |
|
||||
| Discretization using decision trees | using a decision tree to identify the optimal splitting points that would determine the bins | observations within each bin are more similar to themselves than to those of other bins | 1. may cause over-fitting<br>2. may not get a good performing tree |
|
||||
| ChiMerge[^11] | supervised hierarchical bottom-up (merge) method that locally exploits the chi-square criterion to decide whether two adjacent intervals are similar enough to be merged | robust and make use of a priori knowledge | cannot handle unlabeled data |
|
||||
|
||||
In general there's no single best discretization method; it really depends on the dataset and the downstream learning algorithm. Study your features and context carefully before deciding. You can also try different methods and compare model performance.
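A minimal sketch of the first three methods (`df` with a numeric 'Fare' column is assumed; the bin counts are arbitrary):

```python
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

# equal width binning
df['Fare_eq_width'] = pd.cut(df['Fare'], bins=5, labels=False)

# equal frequency binning
df['Fare_eq_freq'] = pd.qcut(df['Fare'], q=5, labels=False, duplicates='drop')

# k-means binning (the column is assumed to have no missing values)
kmeans_binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')
df['Fare_kmeans'] = kmeans_binner.fit_transform(df[['Fare']])[:, 0]
```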
|
||||
|
||||
Some literature reviews on feature discretization can be found [here1](https://pdfs.semanticscholar.org/94c3/d92eccbb66f571153f99b7ae6c6167a00923.pdf), [here2](http://robotics.stanford.edu/users/sahami/papers-dir/disc.pdf), [here3](http://axon.cs.byu.edu/papers/ventura.thesis.ps).
|
||||
|
||||
|
||||
|
||||
### 3.3 Feature Encoding
|
||||
|
||||
#### 3.3.1 Why Feature Encoding Matters
|
||||
|
||||
We must transform the strings of categorical variables into numbers so that algorithms can handle those values. Even if an algorithm appears to accept categorical inputs, it most likely incorporates the encoding process internally.
|
||||
|
||||
#### 3.3.2 How to Handle Feature Encoding
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
|
||||
| One-hot encoding | replace the categorical variable by different boolean variables (0/1) to indicate whether or not certain label is true for that observation | keep all information of that variable | 1. expand feature space dramatically if too many labels in that variable<br />2. does not add additional value to make the variable more predictive |
|
||||
| Ordinal-encoding | replace the labels by some ordinal number if ordinal is meaningful | straightforward | does not add additional value to make the variable more predictive |
|
||||
| Count/frequency encoding | replace each label of the categorical variable by the count/frequency of that category | / | 1. may yield the same encoding for two different labels (if they appear the same number of times) and thus lose valuable info<br />2. may not add predictive power |
|
||||
| Mean encoding | replace the label by the mean of the target for that label. (the target must be 0/1 valued or continuous) | 1. Capture information within the label, therefore rendering more predictive features<br/>2. Create a monotonic relationship between the variable and the target<br>3. Do not expand the feature space | Prone to cause over-fitting |
|
||||
| WOE encoding[^9] | replace the label with the Weight of Evidence of each label. WOE is computed from the basic odds ratio: ln( (Proportion of Good Outcomes) / (Proportion of Bad Outcomes) ) | 1. Establishes a monotonic relationship to the dependent variable<br/>2. Orders the categories on a "logistic" scale which is natural for logistic regression<br>3. The transformed variables can then be compared because they are on the same scale, so it is possible to determine which one is more predictive | 1. May incur a loss of information (variation) due to binning into few categories<br/>2. Prone to cause over-fitting |
|
||||
| Target encoding[^10] | Similar to mean encoding, but use both posterior probability and prior probability of the target | 1. Capture information within the label, therefore rendering more predictive features<br/>2. Create a monotonic relationship between the variable and the target<br/>3. Do not expand the feature space | Prone to cause over-fitting |
|
||||
|
||||
**Note**: if we are using one-hot encoding in linear regression, we should keep only k-1 binary variables to avoid multicollinearity. This holds for any algorithm that looks at all features at the same time during training, including SVMs, neural networks and clustering. Tree-based algorithms, on the other hand, need the entire set of binary variables to select the best split.
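A minimal sketch of k-1 one-hot encoding with pandas (the column 'Sex' is an assumption for illustration):

```python
import pandas as pd

# drop_first=True keeps k-1 dummies, which is what linear models need
dummies = pd.get_dummies(df['Sex'], prefix='Sex', drop_first=True)
df = pd.concat([df.drop(columns=['Sex']), dummies], axis=1)
```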
|
||||
|
||||
An in-detail intro to WOE can be found [here](http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview).
|
||||
|
||||
|
||||
|
||||
### 3.4 Feature Transformation
|
||||
|
||||
#### 3.4.1 Why Feature Transformation Matters
|
||||
|
||||
##### 3.4.1.1 Linear Assumption
|
||||
|
||||
**Regression**
|
||||
|
||||
Linear regression is a straightforward approach for predicting a quantitative response Y on the basis of one or more predictor variables X1, X2, ... Xn. It assumes that there is a linear relationship between X(s) and Y. Mathematically, we can write this linear relationship as Y ≈ β0 + β1X1 + β2X2 + ... + βnXn.
|
||||
|
||||
**Classification**
|
||||
|
||||
Similarly, for classification, Logistic Regression assumes a linear relationship between the variables and the log of the odds.
|
||||
|
||||
Odds = p / (1 - p), where p is the probability of y = 1
|
||||
|
||||
log(odds) = β0 + β1X1 + β2X2 + ... + βnXn
|
||||
|
||||
**Why it's important to follow linear assumption**
|
||||
|
||||
If the machine learning model assumes a linear dependency between the predictors Xs and the outcome Y, when no such linear relationship exists, the model will perform poorly. In such cases, we are better off trying another machine learning model that does not make this assumption.
|
||||
|
||||
If there is no linear relationship and we have to use the linear/logistic regression models, mathematical transformation/discretization may help create the relationship, though it cannot guarantee a better result.
|
||||
|
||||
##### 3.4.1.2 Variable Distribution
|
||||
|
||||
**Linear Regression Assumptions**
|
||||
|
||||
Linear Regression has the following assumptions over the predictor variables X:
|
||||
|
||||
- Linear relationship with the outcome Y
|
||||
|
||||
- Multivariate normality
|
||||
- No or little multicollinearity
|
||||
- Homoscedasticity
|
||||
|
||||
Normality assumption means that every variable X should follow a Gaussian distribution.
|
||||
|
||||
Homoscedasticity, also known as homogeneity of variance, describes a situation in which the error term (that is, the “noise” or random disturbance in the relationship between the independent variables (Xs) and the dependent variable (Y)) is the same across all values of the independent variables.
|
||||
|
||||
Violations in the assumptions of homoscedasticity and / or normality (assuming a distribution of data is homoscedastic or Gaussian, when in reality it is not) may result in poor model performance.
|
||||
|
||||
The remaining machine learning models, including Neural Networks, Support Vector Machines, Tree based methods and PCA, do not make any assumption over the distribution of the independent variables. However, on many occasions the model performance may **benefit from a "Gaussian-like" distribution**.
|
||||
|
||||
Why might models benefit from a "Gaussian-like" distribution? In variables with a normal distribution, the observations of X available to predict Y vary across a greater range of values; that is, the values of X are "spread" over a greater range.
|
||||
|
||||
In the situations above, transformation of the original variable can help give it more of the bell shape of the Gaussian distribution.
|
||||
|
||||
#### 3.4.2 How to Handle Feature Transformation
|
||||
|
||||
| Method | Definition |
|
||||
| --------------------------- | -------------------------------------------------------- |
|
||||
| Logarithmic transformation  | log(x+1). We use (x+1) instead of x to avoid taking the log of 0 |
|
||||
| Reciprocal transformation  | 1/x. Note that x must not be 0. |
|
||||
| Square root transformation | x**(1/2) |
|
||||
| Exponential transformation | X**(m) |
|
||||
| Box-cox transformation[^12] | (X**λ-1)/λ |
|
||||
| Quantile transformation | transform features using quantiles information |
|
||||
|
||||
**Log transformation** is useful when applied to skewed distributions as they tend to expand the values which fall in the range of lower magnitudes and tend to compress or reduce the values which fall in the range of higher magnitudes, which helps to make the skewed distribution as normal-like as possible. **Square root transformation** does a similar thing in this sense.
|
||||
|
||||
**Box-Cox transformation** in sklearn[^13] is another popular function belonging to the power transform family of functions. This function has a pre-requisite that the numeric values to be transformed must be positive (similar to what log transform expects). In case they are negative, shifting using a constant value helps. Mathematically, the Box-Cox transform function can be denoted as follows.
|
||||
|
||||

|
||||
|
||||
**Quantile transformation** in sklearn[^14] transforms the features to follow a uniform or a normal distribution. Therefore, for a given feature, this transformation tends to spread out the most frequent values. It also reduces the impact of (marginal) outliers: this is therefore a robust preprocessing scheme. However, this transform is non-linear. It may distort linear correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable.
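A minimal sketch of these transformations, assuming `df` has a non-negative, skewed numeric column 'Fare' (the +1 shift is only there to keep the Box-Cox input strictly positive):

```python
import numpy as np
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

# log(x+1)
df['Fare_log'] = np.log1p(df['Fare'])

# Box-Cox requires strictly positive input, hence the shift by a constant
boxcox = PowerTransformer(method='box-cox')
df['Fare_boxcox'] = boxcox.fit_transform(df[['Fare']] + 1)[:, 0]

# quantile transformation to a normal output distribution
quantile = QuantileTransformer(output_distribution='normal', n_quantiles=100)
df['Fare_quantile'] = quantile.fit_transform(df[['Fare']])[:, 0]
```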
|
||||
|
||||
|
||||
|
||||
We can use **Q-Q plot** to check if the variable is normally distributed (a 45 degree straight line of the values over the theoretical quantiles) after transformation.
|
||||
|
||||
Below is an example showing the effect of sklearn's Box-Cox/Yeo-Johnson/Quantile transforms mapping data from various distributions to a normal distribution.
|
||||
|
||||
<div align=center>
|
||||
|
||||

|
||||
|
||||
[img source](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_map_data_to_normal.html#sphx-glr-auto-examples-preprocessing-plot-map-data-to-normal-py)
|
||||
|
||||
On “small” datasets (less than a few hundred points), the quantile transformer is prone to overfitting. The use of the power transform is then recommended.
|
||||
|
||||
|
||||
|
||||
|
||||
### 3.5 Feature Generation
|
||||
|
||||
**Definition**: Creating new features as a combination of existing ones. It's a great way to add domain knowledge to the dataset.
|
||||
|
||||
#### 3.5.1 Missing Data Derived Feature
|
||||
|
||||
As mentioned in section 2.1, we can create a new binary feature (0/1) denoting whether an observation has a missing value in the raw feature.
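A one-line sketch, assuming 'Age' is a raw feature with missing values:

```python
# 1 if the raw value is missing, 0 otherwise
df['Age_is_missing'] = df['Age'].isnull().astype(int)
```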
|
||||
|
||||
#### 3.5.2 Simple Statistical Derived Feature
|
||||
|
||||
Creating new features by performing simple statistical calculations on the raw features, including:
|
||||
|
||||
- count/sum
|
||||
- average/median/mode
|
||||
- max/min/stddev/variance/range/IQR/Coefficient of Variation
|
||||
- time span/interval
|
||||
|
||||
Take call logs for example: we can create new features like number of calls, number of calls in/out, average call duration, monthly average call duration, max call duration, etc.
|
||||
|
||||
#### 3.5.3 Feature Crossing
|
||||
|
||||
After having some simple statistical derived features, we can have them crossed together. Common dimensions used for crossing include:
|
||||
|
||||
- time
|
||||
- region
|
||||
- business types
|
||||
|
||||
Still taking call logs as an example, we can have crossed features like: number of calls during night time/day time, number of calls under different business types (banks/taxi services/travel/hospitality), number of calls during the past 3 months, etc. Many of the statistical calculations mentioned in section 3.5.2 can be used again to create more features.
|
||||
|
||||
**Note**: An open-source python framework named **Featuretools** that helps automatically generate such features can be found [here](https://github.com/Featuretools/featuretools).
|
||||
|
||||

|
||||
|
||||
Personally I haven't used it in practice. You may try it and see whether it suits industry use.
|
||||
|
||||
#### 3.5.4 Ratios and Proportions
|
||||
|
||||
These are common techniques. For example, in order to predict the future credit card sales performance of a branch, ratios like credit card sales / salesperson or credit card sales / marketing spend would be more powerful than just using the absolute number of cards sold in the branch.
|
||||
|
||||
#### 3.5.5 Cross Products between Categorical Features
|
||||
|
||||
Consider a categorical feature A, with two possible values {A1, A2}. Let B be a feature with possibilities {B1, B2}. Then, a feature-cross between A & B would take one of the following values: {(A1, B1), (A1, B2), (A2, B1), (A2, B2)}. You can basically give these ‘combinations’ any names you like. Just remember that every combination denotes a synergy between the information contained by the corresponding values of A and B.
|
||||
|
||||
This is an extremely useful technique when certain features together denote a property better than they do individually. Mathematically speaking, you are taking the cross product of all possible values of the categorical features. The concept is similar to the Feature Crossing of section 3.5.3, but this one refers particularly to the crossing of two categorical features.
|
||||
|
||||
#### 3.5.6 Polynomial Expansion
|
||||
|
||||
The cross product can also be applied to numerical features, which results in a new interaction feature between A and B. This can be done easily with sklearn's [PolynomialFeatures](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures), which generates a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree. For example, three raw features {X1, X2, X3} with a degree of 2 generate the feature set {1, X1, X2, X3, X1X2, X1X3, X2X3, X1², X2², X3²}.
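A minimal sketch (X is assumed to be a numeric array or DataFrame with three columns):

```python
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=True)
X_poly = poly.fit_transform(X)   # 1, X1, X2, X3, X1^2, X1*X2, X1*X3, X2^2, X2*X3, X3^2
```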
|
||||
|
||||
#### 3.5.7 Feature Learning by Trees
|
||||
|
||||
In tree-based algorithms, each sample is assigned to a particular leaf node. The decision path to each node can be seen as a new non-linear feature, and we can create N new binary features, where N equals the total number of leaf nodes in the tree or tree ensemble. The features can then be fed into other algorithms such as logistic regression.
|
||||
|
||||
The idea of using tree algorithms to generate new features was first introduced by Facebook in this [paper](http://quinonero.net/Publications/predicting-clicks-facebook.pdf).
|
||||
|
||||
The good thing about this method is that we get complex combinations of several features, which are informative (as they are constructed by the tree's learning algorithm). This saves us much time compared to doing feature crossing manually, and it is widely used for CTR (click-through rate) prediction in the online advertising industry.
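A minimal sketch of this approach (X_train, y_train, X_test are assumptions; any tree ensemble exposing `apply()` would do):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

rf = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=0)
rf.fit(X_train, y_train)

# apply() returns the leaf index each sample falls into, one column per tree
leaves_train = rf.apply(X_train)
leaves_test = rf.apply(X_test)

# one-hot encode the leaf indices and feed them to a logistic regression
encoder = OneHotEncoder(handle_unknown='ignore')
lr = LogisticRegression(solver='lbfgs')
lr.fit(encoder.fit_transform(leaves_train), y_train)
proba = lr.predict_proba(encoder.transform(leaves_test))[:, 1]
```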
|
||||
|
||||
#### 3.5.8 Feature Learning by Deep Networks
|
||||
|
||||
As we can see from all the above, manual feature generation takes a lot of effort and may not guarantee good returns, particularly when we have huge numbers of features to work with. Feature learning with trees can be seen as an early attempt at creating features automatically, and since deep learning methods came into fashion around 2016, they have also achieved some success in this area, for example **autoencoders** and **restricted Boltzmann machines**. They have been shown to automatically, and in an unsupervised or semi-supervised way, learn abstract representations of features (a compressed form) that in turn have supported state-of-the-art results in domains such as speech recognition, image classification, object recognition and other areas. However, such features have limited interpretability, and deep learning requires much more data to extract high-quality results.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 4. Feature Selection
|
||||
|
||||
**Definition**: Feature Selection is the process of selecting a subset of relevant features for use in machine learning model building.
|
||||
|
||||
It is not always true that more data leads to better results. Including irrelevant features (those unhelpful to the prediction) and redundant features (irrelevant in the presence of others) will only overwhelm the learning process and easily cause overfitting.
|
||||
|
||||
With feature selection, we can have:
|
||||
|
||||
- simplification of models to make them easier to interpret
|
||||
- shorter training times and lower computational cost
|
||||
- lower cost in data collection
|
||||
- avoid the curse of dimensionality
|
||||
- enhanced generalization by reducing overfitting
|
||||
|
||||
We should keep in mind that different feature subsets render optimal performance for different algorithms, so feature selection is not a process separate from model training. Therefore, if we are selecting features for a linear model, it is better to use selection procedures targeted at those models, like importance by regression coefficients or Lasso. And if we are selecting features for trees, it is better to use tree-derived importance.
|
||||
|
||||
|
||||
|
||||
### 4.1 Filter Method
|
||||
|
||||
Filter methods select features based on a performance measure regardless of the ML algorithm later employed.
|
||||
|
||||
Univariate filters evaluate and rank a single feature according to a certain criterion, while multivariate filters evaluate the entire feature space. Filter methods are:
|
||||
|
||||
- selecting variables regardless of the model
|
||||
- less computationally expensive
|
||||
- usually give lower prediction performance
|
||||
|
||||
As a result, filter methods are suited for a quick first-step screening and removal of irrelevant features.
|
||||
|
||||
| Method | Definition |
|
||||
| ------------------------- | ------------------------------------------------------------ |
|
||||
| Variance | removing features that show the same value for the majority/all of the observations (constant/quasi-constant features) |
|
||||
| Correlation | remove features that are highly correlated with each other |
|
||||
| Chi-Square | Compute chi-squared stats between each non-negative feature and class |
|
||||
| Mutual Information Filter | Mutual information measures how much information the presence/absence of a feature contributes to making the correct prediction on Y. |
|
||||
| Univariate ROC-AUC or MSE | builds one decision tree per feature, to predict the target, then make predictions and ranks the features according to the machine learning metric (roc-auc or mse) |
|
||||
| Information Value (IV) | a byproduct of WOE. <br>IV = Σ(Proportion of Good Outcomes - Proportion of Bad Outcomes) * WOE |
|
||||
|
||||
WOE encoding (see section 3.3.2) and IV often go hand in hand in scorecard development. Both concepts derive from logistic regression and are standard practice in the credit card industry. IV is a popular and widely used measure, as there are very convenient rules of thumb for variable selection associated with IV, shown below:
|
||||
|
||||

|
||||
|
||||
However, all these filtering methods fail to consider the interaction between features and may reduce our predictive power. Personally I only use variance and correlation to filter out absolutely unnecessary features, as sketched below.
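A minimal sketch of these two filters (X is assumed to be a numeric DataFrame; the thresholds are arbitrary):

```python
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# drop constant / quasi-constant features (variance below the threshold)
selector = VarianceThreshold(threshold=0.01)
X_reduced = selector.fit_transform(X)

# drop one feature from each highly correlated pair
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
X_uncorrelated = X.drop(columns=to_drop)
```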
|
||||
|
||||
|
||||
|
||||
**Note**: One thing to keep in mind when using chi-square tests or univariate selection methods is that in very big datasets most of the features will show a small p_value, and therefore look highly predictive. This is in fact an effect of the sample size, so care should be taken when selecting features using these procedures. An ultra-tiny p_value does not highlight an ultra-important feature; it rather indicates that the dataset contains a very large number of samples.
|
||||
|
||||
**Note**: Correlated features do not necessarily affect model performance (trees, etc), but high dimensionality does and too many features hurt model interpretability. So it's always better to reduce correlated features.
|
||||
|
||||
|
||||
|
||||
### 4.2 Wrapper Method
|
||||
|
||||
Wrappers use a search strategy to search through the space of possible feature subsets and evaluate each subset by the quality of the performance on a ML algorithm. Practically any combination of search strategy and algorithm can be used as a wrapper. It is featured as:
|
||||
|
||||
- use ML models to score the feature subset
|
||||
- train a new model on each subset
|
||||
- very computationally expensive
|
||||
- usually provide the best performing subset for a given ML algorithm, but probably not for another
|
||||
- need an arbitrarily defined stopping criterion
|
||||
|
||||
The most common **search strategy** group is sequential search, including Forward Selection, Backward Elimination and Exhaustive Search. Randomized search is another popular choice, including evolutionary computation algorithms such as genetic algorithms, and simulated annealing.
|
||||
|
||||
Another key element in wrappers is the **stopping criterion**. When should the search stop? In general there are three options:
|
||||
|
||||
- performance increase
|
||||
- performance decrease
|
||||
- predefined number of features is reached
|
||||
|
||||
|
||||
|
||||
#### 4.2.1 Forward Selection
|
||||
|
||||
Step forward feature selection starts by evaluating all features individually and selects the one that generates the best performing algorithm, according to a pre-set evaluation criteria. In the second step, it evaluates all possible combinations of the selected feature and a second feature, and selects the pair that produce the best performing algorithm based on the same pre-set criteria.
|
||||
|
||||
The pre-set criteria can be the roc_auc for classification and the r squared for regression for example.
|
||||
|
||||
This selection procedure is called greedy because it evaluates all possible single, then double, then triple (and so on) feature combinations. It is therefore quite computationally expensive and sometimes, if the feature space is big, even infeasible.
|
||||
|
||||
There is a Python package that implements this type of feature selection: [mlxtend](https://github.com/rasbt/mlxtend).
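A minimal sketch with mlxtend's SequentialFeatureSelector (all parameter values are illustrative; X_train, y_train are assumptions):

```python
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier

sfs = SFS(RandomForestClassifier(n_estimators=10, random_state=0),
          k_features=10,        # stop once 10 features have been selected
          forward=True,         # step forward selection
          floating=False,
          scoring='roc_auc',
          cv=3)
sfs = sfs.fit(X_train, y_train)
print(sfs.k_feature_idx_)       # indices of the selected features
```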
|
||||
|
||||
#### 4.2.2 Backward Elimination
|
||||
|
||||
Step backward feature selection starts by fitting a model using all features. Then it removes one feature: the one whose removal produces the highest performing algorithm (i.e., the least statistically significant feature) for a certain evaluation criterion. In the second step, it removes a second feature, again the one whose removal produces the best performing algorithm. And it proceeds, removing feature after feature, until a certain criterion is met.
|
||||
|
||||
The pre-set criteria can be the roc_auc for classification and the r squared for regression for example.
|
||||
|
||||
#### 4.2.3 Exhaustive Feature Selection
|
||||
|
||||
In an exhaustive feature selection the best subset of features is selected, over all possible feature subsets, by optimizing a specified performance metric for a certain machine learning algorithm. For example, if the classifier is a logistic regression and the dataset consists of **4** features, the algorithm will evaluate all **15** feature combinations as follows:
|
||||
|
||||
- all possible combinations of 1 feature
|
||||
- all possible combinations of 2 features
|
||||
- all possible combinations of 3 features
|
||||
- all the 4 features
|
||||
|
||||
and select the one that results in the best performance (e.g., classification accuracy) of the logistic regression classifier.
|
||||
|
||||
This exhaustive search is very computationally expensive; in practice, because of this cost, it is rarely used.
|
||||
|
||||
#### 4.2.4 Genetic Algorithm
|
||||
|
||||
TODO
|
||||
|
||||
|
||||
|
||||
### 4.3 Embedded Method
|
||||
|
||||
Embedded methods combine the advantages of the filter and wrapper methods. A learning algorithm takes advantage of its own variable selection process and performs feature selection and classification at the same time. Common embedded methods include Lasso and various types of tree-based algorithms. They are featured as:
|
||||
|
||||
- perform feature selection as part of the model building process
|
||||
- consider interactions between features
|
||||
- less computationally expensive than wrappers, as the model is only trained once
|
||||
- usually provide the best performing subset for a given ML algorithm, but probably not for another
|
||||
|
||||
|
||||
|
||||
#### 4.3.1 Regularization with Lasso
|
||||
|
||||
Regularization consists of adding a penalty to the parameters of the machine learning model to reduce its freedom. Hence, the model will be less likely to fit the noise of the training data and thus less likely to overfit.
|
||||
|
||||
In linear model regularization, the penalty is applied over the coefficients that multiply each of the predictors. For linear models there are in general 3 types of regularization:
|
||||
|
||||
- L1 regularization (Lasso)
|
||||
- L2 regularization (Ridge)
|
||||
- L1/L2 (Elastic net)
|
||||
|
||||
Among the different types of regularization, **Lasso (L1)** has the property that it is able to shrink some of the coefficients to zero. Therefore, the corresponding features can be removed from the model.
|
||||
|
||||
For both linear and logistic regression we can use Lasso regularization to remove non-important features. Keep in mind that increasing the penalty will increase the number of features removed. Therefore, you need to monitor the penalty: not so high that it removes even important features, nor so low that it fails to remove non-important ones.
|
||||
|
||||
Having said this, if the penalty is too high and important features are removed, you should notice a drop in the performance of the algorithm and then realize that you need to decrease the regularization.
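A minimal sketch of L1-based selection with scikit-learn's SelectFromModel (a scaled X_train / y_train is assumed; C is the inverse of the penalty strength, so smaller means stronger):

```python
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

lasso_selector = SelectFromModel(
    LogisticRegression(penalty='l1', C=0.5, solver='liblinear'))
lasso_selector.fit(X_train, y_train)

# features whose coefficients were not shrunk to zero
selected = X_train.columns[lasso_selector.get_support()]
print(selected)
```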
|
||||
|
||||
Regularization is a large topic. For more information you can refer to:
|
||||
|
||||
- [Least angle and l1 penalised regression: A review](https://projecteuclid.org/download/pdfview_1/euclid.ssu/1211317636)
|
||||
- [Penalised feature selection and classification in bioinformatics](https://www.ncbi.nlm.nih.gov/pubmed/18562478)
|
||||
- [Feature selection for classification: A review](https://web.archive.org/web/20160314145552/http://www.public.asu.edu/~jtang20/publication/feature_selection_for_classification.pdf)
|
||||
|
||||
- [Machine Learning Explained: Regularization](https://www.r-bloggers.com/machine-learning-explained-regularization/)
|
||||
|
||||
|
||||
|
||||
#### 4.3.2 Random Forest Importance
|
||||
|
||||
Random forests are one of the most popular machine learning algorithms. They are so successful because they provide in general a good predictive performance, low overfitting and easy interpretability. This interpretability is given by the fact that it is straightforward to derive the importance of each variable on the tree decision. In other words, it is easy to compute how much each variable is contributing to the decision.
|
||||
|
||||
Random forest is a bagging algorithm consisting of a bunch of base estimators (decision trees), each of them built over a random extraction of observations from the dataset and a random extraction of features. Not every tree sees all the features or all the observations, and this guarantees that the trees are **de-correlated** and therefore **less prone to over-fitting.**
|
||||
|
||||
Each tree is a sequence of yes/no questions based on a single feature or a combination of features. At each split, the question divides the dataset into 2 buckets, each of them hosting observations that are more similar among themselves and different from the ones in the other bucket. Therefore, the importance of each feature is derived from how "**pure**" each of the buckets is.
|
||||
|
||||
For classification, the measure of impurity is either the **Gini impurity** or the **information gain/entropy**. For regression the measure of impurity is **variance**. Therefore, when training a tree, it is possible to compute how much each feature decreases the impurity. The more a feature decreases the impurity, the more important the feature is. In random forests, the impurity decrease from each feature can be averaged across trees to determine the final importance of the variable.
|
||||
|
||||
Selecting features by using tree-derived feature importance is a very straightforward, fast and generally accurate way of selecting good features for machine learning, in particular if you are going to build tree-based models.
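A minimal sketch (X_train, y_train are assumptions; the cut-off of 10 features is arbitrary):

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)

# rank features by impurity-based importance and keep the top ones
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
top_features = importances.sort_values(ascending=False).head(10).index
```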
|
||||
|
||||
However, correlated features will show in a tree similar and lowered importance, compared to what their importance would be if the tree was built without correlated counterparts.
|
||||
|
||||
**Limitation**
|
||||
|
||||
- correlated features show similar importance
|
||||
|
||||
- a correlated feature's importance is lower than its real importance, i.e. what it would be if the tree were built without its correlated counterparts
|
||||
|
||||
- high-cardinality variables tend to show higher importance
|
||||
|
||||
|
||||
#### 4.3.3 Gradient Boosted Trees Importance
|
||||
|
||||
Similarly to selecting features using Random Forests derived feature importance, we can select features based on the importance derived by gradient boosted trees. And we can do that in one go, or in a recursive manner, depending on how much time we have, how many features are in the dataset, and whether they are correlated or not.
|
||||
|
||||
|
||||
|
||||
### 4.4 Feature Shuffling
|
||||
|
||||
A popular method of feature selection consists of randomly shuffling the values of a specific variable and determining how that permutation affects the performance metric of the machine learning algorithm. In other words, the idea is to permute the values of each feature, one at a time, and measure how much the permutation decreases the accuracy or the roc_auc, or increases the mse, of the machine learning model. If a variable is important, that is, highly predictive, a random permutation of its values will dramatically degrade any of these metrics. Contrarily, non-important / non-predictive variables should have little to no effect on the model performance metric we are assessing.
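A minimal sketch of the idea, assuming an already-fitted classifier `model` and a hold-out set X_test, y_test, with roc_auc as the metric:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

baseline = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
drops = {}
rng = np.random.RandomState(0)

for col in X_test.columns:
    X_shuffled = X_test.copy()
    X_shuffled[col] = rng.permutation(X_shuffled[col].values)   # shuffle one feature
    score = roc_auc_score(y_test, model.predict_proba(X_shuffled)[:, 1])
    drops[col] = baseline - score   # a large drop indicates an important feature
```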
|
||||
|
||||
|
||||
|
||||
### 4.5 Hybrid Method
|
||||
|
||||
#### 4.5.1 Recursive Feature Elimination
|
||||
|
||||
This method consists of the following steps:
|
||||
|
||||
1. Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge, or the linear / logistic regression coefficients.
|
||||
2. Remove one feature -the least important- and build a machine learning algorithm utilizing the remaining features.
|
||||
|
||||
3. Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.
|
||||
4. If the metric decreases by more than an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.
|
||||
5. Repeat steps 2-4 until all features have been removed (and therefore evaluated) and the drop in performance assessed.
|
||||
|
||||
The method combines a selection process like wrappers with feature importance derived from ML models like embedded methods, hence it's called a hybrid method.
|
||||
|
||||
The difference between this method and step backward feature selection is that it does not try removing every feature first in order to determine which one to drop. It removes the least important one, based on the model-derived importance, and then assesses whether that feature should really be removed. So it evaluates each feature only once during selection, whereas step backward feature selection evaluates the removal of every remaining feature at each step.
|
||||
|
||||
This method is therefore faster than wrapper methods and generally better than embedded methods. In practice it works extremely well. It also accounts for correlations (depending on how stringent you set the arbitrary performance drop threshold). On the downside, the performance drop threshold used to decide whether a feature should be kept or removed is set arbitrarily. The smaller the threshold, the more features will be selected, and vice versa.
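scikit-learn's RFECV implements a closely related procedure (it recursively drops the least important feature and keeps the subset with the best cross-validated score rather than using an explicit drop threshold); a minimal sketch, with X_train, y_train as assumptions:

```python
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

rfecv = RFECV(RandomForestClassifier(n_estimators=50, random_state=0),
              step=1, cv=3, scoring='roc_auc')
rfecv.fit(X_train, y_train)

# boolean mask of the features kept after recursive elimination
selected = X_train.columns[rfecv.support_]
```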
|
||||
|
||||
|
||||
|
||||
**Example: Recursive Feature Elimination with Random Forests Importance**
|
||||
|
||||
As we talked about in section 4.3.2, Random Forests assign equal or similar importance to features that are highly correlated. In addition, when features are correlated, the importance assigned is lower than the importance attributed to the feature itself, should the tree be built without the correlated counterparts.
|
||||
|
||||
Therefore, instead of eliminating features based on importance **at one time** (from all initial features), we may get a better selection by removing one feature **recursively**, and recalculating the importance on each round.
|
||||
|
||||
In this situation, when a feature that is highly correlated to another one is removed, then, the importance of the remaining feature increases. This may lead to a better subset feature space selection. On the downside, building several random forests is quite time consuming, in particular if the dataset contains a high number of features.
|
||||
|
||||
#### 4.5.2 Recursive Feature Addition
|
||||
|
||||
This method consists of the following steps:
|
||||
|
||||
1. Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge, or the linear / logistic regression coefficients.
|
||||
2. Build a machine learning model with only 1 feature, the most important one, and calculate the model metric for performance.
|
||||
|
||||
3. Add one feature -the most important- and build a machine learning algorithm utilizing the added and any feature from previous rounds.
|
||||
|
||||
4. Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.
|
||||
|
||||
5. If the metric increases by more than an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.
|
||||
|
||||
6. Repeat steps 3-5 until all features have been added (and therefore evaluated) and the gain in performance assessed.
|
||||
|
||||
The difference between this method and step forward feature selection is analogous: it does not evaluate all remaining features at each step to determine which one to add, so it's faster than wrappers.
|
||||
|
||||
|
||||
|
||||
### 4.6 Dimensionality Reduction
|
||||
|
||||
- PCA (Principal Component Analysis)
|
||||
|
||||
- SVD (Singular Value Decomposition)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 5. Data Leakage
|
||||
|
||||
This section is a reminder to myself, as I have made huge mistakes by not being aware of this problem. Data leakage is when information from outside the training dataset is used to create the model[^15]. The result is that you may be creating overly optimistic models that are practically useless and cannot be used in production. The model shows great results on both your training and testing data, but in fact this is not because the model truly generalizes well; it is because it uses information from the test data.
|
||||
|
||||
While it is well known that cross-validation (or at least a separate validation set) should be used when training and evaluating models, people may easily forget to do the same during the feature engineering & selection process. Keep in mind that the test dataset must not be used in any way to make choices about the model, including in feature engineering & selection.
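A minimal sketch of the leakage-free pattern: split first, then fit every transformer on the training data only (X, y and the scaler are illustrative; the same applies to imputers, encoders and selectors):

```python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

scaler = StandardScaler().fit(X_train)       # statistics learned from the train set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)     # the test set is only transformed, never fitted
```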
|
||||
|
||||
|
||||
|
||||
------
|
||||
|
||||
**Reference**
|
||||
|
||||
[^1]: http://www.simonqueenborough.info/R/basic/missing-data
|
||||
[^2]: Rubin, D. B. (1976). Inference and missing data. Biometrika 63(3): 581-592.
|
||||
[^3]: D. Hawkins. Identification of Outliers, Chapman and Hall , 1980.
|
||||
[^4]: https://www.springer.com/gp/book/9781461463955
|
||||
[^5]: https://github.com/yzhao062/pyod
|
||||
[^6]: https://docs.oracle.com/cd/E40248_01/epm.1112/cb_statistical/frameset.htm?ch07s02s10s01.html
|
||||
[^7]: https://www.academia.edu/5324493/Detecting_outliers_Do_not_use_standard_deviation_around_the_mean_use_absolute_deviation_around_the_median
|
||||
[^8]: https://www.purplemath.com/modules/boxwhisk3.htm
|
||||
[^9]: http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview
|
||||
[^10]: A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems. https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
|
||||
[^11]: https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf
|
||||
[^12]: http://onlinestatbook.com/2/transformations/box-cox.html
|
||||
[^13]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html#sklearn.preprocessing.PowerTransformer
|
||||
[^14]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html#sklearn.preprocessing.QuantileTransformer
|
||||
[^15]: https://machinelearningmastery.com/data-leakage-machine-learning/
|
||||
204
README.md
Normal file
@@ -0,0 +1,204 @@
|
||||
# Feature Engineering & Feature Selection
|
||||
|
||||
## About
|
||||
|
||||
A comprehensive [guide]() for **Feature Engineering** and **Feature Selection**, with implementations and examples in Python.
|
||||
|
||||
|
||||
|
||||
## What You'll Learn
|
||||
|
||||
Not only a collection of hands-on functions, but also explanations of **Why**, **How** and **When** to adopt **Which** techniques of feature engineering in data mining.
|
||||
|
||||
- the nature and risks of the data problems we often encounter
|
||||
- explanation of the various feature engineering & selection techniques
|
||||
- the rationale for using each technique
|
||||
- pros & cons of each method
|
||||
- code & example
|
||||
|
||||
|
||||
|
||||
## Getting Started
|
||||
|
||||
This repo is mainly used as a reference for anyone who is doing feature engineering, and most of the modules are implemented with scikit-learn and packages from its community.
|
||||
|
||||
To run the demos or use the customized functions, please download the ZIP file from the repo or just copy-paste any part of the code you find helpful. They should all be very easy to understand.
|
||||
|
||||
**Required Dependencies**:
|
||||
|
||||
- Python 3.5, 3.6 or 3.7
|
||||
- numpy>=1.15
|
||||
- pandas>=0.23
|
||||
- scipy>=1.1.0
|
||||
- scikit_learn>=0.20.1
|
||||
- seaborn>=0.9.0
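If it helps, a quick and purely illustrative way to confirm that the installed versions satisfy the list above:

```python
# Print the installed versions of the required dependencies.
import numpy, pandas, scipy, sklearn, seaborn

for pkg in (numpy, pandas, scipy, sklearn, seaborn):
    print(pkg.__name__, pkg.__version__)
```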
|
||||
|
||||
|
||||
|
||||
## Table of Contents and Code Examples
|
||||
|
||||
Below is a list of methods currently implemented in the repo. The complete guide can be found [here]().
|
||||
|
||||
**1. Data Exploration**
|
||||
|
||||
1.1 Variables
|
||||
1.2 Variable Identification
|
||||
Check Data Types
|
||||
1.3 Univariate Analysis
|
||||
Descriptive Analysis
|
||||
Discrete Variable Barplot
|
||||
Discrete Variable Countplot
|
||||
Discrete Variable Boxplot
|
||||
Continuous Variable Distplot
|
||||
1.4 Bi-variate Analysis
|
||||
Scatter Plot
|
||||
Correlation Plot
|
||||
Heat Map
|
||||
|
||||
**2. Feature Cleaning**
|
||||
|
||||
2.1 Missing Values
|
||||
Missing Value Check
|
||||
Listwise Deletion
|
||||
Mean/Median/Mode Imputation
|
||||
End of distribution Imputation
|
||||
Random Imputation
|
||||
Arbitrary Value Imputation
|
||||
Add a variable to denote NA
|
||||
2.2 Outliers
|
||||
Detect by Arbitrary Boundary
|
||||
Detect by Mean & Standard Deviation
|
||||
Detect by IQR
|
||||
Detect by MAD
|
||||
Mean/Median/Mode Imputation
|
||||
Discretization
|
||||
Imputation with Arbitrary Value
|
||||
Winsorization
|
||||
Discard Outliers
|
||||
2.3 Rare Values
|
||||
Mode Imputation
|
||||
Grouping into One New Category
|
||||
2.4 High Cardinality
|
||||
Grouping Labels with Business Understanding
|
||||
Grouping Labels with Rare Occurrence into One Category
|
||||
Grouping Labels with Decision Tree
|
||||
|
||||
**3. Feature Engineering**
|
||||
|
||||
3.1 Feature Scaling
|
||||
Normalization - Standardization
|
||||
Min-Max Scaling
|
||||
Robust Scaling
|
||||
3.2 Discretize
|
||||
Equal Width Binning
|
||||
Equal Frequency Binning
|
||||
K-means Binning
|
||||
Discretization by Decision Trees
|
||||
ChiMerge
|
||||
3.3 Feature Encoding
|
||||
One-hot Encoding
|
||||
Ordinal-Encoding
|
||||
Count/frequency Encoding
|
||||
Mean Encoding
|
||||
WOE Encoding
|
||||
Target Encoding
|
||||
3.4 Feature Transformation
|
||||
Logarithmic Transformation
|
||||
Reciprocal Transformation
|
||||
Square Root Transformation
|
||||
Exponential Transformation
|
||||
Box-cox Transformation
|
||||
Quantile Transformation
|
||||
3.5 Feature Generation
|
||||
Missing Data Derived
|
||||
Simple Stats
|
||||
Crossing
|
||||
Ratio & Proportion
|
||||
Cross Product
|
||||
Polynomial
|
||||
Feature Learning by Tree
|
||||
Feature Learning by Deep Network
|
||||
|
||||
**4. Feature Selection**
|
||||
|
||||
4.1 Filter Method
|
||||
Variance
|
||||
Correlation
|
||||
Chi-Square
|
||||
Mutual Information Filter
|
||||
Univariate ROC-AUC or MSE
|
||||
Information Value (IV)
|
||||
4.2 Wrapper Method
|
||||
Forward Selection
|
||||
Backward Elimination
|
||||
Exhaustive Feature Selection
|
||||
Genetic Algorithm
|
||||
4.3 Embedded Method
|
||||
Lasso (L1)
|
||||
Random Forest Importance
|
||||
Gradient Boosted Trees Importance
|
||||
4.4 Feature Shuffling
|
||||
Random Shuffling
|
||||
4.5 Hybrid Method
|
||||
Recursive Feature Elimination
|
||||
Recursive Feature Addition
|
||||
|
||||
|
||||
|
||||
|
||||
## Motivation
|
||||
|
||||
Feature Engineering & Selection is the most essential part of building a usable machine learning project, even though hundreds of cutting-edge machine learning algorithms are emerging these days, such as deep learning and transfer learning. Indeed, as Prof. Pedro Domingos, the author of *The Master Algorithm*, says:
|
||||
|
||||
> “At the end of the day, some machine learning projects succeed and some fail. What makes the difference? Easily the most important factor is the features used.”
|
||||
>
|
||||
> — Prof. Pedro Domingos
|
||||
|
||||

|
||||
Data and features determine the upper limit of an ML project, while models and algorithms merely approach that limit. However, few materials can be found that systematically introduce the art of feature engineering, and even fewer explain the rationale behind it. This repo aims to be a practical guide to Feature Engineering & Selection.
|
||||
|
||||
|
||||
|
||||
## Key Links and Resources
|
||||
|
||||
- Udemy's Feature Engineering online course
|
||||
|
||||
https://www.udemy.com/feature-engineering-for-machine-learning/
|
||||
|
||||
- Udemy's Feature Selection online course
|
||||
|
||||
https://www.udemy.com/feature-selection-for-machine-learning
|
||||
|
||||
- JMLR Special Issue on Variable and Feature Selection
|
||||
|
||||
http://jmlr.org/papers/special/feature03.html
|
||||
|
||||
- Data Analysis Using Regression and Multilevel/Hierarchical Models, Chapter 25: Missing data
|
||||
|
||||
http://www.stat.columbia.edu/~gelman/arm/missing.pdf
|
||||
|
||||
- Data mining and the impact of missing data
|
||||
|
||||
http://core.ecu.edu/omgt/krosj/IMDSDataMining2003.pdf
|
||||
|
||||
- PyOD: A Python Toolkit for Scalable Outlier Detection
|
||||
|
||||
https://github.com/yzhao062/pyod
|
||||
|
||||
- Weight of Evidence (WoE) Introductory Overview
|
||||
|
||||
http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview
|
||||
|
||||
- About Feature Scaling and Normalization
|
||||
|
||||
http://sebastianraschka.com/Articles/2014_about_feature_scaling.html
|
||||
|
||||
- Feature Generation with RF, GBDT and Xgboost
|
||||
|
||||
https://blog.csdn.net/anshuai_aw1/article/details/82983997
|
||||
|
||||
- A review of feature selection methods with applications
|
||||
|
||||
https://ieeexplore.ieee.org/iel7/7153596/7160221/07160458.pdf
|
||||
|
||||
|
||||
506
data/housing.data.txt
Normal file
@@ -0,0 +1,506 @@
|
||||
0.00632 18.00 2.310 0 0.5380 6.5750 65.20 4.0900 1 296.0 15.30 396.90 4.98 24.00
|
||||
0.02731 0.00 7.070 0 0.4690 6.4210 78.90 4.9671 2 242.0 17.80 396.90 9.14 21.60
|
||||
0.02729 0.00 7.070 0 0.4690 7.1850 61.10 4.9671 2 242.0 17.80 392.83 4.03 34.70
|
||||
0.03237 0.00 2.180 0 0.4580 6.9980 45.80 6.0622 3 222.0 18.70 394.63 2.94 33.40
|
||||
0.06905 0.00 2.180 0 0.4580 7.1470 54.20 6.0622 3 222.0 18.70 396.90 5.33 36.20
|
||||
0.02985 0.00 2.180 0 0.4580 6.4300 58.70 6.0622 3 222.0 18.70 394.12 5.21 28.70
|
||||
0.08829 12.50 7.870 0 0.5240 6.0120 66.60 5.5605 5 311.0 15.20 395.60 12.43 22.90
|
||||
0.14455 12.50 7.870 0 0.5240 6.1720 96.10 5.9505 5 311.0 15.20 396.90 19.15 27.10
|
||||
0.21124 12.50 7.870 0 0.5240 5.6310 100.00 6.0821 5 311.0 15.20 386.63 29.93 16.50
|
||||
0.17004 12.50 7.870 0 0.5240 6.0040 85.90 6.5921 5 311.0 15.20 386.71 17.10 18.90
|
||||
0.22489 12.50 7.870 0 0.5240 6.3770 94.30 6.3467 5 311.0 15.20 392.52 20.45 15.00
|
||||
0.11747 12.50 7.870 0 0.5240 6.0090 82.90 6.2267 5 311.0 15.20 396.90 13.27 18.90
|
||||
0.09378 12.50 7.870 0 0.5240 5.8890 39.00 5.4509 5 311.0 15.20 390.50 15.71 21.70
|
||||
0.62976 0.00 8.140 0 0.5380 5.9490 61.80 4.7075 4 307.0 21.00 396.90 8.26 20.40
|
||||
0.63796 0.00 8.140 0 0.5380 6.0960 84.50 4.4619 4 307.0 21.00 380.02 10.26 18.20
|
||||
0.62739 0.00 8.140 0 0.5380 5.8340 56.50 4.4986 4 307.0 21.00 395.62 8.47 19.90
|
||||
1.05393 0.00 8.140 0 0.5380 5.9350 29.30 4.4986 4 307.0 21.00 386.85 6.58 23.10
|
||||
0.78420 0.00 8.140 0 0.5380 5.9900 81.70 4.2579 4 307.0 21.00 386.75 14.67 17.50
|
||||
0.80271 0.00 8.140 0 0.5380 5.4560 36.60 3.7965 4 307.0 21.00 288.99 11.69 20.20
|
||||
0.72580 0.00 8.140 0 0.5380 5.7270 69.50 3.7965 4 307.0 21.00 390.95 11.28 18.20
|
||||
1.25179 0.00 8.140 0 0.5380 5.5700 98.10 3.7979 4 307.0 21.00 376.57 21.02 13.60
|
||||
0.85204 0.00 8.140 0 0.5380 5.9650 89.20 4.0123 4 307.0 21.00 392.53 13.83 19.60
|
||||
1.23247 0.00 8.140 0 0.5380 6.1420 91.70 3.9769 4 307.0 21.00 396.90 18.72 15.20
|
||||
0.98843 0.00 8.140 0 0.5380 5.8130 100.00 4.0952 4 307.0 21.00 394.54 19.88 14.50
|
||||
0.75026 0.00 8.140 0 0.5380 5.9240 94.10 4.3996 4 307.0 21.00 394.33 16.30 15.60
|
||||
0.84054 0.00 8.140 0 0.5380 5.5990 85.70 4.4546 4 307.0 21.00 303.42 16.51 13.90
|
||||
0.67191 0.00 8.140 0 0.5380 5.8130 90.30 4.6820 4 307.0 21.00 376.88 14.81 16.60
|
||||
0.95577 0.00 8.140 0 0.5380 6.0470 88.80 4.4534 4 307.0 21.00 306.38 17.28 14.80
|
||||
0.77299 0.00 8.140 0 0.5380 6.4950 94.40 4.4547 4 307.0 21.00 387.94 12.80 18.40
|
||||
1.00245 0.00 8.140 0 0.5380 6.6740 87.30 4.2390 4 307.0 21.00 380.23 11.98 21.00
|
||||
1.13081 0.00 8.140 0 0.5380 5.7130 94.10 4.2330 4 307.0 21.00 360.17 22.60 12.70
|
||||
1.35472 0.00 8.140 0 0.5380 6.0720 100.00 4.1750 4 307.0 21.00 376.73 13.04 14.50
|
||||
1.38799 0.00 8.140 0 0.5380 5.9500 82.00 3.9900 4 307.0 21.00 232.60 27.71 13.20
|
||||
1.15172 0.00 8.140 0 0.5380 5.7010 95.00 3.7872 4 307.0 21.00 358.77 18.35 13.10
|
||||
1.61282 0.00 8.140 0 0.5380 6.0960 96.90 3.7598 4 307.0 21.00 248.31 20.34 13.50
|
||||
0.06417 0.00 5.960 0 0.4990 5.9330 68.20 3.3603 5 279.0 19.20 396.90 9.68 18.90
|
||||
0.09744 0.00 5.960 0 0.4990 5.8410 61.40 3.3779 5 279.0 19.20 377.56 11.41 20.00
|
||||
0.08014 0.00 5.960 0 0.4990 5.8500 41.50 3.9342 5 279.0 19.20 396.90 8.77 21.00
|
||||
0.17505 0.00 5.960 0 0.4990 5.9660 30.20 3.8473 5 279.0 19.20 393.43 10.13 24.70
|
||||
0.02763 75.00 2.950 0 0.4280 6.5950 21.80 5.4011 3 252.0 18.30 395.63 4.32 30.80
|
||||
0.03359 75.00 2.950 0 0.4280 7.0240 15.80 5.4011 3 252.0 18.30 395.62 1.98 34.90
|
||||
0.12744 0.00 6.910 0 0.4480 6.7700 2.90 5.7209 3 233.0 17.90 385.41 4.84 26.60
|
||||
0.14150 0.00 6.910 0 0.4480 6.1690 6.60 5.7209 3 233.0 17.90 383.37 5.81 25.30
|
||||
0.15936 0.00 6.910 0 0.4480 6.2110 6.50 5.7209 3 233.0 17.90 394.46 7.44 24.70
|
||||
0.12269 0.00 6.910 0 0.4480 6.0690 40.00 5.7209 3 233.0 17.90 389.39 9.55 21.20
|
||||
0.17142 0.00 6.910 0 0.4480 5.6820 33.80 5.1004 3 233.0 17.90 396.90 10.21 19.30
|
||||
0.18836 0.00 6.910 0 0.4480 5.7860 33.30 5.1004 3 233.0 17.90 396.90 14.15 20.00
|
||||
0.22927 0.00 6.910 0 0.4480 6.0300 85.50 5.6894 3 233.0 17.90 392.74 18.80 16.60
|
||||
0.25387 0.00 6.910 0 0.4480 5.3990 95.30 5.8700 3 233.0 17.90 396.90 30.81 14.40
|
||||
0.21977 0.00 6.910 0 0.4480 5.6020 62.00 6.0877 3 233.0 17.90 396.90 16.20 19.40
|
||||
0.08873 21.00 5.640 0 0.4390 5.9630 45.70 6.8147 4 243.0 16.80 395.56 13.45 19.70
|
||||
0.04337 21.00 5.640 0 0.4390 6.1150 63.00 6.8147 4 243.0 16.80 393.97 9.43 20.50
|
||||
0.05360 21.00 5.640 0 0.4390 6.5110 21.10 6.8147 4 243.0 16.80 396.90 5.28 25.00
|
||||
0.04981 21.00 5.640 0 0.4390 5.9980 21.40 6.8147 4 243.0 16.80 396.90 8.43 23.40
|
||||
0.01360 75.00 4.000 0 0.4100 5.8880 47.60 7.3197 3 469.0 21.10 396.90 14.80 18.90
|
||||
0.01311 90.00 1.220 0 0.4030 7.2490 21.90 8.6966 5 226.0 17.90 395.93 4.81 35.40
|
||||
0.02055 85.00 0.740 0 0.4100 6.3830 35.70 9.1876 2 313.0 17.30 396.90 5.77 24.70
|
||||
0.01432 100.00 1.320 0 0.4110 6.8160 40.50 8.3248 5 256.0 15.10 392.90 3.95 31.60
|
||||
0.15445 25.00 5.130 0 0.4530 6.1450 29.20 7.8148 8 284.0 19.70 390.68 6.86 23.30
|
||||
0.10328 25.00 5.130 0 0.4530 5.9270 47.20 6.9320 8 284.0 19.70 396.90 9.22 19.60
|
||||
0.14932 25.00 5.130 0 0.4530 5.7410 66.20 7.2254 8 284.0 19.70 395.11 13.15 18.70
|
||||
0.17171 25.00 5.130 0 0.4530 5.9660 93.40 6.8185 8 284.0 19.70 378.08 14.44 16.00
|
||||
0.11027 25.00 5.130 0 0.4530 6.4560 67.80 7.2255 8 284.0 19.70 396.90 6.73 22.20
|
||||
0.12650 25.00 5.130 0 0.4530 6.7620 43.40 7.9809 8 284.0 19.70 395.58 9.50 25.00
|
||||
0.01951 17.50 1.380 0 0.4161 7.1040 59.50 9.2229 3 216.0 18.60 393.24 8.05 33.00
|
||||
0.03584 80.00 3.370 0 0.3980 6.2900 17.80 6.6115 4 337.0 16.10 396.90 4.67 23.50
|
||||
0.04379 80.00 3.370 0 0.3980 5.7870 31.10 6.6115 4 337.0 16.10 396.90 10.24 19.40
|
||||
0.05789 12.50 6.070 0 0.4090 5.8780 21.40 6.4980 4 345.0 18.90 396.21 8.10 22.00
|
||||
0.13554 12.50 6.070 0 0.4090 5.5940 36.80 6.4980 4 345.0 18.90 396.90 13.09 17.40
|
||||
0.12816 12.50 6.070 0 0.4090 5.8850 33.00 6.4980 4 345.0 18.90 396.90 8.79 20.90
|
||||
0.08826 0.00 10.810 0 0.4130 6.4170 6.60 5.2873 4 305.0 19.20 383.73 6.72 24.20
|
||||
0.15876 0.00 10.810 0 0.4130 5.9610 17.50 5.2873 4 305.0 19.20 376.94 9.88 21.70
|
||||
0.09164 0.00 10.810 0 0.4130 6.0650 7.80 5.2873 4 305.0 19.20 390.91 5.52 22.80
|
||||
0.19539 0.00 10.810 0 0.4130 6.2450 6.20 5.2873 4 305.0 19.20 377.17 7.54 23.40
|
||||
0.07896 0.00 12.830 0 0.4370 6.2730 6.00 4.2515 5 398.0 18.70 394.92 6.78 24.10
|
||||
0.09512 0.00 12.830 0 0.4370 6.2860 45.00 4.5026 5 398.0 18.70 383.23 8.94 21.40
|
||||
0.10153 0.00 12.830 0 0.4370 6.2790 74.50 4.0522 5 398.0 18.70 373.66 11.97 20.00
|
||||
0.08707 0.00 12.830 0 0.4370 6.1400 45.80 4.0905 5 398.0 18.70 386.96 10.27 20.80
|
||||
0.05646 0.00 12.830 0 0.4370 6.2320 53.70 5.0141 5 398.0 18.70 386.40 12.34 21.20
|
||||
0.08387 0.00 12.830 0 0.4370 5.8740 36.60 4.5026 5 398.0 18.70 396.06 9.10 20.30
|
||||
0.04113 25.00 4.860 0 0.4260 6.7270 33.50 5.4007 4 281.0 19.00 396.90 5.29 28.00
|
||||
0.04462 25.00 4.860 0 0.4260 6.6190 70.40 5.4007 4 281.0 19.00 395.63 7.22 23.90
|
||||
0.03659 25.00 4.860 0 0.4260 6.3020 32.20 5.4007 4 281.0 19.00 396.90 6.72 24.80
|
||||
0.03551 25.00 4.860 0 0.4260 6.1670 46.70 5.4007 4 281.0 19.00 390.64 7.51 22.90
|
||||
0.05059 0.00 4.490 0 0.4490 6.3890 48.00 4.7794 3 247.0 18.50 396.90 9.62 23.90
|
||||
0.05735 0.00 4.490 0 0.4490 6.6300 56.10 4.4377 3 247.0 18.50 392.30 6.53 26.60
|
||||
0.05188 0.00 4.490 0 0.4490 6.0150 45.10 4.4272 3 247.0 18.50 395.99 12.86 22.50
|
||||
0.07151 0.00 4.490 0 0.4490 6.1210 56.80 3.7476 3 247.0 18.50 395.15 8.44 22.20
|
||||
0.05660 0.00 3.410 0 0.4890 7.0070 86.30 3.4217 2 270.0 17.80 396.90 5.50 23.60
|
||||
0.05302 0.00 3.410 0 0.4890 7.0790 63.10 3.4145 2 270.0 17.80 396.06 5.70 28.70
|
||||
0.04684 0.00 3.410 0 0.4890 6.4170 66.10 3.0923 2 270.0 17.80 392.18 8.81 22.60
|
||||
0.03932 0.00 3.410 0 0.4890 6.4050 73.90 3.0921 2 270.0 17.80 393.55 8.20 22.00
|
||||
0.04203 28.00 15.040 0 0.4640 6.4420 53.60 3.6659 4 270.0 18.20 395.01 8.16 22.90
|
||||
0.02875 28.00 15.040 0 0.4640 6.2110 28.90 3.6659 4 270.0 18.20 396.33 6.21 25.00
|
||||
0.04294 28.00 15.040 0 0.4640 6.2490 77.30 3.6150 4 270.0 18.20 396.90 10.59 20.60
|
||||
0.12204 0.00 2.890 0 0.4450 6.6250 57.80 3.4952 2 276.0 18.00 357.98 6.65 28.40
|
||||
0.11504 0.00 2.890 0 0.4450 6.1630 69.60 3.4952 2 276.0 18.00 391.83 11.34 21.40
|
||||
0.12083 0.00 2.890 0 0.4450 8.0690 76.00 3.4952 2 276.0 18.00 396.90 4.21 38.70
|
||||
0.08187 0.00 2.890 0 0.4450 7.8200 36.90 3.4952 2 276.0 18.00 393.53 3.57 43.80
|
||||
0.06860 0.00 2.890 0 0.4450 7.4160 62.50 3.4952 2 276.0 18.00 396.90 6.19 33.20
|
||||
0.14866 0.00 8.560 0 0.5200 6.7270 79.90 2.7778 5 384.0 20.90 394.76 9.42 27.50
|
||||
0.11432 0.00 8.560 0 0.5200 6.7810 71.30 2.8561 5 384.0 20.90 395.58 7.67 26.50
|
||||
0.22876 0.00 8.560 0 0.5200 6.4050 85.40 2.7147 5 384.0 20.90 70.80 10.63 18.60
|
||||
0.21161 0.00 8.560 0 0.5200 6.1370 87.40 2.7147 5 384.0 20.90 394.47 13.44 19.30
|
||||
0.13960 0.00 8.560 0 0.5200 6.1670 90.00 2.4210 5 384.0 20.90 392.69 12.33 20.10
|
||||
0.13262 0.00 8.560 0 0.5200 5.8510 96.70 2.1069 5 384.0 20.90 394.05 16.47 19.50
|
||||
0.17120 0.00 8.560 0 0.5200 5.8360 91.90 2.2110 5 384.0 20.90 395.67 18.66 19.50
|
||||
0.13117 0.00 8.560 0 0.5200 6.1270 85.20 2.1224 5 384.0 20.90 387.69 14.09 20.40
|
||||
0.12802 0.00 8.560 0 0.5200 6.4740 97.10 2.4329 5 384.0 20.90 395.24 12.27 19.80
|
||||
0.26363 0.00 8.560 0 0.5200 6.2290 91.20 2.5451 5 384.0 20.90 391.23 15.55 19.40
|
||||
0.10793 0.00 8.560 0 0.5200 6.1950 54.40 2.7778 5 384.0 20.90 393.49 13.00 21.70
|
||||
0.10084 0.00 10.010 0 0.5470 6.7150 81.60 2.6775 6 432.0 17.80 395.59 10.16 22.80
|
||||
0.12329 0.00 10.010 0 0.5470 5.9130 92.90 2.3534 6 432.0 17.80 394.95 16.21 18.80
|
||||
0.22212 0.00 10.010 0 0.5470 6.0920 95.40 2.5480 6 432.0 17.80 396.90 17.09 18.70
|
||||
0.14231 0.00 10.010 0 0.5470 6.2540 84.20 2.2565 6 432.0 17.80 388.74 10.45 18.50
|
||||
0.17134 0.00 10.010 0 0.5470 5.9280 88.20 2.4631 6 432.0 17.80 344.91 15.76 18.30
|
||||
0.13158 0.00 10.010 0 0.5470 6.1760 72.50 2.7301 6 432.0 17.80 393.30 12.04 21.20
|
||||
0.15098 0.00 10.010 0 0.5470 6.0210 82.60 2.7474 6 432.0 17.80 394.51 10.30 19.20
|
||||
0.13058 0.00 10.010 0 0.5470 5.8720 73.10 2.4775 6 432.0 17.80 338.63 15.37 20.40
|
||||
0.14476 0.00 10.010 0 0.5470 5.7310 65.20 2.7592 6 432.0 17.80 391.50 13.61 19.30
|
||||
0.06899 0.00 25.650 0 0.5810 5.8700 69.70 2.2577 2 188.0 19.10 389.15 14.37 22.00
|
||||
0.07165 0.00 25.650 0 0.5810 6.0040 84.10 2.1974 2 188.0 19.10 377.67 14.27 20.30
|
||||
0.09299 0.00 25.650 0 0.5810 5.9610 92.90 2.0869 2 188.0 19.10 378.09 17.93 20.50
|
||||
0.15038 0.00 25.650 0 0.5810 5.8560 97.00 1.9444 2 188.0 19.10 370.31 25.41 17.30
|
||||
0.09849 0.00 25.650 0 0.5810 5.8790 95.80 2.0063 2 188.0 19.10 379.38 17.58 18.80
|
||||
0.16902 0.00 25.650 0 0.5810 5.9860 88.40 1.9929 2 188.0 19.10 385.02 14.81 21.40
|
||||
0.38735 0.00 25.650 0 0.5810 5.6130 95.60 1.7572 2 188.0 19.10 359.29 27.26 15.70
|
||||
0.25915 0.00 21.890 0 0.6240 5.6930 96.00 1.7883 4 437.0 21.20 392.11 17.19 16.20
|
||||
0.32543 0.00 21.890 0 0.6240 6.4310 98.80 1.8125 4 437.0 21.20 396.90 15.39 18.00
|
||||
0.88125 0.00 21.890 0 0.6240 5.6370 94.70 1.9799 4 437.0 21.20 396.90 18.34 14.30
|
||||
0.34006 0.00 21.890 0 0.6240 6.4580 98.90 2.1185 4 437.0 21.20 395.04 12.60 19.20
|
||||
1.19294 0.00 21.890 0 0.6240 6.3260 97.70 2.2710 4 437.0 21.20 396.90 12.26 19.60
|
||||
0.59005 0.00 21.890 0 0.6240 6.3720 97.90 2.3274 4 437.0 21.20 385.76 11.12 23.00
|
||||
0.32982 0.00 21.890 0 0.6240 5.8220 95.40 2.4699 4 437.0 21.20 388.69 15.03 18.40
|
||||
0.97617 0.00 21.890 0 0.6240 5.7570 98.40 2.3460 4 437.0 21.20 262.76 17.31 15.60
|
||||
0.55778 0.00 21.890 0 0.6240 6.3350 98.20 2.1107 4 437.0 21.20 394.67 16.96 18.10
|
||||
0.32264 0.00 21.890 0 0.6240 5.9420 93.50 1.9669 4 437.0 21.20 378.25 16.90 17.40
|
||||
0.35233 0.00 21.890 0 0.6240 6.4540 98.40 1.8498 4 437.0 21.20 394.08 14.59 17.10
|
||||
0.24980 0.00 21.890 0 0.6240 5.8570 98.20 1.6686 4 437.0 21.20 392.04 21.32 13.30
|
||||
0.54452 0.00 21.890 0 0.6240 6.1510 97.90 1.6687 4 437.0 21.20 396.90 18.46 17.80
|
||||
0.29090 0.00 21.890 0 0.6240 6.1740 93.60 1.6119 4 437.0 21.20 388.08 24.16 14.00
|
||||
1.62864 0.00 21.890 0 0.6240 5.0190 100.00 1.4394 4 437.0 21.20 396.90 34.41 14.40
|
||||
3.32105 0.00 19.580 1 0.8710 5.4030 100.00 1.3216 5 403.0 14.70 396.90 26.82 13.40
|
||||
4.09740 0.00 19.580 0 0.8710 5.4680 100.00 1.4118 5 403.0 14.70 396.90 26.42 15.60
|
||||
2.77974 0.00 19.580 0 0.8710 4.9030 97.80 1.3459 5 403.0 14.70 396.90 29.29 11.80
|
||||
2.37934 0.00 19.580 0 0.8710 6.1300 100.00 1.4191 5 403.0 14.70 172.91 27.80 13.80
|
||||
2.15505 0.00 19.580 0 0.8710 5.6280 100.00 1.5166 5 403.0 14.70 169.27 16.65 15.60
|
||||
2.36862 0.00 19.580 0 0.8710 4.9260 95.70 1.4608 5 403.0 14.70 391.71 29.53 14.60
|
||||
2.33099 0.00 19.580 0 0.8710 5.1860 93.80 1.5296 5 403.0 14.70 356.99 28.32 17.80
|
||||
2.73397 0.00 19.580 0 0.8710 5.5970 94.90 1.5257 5 403.0 14.70 351.85 21.45 15.40
|
||||
1.65660 0.00 19.580 0 0.8710 6.1220 97.30 1.6180 5 403.0 14.70 372.80 14.10 21.50
|
||||
1.49632 0.00 19.580 0 0.8710 5.4040 100.00 1.5916 5 403.0 14.70 341.60 13.28 19.60
|
||||
1.12658 0.00 19.580 1 0.8710 5.0120 88.00 1.6102 5 403.0 14.70 343.28 12.12 15.30
|
||||
2.14918 0.00 19.580 0 0.8710 5.7090 98.50 1.6232 5 403.0 14.70 261.95 15.79 19.40
|
||||
1.41385 0.00 19.580 1 0.8710 6.1290 96.00 1.7494 5 403.0 14.70 321.02 15.12 17.00
|
||||
3.53501 0.00 19.580 1 0.8710 6.1520 82.60 1.7455 5 403.0 14.70 88.01 15.02 15.60
|
||||
2.44668 0.00 19.580 0 0.8710 5.2720 94.00 1.7364 5 403.0 14.70 88.63 16.14 13.10
|
||||
1.22358 0.00 19.580 0 0.6050 6.9430 97.40 1.8773 5 403.0 14.70 363.43 4.59 41.30
|
||||
1.34284 0.00 19.580 0 0.6050 6.0660 100.00 1.7573 5 403.0 14.70 353.89 6.43 24.30
|
||||
1.42502 0.00 19.580 0 0.8710 6.5100 100.00 1.7659 5 403.0 14.70 364.31 7.39 23.30
|
||||
1.27346 0.00 19.580 1 0.6050 6.2500 92.60 1.7984 5 403.0 14.70 338.92 5.50 27.00
|
||||
1.46336 0.00 19.580 0 0.6050 7.4890 90.80 1.9709 5 403.0 14.70 374.43 1.73 50.00
|
||||
1.83377 0.00 19.580 1 0.6050 7.8020 98.20 2.0407 5 403.0 14.70 389.61 1.92 50.00
|
||||
1.51902 0.00 19.580 1 0.6050 8.3750 93.90 2.1620 5 403.0 14.70 388.45 3.32 50.00
|
||||
2.24236 0.00 19.580 0 0.6050 5.8540 91.80 2.4220 5 403.0 14.70 395.11 11.64 22.70
|
||||
2.92400 0.00 19.580 0 0.6050 6.1010 93.00 2.2834 5 403.0 14.70 240.16 9.81 25.00
|
||||
2.01019 0.00 19.580 0 0.6050 7.9290 96.20 2.0459 5 403.0 14.70 369.30 3.70 50.00
|
||||
1.80028 0.00 19.580 0 0.6050 5.8770 79.20 2.4259 5 403.0 14.70 227.61 12.14 23.80
|
||||
2.30040 0.00 19.580 0 0.6050 6.3190 96.10 2.1000 5 403.0 14.70 297.09 11.10 23.80
|
||||
2.44953 0.00 19.580 0 0.6050 6.4020 95.20 2.2625 5 403.0 14.70 330.04 11.32 22.30
|
||||
1.20742 0.00 19.580 0 0.6050 5.8750 94.60 2.4259 5 403.0 14.70 292.29 14.43 17.40
|
||||
2.31390 0.00 19.580 0 0.6050 5.8800 97.30 2.3887 5 403.0 14.70 348.13 12.03 19.10
|
||||
0.13914 0.00 4.050 0 0.5100 5.5720 88.50 2.5961 5 296.0 16.60 396.90 14.69 23.10
|
||||
0.09178 0.00 4.050 0 0.5100 6.4160 84.10 2.6463 5 296.0 16.60 395.50 9.04 23.60
|
||||
0.08447 0.00 4.050 0 0.5100 5.8590 68.70 2.7019 5 296.0 16.60 393.23 9.64 22.60
|
||||
0.06664 0.00 4.050 0 0.5100 6.5460 33.10 3.1323 5 296.0 16.60 390.96 5.33 29.40
|
||||
0.07022 0.00 4.050 0 0.5100 6.0200 47.20 3.5549 5 296.0 16.60 393.23 10.11 23.20
|
||||
0.05425 0.00 4.050 0 0.5100 6.3150 73.40 3.3175 5 296.0 16.60 395.60 6.29 24.60
|
||||
0.06642 0.00 4.050 0 0.5100 6.8600 74.40 2.9153 5 296.0 16.60 391.27 6.92 29.90
|
||||
0.05780 0.00 2.460 0 0.4880 6.9800 58.40 2.8290 3 193.0 17.80 396.90 5.04 37.20
|
||||
0.06588 0.00 2.460 0 0.4880 7.7650 83.30 2.7410 3 193.0 17.80 395.56 7.56 39.80
|
||||
0.06888 0.00 2.460 0 0.4880 6.1440 62.20 2.5979 3 193.0 17.80 396.90 9.45 36.20
|
||||
0.09103 0.00 2.460 0 0.4880 7.1550 92.20 2.7006 3 193.0 17.80 394.12 4.82 37.90
|
||||
0.10008 0.00 2.460 0 0.4880 6.5630 95.60 2.8470 3 193.0 17.80 396.90 5.68 32.50
|
||||
0.08308 0.00 2.460 0 0.4880 5.6040 89.80 2.9879 3 193.0 17.80 391.00 13.98 26.40
|
||||
0.06047 0.00 2.460 0 0.4880 6.1530 68.80 3.2797 3 193.0 17.80 387.11 13.15 29.60
|
||||
0.05602 0.00 2.460 0 0.4880 7.8310 53.60 3.1992 3 193.0 17.80 392.63 4.45 50.00
|
||||
0.07875 45.00 3.440 0 0.4370 6.7820 41.10 3.7886 5 398.0 15.20 393.87 6.68 32.00
|
||||
0.12579 45.00 3.440 0 0.4370 6.5560 29.10 4.5667 5 398.0 15.20 382.84 4.56 29.80
|
||||
0.08370 45.00 3.440 0 0.4370 7.1850 38.90 4.5667 5 398.0 15.20 396.90 5.39 34.90
|
||||
0.09068 45.00 3.440 0 0.4370 6.9510 21.50 6.4798 5 398.0 15.20 377.68 5.10 37.00
|
||||
0.06911 45.00 3.440 0 0.4370 6.7390 30.80 6.4798 5 398.0 15.20 389.71 4.69 30.50
|
||||
0.08664 45.00 3.440 0 0.4370 7.1780 26.30 6.4798 5 398.0 15.20 390.49 2.87 36.40
|
||||
0.02187 60.00 2.930 0 0.4010 6.8000 9.90 6.2196 1 265.0 15.60 393.37 5.03 31.10
|
||||
0.01439 60.00 2.930 0 0.4010 6.6040 18.80 6.2196 1 265.0 15.60 376.70 4.38 29.10
|
||||
0.01381 80.00 0.460 0 0.4220 7.8750 32.00 5.6484 4 255.0 14.40 394.23 2.97 50.00
|
||||
0.04011 80.00 1.520 0 0.4040 7.2870 34.10 7.3090 2 329.0 12.60 396.90 4.08 33.30
|
||||
0.04666 80.00 1.520 0 0.4040 7.1070 36.60 7.3090 2 329.0 12.60 354.31 8.61 30.30
|
||||
0.03768 80.00 1.520 0 0.4040 7.2740 38.30 7.3090 2 329.0 12.60 392.20 6.62 34.60
|
||||
0.03150 95.00 1.470 0 0.4030 6.9750 15.30 7.6534 3 402.0 17.00 396.90 4.56 34.90
|
||||
0.01778 95.00 1.470 0 0.4030 7.1350 13.90 7.6534 3 402.0 17.00 384.30 4.45 32.90
|
||||
0.03445 82.50 2.030 0 0.4150 6.1620 38.40 6.2700 2 348.0 14.70 393.77 7.43 24.10
|
||||
0.02177 82.50 2.030 0 0.4150 7.6100 15.70 6.2700 2 348.0 14.70 395.38 3.11 42.30
|
||||
0.03510 95.00 2.680 0 0.4161 7.8530 33.20 5.1180 4 224.0 14.70 392.78 3.81 48.50
|
||||
0.02009 95.00 2.680 0 0.4161 8.0340 31.90 5.1180 4 224.0 14.70 390.55 2.88 50.00
|
||||
0.13642 0.00 10.590 0 0.4890 5.8910 22.30 3.9454 4 277.0 18.60 396.90 10.87 22.60
|
||||
0.22969 0.00 10.590 0 0.4890 6.3260 52.50 4.3549 4 277.0 18.60 394.87 10.97 24.40
|
||||
0.25199 0.00 10.590 0 0.4890 5.7830 72.70 4.3549 4 277.0 18.60 389.43 18.06 22.50
|
||||
0.13587 0.00 10.590 1 0.4890 6.0640 59.10 4.2392 4 277.0 18.60 381.32 14.66 24.40
|
||||
0.43571 0.00 10.590 1 0.4890 5.3440 100.00 3.8750 4 277.0 18.60 396.90 23.09 20.00
|
||||
0.17446 0.00 10.590 1 0.4890 5.9600 92.10 3.8771 4 277.0 18.60 393.25 17.27 21.70
|
||||
0.37578 0.00 10.590 1 0.4890 5.4040 88.60 3.6650 4 277.0 18.60 395.24 23.98 19.30
|
||||
0.21719 0.00 10.590 1 0.4890 5.8070 53.80 3.6526 4 277.0 18.60 390.94 16.03 22.40
|
||||
0.14052 0.00 10.590 0 0.4890 6.3750 32.30 3.9454 4 277.0 18.60 385.81 9.38 28.10
|
||||
0.28955 0.00 10.590 0 0.4890 5.4120 9.80 3.5875 4 277.0 18.60 348.93 29.55 23.70
|
||||
0.19802 0.00 10.590 0 0.4890 6.1820 42.40 3.9454 4 277.0 18.60 393.63 9.47 25.00
|
||||
0.04560 0.00 13.890 1 0.5500 5.8880 56.00 3.1121 5 276.0 16.40 392.80 13.51 23.30
|
||||
0.07013 0.00 13.890 0 0.5500 6.6420 85.10 3.4211 5 276.0 16.40 392.78 9.69 28.70
|
||||
0.11069 0.00 13.890 1 0.5500 5.9510 93.80 2.8893 5 276.0 16.40 396.90 17.92 21.50
|
||||
0.11425 0.00 13.890 1 0.5500 6.3730 92.40 3.3633 5 276.0 16.40 393.74 10.50 23.00
|
||||
0.35809 0.00 6.200 1 0.5070 6.9510 88.50 2.8617 8 307.0 17.40 391.70 9.71 26.70
|
||||
0.40771 0.00 6.200 1 0.5070 6.1640 91.30 3.0480 8 307.0 17.40 395.24 21.46 21.70
|
||||
0.62356 0.00 6.200 1 0.5070 6.8790 77.70 3.2721 8 307.0 17.40 390.39 9.93 27.50
|
||||
0.61470 0.00 6.200 0 0.5070 6.6180 80.80 3.2721 8 307.0 17.40 396.90 7.60 30.10
|
||||
0.31533 0.00 6.200 0 0.5040 8.2660 78.30 2.8944 8 307.0 17.40 385.05 4.14 44.80
|
||||
0.52693 0.00 6.200 0 0.5040 8.7250 83.00 2.8944 8 307.0 17.40 382.00 4.63 50.00
|
||||
0.38214 0.00 6.200 0 0.5040 8.0400 86.50 3.2157 8 307.0 17.40 387.38 3.13 37.60
|
||||
0.41238 0.00 6.200 0 0.5040 7.1630 79.90 3.2157 8 307.0 17.40 372.08 6.36 31.60
|
||||
0.29819 0.00 6.200 0 0.5040 7.6860 17.00 3.3751 8 307.0 17.40 377.51 3.92 46.70
|
||||
0.44178 0.00 6.200 0 0.5040 6.5520 21.40 3.3751 8 307.0 17.40 380.34 3.76 31.50
|
||||
0.53700 0.00 6.200 0 0.5040 5.9810 68.10 3.6715 8 307.0 17.40 378.35 11.65 24.30
|
||||
0.46296 0.00 6.200 0 0.5040 7.4120 76.90 3.6715 8 307.0 17.40 376.14 5.25 31.70
|
||||
0.57529 0.00 6.200 0 0.5070 8.3370 73.30 3.8384 8 307.0 17.40 385.91 2.47 41.70
|
||||
0.33147 0.00 6.200 0 0.5070 8.2470 70.40 3.6519 8 307.0 17.40 378.95 3.95 48.30
|
||||
0.44791 0.00 6.200 1 0.5070 6.7260 66.50 3.6519 8 307.0 17.40 360.20 8.05 29.00
|
||||
0.33045 0.00 6.200 0 0.5070 6.0860 61.50 3.6519 8 307.0 17.40 376.75 10.88 24.00
|
||||
0.52058 0.00 6.200 1 0.5070 6.6310 76.50 4.1480 8 307.0 17.40 388.45 9.54 25.10
|
||||
0.51183 0.00 6.200 0 0.5070 7.3580 71.60 4.1480 8 307.0 17.40 390.07 4.73 31.50
|
||||
0.08244 30.00 4.930 0 0.4280 6.4810 18.50 6.1899 6 300.0 16.60 379.41 6.36 23.70
|
||||
0.09252 30.00 4.930 0 0.4280 6.6060 42.20 6.1899 6 300.0 16.60 383.78 7.37 23.30
|
||||
0.11329 30.00 4.930 0 0.4280 6.8970 54.30 6.3361 6 300.0 16.60 391.25 11.38 22.00
|
||||
0.10612 30.00 4.930 0 0.4280 6.0950 65.10 6.3361 6 300.0 16.60 394.62 12.40 20.10
|
||||
0.10290 30.00 4.930 0 0.4280 6.3580 52.90 7.0355 6 300.0 16.60 372.75 11.22 22.20
|
||||
0.12757 30.00 4.930 0 0.4280 6.3930 7.80 7.0355 6 300.0 16.60 374.71 5.19 23.70
|
||||
0.20608 22.00 5.860 0 0.4310 5.5930 76.50 7.9549 7 330.0 19.10 372.49 12.50 17.60
|
||||
0.19133 22.00 5.860 0 0.4310 5.6050 70.20 7.9549 7 330.0 19.10 389.13 18.46 18.50
|
||||
0.33983 22.00 5.860 0 0.4310 6.1080 34.90 8.0555 7 330.0 19.10 390.18 9.16 24.30
|
||||
0.19657 22.00 5.860 0 0.4310 6.2260 79.20 8.0555 7 330.0 19.10 376.14 10.15 20.50
|
||||
0.16439 22.00 5.860 0 0.4310 6.4330 49.10 7.8265 7 330.0 19.10 374.71 9.52 24.50
|
||||
0.19073 22.00 5.860 0 0.4310 6.7180 17.50 7.8265 7 330.0 19.10 393.74 6.56 26.20
|
||||
0.14030 22.00 5.860 0 0.4310 6.4870 13.00 7.3967 7 330.0 19.10 396.28 5.90 24.40
|
||||
0.21409 22.00 5.860 0 0.4310 6.4380 8.90 7.3967 7 330.0 19.10 377.07 3.59 24.80
|
||||
0.08221 22.00 5.860 0 0.4310 6.9570 6.80 8.9067 7 330.0 19.10 386.09 3.53 29.60
|
||||
0.36894 22.00 5.860 0 0.4310 8.2590 8.40 8.9067 7 330.0 19.10 396.90 3.54 42.80
|
||||
0.04819 80.00 3.640 0 0.3920 6.1080 32.00 9.2203 1 315.0 16.40 392.89 6.57 21.90
|
||||
0.03548 80.00 3.640 0 0.3920 5.8760 19.10 9.2203 1 315.0 16.40 395.18 9.25 20.90
|
||||
0.01538 90.00 3.750 0 0.3940 7.4540 34.20 6.3361 3 244.0 15.90 386.34 3.11 44.00
|
||||
0.61154 20.00 3.970 0 0.6470 8.7040 86.90 1.8010 5 264.0 13.00 389.70 5.12 50.00
|
||||
0.66351 20.00 3.970 0 0.6470 7.3330 100.00 1.8946 5 264.0 13.00 383.29 7.79 36.00
|
||||
0.65665 20.00 3.970 0 0.6470 6.8420 100.00 2.0107 5 264.0 13.00 391.93 6.90 30.10
|
||||
0.54011 20.00 3.970 0 0.6470 7.2030 81.80 2.1121 5 264.0 13.00 392.80 9.59 33.80
|
||||
0.53412 20.00 3.970 0 0.6470 7.5200 89.40 2.1398 5 264.0 13.00 388.37 7.26 43.10
|
||||
0.52014 20.00 3.970 0 0.6470 8.3980 91.50 2.2885 5 264.0 13.00 386.86 5.91 48.80
|
||||
0.82526 20.00 3.970 0 0.6470 7.3270 94.50 2.0788 5 264.0 13.00 393.42 11.25 31.00
|
||||
0.55007 20.00 3.970 0 0.6470 7.2060 91.60 1.9301 5 264.0 13.00 387.89 8.10 36.50
|
||||
0.76162 20.00 3.970 0 0.6470 5.5600 62.80 1.9865 5 264.0 13.00 392.40 10.45 22.80
|
||||
0.78570 20.00 3.970 0 0.6470 7.0140 84.60 2.1329 5 264.0 13.00 384.07 14.79 30.70
|
||||
0.57834 20.00 3.970 0 0.5750 8.2970 67.00 2.4216 5 264.0 13.00 384.54 7.44 50.00
|
||||
0.54050 20.00 3.970 0 0.5750 7.4700 52.60 2.8720 5 264.0 13.00 390.30 3.16 43.50
|
||||
0.09065 20.00 6.960 1 0.4640 5.9200 61.50 3.9175 3 223.0 18.60 391.34 13.65 20.70
|
||||
0.29916 20.00 6.960 0 0.4640 5.8560 42.10 4.4290 3 223.0 18.60 388.65 13.00 21.10
|
||||
0.16211 20.00 6.960 0 0.4640 6.2400 16.30 4.4290 3 223.0 18.60 396.90 6.59 25.20
|
||||
0.11460 20.00 6.960 0 0.4640 6.5380 58.70 3.9175 3 223.0 18.60 394.96 7.73 24.40
|
||||
0.22188 20.00 6.960 1 0.4640 7.6910 51.80 4.3665 3 223.0 18.60 390.77 6.58 35.20
|
||||
0.05644 40.00 6.410 1 0.4470 6.7580 32.90 4.0776 4 254.0 17.60 396.90 3.53 32.40
|
||||
0.09604 40.00 6.410 0 0.4470 6.8540 42.80 4.2673 4 254.0 17.60 396.90 2.98 32.00
|
||||
0.10469 40.00 6.410 1 0.4470 7.2670 49.00 4.7872 4 254.0 17.60 389.25 6.05 33.20
|
||||
0.06127 40.00 6.410 1 0.4470 6.8260 27.60 4.8628 4 254.0 17.60 393.45 4.16 33.10
|
||||
0.07978 40.00 6.410 0 0.4470 6.4820 32.10 4.1403 4 254.0 17.60 396.90 7.19 29.10
|
||||
0.21038 20.00 3.330 0 0.4429 6.8120 32.20 4.1007 5 216.0 14.90 396.90 4.85 35.10
|
||||
0.03578 20.00 3.330 0 0.4429 7.8200 64.50 4.6947 5 216.0 14.90 387.31 3.76 45.40
|
||||
0.03705 20.00 3.330 0 0.4429 6.9680 37.20 5.2447 5 216.0 14.90 392.23 4.59 35.40
|
||||
0.06129 20.00 3.330 1 0.4429 7.6450 49.70 5.2119 5 216.0 14.90 377.07 3.01 46.00
|
||||
0.01501 90.00 1.210 1 0.4010 7.9230 24.80 5.8850 1 198.0 13.60 395.52 3.16 50.00
|
||||
0.00906 90.00 2.970 0 0.4000 7.0880 20.80 7.3073 1 285.0 15.30 394.72 7.85 32.20
|
||||
0.01096 55.00 2.250 0 0.3890 6.4530 31.90 7.3073 1 300.0 15.30 394.72 8.23 22.00
|
||||
0.01965 80.00 1.760 0 0.3850 6.2300 31.50 9.0892 1 241.0 18.20 341.60 12.93 20.10
|
||||
0.03871 52.50 5.320 0 0.4050 6.2090 31.30 7.3172 6 293.0 16.60 396.90 7.14 23.20
|
||||
0.04590 52.50 5.320 0 0.4050 6.3150 45.60 7.3172 6 293.0 16.60 396.90 7.60 22.30
|
||||
0.04297 52.50 5.320 0 0.4050 6.5650 22.90 7.3172 6 293.0 16.60 371.72 9.51 24.80
|
||||
0.03502 80.00 4.950 0 0.4110 6.8610 27.90 5.1167 4 245.0 19.20 396.90 3.33 28.50
|
||||
0.07886 80.00 4.950 0 0.4110 7.1480 27.70 5.1167 4 245.0 19.20 396.90 3.56 37.30
|
||||
0.03615 80.00 4.950 0 0.4110 6.6300 23.40 5.1167 4 245.0 19.20 396.90 4.70 27.90
|
||||
0.08265 0.00 13.920 0 0.4370 6.1270 18.40 5.5027 4 289.0 16.00 396.90 8.58 23.90
|
||||
0.08199 0.00 13.920 0 0.4370 6.0090 42.30 5.5027 4 289.0 16.00 396.90 10.40 21.70
|
||||
0.12932 0.00 13.920 0 0.4370 6.6780 31.10 5.9604 4 289.0 16.00 396.90 6.27 28.60
|
||||
0.05372 0.00 13.920 0 0.4370 6.5490 51.00 5.9604 4 289.0 16.00 392.85 7.39 27.10
|
||||
0.14103 0.00 13.920 0 0.4370 5.7900 58.00 6.3200 4 289.0 16.00 396.90 15.84 20.30
|
||||
0.06466 70.00 2.240 0 0.4000 6.3450 20.10 7.8278 5 358.0 14.80 368.24 4.97 22.50
|
||||
0.05561 70.00 2.240 0 0.4000 7.0410 10.00 7.8278 5 358.0 14.80 371.58 4.74 29.00
|
||||
0.04417 70.00 2.240 0 0.4000 6.8710 47.40 7.8278 5 358.0 14.80 390.86 6.07 24.80
|
||||
0.03537 34.00 6.090 0 0.4330 6.5900 40.40 5.4917 7 329.0 16.10 395.75 9.50 22.00
|
||||
0.09266 34.00 6.090 0 0.4330 6.4950 18.40 5.4917 7 329.0 16.10 383.61 8.67 26.40
|
||||
0.10000 34.00 6.090 0 0.4330 6.9820 17.70 5.4917 7 329.0 16.10 390.43 4.86 33.10
|
||||
0.05515 33.00 2.180 0 0.4720 7.2360 41.10 4.0220 7 222.0 18.40 393.68 6.93 36.10
|
||||
0.05479 33.00 2.180 0 0.4720 6.6160 58.10 3.3700 7 222.0 18.40 393.36 8.93 28.40
|
||||
0.07503 33.00 2.180 0 0.4720 7.4200 71.90 3.0992 7 222.0 18.40 396.90 6.47 33.40
|
||||
0.04932 33.00 2.180 0 0.4720 6.8490 70.30 3.1827 7 222.0 18.40 396.90 7.53 28.20
|
||||
0.49298 0.00 9.900 0 0.5440 6.6350 82.50 3.3175 4 304.0 18.40 396.90 4.54 22.80
|
||||
0.34940 0.00 9.900 0 0.5440 5.9720 76.70 3.1025 4 304.0 18.40 396.24 9.97 20.30
|
||||
2.63548 0.00 9.900 0 0.5440 4.9730 37.80 2.5194 4 304.0 18.40 350.45 12.64 16.10
|
||||
0.79041 0.00 9.900 0 0.5440 6.1220 52.80 2.6403 4 304.0 18.40 396.90 5.98 22.10
|
||||
0.26169 0.00 9.900 0 0.5440 6.0230 90.40 2.8340 4 304.0 18.40 396.30 11.72 19.40
|
||||
0.26938 0.00 9.900 0 0.5440 6.2660 82.80 3.2628 4 304.0 18.40 393.39 7.90 21.60
|
||||
0.36920 0.00 9.900 0 0.5440 6.5670 87.30 3.6023 4 304.0 18.40 395.69 9.28 23.80
|
||||
0.25356 0.00 9.900 0 0.5440 5.7050 77.70 3.9450 4 304.0 18.40 396.42 11.50 16.20
|
||||
0.31827 0.00 9.900 0 0.5440 5.9140 83.20 3.9986 4 304.0 18.40 390.70 18.33 17.80
|
||||
0.24522 0.00 9.900 0 0.5440 5.7820 71.70 4.0317 4 304.0 18.40 396.90 15.94 19.80
|
||||
0.40202 0.00 9.900 0 0.5440 6.3820 67.20 3.5325 4 304.0 18.40 395.21 10.36 23.10
|
||||
0.47547 0.00 9.900 0 0.5440 6.1130 58.80 4.0019 4 304.0 18.40 396.23 12.73 21.00
|
||||
0.16760 0.00 7.380 0 0.4930 6.4260 52.30 4.5404 5 287.0 19.60 396.90 7.20 23.80
|
||||
0.18159 0.00 7.380 0 0.4930 6.3760 54.30 4.5404 5 287.0 19.60 396.90 6.87 23.10
|
||||
0.35114 0.00 7.380 0 0.4930 6.0410 49.90 4.7211 5 287.0 19.60 396.90 7.70 20.40
|
||||
0.28392 0.00 7.380 0 0.4930 5.7080 74.30 4.7211 5 287.0 19.60 391.13 11.74 18.50
|
||||
0.34109 0.00 7.380 0 0.4930 6.4150 40.10 4.7211 5 287.0 19.60 396.90 6.12 25.00
|
||||
0.19186 0.00 7.380 0 0.4930 6.4310 14.70 5.4159 5 287.0 19.60 393.68 5.08 24.60
|
||||
0.30347 0.00 7.380 0 0.4930 6.3120 28.90 5.4159 5 287.0 19.60 396.90 6.15 23.00
|
||||
0.24103 0.00 7.380 0 0.4930 6.0830 43.70 5.4159 5 287.0 19.60 396.90 12.79 22.20
|
||||
0.06617 0.00 3.240 0 0.4600 5.8680 25.80 5.2146 4 430.0 16.90 382.44 9.97 19.30
|
||||
0.06724 0.00 3.240 0 0.4600 6.3330 17.20 5.2146 4 430.0 16.90 375.21 7.34 22.60
|
||||
0.04544 0.00 3.240 0 0.4600 6.1440 32.20 5.8736 4 430.0 16.90 368.57 9.09 19.80
|
||||
0.05023 35.00 6.060 0 0.4379 5.7060 28.40 6.6407 1 304.0 16.90 394.02 12.43 17.10
|
||||
0.03466 35.00 6.060 0 0.4379 6.0310 23.30 6.6407 1 304.0 16.90 362.25 7.83 19.40
|
||||
0.05083 0.00 5.190 0 0.5150 6.3160 38.10 6.4584 5 224.0 20.20 389.71 5.68 22.20
|
||||
0.03738 0.00 5.190 0 0.5150 6.3100 38.50 6.4584 5 224.0 20.20 389.40 6.75 20.70
|
||||
0.03961 0.00 5.190 0 0.5150 6.0370 34.50 5.9853 5 224.0 20.20 396.90 8.01 21.10
|
||||
0.03427 0.00 5.190 0 0.5150 5.8690 46.30 5.2311 5 224.0 20.20 396.90 9.80 19.50
|
||||
0.03041 0.00 5.190 0 0.5150 5.8950 59.60 5.6150 5 224.0 20.20 394.81 10.56 18.50
|
||||
0.03306 0.00 5.190 0 0.5150 6.0590 37.30 4.8122 5 224.0 20.20 396.14 8.51 20.60
|
||||
0.05497 0.00 5.190 0 0.5150 5.9850 45.40 4.8122 5 224.0 20.20 396.90 9.74 19.00
|
||||
0.06151 0.00 5.190 0 0.5150 5.9680 58.50 4.8122 5 224.0 20.20 396.90 9.29 18.70
|
||||
0.01301 35.00 1.520 0 0.4420 7.2410 49.30 7.0379 1 284.0 15.50 394.74 5.49 32.70
|
||||
0.02498 0.00 1.890 0 0.5180 6.5400 59.70 6.2669 1 422.0 15.90 389.96 8.65 16.50
|
||||
0.02543 55.00 3.780 0 0.4840 6.6960 56.40 5.7321 5 370.0 17.60 396.90 7.18 23.90
|
||||
0.03049 55.00 3.780 0 0.4840 6.8740 28.10 6.4654 5 370.0 17.60 387.97 4.61 31.20
|
||||
0.03113 0.00 4.390 0 0.4420 6.0140 48.50 8.0136 3 352.0 18.80 385.64 10.53 17.50
|
||||
0.06162 0.00 4.390 0 0.4420 5.8980 52.30 8.0136 3 352.0 18.80 364.61 12.67 17.20
|
||||
0.01870 85.00 4.150 0 0.4290 6.5160 27.70 8.5353 4 351.0 17.90 392.43 6.36 23.10
|
||||
0.01501 80.00 2.010 0 0.4350 6.6350 29.70 8.3440 4 280.0 17.00 390.94 5.99 24.50
|
||||
0.02899 40.00 1.250 0 0.4290 6.9390 34.50 8.7921 1 335.0 19.70 389.85 5.89 26.60
|
||||
0.06211 40.00 1.250 0 0.4290 6.4900 44.40 8.7921 1 335.0 19.70 396.90 5.98 22.90
|
||||
0.07950 60.00 1.690 0 0.4110 6.5790 35.90 10.7103 4 411.0 18.30 370.78 5.49 24.10
|
||||
0.07244 60.00 1.690 0 0.4110 5.8840 18.50 10.7103 4 411.0 18.30 392.33 7.79 18.60
|
||||
0.01709 90.00 2.020 0 0.4100 6.7280 36.10 12.1265 5 187.0 17.00 384.46 4.50 30.10
|
||||
0.04301 80.00 1.910 0 0.4130 5.6630 21.90 10.5857 4 334.0 22.00 382.80 8.05 18.20
|
||||
0.10659 80.00 1.910 0 0.4130 5.9360 19.50 10.5857 4 334.0 22.00 376.04 5.57 20.60
|
||||
8.98296 0.00 18.100 1 0.7700 6.2120 97.40 2.1222 24 666.0 20.20 377.73 17.60 17.80
|
||||
3.84970 0.00 18.100 1 0.7700 6.3950 91.00 2.5052 24 666.0 20.20 391.34 13.27 21.70
|
||||
5.20177 0.00 18.100 1 0.7700 6.1270 83.40 2.7227 24 666.0 20.20 395.43 11.48 22.70
|
||||
4.26131 0.00 18.100 0 0.7700 6.1120 81.30 2.5091 24 666.0 20.20 390.74 12.67 22.60
|
||||
4.54192 0.00 18.100 0 0.7700 6.3980 88.00 2.5182 24 666.0 20.20 374.56 7.79 25.00
|
||||
3.83684 0.00 18.100 0 0.7700 6.2510 91.10 2.2955 24 666.0 20.20 350.65 14.19 19.90
|
||||
3.67822 0.00 18.100 0 0.7700 5.3620 96.20 2.1036 24 666.0 20.20 380.79 10.19 20.80
|
||||
4.22239 0.00 18.100 1 0.7700 5.8030 89.00 1.9047 24 666.0 20.20 353.04 14.64 16.80
|
||||
3.47428 0.00 18.100 1 0.7180 8.7800 82.90 1.9047 24 666.0 20.20 354.55 5.29 21.90
|
||||
4.55587 0.00 18.100 0 0.7180 3.5610 87.90 1.6132 24 666.0 20.20 354.70 7.12 27.50
|
||||
3.69695 0.00 18.100 0 0.7180 4.9630 91.40 1.7523 24 666.0 20.20 316.03 14.00 21.90
|
||||
13.52220 0.00 18.100 0 0.6310 3.8630 100.00 1.5106 24 666.0 20.20 131.42 13.33 23.10
|
||||
4.89822 0.00 18.100 0 0.6310 4.9700 100.00 1.3325 24 666.0 20.20 375.52 3.26 50.00
|
||||
5.66998 0.00 18.100 1 0.6310 6.6830 96.80 1.3567 24 666.0 20.20 375.33 3.73 50.00
|
||||
6.53876 0.00 18.100 1 0.6310 7.0160 97.50 1.2024 24 666.0 20.20 392.05 2.96 50.00
|
||||
9.23230 0.00 18.100 0 0.6310 6.2160 100.00 1.1691 24 666.0 20.20 366.15 9.53 50.00
|
||||
8.26725 0.00 18.100 1 0.6680 5.8750 89.60 1.1296 24 666.0 20.20 347.88 8.88 50.00
|
||||
11.10810 0.00 18.100 0 0.6680 4.9060 100.00 1.1742 24 666.0 20.20 396.90 34.77 13.80
|
||||
18.49820 0.00 18.100 0 0.6680 4.1380 100.00 1.1370 24 666.0 20.20 396.90 37.97 13.80
|
||||
19.60910 0.00 18.100 0 0.6710 7.3130 97.90 1.3163 24 666.0 20.20 396.90 13.44 15.00
|
||||
15.28800 0.00 18.100 0 0.6710 6.6490 93.30 1.3449 24 666.0 20.20 363.02 23.24 13.90
|
||||
9.82349 0.00 18.100 0 0.6710 6.7940 98.80 1.3580 24 666.0 20.20 396.90 21.24 13.30
|
||||
23.64820 0.00 18.100 0 0.6710 6.3800 96.20 1.3861 24 666.0 20.20 396.90 23.69 13.10
|
||||
17.86670 0.00 18.100 0 0.6710 6.2230 100.00 1.3861 24 666.0 20.20 393.74 21.78 10.20
|
||||
88.97620 0.00 18.100 0 0.6710 6.9680 91.90 1.4165 24 666.0 20.20 396.90 17.21 10.40
|
||||
15.87440 0.00 18.100 0 0.6710 6.5450 99.10 1.5192 24 666.0 20.20 396.90 21.08 10.90
|
||||
9.18702 0.00 18.100 0 0.7000 5.5360 100.00 1.5804 24 666.0 20.20 396.90 23.60 11.30
|
||||
7.99248 0.00 18.100 0 0.7000 5.5200 100.00 1.5331 24 666.0 20.20 396.90 24.56 12.30
|
||||
20.08490 0.00 18.100 0 0.7000 4.3680 91.20 1.4395 24 666.0 20.20 285.83 30.63 8.80
|
||||
16.81180 0.00 18.100 0 0.7000 5.2770 98.10 1.4261 24 666.0 20.20 396.90 30.81 7.20
|
||||
24.39380 0.00 18.100 0 0.7000 4.6520 100.00 1.4672 24 666.0 20.20 396.90 28.28 10.50
|
||||
22.59710 0.00 18.100 0 0.7000 5.0000 89.50 1.5184 24 666.0 20.20 396.90 31.99 7.40
|
||||
14.33370 0.00 18.100 0 0.7000 4.8800 100.00 1.5895 24 666.0 20.20 372.92 30.62 10.20
|
||||
8.15174 0.00 18.100 0 0.7000 5.3900 98.90 1.7281 24 666.0 20.20 396.90 20.85 11.50
|
||||
6.96215 0.00 18.100 0 0.7000 5.7130 97.00 1.9265 24 666.0 20.20 394.43 17.11 15.10
|
||||
5.29305 0.00 18.100 0 0.7000 6.0510 82.50 2.1678 24 666.0 20.20 378.38 18.76 23.20
|
||||
11.57790 0.00 18.100 0 0.7000 5.0360 97.00 1.7700 24 666.0 20.20 396.90 25.68 9.70
|
||||
8.64476 0.00 18.100 0 0.6930 6.1930 92.60 1.7912 24 666.0 20.20 396.90 15.17 13.80
|
||||
13.35980 0.00 18.100 0 0.6930 5.8870 94.70 1.7821 24 666.0 20.20 396.90 16.35 12.70
|
||||
8.71675 0.00 18.100 0 0.6930 6.4710 98.80 1.7257 24 666.0 20.20 391.98 17.12 13.10
|
||||
5.87205 0.00 18.100 0 0.6930 6.4050 96.00 1.6768 24 666.0 20.20 396.90 19.37 12.50
|
||||
7.67202 0.00 18.100 0 0.6930 5.7470 98.90 1.6334 24 666.0 20.20 393.10 19.92 8.50
|
||||
38.35180 0.00 18.100 0 0.6930 5.4530 100.00 1.4896 24 666.0 20.20 396.90 30.59 5.00
|
||||
9.91655 0.00 18.100 0 0.6930 5.8520 77.80 1.5004 24 666.0 20.20 338.16 29.97 6.30
|
||||
25.04610 0.00 18.100 0 0.6930 5.9870 100.00 1.5888 24 666.0 20.20 396.90 26.77 5.60
|
||||
14.23620 0.00 18.100 0 0.6930 6.3430 100.00 1.5741 24 666.0 20.20 396.90 20.32 7.20
|
||||
9.59571 0.00 18.100 0 0.6930 6.4040 100.00 1.6390 24 666.0 20.20 376.11 20.31 12.10
|
||||
24.80170 0.00 18.100 0 0.6930 5.3490 96.00 1.7028 24 666.0 20.20 396.90 19.77 8.30
|
||||
41.52920 0.00 18.100 0 0.6930 5.5310 85.40 1.6074 24 666.0 20.20 329.46 27.38 8.50
|
||||
67.92080 0.00 18.100 0 0.6930 5.6830 100.00 1.4254 24 666.0 20.20 384.97 22.98 5.00
|
||||
20.71620 0.00 18.100 0 0.6590 4.1380 100.00 1.1781 24 666.0 20.20 370.22 23.34 11.90
|
||||
11.95110 0.00 18.100 0 0.6590 5.6080 100.00 1.2852 24 666.0 20.20 332.09 12.13 27.90
|
||||
7.40389 0.00 18.100 0 0.5970 5.6170 97.90 1.4547 24 666.0 20.20 314.64 26.40 17.20
|
||||
14.43830 0.00 18.100 0 0.5970 6.8520 100.00 1.4655 24 666.0 20.20 179.36 19.78 27.50
|
||||
51.13580 0.00 18.100 0 0.5970 5.7570 100.00 1.4130 24 666.0 20.20 2.60 10.11 15.00
|
||||
14.05070 0.00 18.100 0 0.5970 6.6570 100.00 1.5275 24 666.0 20.20 35.05 21.22 17.20
|
||||
18.81100 0.00 18.100 0 0.5970 4.6280 100.00 1.5539 24 666.0 20.20 28.79 34.37 17.90
|
||||
28.65580 0.00 18.100 0 0.5970 5.1550 100.00 1.5894 24 666.0 20.20 210.97 20.08 16.30
|
||||
45.74610 0.00 18.100 0 0.6930 4.5190 100.00 1.6582 24 666.0 20.20 88.27 36.98 7.00
|
||||
18.08460 0.00 18.100 0 0.6790 6.4340 100.00 1.8347 24 666.0 20.20 27.25 29.05 7.20
|
||||
10.83420 0.00 18.100 0 0.6790 6.7820 90.80 1.8195 24 666.0 20.20 21.57 25.79 7.50
|
||||
25.94060 0.00 18.100 0 0.6790 5.3040 89.10 1.6475 24 666.0 20.20 127.36 26.64 10.40
|
||||
73.53410 0.00 18.100 0 0.6790 5.9570 100.00 1.8026 24 666.0 20.20 16.45 20.62 8.80
|
||||
11.81230 0.00 18.100 0 0.7180 6.8240 76.50 1.7940 24 666.0 20.20 48.45 22.74 8.40
|
||||
11.08740 0.00 18.100 0 0.7180 6.4110 100.00 1.8589 24 666.0 20.20 318.75 15.02 16.70
|
||||
7.02259 0.00 18.100 0 0.7180 6.0060 95.30 1.8746 24 666.0 20.20 319.98 15.70 14.20
|
||||
12.04820 0.00 18.100 0 0.6140 5.6480 87.60 1.9512 24 666.0 20.20 291.55 14.10 20.80
|
||||
7.05042 0.00 18.100 0 0.6140 6.1030 85.10 2.0218 24 666.0 20.20 2.52 23.29 13.40
|
||||
8.79212 0.00 18.100 0 0.5840 5.5650 70.60 2.0635 24 666.0 20.20 3.65 17.16 11.70
|
||||
15.86030 0.00 18.100 0 0.6790 5.8960 95.40 1.9096 24 666.0 20.20 7.68 24.39 8.30
|
||||
12.24720 0.00 18.100 0 0.5840 5.8370 59.70 1.9976 24 666.0 20.20 24.65 15.69 10.20
|
||||
37.66190 0.00 18.100 0 0.6790 6.2020 78.70 1.8629 24 666.0 20.20 18.82 14.52 10.90
|
||||
7.36711 0.00 18.100 0 0.6790 6.1930 78.10 1.9356 24 666.0 20.20 96.73 21.52 11.00
|
||||
9.33889 0.00 18.100 0 0.6790 6.3800 95.60 1.9682 24 666.0 20.20 60.72 24.08 9.50
|
||||
8.49213 0.00 18.100 0 0.5840 6.3480 86.10 2.0527 24 666.0 20.20 83.45 17.64 14.50
|
||||
10.06230 0.00 18.100 0 0.5840 6.8330 94.30 2.0882 24 666.0 20.20 81.33 19.69 14.10
|
||||
6.44405 0.00 18.100 0 0.5840 6.4250 74.80 2.2004 24 666.0 20.20 97.95 12.03 16.10
|
||||
5.58107 0.00 18.100 0 0.7130 6.4360 87.90 2.3158 24 666.0 20.20 100.19 16.22 14.30
|
||||
13.91340 0.00 18.100 0 0.7130 6.2080 95.00 2.2222 24 666.0 20.20 100.63 15.17 11.70
|
||||
11.16040 0.00 18.100 0 0.7400 6.6290 94.60 2.1247 24 666.0 20.20 109.85 23.27 13.40
|
||||
14.42080 0.00 18.100 0 0.7400 6.4610 93.30 2.0026 24 666.0 20.20 27.49 18.05 9.60
|
||||
15.17720 0.00 18.100 0 0.7400 6.1520 100.00 1.9142 24 666.0 20.20 9.32 26.45 8.70
|
||||
13.67810 0.00 18.100 0 0.7400 5.9350 87.90 1.8206 24 666.0 20.20 68.95 34.02 8.40
|
||||
9.39063 0.00 18.100 0 0.7400 5.6270 93.90 1.8172 24 666.0 20.20 396.90 22.88 12.80
|
||||
22.05110 0.00 18.100 0 0.7400 5.8180 92.40 1.8662 24 666.0 20.20 391.45 22.11 10.50
|
||||
9.72418 0.00 18.100 0 0.7400 6.4060 97.20 2.0651 24 666.0 20.20 385.96 19.52 17.10
|
||||
5.66637 0.00 18.100 0 0.7400 6.2190 100.00 2.0048 24 666.0 20.20 395.69 16.59 18.40
|
||||
9.96654 0.00 18.100 0 0.7400 6.4850 100.00 1.9784 24 666.0 20.20 386.73 18.85 15.40
|
||||
12.80230 0.00 18.100 0 0.7400 5.8540 96.60 1.8956 24 666.0 20.20 240.52 23.79 10.80
|
||||
10.67180 0.00 18.100 0 0.7400 6.4590 94.80 1.9879 24 666.0 20.20 43.06 23.98 11.80
|
||||
6.28807 0.00 18.100 0 0.7400 6.3410 96.40 2.0720 24 666.0 20.20 318.01 17.79 14.90
|
||||
9.92485 0.00 18.100 0 0.7400 6.2510 96.60 2.1980 24 666.0 20.20 388.52 16.44 12.60
|
||||
9.32909 0.00 18.100 0 0.7130 6.1850 98.70 2.2616 24 666.0 20.20 396.90 18.13 14.10
|
||||
7.52601 0.00 18.100 0 0.7130 6.4170 98.30 2.1850 24 666.0 20.20 304.21 19.31 13.00
|
||||
6.71772 0.00 18.100 0 0.7130 6.7490 92.60 2.3236 24 666.0 20.20 0.32 17.44 13.40
|
||||
5.44114 0.00 18.100 0 0.7130 6.6550 98.20 2.3552 24 666.0 20.20 355.29 17.73 15.20
|
||||
5.09017 0.00 18.100 0 0.7130 6.2970 91.80 2.3682 24 666.0 20.20 385.09 17.27 16.10
|
||||
8.24809 0.00 18.100 0 0.7130 7.3930 99.30 2.4527 24 666.0 20.20 375.87 16.74 17.80
|
||||
9.51363 0.00 18.100 0 0.7130 6.7280 94.10 2.4961 24 666.0 20.20 6.68 18.71 14.90
|
||||
4.75237 0.00 18.100 0 0.7130 6.5250 86.50 2.4358 24 666.0 20.20 50.92 18.13 14.10
|
||||
4.66883 0.00 18.100 0 0.7130 5.9760 87.90 2.5806 24 666.0 20.20 10.48 19.01 12.70
|
||||
8.20058 0.00 18.100 0 0.7130 5.9360 80.30 2.7792 24 666.0 20.20 3.50 16.94 13.50
|
||||
7.75223 0.00 18.100 0 0.7130 6.3010 83.70 2.7831 24 666.0 20.20 272.21 16.23 14.90
|
||||
6.80117 0.00 18.100 0 0.7130 6.0810 84.40 2.7175 24 666.0 20.20 396.90 14.70 20.00
|
||||
4.81213 0.00 18.100 0 0.7130 6.7010 90.00 2.5975 24 666.0 20.20 255.23 16.42 16.40
|
||||
3.69311 0.00 18.100 0 0.7130 6.3760 88.40 2.5671 24 666.0 20.20 391.43 14.65 17.70
|
||||
6.65492 0.00 18.100 0 0.7130 6.3170 83.00 2.7344 24 666.0 20.20 396.90 13.99 19.50
|
||||
5.82115 0.00 18.100 0 0.7130 6.5130 89.90 2.8016 24 666.0 20.20 393.82 10.29 20.20
|
||||
7.83932 0.00 18.100 0 0.6550 6.2090 65.40 2.9634 24 666.0 20.20 396.90 13.22 21.40
|
||||
3.16360 0.00 18.100 0 0.6550 5.7590 48.20 3.0665 24 666.0 20.20 334.40 14.13 19.90
|
||||
3.77498 0.00 18.100 0 0.6550 5.9520 84.70 2.8715 24 666.0 20.20 22.01 17.15 19.00
|
||||
4.42228 0.00 18.100 0 0.5840 6.0030 94.50 2.5403 24 666.0 20.20 331.29 21.32 19.10
|
||||
15.57570 0.00 18.100 0 0.5800 5.9260 71.00 2.9084 24 666.0 20.20 368.74 18.13 19.10
|
||||
13.07510 0.00 18.100 0 0.5800 5.7130 56.70 2.8237 24 666.0 20.20 396.90 14.76 20.10
|
||||
4.34879 0.00 18.100 0 0.5800 6.1670 84.00 3.0334 24 666.0 20.20 396.90 16.29 19.90
|
||||
4.03841 0.00 18.100 0 0.5320 6.2290 90.70 3.0993 24 666.0 20.20 395.33 12.87 19.60
|
||||
3.56868 0.00 18.100 0 0.5800 6.4370 75.00 2.8965 24 666.0 20.20 393.37 14.36 23.20
|
||||
4.64689 0.00 18.100 0 0.6140 6.9800 67.60 2.5329 24 666.0 20.20 374.68 11.66 29.80
|
||||
8.05579 0.00 18.100 0 0.5840 5.4270 95.40 2.4298 24 666.0 20.20 352.58 18.14 13.80
|
||||
6.39312 0.00 18.100 0 0.5840 6.1620 97.40 2.2060 24 666.0 20.20 302.76 24.10 13.30
|
||||
4.87141 0.00 18.100 0 0.6140 6.4840 93.60 2.3053 24 666.0 20.20 396.21 18.68 16.70
|
||||
15.02340 0.00 18.100 0 0.6140 5.3040 97.30 2.1007 24 666.0 20.20 349.48 24.91 12.00
|
||||
10.23300 0.00 18.100 0 0.6140 6.1850 96.70 2.1705 24 666.0 20.20 379.70 18.03 14.60
|
||||
14.33370 0.00 18.100 0 0.6140 6.2290 88.00 1.9512 24 666.0 20.20 383.32 13.11 21.40
|
||||
5.82401 0.00 18.100 0 0.5320 6.2420 64.70 3.4242 24 666.0 20.20 396.90 10.74 23.00
|
||||
5.70818 0.00 18.100 0 0.5320 6.7500 74.90 3.3317 24 666.0 20.20 393.07 7.74 23.70
|
||||
5.73116 0.00 18.100 0 0.5320 7.0610 77.00 3.4106 24 666.0 20.20 395.28 7.01 25.00
|
||||
2.81838 0.00 18.100 0 0.5320 5.7620 40.30 4.0983 24 666.0 20.20 392.92 10.42 21.80
|
||||
2.37857 0.00 18.100 0 0.5830 5.8710 41.90 3.7240 24 666.0 20.20 370.73 13.34 20.60
|
||||
3.67367 0.00 18.100 0 0.5830 6.3120 51.90 3.9917 24 666.0 20.20 388.62 10.58 21.20
|
||||
5.69175 0.00 18.100 0 0.5830 6.1140 79.80 3.5459 24 666.0 20.20 392.68 14.98 19.10
|
||||
4.83567 0.00 18.100 0 0.5830 5.9050 53.20 3.1523 24 666.0 20.20 388.22 11.45 20.60
|
||||
0.15086 0.00 27.740 0 0.6090 5.4540 92.70 1.8209 4 711.0 20.10 395.09 18.06 15.20
|
||||
0.18337 0.00 27.740 0 0.6090 5.4140 98.30 1.7554 4 711.0 20.10 344.05 23.97 7.00
|
||||
0.20746 0.00 27.740 0 0.6090 5.0930 98.00 1.8226 4 711.0 20.10 318.43 29.68 8.10
|
||||
0.10574 0.00 27.740 0 0.6090 5.9830 98.80 1.8681 4 711.0 20.10 390.11 18.07 13.60
|
||||
0.11132 0.00 27.740 0 0.6090 5.9830 83.50 2.1099 4 711.0 20.10 396.90 13.35 20.10
|
||||
0.17331 0.00 9.690 0 0.5850 5.7070 54.00 2.3817 6 391.0 19.20 396.90 12.01 21.80
|
||||
0.27957 0.00 9.690 0 0.5850 5.9260 42.60 2.3817 6 391.0 19.20 396.90 13.59 24.50
|
||||
0.17899 0.00 9.690 0 0.5850 5.6700 28.80 2.7986 6 391.0 19.20 393.29 17.60 23.10
|
||||
0.28960 0.00 9.690 0 0.5850 5.3900 72.90 2.7986 6 391.0 19.20 396.90 21.14 19.70
|
||||
0.26838 0.00 9.690 0 0.5850 5.7940 70.60 2.8927 6 391.0 19.20 396.90 14.10 18.30
|
||||
0.23912 0.00 9.690 0 0.5850 6.0190 65.30 2.4091 6 391.0 19.20 396.90 12.92 21.20
|
||||
0.17783 0.00 9.690 0 0.5850 5.5690 73.50 2.3999 6 391.0 19.20 395.77 15.10 17.50
|
||||
0.22438 0.00 9.690 0 0.5850 6.0270 79.70 2.4982 6 391.0 19.20 396.90 14.33 16.80
|
||||
0.06263 0.00 11.930 0 0.5730 6.5930 69.10 2.4786 1 273.0 21.00 391.99 9.67 22.40
|
||||
0.04527 0.00 11.930 0 0.5730 6.1200 76.70 2.2875 1 273.0 21.00 396.90 9.08 20.60
|
||||
0.06076 0.00 11.930 0 0.5730 6.9760 91.00 2.1675 1 273.0 21.00 396.90 5.64 23.90
|
||||
0.10959 0.00 11.930 0 0.5730 6.7940 89.30 2.3889 1 273.0 21.00 393.45 6.48 22.00
|
||||
0.04741 0.00 11.930 0 0.5730 6.0300 80.80 2.5050 1 273.0 21.00 396.90 7.88 11.90
|
||||
768
data/pima-indians-diabetes.data.csv
Normal file
@@ -0,0 +1,768 @@
|
||||
6,148,72,35,0,33.6,0.627,50,1
|
||||
1,85,66,29,0,26.6,0.351,31,0
|
||||
8,183,64,0,0,23.3,0.672,32,1
|
||||
1,89,66,23,94,28.1,0.167,21,0
|
||||
0,137,40,35,168,43.1,2.288,33,1
|
||||
5,116,74,0,0,25.6,0.201,30,0
|
||||
3,78,50,32,88,31.0,0.248,26,1
|
||||
10,115,0,0,0,35.3,0.134,29,0
|
||||
2,197,70,45,543,30.5,0.158,53,1
|
||||
8,125,96,0,0,0.0,0.232,54,1
|
||||
4,110,92,0,0,37.6,0.191,30,0
|
||||
10,168,74,0,0,38.0,0.537,34,1
|
||||
10,139,80,0,0,27.1,1.441,57,0
|
||||
1,189,60,23,846,30.1,0.398,59,1
|
||||
5,166,72,19,175,25.8,0.587,51,1
|
||||
7,100,0,0,0,30.0,0.484,32,1
|
||||
0,118,84,47,230,45.8,0.551,31,1
|
||||
7,107,74,0,0,29.6,0.254,31,1
|
||||
1,103,30,38,83,43.3,0.183,33,0
|
||||
1,115,70,30,96,34.6,0.529,32,1
|
||||
3,126,88,41,235,39.3,0.704,27,0
|
||||
8,99,84,0,0,35.4,0.388,50,0
|
||||
7,196,90,0,0,39.8,0.451,41,1
|
||||
9,119,80,35,0,29.0,0.263,29,1
|
||||
11,143,94,33,146,36.6,0.254,51,1
|
||||
10,125,70,26,115,31.1,0.205,41,1
|
||||
7,147,76,0,0,39.4,0.257,43,1
|
||||
1,97,66,15,140,23.2,0.487,22,0
|
||||
13,145,82,19,110,22.2,0.245,57,0
|
||||
5,117,92,0,0,34.1,0.337,38,0
|
||||
5,109,75,26,0,36.0,0.546,60,0
|
||||
3,158,76,36,245,31.6,0.851,28,1
|
||||
3,88,58,11,54,24.8,0.267,22,0
|
||||
6,92,92,0,0,19.9,0.188,28,0
|
||||
10,122,78,31,0,27.6,0.512,45,0
|
||||
4,103,60,33,192,24.0,0.966,33,0
|
||||
11,138,76,0,0,33.2,0.420,35,0
|
||||
9,102,76,37,0,32.9,0.665,46,1
|
||||
2,90,68,42,0,38.2,0.503,27,1
|
||||
4,111,72,47,207,37.1,1.390,56,1
|
||||
3,180,64,25,70,34.0,0.271,26,0
|
||||
7,133,84,0,0,40.2,0.696,37,0
|
||||
7,106,92,18,0,22.7,0.235,48,0
|
||||
9,171,110,24,240,45.4,0.721,54,1
|
||||
7,159,64,0,0,27.4,0.294,40,0
|
||||
0,180,66,39,0,42.0,1.893,25,1
|
||||
1,146,56,0,0,29.7,0.564,29,0
|
||||
2,71,70,27,0,28.0,0.586,22,0
|
||||
7,103,66,32,0,39.1,0.344,31,1
|
||||
7,105,0,0,0,0.0,0.305,24,0
|
||||
1,103,80,11,82,19.4,0.491,22,0
|
||||
1,101,50,15,36,24.2,0.526,26,0
|
||||
5,88,66,21,23,24.4,0.342,30,0
|
||||
8,176,90,34,300,33.7,0.467,58,1
|
||||
7,150,66,42,342,34.7,0.718,42,0
|
||||
1,73,50,10,0,23.0,0.248,21,0
|
||||
7,187,68,39,304,37.7,0.254,41,1
|
||||
0,100,88,60,110,46.8,0.962,31,0
|
||||
0,146,82,0,0,40.5,1.781,44,0
|
||||
0,105,64,41,142,41.5,0.173,22,0
|
||||
2,84,0,0,0,0.0,0.304,21,0
|
||||
8,133,72,0,0,32.9,0.270,39,1
|
||||
5,44,62,0,0,25.0,0.587,36,0
|
||||
2,141,58,34,128,25.4,0.699,24,0
|
||||
7,114,66,0,0,32.8,0.258,42,1
|
||||
5,99,74,27,0,29.0,0.203,32,0
|
||||
0,109,88,30,0,32.5,0.855,38,1
|
||||
2,109,92,0,0,42.7,0.845,54,0
|
||||
1,95,66,13,38,19.6,0.334,25,0
|
||||
4,146,85,27,100,28.9,0.189,27,0
|
||||
2,100,66,20,90,32.9,0.867,28,1
|
||||
5,139,64,35,140,28.6,0.411,26,0
|
||||
13,126,90,0,0,43.4,0.583,42,1
|
||||
4,129,86,20,270,35.1,0.231,23,0
|
||||
1,79,75,30,0,32.0,0.396,22,0
|
||||
1,0,48,20,0,24.7,0.140,22,0
|
||||
7,62,78,0,0,32.6,0.391,41,0
|
||||
5,95,72,33,0,37.7,0.370,27,0
|
||||
0,131,0,0,0,43.2,0.270,26,1
|
||||
2,112,66,22,0,25.0,0.307,24,0
|
||||
3,113,44,13,0,22.4,0.140,22,0
|
||||
2,74,0,0,0,0.0,0.102,22,0
|
||||
7,83,78,26,71,29.3,0.767,36,0
|
||||
0,101,65,28,0,24.6,0.237,22,0
|
||||
5,137,108,0,0,48.8,0.227,37,1
|
||||
2,110,74,29,125,32.4,0.698,27,0
|
||||
13,106,72,54,0,36.6,0.178,45,0
|
||||
2,100,68,25,71,38.5,0.324,26,0
|
||||
15,136,70,32,110,37.1,0.153,43,1
|
||||
1,107,68,19,0,26.5,0.165,24,0
|
||||
1,80,55,0,0,19.1,0.258,21,0
|
||||
4,123,80,15,176,32.0,0.443,34,0
|
||||
7,81,78,40,48,46.7,0.261,42,0
|
||||
4,134,72,0,0,23.8,0.277,60,1
|
||||
2,142,82,18,64,24.7,0.761,21,0
|
||||
6,144,72,27,228,33.9,0.255,40,0
|
||||
2,92,62,28,0,31.6,0.130,24,0
|
||||
1,71,48,18,76,20.4,0.323,22,0
|
||||
6,93,50,30,64,28.7,0.356,23,0
|
||||
1,122,90,51,220,49.7,0.325,31,1
|
||||
1,163,72,0,0,39.0,1.222,33,1
|
||||
1,151,60,0,0,26.1,0.179,22,0
|
||||
0,125,96,0,0,22.5,0.262,21,0
|
||||
1,81,72,18,40,26.6,0.283,24,0
|
||||
2,85,65,0,0,39.6,0.930,27,0
|
||||
1,126,56,29,152,28.7,0.801,21,0
|
||||
1,96,122,0,0,22.4,0.207,27,0
|
||||
4,144,58,28,140,29.5,0.287,37,0
|
||||
3,83,58,31,18,34.3,0.336,25,0
|
||||
0,95,85,25,36,37.4,0.247,24,1
|
||||
3,171,72,33,135,33.3,0.199,24,1
|
||||
8,155,62,26,495,34.0,0.543,46,1
|
||||
1,89,76,34,37,31.2,0.192,23,0
|
||||
4,76,62,0,0,34.0,0.391,25,0
|
||||
7,160,54,32,175,30.5,0.588,39,1
|
||||
4,146,92,0,0,31.2,0.539,61,1
|
||||
5,124,74,0,0,34.0,0.220,38,1
|
||||
5,78,48,0,0,33.7,0.654,25,0
|
||||
4,97,60,23,0,28.2,0.443,22,0
|
||||
4,99,76,15,51,23.2,0.223,21,0
|
||||
0,162,76,56,100,53.2,0.759,25,1
|
||||
6,111,64,39,0,34.2,0.260,24,0
|
||||
2,107,74,30,100,33.6,0.404,23,0
|
||||
5,132,80,0,0,26.8,0.186,69,0
|
||||
0,113,76,0,0,33.3,0.278,23,1
|
||||
1,88,30,42,99,55.0,0.496,26,1
|
||||
3,120,70,30,135,42.9,0.452,30,0
|
||||
1,118,58,36,94,33.3,0.261,23,0
|
||||
1,117,88,24,145,34.5,0.403,40,1
|
||||
0,105,84,0,0,27.9,0.741,62,1
|
||||
4,173,70,14,168,29.7,0.361,33,1
|
||||
9,122,56,0,0,33.3,1.114,33,1
|
||||
3,170,64,37,225,34.5,0.356,30,1
|
||||
8,84,74,31,0,38.3,0.457,39,0
|
||||
2,96,68,13,49,21.1,0.647,26,0
|
||||
2,125,60,20,140,33.8,0.088,31,0
|
||||
0,100,70,26,50,30.8,0.597,21,0
|
||||
0,93,60,25,92,28.7,0.532,22,0
|
||||
0,129,80,0,0,31.2,0.703,29,0
|
||||
5,105,72,29,325,36.9,0.159,28,0
|
||||
3,128,78,0,0,21.1,0.268,55,0
|
||||
5,106,82,30,0,39.5,0.286,38,0
|
||||
2,108,52,26,63,32.5,0.318,22,0
|
||||
10,108,66,0,0,32.4,0.272,42,1
|
||||
4,154,62,31,284,32.8,0.237,23,0
|
||||
0,102,75,23,0,0.0,0.572,21,0
|
||||
9,57,80,37,0,32.8,0.096,41,0
|
||||
2,106,64,35,119,30.5,1.400,34,0
|
||||
5,147,78,0,0,33.7,0.218,65,0
|
||||
2,90,70,17,0,27.3,0.085,22,0
|
||||
1,136,74,50,204,37.4,0.399,24,0
|
||||
4,114,65,0,0,21.9,0.432,37,0
|
||||
9,156,86,28,155,34.3,1.189,42,1
|
||||
1,153,82,42,485,40.6,0.687,23,0
|
||||
8,188,78,0,0,47.9,0.137,43,1
|
||||
7,152,88,44,0,50.0,0.337,36,1
|
||||
2,99,52,15,94,24.6,0.637,21,0
|
||||
1,109,56,21,135,25.2,0.833,23,0
|
||||
2,88,74,19,53,29.0,0.229,22,0
|
||||
17,163,72,41,114,40.9,0.817,47,1
|
||||
4,151,90,38,0,29.7,0.294,36,0
|
||||
7,102,74,40,105,37.2,0.204,45,0
|
||||
0,114,80,34,285,44.2,0.167,27,0
|
||||
2,100,64,23,0,29.7,0.368,21,0
|
||||
0,131,88,0,0,31.6,0.743,32,1
|
||||
6,104,74,18,156,29.9,0.722,41,1
|
||||
3,148,66,25,0,32.5,0.256,22,0
|
||||
4,120,68,0,0,29.6,0.709,34,0
|
||||
4,110,66,0,0,31.9,0.471,29,0
|
||||
3,111,90,12,78,28.4,0.495,29,0
|
||||
6,102,82,0,0,30.8,0.180,36,1
|
||||
6,134,70,23,130,35.4,0.542,29,1
|
||||
2,87,0,23,0,28.9,0.773,25,0
|
||||
1,79,60,42,48,43.5,0.678,23,0
|
||||
2,75,64,24,55,29.7,0.370,33,0
|
||||
8,179,72,42,130,32.7,0.719,36,1
|
||||
6,85,78,0,0,31.2,0.382,42,0
|
||||
0,129,110,46,130,67.1,0.319,26,1
|
||||
5,143,78,0,0,45.0,0.190,47,0
|
||||
5,130,82,0,0,39.1,0.956,37,1
|
||||
6,87,80,0,0,23.2,0.084,32,0
|
||||
0,119,64,18,92,34.9,0.725,23,0
|
||||
1,0,74,20,23,27.7,0.299,21,0
|
||||
5,73,60,0,0,26.8,0.268,27,0
|
||||
4,141,74,0,0,27.6,0.244,40,0
|
||||
7,194,68,28,0,35.9,0.745,41,1
|
||||
8,181,68,36,495,30.1,0.615,60,1
|
||||
1,128,98,41,58,32.0,1.321,33,1
|
||||
8,109,76,39,114,27.9,0.640,31,1
|
||||
5,139,80,35,160,31.6,0.361,25,1
|
||||
3,111,62,0,0,22.6,0.142,21,0
|
||||
9,123,70,44,94,33.1,0.374,40,0
|
||||
7,159,66,0,0,30.4,0.383,36,1
|
||||
11,135,0,0,0,52.3,0.578,40,1
|
||||
8,85,55,20,0,24.4,0.136,42,0
|
||||
5,158,84,41,210,39.4,0.395,29,1
|
||||
1,105,58,0,0,24.3,0.187,21,0
|
||||
3,107,62,13,48,22.9,0.678,23,1
|
||||
4,109,64,44,99,34.8,0.905,26,1
|
||||
4,148,60,27,318,30.9,0.150,29,1
|
||||
0,113,80,16,0,31.0,0.874,21,0
|
||||
1,138,82,0,0,40.1,0.236,28,0
|
||||
0,108,68,20,0,27.3,0.787,32,0
|
||||
2,99,70,16,44,20.4,0.235,27,0
|
||||
6,103,72,32,190,37.7,0.324,55,0
|
||||
5,111,72,28,0,23.9,0.407,27,0
|
||||
8,196,76,29,280,37.5,0.605,57,1
|
||||
5,162,104,0,0,37.7,0.151,52,1
|
||||
1,96,64,27,87,33.2,0.289,21,0
|
||||
7,184,84,33,0,35.5,0.355,41,1
|
||||
2,81,60,22,0,27.7,0.290,25,0
|
||||
0,147,85,54,0,42.8,0.375,24,0
|
||||
7,179,95,31,0,34.2,0.164,60,0
|
||||
0,140,65,26,130,42.6,0.431,24,1
|
||||
9,112,82,32,175,34.2,0.260,36,1
|
||||
12,151,70,40,271,41.8,0.742,38,1
|
||||
5,109,62,41,129,35.8,0.514,25,1
|
||||
6,125,68,30,120,30.0,0.464,32,0
|
||||
5,85,74,22,0,29.0,1.224,32,1
|
||||
5,112,66,0,0,37.8,0.261,41,1
|
||||
0,177,60,29,478,34.6,1.072,21,1
|
||||
2,158,90,0,0,31.6,0.805,66,1
|
||||
7,119,0,0,0,25.2,0.209,37,0
|
||||
7,142,60,33,190,28.8,0.687,61,0
|
||||
1,100,66,15,56,23.6,0.666,26,0
|
||||
1,87,78,27,32,34.6,0.101,22,0
|
||||
0,101,76,0,0,35.7,0.198,26,0
|
||||
3,162,52,38,0,37.2,0.652,24,1
|
||||
4,197,70,39,744,36.7,2.329,31,0
|
||||
0,117,80,31,53,45.2,0.089,24,0
|
||||
4,142,86,0,0,44.0,0.645,22,1
|
||||
6,134,80,37,370,46.2,0.238,46,1
|
||||
1,79,80,25,37,25.4,0.583,22,0
|
||||
4,122,68,0,0,35.0,0.394,29,0
|
||||
3,74,68,28,45,29.7,0.293,23,0
|
||||
4,171,72,0,0,43.6,0.479,26,1
|
||||
7,181,84,21,192,35.9,0.586,51,1
|
||||
0,179,90,27,0,44.1,0.686,23,1
|
||||
9,164,84,21,0,30.8,0.831,32,1
|
||||
0,104,76,0,0,18.4,0.582,27,0
|
||||
1,91,64,24,0,29.2,0.192,21,0
|
||||
4,91,70,32,88,33.1,0.446,22,0
|
||||
3,139,54,0,0,25.6,0.402,22,1
|
||||
6,119,50,22,176,27.1,1.318,33,1
|
||||
2,146,76,35,194,38.2,0.329,29,0
|
||||
9,184,85,15,0,30.0,1.213,49,1
|
||||
10,122,68,0,0,31.2,0.258,41,0
|
||||
0,165,90,33,680,52.3,0.427,23,0
|
||||
9,124,70,33,402,35.4,0.282,34,0
|
||||
1,111,86,19,0,30.1,0.143,23,0
|
||||
9,106,52,0,0,31.2,0.380,42,0
|
||||
2,129,84,0,0,28.0,0.284,27,0
|
||||
2,90,80,14,55,24.4,0.249,24,0
|
||||
0,86,68,32,0,35.8,0.238,25,0
|
||||
12,92,62,7,258,27.6,0.926,44,1
|
||||
1,113,64,35,0,33.6,0.543,21,1
|
||||
3,111,56,39,0,30.1,0.557,30,0
|
||||
2,114,68,22,0,28.7,0.092,25,0
|
||||
1,193,50,16,375,25.9,0.655,24,0
|
||||
11,155,76,28,150,33.3,1.353,51,1
|
||||
3,191,68,15,130,30.9,0.299,34,0
|
||||
3,141,0,0,0,30.0,0.761,27,1
|
||||
4,95,70,32,0,32.1,0.612,24,0
|
||||
3,142,80,15,0,32.4,0.200,63,0
|
||||
4,123,62,0,0,32.0,0.226,35,1
|
||||
5,96,74,18,67,33.6,0.997,43,0
|
||||
0,138,0,0,0,36.3,0.933,25,1
|
||||
2,128,64,42,0,40.0,1.101,24,0
|
||||
0,102,52,0,0,25.1,0.078,21,0
|
||||
2,146,0,0,0,27.5,0.240,28,1
|
||||
10,101,86,37,0,45.6,1.136,38,1
|
||||
2,108,62,32,56,25.2,0.128,21,0
|
||||
3,122,78,0,0,23.0,0.254,40,0
|
||||
1,71,78,50,45,33.2,0.422,21,0
|
||||
13,106,70,0,0,34.2,0.251,52,0
|
||||
2,100,70,52,57,40.5,0.677,25,0
|
||||
7,106,60,24,0,26.5,0.296,29,1
|
||||
0,104,64,23,116,27.8,0.454,23,0
|
||||
5,114,74,0,0,24.9,0.744,57,0
|
||||
2,108,62,10,278,25.3,0.881,22,0
|
||||
0,146,70,0,0,37.9,0.334,28,1
|
||||
10,129,76,28,122,35.9,0.280,39,0
|
||||
7,133,88,15,155,32.4,0.262,37,0
|
||||
7,161,86,0,0,30.4,0.165,47,1
|
||||
2,108,80,0,0,27.0,0.259,52,1
|
||||
7,136,74,26,135,26.0,0.647,51,0
|
||||
5,155,84,44,545,38.7,0.619,34,0
|
||||
1,119,86,39,220,45.6,0.808,29,1
|
||||
4,96,56,17,49,20.8,0.340,26,0
|
||||
5,108,72,43,75,36.1,0.263,33,0
|
||||
0,78,88,29,40,36.9,0.434,21,0
|
||||
0,107,62,30,74,36.6,0.757,25,1
|
||||
2,128,78,37,182,43.3,1.224,31,1
|
||||
1,128,48,45,194,40.5,0.613,24,1
|
||||
0,161,50,0,0,21.9,0.254,65,0
|
||||
6,151,62,31,120,35.5,0.692,28,0
|
||||
2,146,70,38,360,28.0,0.337,29,1
|
||||
0,126,84,29,215,30.7,0.520,24,0
|
||||
14,100,78,25,184,36.6,0.412,46,1
|
||||
8,112,72,0,0,23.6,0.840,58,0
|
||||
0,167,0,0,0,32.3,0.839,30,1
|
||||
2,144,58,33,135,31.6,0.422,25,1
|
||||
5,77,82,41,42,35.8,0.156,35,0
|
||||
5,115,98,0,0,52.9,0.209,28,1
|
||||
3,150,76,0,0,21.0,0.207,37,0
|
||||
2,120,76,37,105,39.7,0.215,29,0
|
||||
10,161,68,23,132,25.5,0.326,47,1
|
||||
0,137,68,14,148,24.8,0.143,21,0
|
||||
0,128,68,19,180,30.5,1.391,25,1
|
||||
2,124,68,28,205,32.9,0.875,30,1
|
||||
6,80,66,30,0,26.2,0.313,41,0
|
||||
0,106,70,37,148,39.4,0.605,22,0
|
||||
2,155,74,17,96,26.6,0.433,27,1
|
||||
3,113,50,10,85,29.5,0.626,25,0
|
||||
7,109,80,31,0,35.9,1.127,43,1
|
||||
2,112,68,22,94,34.1,0.315,26,0
|
||||
3,99,80,11,64,19.3,0.284,30,0
|
||||
3,182,74,0,0,30.5,0.345,29,1
|
||||
3,115,66,39,140,38.1,0.150,28,0
|
||||
6,194,78,0,0,23.5,0.129,59,1
|
||||
4,129,60,12,231,27.5,0.527,31,0
|
||||
3,112,74,30,0,31.6,0.197,25,1
|
||||
0,124,70,20,0,27.4,0.254,36,1
|
||||
13,152,90,33,29,26.8,0.731,43,1
|
||||
2,112,75,32,0,35.7,0.148,21,0
|
||||
1,157,72,21,168,25.6,0.123,24,0
|
||||
1,122,64,32,156,35.1,0.692,30,1
|
||||
10,179,70,0,0,35.1,0.200,37,0
|
||||
2,102,86,36,120,45.5,0.127,23,1
|
||||
6,105,70,32,68,30.8,0.122,37,0
|
||||
8,118,72,19,0,23.1,1.476,46,0
|
||||
2,87,58,16,52,32.7,0.166,25,0
|
||||
1,180,0,0,0,43.3,0.282,41,1
|
||||
12,106,80,0,0,23.6,0.137,44,0
|
||||
1,95,60,18,58,23.9,0.260,22,0
|
||||
0,165,76,43,255,47.9,0.259,26,0
|
||||
0,117,0,0,0,33.8,0.932,44,0
|
||||
5,115,76,0,0,31.2,0.343,44,1
|
||||
9,152,78,34,171,34.2,0.893,33,1
|
||||
7,178,84,0,0,39.9,0.331,41,1
|
||||
1,130,70,13,105,25.9,0.472,22,0
|
||||
1,95,74,21,73,25.9,0.673,36,0
|
||||
1,0,68,35,0,32.0,0.389,22,0
|
||||
5,122,86,0,0,34.7,0.290,33,0
|
||||
8,95,72,0,0,36.8,0.485,57,0
|
||||
8,126,88,36,108,38.5,0.349,49,0
|
||||
1,139,46,19,83,28.7,0.654,22,0
|
||||
3,116,0,0,0,23.5,0.187,23,0
|
||||
3,99,62,19,74,21.8,0.279,26,0
|
||||
5,0,80,32,0,41.0,0.346,37,1
|
||||
4,92,80,0,0,42.2,0.237,29,0
|
||||
4,137,84,0,0,31.2,0.252,30,0
|
||||
3,61,82,28,0,34.4,0.243,46,0
|
||||
1,90,62,12,43,27.2,0.580,24,0
|
||||
3,90,78,0,0,42.7,0.559,21,0
|
||||
9,165,88,0,0,30.4,0.302,49,1
|
||||
1,125,50,40,167,33.3,0.962,28,1
|
||||
13,129,0,30,0,39.9,0.569,44,1
|
||||
12,88,74,40,54,35.3,0.378,48,0
|
||||
1,196,76,36,249,36.5,0.875,29,1
|
||||
5,189,64,33,325,31.2,0.583,29,1
|
||||
5,158,70,0,0,29.8,0.207,63,0
|
||||
5,103,108,37,0,39.2,0.305,65,0
|
||||
4,146,78,0,0,38.5,0.520,67,1
|
||||
4,147,74,25,293,34.9,0.385,30,0
|
||||
5,99,54,28,83,34.0,0.499,30,0
|
||||
6,124,72,0,0,27.6,0.368,29,1
|
||||
0,101,64,17,0,21.0,0.252,21,0
|
||||
3,81,86,16,66,27.5,0.306,22,0
|
||||
1,133,102,28,140,32.8,0.234,45,1
|
||||
3,173,82,48,465,38.4,2.137,25,1
|
||||
0,118,64,23,89,0.0,1.731,21,0
|
||||
0,84,64,22,66,35.8,0.545,21,0
|
||||
2,105,58,40,94,34.9,0.225,25,0
|
||||
2,122,52,43,158,36.2,0.816,28,0
|
||||
12,140,82,43,325,39.2,0.528,58,1
|
||||
0,98,82,15,84,25.2,0.299,22,0
|
||||
1,87,60,37,75,37.2,0.509,22,0
|
||||
4,156,75,0,0,48.3,0.238,32,1
|
||||
0,93,100,39,72,43.4,1.021,35,0
|
||||
1,107,72,30,82,30.8,0.821,24,0
|
||||
0,105,68,22,0,20.0,0.236,22,0
|
||||
1,109,60,8,182,25.4,0.947,21,0
|
||||
1,90,62,18,59,25.1,1.268,25,0
|
||||
1,125,70,24,110,24.3,0.221,25,0
|
||||
1,119,54,13,50,22.3,0.205,24,0
|
||||
5,116,74,29,0,32.3,0.660,35,1
|
||||
8,105,100,36,0,43.3,0.239,45,1
|
||||
5,144,82,26,285,32.0,0.452,58,1
|
||||
3,100,68,23,81,31.6,0.949,28,0
|
||||
1,100,66,29,196,32.0,0.444,42,0
|
||||
5,166,76,0,0,45.7,0.340,27,1
|
||||
1,131,64,14,415,23.7,0.389,21,0
|
||||
4,116,72,12,87,22.1,0.463,37,0
|
||||
4,158,78,0,0,32.9,0.803,31,1
|
||||
2,127,58,24,275,27.7,1.600,25,0
|
||||
3,96,56,34,115,24.7,0.944,39,0
|
||||
0,131,66,40,0,34.3,0.196,22,1
|
||||
3,82,70,0,0,21.1,0.389,25,0
|
||||
3,193,70,31,0,34.9,0.241,25,1
|
||||
4,95,64,0,0,32.0,0.161,31,1
|
||||
6,137,61,0,0,24.2,0.151,55,0
|
||||
5,136,84,41,88,35.0,0.286,35,1
|
||||
9,72,78,25,0,31.6,0.280,38,0
|
||||
5,168,64,0,0,32.9,0.135,41,1
|
||||
2,123,48,32,165,42.1,0.520,26,0
|
||||
4,115,72,0,0,28.9,0.376,46,1
|
||||
0,101,62,0,0,21.9,0.336,25,0
|
||||
8,197,74,0,0,25.9,1.191,39,1
|
||||
1,172,68,49,579,42.4,0.702,28,1
|
||||
6,102,90,39,0,35.7,0.674,28,0
|
||||
1,112,72,30,176,34.4,0.528,25,0
|
||||
1,143,84,23,310,42.4,1.076,22,0
|
||||
1,143,74,22,61,26.2,0.256,21,0
|
||||
0,138,60,35,167,34.6,0.534,21,1
|
||||
3,173,84,33,474,35.7,0.258,22,1
|
||||
1,97,68,21,0,27.2,1.095,22,0
|
||||
4,144,82,32,0,38.5,0.554,37,1
|
||||
1,83,68,0,0,18.2,0.624,27,0
|
||||
3,129,64,29,115,26.4,0.219,28,1
|
||||
1,119,88,41,170,45.3,0.507,26,0
|
||||
2,94,68,18,76,26.0,0.561,21,0
|
||||
0,102,64,46,78,40.6,0.496,21,0
|
||||
2,115,64,22,0,30.8,0.421,21,0
|
||||
8,151,78,32,210,42.9,0.516,36,1
|
||||
4,184,78,39,277,37.0,0.264,31,1
|
||||
0,94,0,0,0,0.0,0.256,25,0
|
||||
1,181,64,30,180,34.1,0.328,38,1
|
||||
0,135,94,46,145,40.6,0.284,26,0
|
||||
1,95,82,25,180,35.0,0.233,43,1
|
||||
2,99,0,0,0,22.2,0.108,23,0
|
||||
3,89,74,16,85,30.4,0.551,38,0
|
||||
1,80,74,11,60,30.0,0.527,22,0
|
||||
2,139,75,0,0,25.6,0.167,29,0
|
||||
1,90,68,8,0,24.5,1.138,36,0
|
||||
0,141,0,0,0,42.4,0.205,29,1
|
||||
12,140,85,33,0,37.4,0.244,41,0
|
||||
5,147,75,0,0,29.9,0.434,28,0
|
||||
1,97,70,15,0,18.2,0.147,21,0
|
||||
6,107,88,0,0,36.8,0.727,31,0
|
||||
0,189,104,25,0,34.3,0.435,41,1
|
||||
2,83,66,23,50,32.2,0.497,22,0
|
||||
4,117,64,27,120,33.2,0.230,24,0
|
||||
8,108,70,0,0,30.5,0.955,33,1
|
||||
4,117,62,12,0,29.7,0.380,30,1
|
||||
0,180,78,63,14,59.4,2.420,25,1
|
||||
1,100,72,12,70,25.3,0.658,28,0
|
||||
0,95,80,45,92,36.5,0.330,26,0
|
||||
0,104,64,37,64,33.6,0.510,22,1
|
||||
0,120,74,18,63,30.5,0.285,26,0
|
||||
1,82,64,13,95,21.2,0.415,23,0
|
||||
2,134,70,0,0,28.9,0.542,23,1
|
||||
0,91,68,32,210,39.9,0.381,25,0
|
||||
2,119,0,0,0,19.6,0.832,72,0
|
||||
2,100,54,28,105,37.8,0.498,24,0
|
||||
14,175,62,30,0,33.6,0.212,38,1
|
||||
1,135,54,0,0,26.7,0.687,62,0
|
||||
5,86,68,28,71,30.2,0.364,24,0
|
||||
10,148,84,48,237,37.6,1.001,51,1
|
||||
9,134,74,33,60,25.9,0.460,81,0
|
||||
9,120,72,22,56,20.8,0.733,48,0
|
||||
1,71,62,0,0,21.8,0.416,26,0
|
||||
8,74,70,40,49,35.3,0.705,39,0
|
||||
5,88,78,30,0,27.6,0.258,37,0
|
||||
10,115,98,0,0,24.0,1.022,34,0
|
||||
0,124,56,13,105,21.8,0.452,21,0
|
||||
0,74,52,10,36,27.8,0.269,22,0
|
||||
0,97,64,36,100,36.8,0.600,25,0
|
||||
8,120,0,0,0,30.0,0.183,38,1
|
||||
6,154,78,41,140,46.1,0.571,27,0
|
||||
1,144,82,40,0,41.3,0.607,28,0
|
||||
0,137,70,38,0,33.2,0.170,22,0
|
||||
0,119,66,27,0,38.8,0.259,22,0
|
||||
7,136,90,0,0,29.9,0.210,50,0
|
||||
4,114,64,0,0,28.9,0.126,24,0
|
||||
0,137,84,27,0,27.3,0.231,59,0
|
||||
2,105,80,45,191,33.7,0.711,29,1
|
||||
7,114,76,17,110,23.8,0.466,31,0
|
||||
8,126,74,38,75,25.9,0.162,39,0
|
||||
4,132,86,31,0,28.0,0.419,63,0
|
||||
3,158,70,30,328,35.5,0.344,35,1
|
||||
0,123,88,37,0,35.2,0.197,29,0
|
||||
4,85,58,22,49,27.8,0.306,28,0
|
||||
0,84,82,31,125,38.2,0.233,23,0
|
||||
0,145,0,0,0,44.2,0.630,31,1
|
||||
0,135,68,42,250,42.3,0.365,24,1
|
||||
1,139,62,41,480,40.7,0.536,21,0
|
||||
0,173,78,32,265,46.5,1.159,58,0
|
||||
4,99,72,17,0,25.6,0.294,28,0
|
||||
8,194,80,0,0,26.1,0.551,67,0
|
||||
2,83,65,28,66,36.8,0.629,24,0
|
||||
2,89,90,30,0,33.5,0.292,42,0
|
||||
4,99,68,38,0,32.8,0.145,33,0
|
||||
4,125,70,18,122,28.9,1.144,45,1
|
||||
3,80,0,0,0,0.0,0.174,22,0
|
||||
6,166,74,0,0,26.6,0.304,66,0
|
||||
5,110,68,0,0,26.0,0.292,30,0
|
||||
2,81,72,15,76,30.1,0.547,25,0
|
||||
7,195,70,33,145,25.1,0.163,55,1
|
||||
6,154,74,32,193,29.3,0.839,39,0
|
||||
2,117,90,19,71,25.2,0.313,21,0
|
||||
3,84,72,32,0,37.2,0.267,28,0
|
||||
6,0,68,41,0,39.0,0.727,41,1
|
||||
7,94,64,25,79,33.3,0.738,41,0
|
||||
3,96,78,39,0,37.3,0.238,40,0
|
||||
10,75,82,0,0,33.3,0.263,38,0
|
||||
0,180,90,26,90,36.5,0.314,35,1
|
||||
1,130,60,23,170,28.6,0.692,21,0
|
||||
2,84,50,23,76,30.4,0.968,21,0
|
||||
8,120,78,0,0,25.0,0.409,64,0
|
||||
12,84,72,31,0,29.7,0.297,46,1
|
||||
0,139,62,17,210,22.1,0.207,21,0
|
||||
9,91,68,0,0,24.2,0.200,58,0
|
||||
2,91,62,0,0,27.3,0.525,22,0
|
||||
3,99,54,19,86,25.6,0.154,24,0
|
||||
3,163,70,18,105,31.6,0.268,28,1
|
||||
9,145,88,34,165,30.3,0.771,53,1
|
||||
7,125,86,0,0,37.6,0.304,51,0
|
||||
13,76,60,0,0,32.8,0.180,41,0
|
||||
6,129,90,7,326,19.6,0.582,60,0
|
||||
2,68,70,32,66,25.0,0.187,25,0
|
||||
3,124,80,33,130,33.2,0.305,26,0
|
||||
6,114,0,0,0,0.0,0.189,26,0
|
||||
9,130,70,0,0,34.2,0.652,45,1
|
||||
3,125,58,0,0,31.6,0.151,24,0
|
||||
3,87,60,18,0,21.8,0.444,21,0
|
||||
1,97,64,19,82,18.2,0.299,21,0
|
||||
3,116,74,15,105,26.3,0.107,24,0
|
||||
0,117,66,31,188,30.8,0.493,22,0
|
||||
0,111,65,0,0,24.6,0.660,31,0
|
||||
2,122,60,18,106,29.8,0.717,22,0
|
||||
0,107,76,0,0,45.3,0.686,24,0
|
||||
1,86,66,52,65,41.3,0.917,29,0
|
||||
6,91,0,0,0,29.8,0.501,31,0
|
||||
1,77,56,30,56,33.3,1.251,24,0
|
||||
4,132,0,0,0,32.9,0.302,23,1
|
||||
0,105,90,0,0,29.6,0.197,46,0
|
||||
0,57,60,0,0,21.7,0.735,67,0
|
||||
0,127,80,37,210,36.3,0.804,23,0
|
||||
3,129,92,49,155,36.4,0.968,32,1
|
||||
8,100,74,40,215,39.4,0.661,43,1
|
||||
3,128,72,25,190,32.4,0.549,27,1
|
||||
10,90,85,32,0,34.9,0.825,56,1
|
||||
4,84,90,23,56,39.5,0.159,25,0
|
||||
1,88,78,29,76,32.0,0.365,29,0
|
||||
8,186,90,35,225,34.5,0.423,37,1
|
||||
5,187,76,27,207,43.6,1.034,53,1
|
||||
4,131,68,21,166,33.1,0.160,28,0
|
||||
1,164,82,43,67,32.8,0.341,50,0
|
||||
4,189,110,31,0,28.5,0.680,37,0
|
||||
1,116,70,28,0,27.4,0.204,21,0
|
||||
3,84,68,30,106,31.9,0.591,25,0
|
||||
6,114,88,0,0,27.8,0.247,66,0
|
||||
1,88,62,24,44,29.9,0.422,23,0
|
||||
1,84,64,23,115,36.9,0.471,28,0
|
||||
7,124,70,33,215,25.5,0.161,37,0
|
||||
1,97,70,40,0,38.1,0.218,30,0
|
||||
8,110,76,0,0,27.8,0.237,58,0
|
||||
11,103,68,40,0,46.2,0.126,42,0
|
||||
11,85,74,0,0,30.1,0.300,35,0
|
||||
6,125,76,0,0,33.8,0.121,54,1
|
||||
0,198,66,32,274,41.3,0.502,28,1
|
||||
1,87,68,34,77,37.6,0.401,24,0
|
||||
6,99,60,19,54,26.9,0.497,32,0
|
||||
0,91,80,0,0,32.4,0.601,27,0
|
||||
2,95,54,14,88,26.1,0.748,22,0
|
||||
1,99,72,30,18,38.6,0.412,21,0
|
||||
6,92,62,32,126,32.0,0.085,46,0
|
||||
4,154,72,29,126,31.3,0.338,37,0
|
||||
0,121,66,30,165,34.3,0.203,33,1
|
||||
3,78,70,0,0,32.5,0.270,39,0
|
||||
2,130,96,0,0,22.6,0.268,21,0
|
||||
3,111,58,31,44,29.5,0.430,22,0
|
||||
2,98,60,17,120,34.7,0.198,22,0
|
||||
1,143,86,30,330,30.1,0.892,23,0
|
||||
1,119,44,47,63,35.5,0.280,25,0
|
||||
6,108,44,20,130,24.0,0.813,35,0
|
||||
2,118,80,0,0,42.9,0.693,21,1
|
||||
10,133,68,0,0,27.0,0.245,36,0
|
||||
2,197,70,99,0,34.7,0.575,62,1
|
||||
0,151,90,46,0,42.1,0.371,21,1
|
||||
6,109,60,27,0,25.0,0.206,27,0
|
||||
12,121,78,17,0,26.5,0.259,62,0
|
||||
8,100,76,0,0,38.7,0.190,42,0
|
||||
8,124,76,24,600,28.7,0.687,52,1
|
||||
1,93,56,11,0,22.5,0.417,22,0
|
||||
8,143,66,0,0,34.9,0.129,41,1
|
||||
6,103,66,0,0,24.3,0.249,29,0
|
||||
3,176,86,27,156,33.3,1.154,52,1
|
||||
0,73,0,0,0,21.1,0.342,25,0
|
||||
11,111,84,40,0,46.8,0.925,45,1
|
||||
2,112,78,50,140,39.4,0.175,24,0
|
||||
3,132,80,0,0,34.4,0.402,44,1
|
||||
2,82,52,22,115,28.5,1.699,25,0
|
||||
6,123,72,45,230,33.6,0.733,34,0
|
||||
0,188,82,14,185,32.0,0.682,22,1
|
||||
0,67,76,0,0,45.3,0.194,46,0
|
||||
1,89,24,19,25,27.8,0.559,21,0
|
||||
1,173,74,0,0,36.8,0.088,38,1
|
||||
1,109,38,18,120,23.1,0.407,26,0
|
||||
1,108,88,19,0,27.1,0.400,24,0
|
||||
6,96,0,0,0,23.7,0.190,28,0
|
||||
1,124,74,36,0,27.8,0.100,30,0
|
||||
7,150,78,29,126,35.2,0.692,54,1
|
||||
4,183,0,0,0,28.4,0.212,36,1
|
||||
1,124,60,32,0,35.8,0.514,21,0
|
||||
1,181,78,42,293,40.0,1.258,22,1
|
||||
1,92,62,25,41,19.5,0.482,25,0
|
||||
0,152,82,39,272,41.5,0.270,27,0
|
||||
1,111,62,13,182,24.0,0.138,23,0
|
||||
3,106,54,21,158,30.9,0.292,24,0
|
||||
3,174,58,22,194,32.9,0.593,36,1
|
||||
7,168,88,42,321,38.2,0.787,40,1
|
||||
6,105,80,28,0,32.5,0.878,26,0
|
||||
11,138,74,26,144,36.1,0.557,50,1
|
||||
3,106,72,0,0,25.8,0.207,27,0
|
||||
6,117,96,0,0,28.7,0.157,30,0
|
||||
2,68,62,13,15,20.1,0.257,23,0
|
||||
9,112,82,24,0,28.2,1.282,50,1
|
||||
0,119,0,0,0,32.4,0.141,24,1
|
||||
2,112,86,42,160,38.4,0.246,28,0
|
||||
2,92,76,20,0,24.2,1.698,28,0
|
||||
6,183,94,0,0,40.8,1.461,45,0
|
||||
0,94,70,27,115,43.5,0.347,21,0
|
||||
2,108,64,0,0,30.8,0.158,21,0
|
||||
4,90,88,47,54,37.7,0.362,29,0
|
||||
0,125,68,0,0,24.7,0.206,21,0
|
||||
0,132,78,0,0,32.4,0.393,21,0
|
||||
5,128,80,0,0,34.6,0.144,45,0
|
||||
4,94,65,22,0,24.7,0.148,21,0
|
||||
7,114,64,0,0,27.4,0.732,34,1
|
||||
0,102,78,40,90,34.5,0.238,24,0
|
||||
2,111,60,0,0,26.2,0.343,23,0
|
||||
1,128,82,17,183,27.5,0.115,22,0
|
||||
10,92,62,0,0,25.9,0.167,31,0
|
||||
13,104,72,0,0,31.2,0.465,38,1
|
||||
5,104,74,0,0,28.8,0.153,48,0
|
||||
2,94,76,18,66,31.6,0.649,23,0
|
||||
7,97,76,32,91,40.9,0.871,32,1
|
||||
1,100,74,12,46,19.5,0.149,28,0
|
||||
0,102,86,17,105,29.3,0.695,27,0
|
||||
4,128,70,0,0,34.3,0.303,24,0
|
||||
6,147,80,0,0,29.5,0.178,50,1
|
||||
4,90,0,0,0,28.0,0.610,31,0
|
||||
3,103,72,30,152,27.6,0.730,27,0
|
||||
2,157,74,35,440,39.4,0.134,30,0
|
||||
1,167,74,17,144,23.4,0.447,33,1
|
||||
0,179,50,36,159,37.8,0.455,22,1
|
||||
11,136,84,35,130,28.3,0.260,42,1
|
||||
0,107,60,25,0,26.4,0.133,23,0
|
||||
1,91,54,25,100,25.2,0.234,23,0
|
||||
1,117,60,23,106,33.8,0.466,27,0
|
||||
5,123,74,40,77,34.1,0.269,28,0
|
||||
2,120,54,0,0,26.8,0.455,27,0
|
||||
1,106,70,28,135,34.2,0.142,22,0
|
||||
2,155,52,27,540,38.7,0.240,25,1
|
||||
2,101,58,35,90,21.8,0.155,22,0
|
||||
1,120,80,48,200,38.9,1.162,41,0
|
||||
11,127,106,0,0,39.0,0.190,51,0
|
||||
3,80,82,31,70,34.2,1.292,27,1
|
||||
10,162,84,0,0,27.7,0.182,54,0
|
||||
1,199,76,43,0,42.9,1.394,22,1
|
||||
8,167,106,46,231,37.6,0.165,43,1
|
||||
9,145,80,46,130,37.9,0.637,40,1
|
||||
6,115,60,39,0,33.7,0.245,40,1
|
||||
1,112,80,45,132,34.8,0.217,24,0
|
||||
4,145,82,18,0,32.5,0.235,70,1
|
||||
10,111,70,27,0,27.5,0.141,40,1
|
||||
6,98,58,33,190,34.0,0.430,43,0
|
||||
9,154,78,30,100,30.9,0.164,45,0
|
||||
6,165,68,26,168,33.6,0.631,49,0
|
||||
1,99,58,10,0,25.4,0.551,21,0
|
||||
10,68,106,23,49,35.5,0.285,47,0
|
||||
3,123,100,35,240,57.3,0.880,22,0
|
||||
8,91,82,0,0,35.6,0.587,68,0
|
||||
6,195,70,0,0,30.9,0.328,31,1
|
||||
9,156,86,0,0,24.8,0.230,53,1
|
||||
0,93,60,0,0,35.3,0.263,25,0
|
||||
3,121,52,0,0,36.0,0.127,25,1
|
||||
2,101,58,17,265,24.2,0.614,23,0
|
||||
2,56,56,28,45,24.2,0.332,22,0
|
||||
0,162,76,36,0,49.6,0.364,26,1
|
||||
0,95,64,39,105,44.6,0.366,22,0
|
||||
4,125,80,0,0,32.3,0.536,27,1
|
||||
5,136,82,0,0,0.0,0.640,69,0
|
||||
2,129,74,26,205,33.2,0.591,25,0
|
||||
3,130,64,0,0,23.1,0.314,22,0
|
||||
1,107,50,19,0,28.3,0.181,29,0
|
||||
1,140,74,26,180,24.1,0.828,23,0
|
||||
1,144,82,46,180,46.1,0.335,46,1
|
||||
8,107,80,0,0,24.6,0.856,34,0
|
||||
13,158,114,0,0,42.3,0.257,44,1
|
||||
2,121,70,32,95,39.1,0.886,23,0
|
||||
7,129,68,49,125,38.5,0.439,43,1
|
||||
2,90,60,0,0,23.5,0.191,25,0
|
||||
7,142,90,24,480,30.4,0.128,43,1
|
||||
3,169,74,19,125,29.9,0.268,31,1
|
||||
0,99,0,0,0,25.0,0.253,22,0
|
||||
4,127,88,11,155,34.5,0.598,28,0
|
||||
4,118,70,0,0,44.5,0.904,26,0
|
||||
2,122,76,27,200,35.9,0.483,26,0
|
||||
6,125,78,31,0,27.6,0.565,49,1
|
||||
1,168,88,29,0,35.0,0.905,52,1
|
||||
2,129,0,0,0,38.5,0.304,41,0
|
||||
4,110,76,20,100,28.4,0.118,27,0
|
||||
6,80,80,36,0,39.8,0.177,28,0
|
||||
10,115,0,0,0,0.0,0.261,30,1
|
||||
2,127,46,21,335,34.4,0.176,22,0
|
||||
9,164,78,0,0,32.8,0.148,45,1
|
||||
2,93,64,32,160,38.0,0.674,23,1
|
||||
3,158,64,13,387,31.2,0.295,24,0
|
||||
5,126,78,27,22,29.6,0.439,40,0
|
||||
10,129,62,36,0,41.2,0.441,38,1
|
||||
0,134,58,20,291,26.4,0.352,21,0
|
||||
3,102,74,0,0,29.5,0.121,32,0
|
||||
7,187,50,33,392,33.9,0.826,34,1
|
||||
3,173,78,39,185,33.8,0.970,31,1
|
||||
10,94,72,18,0,23.1,0.595,56,0
|
||||
1,108,60,46,178,35.5,0.415,24,0
|
||||
5,97,76,27,0,35.6,0.378,52,1
|
||||
4,83,86,19,0,29.3,0.317,34,0
|
||||
1,114,66,36,200,38.1,0.289,21,0
|
||||
1,149,68,29,127,29.3,0.349,42,1
|
||||
5,117,86,30,105,39.1,0.251,42,0
|
||||
1,111,94,0,0,32.8,0.265,45,0
|
||||
4,112,78,40,0,39.4,0.236,38,0
|
||||
1,116,78,29,180,36.1,0.496,25,0
|
||||
0,141,84,26,0,32.4,0.433,22,0
|
||||
2,175,88,0,0,22.9,0.326,22,0
|
||||
2,92,52,0,0,30.1,0.141,22,0
|
||||
3,130,78,23,79,28.4,0.323,34,1
|
||||
8,120,86,0,0,28.4,0.259,22,1
|
||||
2,174,88,37,120,44.5,0.646,24,1
|
||||
2,106,56,27,165,29.0,0.426,22,0
|
||||
2,105,75,0,0,23.3,0.560,53,0
|
||||
4,95,60,32,0,35.4,0.284,28,0
|
||||
0,126,86,27,120,27.4,0.515,21,0
|
||||
8,65,72,23,0,32.0,0.600,42,0
|
||||
2,99,60,17,160,36.6,0.453,21,0
|
||||
1,102,74,0,0,39.5,0.293,42,1
|
||||
11,120,80,37,150,42.3,0.785,48,1
|
||||
3,102,44,20,94,30.8,0.400,26,0
|
||||
1,109,58,18,116,28.5,0.219,22,0
|
||||
9,140,94,0,0,32.7,0.734,45,1
|
||||
13,153,88,37,140,40.6,1.174,39,0
|
||||
12,100,84,33,105,30.0,0.488,46,0
|
||||
1,147,94,41,0,49.3,0.358,27,1
|
||||
1,81,74,41,57,46.3,1.096,32,0
|
||||
3,187,70,22,200,36.4,0.408,36,1
|
||||
6,162,62,0,0,24.3,0.178,50,1
|
||||
4,136,70,0,0,31.2,1.182,22,1
|
||||
1,121,78,39,74,39.0,0.261,28,0
|
||||
3,108,62,24,0,26.0,0.223,25,0
|
||||
0,181,88,44,510,43.3,0.222,26,1
|
||||
8,154,78,32,0,32.4,0.443,45,1
|
||||
1,128,88,39,110,36.5,1.057,37,1
|
||||
7,137,90,41,0,32.0,0.391,39,0
|
||||
0,123,72,0,0,36.3,0.258,52,1
|
||||
1,106,76,0,0,37.5,0.197,26,0
|
||||
6,190,92,0,0,35.5,0.278,66,1
|
||||
2,88,58,26,16,28.4,0.766,22,0
|
||||
9,170,74,31,0,44.0,0.403,43,1
|
||||
9,89,62,0,0,22.5,0.142,33,0
|
||||
10,101,76,48,180,32.9,0.171,63,0
|
||||
2,122,70,27,0,36.8,0.340,27,0
|
||||
5,121,72,23,112,26.2,0.245,30,0
|
||||
1,126,60,0,0,30.1,0.349,47,1
|
||||
1,93,70,31,0,30.4,0.315,23,0
|
||||
892
data/titanic.csv
Normal file
@@ -0,0 +1,892 @@
|
||||
PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
|
||||
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
|
||||
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C
|
||||
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
|
||||
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
|
||||
5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S
|
||||
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
|
||||
7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
|
||||
8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S
|
||||
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S
|
||||
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C
|
||||
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S
|
||||
12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S
|
||||
13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S
|
||||
14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S
|
||||
15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S
|
||||
16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S
|
||||
17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q
|
||||
18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S
|
||||
19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S
|
||||
20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C
|
||||
21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S
|
||||
22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S
|
||||
23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q
|
||||
24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S
|
||||
25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S
|
||||
26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S
|
||||
27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C
|
||||
28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S
|
||||
29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
|
||||
30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S
|
||||
31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C
|
||||
32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
|
||||
33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q
|
||||
34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S
|
||||
35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C
|
||||
36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S
|
||||
37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C
|
||||
38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S
|
||||
39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S
|
||||
40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C
|
||||
41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S
|
||||
42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S
|
||||
43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C
|
||||
44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C
|
||||
45,1,3,"Devaney, Miss. Margaret Delia",female,19,0,0,330958,7.8792,,Q
|
||||
46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S
|
||||
47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q
|
||||
48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q
|
||||
49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C
|
||||
50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S
|
||||
51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S
|
||||
52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S
|
||||
53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C
|
||||
54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S
|
||||
55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C
|
||||
56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S
|
||||
57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S
|
||||
58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C
|
||||
59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S
|
||||
60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S
|
||||
61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C
|
||||
62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28,
|
||||
63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S
|
||||
64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S
|
||||
65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C
|
||||
66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C
|
||||
67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S
|
||||
68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S
|
||||
69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S
|
||||
70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S
|
||||
71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S
|
||||
72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S
|
||||
73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S
|
||||
74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C
|
||||
75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S
|
||||
76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S
|
||||
77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S
|
||||
78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S
|
||||
79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S
|
||||
80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S
|
||||
81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S
|
||||
82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S
|
||||
83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q
|
||||
84,0,1,"Carrau, Mr. Francisco M",male,28,0,0,113059,47.1,,S
|
||||
85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S
|
||||
86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S
|
||||
87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S
|
||||
88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S
|
||||
89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S
|
||||
90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S
|
||||
91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S
|
||||
92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S
|
||||
93,0,1,"Chaffee, Mr. Herbert Fuller",male,46,1,0,W.E.P. 5734,61.175,E31,S
|
||||
94,0,3,"Dean, Mr. Bertram Frank",male,26,1,2,C.A. 2315,20.575,,S
|
||||
95,0,3,"Coxon, Mr. Daniel",male,59,0,0,364500,7.25,,S
|
||||
96,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.05,,S
|
||||
97,0,1,"Goldschmidt, Mr. George B",male,71,0,0,PC 17754,34.6542,A5,C
|
||||
98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C
|
||||
99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S
|
||||
100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S
|
||||
101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S
|
||||
102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S
|
||||
103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S
|
||||
104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S
|
||||
105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S
|
||||
106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S
|
||||
107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S
|
||||
108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S
|
||||
109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S
|
||||
110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q
|
||||
111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S
|
||||
112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C
|
||||
113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S
|
||||
114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S
|
||||
115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C
|
||||
116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S
|
||||
117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q
|
||||
118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S
|
||||
119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C
|
||||
120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S
|
||||
121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S
|
||||
122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S
|
||||
123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C
|
||||
124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S
|
||||
125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S
|
||||
126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C
|
||||
127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q
|
||||
128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S
|
||||
129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C
|
||||
130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S
|
||||
131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C
|
||||
132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S
|
||||
133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47,1,0,A/5. 3337,14.5,,S
|
||||
134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S
|
||||
135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S
|
||||
136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C
|
||||
137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S
|
||||
138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S
|
||||
139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S
|
||||
140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C
|
||||
141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C
|
||||
142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S
|
||||
143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S
|
||||
144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q
|
||||
145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S
|
||||
146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S
|
||||
147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S
|
||||
148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S
|
||||
149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S
|
||||
150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S
|
||||
151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S
|
||||
152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22,1,0,113776,66.6,C2,S
|
||||
153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.05,,S
|
||||
154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S
|
||||
155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S
|
||||
156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C
|
||||
157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q
|
||||
158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S
|
||||
159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S
|
||||
160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
|
||||
161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S
|
||||
162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S
|
||||
163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S
|
||||
164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S
|
||||
165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S
|
||||
166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S
|
||||
167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S
|
||||
168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S
|
||||
169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S
|
||||
170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S
|
||||
171,0,1,"Van der hoef, Mr. Wyckoff",male,61,0,0,111240,33.5,B19,S
|
||||
172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q
|
||||
173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S
|
||||
174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S
|
||||
175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C
|
||||
176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S
|
||||
177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S
|
||||
178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C
|
||||
179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S
|
||||
180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S
|
||||
181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S
|
||||
182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C
|
||||
183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S
|
||||
184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S
|
||||
185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S
|
||||
186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S
|
||||
187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q
|
||||
188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S
|
||||
189,0,3,"Bourke, Mr. John",male,40,1,1,364849,15.5,,Q
|
||||
190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S
|
||||
191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S
|
||||
192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S
|
||||
193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S
|
||||
194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S
|
||||
195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C
|
||||
196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C
|
||||
197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q
|
||||
198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S
|
||||
199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q
|
||||
200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S
|
||||
201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S
|
||||
202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
|
||||
203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S
|
||||
204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C
|
||||
205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S
|
||||
206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S
|
||||
207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S
|
||||
208,1,3,"Albimona, Mr. Nassef Cassem",male,26,0,0,2699,18.7875,,C
|
||||
209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q
|
||||
210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C
|
||||
211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S
|
||||
212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S
|
||||
213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S
|
||||
214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S
|
||||
215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q
|
||||
216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C
|
||||
217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S
|
||||
218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S
|
||||
219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C
|
||||
220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S
|
||||
221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S
|
||||
222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S
|
||||
223,0,3,"Green, Mr. George Henry",male,51,0,0,21440,8.05,,S
|
||||
224,0,3,"Nenkoff, Mr. Christo",male,,0,0,349234,7.8958,,S
|
||||
225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38,1,0,19943,90,C93,S
|
||||
226,0,3,"Berglund, Mr. Karl Ivar Sven",male,22,0,0,PP 4348,9.35,,S
|
||||
227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S
|
||||
228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S
|
||||
229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S
|
||||
230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S
|
||||
231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S
|
||||
232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S
|
||||
233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S
|
||||
234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S
|
||||
235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S
|
||||
236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S
|
||||
237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S
|
||||
238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S
|
||||
239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S
|
||||
240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S
|
||||
241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C
|
||||
242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q
|
||||
243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S
|
||||
244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S
|
||||
245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C
|
||||
246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q
|
||||
247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S
|
||||
248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S
|
||||
249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S
|
||||
250,0,2,"Carter, Rev. Ernest Courtenay",male,54,1,0,244252,26,,S
|
||||
251,0,3,"Reed, Mr. James George",male,,0,0,362316,7.25,,S
|
||||
252,0,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29,1,1,347054,10.4625,G6,S
|
||||
253,0,1,"Stead, Mr. William Thomas",male,62,0,0,113514,26.55,C87,S
|
||||
254,0,3,"Lobb, Mr. William Arthur",male,30,1,0,A/5. 3336,16.1,,S
|
||||
255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S
|
||||
256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C
|
||||
257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C
|
||||
258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S
|
||||
259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C
|
||||
260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S
|
||||
261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q
|
||||
262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S
|
||||
263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S
|
||||
264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S
|
||||
265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
|
||||
266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S
|
||||
267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S
|
||||
268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S
|
||||
269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S
|
||||
270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S
|
||||
271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S
|
||||
272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S
|
||||
273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S
|
||||
274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C
|
||||
275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q
|
||||
276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S
|
||||
277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S
|
||||
278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S
|
||||
279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q
|
||||
280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S
|
||||
281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q
|
||||
282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S
|
||||
283,0,3,"de Pelsmaeker, Mr. Alfons",male,16,0,0,345778,9.5,,S
|
||||
284,1,3,"Dorking, Mr. Edward Arthur",male,19,0,0,A/5. 10482,8.05,,S
|
||||
285,0,1,"Smith, Mr. Richard William",male,,0,0,113056,26,A19,S
|
||||
286,0,3,"Stankovic, Mr. Ivan",male,33,0,0,349239,8.6625,,C
|
||||
287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S
|
||||
288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S
|
||||
289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S
|
||||
290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q
|
||||
291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S
|
||||
292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C
|
||||
293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C
|
||||
294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S
|
||||
295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S
|
||||
296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C
|
||||
297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C
|
||||
298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S
|
||||
299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S
|
||||
300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C
|
||||
301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q
|
||||
302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q
|
||||
303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S
|
||||
304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q
|
||||
305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S
|
||||
306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
|
||||
307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C
|
||||
308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C
|
||||
309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C
|
||||
310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C
|
||||
311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C
|
||||
312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C
|
||||
313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S
|
||||
314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S
|
||||
315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S
|
||||
316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S
|
||||
317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S
|
||||
318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S
|
||||
319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S
|
||||
320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C
|
||||
321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S
|
||||
322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S
|
||||
323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q
|
||||
324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S
|
||||
325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S
|
||||
326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C
|
||||
327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S
|
||||
328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S
|
||||
329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S
|
||||
330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C
|
||||
331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q
|
||||
332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
|
||||
333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S
|
||||
334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S
|
||||
335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S
|
||||
336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S
|
||||
337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S
|
||||
338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C
|
||||
339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S
|
||||
340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S
|
||||
341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S
|
||||
342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S
|
||||
343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S
|
||||
344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S
|
||||
345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S
|
||||
346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S
|
||||
347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S
|
||||
348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S
|
||||
349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S
|
||||
350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S
|
||||
351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S
|
||||
352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S
|
||||
353,0,3,"Elias, Mr. Tannous",male,15,1,1,2695,7.2292,,C
|
||||
354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S
|
||||
355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C
|
||||
356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S
|
||||
357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S
|
||||
358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S
|
||||
359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q
|
||||
360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q
|
||||
361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S
|
||||
362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C
|
||||
363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C
|
||||
364,0,3,"Asim, Mr. Adola",male,35,0,0,SOTON/O.Q. 3101310,7.05,,S
|
||||
365,0,3,"O'Brien, Mr. Thomas",male,,1,0,370365,15.5,,Q
|
||||
366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S
|
||||
367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C
|
||||
368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C
|
||||
369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q
|
||||
370,1,1,"Aubart, Mme. Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C
|
||||
371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C
|
||||
372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S
|
||||
373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S
|
||||
374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C
|
||||
375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S
|
||||
376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C
|
||||
377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S
|
||||
378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C
|
||||
379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C
|
||||
380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S
|
||||
381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C
|
||||
382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C
|
||||
383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S
|
||||
384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S
|
||||
385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S
|
||||
386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S
|
||||
387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S
|
||||
388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S
|
||||
389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q
|
||||
390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C
|
||||
391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S
|
||||
392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S
|
||||
393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S
|
||||
394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C
|
||||
395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S
|
||||
396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S
|
||||
397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S
|
||||
398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S
|
||||
399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S
|
||||
400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S
|
||||
401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S
|
||||
402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S
|
||||
403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S
|
||||
404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S
|
||||
405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S
|
||||
406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S
|
||||
407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S
|
||||
408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S
|
||||
409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S
|
||||
410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S
|
||||
411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S
|
||||
412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q
|
||||
413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q
|
||||
414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S
|
||||
415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S
|
||||
416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S
|
||||
417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S
|
||||
418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S
|
||||
419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S
|
||||
420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S
|
||||
421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C
|
||||
422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q
|
||||
423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S
|
||||
424,0,3,"Danbom, Mrs. Ernst Gilbert (Anna Sigrid Maria Brogren)",female,28,1,1,347080,14.4,,S
|
||||
425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S
|
||||
426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S
|
||||
427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S
|
||||
428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S
|
||||
429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q
|
||||
430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S
|
||||
431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S
|
||||
432,1,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",female,,1,0,376564,16.1,,S
|
||||
433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S
|
||||
434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S
|
||||
435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S
|
||||
436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S
|
||||
437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S
|
||||
438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S
|
||||
439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S
|
||||
440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S
|
||||
441,1,2,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",female,45,1,1,F.C.C. 13529,26.25,,S
|
||||
442,0,3,"Hampe, Mr. Leon",male,20,0,0,345769,9.5,,S
|
||||
443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S
|
||||
444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S
|
||||
445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S
|
||||
446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S
|
||||
447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S
|
||||
448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S
|
||||
449,1,3,"Baclini, Miss. Marie Catherine",female,5,2,1,2666,19.2583,,C
|
||||
450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S
|
||||
451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S
|
||||
452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S
|
||||
453,0,1,"Foreman, Mr. Benjamin Laventall",male,30,0,0,113051,27.75,C111,C
|
||||
454,1,1,"Goldenberg, Mr. Samuel L",male,49,1,0,17453,89.1042,C92,C
|
||||
455,0,3,"Peduzzi, Mr. Joseph",male,,0,0,A/5 2817,8.05,,S
|
||||
456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C
|
||||
457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S
|
||||
458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S
|
||||
459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S
|
||||
460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q
|
||||
461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S
|
||||
462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S
|
||||
463,0,1,"Gee, Mr. Arthur H",male,47,0,0,111320,38.5,E63,S
|
||||
464,0,2,"Milling, Mr. Jacob Christian",male,48,0,0,234360,13,,S
|
||||
465,0,3,"Maisner, Mr. Simon",male,,0,0,A/S 2816,8.05,,S
|
||||
466,0,3,"Goncalves, Mr. Manuel Estanslas",male,38,0,0,SOTON/O.Q. 3101306,7.05,,S
|
||||
467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S
|
||||
468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S
|
||||
469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q
|
||||
470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
|
||||
471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S
|
||||
472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S
|
||||
473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S
|
||||
474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C
|
||||
475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S
|
||||
476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S
|
||||
477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S
|
||||
478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S
|
||||
479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S
|
||||
480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S
|
||||
481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S
|
||||
482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S
|
||||
483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S
|
||||
484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S
|
||||
485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C
|
||||
486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S
|
||||
487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S
|
||||
488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C
|
||||
489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S
|
||||
490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S
|
||||
491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S
|
||||
492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S
|
||||
493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S
|
||||
494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C
|
||||
495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S
|
||||
496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C
|
||||
497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C
|
||||
498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S
|
||||
499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S
|
||||
500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S
|
||||
501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S
|
||||
502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q
|
||||
503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q
|
||||
504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S
|
||||
505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S
|
||||
506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C
|
||||
507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S
|
||||
508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S
|
||||
509,0,3,"Olsen, Mr. Henry Margido",male,28,0,0,C 4001,22.525,,S
|
||||
510,1,3,"Lang, Mr. Fang",male,26,0,0,1601,56.4958,,S
|
||||
511,1,3,"Daly, Mr. Eugene Patrick",male,29,0,0,382651,7.75,,Q
|
||||
512,0,3,"Webber, Mr. James",male,,0,0,SOTON/OQ 3101316,8.05,,S
|
||||
513,1,1,"McGough, Mr. James Robert",male,36,0,0,PC 17473,26.2875,E25,S
|
||||
514,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",female,54,1,0,PC 17603,59.4,,C
|
||||
515,0,3,"Coleff, Mr. Satio",male,24,0,0,349209,7.4958,,S
|
||||
516,0,1,"Walker, Mr. William Anderson",male,47,0,0,36967,34.0208,D46,S
|
||||
517,1,2,"Lemore, Mrs. (Amelia Milley)",female,34,0,0,C.A. 34260,10.5,F33,S
|
||||
518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q
|
||||
519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S
|
||||
520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S
|
||||
521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S
|
||||
522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S
|
||||
523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C
|
||||
524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C
|
||||
525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C
|
||||
526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q
|
||||
527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S
|
||||
528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S
|
||||
529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S
|
||||
530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S
|
||||
531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S
|
||||
532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C
|
||||
533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C
|
||||
534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C
|
||||
535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S
|
||||
536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S
|
||||
537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S
|
||||
538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C
|
||||
539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S
|
||||
540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C
|
||||
541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S
|
||||
542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S
|
||||
543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11,4,2,347082,31.275,,S
|
||||
544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S
|
||||
545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C
|
||||
546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S
|
||||
547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S
|
||||
548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C
|
||||
549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S
|
||||
550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S
|
||||
551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C
|
||||
552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S
|
||||
553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q
|
||||
554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C
|
||||
555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S
|
||||
556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S
|
||||
557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C
|
||||
558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C
|
||||
559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39,1,1,110413,79.65,E67,S
|
||||
560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36,1,0,345572,17.4,,S
|
||||
561,0,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.75,,Q
|
||||
562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S
|
||||
563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S
|
||||
564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S
|
||||
565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S
|
||||
566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S
|
||||
567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S
|
||||
568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S
|
||||
569,0,3,"Doharr, Mr. Tannous",male,,0,0,2686,7.2292,,C
|
||||
570,1,3,"Jonsson, Mr. Carl",male,32,0,0,350417,7.8542,,S
|
||||
571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S
|
||||
572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S
|
||||
573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S
|
||||
574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q
|
||||
575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 20589,8.05,,S
|
||||
576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S
|
||||
577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S
|
||||
578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S
|
||||
579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C
|
||||
580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S
|
||||
581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S
|
||||
582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C
|
||||
583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S
|
||||
584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C
|
||||
585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C
|
||||
586,1,1,"Taussig, Miss. Ruth",female,18,0,2,110413,79.65,E68,S
|
||||
587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S
|
||||
588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C
|
||||
589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S
|
||||
590,0,3,"Murdlin, Mr. Joseph",male,,0,0,A./5. 3235,8.05,,S
|
||||
591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S
|
||||
592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C
|
||||
593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S
|
||||
594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q
|
||||
595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S
|
||||
596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S
|
||||
597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S
|
||||
598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S
|
||||
599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C
|
||||
600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C
|
||||
601,1,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)",female,24,2,1,243847,27,,S
|
||||
602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.8958,,S
|
||||
603,0,1,"Harrington, Mr. Charles H",male,,0,0,113796,42.4,,S
|
||||
604,0,3,"Torber, Mr. Ernst William",male,44,0,0,364511,8.05,,S
|
||||
605,1,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35,0,0,111426,26.55,,C
|
||||
606,0,3,"Lindell, Mr. Edvard Bengtsson",male,36,1,0,349910,15.55,,S
|
||||
607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S
|
||||
608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S
|
||||
609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C
|
||||
610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S
|
||||
611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S
|
||||
612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S
|
||||
613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q
|
||||
614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q
|
||||
615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S
|
||||
616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S
|
||||
617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S
|
||||
618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S
|
||||
619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S
|
||||
620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S
|
||||
621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C
|
||||
622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S
|
||||
623,1,3,"Nakid, Mr. Sahid",male,20,1,1,2653,15.7417,,C
|
||||
624,0,3,"Hansen, Mr. Henry Damsgaard",male,21,0,0,350029,7.8542,,S
|
||||
625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S
|
||||
626,0,1,"Sutton, Mr. Frederick",male,61,0,0,36963,32.3208,D50,S
|
||||
627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q
|
||||
628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S
|
||||
629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S
|
||||
630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q
|
||||
631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S
|
||||
632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S
|
||||
633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C
|
||||
634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S
|
||||
635,0,3,"Skoog, Miss. Mabel",female,9,3,2,347088,27.9,,S
|
||||
636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S
|
||||
637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S
|
||||
638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S
|
||||
639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S
|
||||
640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S
|
||||
641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S
|
||||
642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C
|
||||
643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S
|
||||
644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S
|
||||
645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
|
||||
646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C
|
||||
647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S
|
||||
648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C
|
||||
649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S
|
||||
650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S
|
||||
651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S
|
||||
652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S
|
||||
653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S
|
||||
654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q
|
||||
655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q
|
||||
656,0,2,"Hickman, Mr. Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S
|
||||
657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S
|
||||
658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q
|
||||
659,0,2,"Eitemiller, Mr. George Floyd",male,23,0,0,29751,13,,S
|
||||
660,0,1,"Newell, Mr. Arthur Webster",male,58,0,2,35273,113.275,D48,C
|
||||
661,1,1,"Frauenthal, Dr. Henry William",male,50,2,0,PC 17611,133.65,,S
|
||||
662,0,3,"Badt, Mr. Mohamed",male,40,0,0,2623,7.225,,C
|
||||
663,0,1,"Colley, Mr. Edward Pomeroy",male,47,0,0,5727,25.5875,E58,S
|
||||
664,0,3,"Coleff, Mr. Peju",male,36,0,0,349210,7.4958,,S
|
||||
665,1,3,"Lindqvist, Mr. Eino William",male,20,1,0,STON/O 2. 3101285,7.925,,S
|
||||
666,0,2,"Hickman, Mr. Lewis",male,32,2,0,S.O.C. 14879,73.5,,S
|
||||
667,0,2,"Butler, Mr. Reginald Fenton",male,25,0,0,234686,13,,S
|
||||
668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S
|
||||
669,0,3,"Cook, Mr. Jacob",male,43,0,0,A/5 3536,8.05,,S
|
||||
670,1,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",female,,1,0,19996,52,C126,S
|
||||
671,1,2,"Brown, Mrs. Thomas William Solomon (Elizabeth Catherine Ford)",female,40,1,1,29750,39,,S
|
||||
672,0,1,"Davidson, Mr. Thornton",male,31,1,0,F.C. 12750,52,B71,S
|
||||
673,0,2,"Mitchell, Mr. Henry Michael",male,70,0,0,C.A. 24580,10.5,,S
|
||||
674,1,2,"Wilhelms, Mr. Charles",male,31,0,0,244270,13,,S
|
||||
675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S
|
||||
676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S
|
||||
677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S
|
||||
678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S
|
||||
679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S
|
||||
680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C
|
||||
681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q
|
||||
682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C
|
||||
683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S
|
||||
684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S
|
||||
685,0,2,"Brown, Mr. Thomas William Solomon",male,60,1,1,29750,39,,S
|
||||
686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25,1,2,SC/Paris 2123,41.5792,,C
|
||||
687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S
|
||||
688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S
|
||||
689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S
|
||||
690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S
|
||||
691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S
|
||||
692,1,3,"Karun, Miss. Manca",female,4,0,1,349256,13.4167,,C
|
||||
693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S
|
||||
694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C
|
||||
695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S
|
||||
696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S
|
||||
697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S
|
||||
698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q
|
||||
699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C
|
||||
700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S
|
||||
701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C
|
||||
702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S
|
||||
703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C
|
||||
704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q
|
||||
705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S
|
||||
706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S
|
||||
707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S
|
||||
708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S
|
||||
709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S
|
||||
710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C
|
||||
711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C
|
||||
712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S
|
||||
713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S
|
||||
714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S
|
||||
715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S
|
||||
716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S
|
||||
717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C
|
||||
718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S
|
||||
719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q
|
||||
720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S
|
||||
721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S
|
||||
722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S
|
||||
723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S
|
||||
724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S
|
||||
725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S
|
||||
726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S
|
||||
727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S
|
||||
728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q
|
||||
729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S
|
||||
730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S
|
||||
731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S
|
||||
732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C
|
||||
733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S
|
||||
734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S
|
||||
735,0,2,"Troupiansky, Mr. Moses Aaron",male,23,0,0,233639,13,,S
|
||||
736,0,3,"Williams, Mr. Leslie",male,28.5,0,0,54636,16.1,,S
|
||||
737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48,1,3,W./C. 6608,34.375,,S
|
||||
738,1,1,"Lesurer, Mr. Gustave J",male,35,0,0,PC 17755,512.3292,B101,C
|
||||
739,0,3,"Ivanoff, Mr. Kanio",male,,0,0,349201,7.8958,,S
|
||||
740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S
|
||||
741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S
|
||||
742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S
|
||||
743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C
|
||||
744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S
|
||||
745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S
|
||||
746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S
|
||||
747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S
|
||||
748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S
|
||||
749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S
|
||||
750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q
|
||||
751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S
|
||||
752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S
|
||||
753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S
|
||||
754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S
|
||||
755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S
|
||||
756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S
|
||||
757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S
|
||||
758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S
|
||||
759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S
|
||||
760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)",female,33,0,0,110152,86.5,B77,S
|
||||
761,0,3,"Garfirth, Mr. John",male,,0,0,358585,14.5,,S
|
||||
762,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41,0,0,SOTON/O2 3101272,7.125,,S
|
||||
763,1,3,"Barah, Mr. Hanna Assi",male,20,0,0,2663,7.2292,,C
|
||||
764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",female,36,1,2,113760,120,B96 B98,S
|
||||
765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S
|
||||
766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S
|
||||
767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C
|
||||
768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q
|
||||
769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q
|
||||
770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S
|
||||
771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S
|
||||
772,0,3,"Jensen, Mr. Niels Peder",male,48,0,0,350047,7.8542,,S
|
||||
773,0,2,"Mack, Mrs. (Mary)",female,57,0,0,S.O./P.P. 3,10.5,E77,S
|
||||
774,0,3,"Elias, Mr. Dibo",male,,0,0,2674,7.225,,C
|
||||
775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S
|
||||
776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S
|
||||
777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q
|
||||
778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S
|
||||
779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
|
||||
780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S
|
||||
781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C
|
||||
782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S
|
||||
783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S
|
||||
784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S
|
||||
785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S
|
||||
786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S
|
||||
787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S
|
||||
788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q
|
||||
789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S
|
||||
790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C
|
||||
791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q
|
||||
792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S
|
||||
793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
|
||||
794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C
|
||||
795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S
|
||||
796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S
|
||||
797,1,1,"Leader, Dr. Alice (Farnham)",female,49,0,0,17465,25.9292,D17,S
|
||||
798,1,3,"Osman, Mrs. Mara",female,31,0,0,349244,8.6833,,S
|
||||
799,0,3,"Ibrahim Shawah, Mr. Yousseff",male,30,0,0,2685,7.2292,,C
|
||||
800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S
|
||||
801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S
|
||||
802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S
|
||||
803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S
|
||||
804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
|
||||
805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S
|
||||
806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S
|
||||
807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S
|
||||
808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S
|
||||
809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S
|
||||
810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S
|
||||
811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S
|
||||
812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S
|
||||
813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S
|
||||
814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S
|
||||
815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S
|
||||
816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S
|
||||
817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S
|
||||
818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C
|
||||
819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S
|
||||
820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S
|
||||
821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S
|
||||
822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S
|
||||
823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S
|
||||
824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S
|
||||
825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S
|
||||
826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q
|
||||
827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S
|
||||
828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C
|
||||
829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q
|
||||
830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28,
|
||||
831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C
|
||||
832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S
|
||||
833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C
|
||||
834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S
|
||||
835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S
|
||||
836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C
|
||||
837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S
|
||||
838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S
|
||||
839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S
|
||||
840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C
|
||||
841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S
|
||||
842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S
|
||||
843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C
|
||||
844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C
|
||||
845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S
|
||||
846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S
|
||||
847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S
|
||||
848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C
|
||||
849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S
|
||||
850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C
|
||||
851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S
|
||||
852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S
|
||||
853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C
|
||||
854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S
|
||||
855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S
|
||||
856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S
|
||||
857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S
|
||||
858,1,1,"Daly, Mr. Peter Denis ",male,51,0,0,113055,26.55,E17,S
|
||||
859,1,3,"Baclini, Mrs. Solomon (Latifa Qurban)",female,24,0,3,2666,19.2583,,C
|
||||
860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
|
||||
861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S
|
||||
862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S
|
||||
863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S
|
||||
864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S
|
||||
865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S
|
||||
866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S
|
||||
867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C
|
||||
868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S
|
||||
869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S
|
||||
870,1,3,"Johnson, Master. Harold Theodor",male,4,1,1,347742,11.1333,,S
|
||||
871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S
|
||||
872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S
|
||||
873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S
|
||||
874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S
|
||||
875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C
|
||||
876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C
|
||||
877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S
|
||||
878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S
|
||||
879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S
|
||||
880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C
|
||||
881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S
|
||||
882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S
|
||||
883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S
|
||||
884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S
|
||||
885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S
|
||||
886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q
|
||||
887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S
|
||||
888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S
|
||||
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
|
||||
890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C
|
||||
891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q
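The rows above are the Titanic passenger records added as data/titanic.csv (891 rows with the standard fields PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked). A minimal loading sketch, assuming the repository root as the working directory; nothing here is part of the committed files:

import pandas as pd

# read the raw Titanic data shipped with the repo (relative path is an assumption)
data = pd.read_csv('./data/titanic.csv')
print(data.shape)            # (891, 12)
print(data.isnull().sum())   # Age, Cabin and Embarked contain missing values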
207
data_exploration/explore.py
Normal file
@@ -0,0 +1,207 @@
|
||||
#import pandas as pd
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import os
|
||||
plt.style.use('seaborn-colorblind')
|
||||
|
||||
# 2018.11.07 Created by Eamon.Zhang
|
||||
|
||||
|
||||
def get_dtypes(data,drop_col=[]):
|
||||
"""Return the dtypes for each column of a pandas Dataframe
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : pandas Dataframe
|
||||
|
||||
drop_col : list of column names to omit
|
||||
|
||||
Returns
|
||||
-------
|
||||
str_var_list, num_var_list, all_var_list
|
||||
|
||||
"""
|
||||
|
||||
name_of_col = list(data.columns)
|
||||
num_var_list = []
|
||||
str_var_list = []
|
||||
all_var_list = []
|
||||
|
||||
str_var_list = name_of_col.copy()
|
||||
for var in name_of_col:
|
||||
# check if column belongs to numeric type (np.int / np.float aliases are deprecated in newer NumPy)
if np.issubdtype(data[var].dtype, np.number):
|
||||
str_var_list.remove(var)
|
||||
num_var_list.append(var)
|
||||
# drop the omit column from list
|
||||
for var in drop_col:
|
||||
if var in str_var_list:
|
||||
str_var_list.remove(var)
|
||||
if var in num_var_list:
|
||||
num_var_list.remove(var)
|
||||
|
||||
all_var_list.extend(str_var_list)
|
||||
all_var_list.extend(num_var_list)
|
||||
return str_var_list, num_var_list, all_var_list
|
||||
|
||||
|
||||
def describe(data,output_path=None):
|
||||
"""output the general description of a pandas Dataframe
|
||||
into a csv file
|
||||
|
||||
"""
|
||||
|
||||
result = data.describe(include='all')
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'describe.csv')
|
||||
result.to_csv(output)
|
||||
print('result saved at:', str(output))
|
||||
return result
|
||||
|
||||
|
||||
def discrete_var_barplot(x,y,data,output_path=None):
|
||||
"""draw the barplot of a discrete variable x against y(target variable).
|
||||
By default the bar shows the mean value of y.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
plt.figure(figsize=(15,10))
|
||||
sns.barplot(x=x,y=y,data=data)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Barplot_'+str(x)+'_'+str(y)+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at', str(output))
|
||||
|
||||
|
||||
def discrete_var_countplot(x,data,output_path=None):
|
||||
"""draw the countplot of a discrete variable x.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
plt.figure(figsize=(15,10))
|
||||
sns.countplot(x=x,data=data)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Countplot_'+str(x)+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
|
||||
|
||||
def discrete_var_boxplot(x,y,data,output_path=None):
|
||||
"""draw the boxplot of a discrete variable x against y.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
plt.figure(figsize=(15,10))
|
||||
sns.boxplot(x=x,y=y,data=data)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Boxplot_'+str(x)+'_'+str(y)+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
|
||||
|
||||
def continuous_var_distplot(x,output_path=None,bins=None):
|
||||
"""draw the distplot of a continuous variable x.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
plt.figure(figsize=(15,10))
|
||||
sns.distplot(a=x,kde=False,bins=bins)
|
||||
if output_path is not None:
|
||||
output=os.path.join(output_path,'Distplot_'+str(x.name)+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
|
||||
|
||||
# 2018.11.28 Created by Eamon.Zhang
|
||||
|
||||
def scatter_plot(x,y,data,output_path=None):
|
||||
"""draw the scatter-plot of two variables.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
plt.figure(figsize=(15,10))
|
||||
sns.scatterplot(x=x,y=y,data=data)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Scatter_plot_'+str(x.name)+'_'+str(y.name)+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
|
||||
|
||||
def correlation_plot(data,output_path=None):
|
||||
"""draw the correlation plot between variables.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
corrmat = data.corr()
|
||||
fig, ax = plt.subplots()
|
||||
fig.set_size_inches(11,11)
|
||||
sns.heatmap(corrmat,cmap="YlGnBu",linewidths=.5,annot=True)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Corr_plot'+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
|
||||
|
||||
def heatmap(data,output_path=None,fmt='d'):
|
||||
"""draw the heatmap between 2 variables.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
fig.set_size_inches(11,11)
|
||||
sns.heatmap(data,cmap="YlGnBu",linewidths=.5,annot=True,fmt=fmt)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Heatmap'+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
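A minimal usage sketch for the exploration helpers defined above. It assumes the repository root is on the Python path (so data_exploration is importable) and that an ./output/ directory already exists; both are assumptions, not part of the file:

import pandas as pd
from data_exploration import explore

data = pd.read_csv('./data/titanic.csv')

# split columns into string vs. numeric name lists, skipping the identifier column
str_vars, num_vars, all_vars = explore.get_dtypes(data, drop_col=['PassengerId'])

# save describe() output and a few plots; './output/' is an illustrative path
explore.describe(data, output_path='./output/')
explore.discrete_var_barplot(x='Sex', y='Survived', data=data, output_path='./output/')
explore.discrete_var_boxplot(x='Pclass', y='Fare', data=data, output_path='./output/')
explore.continuous_var_distplot(x=data['Fare'], output_path='./output/', bins=50)
explore.correlation_plot(data[num_vars], output_path='./output/')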
122
feature_cleaning/missing_data.py
Normal file
@@ -0,0 +1,122 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from warnings import warn
|
||||
|
||||
# 2018.11.07 Created by Eamon.Zhang
|
||||
|
||||
|
||||
def check_missing(data,output_path=None):
|
||||
"""
|
||||
check the total number & percentage of missing values
|
||||
per variable of a pandas Dataframe
|
||||
"""
|
||||
|
||||
result = pd.concat([data.isnull().sum(),data.isnull().mean()],axis=1)
|
||||
result = result.rename(index=str,columns={0:'total missing',1:'proportion'})
|
||||
if output_path is not None:
|
||||
result.to_csv(output_path+'missing.csv')
|
||||
print('result saved at:', output_path+'missing.csv')
|
||||
return result
|
||||
|
||||
|
||||
def drop_missing(data,axis=0):
|
||||
"""
|
||||
Listwise deletion:
|
||||
excluding all cases (listwise) that have missing values
|
||||
|
||||
Parameters
|
||||
----------
|
||||
axis: drop rows (0) or columns (1), default 0
|
||||
|
||||
Returns
|
||||
-------
|
||||
Pandas dataframe with missing cases/columns dropped
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
data_copy = data_copy.dropna(axis=axis,inplace=False)
|
||||
return data_copy
|
||||
|
||||
|
||||
def add_var_denote_NA(data,NA_col=[]):
|
||||
"""
|
||||
creating an additional variable indicating whether the data
|
||||
was missing for that observation (1) or not (0).
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in NA_col:
|
||||
if data_copy[i].isnull().sum()>0:
|
||||
data_copy[i+'_is_NA'] = np.where(data_copy[i].isnull(),1,0)
|
||||
else:
|
||||
warn("Column %s has no missing cases" % i)
|
||||
|
||||
return data_copy
|
||||
|
||||
|
||||
def impute_NA_with_arbitrary(data,impute_value,NA_col=[]):
|
||||
"""
|
||||
replacing NA with arbitrary values.
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in NA_col:
|
||||
if data_copy[i].isnull().sum()>0:
|
||||
data_copy[i+'_'+str(impute_value)] = data_copy[i].fillna(impute_value)
|
||||
else:
|
||||
warn("Column %s has no missing cases" % i)
|
||||
return data_copy
|
||||
|
||||
|
||||
def impute_NA_with_avg(data,strategy='mean',NA_col=[]):
|
||||
"""
|
||||
replacing the NA with the mean/median/most frequent value of that variable.
Note: the statistic should be computed on the training set only and then applied to the test set.
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in NA_col:
|
||||
if data_copy[i].isnull().sum()>0:
|
||||
if strategy=='mean':
|
||||
data_copy[i+'_impute_mean'] = data_copy[i].fillna(data[i].mean())
|
||||
elif strategy=='median':
|
||||
data_copy[i+'_impute_median'] = data_copy[i].fillna(data[i].median())
|
||||
elif strategy=='mode':
|
||||
data_copy[i+'_impute_mode'] = data_copy[i].fillna(data[i].mode()[0])
|
||||
else:
|
||||
warn("Column %s has no missing" % i)
|
||||
return data_copy
|
||||
|
||||
|
||||
def impute_NA_with_end_of_distribution(data,NA_col=[]):
|
||||
"""
|
||||
replacing the NA with a value at the far end of the variable's distribution,
computed as mean + 3*std.
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in NA_col:
|
||||
if data_copy[i].isnull().sum()>0:
|
||||
data_copy[i+'_impute_end_of_distri'] = data_copy[i].fillna(data[i].mean()+3*data[i].std())
|
||||
else:
|
||||
warn("Column %s has no missing" % i)
|
||||
return data_copy
|
||||
|
||||
|
||||
def impute_NA_with_random(data,NA_col=[],random_state=0):
|
||||
"""
|
||||
replacing the NA with random sampling from the pool of available observations of the variable
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in NA_col:
|
||||
if data_copy[i].isnull().sum()>0:
|
||||
data_copy[i+'_random'] = data_copy[i]
|
||||
# extract the random sample to fill the na
|
||||
random_sample = data_copy[i].dropna().sample(data_copy[i].isnull().sum(), random_state=random_state)
|
||||
random_sample.index = data_copy[data_copy[i].isnull()].index
|
||||
data_copy.loc[data_copy[i].isnull(), str(i)+'_random'] = random_sample
|
||||
else:
|
||||
warn("Column %s has no missing" % i)
|
||||
return data_copy
|
||||
|
||||
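A short usage sketch for the missing-data helpers above, applied to the Titanic Age column; the chosen column and strategies are illustrative, not prescriptive:

import pandas as pd
from feature_cleaning import missing_data as ms

data = pd.read_csv('./data/titanic.csv')   # path is an assumption

print(ms.check_missing(data))                                           # NA count & proportion per column
data = ms.add_var_denote_NA(data, NA_col=['Age'])                       # adds an Age_is_NA indicator
data = ms.impute_NA_with_avg(data, strategy='median', NA_col=['Age'])   # adds Age_impute_median
data = ms.impute_NA_with_end_of_distribution(data, NA_col=['Age'])      # adds Age_impute_end_of_distri
data = ms.impute_NA_with_random(data, NA_col=['Age'], random_state=0)   # adds Age_random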
138
feature_cleaning/outlier.py
Normal file
@@ -0,0 +1,138 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
# from warnings import warn
|
||||
|
||||
# 2018.11.07 Created by Eamon.Zhang
|
||||
|
||||
def outlier_detect_arbitrary(data,col,upper_fence,lower_fence):
|
||||
'''
|
||||
identify outliers based on arbitrary boundaries passed to the function.
|
||||
'''
|
||||
|
||||
para = (upper_fence, lower_fence)
|
||||
tmp = pd.concat([data[col]>upper_fence,data[col]<lower_fence],axis=1)
|
||||
outlier_index = tmp.any(axis=1)
|
||||
print('Num of outlier detected:',outlier_index.sum())
print('Proportion of outlier detected:',outlier_index.mean())
|
||||
return outlier_index, para
|
||||
|
||||
|
||||
|
||||
def outlier_detect_IQR(data,col,threshold=3):
|
||||
'''
outlier detection by the Interquartile Range (IQR) rule, also known as Tukey's fences.
calculate the IQR (75th quantile - 25th quantile). Any value beyond
upper bound = 75th quantile + (IQR * threshold)
lower bound = 25th quantile - (IQR * threshold)
is regarded as an outlier. Default threshold is 3.
'''
|
||||
|
||||
IQR = data[col].quantile(0.75) - data[col].quantile(0.25)
|
||||
Lower_fence = data[col].quantile(0.25) - (IQR * threshold)
|
||||
Upper_fence = data[col].quantile(0.75) + (IQR * threshold)
|
||||
para = (Upper_fence, Lower_fence)
|
||||
tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
|
||||
outlier_index = tmp.any(axis=1)
|
||||
print('Num of outlier detected:',outlier_index.sum())
print('Proportion of outlier detected:',outlier_index.mean())
|
||||
return outlier_index, para
|
||||
|
||||
|
||||
def outlier_detect_mean_std(data,col,threshold=3):
|
||||
'''
|
||||
outlier detection by Mean and Standard Deviation Method.
|
||||
If a value is a certain number (the threshold) of standard deviations away
|
||||
from the mean, that data point is identified as an outlier.
|
||||
Default threshold is 3.
|
||||
|
||||
This method can fail to detect outliers because the outliers increase the standard deviation.
|
||||
The more extreme the outlier, the more the standard deviation is affected.
|
||||
'''
|
||||
|
||||
Upper_fence = data[col].mean() + threshold * data[col].std()
|
||||
Lower_fence = data[col].mean() - threshold * data[col].std()
|
||||
para = (Upper_fence, Lower_fence)
|
||||
tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
|
||||
outlier_index = tmp.any(axis=1)
|
||||
print('Num of outlier detected:',outlier_index.sum())
print('Proportion of outlier detected:',outlier_index.mean())
|
||||
return outlier_index, para
|
||||
|
||||
|
||||
def outlier_detect_MAD(data,col,threshold=3.5):
|
||||
"""
|
||||
outlier detection by Median and Median Absolute Deviation Method (MAD)
|
||||
The median of the residuals is calculated. Then, the difference is calculated between each historical value and this median.
|
||||
These differences are expressed as their absolute values, and a new median is calculated and multiplied by
|
||||
an empirically derived constant to yield the median absolute deviation (MAD).
|
||||
If a value is a certain number of MAD away from the median of the residuals,
|
||||
that value is classified as an outlier. The default threshold is 3 MAD.
|
||||
|
||||
This method is generally more effective than the mean and standard deviation method for detecting outliers,
|
||||
but it can be too aggressive in classifying values that are not really extremely different.
|
||||
Also, if more than 50% of the data points have the same value, MAD is computed to be 0,
|
||||
so any value different from the residual median is classified as an outlier.
|
||||
"""
|
||||
|
||||
median = data[col].median()
|
||||
median_absolute_deviation = np.median([np.abs(y - median) for y in data[col]])
|
||||
modified_z_scores = pd.Series([0.6745 * (y - median) / median_absolute_deviation for y in data[col]])
|
||||
outlier_index = np.abs(modified_z_scores) > threshold
|
||||
print('Num of outlier detected:',outlier_index.sum())
print('Proportion of outlier detected:',outlier_index.mean())
|
||||
return outlier_index
|
||||
|
||||
|
||||
# 2018.11.10 outlier treatment
|
||||
def impute_outlier_with_arbitrary(data,outlier_index,value,col=[]):
|
||||
"""
|
||||
impute outliers with arbitrary value
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in col:
|
||||
data_copy.loc[outlier_index,i] = value
|
||||
return data_copy
|
||||
|
||||
|
||||
def windsorization(data,col,para,strategy='both'):
|
||||
"""
|
||||
top-coding & bottom coding (capping the maximum of a distribution at an arbitrarily set value,vice versa)
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
if strategy == 'both':
|
||||
data_copy.loc[data_copy[col]>para[0],col] = para[0]
|
||||
data_copy.loc[data_copy[col]<para[1],col] = para[1]
|
||||
elif strategy == 'top':
|
||||
data_copy.loc[data_copy[col]>para[0],col] = para[0]
|
||||
elif strategy == 'bottom':
|
||||
data_copy.loc[data_copy[col]<para[1],col] = para[1]
|
||||
return data_copy
|
||||
|
||||
|
||||
def drop_outlier(data,outlier_index):
|
||||
"""
|
||||
drop the cases that are outliers
|
||||
"""
|
||||
|
||||
data_copy = data[~outlier_index]
|
||||
return data_copy
|
||||
|
||||
|
||||
def impute_outlier_with_avg(data,col,outlier_index,strategy='mean'):
|
||||
"""
|
||||
impute outlier with mean/median/most frequent values of that variable.
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
if strategy=='mean':
|
||||
data_copy.loc[outlier_index,col] = data_copy[col].mean()
|
||||
elif strategy=='median':
|
||||
data_copy.loc[outlier_index,col] = data_copy[col].median()
|
||||
elif strategy=='mode':
|
||||
data_copy.loc[outlier_index,col] = data_copy[col].mode()[0]
|
||||
|
||||
return data_copy
|
||||
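A short usage sketch for the outlier helpers above, flagging and then treating extreme Fare values; the column and thresholds are illustrative:

import pandas as pd
from feature_cleaning import outlier

data = pd.read_csv('./data/titanic.csv')   # path is an assumption

# flag Fare outliers with the IQR rule, then cap them at the computed fences
index, para = outlier.outlier_detect_IQR(data, col='Fare', threshold=3)
capped = outlier.windsorization(data, col='Fare', para=para, strategy='both')

# alternatively, drop the flagged rows or overwrite them with the median
dropped = outlier.drop_outlier(data, index)
imputed = outlier.impute_outlier_with_avg(data, col='Fare', outlier_index=index, strategy='median')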
247
feature_cleaning/rare_values.py
Normal file
@@ -0,0 +1,247 @@
|
||||
import pandas as pd
|
||||
# import numpy as np
|
||||
# from warnings import warn
|
||||
|
||||
# 2018.11.07 Created by Eamon.Zhang
|
||||
# 2018.11.12 change into fit() transform() format
|
||||
|
||||
class GroupingRareValues():
|
||||
"""
|
||||
Grouping the observations that show rare labels into a unique category ('rare')
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, mapping=None, cols=None, threshold=0.01):
|
||||
self.cols = cols
|
||||
self.mapping = mapping
|
||||
self._dim = None
|
||||
self.threshold = threshold
|
||||
|
||||
|
||||
def fit(self, X, y=None, **kwargs):
|
||||
"""Fit encoder according to X and y.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Training vectors, where n_samples is the number of samples
|
||||
and n_features is the number of features.
|
||||
y : array-like, shape = [n_samples]
|
||||
Target values.
|
||||
Returns
|
||||
-------
|
||||
self : encoder
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
self._dim = X.shape[1]
|
||||
|
||||
_, categories = self.grouping(
|
||||
X,
|
||||
mapping=self.mapping,
|
||||
cols=self.cols,
|
||||
threshold=self.threshold
|
||||
)
|
||||
self.mapping = categories
|
||||
return self
|
||||
|
||||
|
||||
def transform(self, X):
|
||||
"""Perform the transformation to new categorical data.
|
||||
Will use the mapping (if available) and the column list to encode the
|
||||
data.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Returns
|
||||
-------
|
||||
X : Transformed values with encoding applied.
|
||||
"""
|
||||
|
||||
if self._dim is None:
|
||||
raise ValueError('Must train encoder before it can be used to transform data.')
|
||||
|
||||
# make sure that it is the right size
|
||||
if X.shape[1] != self._dim:
|
||||
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
|
||||
|
||||
X, _ = self.grouping(
|
||||
X,
|
||||
mapping=self.mapping,
|
||||
cols=self.cols,
|
||||
threshold=self.threshold
|
||||
)
|
||||
|
||||
return X
|
||||
|
||||
|
||||
def grouping(self, X_in, threshold, mapping=None, cols=None):
|
||||
"""
|
||||
Grouping the observations that show rare labels into a unique category ('rare')
|
||||
|
||||
"""
|
||||
|
||||
X = X_in.copy(deep=True)
|
||||
|
||||
# if cols is None:
|
||||
# cols = X.columns.values
|
||||
|
||||
if mapping is not None: # transform
|
||||
mapping_out = mapping
|
||||
for i in mapping:
|
||||
column = i.get('col') # get the column name
|
||||
X[column] = X[column].map(i['mapping'])
|
||||
|
||||
# try:
|
||||
# X[column] = X[column].astype(int)
|
||||
# except ValueError as e:
|
||||
# X[column] = X[column].astype(float)
|
||||
else: # fit
|
||||
mapping_out = []
|
||||
for col in cols:
|
||||
# if util.is_category(X[col].dtype):
|
||||
# categories = X[col].cat.categories
|
||||
# else:
|
||||
temp_df = pd.Series(X[col].value_counts()/len(X))
|
||||
mapping = { k: ('rare' if k not in temp_df[temp_df >= threshold].index else k)
|
||||
for k in temp_df.index}
|
||||
|
||||
mapping = pd.Series(mapping)
|
||||
mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, )
|
||||
|
||||
return X, mapping_out
|
||||
|
||||
|
||||
|
||||
#==============================================================================
|
||||
# def rare_imputation(X_train, X_test, variable):
|
||||
#
|
||||
# # find the most frequent category
|
||||
# frequent_cat = X_train.groupby(variable)[variable].count().sort_values().tail(1).index.values[0]
|
||||
#
|
||||
# # find rare labels
|
||||
# temp = X_train.groupby([variable])[variable].count()/np.float(len(X_train))
|
||||
# rare_cat = [x for x in temp.loc[temp<0.05].index.values]
|
||||
#
|
||||
# # create new variables, with Rare labels imputed
|
||||
#
|
||||
# # by the most frequent category
|
||||
# X_train[variable+'_freq_imp'] = np.where(X_train[variable].isin(rare_cat), frequent_cat, X_train[variable])
|
||||
# X_test[variable+'_freq_imp'] = np.where(X_test[variable].isin(rare_cat), frequent_cat, X_test[variable])
|
||||
#
|
||||
# # by adding a new label 'Rare'
|
||||
# X_train[variable+'_rare_imp'] = np.where(X_train[variable].isin(rare_cat), 'Rare', X_train[variable])
|
||||
# X_test[variable+'_rare_imp'] = np.where(X_test[variable].isin(rare_cat), 'Rare', X_test[variable])
|
||||
#==============================================================================
|
||||
|
||||
# 2018.11.26 created by Eamon.Zhang
|
||||
class ModeImputation():
|
||||
"""
|
||||
Replacing the rare label by most frequent label
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, mapping=None, cols=None, threshold=0.01):
|
||||
self.cols = cols
|
||||
self.mapping = mapping
|
||||
self._dim = None
|
||||
self.threshold = threshold
|
||||
|
||||
|
||||
def fit(self, X, y=None, **kwargs):
|
||||
"""Fit encoder according to X and y.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Training vectors, where n_samples is the number of samples
|
||||
and n_features is the number of features.
|
||||
y : array-like, shape = [n_samples]
|
||||
Target values.
|
||||
Returns
|
||||
-------
|
||||
self : encoder
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
self._dim = X.shape[1]
|
||||
|
||||
_, categories = self.impute_with_mode(
|
||||
X,
|
||||
mapping=self.mapping,
|
||||
cols=self.cols,
|
||||
threshold=self.threshold
|
||||
)
|
||||
self.mapping = categories
|
||||
return self
|
||||
|
||||
|
||||
def transform(self, X):
|
||||
"""Perform the transformation to new categorical data.
|
||||
Will use the mapping (if available) and the column list to encode the
|
||||
data.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Returns
|
||||
-------
|
||||
X : Transformed values with encoding applied.
|
||||
"""
|
||||
|
||||
if self._dim is None:
|
||||
raise ValueError('Must train encoder before it can be used to transform data.')
|
||||
|
||||
# make sure that it is the right size
|
||||
if X.shape[1] != self._dim:
|
||||
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
|
||||
|
||||
X, _ = self.impute_with_mode(
|
||||
X,
|
||||
mapping=self.mapping,
|
||||
cols=self.cols,
|
||||
threshold=self.threshold
|
||||
)
|
||||
|
||||
return X
|
||||
|
||||
|
||||
def impute_with_mode(self, X_in, threshold, mapping=None, cols=None):
|
||||
"""
|
||||
Replacing the labels whose frequency falls below the threshold with the most frequent label (mode)
|
||||
|
||||
"""
|
||||
|
||||
X = X_in.copy(deep=True)
|
||||
|
||||
# if cols is None:
|
||||
# cols = X.columns.values
|
||||
|
||||
if mapping is not None: # transform
|
||||
mapping_out = mapping
|
||||
for i in mapping:
|
||||
column = i.get('col') # get the column name
|
||||
X[column] = X[column].map(i['mapping'])
|
||||
|
||||
# try:
|
||||
# X[column] = X[column].astype(int)
|
||||
# except ValueError as e:
|
||||
# X[column] = X[column].astype(float)
|
||||
else: # fit
|
||||
mapping_out = []
|
||||
for col in cols:
|
||||
# if util.is_category(X[col].dtype):
|
||||
# categories = X[col].cat.categories
|
||||
# else:
|
||||
temp_df = pd.Series(X[col].value_counts()/len(X))
|
||||
mode_label = X[col].mode()[0]  # most frequent label, used to replace rare labels
mapping = { k: (mode_label if k not in temp_df[temp_df >= threshold].index else k)
for k in temp_df.index}
|
||||
|
||||
mapping = pd.Series(mapping)
|
||||
mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, )
|
||||
|
||||
return X, mapping_out
|
||||
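A short usage sketch for the two rare-label encoders above; the SibSp column and the 1% threshold are illustrative choices, not part of the file:

import pandas as pd
from feature_cleaning import rare_values as ra

data = pd.read_csv('./data/titanic.csv')   # path is an assumption

# group labels occurring in less than 1% of rows into a single 'rare' category
grouper = ra.GroupingRareValues(cols=['SibSp'], threshold=0.01).fit(data)
grouped = grouper.transform(data)

# or replace those rare labels with the most frequent label instead
imputer = ra.ModeImputation(cols=['SibSp'], threshold=0.01).fit(data)
imputed = imputer.transform(data)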
329
feature_engineering/discretization.py
Normal file
@@ -0,0 +1,329 @@
|
||||
import pandas as pd
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.model_selection import cross_val_score
|
||||
import numpy as np
|
||||
|
||||
# from warnings import warn
|
||||
|
||||
# 2018.11.17 Created by Eamon.Zhang
|
||||
# ChiMerge method modified from https://github.com/tatsumiw/ChiMerge/blob/master/ChiMerge.py
# TODO: add more constraints to the discretized result.
|
||||
class ChiMerge():
|
||||
"""
|
||||
supervised discretization using the ChiMerge method.
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
confidenceVal: number
|
||||
default=3.841, correspond to p=0.05 dof=1
|
||||
num_of_bins: int
|
||||
number of bins after discretize
|
||||
col: str
|
||||
the column to be performed
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, col=None, bins=None, confidenceVal=3.841, num_of_bins=10):
|
||||
self.col = col
|
||||
self._dim = None
|
||||
self.confidenceVal = confidenceVal
|
||||
self.bins = bins
|
||||
self.num_of_bins = num_of_bins
|
||||
|
||||
|
||||
def fit(self, X, y, **kwargs):
|
||||
"""Fit encoder according to X and y.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Training vectors, where n_samples is the number of samples
|
||||
and n_features is the number of features.
|
||||
y : array-like, shape = [n_samples]
|
||||
Target values.
|
||||
Returns
|
||||
-------
|
||||
self : encoder
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
self._dim = X.shape[1]
|
||||
|
||||
_, bins = self.chimerge(
|
||||
X_in=X,
|
||||
y=y,
|
||||
confidenceVal=self.confidenceVal,
|
||||
col=self.col,
|
||||
num_of_bins=self.num_of_bins
|
||||
)
|
||||
self.bins = bins
|
||||
return self
|
||||
|
||||
|
||||
def transform(self, X):
|
||||
"""Perform the transformation to new data.
|
||||
Will use the tree model and the column list to discretize the
|
||||
column.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Returns
|
||||
-------
|
||||
X : new dataframe with discretized new column.
|
||||
"""
|
||||
|
||||
if self._dim is None:
|
||||
raise ValueError('Must train encoder before it can be used to transform data.')
|
||||
|
||||
# make sure that it is the right size
|
||||
if X.shape[1] != self._dim:
|
||||
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
|
||||
|
||||
X, _ = self.chimerge(
|
||||
X_in=X,
|
||||
col=self.col,
|
||||
bins=self.bins
|
||||
)
|
||||
|
||||
return X
|
||||
|
||||
def chimerge(self, X_in, y=None, confidenceVal=None, num_of_bins=None, col=None, bins=None):
|
||||
"""
|
||||
discretize a variable using ChiMerge
|
||||
|
||||
"""
|
||||
|
||||
X = X_in.copy(deep=True)
|
||||
|
||||
if bins is not None: # transform
|
||||
try:
|
||||
X[col+'_chimerge'] = pd.cut(X[col],bins=bins,include_lowest=True)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
else: # fit
|
||||
try:
|
||||
# create an array which save the num of 0/1 samples of the column to be chimerge
|
||||
total_num = X.groupby([col])[y].count()
|
||||
total_num = pd.DataFrame({'total_num': total_num})
|
||||
positive_class = X.groupby([col])[y].sum()
|
||||
positive_class = pd.DataFrame({'positive_class': positive_class})
|
||||
regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True,how='inner')
|
||||
regroup.reset_index(inplace=True)
|
||||
regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']
|
||||
regroup = regroup.drop('total_num', axis=1)
|
||||
np_regroup = np.array(regroup)
|
||||
# merge interval that have 0 pos/neg samples
|
||||
i = 0
|
||||
while (i <= np_regroup.shape[0] - 2):
|
||||
if ((np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0) or ( np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0)):
|
||||
np_regroup[i, 1] = np_regroup[i, 1] + np_regroup[i + 1, 1] # pos
|
||||
np_regroup[i, 2] = np_regroup[i, 2] + np_regroup[i + 1, 2] # neg
|
||||
np_regroup[i, 0] = np_regroup[i + 1, 0]
|
||||
np_regroup = np.delete(np_regroup, i + 1, 0)
|
||||
i = i - 1
|
||||
i = i + 1
|
||||
# calculate chi for neighboring intervals
|
||||
# ∑[(yA-yB)²/yB]
|
||||
chi_table = np.array([])
|
||||
for i in np.arange(np_regroup.shape[0] - 1):
|
||||
chi = (np_regroup[i, 1] * np_regroup[i + 1, 2] - np_regroup[i, 2] * np_regroup[i + 1, 1]) ** 2 \
|
||||
* (np_regroup[i, 1] + np_regroup[i, 2] + np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) / \
|
||||
((np_regroup[i, 1] + np_regroup[i, 2]) * (np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) * (
|
||||
np_regroup[i, 1] + np_regroup[i + 1, 1]) * (np_regroup[i, 2] + np_regroup[i + 1, 2]))
|
||||
chi_table = np.append(chi_table, chi)
|
||||
# merge intervals that have closing chi
|
||||
while (1):
|
||||
if (len(chi_table) <= (num_of_bins - 1) and min(chi_table) >= confidenceVal):
|
||||
break
|
||||
chi_min_index = np.argwhere(chi_table == min(chi_table))[0]
|
||||
np_regroup[chi_min_index, 1] = np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]
|
||||
np_regroup[chi_min_index, 2] = np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]
|
||||
np_regroup[chi_min_index, 0] = np_regroup[chi_min_index + 1, 0]
|
||||
np_regroup = np.delete(np_regroup, chi_min_index + 1, 0)
|
||||
|
||||
if (chi_min_index == np_regroup.shape[0] - 1):
|
||||
chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
|
||||
* (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
|
||||
((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
|
||||
chi_table = np.delete(chi_table, chi_min_index, axis=0)
|
||||
|
||||
else:
|
||||
chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
|
||||
* (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
|
||||
((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
|
||||
chi_table[chi_min_index] = (np_regroup[chi_min_index, 1] * np_regroup[chi_min_index + 1, 2] - np_regroup[chi_min_index, 2] * np_regroup[chi_min_index + 1, 1]) ** 2 \
|
||||
* (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) / \
|
||||
((np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]) * (np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]))
|
||||
chi_table = np.delete(chi_table, chi_min_index + 1, axis=0)
|
||||
result_data = pd.DataFrame()
|
||||
result_data['variable'] = [col] * np_regroup.shape[0]
|
||||
bins = []
|
||||
tmp = []
|
||||
for i in np.arange(np_regroup.shape[0]):
|
||||
if i == 0:
|
||||
y = '-inf' + ',' + str(np_regroup[i, 0])
|
||||
#x = np_regroup[i, 0]
|
||||
#list_temp.append(x)
|
||||
elif i == np_regroup.shape[0] - 1:
|
||||
y = str(np_regroup[i - 1, 0]) + '+'
|
||||
#x = 100000000.
|
||||
#list_temp.append(x)
|
||||
else:
|
||||
y = str(np_regroup[i - 1, 0]) + ',' + str(np_regroup[i, 0])
|
||||
#x = np_regroup[i, 0]
|
||||
#list_temp.append(x)
|
||||
bins.append(np_regroup[i - 1, 0])
|
||||
tmp.append(y)
|
||||
|
||||
#list_temp.append(df[variable].max()+0.1)
|
||||
bins.append(X[col].min()-0.1)
|
||||
|
||||
result_data['interval'] = tmp
|
||||
result_data['flag_0'] = np_regroup[:, 2]
|
||||
result_data['flag_1'] = np_regroup[:, 1]
|
||||
bins.sort(reverse=False)
|
||||
print('Interval for variable %s' % col)
|
||||
print(result_data)
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
return X, bins
|
||||
|
||||
|
||||
|
||||
|
||||
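A brief usage sketch for ChiMerge (not part of the original file). Note that chimerge() groups X by the target column, so y is passed as the target column name and must be present in X; the data path follows the demo notebooks.

# usage sketch, assuming the Titanic demo data used elsewhere in this repo
import pandas as pd
from feature_engineering.discretization import ChiMerge

data = pd.read_csv('./data/titanic.csv', usecols=['SibSp', 'Survived'])
enc = ChiMerge(col='SibSp', num_of_bins=4).fit(X=data, y='Survived')
data_binned = enc.transform(data)   # adds a 'SibSp_chimerge' interval column
print(enc.bins)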
# 2018.11.15 Created by Eamon.Zhang
class DiscretizeByDecisionTree():
    """
    Discretisation with Decision Trees consists of using a decision tree
    to identify the optimal splitting points that would determine the bins
    or contiguous intervals:

    1. train a decision tree of limited depth (2, 3 or 4) using the variable
       we want to discretise to predict the target.
    2. the original variable values are then replaced by the
       probability returned by the tree (written to a new column).

    Parameters
    ----------
    col: str
        column to discretise
    max_depth: int or list of int
        max depth of the tree. Can be an int, or a list of ints over which
        the optimal depth is searched by cross-validation.
    """

    def __init__(self, col=None, max_depth=None, tree_model=None):
        self.col = col
        self._dim = None
        self.max_depth = max_depth
        self.tree_model = tree_model

    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.
        """
        self._dim = X.shape[1]

        _, tree = self.discretize(
            X_in=X,
            y=y,
            max_depth=self.max_depth,
            col=self.col,
            tree_model=self.tree_model
        )
        self.tree_model = tree
        return self

    def transform(self, X):
        """Perform the transformation to new data.

        Will use the tree model and the column name to discretize the
        column.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        X : new dataframe with the discretized new column.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim))

        X, _ = self.discretize(
            X_in=X,
            col=self.col,
            tree_model=self.tree_model
        )
        return X

    def discretize(self, X_in, y=None, max_depth=None, tree_model=None, col=None):
        """
        Discretize a variable using a DecisionTreeClassifier.
        """
        X = X_in.copy(deep=True)

        if tree_model is not None:  # transform
            X[col + '_tree_discret'] = tree_model.predict_proba(X[col].to_frame())[:, 1]

        else:  # fit
            if isinstance(max_depth, int):
                tree_model = DecisionTreeClassifier(max_depth=max_depth)
                tree_model.fit(X[col].to_frame(), y)

            elif len(max_depth) > 1:
                score_ls = []      # here I will store the roc auc
                score_std_ls = []  # here I will store the standard deviation of the roc_auc
                for tree_depth in max_depth:
                    tree_model = DecisionTreeClassifier(max_depth=tree_depth)
                    scores = cross_val_score(tree_model, X[col].to_frame(), y, cv=3, scoring='roc_auc')
                    score_ls.append(np.mean(scores))
                    score_std_ls.append(np.std(scores))
                temp = pd.concat([pd.Series(max_depth), pd.Series(score_ls), pd.Series(score_std_ls)], axis=1)
                temp.columns = ['depth', 'roc_auc_mean', 'roc_auc_std']
                print('result ROC-AUC for each depth')
                print(temp)
                max_roc = temp.roc_auc_mean.max()
                # take the first depth that reaches the best mean ROC-AUC
                optimal_depth = temp[temp.roc_auc_mean == max_roc]['depth'].values[0]
                print('optimal_depth:', optimal_depth)
                tree_model = DecisionTreeClassifier(max_depth=optimal_depth)
                tree_model.fit(X[col].to_frame(), y)
            else:
                raise ValueError('max_depth of a tree must be an integer or a list')

        return X, tree_model
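A brief usage sketch for DiscretizeByDecisionTree (not part of the original file), assuming the Titanic demo data; passing a list of depths triggers the cross-validated depth search.

# usage sketch: search tree depths 2-4 for binning 'Fare' against 'Survived'
import pandas as pd
from feature_engineering.discretization import DiscretizeByDecisionTree

data = pd.read_csv('./data/titanic.csv', usecols=['Fare', 'Survived'])
enc = DiscretizeByDecisionTree(col='Fare', max_depth=[2, 3, 4]).fit(
    X=data[['Fare']], y=data['Survived'])
data_disc = enc.transform(data[['Fare']])   # adds 'Fare_tree_discret' (leaf probabilities)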
109
feature_engineering/encoding.py
Normal file
@@ -0,0 +1,109 @@
import pandas as pd


# 2018.11.28 Created by Eamon.Zhang

class MeanEncoding():
    """
    Replace each label of a categorical variable by the mean of the target
    for that label.

    Parameters
    ----------
    cols: list of str
        columns to encode
    """

    def __init__(self, mapping=None, cols=None):
        self.cols = cols
        self.mapping = mapping
        self._dim = None

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features. Must contain the
            target column, since the mapping is computed from X[y.name].
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.
        """
        self._dim = X.shape[1]

        _, categories = self.mean_encoding(
            X,
            y,
            mapping=self.mapping,
            cols=self.cols
        )
        self.mapping = categories
        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.

        Will use the mapping (if available) and the column list to encode the
        data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        X : Transformed values with encoding applied.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim))

        X, _ = self.mean_encoding(
            X,
            mapping=self.mapping,
            cols=self.cols
        )
        return X

    def mean_encoding(self, X_in, y=None, mapping=None, cols=None):
        """
        Replace the labels of the given columns by the mean of the target
        for each label.
        """
        X = X_in.copy(deep=True)

        if mapping is not None:  # transform
            mapping_out = mapping
            for i in mapping:
                column = i.get('col')  # get the column name
                X[column] = X[column].map(i['mapping'])
        else:  # fit
            mapping_out = []
            for col in cols:
                # mean of the target for each label of the column
                mapping = X[y.name].groupby(X[col]).mean().to_dict()
                mapping = pd.Series(mapping)
                mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype})

        return X, mapping_out
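A brief usage sketch for MeanEncoding (not part of the original file). Because mean_encoding() looks up X[y.name], the target column must be present in the frame passed to fit().

# usage sketch on the Titanic demo data
import pandas as pd
from feature_engineering.encoding import MeanEncoding

data = pd.read_csv('./data/titanic.csv', usecols=['Sex', 'Pclass', 'Survived'])
enc = MeanEncoding(cols=['Sex', 'Pclass']).fit(data, data['Survived'])
data_encoded = enc.transform(data)   # each label replaced by its mean survival rate
print(enc.mapping)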
73
feature_engineering/transformation.py
Normal file
@@ -0,0 +1,73 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pylab


# 2018.11.26 Created by Eamon.Zhang
def diagnostic_plots(df, variable):
    # function to plot a histogram and a Q-Q plot
    # side by side, for a certain variable

    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    df[variable].hist()

    plt.subplot(1, 2, 2)
    stats.probplot(df[variable], dist="norm", plot=pylab)

    plt.show()


def log_transform(data, cols=[]):
    """
    Logarithmic transformation: log(x + 1), so zero values are allowed.
    """
    data_copy = data.copy(deep=True)
    for i in cols:
        data_copy[i + '_log'] = np.log(data_copy[i] + 1)
        print('Variable ' + i + ' Q-Q plot')
        diagnostic_plots(data_copy, str(i + '_log'))
    return data_copy


def reciprocal_transform(data, cols=[]):
    """
    Reciprocal transformation: 1/x (undefined for zero values).
    """
    data_copy = data.copy(deep=True)
    for i in cols:
        data_copy[i + '_reciprocal'] = 1 / (data_copy[i])
        print('Variable ' + i + ' Q-Q plot')
        diagnostic_plots(data_copy, str(i + '_reciprocal'))
    return data_copy


def square_root_transform(data, cols=[]):
    """
    Square root transformation
    """
    data_copy = data.copy(deep=True)
    for i in cols:
        data_copy[i + '_square_root'] = (data_copy[i]) ** (0.5)
        print('Variable ' + i + ' Q-Q plot')
        diagnostic_plots(data_copy, str(i + '_square_root'))
    return data_copy


def exp_transform(data, coef, cols=[]):
    """
    Exponential/power transformation: x ** coef
    """
    data_copy = data.copy(deep=True)
    for i in cols:
        data_copy[i + '_exp'] = (data_copy[i]) ** coef
        print('Variable ' + i + ' Q-Q plot')
        diagnostic_plots(data_copy, str(i + '_exp'))
    return data_copy
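A brief usage sketch for these transformations (not part of the original file), applied to the right-skewed 'Fare' variable; the reciprocal transform is skipped here because 'Fare' contains zeros.

# usage sketch on the Titanic demo data
import pandas as pd
from feature_engineering import transformation as tr

data = pd.read_csv('./data/titanic.csv', usecols=['Fare'])
data = tr.log_transform(data, cols=['Fare'])             # adds 'Fare_log' and shows a Q-Q plot
data = tr.square_root_transform(data, cols=['Fare'])     # adds 'Fare_square_root'
data = tr.exp_transform(data, coef=0.3, cols=['Fare'])   # power transform x**0.3, adds 'Fare_exp'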
76
feature_selection/embedded_method.py
Normal file
@@ -0,0 +1,76 @@
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


# 2018.11.27 Created by Eamon.Zhang

def rf_importance(X_train, y_train, max_depth=10, class_weight=None, top_n=15, n_estimators=50, random_state=0):

    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                   random_state=random_state, class_weight=class_weight,
                                   n_jobs=-1)
    model.fit(X_train, y_train)
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    feat_labels = X_train.columns
    # inter-tree variability of the importance scores
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)

    print("Feature ranking:")
    for f in range(X_train.shape[1]):
        print("%d. feature no:%d feature name:%s (%f)" % (
            f + 1, indices[f], feat_labels[indices[f]], importances[indices[f]]))

    # plotting the top_n most important features
    indices = indices[0:top_n]
    plt.figure()
    plt.title("Feature importances top %d" % top_n)
    plt.bar(range(top_n), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(top_n), indices)
    plt.xlim([-1, top_n])
    plt.show()

    return model


def gbt_importance(X_train, y_train, max_depth=10, top_n=15, n_estimators=50, random_state=0):

    model = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                       random_state=random_state)
    model.fit(X_train, y_train)
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    feat_labels = X_train.columns
    # inter-tree variability of the importance scores
    std = np.std([tree[0].feature_importances_ for tree in model.estimators_], axis=0)

    print("Feature ranking:")
    for f in range(X_train.shape[1]):
        print("%d. feature no:%d feature name:%s (%f)" % (
            f + 1, indices[f], feat_labels[indices[f]], importances[indices[f]]))

    # plotting the top_n most important features
    indices = indices[0:top_n]
    plt.figure()
    plt.title("Feature importances top %d" % top_n)
    plt.bar(range(top_n), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(top_n), indices)
    plt.xlim([-1, top_n])
    plt.show()

    return model
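A brief usage sketch for the importance rankings above (not part of the original file); top_n is set to the number of available features so the bar plot stays consistent.

# usage sketch on a small numeric subset of the Titanic demo data
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_selection import embedded_method as em

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'SibSp', 'Fare', 'Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Survived', axis=1), data['Survived'], test_size=0.3, random_state=0)
rf_model = em.rf_importance(X_train, y_train, max_depth=4, top_n=3)
gbt_model = em.gbt_importance(X_train, y_train, max_depth=3, top_n=3)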
43
feature_selection/feature_shuffle.py
Normal file
@@ -0,0 +1,43 @@
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


# 2018.11.28 Created by Eamon.Zhang

def feature_shuffle_rf(X_train, y_train, max_depth=None, class_weight=None, top_n=15, n_estimators=50, random_state=0):

    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                   random_state=random_state, class_weight=class_weight,
                                   n_jobs=-1)
    model.fit(X_train, y_train)
    train_auc = roc_auc_score(y_train, (model.predict_proba(X_train))[:, 1])
    feature_dict = {}

    # selection logic: shuffle one feature at a time and record the drop in ROC-AUC
    for feature in X_train.columns:
        X_train_c = X_train.copy().reset_index(drop=True)
        y_train_c = y_train.copy().reset_index(drop=True)

        # shuffle individual feature
        X_train_c[feature] = X_train_c[feature].sample(frac=1, random_state=random_state).reset_index(drop=True)

        # make prediction with the shuffled feature and calculate roc-auc
        shuff_auc = roc_auc_score(y_train_c, (model.predict_proba(X_train_c))[:, 1])

        # save the drop in roc-auc
        feature_dict[feature] = (train_auc - shuff_auc)

    auc_drop = pd.Series(feature_dict).reset_index()
    auc_drop.columns = ['feature', 'auc_drop']
    auc_drop.sort_values(by=['auc_drop'], ascending=False, inplace=True)
    selected_features = auc_drop[auc_drop.auc_drop > 0]['feature']

    return auc_drop, selected_features
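A brief usage sketch for feature_shuffle_rf (not part of the original file), using a small numeric subset of the Titanic demo data.

import pandas as pd
from sklearn.model_selection import train_test_split
from feature_selection.feature_shuffle import feature_shuffle_rf

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'SibSp', 'Fare', 'Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Survived', axis=1), data['Survived'], test_size=0.3, random_state=0)
auc_drop, selected = feature_shuffle_rf(X_train, y_train, max_depth=4)
print(auc_drop)    # per-feature drop in train ROC-AUC after shuffling
print(selected)    # features whose shuffling hurts performance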
156
feature_selection/filter_method.py
Normal file
@@ -0,0 +1,156 @@
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error


# 2018.11.17 Created by Eamon.Zhang

def constant_feature_detect(data, threshold=0.98):
    """ detect features that show the same value for the
    majority/all of the observations (constant/quasi-constant features)

    Parameters
    ----------
    data : pd.Dataframe
    threshold : threshold to identify the variable as constant

    Returns
    -------
    list of variable names
    """
    data_copy = data.copy(deep=True)
    quasi_constant_feature = []
    for feature in data_copy.columns:
        # proportion of the most frequent value of the feature
        predominant = (data_copy[feature].value_counts() / float(
            len(data_copy))).sort_values(ascending=False).values[0]
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature), ' variables are found to be almost constant')
    return quasi_constant_feature


def corr_feature_detect(data, threshold=0.8):
    """ detect highly-correlated features of a Dataframe

    Parameters
    ----------
    data : pd.Dataframe
    threshold : threshold to identify a pair of variables as correlated

    Returns
    -------
    groups (DataFrames) of correlated variables
    """
    corrmat = data.corr()
    corrmat = corrmat.abs().unstack()  # absolute value of corr coef
    corrmat = corrmat.sort_values(ascending=False)
    corrmat = corrmat[corrmat >= threshold]
    corrmat = corrmat[corrmat < 1]  # remove the diagonal
    corrmat = pd.DataFrame(corrmat).reset_index()
    corrmat.columns = ['feature1', 'feature2', 'corr']

    grouped_feature_ls = []
    correlated_groups = []

    for feature in corrmat.feature1.unique():
        if feature not in grouped_feature_ls:

            # find all features correlated to a single feature
            correlated_block = corrmat[corrmat.feature1 == feature]
            grouped_feature_ls = grouped_feature_ls + list(
                correlated_block.feature2.unique()) + [feature]

            # append the block of features to the list
            correlated_groups.append(correlated_block)

    return correlated_groups
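A brief usage sketch for the two detectors above (not part of the original file); on this small numeric subset the returned lists may well be empty.

import pandas as pd
from feature_selection import filter_method as ft

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'SibSp', 'Fare', 'Survived'])
quasi_constant = ft.constant_feature_detect(data, threshold=0.98)
corr_groups = ft.corr_feature_detect(data.drop('Survived', axis=1), threshold=0.6)
for group in corr_groups:
    print(group)   # each group is a small DataFrame of correlated feature pairs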
def mutual_info(X, y, select_k=10):
    """
    Select features according to the mutual information between each feature
    and the target. select_k >= 1 keeps the top k features; 0 < select_k < 1
    keeps the top fraction of features.
    """
    if select_k >= 1:
        sel_ = SelectKBest(mutual_info_classif, k=select_k).fit(X, y)
        col = X.columns[sel_.get_support()]
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(mutual_info_classif, percentile=select_k * 100).fit(X, y)
        col = X.columns[sel_.get_support()]
    else:
        raise ValueError("select_k must be a positive number")

    return col


# 2018.11.27 edit Chi-square test
def chi_square_test(X, y, select_k=10):
    """
    Compute chi-squared stats between each non-negative feature and class.
    This score should be used to evaluate categorical variables in a classification task.
    """
    if select_k >= 1:
        sel_ = SelectKBest(chi2, k=select_k).fit(X, y)
        col = X.columns[sel_.get_support()]
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(chi2, percentile=select_k * 100).fit(X, y)
        col = X.columns[sel_.get_support()]
    else:
        raise ValueError("select_k must be a positive number")

    return col


def univariate_roc_auc(X_train, y_train, X_test, y_test, threshold):
    """
    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and the mentioned feature.
    Third, it ranks the features according to the machine learning metric (roc-auc).
    It selects the highest ranked features.
    """
    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].to_frame())
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    roc_values = pd.Series(roc_values)
    roc_values.index = X_train.columns
    print(roc_values.sort_values(ascending=False))
    print(len(roc_values[roc_values > threshold]), 'out of the %s features are kept' % len(X_train.columns))
    keep_col = roc_values[roc_values > threshold]
    return keep_col


def univariate_mse(X_train, y_train, X_test, y_test, threshold):
    """
    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and the mentioned feature.
    Third, it ranks the features according to the machine learning metric (mse).
    It keeps the features whose univariate MSE passes the given threshold.
    """
    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values)
    mse_values.index = X_train.columns
    print(mse_values.sort_values(ascending=False))
    print(len(mse_values[mse_values > threshold]), 'out of the %s features are kept' % len(X_train.columns))
    keep_col = mse_values[mse_values > threshold]
    return keep_col
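A brief usage sketch for the univariate filters above (not part of the original file); chi2 requires non-negative features, which holds for this subset.

import pandas as pd
from sklearn.model_selection import train_test_split
from feature_selection import filter_method as ft

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'SibSp', 'Fare', 'Survived'])
X = data.drop('Survived', axis=1)
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

mi_cols = ft.mutual_info(X_train, y_train, select_k=2)        # top-2 by mutual information
chi_cols = ft.chi_square_test(X_train, y_train, select_k=2)   # top-2 by chi-square score
roc_keep = ft.univariate_roc_auc(X_train, y_train, X_test, y_test, threshold=0.5)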
128
feature_selection/hybrid.py
Normal file
@@ -0,0 +1,128 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


# 2018.12.02 Created by Eamon.Zhang

def recursive_feature_elimination_rf(X_train, y_train, X_test, y_test,
                                     tol=0.001, max_depth=None,
                                     class_weight=None,
                                     top_n=15, n_estimators=50, random_state=0):

    features_to_remove = []
    count = 1
    # initial model using all the features
    model_all_features = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                                random_state=random_state, class_weight=class_weight,
                                                n_jobs=-1)
    model_all_features.fit(X_train, y_train)
    y_pred_test = model_all_features.predict_proba(X_test)[:, 1]
    auc_score_all = roc_auc_score(y_test, y_pred_test)

    for feature in X_train.columns:
        print()
        print('testing feature: ', feature, ' which is feature ', count,
              ' out of ', len(X_train.columns))
        count += 1
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                       random_state=random_state, class_weight=class_weight,
                                       n_jobs=-1)

        # fit model with all variables minus the already removed features
        # and minus the feature to be evaluated
        model.fit(X_train.drop(features_to_remove + [feature], axis=1), y_train)
        y_pred_test = model.predict_proba(
            X_test.drop(features_to_remove + [feature], axis=1))[:, 1]
        auc_score_int = roc_auc_score(y_test, y_pred_test)
        print('New Test ROC AUC={}'.format(auc_score_int))

        # print the roc-auc of the current reference model
        print('All features Test ROC AUC={}'.format(auc_score_all))

        # determine the drop in the roc-auc
        diff_auc = auc_score_all - auc_score_int

        # compare the drop in roc-auc with the tolerance
        if diff_auc >= tol:
            print('Drop in ROC AUC={}'.format(diff_auc))
            print('keep: ', feature)
        else:
            print('Drop in ROC AUC={}'.format(diff_auc))
            print('remove: ', feature)

            # if the drop in the roc is small and we remove the
            # feature, we need to set the new roc to the one based on
            # the remaining features
            auc_score_all = auc_score_int

            # and append the feature to remove to the list
            features_to_remove.append(feature)

    print('DONE!!')
    print('total features to remove: ', len(features_to_remove))
    features_to_keep = [x for x in X_train.columns if x not in features_to_remove]
    print('total features to keep: ', len(features_to_keep))

    return features_to_keep


def recursive_feature_addition_rf(X_train, y_train, X_test, y_test,
                                  tol=0.001, max_depth=None,
                                  class_weight=None,
                                  top_n=15, n_estimators=50, random_state=0):

    features_to_keep = [X_train.columns[0]]
    count = 1
    # initial model using only the first feature
    model_one_feature = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                               random_state=random_state, class_weight=class_weight,
                                               n_jobs=-1)
    model_one_feature.fit(X_train[[X_train.columns[0]]], y_train)
    y_pred_test = model_one_feature.predict_proba(X_test[[X_train.columns[0]]])[:, 1]
    auc_score_all = roc_auc_score(y_test, y_pred_test)

    for feature in X_train.columns[1:]:
        print()
        print('testing feature: ', feature, ' which is feature ', count,
              ' out of ', len(X_train.columns))
        count += 1
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                       random_state=random_state, class_weight=class_weight,
                                       n_jobs=-1)

        # fit model with the already selected features
        # plus the feature to be evaluated
        model.fit(X_train[features_to_keep + [feature]], y_train)
        y_pred_test = model.predict_proba(
            X_test[features_to_keep + [feature]])[:, 1]
        auc_score_int = roc_auc_score(y_test, y_pred_test)
        print('New Test ROC AUC={}'.format(auc_score_int))

        # print the roc-auc of the current reference model
        print('All features Test ROC AUC={}'.format(auc_score_all))

        # determine the increase in the roc-auc
        diff_auc = auc_score_int - auc_score_all

        # compare the increase in roc-auc with the tolerance
        if diff_auc >= tol:
            # if the increase in the roc is bigger than the threshold
            # we keep the feature and re-adjust the roc-auc to the new value
            # considering the added feature
            print('Increase in ROC AUC={}'.format(diff_auc))
            print('keep: ', feature)
            auc_score_all = auc_score_int
            features_to_keep.append(feature)
        else:
            print('Increase in ROC AUC={}'.format(diff_auc))
            print('remove: ', feature)

    print('DONE!!')
    print('total features to keep: ', len(features_to_keep))

    return features_to_keep
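A brief usage sketch for the two recursive procedures above (not part of the original file), run on a small numeric subset of the Titanic demo data.

import pandas as pd
from sklearn.model_selection import train_test_split
from feature_selection import hybrid

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'SibSp', 'Fare', 'Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Survived', axis=1), data['Survived'], test_size=0.3, random_state=0)

kept_backward = hybrid.recursive_feature_elimination_rf(
    X_train, y_train, X_test, y_test, tol=0.001, max_depth=4)
kept_forward = hybrid.recursive_feature_addition_rf(
    X_train, y_train, X_test, y_test, tol=0.001, max_depth=4)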
BIN
images/001.png
Normal file
After Width: | Height: | Size: 112 KiB |
BIN
images/IV.png
Normal file
After Width: | Height: | Size: 23 KiB |
BIN
images/box-cox.png
Normal file
After Width: | Height: | Size: 2.1 KiB |
BIN
images/embedded.png
Normal file
After Width: | Height: | Size: 26 KiB |
BIN
images/featuretools.png
Normal file
After Width: | Height: | Size: 9.8 KiB |
BIN
images/filter.png
Normal file
After Width: | Height: | Size: 19 KiB |
BIN
images/scaling.png
Normal file
After Width: | Height: | Size: 143 KiB |
BIN
images/sphx_glr_plot_map_data_to_normal_001.png
Normal file
After Width: | Height: | Size: 35 KiB |
BIN
images/workflow2.png
Normal file
After Width: | Height: | Size: 20 KiB |
BIN
images/wrapper.png
Normal file
After Width: | Height: | Size: 20 KiB |
BIN
output/Barplot_Pclass_Survived.png
Normal file
After Width: | Height: | Size: 10 KiB |
BIN
output/Boxplot_Pclass_Fare.png
Normal file
After Width: | Height: | Size: 11 KiB |
BIN
output/Corr_plot.png
Normal file
After Width: | Height: | Size: 26 KiB |
BIN
output/Countplot_Pclass.png
Normal file
After Width: | Height: | Size: 9.5 KiB |
BIN
output/Distplot_Fare.png
Normal file
After Width: | Height: | Size: 10 KiB |
BIN
output/Heatmap.png
Normal file
After Width: | Height: | Size: 76 KiB |
BIN
output/Scatter_plot_Fare_Pclass.png
Normal file
After Width: | Height: | Size: 16 KiB |
12
output/describe.csv
Normal file
@@ -0,0 +1,12 @@
,Survived,Pclass,Sex,Age,SibSp,Fare
count,891.0,891.0,891,714.0,891.0,891.0
unique,,,2,,,
top,,,male,,,
freq,,,577,,,
mean,0.3838383838383838,2.308641975308642,,29.69911764705882,0.5230078563411896,32.2042079685746
std,0.4865924542648585,0.8360712409770513,,14.526497332334044,1.1027434322934275,49.693428597180905
min,0.0,1.0,,0.42,0.0,0.0
25%,0.0,2.0,,20.125,0.0,7.9104
50%,0.0,3.0,,28.0,0.0,14.4542
75%,1.0,3.0,,38.0,1.0,31.0
max,1.0,3.0,,80.0,8.0,512.3292
7
output/missing.csv
Normal file
@@ -0,0 +1,7 @@
,total missing,proportion
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
Age,177,0.19865319865319866
SibSp,0,0.0
Fare,0,0.0