2018.12.2 First commit.

.gitignore (vendored, new file, +6)
@@ -0,0 +1,6 @@
rule_extraction 20181014.py
__pycache__
.ipynb_checkpoints
.gitignore.bak
history
README_bk.md

1_Demo_Data_Explore.ipynb (new file, +656)
2.1_Demo_Missing_Data.ipynb (new file, +1109)
2.2_Demo_Outlier.ipynb (new file, +1582)

2.3_Demo_Rare_Values.ipynb (new file, +271)
@@ -0,0 +1,271 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"from feature_cleaning import rare_values as ra"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Variable Pclass label proportion:\n",
|
||||
"3 0.551066\n",
|
||||
"1 0.242424\n",
|
||||
"2 0.206510\n",
|
||||
"Name: Pclass, dtype: float64\n",
|
||||
"Variable SibSp label proportion:\n",
|
||||
"0 0.682379\n",
|
||||
"1 0.234568\n",
|
||||
"2 0.031425\n",
|
||||
"4 0.020202\n",
|
||||
"3 0.017957\n",
|
||||
"8 0.007856\n",
|
||||
"5 0.005612\n",
|
||||
"Name: SibSp, dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"use_cols = [\n",
|
||||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||||
" 'Survived'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"# see column Pclass & SibSp's distributions\n",
|
||||
"# SibSp has values 3/8/5 that occur rarely, under 2%\n",
|
||||
"# Pclass has 3 values, but no one is under 20%\n",
|
||||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
|
||||
"for i in ['Pclass','SibSp']:\n",
|
||||
" print('Variable',i,'label proportion:')\n",
|
||||
" print(data[i].value_counts()/len(data))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Grouping into one new category\n",
|
||||
"Grouping the observations that show rare labels into a unique category ('rare')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create the encoder and fit with our data\n",
|
||||
"enc = ra.GroupingRareValues(cols=['Pclass','SibSp'],threshold=0.01).fit(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[{'col': 'Pclass', 'mapping': 3 3\n",
|
||||
"1 1\n",
|
||||
"2 2\n",
|
||||
"dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0 0\n",
|
||||
"1 1\n",
|
||||
"2 2\n",
|
||||
"4 4\n",
|
||||
"3 3\n",
|
||||
"8 rare\n",
|
||||
"5 rare\n",
|
||||
"dtype: object, 'data_type': dtype('int64')}]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# let's see the mapping\n",
|
||||
"# for SibSp, values 5 & 8 are encoded as 'rare' as they appear less than 10%\n",
|
||||
"# for Pclass, nothing changed\n",
|
||||
"print(enc.mapping)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# perform transformation\n",
|
||||
"data2 = enc.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 608\n",
|
||||
"1 209\n",
|
||||
"2 28\n",
|
||||
"4 18\n",
|
||||
"3 16\n",
|
||||
"rare 12\n",
|
||||
"Name: SibSp, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the result\n",
|
||||
"print(data2.SibSp.value_counts())"
|
||||
]
|
||||
},
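  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`GroupingRareValues` lives in this repo's `feature_cleaning/rare_values.py`, which is not shown in this commit. The next cell is a rough pandas-only sketch of the same idea, added for illustration; the helper name and details are assumptions, not the module's actual code.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch (assumption): group labels whose frequency is below a threshold into 'rare'\n",
    "def group_rare_labels(df, cols, threshold=0.01):\n",
    "    df = df.copy()\n",
    "    for col in cols:\n",
    "        freq = df[col].value_counts(normalize=True)\n",
    "        rare_labels = freq[freq < threshold].index\n",
    "        df[col] = df[col].where(~df[col].isin(rare_labels), 'rare')\n",
    "    return df\n",
    "\n",
    "# data2_sketch = group_rare_labels(data, cols=['Pclass','SibSp'], threshold=0.01)\n",
    "# data2_sketch.SibSp.value_counts()"
   ]
  },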
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Mode Imputation\n",
|
||||
"Replacing the rare label by most frequent label"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create the encoder and fit with our data\n",
|
||||
"enc = ra.ModeImputation(cols=['Pclass','SibSp'],threshold=0.01).fit(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[{'col': 'Pclass', 'mapping': 3 3\n",
|
||||
"1 1\n",
|
||||
"2 2\n",
|
||||
"dtype: int64, 'data_type': dtype('int64')}, {'col': 'SibSp', 'mapping': 0 0\n",
|
||||
"1 1\n",
|
||||
"2 2\n",
|
||||
"4 4\n",
|
||||
"3 3\n",
|
||||
"8 0\n",
|
||||
"5 0\n",
|
||||
"dtype: int64, 'data_type': dtype('int64')}]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# let's see the mapping\n",
|
||||
"# for SibSp, values 5 & 8 are encoded as 0, as label 0 is the most frequent label\n",
|
||||
"# for Pclass, nothing changed\n",
|
||||
"print(enc.mapping)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# perform transformation\n",
|
||||
"data3 = enc.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 620\n",
|
||||
"1 209\n",
|
||||
"2 28\n",
|
||||
"4 18\n",
|
||||
"3 16\n",
|
||||
"Name: SibSp, dtype: int64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the result\n",
|
||||
"print(data3.SibSp.value_counts())"
|
||||
]
|
||||
},
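  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`ModeImputation` is part of the same local module. A pandas-only sketch of the assumed behaviour, for illustration only (not the module's code):\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch (assumption): replace rare labels with the most frequent label of each column\n",
    "def impute_rare_with_mode(df, cols, threshold=0.01):\n",
    "    df = df.copy()\n",
    "    for col in cols:\n",
    "        freq = df[col].value_counts(normalize=True)\n",
    "        rare_labels = freq[freq < threshold].index\n",
    "        df[col] = df[col].where(~df[col].isin(rare_labels), df[col].mode()[0])\n",
    "    return df\n",
    "\n",
    "# data3_sketch = impute_rare_with_mode(data, cols=['Pclass','SibSp'], threshold=0.01)"
   ]
  },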
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}

3.1_Demo_Feature_Scaling.ipynb (new file, +326)
@@ -0,0 +1,326 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"#from feature_cleaning import rare_values as ra"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"use_cols = [\n",
|
||||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||||
" 'Survived'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 male 22.0 1 7.2500\n",
|
||||
"1 1 1 female 38.0 1 71.2833\n",
|
||||
"2 1 3 female 26.0 0 7.9250"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((623, 6), (268, 6))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Note that we include target variable in the X_train \n",
|
||||
"# because we need it to supervise our discretization\n",
|
||||
"# this is not the standard way of using train-test-split\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Normalization - Standardization (Z-score scaling)\n",
|
||||
"\n",
|
||||
"removes the mean and scales the data to unit variance.<br />z = (X - X.mean) / std"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_zscore\n",
|
||||
"857 1 1 male 51.0 0 26.5500 -0.122530\n",
|
||||
"52 1 1 female 49.0 1 76.7292 0.918124\n",
|
||||
"386 0 3 male 1.0 5 46.9000 0.299503\n",
|
||||
"124 0 1 male 54.0 0 77.2875 0.929702\n",
|
||||
"578 0 3 female NaN 1 14.4583 -0.373297\n",
|
||||
"549 1 2 male 8.0 1 36.7500 0.089005\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new created feature\n",
|
||||
"from sklearn.preprocessing import StandardScaler\n",
|
||||
"ss = StandardScaler().fit(X_train[['Fare']])\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_zscore'] = ss.transform(X_train_copy[['Fare']])\n",
|
||||
"print(X_train_copy.head(6))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"5.916437306188636e-17\n",
|
||||
"1.0008035356861\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check if it is with mean=0 std=1\n",
|
||||
"print(X_train_copy['Fare_zscore'].mean())\n",
|
||||
"print(X_train_copy['Fare_zscore'].std())\n"
|
||||
]
|
||||
},
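  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check of the formula z = (X - X.mean) / std, the same values can be computed by hand with pandas (StandardScaler uses the population standard deviation, i.e. ddof=0):\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# compute the z-score manually and compare with the StandardScaler output\n",
    "manual_z = (X_train['Fare'] - X_train['Fare'].mean()) / X_train['Fare'].std(ddof=0)\n",
    "(manual_z - X_train_copy['Fare_zscore']).abs().max()"
   ]
  },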
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Min-Max scaling\n",
|
||||
"transforms features by scaling each feature to a given range. Default to [0,1].<br />X_scaled = (X - X.min / (X.max - X.min)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_minmax\n",
|
||||
"857 1 1 male 51.0 0 26.5500 0.051822\n",
|
||||
"52 1 1 female 49.0 1 76.7292 0.149765\n",
|
||||
"386 0 3 male 1.0 5 46.9000 0.091543\n",
|
||||
"124 0 1 male 54.0 0 77.2875 0.150855\n",
|
||||
"578 0 3 female NaN 1 14.4583 0.028221\n",
|
||||
"549 1 2 male 8.0 1 36.7500 0.071731\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new created feature\n",
|
||||
"from sklearn.preprocessing import MinMaxScaler\n",
|
||||
"mms = MinMaxScaler().fit(X_train[['Fare']])\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_minmax'] = mms.transform(X_train_copy[['Fare']])\n",
|
||||
"print(X_train_copy.head(6))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1.0\n",
|
||||
"0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the range of Fare_minmax\n",
|
||||
"print(X_train_copy['Fare_minmax'].max())\n",
|
||||
"print(X_train_copy['Fare_minmax'].min())"
|
||||
]
|
||||
},
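  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Likewise, the min-max formula X_scaled = (X - X.min) / (X.max - X.min) can be written out directly as a cross-check of MinMaxScaler:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# compute the min-max scaling manually and compare with the MinMaxScaler output\n",
    "fare_min, fare_max = X_train['Fare'].min(), X_train['Fare'].max()\n",
    "manual_minmax = (X_train['Fare'] - fare_min) / (fare_max - fare_min)\n",
    "(manual_minmax - X_train_copy['Fare_minmax']).abs().max()"
   ]
  },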
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"source": [
|
||||
"## Robust scaling\n",
|
||||
"removes the median and scales the data according to the quantile range (defaults to IQR)<br />X_scaled = (X - X.median) / IQR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_robust\n",
|
||||
"857 1 1 male 51.0 0 26.5500 0.492275\n",
|
||||
"52 1 1 female 49.0 1 76.7292 2.630973\n",
|
||||
"386 0 3 male 1.0 5 46.9000 1.359616\n",
|
||||
"124 0 1 male 54.0 0 77.2875 2.654768\n",
|
||||
"578 0 3 female NaN 1 14.4583 -0.023088\n",
|
||||
"549 1 2 male 8.0 1 36.7500 0.927011\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new created feature\n",
|
||||
"from sklearn.preprocessing import RobustScaler\n",
|
||||
"rs = RobustScaler().fit(X_train[['Fare']])\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_robust'] = rs.transform(X_train_copy[['Fare']])\n",
|
||||
"print(X_train_copy.head(6))"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}

3.2_Demo_Discretisation.ipynb (new file, +865)
@@ -0,0 +1,865 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from feature_engineering import discretization as dc\n",
|
||||
"\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"#from feature_cleaning import rare_values as ra"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"use_cols = [\n",
|
||||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||||
" 'Survived'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 male 22.0 1 7.2500\n",
|
||||
"1 1 1 female 38.0 1 71.2833\n",
|
||||
"2 1 3 female 26.0 0 7.9250"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((623, 6), (268, 6))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Note that we include target variable in the X_train \n",
|
||||
"# because we need it to supervise our discretization\n",
|
||||
"# this is not the standard way of using train-test-split\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Equal width binning\n",
|
||||
"divides the scope of possible values into N bins of the same width"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.preprocessing import KBinsDiscretizer\n",
|
||||
"enc_equal_width = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='uniform').fit(X_train[['Fare']])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([array([ 0. , 170.7764, 341.5528, 512.3292])], dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# equal width for every bins\n",
|
||||
"enc_equal_width.bin_edges_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.0 610\n",
|
||||
"1.0 11\n",
|
||||
"2.0 2\n",
|
||||
"Name: 0, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result = enc_equal_width.transform(X_train[['Fare']])\n",
|
||||
"pd.DataFrame(result)[0].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_equal_width\n",
|
||||
"857 1 1 male 51.0 0 26.5500 0.0\n",
|
||||
"52 1 1 female 49.0 1 76.7292 0.0\n",
|
||||
"386 0 3 male 1.0 5 46.9000 0.0\n",
|
||||
"124 0 1 male 54.0 0 77.2875 0.0\n",
|
||||
"578 0 3 female NaN 1 14.4583 0.0\n",
|
||||
"549 1 2 male 8.0 1 36.7500 0.0\n",
|
||||
"118 0 1 male 24.0 0 247.5208 1.0\n",
|
||||
"12 0 3 male 20.0 0 8.0500 0.0\n",
|
||||
"157 0 3 male 30.0 0 8.0500 0.0\n",
|
||||
"127 1 3 male 24.0 0 7.1417 0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new discretized variable\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_equal_width'] = enc_equal_width.transform(X_train[['Fare']])\n",
|
||||
"print(X_train_copy.head(10))"
|
||||
]
|
||||
},
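  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For comparison only: `pd.cut` builds the same kind of equal-width bins directly from the min and max of the column.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# equal-width binning with plain pandas; the bin ids should line up with the KBinsDiscretizer result\n",
    "fare_equal_width_pdcut = pd.cut(X_train['Fare'], bins=3, labels=False)\n",
    "fare_equal_width_pdcut.value_counts()"
   ]
  },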
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Equal frequency binning\n",
|
||||
"divides the scope of possible values of the variable into N bins, \n",
|
||||
"where each bin carries the same amount of observations"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"enc_equal_freq = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='quantile').fit(X_train[['Fare']])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([array([ 0. , 8.69303333, 26.2875 , 512.3292 ])],\n",
|
||||
" dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the bin edges\n",
|
||||
"enc_equal_freq.bin_edges_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"2.0 209\n",
|
||||
"0.0 208\n",
|
||||
"1.0 206\n",
|
||||
"Name: 0, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# equal number of case for every bins\n",
|
||||
"result = enc_equal_freq.transform(X_train[['Fare']])\n",
|
||||
"pd.DataFrame(result)[0].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_equal_freq\n",
|
||||
"857 1 1 male 51.0 0 26.5500 2.0\n",
|
||||
"52 1 1 female 49.0 1 76.7292 2.0\n",
|
||||
"386 0 3 male 1.0 5 46.9000 2.0\n",
|
||||
"124 0 1 male 54.0 0 77.2875 2.0\n",
|
||||
"578 0 3 female NaN 1 14.4583 1.0\n",
|
||||
"549 1 2 male 8.0 1 36.7500 2.0\n",
|
||||
"118 0 1 male 24.0 0 247.5208 2.0\n",
|
||||
"12 0 3 male 20.0 0 8.0500 0.0\n",
|
||||
"157 0 3 male 30.0 0 8.0500 0.0\n",
|
||||
"127 1 3 male 24.0 0 7.1417 0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new discretized variable\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_equal_freq'] = enc_equal_freq.transform(X_train[['Fare']])\n",
|
||||
"print(X_train_copy.head(10))"
|
||||
]
|
||||
},
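  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For comparison only: `pd.qcut` is the plain-pandas way to get quantile (equal-frequency) bins.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# quantile binning with plain pandas; each bin holds roughly the same number of rows\n",
    "fare_equal_freq_qcut = pd.qcut(X_train['Fare'], q=3, labels=False)\n",
    "fare_equal_freq_qcut.value_counts()"
   ]
  },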
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## K-means binning\n",
|
||||
"using k-means to partition values into clusters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"enc_kmeans = KBinsDiscretizer(n_bins=3,encode='ordinal',strategy='kmeans').fit(X_train[['Fare']])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"array([array([ 0. , 93.5271531 , 338.08506324, 512.3292 ])],\n",
|
||||
" dtype=object)"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the bin edges\n",
|
||||
"enc_kmeans.bin_edges_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0.0 587\n",
|
||||
"1.0 34\n",
|
||||
"2.0 2\n",
|
||||
"Name: 0, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 15,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"result = enc_kmeans.transform(X_train[['Fare']])\n",
|
||||
"pd.DataFrame(result)[0].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_kmeans\n",
|
||||
"857 1 1 male 51.0 0 26.5500 0.0\n",
|
||||
"52 1 1 female 49.0 1 76.7292 0.0\n",
|
||||
"386 0 3 male 1.0 5 46.9000 0.0\n",
|
||||
"124 0 1 male 54.0 0 77.2875 0.0\n",
|
||||
"578 0 3 female NaN 1 14.4583 0.0\n",
|
||||
"549 1 2 male 8.0 1 36.7500 0.0\n",
|
||||
"118 0 1 male 24.0 0 247.5208 1.0\n",
|
||||
"12 0 3 male 20.0 0 8.0500 0.0\n",
|
||||
"157 0 3 male 30.0 0 8.0500 0.0\n",
|
||||
"127 1 3 male 24.0 0 7.1417 0.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# add the new discretized variable\n",
|
||||
"X_train_copy = X_train.copy(deep=True)\n",
|
||||
"X_train_copy['Fare_kmeans'] = enc_kmeans.transform(X_train[['Fare']])\n",
|
||||
"print(X_train_copy.head(10))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Discretisation with Decision Tree\n",
|
||||
"using a decision tree to identify the optimal splitting points that would determine the bins"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"enc1 = dc.DiscretizeByDecisionTree(col='Fare',max_depth=2).fit(X=X_train,y=y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,\n",
|
||||
" max_features=None, max_leaf_nodes=None,\n",
|
||||
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
|
||||
" min_samples_leaf=1, min_samples_split=2,\n",
|
||||
" min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n",
|
||||
" splitter='best')"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"enc1.tree_model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data1 = enc1.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 20,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_tree_discret\n",
|
||||
"0 0 3 male 22.0 1 7.2500 0.107143\n",
|
||||
"1 1 1 female 38.0 1 71.2833 0.442308\n",
|
||||
"2 1 3 female 26.0 0 7.9250 0.255319\n",
|
||||
"3 1 1 female 35.0 1 53.1000 0.442308\n",
|
||||
"4 0 3 male 35.0 0 8.0500 0.255319\n",
|
||||
"[0.10714286 0.44230769 0.25531915 0.74626866]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# see how the new column Fare_tree_discret is distributed\n",
|
||||
"# the values are corresponding to the proba of the prediction by the tree\n",
|
||||
"print(data1.head(5))\n",
|
||||
"\n",
|
||||
"# the unique value of the discretisized column\n",
|
||||
"print(data1.Fare_tree_discret.unique())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 21,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Fare Fare\n",
|
||||
"Fare_tree_discret \n",
|
||||
"0.107143 0.0000 7.5208\n",
|
||||
"0.255319 7.5500 10.5167\n",
|
||||
"0.442308 11.1333 73.5000\n",
|
||||
"0.746269 75.2500 512.3292\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# see how the bins are cut\n",
|
||||
"# because we use a tree with max-depth of 2, we have at most 2*2=4 bins generated by the tree\n",
|
||||
"col='Fare'\n",
|
||||
"bins = pd.concat([data1.groupby([col+'_tree_discret'])[col].min(),\n",
|
||||
" data1.groupby([col+'_tree_discret'])[col].max()], axis=1)\n",
|
||||
"print(bins)\n",
|
||||
"\n",
|
||||
"# all values between 0 to 7.5208 in the original variable 'Fare' \n",
|
||||
"# are given new value 0.107143 in the new column 'Fare_tree_discret'\n",
|
||||
"# and so on"
|
||||
]
|
||||
},
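  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`DiscretizeByDecisionTree` comes from this repo's `feature_engineering/discretization.py`, which is not shown in this commit. A minimal sketch of the underlying idea (an assumption about its behaviour, not the module's code): fit a shallow tree on the single feature and use the predicted probability of each leaf as the discretised value.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch (assumption): tree-based discretisation of a single numeric column\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "\n",
    "def tree_discretise(train_col, train_target, apply_col, max_depth=2):\n",
    "    tree = DecisionTreeClassifier(max_depth=max_depth)\n",
    "    tree.fit(train_col.to_frame(), train_target)\n",
    "    # every leaf maps to one probability value, so the probabilities act as bin labels\n",
    "    return tree.predict_proba(apply_col.to_frame())[:, 1]\n",
    "\n",
    "# data1_sketch = data.copy()\n",
    "# data1_sketch['Fare_tree_discret'] = tree_discretise(X_train['Fare'], y_train, data['Fare'])"
   ]
  },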
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Discretisation with Decision Tree with optimal depth search"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"result ROC-AUC for each depth\n",
|
||||
" depth roc_auc_mean roc_auc_std\n",
|
||||
"0 2 0.662132 0.026253\n",
|
||||
"1 3 0.647950 0.045010\n",
|
||||
"2 4 0.650984 0.035127\n",
|
||||
"3 5 0.651180 0.027663\n",
|
||||
"4 6 0.653961 0.037421\n",
|
||||
"5 7 0.643688 0.033513\n",
|
||||
"optimal_depth: [2]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# search for the best depth from range 2-7\n",
|
||||
"# we see when depth=2 we get the best roc-auc mean\n",
|
||||
"enc2 = dc.DiscretizeByDecisionTree(col='Fare',max_depth=[2,3,4,5,6,7]).fit(X=X_train,y=y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"DecisionTreeClassifier(class_weight=None, criterion='gini',\n",
|
||||
" max_depth=array([2], dtype=int64), max_features=None,\n",
|
||||
" max_leaf_nodes=None, min_impurity_decrease=0.0,\n",
|
||||
" min_impurity_split=None, min_samples_leaf=1,\n",
|
||||
" min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
|
||||
" presort=False, random_state=None, splitter='best')"
|
||||
]
|
||||
},
|
||||
"execution_count": 23,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# using optimal depth=2 we train the model, same result as last one\n",
|
||||
"enc2.tree_model"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" <th>Fare_tree_discret</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" <td>0.107143</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" <td>0.442308</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" <td>0.255319</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>53.1000</td>\n",
|
||||
" <td>0.442308</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>8.0500</td>\n",
|
||||
" <td>0.255319</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_tree_discret\n",
|
||||
"0 0 3 male 22.0 1 7.2500 0.107143\n",
|
||||
"1 1 1 female 38.0 1 71.2833 0.442308\n",
|
||||
"2 1 3 female 26.0 0 7.9250 0.255319\n",
|
||||
"3 1 1 female 35.0 1 53.1000 0.442308\n",
|
||||
"4 0 3 male 35.0 0 8.0500 0.255319"
|
||||
]
|
||||
},
|
||||
"execution_count": 24,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data2 = enc2.transform(data)\n",
|
||||
"data2.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"source": [
|
||||
"## Discretisation with ChiMerge\n",
|
||||
"supervised hierarchical bottom-up (merge) method that locally exploits the chi-square criterion to decide whether two adjacent intervals are similar enough to be merged"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 25,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Interval for variable Fare\n",
|
||||
" variable interval flag_0 flag_1\n",
|
||||
"0 Fare -inf,7.875 94.0 28.0\n",
|
||||
"1 Fare 7.875,7.8792 0.0 3.0\n",
|
||||
"2 Fare 7.8792,7.8958 25.0 1.0\n",
|
||||
"3 Fare 7.8958,73.5 245.0 160.0\n",
|
||||
"4 Fare 73.5+ 17.0 50.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"enc3 = dc.ChiMerge(col='Fare',num_of_bins=5).fit(X=X_train,y='Survived')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[-0.1, 7.875, 7.8792, 7.8958, 73.5, 512.3292]"
|
||||
]
|
||||
},
|
||||
"execution_count": 26,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# the bins boundary created by ChiMerge\n",
|
||||
"\n",
|
||||
"enc3.bins"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 27,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data3 = enc3.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 28,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare Fare_chimerge\n",
|
||||
"0 0 3 male 22.0 1 7.2500 (-0.101, 7.875]\n",
|
||||
"1 1 1 female 38.0 1 71.2833 (7.896, 73.5]\n",
|
||||
"2 1 3 female 26.0 0 7.9250 (7.896, 73.5]\n",
|
||||
"3 1 1 female 35.0 1 53.1000 (7.896, 73.5]\n",
|
||||
"4 0 3 male 35.0 0 8.0500 (7.896, 73.5]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"print(data3.head(5))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[(-0.101, 7.875], (7.896, 73.5], (73.5, 512.329], (7.875, 7.879], (7.879, 7.896]]\n",
|
||||
"Categories (5, interval[float64]): [(-0.101, 7.875] < (7.875, 7.879] < (7.879, 7.896] < (7.896, 73.5] < (73.5, 512.329]]"
|
||||
]
|
||||
},
|
||||
"execution_count": 29,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# all values are grouped into 5 intervals\n",
|
||||
"data3.Fare_chimerge.unique()"
|
||||
]
|
||||
},
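  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A compact sketch of the ChiMerge loop described above, added for illustration: start with one interval per distinct value and repeatedly merge the pair of adjacent intervals with the lowest chi-square statistic. The repo's `dc.ChiMerge` may differ in detail (for example it could stop on a chi-square threshold rather than a fixed number of bins).\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch (assumption): bottom-up ChiMerge with a fixed target number of bins\n",
    "def chi2_stat(table):\n",
    "    # chi-square statistic of an observed-count table; empty expected cells contribute 0\n",
    "    table = np.asarray(table, dtype=float)\n",
    "    expected = table.sum(axis=1, keepdims=True) * table.sum(axis=0, keepdims=True) / table.sum()\n",
    "    return np.where(expected > 0, (table - expected) ** 2 / np.where(expected > 0, expected, 1), 0).sum()\n",
    "\n",
    "def chimerge_edges(x, y, num_of_bins=5):\n",
    "    df = pd.DataFrame({'x': x, 'y': y}).dropna()\n",
    "    edges = sorted(df['x'].unique())  # left edge of every current interval\n",
    "    while len(edges) > num_of_bins:\n",
    "        interval = pd.cut(df['x'], bins=edges + [np.inf], right=False, labels=False)\n",
    "        counts = pd.crosstab(interval, df['y']).values\n",
    "        chi2_vals = [chi2_stat(counts[i:i + 2]) for i in range(len(edges) - 1)]\n",
    "        edges.pop(int(np.argmin(chi2_vals)) + 1)  # merge the most similar adjacent pair\n",
    "    return edges\n",
    "\n",
    "# chimerge_edges(X_train['Fare'], X_train['Survived'], num_of_bins=5)"
   ]
  },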
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}

3.3_Demo_Feature_Encoding.ipynb (new file, +688)
@@ -0,0 +1,688 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"\n",
|
||||
"import category_encoders as ce\n",
|
||||
"from feature_engineering import encoding\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>53.1000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>8.0500</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 male 22.0 1 7.2500\n",
|
||||
"1 1 1 female 38.0 1 71.2833\n",
|
||||
"2 1 3 female 26.0 0 7.9250\n",
|
||||
"3 1 1 female 35.0 1 53.1000\n",
|
||||
"4 0 3 male 35.0 0 8.0500"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"use_cols = [\n",
|
||||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||||
" 'Survived'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n",
|
||||
"data.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((623, 6), (268, 6))"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## One-hot encoding\n",
|
||||
"replace the categorical variable by different boolean variables (0/1) to indicate whether or not certain label is true for that observation"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data1 = pd.get_dummies(data,drop_first=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" <th>Sex_male</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>53.1000</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>8.0500</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Age SibSp Fare Sex_male\n",
|
||||
"0 0 3 22.0 1 7.2500 1\n",
|
||||
"1 1 1 38.0 1 71.2833 0\n",
|
||||
"2 1 3 26.0 0 7.9250 0\n",
|
||||
"3 1 1 35.0 1 53.1000 0\n",
|
||||
"4 0 3 35.0 0 8.0500 1"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data1.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Ordinal-encoding\n",
|
||||
"replace the labels by some ordinal number if ordinal is meaningful"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ord_enc = ce.OrdinalEncoder(cols=['Sex']).fit(X_train,y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 1 22.0 1 7.2500\n",
|
||||
"1 1 1 2 38.0 1 71.2833\n",
|
||||
"2 1 3 2 26.0 0 7.9250\n",
|
||||
"3 1 1 2 35.0 1 53.1000\n",
|
||||
"4 0 3 1 35.0 0 8.0500\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data4 = ord_enc.transform(data)\n",
|
||||
"print(data4.head(5))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Mean encoding\n",
|
||||
"replace the label by the mean of the target for that label. \n",
|
||||
"(the target must be 0/1 valued or continuous)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Sex\n",
|
||||
"female 0.753488\n",
|
||||
"male 0.196078\n",
|
||||
"Name: Survived, dtype: float64"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# cross check-- the mean of target group by Sex\n",
|
||||
"X_train['Survived'].groupby(data['Sex']).mean()\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"mean_enc = encoding.MeanEncoding(cols=['Sex']).fit(X_train,y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 0.196078 22.0 1 7.2500\n",
|
||||
"1 1 1 0.753488 38.0 1 71.2833\n",
|
||||
"2 1 3 0.753488 26.0 0 7.9250\n",
|
||||
"3 1 1 0.753488 35.0 1 53.1000\n",
|
||||
"4 0 3 0.196078 35.0 0 8.0500\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data6 = mean_enc.transform(data)\n",
|
||||
"print(data6.head(5))"
|
||||
]
|
||||
},
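  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`encoding.MeanEncoding` is another project-local helper (not shown in this commit). The core of mean encoding is a single groupby; a minimal sketch of the assumed behaviour:\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: map each label of Sex to the training-set mean of the target\n",
    "mean_map = X_train.groupby('Sex')['Survived'].mean()\n",
    "data6_sketch = data.copy()\n",
    "data6_sketch['Sex'] = data6_sketch['Sex'].map(mean_map)\n",
    "data6_sketch.head(5)"
   ]
  },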
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Target-encoding\n",
|
||||
"Similar to mean encoding, but use both posterior probability and prior probability of the target"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create the encoder and fit with our data\n",
|
||||
"target_enc = ce.TargetEncoder(cols=['Sex']).fit(X_train,y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# perform transformation\n",
|
||||
"# data.Survived.groupby(data['Sex']).agg(['mean'])\n",
|
||||
"data2 = target_enc.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>0.196078</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0.753488</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>0.753488</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>0.753488</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>53.1000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>0.196078</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>8.0500</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 0.196078 22.0 1 7.2500\n",
|
||||
"1 1 1 0.753488 38.0 1 71.2833\n",
|
||||
"2 1 3 0.753488 26.0 0 7.9250\n",
|
||||
"3 1 1 0.753488 35.0 1 53.1000\n",
|
||||
"4 0 3 0.196078 35.0 0 8.0500"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# check the result\n",
|
||||
"data2.head()"
|
||||
]
|
||||
},
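  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The blend of posterior and prior is usually done with a smoothing weight. The cell below shows one common additive-smoothing scheme for illustration only; `category_encoders.TargetEncoder` uses its own smoothing formula, so the exact numbers may differ.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: additive smoothing between the per-label mean (posterior) and the global mean (prior)\n",
    "prior = y_train.mean()\n",
    "stats = X_train.groupby('Sex')['Survived'].agg(['mean', 'count'])\n",
    "m = 10  # hypothetical smoothing weight, chosen only for illustration\n",
    "(stats['count'] * stats['mean'] + m * prior) / (stats['count'] + m)"
   ]
  },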
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## WOE-encoding\n",
|
||||
"replace the label with Weight of Evidence of each label. WOE is computed from the basic odds ratio: \n",
|
||||
"\n",
|
||||
"ln( (Proportion of Good Outcomes) / (Proportion of Bad Outcomes))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"woe_enc = ce.WOEEncoder(cols=['Sex']).fit(X_train,y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data3 = woe_enc.transform(data)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>-0.950742</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1.555633</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>1.555633</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1.555633</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>53.1000</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>-0.950742</td>\n",
|
||||
" <td>35.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>8.0500</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 -0.950742 22.0 1 7.2500\n",
|
||||
"1 1 1 1.555633 38.0 1 71.2833\n",
|
||||
"2 1 3 1.555633 26.0 0 7.9250\n",
|
||||
"3 1 1 1.555633 35.0 1 53.1000\n",
|
||||
"4 0 3 -0.950742 35.0 0 8.0500"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data3.head(5)"
|
||||
]
|
||||
},
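  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The WOE values above can be cross-checked by hand: per label, compute the share of survivors and the share of non-survivors in the training set and take the log of their ratio. Small differences from the encoder are possible because `WOEEncoder` applies regularization.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sketch: weight of evidence computed directly from the training data\n",
    "pos = X_train.loc[X_train['Survived'] == 1, 'Sex'].value_counts(normalize=True)\n",
    "neg = X_train.loc[X_train['Survived'] == 0, 'Sex'].value_counts(normalize=True)\n",
    "np.log(pos / neg)"
   ]
  },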
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}

3.4_Demo_Feature_Transformation.ipynb (new file, +497)

3.5_Demo_Feature_Generation.ipynb (new file, +522)
@@ -0,0 +1,522 @@
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.metrics import roc_curve, roc_auc_score\n",
|
||||
"\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"#from feature_cleaning import rare_values as ra"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"use_cols = [\n",
|
||||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||||
" 'Survived'\n",
|
||||
"]\n",
|
||||
"\n",
|
||||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>Survived</th>\n",
|
||||
" <th>Pclass</th>\n",
|
||||
" <th>Sex</th>\n",
|
||||
" <th>Age</th>\n",
|
||||
" <th>SibSp</th>\n",
|
||||
" <th>Fare</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>male</td>\n",
|
||||
" <td>22.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>7.2500</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>38.0</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>71.2833</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>female</td>\n",
|
||||
" <td>26.0</td>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>7.9250</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" Survived Pclass Sex Age SibSp Fare\n",
|
||||
"0 0 3 male 22.0 1 7.2500\n",
|
||||
"1 1 1 female 38.0 1 71.2833\n",
|
||||
"2 1 3 female 26.0 0 7.9250"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(3)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((623, 6), (268, 6))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Note that we include target variable in the X_train \n",
|
||||
"# because we need it to supervise our discretization\n",
|
||||
"# this is not the standard way of using train-test-split\n",
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Polynomial Expansion\n",
|
||||
"\n",
|
||||
"generate a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" Pclass SibSp Pclass^2 Pclass SibSp SibSp^2\n",
|
||||
"0 1.0 0.0 1.0 0.0 0.0\n",
|
||||
"1 1.0 1.0 1.0 1.0 1.0\n",
|
||||
"2 3.0 5.0 9.0 15.0 25.0\n",
|
||||
"3 1.0 0.0 1.0 0.0 0.0\n",
|
||||
"4 3.0 1.0 9.0 3.0 1.0\n",
|
||||
"5 2.0 1.0 4.0 2.0 1.0\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# create polynomial combinations of feature 'Pclass','SibSp' with degree 2\n",
|
||||
"from sklearn.preprocessing import PolynomialFeatures\n",
|
||||
"pf = PolynomialFeatures(degree=2,include_bias=False).fit(X_train[['Pclass','SibSp']])\n",
|
||||
"tmp = pf.transform(X_train[['Pclass','SibSp']])\n",
|
||||
"X_train_copy = pd.DataFrame(tmp,columns=pf.get_feature_names(['Pclass','SibSp']))\n",
|
||||
"print(X_train_copy.head(6))"
|
||||
]
|
||||
},
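A note on the cell above: newer scikit-learn releases rename get_feature_names to get_feature_names_out. As a self-contained illustration of the same expansion (the toy frame below is illustrative, not the Titanic data):

import pandas as pd
from sklearn.preprocessing import PolynomialFeatures

# toy frame standing in for X_train[['Pclass', 'SibSp']]
df = pd.DataFrame({'Pclass': [1, 3, 2], 'SibSp': [0, 5, 1]})

# degree-2 combinations: Pclass, SibSp, Pclass^2, Pclass*SibSp, SibSp^2
pf = PolynomialFeatures(degree=2, include_bias=False).fit(df)
expanded = pd.DataFrame(pf.transform(df),
                        columns=pf.get_feature_names_out(df.columns))
print(expanded)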
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"GBDT derived feature + LR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"sample's belonging node of each base tree \n",
|
||||
"' [[ 7. 7. 6. ... 4. 7. 4.]\n",
|
||||
" [ 7. 7. 6. ... 14. 7. 7.]\n",
|
||||
" [11. 11. 11. ... 4. 6. 11.]\n",
|
||||
" ...\n",
|
||||
" [10. 10. 10. ... 4. 6. 10.]\n",
|
||||
" [13. 14. 13. ... 4. 7. 13.]\n",
|
||||
" [ 7. 7. 6. ... 6. 7. 7.]]\n",
|
||||
"AUC for GBDT derived feature + LR: 0.7746130952380953\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
|
||||
"If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
|
||||
"In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
|
||||
" warnings.warn(msg, FutureWarning)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n",
|
||||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||||
"\n",
|
||||
"gbdt = GradientBoostingClassifier(n_estimators=20)\n",
|
||||
"one_hot = OneHotEncoder()\n",
|
||||
"\n",
|
||||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"\n",
|
||||
"gbdt.fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
"X_leaf_index = gbdt.apply(X_train)[:, :, 0] # apply return the node index on each tree \n",
|
||||
"print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
|
||||
"# fit one-hot encoder\n",
|
||||
"one_hot.fit(X_leaf_index) \n",
|
||||
"X_one_hot = one_hot.transform(X_leaf_index) \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"from sklearn.linear_model import LogisticRegression\n",
|
||||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||||
"lr.fit(X_one_hot,y_train)\n",
|
||||
"y_pred = lr.predict_proba(\n",
|
||||
" one_hot.transform(gbdt.apply(X_test)[:, :, 0]))[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for GBDT derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
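To spell out the leaf-encoding trick used above: apply() maps every sample to the leaf it reaches in each boosting tree, those leaf indices are one-hot encoded, and a logistic regression is fit on the resulting sparse matrix. A minimal self-contained sketch on synthetic data (dataset, sizes and hyperparameters are illustrative, not the notebook's):

from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=1000, n_features=8, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

gbdt = GradientBoostingClassifier(n_estimators=20).fit(X_tr, y_tr)

# apply() -> (n_samples, n_trees, 1); keep the leaf index per tree
leaves_tr = gbdt.apply(X_tr)[:, :, 0]
leaves_te = gbdt.apply(X_te)[:, :, 0]

# one-hot encode the leaf indices so each leaf becomes a binary feature
enc = OneHotEncoder(handle_unknown='ignore').fit(leaves_tr)
lr = LogisticRegression(solver='lbfgs', max_iter=1000)
lr.fit(enc.transform(leaves_tr), y_tr)

proba = lr.predict_proba(enc.transform(leaves_te))[:, 1]
print("AUC for GBDT-leaf features + LR:", roc_auc_score(y_te, proba))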
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"RandomForest derived feature + LR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"sample's belonging node of each base tree \n",
|
||||
"' [[212 35 79 ... 146 60 46]\n",
|
||||
" [307 165 266 ... 136 132 44]\n",
|
||||
" [285 285 320 ... 301 294 300]\n",
|
||||
" ...\n",
|
||||
" [ 13 177 133 ... 186 169 117]\n",
|
||||
" [190 296 311 ... 282 289 297]\n",
|
||||
" [264 165 243 ... 152 110 314]]\n",
|
||||
"AUC for RandomForest derived feature + LR: 0.759672619047619\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
|
||||
"If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
|
||||
"In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
|
||||
" warnings.warn(msg, FutureWarning)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"rf = RandomForestClassifier(n_estimators=20)\n",
|
||||
"one_hot = OneHotEncoder()\n",
|
||||
"\n",
|
||||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"\n",
|
||||
"rf.fit(X_train, y_train)\n",
|
||||
"\n",
|
||||
"X_leaf_index = rf.apply(X_train) # apply return the node index on each tree \n",
|
||||
"print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
|
||||
"# fit one-hot encoder\n",
|
||||
"one_hot.fit(X_leaf_index) \n",
|
||||
"X_one_hot = one_hot.transform(X_leaf_index) \n",
|
||||
"\n",
|
||||
"\n",
|
||||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||||
"lr.fit(X_one_hot,y_train)\n",
|
||||
"y_pred = lr.predict_proba(\n",
|
||||
" one_hot.transform(rf.apply(X_test)))[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for RandomForest derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"GBDT derived feature + Raw feature +LR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"AUC for GBDT derived feature + Raw feature +LR: 0.7603571428571428\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from scipy.sparse import hstack\n",
|
||||
"\n",
|
||||
"X_train_ext = hstack([one_hot.transform(gbdt.apply(X_train)[:, :, 0]), X_train])\n",
|
||||
"X_test_ext = hstack([one_hot.transform(gbdt.apply(X_test)[:, :, 0]), X_test])\n",
|
||||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||||
"lr.fit(X_train_ext,y_train)\n",
|
||||
"y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for GBDT derived feature + Raw feature +LR:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"RandomForest derived feature + Raw feature +LR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"AUC for RandomForest derived feature + Raw feature + LR: 0.76\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train_ext = hstack([one_hot.transform(rf.apply(X_train)), X_train])\n",
|
||||
"X_test_ext = hstack([one_hot.transform(rf.apply(X_test)), X_test])\n",
|
||||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||||
"lr.fit(X_train_ext,y_train)\n",
|
||||
"y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for RandomForest derived feature + Raw feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"Use only Raw Feature + LR"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"AUC for RandomForest derived feature + LR: 0.6988690476190476\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||||
"lr.fit(X_train,y_train)\n",
|
||||
"y_pred = lr.predict_proba(X_test)[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for RandomForest derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"\n",
|
||||
"Use only Raw Feature + GBDT"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"AUC for Raw feature + GBDT: 0.7613988095238096\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"gbdt = GradientBoostingClassifier(n_estimators=20)\n",
|
||||
"\n",
|
||||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"\n",
|
||||
"gbdt.fit(X_train, y_train)\n",
|
||||
"y_pred = gbdt.predict_proba(X_test)[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for Raw feature + GBDT:\", roc_auc_score(y_test, y_pred))\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Learning by Trees\n",
|
||||
"\n",
|
||||
"Use only Raw Feature + RF\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"AUC for Raw feature + RF: 0.7235119047619047\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"rf = RandomForestClassifier(n_estimators=20)\n",
|
||||
"\n",
|
||||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||||
"\n",
|
||||
"rf.fit(X_train, y_train)\n",
|
||||
"y_pred = rf.predict_proba(X_test)[:,1]\n",
|
||||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||||
"print(\"AUC for Raw feature + RF:\", roc_auc_score(y_test, y_pred))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"#### Without tuning, we can see GBDT derived feature + LR get the best result"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
696
4.1_Demo_Feature_Selection_Filter.ipynb
Normal file
@@ -0,0 +1,696 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"from feature_selection import filter_method as ft"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_breast_cancer\n",
|
||||
"data = load_breast_cancer()\n",
|
||||
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
|
||||
" columns= np.append(data['feature_names'], ['target']))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>mean radius</th>\n",
|
||||
" <th>mean texture</th>\n",
|
||||
" <th>mean perimeter</th>\n",
|
||||
" <th>mean area</th>\n",
|
||||
" <th>mean smoothness</th>\n",
|
||||
" <th>mean compactness</th>\n",
|
||||
" <th>mean concavity</th>\n",
|
||||
" <th>mean concave points</th>\n",
|
||||
" <th>mean symmetry</th>\n",
|
||||
" <th>mean fractal dimension</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>worst texture</th>\n",
|
||||
" <th>worst perimeter</th>\n",
|
||||
" <th>worst area</th>\n",
|
||||
" <th>worst smoothness</th>\n",
|
||||
" <th>worst compactness</th>\n",
|
||||
" <th>worst concavity</th>\n",
|
||||
" <th>worst concave points</th>\n",
|
||||
" <th>worst symmetry</th>\n",
|
||||
" <th>worst fractal dimension</th>\n",
|
||||
" <th>target</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>17.99</td>\n",
|
||||
" <td>10.38</td>\n",
|
||||
" <td>122.80</td>\n",
|
||||
" <td>1001.0</td>\n",
|
||||
" <td>0.11840</td>\n",
|
||||
" <td>0.27760</td>\n",
|
||||
" <td>0.3001</td>\n",
|
||||
" <td>0.14710</td>\n",
|
||||
" <td>0.2419</td>\n",
|
||||
" <td>0.07871</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>17.33</td>\n",
|
||||
" <td>184.60</td>\n",
|
||||
" <td>2019.0</td>\n",
|
||||
" <td>0.1622</td>\n",
|
||||
" <td>0.6656</td>\n",
|
||||
" <td>0.7119</td>\n",
|
||||
" <td>0.2654</td>\n",
|
||||
" <td>0.4601</td>\n",
|
||||
" <td>0.11890</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>20.57</td>\n",
|
||||
" <td>17.77</td>\n",
|
||||
" <td>132.90</td>\n",
|
||||
" <td>1326.0</td>\n",
|
||||
" <td>0.08474</td>\n",
|
||||
" <td>0.07864</td>\n",
|
||||
" <td>0.0869</td>\n",
|
||||
" <td>0.07017</td>\n",
|
||||
" <td>0.1812</td>\n",
|
||||
" <td>0.05667</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>23.41</td>\n",
|
||||
" <td>158.80</td>\n",
|
||||
" <td>1956.0</td>\n",
|
||||
" <td>0.1238</td>\n",
|
||||
" <td>0.1866</td>\n",
|
||||
" <td>0.2416</td>\n",
|
||||
" <td>0.1860</td>\n",
|
||||
" <td>0.2750</td>\n",
|
||||
" <td>0.08902</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>19.69</td>\n",
|
||||
" <td>21.25</td>\n",
|
||||
" <td>130.00</td>\n",
|
||||
" <td>1203.0</td>\n",
|
||||
" <td>0.10960</td>\n",
|
||||
" <td>0.15990</td>\n",
|
||||
" <td>0.1974</td>\n",
|
||||
" <td>0.12790</td>\n",
|
||||
" <td>0.2069</td>\n",
|
||||
" <td>0.05999</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>25.53</td>\n",
|
||||
" <td>152.50</td>\n",
|
||||
" <td>1709.0</td>\n",
|
||||
" <td>0.1444</td>\n",
|
||||
" <td>0.4245</td>\n",
|
||||
" <td>0.4504</td>\n",
|
||||
" <td>0.2430</td>\n",
|
||||
" <td>0.3613</td>\n",
|
||||
" <td>0.08758</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>11.42</td>\n",
|
||||
" <td>20.38</td>\n",
|
||||
" <td>77.58</td>\n",
|
||||
" <td>386.1</td>\n",
|
||||
" <td>0.14250</td>\n",
|
||||
" <td>0.28390</td>\n",
|
||||
" <td>0.2414</td>\n",
|
||||
" <td>0.10520</td>\n",
|
||||
" <td>0.2597</td>\n",
|
||||
" <td>0.09744</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>26.50</td>\n",
|
||||
" <td>98.87</td>\n",
|
||||
" <td>567.7</td>\n",
|
||||
" <td>0.2098</td>\n",
|
||||
" <td>0.8663</td>\n",
|
||||
" <td>0.6869</td>\n",
|
||||
" <td>0.2575</td>\n",
|
||||
" <td>0.6638</td>\n",
|
||||
" <td>0.17300</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>20.29</td>\n",
|
||||
" <td>14.34</td>\n",
|
||||
" <td>135.10</td>\n",
|
||||
" <td>1297.0</td>\n",
|
||||
" <td>0.10030</td>\n",
|
||||
" <td>0.13280</td>\n",
|
||||
" <td>0.1980</td>\n",
|
||||
" <td>0.10430</td>\n",
|
||||
" <td>0.1809</td>\n",
|
||||
" <td>0.05883</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>16.67</td>\n",
|
||||
" <td>152.20</td>\n",
|
||||
" <td>1575.0</td>\n",
|
||||
" <td>0.1374</td>\n",
|
||||
" <td>0.2050</td>\n",
|
||||
" <td>0.4000</td>\n",
|
||||
" <td>0.1625</td>\n",
|
||||
" <td>0.2364</td>\n",
|
||||
" <td>0.07678</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 31 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
|
||||
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
|
||||
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
|
||||
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
|
||||
"3 11.42 20.38 77.58 386.1 0.14250 \n",
|
||||
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
|
||||
"\n",
|
||||
" mean compactness mean concavity mean concave points mean symmetry \\\n",
|
||||
"0 0.27760 0.3001 0.14710 0.2419 \n",
|
||||
"1 0.07864 0.0869 0.07017 0.1812 \n",
|
||||
"2 0.15990 0.1974 0.12790 0.2069 \n",
|
||||
"3 0.28390 0.2414 0.10520 0.2597 \n",
|
||||
"4 0.13280 0.1980 0.10430 0.1809 \n",
|
||||
"\n",
|
||||
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
|
||||
"0 0.07871 ... 17.33 184.60 2019.0 \n",
|
||||
"1 0.05667 ... 23.41 158.80 1956.0 \n",
|
||||
"2 0.05999 ... 25.53 152.50 1709.0 \n",
|
||||
"3 0.09744 ... 26.50 98.87 567.7 \n",
|
||||
"4 0.05883 ... 16.67 152.20 1575.0 \n",
|
||||
"\n",
|
||||
" worst smoothness worst compactness worst concavity worst concave points \\\n",
|
||||
"0 0.1622 0.6656 0.7119 0.2654 \n",
|
||||
"1 0.1238 0.1866 0.2416 0.1860 \n",
|
||||
"2 0.1444 0.4245 0.4504 0.2430 \n",
|
||||
"3 0.2098 0.8663 0.6869 0.2575 \n",
|
||||
"4 0.1374 0.2050 0.4000 0.1625 \n",
|
||||
"\n",
|
||||
" worst symmetry worst fractal dimension target \n",
|
||||
"0 0.4601 0.11890 0.0 \n",
|
||||
"1 0.2750 0.08902 0.0 \n",
|
||||
"2 0.3613 0.08758 0.0 \n",
|
||||
"3 0.6638 0.17300 0.0 \n",
|
||||
"4 0.2364 0.07678 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 31 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((455, 30), (114, 30))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
|
||||
" data.target, test_size=0.2,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Variance method\n",
|
||||
"removing features that show the same value for the majority/all of the observations (constant/quasi-constant features)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"0 variables are found to be almost constant\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# the original dataset has no constant variable\n",
|
||||
"quasi_constant_feature = ft.constant_feature_detect(data=X_train,threshold=0.9)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"1.0 0.923077\n",
|
||||
"0.0 0.068132\n",
|
||||
"2.0 0.008791\n",
|
||||
"Name: dummy, dtype: float64"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# lets create a duumy variable that help us do the demonstration\n",
|
||||
"X_train['dummy'] = np.floor(X_train['worst smoothness']*10)\n",
|
||||
"# variable dummy has> 92% of the observations show one value, 1.0\n",
|
||||
"X_train.dummy.value_counts() / np.float(len(X_train))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"1 variables are found to be almost constant\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['dummy']"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"quasi_constant_feature = ft.constant_feature_detect(data=X_train,threshold=0.9)\n",
|
||||
"quasi_constant_feature"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"(455, 30)\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# drop that variable\n",
|
||||
"X_train.drop(labels=quasi_constant_feature,axis=1,inplace=True)\n",
|
||||
"print(X_train.shape)"
|
||||
]
|
||||
},
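constant_feature_detect comes from this repo's feature_selection package; assuming it flags columns whose single most frequent value exceeds the given share of rows, a rough pandas-only equivalent looks like this (the function name, toy frame and threshold are illustrative):

import pandas as pd

def quasi_constant_columns(df, threshold=0.9):
    """Return columns whose dominant value covers more than `threshold` of the rows."""
    flagged = []
    for col in df.columns:
        top_share = df[col].value_counts(normalize=True).iloc[0]
        if top_share > threshold:
            flagged.append(col)
    return flagged

demo = pd.DataFrame({'a': [1, 1, 1, 1, 2], 'b': [1, 2, 3, 4, 5]})
print(quasi_constant_columns(demo, threshold=0.7))  # ['a']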
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Correlation method\n",
|
||||
"remove features that are highly correlated with each other"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" feature1 feature2 corr\n",
|
||||
"0 mean perimeter mean radius 0.998185\n",
|
||||
"6 mean perimeter mean area 0.986692\n",
|
||||
"14 mean perimeter worst perimeter 0.970507\n",
|
||||
"19 mean perimeter worst radius 0.969520\n",
|
||||
"33 mean perimeter worst area 0.941920 \n",
|
||||
"\n",
|
||||
" feature1 feature2 corr\n",
|
||||
"12 perimeter error radius error 0.978323\n",
|
||||
"30 perimeter error area error 0.944995 \n",
|
||||
"\n",
|
||||
" feature1 feature2 corr\n",
|
||||
"36 mean concavity mean concave points 0.914627 \n",
|
||||
"\n",
|
||||
" feature1 feature2 corr\n",
|
||||
"38 mean texture worst texture 0.908182 \n",
|
||||
"\n",
|
||||
" feature1 feature2 corr\n",
|
||||
"40 worst concave points mean concave points 0.906312 \n",
|
||||
"\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"corr = ft.corr_feature_detect(data=X_train,threshold=0.9)\n",
|
||||
"# print all the correlated feature groups!\n",
|
||||
"for i in corr:\n",
|
||||
" print(i,'\\n')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"then we can decide which ones to remove."
|
||||
]
|
||||
},
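corr_feature_detect is also a repo helper; a plain-pandas way to list highly correlated pairs is to scan the upper triangle of the absolute correlation matrix, as in this sketch (function name and threshold are illustrative):

import numpy as np
import pandas as pd

def correlated_pairs(df, threshold=0.9):
    """List feature pairs whose absolute Pearson correlation exceeds `threshold`."""
    corr = df.corr().abs()
    # keep only the upper triangle to drop self-correlations and duplicates
    upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
    pairs = upper.stack().reset_index()
    pairs.columns = ['feature1', 'feature2', 'corr']
    return pairs[pairs['corr'] > threshold].sort_values('corr', ascending=False)

# e.g. correlated_pairs(X_train, threshold=0.9)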
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Mutual Information Filter\n",
|
||||
"Mutual information measures how much information the presence/absence of a feature contributes to making the correct prediction on Y."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['mean concave points', 'worst perimeter', 'worst area'], dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# select the top 3 features\n",
|
||||
"mi = ft.mutual_info(X=X_train,y=y_train,select_k=3)\n",
|
||||
"print(mi)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['mean perimeter', 'mean concave points', 'worst radius',\n",
|
||||
" 'worst perimeter', 'worst area', 'worst concave points'],\n",
|
||||
" dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# select the top 20% features\n",
|
||||
"mi = ft.mutual_info(X=X_train,y=y_train,select_k=0.2)\n",
|
||||
"print(mi)"
|
||||
]
|
||||
},
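ft.mutual_info is a thin repo wrapper; assuming it behaves like scikit-learn's SelectKBest with mutual_info_classif, the direct call would be roughly the following (reusing the X_train / y_train defined above):

from sklearn.feature_selection import SelectKBest, mutual_info_classif

# keep the 3 features with the highest estimated mutual information with the target
selector = SelectKBest(mutual_info_classif, k=3).fit(X_train, y_train)
print(X_train.columns[selector.get_support()])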
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Chi-Square Filter\n",
|
||||
"Compute chi-squared stats between each non-negative feature and class"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['mean area', 'area error', 'worst area'], dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# select the top 3 features\n",
|
||||
"chi = ft.chi_square_test(X=X_train,y=y_train,select_k=3)\n",
|
||||
"print(chi)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['mean perimeter', 'mean area', 'area error', 'worst radius',\n",
|
||||
" 'worst perimeter', 'worst area'],\n",
|
||||
" dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# select the top 20% features\n",
|
||||
"chi = ft.chi_square_test(X=X_train,y=y_train,select_k=0.2)\n",
|
||||
"print(chi)"
|
||||
]
|
||||
},
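Likewise, the chi-square filter can be reproduced directly with SelectKBest and chi2; note that chi2 requires non-negative features, which holds for this dataset (again reusing X_train / y_train from above):

from sklearn.feature_selection import SelectKBest, chi2

# keep the 3 features with the highest chi-squared statistic against the target
selector = SelectKBest(chi2, k=3).fit(X_train, y_train)
print(X_train.columns[selector.get_support()])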
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Univariate ROC-AUC or MSE\n",
|
||||
"builds one decision tree per feature, to predict the target, then make predictions and ranks the features according to the machine learning metric (roc-auc or mse)"
|
||||
]
|
||||
},
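Before the repo helper is called in the next cell, here is a minimal sketch of that idea under the same train/test split: fit a one-feature decision tree per column, score it on the held-out set, and keep the columns whose ROC-AUC exceeds a threshold (the function name and threshold are illustrative, not the repo's implementation):

import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier

def univariate_tree_auc(X_train, y_train, X_test, y_test, threshold=0.8):
    """Rank features by the ROC-AUC of a single-feature decision tree."""
    scores = {}
    for col in X_train.columns:
        clf = DecisionTreeClassifier(random_state=0)
        clf.fit(X_train[[col]], y_train)
        proba = clf.predict_proba(X_test[[col]])[:, 1]
        scores[col] = roc_auc_score(y_test, proba)
    scores = pd.Series(scores).sort_values(ascending=False)
    return scores[scores > threshold]

# e.g. univariate_tree_auc(X_train, y_train, X_test, y_test, threshold=0.8)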
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"worst perimeter 0.917275\n",
|
||||
"worst area 0.895840\n",
|
||||
"worst radius 0.893458\n",
|
||||
"worst concave points 0.863131\n",
|
||||
"mean concavity 0.856939\n",
|
||||
"mean radius 0.849000\n",
|
||||
"mean area 0.839314\n",
|
||||
"worst concavity 0.831375\n",
|
||||
"mean perimeter 0.829628\n",
|
||||
"mean concave points 0.826453\n",
|
||||
"area error 0.812321\n",
|
||||
"worst compactness 0.742299\n",
|
||||
"radius error 0.740235\n",
|
||||
"mean compactness 0.734360\n",
|
||||
"perimeter error 0.680534\n",
|
||||
"worst texture 0.647666\n",
|
||||
"worst fractal dimension 0.640997\n",
|
||||
"concavity error 0.640203\n",
|
||||
"worst symmetry 0.620991\n",
|
||||
"concave points error 0.618133\n",
|
||||
"compactness error 0.607336\n",
|
||||
"mean symmetry 0.591775\n",
|
||||
"mean texture 0.573357\n",
|
||||
"texture error 0.568593\n",
|
||||
"worst smoothness 0.565100\n",
|
||||
"mean smoothness 0.557637\n",
|
||||
"fractal dimension error 0.542077\n",
|
||||
"smoothness error 0.522706\n",
|
||||
"symmetry error 0.493649\n",
|
||||
"mean fractal dimension 0.475548\n",
|
||||
"dtype: float64\n",
|
||||
"11 out of the 30 featues are kept\n",
|
||||
"mean radius 0.849000\n",
|
||||
"mean perimeter 0.829628\n",
|
||||
"mean area 0.839314\n",
|
||||
"mean concavity 0.856939\n",
|
||||
"mean concave points 0.826453\n",
|
||||
"area error 0.812321\n",
|
||||
"worst radius 0.893458\n",
|
||||
"worst perimeter 0.917275\n",
|
||||
"worst area 0.895840\n",
|
||||
"worst concavity 0.831375\n",
|
||||
"worst concave points 0.863131\n",
|
||||
"dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"uni_roc_auc = ft.univariate_roc_auc(X_train=X_train,y_train=y_train,\n",
|
||||
" X_test=X_test,y_test=y_test,threshold=0.8)\n",
|
||||
"print(uni_roc_auc)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"mean fractal dimension 0.491228\n",
|
||||
"symmetry error 0.480750\n",
|
||||
"fractal dimension error 0.456140\n",
|
||||
"smoothness error 0.449561\n",
|
||||
"texture error 0.412281\n",
|
||||
"worst smoothness 0.403265\n",
|
||||
"mean smoothness 0.399123\n",
|
||||
"mean texture 0.396930\n",
|
||||
"mean symmetry 0.363060\n",
|
||||
"compactness error 0.361842\n",
|
||||
"concave points error 0.357456\n",
|
||||
"worst fractal dimension 0.355263\n",
|
||||
"worst symmetry 0.350877\n",
|
||||
"worst texture 0.333333\n",
|
||||
"concavity error 0.333333\n",
|
||||
"perimeter error 0.300439\n",
|
||||
"mean compactness 0.258772\n",
|
||||
"worst compactness 0.254386\n",
|
||||
"radius error 0.245614\n",
|
||||
"area error 0.179825\n",
|
||||
"mean perimeter 0.166667\n",
|
||||
"mean concave points 0.166667\n",
|
||||
"worst concavity 0.162281\n",
|
||||
"mean radius 0.146930\n",
|
||||
"mean concavity 0.142544\n",
|
||||
"mean area 0.140351\n",
|
||||
"worst concave points 0.123782\n",
|
||||
"worst area 0.103070\n",
|
||||
"worst radius 0.100877\n",
|
||||
"worst perimeter 0.098684\n",
|
||||
"dtype: float64\n",
|
||||
"6 out of the 30 featues are kept\n",
|
||||
"mean fractal dimension 0.491228\n",
|
||||
"texture error 0.412281\n",
|
||||
"smoothness error 0.449561\n",
|
||||
"symmetry error 0.480750\n",
|
||||
"fractal dimension error 0.456140\n",
|
||||
"worst smoothness 0.403265\n",
|
||||
"dtype: float64\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"uni_mse = ft.univariate_mse(X_train=X_train,y_train=y_train,\n",
|
||||
" X_test=X_test,y_test=y_test,threshold=0.4)\n",
|
||||
"print(uni_mse)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
548
4.2_Demo_Feature_Selection_Wrapper.ipynb
Normal file
@@ -0,0 +1,548 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 45,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n",
|
||||
"from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS\n",
|
||||
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
||||
"\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"# from feature_selection import filter_method as ft"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_breast_cancer\n",
|
||||
"data = load_breast_cancer()\n",
|
||||
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
|
||||
" columns= np.append(data['feature_names'], ['target']))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>mean radius</th>\n",
|
||||
" <th>mean texture</th>\n",
|
||||
" <th>mean perimeter</th>\n",
|
||||
" <th>mean area</th>\n",
|
||||
" <th>mean smoothness</th>\n",
|
||||
" <th>mean compactness</th>\n",
|
||||
" <th>mean concavity</th>\n",
|
||||
" <th>mean concave points</th>\n",
|
||||
" <th>mean symmetry</th>\n",
|
||||
" <th>mean fractal dimension</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>worst texture</th>\n",
|
||||
" <th>worst perimeter</th>\n",
|
||||
" <th>worst area</th>\n",
|
||||
" <th>worst smoothness</th>\n",
|
||||
" <th>worst compactness</th>\n",
|
||||
" <th>worst concavity</th>\n",
|
||||
" <th>worst concave points</th>\n",
|
||||
" <th>worst symmetry</th>\n",
|
||||
" <th>worst fractal dimension</th>\n",
|
||||
" <th>target</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>17.99</td>\n",
|
||||
" <td>10.38</td>\n",
|
||||
" <td>122.80</td>\n",
|
||||
" <td>1001.0</td>\n",
|
||||
" <td>0.11840</td>\n",
|
||||
" <td>0.27760</td>\n",
|
||||
" <td>0.3001</td>\n",
|
||||
" <td>0.14710</td>\n",
|
||||
" <td>0.2419</td>\n",
|
||||
" <td>0.07871</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>17.33</td>\n",
|
||||
" <td>184.60</td>\n",
|
||||
" <td>2019.0</td>\n",
|
||||
" <td>0.1622</td>\n",
|
||||
" <td>0.6656</td>\n",
|
||||
" <td>0.7119</td>\n",
|
||||
" <td>0.2654</td>\n",
|
||||
" <td>0.4601</td>\n",
|
||||
" <td>0.11890</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>20.57</td>\n",
|
||||
" <td>17.77</td>\n",
|
||||
" <td>132.90</td>\n",
|
||||
" <td>1326.0</td>\n",
|
||||
" <td>0.08474</td>\n",
|
||||
" <td>0.07864</td>\n",
|
||||
" <td>0.0869</td>\n",
|
||||
" <td>0.07017</td>\n",
|
||||
" <td>0.1812</td>\n",
|
||||
" <td>0.05667</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>23.41</td>\n",
|
||||
" <td>158.80</td>\n",
|
||||
" <td>1956.0</td>\n",
|
||||
" <td>0.1238</td>\n",
|
||||
" <td>0.1866</td>\n",
|
||||
" <td>0.2416</td>\n",
|
||||
" <td>0.1860</td>\n",
|
||||
" <td>0.2750</td>\n",
|
||||
" <td>0.08902</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>19.69</td>\n",
|
||||
" <td>21.25</td>\n",
|
||||
" <td>130.00</td>\n",
|
||||
" <td>1203.0</td>\n",
|
||||
" <td>0.10960</td>\n",
|
||||
" <td>0.15990</td>\n",
|
||||
" <td>0.1974</td>\n",
|
||||
" <td>0.12790</td>\n",
|
||||
" <td>0.2069</td>\n",
|
||||
" <td>0.05999</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>25.53</td>\n",
|
||||
" <td>152.50</td>\n",
|
||||
" <td>1709.0</td>\n",
|
||||
" <td>0.1444</td>\n",
|
||||
" <td>0.4245</td>\n",
|
||||
" <td>0.4504</td>\n",
|
||||
" <td>0.2430</td>\n",
|
||||
" <td>0.3613</td>\n",
|
||||
" <td>0.08758</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>11.42</td>\n",
|
||||
" <td>20.38</td>\n",
|
||||
" <td>77.58</td>\n",
|
||||
" <td>386.1</td>\n",
|
||||
" <td>0.14250</td>\n",
|
||||
" <td>0.28390</td>\n",
|
||||
" <td>0.2414</td>\n",
|
||||
" <td>0.10520</td>\n",
|
||||
" <td>0.2597</td>\n",
|
||||
" <td>0.09744</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>26.50</td>\n",
|
||||
" <td>98.87</td>\n",
|
||||
" <td>567.7</td>\n",
|
||||
" <td>0.2098</td>\n",
|
||||
" <td>0.8663</td>\n",
|
||||
" <td>0.6869</td>\n",
|
||||
" <td>0.2575</td>\n",
|
||||
" <td>0.6638</td>\n",
|
||||
" <td>0.17300</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>20.29</td>\n",
|
||||
" <td>14.34</td>\n",
|
||||
" <td>135.10</td>\n",
|
||||
" <td>1297.0</td>\n",
|
||||
" <td>0.10030</td>\n",
|
||||
" <td>0.13280</td>\n",
|
||||
" <td>0.1980</td>\n",
|
||||
" <td>0.10430</td>\n",
|
||||
" <td>0.1809</td>\n",
|
||||
" <td>0.05883</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>16.67</td>\n",
|
||||
" <td>152.20</td>\n",
|
||||
" <td>1575.0</td>\n",
|
||||
" <td>0.1374</td>\n",
|
||||
" <td>0.2050</td>\n",
|
||||
" <td>0.4000</td>\n",
|
||||
" <td>0.1625</td>\n",
|
||||
" <td>0.2364</td>\n",
|
||||
" <td>0.07678</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 31 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
|
||||
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
|
||||
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
|
||||
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
|
||||
"3 11.42 20.38 77.58 386.1 0.14250 \n",
|
||||
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
|
||||
"\n",
|
||||
" mean compactness mean concavity mean concave points mean symmetry \\\n",
|
||||
"0 0.27760 0.3001 0.14710 0.2419 \n",
|
||||
"1 0.07864 0.0869 0.07017 0.1812 \n",
|
||||
"2 0.15990 0.1974 0.12790 0.2069 \n",
|
||||
"3 0.28390 0.2414 0.10520 0.2597 \n",
|
||||
"4 0.13280 0.1980 0.10430 0.1809 \n",
|
||||
"\n",
|
||||
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
|
||||
"0 0.07871 ... 17.33 184.60 2019.0 \n",
|
||||
"1 0.05667 ... 23.41 158.80 1956.0 \n",
|
||||
"2 0.05999 ... 25.53 152.50 1709.0 \n",
|
||||
"3 0.09744 ... 26.50 98.87 567.7 \n",
|
||||
"4 0.05883 ... 16.67 152.20 1575.0 \n",
|
||||
"\n",
|
||||
" worst smoothness worst compactness worst concavity worst concave points \\\n",
|
||||
"0 0.1622 0.6656 0.7119 0.2654 \n",
|
||||
"1 0.1238 0.1866 0.2416 0.1860 \n",
|
||||
"2 0.1444 0.4245 0.4504 0.2430 \n",
|
||||
"3 0.2098 0.8663 0.6869 0.2575 \n",
|
||||
"4 0.1374 0.2050 0.4000 0.1625 \n",
|
||||
"\n",
|
||||
" worst symmetry worst fractal dimension target \n",
|
||||
"0 0.4601 0.11890 0.0 \n",
|
||||
"1 0.2750 0.08902 0.0 \n",
|
||||
"2 0.3613 0.08758 0.0 \n",
|
||||
"3 0.6638 0.17300 0.0 \n",
|
||||
"4 0.2364 0.07678 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 31 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((455, 30), (114, 30))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
|
||||
" data.target, test_size=0.2,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Forward Selection\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.4s finished\n",
|
||||
"Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
|
||||
"Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
|
||||
"Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.3s finished\n",
|
||||
"Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.0s finished\n",
|
||||
"Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
|
||||
"Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
|
||||
"Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
|
||||
"Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.4s finished\n",
|
||||
"Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.1s finished\n",
|
||||
"Features: 10/10"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# step forward feature selection\n",
|
||||
"# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
|
||||
"\n",
|
||||
"sfs1 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
|
||||
" k_features=10, \n",
|
||||
" forward=True, \n",
|
||||
" floating=False, \n",
|
||||
" verbose=1,\n",
|
||||
" scoring='roc_auc',\n",
|
||||
" cv=3)\n",
|
||||
"\n",
|
||||
"sfs1 = sfs1.fit(np.array(X_train), y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index(['mean texture', 'mean perimeter', 'mean concavity',\n",
|
||||
" 'mean fractal dimension', 'area error', 'compactness error',\n",
|
||||
" 'worst perimeter', 'worst area', 'worst smoothness', 'worst symmetry'],\n",
|
||||
" dtype='object')"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"selected_feat1= X_train.columns[list(sfs1.k_feature_idx_)]\n",
|
||||
"selected_feat1"
|
||||
]
|
||||
},
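Forward selection is just a greedy loop: start from an empty set and, at each step, add the single feature that most improves the cross-validated score. A compact sketch of that loop, independent of mlxtend (model, scorer and k are illustrative):

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def forward_select(X, y, k=10, cv=3):
    """Greedily pick k features by mean cross-validated ROC-AUC."""
    remaining, chosen = list(X.columns), []
    while remaining and len(chosen) < k:
        trial_scores = {}
        for col in remaining:
            model = RandomForestClassifier(n_estimators=5, n_jobs=-1, random_state=0)
            trial_scores[col] = cross_val_score(model, X[chosen + [col]], y,
                                                cv=cv, scoring='roc_auc').mean()
        best = max(trial_scores, key=trial_scores.get)
        chosen.append(best)
        remaining.remove(best)
    return chosen

# e.g. forward_select(X_train, y_train, k=10)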
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Backward Elimination"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.5s finished\n",
|
||||
"Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
|
||||
"Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
|
||||
"Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.2s finished\n",
|
||||
"Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.1s finished\n",
|
||||
"Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
|
||||
"Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
|
||||
"Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
|
||||
"Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.5s finished\n",
|
||||
"Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||||
"[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.2s finished\n",
|
||||
"Features: 10/10"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# step backward feature selection\n",
|
||||
"# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
|
||||
"\n",
|
||||
"sfs2 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
|
||||
" k_features=10, \n",
|
||||
" forward=False, \n",
|
||||
" floating=False, \n",
|
||||
" verbose=1,\n",
|
||||
" scoring='roc_auc',\n",
|
||||
" cv=3)\n",
|
||||
"\n",
|
||||
"sfs2 = sfs1.fit(np.array(X_train.fillna(0)), y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index(['mean area', 'mean compactness', 'texture error', 'area error',\n",
|
||||
" 'compactness error', 'concavity error', 'worst texture',\n",
|
||||
" 'worst perimeter', 'worst smoothness', 'worst concavity'],\n",
|
||||
" dtype='object')"
|
||||
]
|
||||
},
|
||||
"execution_count": 44,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"selected_feat2= X_train.columns[list(sfs2.k_feature_idx_)]\n",
|
||||
"selected_feat2\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Note that SFS and SBE return different results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Exhaustive Feature Selection"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 51,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Features: 847/847"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"efs1 = EFS(RandomForestClassifier(n_jobs=-1,n_estimators=5, random_state=0), \n",
|
||||
" min_features=1,\n",
|
||||
" max_features=6, \n",
|
||||
" scoring='roc_auc',\n",
|
||||
" print_progress=True,\n",
|
||||
" cv=2)\n",
|
||||
"\n",
|
||||
"# in order to shorter search time for the demonstration\n",
|
||||
"# we only try all possible 1,2,3,4,5,6\n",
|
||||
"# feature combinations from a dataset of 10 features\n",
|
||||
"\n",
|
||||
"efs1 = efs1.fit(np.array(X_train[X_train.columns[0:10]].fillna(0)), y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index(['mean radius', 'mean texture', 'mean area', 'mean smoothness',\n",
|
||||
" 'mean concavity'],\n",
|
||||
" dtype='object')"
|
||||
]
|
||||
},
|
||||
"execution_count": 52,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"selected_feat3= X_train.columns[list(efs1.best_idx_)]\n",
|
||||
"selected_feat3"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
679
4.3_Demo_Feature_Selection_Embedded.ipynb
Normal file
595
4.4_Demo_Feature_Selection_Feature_Shuffling.ipynb
Normal file
@@ -0,0 +1,595 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.feature_selection import SelectFromModel\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"from feature_selection import feature_shuffle\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_breast_cancer\n",
|
||||
"data = load_breast_cancer()\n",
|
||||
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
|
||||
" columns= np.append(data['feature_names'], ['target']))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>mean radius</th>\n",
|
||||
" <th>mean texture</th>\n",
|
||||
" <th>mean perimeter</th>\n",
|
||||
" <th>mean area</th>\n",
|
||||
" <th>mean smoothness</th>\n",
|
||||
" <th>mean compactness</th>\n",
|
||||
" <th>mean concavity</th>\n",
|
||||
" <th>mean concave points</th>\n",
|
||||
" <th>mean symmetry</th>\n",
|
||||
" <th>mean fractal dimension</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>worst texture</th>\n",
|
||||
" <th>worst perimeter</th>\n",
|
||||
" <th>worst area</th>\n",
|
||||
" <th>worst smoothness</th>\n",
|
||||
" <th>worst compactness</th>\n",
|
||||
" <th>worst concavity</th>\n",
|
||||
" <th>worst concave points</th>\n",
|
||||
" <th>worst symmetry</th>\n",
|
||||
" <th>worst fractal dimension</th>\n",
|
||||
" <th>target</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>17.99</td>\n",
|
||||
" <td>10.38</td>\n",
|
||||
" <td>122.80</td>\n",
|
||||
" <td>1001.0</td>\n",
|
||||
" <td>0.11840</td>\n",
|
||||
" <td>0.27760</td>\n",
|
||||
" <td>0.3001</td>\n",
|
||||
" <td>0.14710</td>\n",
|
||||
" <td>0.2419</td>\n",
|
||||
" <td>0.07871</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>17.33</td>\n",
|
||||
" <td>184.60</td>\n",
|
||||
" <td>2019.0</td>\n",
|
||||
" <td>0.1622</td>\n",
|
||||
" <td>0.6656</td>\n",
|
||||
" <td>0.7119</td>\n",
|
||||
" <td>0.2654</td>\n",
|
||||
" <td>0.4601</td>\n",
|
||||
" <td>0.11890</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>20.57</td>\n",
|
||||
" <td>17.77</td>\n",
|
||||
" <td>132.90</td>\n",
|
||||
" <td>1326.0</td>\n",
|
||||
" <td>0.08474</td>\n",
|
||||
" <td>0.07864</td>\n",
|
||||
" <td>0.0869</td>\n",
|
||||
" <td>0.07017</td>\n",
|
||||
" <td>0.1812</td>\n",
|
||||
" <td>0.05667</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>23.41</td>\n",
|
||||
" <td>158.80</td>\n",
|
||||
" <td>1956.0</td>\n",
|
||||
" <td>0.1238</td>\n",
|
||||
" <td>0.1866</td>\n",
|
||||
" <td>0.2416</td>\n",
|
||||
" <td>0.1860</td>\n",
|
||||
" <td>0.2750</td>\n",
|
||||
" <td>0.08902</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>19.69</td>\n",
|
||||
" <td>21.25</td>\n",
|
||||
" <td>130.00</td>\n",
|
||||
" <td>1203.0</td>\n",
|
||||
" <td>0.10960</td>\n",
|
||||
" <td>0.15990</td>\n",
|
||||
" <td>0.1974</td>\n",
|
||||
" <td>0.12790</td>\n",
|
||||
" <td>0.2069</td>\n",
|
||||
" <td>0.05999</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>25.53</td>\n",
|
||||
" <td>152.50</td>\n",
|
||||
" <td>1709.0</td>\n",
|
||||
" <td>0.1444</td>\n",
|
||||
" <td>0.4245</td>\n",
|
||||
" <td>0.4504</td>\n",
|
||||
" <td>0.2430</td>\n",
|
||||
" <td>0.3613</td>\n",
|
||||
" <td>0.08758</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>11.42</td>\n",
|
||||
" <td>20.38</td>\n",
|
||||
" <td>77.58</td>\n",
|
||||
" <td>386.1</td>\n",
|
||||
" <td>0.14250</td>\n",
|
||||
" <td>0.28390</td>\n",
|
||||
" <td>0.2414</td>\n",
|
||||
" <td>0.10520</td>\n",
|
||||
" <td>0.2597</td>\n",
|
||||
" <td>0.09744</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>26.50</td>\n",
|
||||
" <td>98.87</td>\n",
|
||||
" <td>567.7</td>\n",
|
||||
" <td>0.2098</td>\n",
|
||||
" <td>0.8663</td>\n",
|
||||
" <td>0.6869</td>\n",
|
||||
" <td>0.2575</td>\n",
|
||||
" <td>0.6638</td>\n",
|
||||
" <td>0.17300</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>20.29</td>\n",
|
||||
" <td>14.34</td>\n",
|
||||
" <td>135.10</td>\n",
|
||||
" <td>1297.0</td>\n",
|
||||
" <td>0.10030</td>\n",
|
||||
" <td>0.13280</td>\n",
|
||||
" <td>0.1980</td>\n",
|
||||
" <td>0.10430</td>\n",
|
||||
" <td>0.1809</td>\n",
|
||||
" <td>0.05883</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>16.67</td>\n",
|
||||
" <td>152.20</td>\n",
|
||||
" <td>1575.0</td>\n",
|
||||
" <td>0.1374</td>\n",
|
||||
" <td>0.2050</td>\n",
|
||||
" <td>0.4000</td>\n",
|
||||
" <td>0.1625</td>\n",
|
||||
" <td>0.2364</td>\n",
|
||||
" <td>0.07678</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 31 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
|
||||
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
|
||||
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
|
||||
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
|
||||
"3 11.42 20.38 77.58 386.1 0.14250 \n",
|
||||
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
|
||||
"\n",
|
||||
" mean compactness mean concavity mean concave points mean symmetry \\\n",
|
||||
"0 0.27760 0.3001 0.14710 0.2419 \n",
|
||||
"1 0.07864 0.0869 0.07017 0.1812 \n",
|
||||
"2 0.15990 0.1974 0.12790 0.2069 \n",
|
||||
"3 0.28390 0.2414 0.10520 0.2597 \n",
|
||||
"4 0.13280 0.1980 0.10430 0.1809 \n",
|
||||
"\n",
|
||||
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
|
||||
"0 0.07871 ... 17.33 184.60 2019.0 \n",
|
||||
"1 0.05667 ... 23.41 158.80 1956.0 \n",
|
||||
"2 0.05999 ... 25.53 152.50 1709.0 \n",
|
||||
"3 0.09744 ... 26.50 98.87 567.7 \n",
|
||||
"4 0.05883 ... 16.67 152.20 1575.0 \n",
|
||||
"\n",
|
||||
" worst smoothness worst compactness worst concavity worst concave points \\\n",
|
||||
"0 0.1622 0.6656 0.7119 0.2654 \n",
|
||||
"1 0.1238 0.1866 0.2416 0.1860 \n",
|
||||
"2 0.1444 0.4245 0.4504 0.2430 \n",
|
||||
"3 0.2098 0.8663 0.6869 0.2575 \n",
|
||||
"4 0.1374 0.2050 0.4000 0.1625 \n",
|
||||
"\n",
|
||||
" worst symmetry worst fractal dimension target \n",
|
||||
"0 0.4601 0.11890 0.0 \n",
|
||||
"1 0.2750 0.08902 0.0 \n",
|
||||
"2 0.3613 0.08758 0.0 \n",
|
||||
"3 0.6638 0.17300 0.0 \n",
|
||||
"4 0.2364 0.07678 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 31 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((455, 30), (114, 30))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
|
||||
" data.target, test_size=0.2,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Feature Shuffling\n",
|
||||
"permute the values of each feature, one at the time, and measure how much the permutation decreases the accuracy, or the roc_auc, or the mse of the machine learning model.\n",
|
||||
"If the variables are important, this is, highly predictive, a random permutation of their values will decrease dramatically any of these metrics."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"auc_drop, selected_features = feature_shuffle.feature_shuffle_rf(X_train=X_train,\n",
|
||||
" y_train=y_train,\n",
|
||||
" random_state=0)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>feature</th>\n",
|
||||
" <th>auc_drop</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>22</th>\n",
|
||||
" <td>worst perimeter</td>\n",
|
||||
" <td>8.359457e-05</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>27</th>\n",
|
||||
" <td>worst concave points</td>\n",
|
||||
" <td>3.134796e-05</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>23</th>\n",
|
||||
" <td>worst area</td>\n",
|
||||
" <td>1.110223e-16</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12</th>\n",
|
||||
" <td>perimeter error</td>\n",
|
||||
" <td>1.110223e-16</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>mean radius</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>16</th>\n",
|
||||
" <td>concavity error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>28</th>\n",
|
||||
" <td>worst symmetry</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>26</th>\n",
|
||||
" <td>worst concavity</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>25</th>\n",
|
||||
" <td>worst compactness</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>24</th>\n",
|
||||
" <td>worst smoothness</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>21</th>\n",
|
||||
" <td>worst texture</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>20</th>\n",
|
||||
" <td>worst radius</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>19</th>\n",
|
||||
" <td>fractal dimension error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>18</th>\n",
|
||||
" <td>symmetry error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>17</th>\n",
|
||||
" <td>concave points error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>15</th>\n",
|
||||
" <td>compactness error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>mean texture</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>14</th>\n",
|
||||
" <td>smoothness error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>13</th>\n",
|
||||
" <td>area error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>texture error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10</th>\n",
|
||||
" <td>radius error</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9</th>\n",
|
||||
" <td>mean fractal dimension</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8</th>\n",
|
||||
" <td>mean symmetry</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>mean concave points</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>mean concavity</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>mean compactness</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>mean smoothness</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>mean area</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>mean perimeter</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>29</th>\n",
|
||||
" <td>worst fractal dimension</td>\n",
|
||||
" <td>0.000000e+00</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" feature auc_drop\n",
|
||||
"22 worst perimeter 8.359457e-05\n",
|
||||
"27 worst concave points 3.134796e-05\n",
|
||||
"23 worst area 1.110223e-16\n",
|
||||
"12 perimeter error 1.110223e-16\n",
|
||||
"0 mean radius 0.000000e+00\n",
|
||||
"16 concavity error 0.000000e+00\n",
|
||||
"28 worst symmetry 0.000000e+00\n",
|
||||
"26 worst concavity 0.000000e+00\n",
|
||||
"25 worst compactness 0.000000e+00\n",
|
||||
"24 worst smoothness 0.000000e+00\n",
|
||||
"21 worst texture 0.000000e+00\n",
|
||||
"20 worst radius 0.000000e+00\n",
|
||||
"19 fractal dimension error 0.000000e+00\n",
|
||||
"18 symmetry error 0.000000e+00\n",
|
||||
"17 concave points error 0.000000e+00\n",
|
||||
"15 compactness error 0.000000e+00\n",
|
||||
"1 mean texture 0.000000e+00\n",
|
||||
"14 smoothness error 0.000000e+00\n",
|
||||
"13 area error 0.000000e+00\n",
|
||||
"11 texture error 0.000000e+00\n",
|
||||
"10 radius error 0.000000e+00\n",
|
||||
"9 mean fractal dimension 0.000000e+00\n",
|
||||
"8 mean symmetry 0.000000e+00\n",
|
||||
"7 mean concave points 0.000000e+00\n",
|
||||
"6 mean concavity 0.000000e+00\n",
|
||||
"5 mean compactness 0.000000e+00\n",
|
||||
"4 mean smoothness 0.000000e+00\n",
|
||||
"3 mean area 0.000000e+00\n",
|
||||
"2 mean perimeter 0.000000e+00\n",
|
||||
"29 worst fractal dimension 0.000000e+00"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# we select features that have auc_drop > 0\n",
|
||||
"auc_drop"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"22 worst perimeter\n",
|
||||
"27 worst concave points\n",
|
||||
"23 worst area\n",
|
||||
"12 perimeter error\n",
|
||||
"Name: feature, dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 19,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"selected_features"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
884
4.5_Demo_Feature_Selection_Hybrid_method.ipynb
Normal file
@@ -0,0 +1,884 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"# import seaborn as sns\n",
|
||||
"# import matplotlib.pyplot as plt\n",
|
||||
"import os\n",
|
||||
"from sklearn.model_selection import train_test_split\n",
|
||||
"from sklearn.feature_selection import SelectFromModel\n",
|
||||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||||
"# plt.style.use('seaborn-colorblind')\n",
|
||||
"# %matplotlib inline\n",
|
||||
"from sklearn.feature_selection import RFE\n",
|
||||
"from feature_selection import hybrid\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Dataset"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from sklearn.datasets import load_breast_cancer\n",
|
||||
"data = load_breast_cancer()\n",
|
||||
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
|
||||
" columns= np.append(data['feature_names'], ['target']))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>mean radius</th>\n",
|
||||
" <th>mean texture</th>\n",
|
||||
" <th>mean perimeter</th>\n",
|
||||
" <th>mean area</th>\n",
|
||||
" <th>mean smoothness</th>\n",
|
||||
" <th>mean compactness</th>\n",
|
||||
" <th>mean concavity</th>\n",
|
||||
" <th>mean concave points</th>\n",
|
||||
" <th>mean symmetry</th>\n",
|
||||
" <th>mean fractal dimension</th>\n",
|
||||
" <th>...</th>\n",
|
||||
" <th>worst texture</th>\n",
|
||||
" <th>worst perimeter</th>\n",
|
||||
" <th>worst area</th>\n",
|
||||
" <th>worst smoothness</th>\n",
|
||||
" <th>worst compactness</th>\n",
|
||||
" <th>worst concavity</th>\n",
|
||||
" <th>worst concave points</th>\n",
|
||||
" <th>worst symmetry</th>\n",
|
||||
" <th>worst fractal dimension</th>\n",
|
||||
" <th>target</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>17.99</td>\n",
|
||||
" <td>10.38</td>\n",
|
||||
" <td>122.80</td>\n",
|
||||
" <td>1001.0</td>\n",
|
||||
" <td>0.11840</td>\n",
|
||||
" <td>0.27760</td>\n",
|
||||
" <td>0.3001</td>\n",
|
||||
" <td>0.14710</td>\n",
|
||||
" <td>0.2419</td>\n",
|
||||
" <td>0.07871</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>17.33</td>\n",
|
||||
" <td>184.60</td>\n",
|
||||
" <td>2019.0</td>\n",
|
||||
" <td>0.1622</td>\n",
|
||||
" <td>0.6656</td>\n",
|
||||
" <td>0.7119</td>\n",
|
||||
" <td>0.2654</td>\n",
|
||||
" <td>0.4601</td>\n",
|
||||
" <td>0.11890</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>20.57</td>\n",
|
||||
" <td>17.77</td>\n",
|
||||
" <td>132.90</td>\n",
|
||||
" <td>1326.0</td>\n",
|
||||
" <td>0.08474</td>\n",
|
||||
" <td>0.07864</td>\n",
|
||||
" <td>0.0869</td>\n",
|
||||
" <td>0.07017</td>\n",
|
||||
" <td>0.1812</td>\n",
|
||||
" <td>0.05667</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>23.41</td>\n",
|
||||
" <td>158.80</td>\n",
|
||||
" <td>1956.0</td>\n",
|
||||
" <td>0.1238</td>\n",
|
||||
" <td>0.1866</td>\n",
|
||||
" <td>0.2416</td>\n",
|
||||
" <td>0.1860</td>\n",
|
||||
" <td>0.2750</td>\n",
|
||||
" <td>0.08902</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>19.69</td>\n",
|
||||
" <td>21.25</td>\n",
|
||||
" <td>130.00</td>\n",
|
||||
" <td>1203.0</td>\n",
|
||||
" <td>0.10960</td>\n",
|
||||
" <td>0.15990</td>\n",
|
||||
" <td>0.1974</td>\n",
|
||||
" <td>0.12790</td>\n",
|
||||
" <td>0.2069</td>\n",
|
||||
" <td>0.05999</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>25.53</td>\n",
|
||||
" <td>152.50</td>\n",
|
||||
" <td>1709.0</td>\n",
|
||||
" <td>0.1444</td>\n",
|
||||
" <td>0.4245</td>\n",
|
||||
" <td>0.4504</td>\n",
|
||||
" <td>0.2430</td>\n",
|
||||
" <td>0.3613</td>\n",
|
||||
" <td>0.08758</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>11.42</td>\n",
|
||||
" <td>20.38</td>\n",
|
||||
" <td>77.58</td>\n",
|
||||
" <td>386.1</td>\n",
|
||||
" <td>0.14250</td>\n",
|
||||
" <td>0.28390</td>\n",
|
||||
" <td>0.2414</td>\n",
|
||||
" <td>0.10520</td>\n",
|
||||
" <td>0.2597</td>\n",
|
||||
" <td>0.09744</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>26.50</td>\n",
|
||||
" <td>98.87</td>\n",
|
||||
" <td>567.7</td>\n",
|
||||
" <td>0.2098</td>\n",
|
||||
" <td>0.8663</td>\n",
|
||||
" <td>0.6869</td>\n",
|
||||
" <td>0.2575</td>\n",
|
||||
" <td>0.6638</td>\n",
|
||||
" <td>0.17300</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>20.29</td>\n",
|
||||
" <td>14.34</td>\n",
|
||||
" <td>135.10</td>\n",
|
||||
" <td>1297.0</td>\n",
|
||||
" <td>0.10030</td>\n",
|
||||
" <td>0.13280</td>\n",
|
||||
" <td>0.1980</td>\n",
|
||||
" <td>0.10430</td>\n",
|
||||
" <td>0.1809</td>\n",
|
||||
" <td>0.05883</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>16.67</td>\n",
|
||||
" <td>152.20</td>\n",
|
||||
" <td>1575.0</td>\n",
|
||||
" <td>0.1374</td>\n",
|
||||
" <td>0.2050</td>\n",
|
||||
" <td>0.4000</td>\n",
|
||||
" <td>0.1625</td>\n",
|
||||
" <td>0.2364</td>\n",
|
||||
" <td>0.07678</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>5 rows × 31 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
|
||||
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
|
||||
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
|
||||
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
|
||||
"3 11.42 20.38 77.58 386.1 0.14250 \n",
|
||||
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
|
||||
"\n",
|
||||
" mean compactness mean concavity mean concave points mean symmetry \\\n",
|
||||
"0 0.27760 0.3001 0.14710 0.2419 \n",
|
||||
"1 0.07864 0.0869 0.07017 0.1812 \n",
|
||||
"2 0.15990 0.1974 0.12790 0.2069 \n",
|
||||
"3 0.28390 0.2414 0.10520 0.2597 \n",
|
||||
"4 0.13280 0.1980 0.10430 0.1809 \n",
|
||||
"\n",
|
||||
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
|
||||
"0 0.07871 ... 17.33 184.60 2019.0 \n",
|
||||
"1 0.05667 ... 23.41 158.80 1956.0 \n",
|
||||
"2 0.05999 ... 25.53 152.50 1709.0 \n",
|
||||
"3 0.09744 ... 26.50 98.87 567.7 \n",
|
||||
"4 0.05883 ... 16.67 152.20 1575.0 \n",
|
||||
"\n",
|
||||
" worst smoothness worst compactness worst concavity worst concave points \\\n",
|
||||
"0 0.1622 0.6656 0.7119 0.2654 \n",
|
||||
"1 0.1238 0.1866 0.2416 0.1860 \n",
|
||||
"2 0.1444 0.4245 0.4504 0.2430 \n",
|
||||
"3 0.2098 0.8663 0.6869 0.2575 \n",
|
||||
"4 0.1374 0.2050 0.4000 0.1625 \n",
|
||||
"\n",
|
||||
" worst symmetry worst fractal dimension target \n",
|
||||
"0 0.4601 0.11890 0.0 \n",
|
||||
"1 0.2750 0.08902 0.0 \n",
|
||||
"2 0.3613 0.08758 0.0 \n",
|
||||
"3 0.6638 0.17300 0.0 \n",
|
||||
"4 0.2364 0.07678 0.0 \n",
|
||||
"\n",
|
||||
"[5 rows x 31 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data.head(5)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"((455, 30), (114, 30))"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
|
||||
" data.target, test_size=0.2,\n",
|
||||
" random_state=0)\n",
|
||||
"X_train.shape, X_test.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Recursive Feature Elimination \n",
|
||||
"### with Random Forests Importance\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Example 1\n",
|
||||
"This method is slightly **different from the guide**, as it use a different stopping criterion: the desired number of features to select is eventually reached."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"RFE(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',\n",
|
||||
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
|
||||
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
|
||||
" min_samples_leaf=1, min_samples_split=2,\n",
|
||||
" min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=None,\n",
|
||||
" oob_score=False, random_state=None, verbose=0,\n",
|
||||
" warm_start=False),\n",
|
||||
" n_features_to_select=10, step=1, verbose=0)"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# n_features_to_select decide the stopping criterion\n",
|
||||
"# we stop till 10 features remaining\n",
|
||||
"\n",
|
||||
"sel_ = RFE(RandomForestClassifier(n_estimators=20), n_features_to_select=10)\n",
|
||||
"sel_.fit(X_train.fillna(0), y_train)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Index(['mean texture', 'mean perimeter', 'mean area', 'mean concavity',\n",
|
||||
" 'mean concave points', 'worst radius', 'worst perimeter', 'worst area',\n",
|
||||
" 'worst concave points', 'worst symmetry'],\n",
|
||||
" dtype='object')\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"selected_feat = X_train.columns[(sel_.get_support())]\n",
|
||||
"print(selected_feat)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"collapsed": true
|
||||
},
|
||||
"source": [
|
||||
"### Example 2\n",
|
||||
"recursive feature elimination with RandomForest\n",
|
||||
"with the method same as the guide\n",
|
||||
"1. Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge, or the linear / logistic regression coefficients.\n",
|
||||
"2. Remove one feature -the least important- and build a machine learning algorithm utilizing the remaining features.\n",
|
||||
"3. Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.\n",
|
||||
"4. If the metric decreases by more of an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.\n",
|
||||
"5. Repeat steps 2-4 until all features have been removed (and therefore evaluated) and the drop in performance assessed.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"testing feature: mean radius which is feature 1 out of 30\n",
|
||||
"New Test ROC AUC=0.9941251190854239\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0026992696093999236\n",
|
||||
"keep: mean radius\n",
|
||||
"\n",
|
||||
"testing feature: mean texture which is feature 2 out of 30\n",
|
||||
"New Test ROC AUC=0.9936487773896475\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0031756113051762958\n",
|
||||
"keep: mean texture\n",
|
||||
"\n",
|
||||
"testing feature: mean perimeter which is feature 3 out of 30\n",
|
||||
"New Test ROC AUC=0.9968243886948238\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0\n",
|
||||
"remove: mean perimeter\n",
|
||||
"\n",
|
||||
"testing feature: mean area which is feature 4 out of 30\n",
|
||||
"New Test ROC AUC=0.9960304858685297\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0007939028262941017\n",
|
||||
"remove: mean area\n",
|
||||
"\n",
|
||||
"testing feature: mean smoothness which is feature 5 out of 30\n",
|
||||
"New Test ROC AUC=0.9965068275643061\n",
|
||||
"All features Test ROC AUC=0.9960304858685297\n",
|
||||
"Drop in ROC AUC=-0.0004763416957763722\n",
|
||||
"remove: mean smoothness\n",
|
||||
"\n",
|
||||
"testing feature: mean compactness which is feature 6 out of 30\n",
|
||||
"New Test ROC AUC=0.9942838996506828\n",
|
||||
"All features Test ROC AUC=0.9965068275643061\n",
|
||||
"Drop in ROC AUC=0.0022229279136233293\n",
|
||||
"keep: mean compactness\n",
|
||||
"\n",
|
||||
"testing feature: mean concavity which is feature 7 out of 30\n",
|
||||
"New Test ROC AUC=0.9957129247380121\n",
|
||||
"All features Test ROC AUC=0.9965068275643061\n",
|
||||
"Drop in ROC AUC=0.0007939028262939907\n",
|
||||
"remove: mean concavity\n",
|
||||
"\n",
|
||||
"testing feature: mean concave points which is feature 8 out of 30\n",
|
||||
"New Test ROC AUC=0.9976182915211178\n",
|
||||
"All features Test ROC AUC=0.9957129247380121\n",
|
||||
"Drop in ROC AUC=-0.0019053667831057108\n",
|
||||
"remove: mean concave points\n",
|
||||
"\n",
|
||||
"testing feature: mean symmetry which is feature 9 out of 30\n",
|
||||
"New Test ROC AUC=0.9953953636074945\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0022229279136233293\n",
|
||||
"keep: mean symmetry\n",
|
||||
"\n",
|
||||
"testing feature: mean fractal dimension which is feature 10 out of 30\n",
|
||||
"New Test ROC AUC=0.9949190219117181\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0026992696093997015\n",
|
||||
"keep: mean fractal dimension\n",
|
||||
"\n",
|
||||
"testing feature: radius error which is feature 11 out of 30\n",
|
||||
"New Test ROC AUC=0.9952365830422356\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.002381708478882194\n",
|
||||
"keep: radius error\n",
|
||||
"\n",
|
||||
"testing feature: texture error which is feature 12 out of 30\n",
|
||||
"New Test ROC AUC=0.9952365830422356\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.002381708478882194\n",
|
||||
"keep: texture error\n",
|
||||
"\n",
|
||||
"testing feature: perimeter error which is feature 13 out of 30\n",
|
||||
"New Test ROC AUC=0.9939663385201651\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.003651953000952668\n",
|
||||
"keep: perimeter error\n",
|
||||
"\n",
|
||||
"testing feature: area error which is feature 14 out of 30\n",
|
||||
"New Test ROC AUC=0.994919021911718\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0026992696093998125\n",
|
||||
"keep: area error\n",
|
||||
"\n",
|
||||
"testing feature: smoothness error which is feature 15 out of 30\n",
|
||||
"New Test ROC AUC=0.995871705303271\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.001746586217846846\n",
|
||||
"keep: smoothness error\n",
|
||||
"\n",
|
||||
"testing feature: compactness error which is feature 16 out of 30\n",
|
||||
"New Test ROC AUC=0.9958717053032708\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0017465862178469571\n",
|
||||
"keep: compactness error\n",
|
||||
"\n",
|
||||
"testing feature: concavity error which is feature 17 out of 30\n",
|
||||
"New Test ROC AUC=0.9961892664337886\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0014290250873292276\n",
|
||||
"keep: concavity error\n",
|
||||
"\n",
|
||||
"testing feature: concave points error which is feature 18 out of 30\n",
|
||||
"New Test ROC AUC=0.9961892664337885\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0014290250873293386\n",
|
||||
"keep: concave points error\n",
|
||||
"\n",
|
||||
"testing feature: symmetry error which is feature 19 out of 30\n",
|
||||
"New Test ROC AUC=0.9968243886948238\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.0007939028262939907\n",
|
||||
"remove: symmetry error\n",
|
||||
"\n",
|
||||
"testing feature: fractal dimension error which is feature 20 out of 30\n",
|
||||
"New Test ROC AUC=0.9946014607812005\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0022229279136233293\n",
|
||||
"keep: fractal dimension error\n",
|
||||
"\n",
|
||||
"testing feature: worst radius which is feature 21 out of 30\n",
|
||||
"New Test ROC AUC=0.9955541441727532\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.001270244522070585\n",
|
||||
"keep: worst radius\n",
|
||||
"\n",
|
||||
"testing feature: worst texture which is feature 22 out of 30\n",
|
||||
"New Test ROC AUC=0.9958717053032708\n",
|
||||
"All features Test ROC AUC=0.9968243886948238\n",
|
||||
"Drop in ROC AUC=0.0009526833915529664\n",
|
||||
"remove: worst texture\n",
|
||||
"\n",
|
||||
"testing feature: worst perimeter which is feature 23 out of 30\n",
|
||||
"New Test ROC AUC=0.995871705303271\n",
|
||||
"All features Test ROC AUC=0.9958717053032708\n",
|
||||
"Drop in ROC AUC=-1.1102230246251565e-16\n",
|
||||
"remove: worst perimeter\n",
|
||||
"\n",
|
||||
"testing feature: worst area which is feature 24 out of 30\n",
|
||||
"New Test ROC AUC=0.9938075579549063\n",
|
||||
"All features Test ROC AUC=0.995871705303271\n",
|
||||
"Drop in ROC AUC=0.0020641473483646866\n",
|
||||
"keep: worst area\n",
|
||||
"\n",
|
||||
"testing feature: worst smoothness which is feature 25 out of 30\n",
|
||||
"New Test ROC AUC=0.9939663385201651\n",
|
||||
"All features Test ROC AUC=0.995871705303271\n",
|
||||
"Drop in ROC AUC=0.0019053667831058219\n",
|
||||
"keep: worst smoothness\n",
|
||||
"\n",
|
||||
"testing feature: worst compactness which is feature 26 out of 30\n",
|
||||
"New Test ROC AUC=0.9960304858685296\n",
|
||||
"All features Test ROC AUC=0.995871705303271\n",
|
||||
"Drop in ROC AUC=-0.0001587805652586427\n",
|
||||
"remove: worst compactness\n",
|
||||
"\n",
|
||||
"testing feature: worst concavity which is feature 27 out of 30\n",
|
||||
"New Test ROC AUC=0.9966656081295648\n",
|
||||
"All features Test ROC AUC=0.9960304858685296\n",
|
||||
"Drop in ROC AUC=-0.0006351222610352369\n",
|
||||
"remove: worst concavity\n",
|
||||
"\n",
|
||||
"testing feature: worst concave points which is feature 28 out of 30\n",
|
||||
"New Test ROC AUC=0.9936487773896475\n",
|
||||
"All features Test ROC AUC=0.9966656081295648\n",
|
||||
"Drop in ROC AUC=0.00301683073991732\n",
|
||||
"keep: worst concave points\n",
|
||||
"\n",
|
||||
"testing feature: worst symmetry which is feature 29 out of 30\n",
|
||||
"New Test ROC AUC=0.9976182915211178\n",
|
||||
"All features Test ROC AUC=0.9966656081295648\n",
|
||||
"Drop in ROC AUC=-0.0009526833915529664\n",
|
||||
"remove: worst symmetry\n",
|
||||
"\n",
|
||||
"testing feature: worst fractal dimension which is feature 30 out of 30\n",
|
||||
"New Test ROC AUC=0.9973007303906002\n",
|
||||
"All features Test ROC AUC=0.9976182915211178\n",
|
||||
"Drop in ROC AUC=0.00031756113051761847\n",
|
||||
"remove: worst fractal dimension\n",
|
||||
"DONE!!\n",
|
||||
"total features to remove: 12\n",
|
||||
"total features to keep: 18\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# tol decide whether we should drop or keep the feature in current round\n",
|
||||
"features_to_keep = hybrid.recursive_feature_elimination_rf(X_train=X_train,\n",
|
||||
" y_train=y_train,\n",
|
||||
" X_test=X_test,\n",
|
||||
" y_test=y_test,\n",
|
||||
" tol=0.001)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['mean radius',\n",
|
||||
" 'mean texture',\n",
|
||||
" 'mean compactness',\n",
|
||||
" 'mean symmetry',\n",
|
||||
" 'mean fractal dimension',\n",
|
||||
" 'radius error',\n",
|
||||
" 'texture error',\n",
|
||||
" 'perimeter error',\n",
|
||||
" 'area error',\n",
|
||||
" 'smoothness error',\n",
|
||||
" 'compactness error',\n",
|
||||
" 'concavity error',\n",
|
||||
" 'concave points error',\n",
|
||||
" 'fractal dimension error',\n",
|
||||
" 'worst radius',\n",
|
||||
" 'worst area',\n",
|
||||
" 'worst smoothness',\n",
|
||||
" 'worst concave points']"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"features_to_keep"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Recursive Feature Addition\n",
|
||||
"### with Random Forests Importance"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Example 1\n",
|
||||
"recursive feature addition with RandomForest\n",
|
||||
"with the method same as the guide\n",
|
||||
"1. Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge, or the linear / logistic regression coefficients.\n",
|
||||
"2. Build a machine learning model with only 1 feature, the most important one, and calculate the model metric for performance.\n",
|
||||
"3. Add one feature -the most important- and build a machine learning algorithm utilizing the added and any feature from previous rounds.\n",
|
||||
"4. Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.\n",
|
||||
"5. If the metric increases by more than an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.\n",
|
||||
"6. Repeat steps 2-5 until all features have been removed (and therefore evaluated) and the drop in performance assessed.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"\n",
|
||||
"testing feature: mean texture which is feature 1 out of 30\n",
|
||||
"New Test ROC AUC=0.9558590028580501\n",
|
||||
"All features Test ROC AUC=0.9009209272785013\n",
|
||||
"Increase in ROC AUC=0.054938075579548884\n",
|
||||
"keep: mean texture\n",
|
||||
"\n",
|
||||
"testing feature: mean perimeter which is feature 2 out of 30\n",
|
||||
"New Test ROC AUC=0.9609399809463322\n",
|
||||
"All features Test ROC AUC=0.9558590028580501\n",
|
||||
"Increase in ROC AUC=0.005080978088282007\n",
|
||||
"keep: mean perimeter\n",
|
||||
"\n",
|
||||
"testing feature: mean area which is feature 3 out of 30\n",
|
||||
"New Test ROC AUC=0.9609399809463322\n",
|
||||
"All features Test ROC AUC=0.9609399809463322\n",
|
||||
"Increase in ROC AUC=0.0\n",
|
||||
"remove: mean area\n",
|
||||
"\n",
|
||||
"testing feature: mean smoothness which is feature 4 out of 30\n",
|
||||
"New Test ROC AUC=0.9684026675134964\n",
|
||||
"All features Test ROC AUC=0.9609399809463322\n",
|
||||
"Increase in ROC AUC=0.007462686567164201\n",
|
||||
"keep: mean smoothness\n",
|
||||
"\n",
|
||||
"testing feature: mean compactness which is feature 5 out of 30\n",
|
||||
"New Test ROC AUC=0.9750714512543665\n",
|
||||
"All features Test ROC AUC=0.9684026675134964\n",
|
||||
"Increase in ROC AUC=0.006668783740870099\n",
|
||||
"keep: mean compactness\n",
|
||||
"\n",
|
||||
"testing feature: mean concavity which is feature 6 out of 30\n",
|
||||
"New Test ROC AUC=0.9933312162591298\n",
|
||||
"All features Test ROC AUC=0.9750714512543665\n",
|
||||
"Increase in ROC AUC=0.01825976500476334\n",
|
||||
"keep: mean concavity\n",
|
||||
"\n",
|
||||
"testing feature: mean concave points which is feature 7 out of 30\n",
|
||||
"New Test ROC AUC=0.9925373134328358\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0007939028262939907\n",
|
||||
"remove: mean concave points\n",
|
||||
"\n",
|
||||
"testing feature: mean symmetry which is feature 8 out of 30\n",
|
||||
"New Test ROC AUC=0.9895204826929185\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0038107335662113107\n",
|
||||
"remove: mean symmetry\n",
|
||||
"\n",
|
||||
"testing feature: mean fractal dimension which is feature 9 out of 30\n",
|
||||
"New Test ROC AUC=0.9892029215624007\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.00412829469672904\n",
|
||||
"remove: mean fractal dimension\n",
|
||||
"\n",
|
||||
"testing feature: radius error which is feature 10 out of 30\n",
|
||||
"New Test ROC AUC=0.9895204826929184\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0038107335662114217\n",
|
||||
"remove: radius error\n",
|
||||
"\n",
|
||||
"testing feature: texture error which is feature 11 out of 30\n",
|
||||
"New Test ROC AUC=0.9868212130835186\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.006510003175611234\n",
|
||||
"remove: texture error\n",
|
||||
"\n",
|
||||
"testing feature: perimeter error which is feature 12 out of 30\n",
|
||||
"New Test ROC AUC=0.9890441409971419\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.004287075261987905\n",
|
||||
"remove: perimeter error\n",
|
||||
"\n",
|
||||
"testing feature: area error which is feature 13 out of 30\n",
|
||||
"New Test ROC AUC=0.989044140997142\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.004287075261987794\n",
|
||||
"remove: area error\n",
|
||||
"\n",
|
||||
"testing feature: smoothness error which is feature 14 out of 30\n",
|
||||
"New Test ROC AUC=0.988091457605589\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.00523975865354076\n",
|
||||
"remove: smoothness error\n",
|
||||
"\n",
|
||||
"testing feature: compactness error which is feature 15 out of 30\n",
|
||||
"New Test ROC AUC=0.9895204826929184\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0038107335662114217\n",
|
||||
"remove: compactness error\n",
|
||||
"\n",
|
||||
"testing feature: concavity error which is feature 16 out of 30\n",
|
||||
"New Test ROC AUC=0.9911082883455065\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0022229279136233293\n",
|
||||
"remove: concavity error\n",
|
||||
"\n",
|
||||
"testing feature: concave points error which is feature 17 out of 30\n",
|
||||
"New Test ROC AUC=0.9906319466497301\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0026992696093997015\n",
|
||||
"remove: concave points error\n",
|
||||
"\n",
|
||||
"testing feature: symmetry error which is feature 18 out of 30\n",
|
||||
"New Test ROC AUC=0.9876151159098127\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0057161003493171325\n",
|
||||
"remove: symmetry error\n",
|
||||
"\n",
|
||||
"testing feature: fractal dimension error which is feature 19 out of 30\n",
|
||||
"New Test ROC AUC=0.9896792632581772\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.003651953000952557\n",
|
||||
"remove: fractal dimension error\n",
|
||||
"\n",
|
||||
"testing feature: worst radius which is feature 20 out of 30\n",
|
||||
"New Test ROC AUC=0.994125119085424\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=0.0007939028262942127\n",
|
||||
"remove: worst radius\n",
|
||||
"\n",
|
||||
"testing feature: worst texture which is feature 21 out of 30\n",
|
||||
"New Test ROC AUC=0.9906319466497301\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0026992696093997015\n",
|
||||
"remove: worst texture\n",
|
||||
"\n",
|
||||
"testing feature: worst perimeter which is feature 22 out of 30\n",
|
||||
"New Test ROC AUC=0.9933312162591299\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=1.1102230246251565e-16\n",
|
||||
"remove: worst perimeter\n",
|
||||
"\n",
|
||||
"testing feature: worst area which is feature 23 out of 30\n",
|
||||
"New Test ROC AUC=0.9931724356938711\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0001587805652586427\n",
|
||||
"remove: worst area\n",
|
||||
"\n",
|
||||
"testing feature: worst smoothness which is feature 24 out of 30\n",
|
||||
"New Test ROC AUC=0.9933312162591299\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=1.1102230246251565e-16\n",
|
||||
"remove: worst smoothness\n",
|
||||
"\n",
|
||||
"testing feature: worst compactness which is feature 25 out of 30\n",
|
||||
"New Test ROC AUC=0.9895204826929184\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=-0.0038107335662114217\n",
|
||||
"remove: worst compactness\n",
|
||||
"\n",
|
||||
"testing feature: worst concavity which is feature 26 out of 30\n",
|
||||
"New Test ROC AUC=0.9938075579549063\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=0.0004763416957764832\n",
|
||||
"remove: worst concavity\n",
|
||||
"\n",
|
||||
"testing feature: worst concave points which is feature 27 out of 30\n",
|
||||
"New Test ROC AUC=0.9971419498253413\n",
|
||||
"All features Test ROC AUC=0.9933312162591298\n",
|
||||
"Increase in ROC AUC=0.0038107335662115327\n",
|
||||
"keep: worst concave points\n",
|
||||
"\n",
|
||||
"testing feature: worst symmetry which is feature 28 out of 30\n",
|
||||
"New Test ROC AUC=0.9957129247380121\n",
|
||||
"All features Test ROC AUC=0.9971419498253413\n",
|
||||
"Increase in ROC AUC=-0.0014290250873292276\n",
|
||||
"remove: worst symmetry\n",
|
||||
"\n",
|
||||
"testing feature: worst fractal dimension which is feature 29 out of 30\n",
|
||||
"New Test ROC AUC=0.9950778024769769\n",
|
||||
"All features Test ROC AUC=0.9971419498253413\n",
|
||||
"Increase in ROC AUC=-0.0020641473483644646\n",
|
||||
"remove: worst fractal dimension\n",
|
||||
"DONE!!\n",
|
||||
"total features to keep: 7\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"features_to_keep = hybrid.recursive_feature_addition_rf(X_train=X_train,\n",
|
||||
" y_train=y_train,\n",
|
||||
" X_test=X_test,\n",
|
||||
" y_test=y_test,\n",
|
||||
" tol=0.001)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['mean radius',\n",
|
||||
" 'mean texture',\n",
|
||||
" 'mean perimeter',\n",
|
||||
" 'mean smoothness',\n",
|
||||
" 'mean compactness',\n",
|
||||
" 'mean concavity',\n",
|
||||
" 'worst concave points']"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"features_to_keep"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.6.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
824
A Short Guide for Feature Engineering and Feature Selection.md
Normal file
@@ -0,0 +1,824 @@
|
||||
**Table of Contents**:
|
||||
|
||||
[TOC]
|
||||
|
||||
# A Short Guide for Feature Engineering and Feature Selection
|
||||
|
||||
Feature engineering and selection is the art/science of transforming data into its most useful form, which involves an elegant blend of domain expertise, intuition and mathematics. This guide is a concise reference for beginners, covering the most simple yet widely used techniques for feature engineering and selection. Any comments and contributions are most welcome.
|
||||
|
||||
## 0. Basic Concepts
|
||||
|
||||
### 0.1 What is Machine Learning
|
||||
|
||||
> Machine Learning is the science of getting computers to act without being explicitly programmed - [Arthur Samuel](https://simple.wikipedia.org/wiki/Machine_learning)
|
||||
|
||||
> Machine Learning is a technique of data science that helps computers learn from existing data in order to forecast future behaviors, outcomes and trends - [Microsoft](https://docs.microsoft.com/en-us/azure/machine-learning/service/overview-what-is-azure-ml)
|
||||
|
||||
> The field of Machine Learning seeks to answer the question “How can we build computer systems that automatically improve with experience, and what are the fundamental laws that govern all learning processes?“ - [Carnegie Mellon University](http://www.cs.cmu.edu/~tom/pubs/MachineLearning.pdf)
|
||||
|
||||
Narrowly speaking, in the data-mining context, machine learning (ML) is the process of letting computers learn from historical data, recognize patterns/relationships within the data, and then make predictions.
|
||||
|
||||
|
||||
|
||||
### 0.2 Methodology
|
||||
|
||||
A typical ML workflow/pipeline looks like this:
|
||||
|
||||
|
||||

|
||||
|
||||
|
||||
Source: Practical Machine Learning with Python, Springer
|
||||
|
||||
|
||||
There can be many ways to divide the tasks that make up the ML workflow into phases, but generally the basic steps are similar to those in the graph above.
|
||||
|
||||
|
||||
|
||||
### 0.3 Typical Tasks
|
||||
|
||||
| Task | Definition | Example |
|
||||
| ----------------- | --------------------------------------------- | ------------------------------------ |
|
||||
| Classification | predict what category a new instance belongs to | is the tumor malignant/benign? |
|
||||
| Regression | predict a continuous numeric value | predict house/stock prices in future |
|
||||
| Anomaly Detection | identify outliers | fraud detection |
|
||||
| Clustering | separate similar data points into groups | customer segmentation |
|
||||
|
||||
|
||||
|
||||
### 0.4 Terminology
|
||||
|
||||
- **Feature**: also known as Attribute/ Independent Variable/ Predictor/ Input Variable. It's an individual measurable property/characteristic of a phenomenon being observed [[wiki]](https://en.wikipedia.org/wiki/Feature_(machine_learning)). The age of a person, etc.
|
||||
- **Target**: also known as Dependent Variable/ Response Variable/ Output Variable. It's the variable being predicted in supervised learning.
|
||||
- **Algorithm**: the specific procedure used to implement a particular ML technique. Linear Regression, etc.
|
||||
- **Model**: the algorithm applied to a dataset, complete with its settings (its parameters). Y=4.5x+0.8, etc. We want the model that best captures the relationship between features and the target.
|
||||
- **Supervised learning**: train the model with labeled data to generate reasonable predictions for the response to new data.
|
||||
- **Unsupervised learning**: train the model with unlabeled data to find intrinsic structures/patterns within the data.
|
||||
- **Reinforcement learning**: the model is learned from a series of actions by maximizing a reward function, which can either be maximized by penalizing bad actions and/or rewarding good actions. Self-driving, etc.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 1. Data Exploration
|
||||
|
||||
### 1.1 Variables
|
||||
|
||||
**Definition**: any measurable property/characteristic of a phenomenon being observed. They are called 'variables' because the value they take may vary (and it usually does) in a population.
|
||||
|
||||
**Types of Variable**
|
||||
|
||||
| Type | Sub-type | Definition | Example |
|
||||
| ----------- | ---------- | ------------------------------------------------------------ | ------------------------------ |
|
||||
| Categorical | Nominal | Variables with values selected from a group of categories, while not having any kind of natural order. [ref](http://www-ist.massey.ac.nz/dstirlin/CAST/CAST/Hstructures/structures_c2.html) | Gender, car types |
|
||||
| | Ordinal | A categorical variable whose categories can be meaningfully ordered. [ref](http://www-ist.massey.ac.nz/dstirlin/CAST/CAST/Hstructures/structures_c2.html) | Grade of an exam |
|
||||
| Numerical | Discrete | Variables whose values are either finite or countably infinite. [wiki](https://en.wikipedia.org/wiki/Continuous_or_discrete_variable) | Number of children in a family |
|
||||
| | Continuous | Variable which can take on infinitely many, uncountable values. [wiki](https://en.wikipedia.org/wiki/Continuous_or_discrete_variable) | House prices, time passed |
|
||||
|
||||
|
||||
|
||||
### 1.2 Variable Identification
|
||||
|
||||
**Definition**: Identify the data types of each variable.
|
||||
|
||||
**Note**: In reality we may have variables of mixed type for a variety of reasons. For example, in credit scoring "Missed payment status" is a common variable that can take the values 1, 2, 3, meaning that the customer has missed 1-3 payments in their account. It can also take the value D if the customer defaulted on that account. We may have to convert data types after certain steps of data cleaning, as in the sketch below.
|
||||
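A minimal pandas sketch of this kind of type inspection and conversion; the column names and values below are made up purely for illustration:

```python
import pandas as pd

# made-up example: a mixed-type "missed payment status" column (1-3 missed payments, or 'D' for default)
df = pd.DataFrame({'age': [25, 40, 31],
                   'missed_payment_status': ['1', '3', 'D']})

print(df.dtypes)   # missed_payment_status is stored as object (string) because of the 'D'

# one possible cleaning step: flag defaults separately, then convert the rest to numeric
df['defaulted'] = (df['missed_payment_status'] == 'D').astype(int)
df['missed_payments'] = pd.to_numeric(df['missed_payment_status'], errors='coerce')

print(df.dtypes)   # missed_payments is now numeric, with NaN where the value was 'D'
```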
|
||||
|
||||
|
||||
### 1.3 Univariate Analysis
|
||||
|
||||
Descriptive statistics on one single variable.
|
||||
|
||||
| Variable | What to look |
|
||||
| ----------- | ------------------------------------------------------------ |
|
||||
| Categorical | **Shape**:<br />Histogram/ Frequency table... |
|
||||
| Numerical | **Central Tendency**:<br />Mean/ Median/ Mode<br />**Dispersion**:<br />Min/ Max/ Range/ Quantile/ IQR/ MAD/ Variance/ Standard Deviation/ <br />**Shape**:<br />Skewness/ Histogram/ Boxplot... |
|
||||
|
||||
Below are some methods that can give us the basic stats on a variable (a minimal sketch follows the list):
|
||||
|
||||
- pandas.DataFrame.describe()
|
||||
- pandas.DataFrame.dtypes
|
||||
- Barplot
|
||||
- Countplot
|
||||
- Boxplot
|
||||
- Distplot
|
||||
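A minimal sketch of these univariate checks, reusing the titanic dataset from the demo notebooks (the plotting calls are left commented out, mirroring the notebooks):

```python
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'Age', 'Fare'])

print(data.dtypes)                      # variable types
print(data.describe())                  # central tendency and dispersion of numerical variables
print(data['Pclass'].value_counts())    # frequency table for a categorical variable

# shape of the distributions
# sns.distplot(data['Age'].dropna())    # histogram / density of a numerical variable
# sns.boxplot(y=data['Fare'])           # boxplot to inspect dispersion and outliers
```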
|
||||
|
||||
|
||||
### 1.4 Bi-variate Analysis
|
||||
|
||||
Descriptive statistics between two or more variables.
|
||||
|
||||
- Scatter Plot
|
||||
|
||||
- Correlation Plot
|
||||
- Heat Map
|
||||
|
||||
**Scatter Plot** is a type of plot or mathematical diagram using Cartesian coordinates to display values for typically two variables for a set of data. If the pattern of dots slopes from lower left to upper right, it indicates a positive correlation between the variables being studied. If the pattern of dots slopes from upper left to lower right, it indicates a negative correlation. [[wiki]](https://en.wikipedia.org/wiki/Scatter_plot)
|
||||
|
||||
**Correlation plot** can be used to quickly find insights. It is used to investigate the dependence between multiple variables at the same time and to highlight the most correlated variables in a data table.
|
||||
|
||||
**Heat map** (or heatmap) is a graphical representation of data where the individual values contained in a matrix are represented as colors.
|
||||
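A minimal sketch of these bi-variate views on the same titanic dataset (plotting calls again commented out, as in the notebooks):

```python
import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'Age', 'Fare', 'Survived'])

# correlation matrix between the numerical variables
corr = data.corr()
print(corr)

# scatter plot of two variables and heat map of the correlation matrix
# plt.scatter(data['Age'], data['Fare'])
# sns.heatmap(corr, annot=True, cmap='coolwarm')
# plt.show()
```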
|
||||
|
||||
|
||||
|
||||
|
||||
## 2. Feature Cleaning
|
||||
|
||||
### 2.1 Missing Values
|
||||
|
||||
**Definition**: no value is stored in a certain observation within a variable.
|
||||
|
||||
#### 2.1.1 Why Missing Data Matters
|
||||
|
||||
- certain algorithms cannot work when missing values are present
|
||||
- even for algorithms that handle missing data, without treatment the model can lead to inaccurate conclusions
|
||||
|
||||
A study on the impact of missing data on different ML algorithms can be found [here](http://core.ecu.edu/omgt/krosj/IMDSDataMining2003.pdf).
|
||||
|
||||
#### 2.1.2 Missing Mechanisms[^1]
|
||||
|
||||
It is important to understand the mechanisms by which missing fields are introduced in a dataset. Depending on the mechanism, we may choose to process the missing values differently. The mechanisms were first introduced by Rubin[^2].
|
||||
|
||||
**Missing Completely at Random**
|
||||
|
||||
A variable is missing completely at random (MCAR) if the probability of being missing is the same for all the observations. When data is MCAR, there is absolutely no relationship between the missing data and any other values, observed or missing, within the dataset. In other words, those missing data points are a random subset of the data. There is nothing systematic going on that makes some data more likely to be missing than others.
|
||||
|
||||
If values for observations are missing completely at random, then disregarding those cases would not bias the inferences made.
|
||||
|
||||
**Missing at Random**
|
||||
|
||||
Missing at Random (MAR) occurs when there is a systematic relationship between the propensity of missing values and the observed data. In other words, the probability of an observation being missing depends only on available information (other variables in the dataset), but not on the variable itself.
|
||||
|
||||
For example, if men are more likely to disclose their weight than women, weight is MAR (on variable gender). The weight information will be missing at random for those men and women that decided not to disclose their weight, but as men are more prone to disclose it, there will be more missing values for women than for men.
|
||||
|
||||
In a situation like the above, if we decide to proceed with the variable with missing values, we might benefit from including gender to control the bias in weight for the missing observations.
|
||||
|
||||
**Missing Not At Random - Depends on Unobserved Predictors**
|
||||
|
||||
Missingness depends on information that has not been recorded, and this information also predicts the missing values. E.g., if a particular treatment causes discomfort, a patient is more likely to drop out of the study (and 'discomfort' is not measured).
|
||||
|
||||
In this situation, data sample is biased if we drop those missing cases.
|
||||
|
||||
**Missing Not At Random - Depends on Missing Value Itself**
|
||||
|
||||
Missingness depends on the (potentially missing) variable itself. E.g., people with higher earnings are less likely to reveal them.
|
||||
|
||||
|
||||
|
||||
#### 2.1.3 How to Assume a Missing Mechanism
|
||||
|
||||
- By **business understanding**. In many situations we can assume the mechanism by probing into the business logic behind that variable.
|
||||
- By **statistical test**. Divide the dataset into observations with and without the missing value and perform a t-test to see if there are significant differences. If there are, we can assume that the data are not missing completely at random (a sketch follows below).
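A minimal sketch of such a test, assuming `df` is a pandas DataFrame where 'Age' has missing values and 'Fare' is fully observed (both names are illustrative):

```python
from scipy import stats

# split a fully observed column by whether 'Age' is missing
fare_when_missing = df[df['Age'].isnull()]['Fare']
fare_when_observed = df[df['Age'].notnull()]['Fare']

# two-sample t-test: a significant difference suggests the data are not MCAR
t_stat, p_value = stats.ttest_ind(fare_when_missing, fare_when_observed, equal_var=False)
print(t_stat, p_value)
```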
|
||||
|
||||
But we should keep in mind that we can hardly be 100% sure whether data are MCAR, MAR, or MNAR, because lurking predictors are, by definition, unobserved.
|
||||
|
||||
|
||||
|
||||
#### 2.1.4 How to Handle Missing Data
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------ | :----------------------------------------------------------- |
|
||||
| Listwise Deletion | excluding all cases (listwise) that have missing values | preserve distribution if MCAR | 1. may discard too much data and hurt the model<br>2. may yield biased estimates if not MCAR (as we keep a special subsample from the population) |
|
||||
| Mean/Median/Mode Imputation | replacing the NA by mean/median/most frequent values (for categorical feature) of that variable | good practice if MCAR | 1. distort distribution<br>2. distort relationship with other variables |
|
||||
| End of distribution Imputation | replacing the NA with values at the far end of the distribution of that variable, calculated as mean + 3*std | Captures the importance of missingness if there is one | 1. distort distribution<br />2. may be mistaken for an outlier if NAs are few, or mask true outliers if NAs are many<br />3. if missingness is not important, this may mask the predictive power of the original variable |
|
||||
| Random Imputation | replacing the NA by taking a random value from the pool of available observations of that variable | preserve distribution if MCAR | not recommended in business settings for its randomness (different result for same input) |
|
||||
| Arbitrary Value Imputation | replacing the NA with arbitrary values | Captures the importance of missingness if there is one | 1. distort distribution<br />2. typically used values: -9999/9999, but be aware they may be regarded as outliers |
|
||||
| Add a variable to denote NA | creating an additional variable indicating whether the data was missing for that observation | Captures the importance of missingness if there is one | expand feature space |
|
||||
|
||||
In real settings, when it's hard to determine the missing mechanism, or there is little time to study each missing variable in depth, a popular approach is to adopt:
|
||||
|
||||
- Mean/Median/Mode Imputation (depend on the distribution)
|
||||
- End of distribution Imputation
|
||||
- Add a variable to denote NA
|
||||
|
||||
simultaneously, so that we both capture the value of missingness and obtain a complete dataset.
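A minimal sketch of this combined approach, assuming `df` is a DataFrame and 'Age' is the variable with NAs (in practice the statistics should be computed on the training set only, see section 5):

```python
# indicator of missingness (see "Add a variable to denote NA")
df['Age_missing'] = df['Age'].isnull().astype(int)

# median imputation
df['Age_median'] = df['Age'].fillna(df['Age'].median())

# end of distribution imputation: mean + 3 * std
end_tail_value = df['Age'].mean() + 3 * df['Age'].std()
df['Age_end_tail'] = df['Age'].fillna(end_tail_value)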
|
||||
|
||||
**Note**: Some algorithms, such as XGBoost, incorporate missing data treatment into their model building process, so you may not need this step. However, it's important to make sure you understand how the algorithm treats missing values and to explain that to the business team.
|
||||
|
||||
|
||||
|
||||
### 2.2 Outliers
|
||||
|
||||
**Definition**: An outlier is an observation which deviates so much from the other observations as to arouse suspicions that it was generated by a different mechanism.[^3]
|
||||
|
||||
**Note**: Outliers, depending on the context, either deserve special attention or should be completely ignored. For example, an unusual transaction on a credit card is usually a sign of fraudulent activity, while a height of 1600cm for a person is very likely due to measurement error and should be filtered out or imputed with something else.
|
||||
|
||||
#### 2.2.1 Why Outlier Matters
|
||||
|
||||
The presence of outliers may:
|
||||
|
||||
- make algorithms not work properly
|
||||
- introduce noise into the dataset
|
||||
- make samples less representative
|
||||
|
||||
Some algorithms are very sensitive to outliers. For example, AdaBoost may treat outliers as "hard" cases and put tremendous weight on them, therefore producing a model with poor generalization. Any algorithm that relies on means/variances is sensitive to outliers, as those statistics are greatly influenced by extreme values.
|
||||
|
||||
On the other hand, some algorithms are more robust to outliers. For example, decision trees tend to ignore the presence of outliers when creating their branches. Typically, trees make splits by asking whether variable x >= value t, so an outlier simply falls on one side of the split and is treated the same as the remaining values, regardless of its magnitude.
|
||||
|
||||
#### 2.2.2 Outlier Detection
|
||||
|
||||
In fact, outlier analysis and anomaly detection is a huge field of research. Charu Aggarwal's book "Outlier Analysis"[^4] offers great insight into the topic. PyOD[^5] is a comprehensive Python toolkit containing many of the advanced methods in this field.
|
||||
|
||||
All the methods listed here are for univariate outlier detection. Multivariate outlier detection is beyond the scope of this guide.
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ---------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
|
||||
| Detect by arbitrary boundary | identify outliers based on arbitrary boundaries | flexible | requires business understanding |
|
||||
| Mean & Standard Deviation method[^6][^7] | outlier detection by the Mean & Standard Deviation Method | good for variables with a Gaussian distribution (68-95-99.7 rule) | sensitive to the extreme values themselves (as outliers inflate the mean and SD) |
|
||||
| IQR method[^8] | outlier detection by the Interquartile Range Rule | more robust than the Mean & SD method as it uses quantiles & IQR; resilient to extremes | can be too aggressive |
|
||||
| MAD method[^6][^7] | outlier detection by the Median and Median Absolute Deviation Method | more robust than the Mean & SD method; resilient to extremes | can be too aggressive |
|
||||
|
||||
However, beyond these methods, it's more important to keep in mind that the business context should govern how you define and react to these outliers. The meanings of your findings should be dictated by the underlying context, rather than the number itself.
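For illustration, a minimal sketch of the IQR rule on a single numeric column (`df` and the column 'Fare' are assumptions):

```python
# IQR rule: flag values below Q1 - 1.5*IQR or above Q3 + 1.5*IQR
q1, q3 = df['Fare'].quantile(0.25), df['Fare'].quantile(0.75)
iqr = q3 - q1
lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr

outlier_mask = (df['Fare'] < lower) | (df['Fare'] > upper)
print(df.loc[outlier_mask, 'Fare'].head())
```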
|
||||
|
||||
|
||||
|
||||
#### 2.2.3 How to Handle Outliers
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ------------------------------- | ------------------------------------------------------------ | -------------------------------- | ------------------------------------------- |
|
||||
| Mean/Median/Mode Imputation | replacing the outlier by mean/median/most frequent values of that variable | preserve distribution | lose information of outlier if there is one |
|
||||
| Discretization | transform continuous variables into discrete variables | minimize the impact from outlier | lose information of outlier if there is one |
|
||||
| Imputation with arbitrary value | impute outliers with an arbitrary value | flexible | hard to decide the value |
|
||||
| Windsorization | top-coding & bottom coding (capping the maximum of a distribution at an arbitrarily set value, vice versa). | prevent model over-fitting | distort distribution |
|
||||
| Discard outliers | drop all the observations that are outliers | / | lose information of outlier if there is one |
|
||||
|
||||
**Note**: A detailed guide to windsorization can be found [here](https://www.statisticshowto.datasciencecentral.com/winsorize/).
|
||||
|
||||
There are many strategies for dealing with outliers in data, and depending on the context and data set, any could be the right or the wrong way. It’s important to investigate the nature of the outlier before deciding.
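For example, a minimal windsorization (capping) sketch with pandas, reusing the IQR boundaries from the detection sketch above (column name and boundaries are assumptions):

```python
# cap the column at the lower/upper boundaries instead of dropping the rows
df['Fare_capped'] = df['Fare'].clip(lower=lower, upper=upper)
```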
|
||||
|
||||
|
||||
|
||||
### 2.3 Rare Values
|
||||
|
||||
**Definition**: A categorical variable in which some of the values appear only rarely.
|
||||
|
||||
**Note**: In some situations rare values, like outliers, may contain valuable information about the dataset and therefore need particular attention. For example, a rare value in a transaction variable may indicate fraud.
|
||||
|
||||
#### 2.3.1 Why Rare Value Matters
|
||||
|
||||
- Rare values in categorical variables tend to cause over-fitting, particularly in **tree based** methods.
|
||||
- A big number of infrequent labels adds noise, with little information, therefore causing over-fitting.
|
||||
- Rare labels may be present in training set, but not in test set, therefore causing over-fitting to the train set.
|
||||
- Rare labels may appear in the test set, and not in the train set. Thus, the model will not know how to handle them.
|
||||
|
||||
#### 2.3.2 How to Handle Rare Value
|
||||
|
||||
| Method | Definition |
|
||||
| ------------------------------ | ------------------------------------------------------------ |
|
||||
| Mode Imputation | Replacing the rare label by most frequent label |
|
||||
| Grouping into one new category | Grouping the observations that show rare labels into a unique category |
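As an illustration, a minimal plain-pandas sketch of the grouping approach (the column name 'SibSp' and the 1% threshold are assumptions):

```python
# labels that appear in less than 1% of observations are grouped into 'rare'
freq = df['SibSp'].value_counts(normalize=True)
rare_labels = freq[freq < 0.01].index
df['SibSp_grouped'] = df['SibSp'].where(~df['SibSp'].isin(rare_labels), 'rare')
```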
|
||||
|
||||
Depending on the situation, we may use different strategies:
|
||||
|
||||
- when **there's one predominant category (over 90%)** in the variable: observe the relationship between that variable and the target, then either discard the variable or keep it as it was. In this case, the variable is often not useful for prediction as it is quasi-constant (as we will see later in the Feature Selection part).
|
||||
- when **there's a small number of categories**: keep it as it was, since a few categories are unlikely to bring much noise.
|
||||
- when **there's high cardinality**: try the two methods above, but they do not guarantee better results than the original variable.
|
||||
|
||||
|
||||
|
||||
### 2.4 High Cardinality
|
||||
|
||||
**Definition**: The number of labels within a categorical variable is known as cardinality. A high number of labels within a variable is known as high cardinality.
|
||||
|
||||
#### 2.4.1 Why High Cardinality Matters
|
||||
|
||||
- Variables with too many labels tend to dominate over those with only a few labels, particularly in **tree based** algorithms.
|
||||
- A big number of labels within a variable may introduce noise with little if any information, therefore making the machine learning models prone to over-fit.
|
||||
- Some of the labels may only be present in the training data set, but not in the test set, therefore causing algorithms to over-fit the training set.
|
||||
- Contrarily, new labels may appear in the test set that were not present in the training set, leaving the algorithm unable to perform a calculation over the new observations.
|
||||
|
||||
#### 2.4.2 How to Handle High Cardinality
|
||||
|
||||
| Method |
|
||||
| ------------------------------------------------------ |
|
||||
| Grouping labels with business understanding |
|
||||
| Grouping labels with rare occurrence into one category |
|
||||
| Grouping labels with decision tree |
|
||||
|
||||
All these methods attempt to group some of the labels and reduce cardinality. Grouping labels with a decision tree is equivalent to the method introduced in section 3.2.2, Discretization with decision trees, which aims to merge labels into more homogeneous groups. Grouping labels with rare occurrence into one category is equivalent to the method in section 2.3.2.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 3. Feature Engineering
|
||||
|
||||
### 3.1 Feature Scaling
|
||||
|
||||
**Definition**: Feature scaling is a method used to standardize the range of independent variables or features of data. In data processing, it is also known as data normalization and is generally performed during the data preprocessing step.
|
||||
|
||||
#### 3.1.1 Why Feature Scaling Matters
|
||||
|
||||
- If the range of inputs varies, the objective functions of some algorithms will not work properly.
|
||||
|
||||
- **Gradient descent** converges much faster with feature scaling done. Gradient descent is a common optimization algorithm used in logistic regression, SVMs, neural networks etc.
|
||||
|
||||
- Algorithms that involve **distance calculation** like KNN, Clustering are also affected by the magnitude of the feature. Just consider how Euclidean distance is calculated: taking the square root of the sum of the squared differences between observations. This distance can be greatly affected by differences in scale among the variables. Variables with large variances have a larger effect on this measure than variables with small variances.
|
||||
|
||||
**Note**: Tree-based algorithms are almost the only algorithms that are not affected by the magnitude of the input, as we can easily see from how trees are built. When deciding how to make a split, tree algorithms look for decisions like "whether feature value X > 3.0" and compute the purity of the child nodes after the split, so the scale of the feature does not matter.
|
||||
|
||||
#### 3.1.2 How to Handle Feature Scaling
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ------------------------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
|
||||
| Normalization - Standardization (Z-score scaling) | removes the mean and scales the data to unit variance.<br />z = (X - X.mean) / std | feature is rescaled to have a standard normal distribution centered around 0 with an SD of 1 | compresses the observations into a narrow range if the variable is skewed or has outliers, thus impairing the predictive power. |
|
||||
| Min-Max scaling | transforms features by scaling each feature to a given range. Defaults to [0,1].<br />X_scaled = (X - X.min) / (X.max - X.min) | / | compresses the observations into a narrow range if the variable is skewed or has outliers, thus impairing the predictive power. |
|
||||
| Robust scaling | removes the median and scales the data according to the quantile range (defaults to IQR)<br />X_scaled = (X - X.median) / IQR | better at preserving the spread of the variable after transformation for skewed variables | / |
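A minimal sketch of the three methods with scikit-learn (X_train / X_test are assumed to be numeric and already split):

```python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

for scaler in (StandardScaler(), MinMaxScaler(), RobustScaler()):
    scaler.fit(X_train)                        # statistics learned from the train set only
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)   # the same statistics are reused on the test set
```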
|
||||
|
||||
|
||||
|
||||
A comparison of three methods when facing outliers:
|
||||
|
||||
<div align=center>
|
||||
|
||||

|
||||
|
||||
[img source](https://stackoverflow.com/questions/51841506/data-standardization-vs-normalization-vs-robust-scaler)
|
||||
|
||||
As we can see, Normalization - Standardization and Min-Max method will compress most data to a narrow range, while robust scaler does a better job at keeping the spread of the data, although it cannot **remove** the outlier from the processed result. Remember removing/imputing outliers is another topic in data cleaning and should be done beforehand.
|
||||
|
||||
Experience on how to choose feature scaling method:
|
||||
|
||||
- if your feature is not Gaussian-like, say it has a skewed distribution or outliers, Normalization - Standardization is not a good choice as it will compress most data into a narrow range.
|
||||
- However, we can transform the feature into a Gaussian-like shape and then use Normalization - Standardization. Feature transformation will be discussed in section 3.4.
|
||||
- When performing distance or covariance calculation (algorithm like Clustering, PCA and LDA), it is better to use Normalization - Standardization as it will remove the effect of scales on variance and covariance. Explanation [here](https://blog.csdn.net/zbc1090549839/article/details/44103801).
|
||||
- Min-Max scaling has the same drawbacks as Normalization - Standardization, and in addition new data may not be bounded to [0,1] as it can fall outside the original range. Some algorithms, for example certain deep learning networks, prefer input on a 0-1 scale, so this can be a good choice for them.
|
||||
|
||||
|
||||
|
||||
Below is some additional resource on this topic:
|
||||
|
||||
- A comparison of the three methods when facing skewed variables can be found [here](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py).
|
||||
- An in-depth study of feature scaling can be found [here](http://sebastianraschka.com/Articles/2014_about_feature_scaling.html).
|
||||
|
||||
|
||||
|
||||
### 3.2 Discretize
|
||||
|
||||
**Definition**: Discretization is the process of transforming continuous variables into discrete variables by creating a set of contiguous intervals that spans the range of the variable's values.
|
||||
|
||||
#### 3.2.1 Why Discretize Matters
|
||||
|
||||
- help improve model performance by grouping similar attributes with similar predictive strengths
|
||||
- enhance interpretability with grouped values
|
||||
- minimize the impact of **extreme values/seldom reversal patterns**
|
||||
- prevent possible overfitting with numerical variables
|
||||
- allow feature interaction between continuous variables (section 3.5.5)
|
||||
|
||||
|
||||
|
||||
#### 3.2.2 How to Handle Discretization
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ----------------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
|
||||
| Equal width binning | divides the scope of possible values into N bins of the same width | / | sensitive to skewed distribution |
|
||||
| Equal frequency binning | divides the scope of possible values of the variable into N bins, where each bin carries the same amount of observations | may help boost the algorithm's performance | this arbitrary binning may disrupt the relationship with the target |
|
||||
| K-means binning | using k-means to partition values into clusters | / | needs hyper-parameter tuning |
|
||||
| Discretization using decision trees | using a decision tree to identify the optimal splitting points that would determine the bins | observations within each bin are more similar to themselves than to those of other bins | 1. may cause over-fitting<br>2. may not get a good performing tree |
|
||||
| ChiMerge[^11] | supervised hierarchical bottom-up (merge) method that locally exploits the chi-square criterion to decide whether two adjacent intervals are similar enough to be merged | robust and make use of a priori knowledge | cannot handle unlabeled data |
|
||||
|
||||
In general there's no single best discretization method; it really depends on the dataset and the downstream learning algorithm. Study your features and context carefully before deciding. You can also try different methods and compare model performance.
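A minimal sketch of the first three methods (`df` with a numeric 'Fare' column is assumed; the bin counts are arbitrary):

```python
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

# equal width binning
df['Fare_eq_width'] = pd.cut(df['Fare'], bins=5, labels=False)

# equal frequency binning
df['Fare_eq_freq'] = pd.qcut(df['Fare'], q=5, labels=False, duplicates='drop')

# k-means binning (the column is assumed to have no missing values)
kmeans_binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')
df['Fare_kmeans'] = kmeans_binner.fit_transform(df[['Fare']])[:, 0]
```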
|
||||
|
||||
Some literature reviews on feature discretization can be found [here1](https://pdfs.semanticscholar.org/94c3/d92eccbb66f571153f99b7ae6c6167a00923.pdf), [here2](http://robotics.stanford.edu/users/sahami/papers-dir/disc.pdf), [here3](http://axon.cs.byu.edu/papers/ventura.thesis.ps).
|
||||
|
||||
|
||||
|
||||
### 3.3 Feature Encoding
|
||||
|
||||
#### 3.3.1 Why Feature Encoding Matters
|
||||
|
||||
We must transform the strings of categorical variables into numbers so that algorithms can handle those values. Even if an algorithm appears to accept categorical inputs, it most likely incorporates the encoding process internally.
|
||||
|
||||
#### 3.3.2 How to Handle Feature Encoding
|
||||
|
||||
| Method | Definition | Pros | Cons |
|
||||
| ------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
|
||||
| One-hot encoding | replace the categorical variable by different boolean variables (0/1) to indicate whether or not certain label is true for that observation | keep all information of that variable | 1. expand feature space dramatically if too many labels in that variable<br />2. does not add additional value to make the variable more predictive |
|
||||
| Ordinal-encoding | replace the labels by some ordinal number if ordinal is meaningful | straightforward | does not add additional value to make the variable more predictive |
|
||||
| Count/frequency encoding | replace each label of the categorical variable by the count/frequency of that category | / | 1. may yield the same encoding for two different labels (if they appear the same number of times) and thus lose valuable info<br />2. may not add predictive power |
|
||||
| Mean encoding | replace the label by the mean of the target for that label. (the target must be 0/1 valued or continuous) | 1. Capture information within the label, therefore rendering more predictive features<br/>2. Create a monotonic relationship between the variable and the target<br>3. Do not expand the feature space | Prone to cause over-fitting |
|
||||
| WOE encoding[^9] | replace the label with the Weight of Evidence of each label. WOE is computed from the basic odds ratio: ln( (Proportion of Good Outcomes) / (Proportion of Bad Outcomes) ) | 1. Establishes a monotonic relationship to the dependent variable<br/>2. Orders the categories on a "logistic" scale which is natural for logistic regression<br>3. The transformed variables can then be compared because they are on the same scale, so it is possible to determine which one is more predictive | 1. May incur a loss of information (variation) due to binning into few categories<br/>2. Prone to cause over-fitting |
|
||||
| Target encoding[^10] | Similar to mean encoding, but use both posterior probability and prior probability of the target | 1. Capture information within the label, therefore rendering more predictive features<br/>2. Create a monotonic relationship between the variable and the target<br/>3. Do not expand the feature space | Prone to cause over-fitting |
|
||||
|
||||
**Note**: if we are using one-hot encoding in linear regression, we should keep only k-1 binary variables to avoid multicollinearity. This holds for any algorithm that looks at all features at the same time during training, including SVMs, neural networks and clustering. Tree-based algorithms, on the other hand, need the entire set of binary variables to select the best split.
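A minimal sketch of k-1 one-hot encoding with pandas (the column 'Sex' is an assumption for illustration):

```python
import pandas as pd

# drop_first=True keeps k-1 dummies, which is what linear models need
dummies = pd.get_dummies(df['Sex'], prefix='Sex', drop_first=True)
df = pd.concat([df.drop(columns=['Sex']), dummies], axis=1)
```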
|
||||
|
||||
An in-detail intro to WOE can be found [here](http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview).
|
||||
|
||||
|
||||
|
||||
### 3.4 Feature Transformation
|
||||
|
||||
#### 3.4.1 Why Feature Transformation Matters
|
||||
|
||||
##### 3.4.1.1 Linear Assumption
|
||||
|
||||
**Regression**
|
||||
|
||||
Linear regression is a straightforward approach for predicting a quantitative response Y on the basis of one or more predictor variables X1, X2, ... Xn. It assumes that there is a linear relationship between X(s) and Y. Mathematically, we can write this linear relationship as Y ≈ β0 + β1X1 + β2X2 + ... + βnXn.
|
||||
|
||||
**Classification**
|
||||
|
||||
Similarly, for classification, Logistic Regression assumes a linear relationship between the variables and the log of the odds.
|
||||
|
||||
Odds = p / (1 - p), where p is the probability of y = 1
|
||||
|
||||
log(odds) = β0 + β1X1 + β2X2 + ... + βnXn
|
||||
|
||||
**Why it's important to follow linear assumption**
|
||||
|
||||
If the machine learning model assumes a linear dependency between the predictors Xs and the outcome Y, when no such linear relationship exists, the model will perform poorly. In such cases, we are better off trying another machine learning model that does not make this assumption.
|
||||
|
||||
If there is no linear relationship and we have to use the linear/logistic regression models, mathematical transformation/discretization may help create the relationship, though it cannot guarantee a better result.
|
||||
|
||||
##### 3.4.1.2 Variable Distribution
|
||||
|
||||
**Linear Regression Assumptions**
|
||||
|
||||
Linear Regression has the following assumptions over the predictor variables X:
|
||||
|
||||
- Linear relationship with the outcome Y
|
||||
|
||||
- Multivariate normality
|
||||
- No or little multicollinearity
|
||||
- Homoscedasticity
|
||||
|
||||
Normality assumption means that every variable X should follow a Gaussian distribution.
|
||||
|
||||
Homoscedasticity, also known as homogeneity of variance, describes a situation in which the error term (that is, the “noise” or random disturbance in the relationship between the independent variables (Xs) and the dependent variable (Y)) is the same across all values of the independent variables.
|
||||
|
||||
Violations in the assumptions of homoscedasticity and / or normality (assuming a distribution of data is homoscedastic or Gaussian, when in reality it is not) may result in poor model performance.
|
||||
|
||||
The remaining machine learning models, including Neural Networks, Support Vector Machines, Tree based methods and PCA, do not make any assumption over the distribution of the independent variables. However, on many occasions the model performance may **benefit from a "Gaussian-like" distribution**.
|
||||
|
||||
Why might models benefit from a "Gaussian-like" distribution? In variables with a normal distribution, the observations of X available to predict Y vary across a greater range of values; that is, the values of X are "spread" over a greater range.
|
||||
|
||||
In the situations above, transformation of the original variable can help give it more of the bell shape of the Gaussian distribution.
|
||||
|
||||
#### 3.4.2 How to Handle Feature Transformation
|
||||
|
||||
| Method | Definition |
|
||||
| --------------------------- | -------------------------------------------------------- |
|
||||
| Logarithmic transformation  | log(x+1). We use (x+1) instead of x to avoid taking the log of 0 |
|
||||
| Reciprocal transformation  | 1/x. Note that x must not be 0. |
|
||||
| Square root transformation | x**(1/2) |
|
||||
| Exponential transformation | X**(m) |
|
||||
| Box-cox transformation[^12] | (X**λ-1)/λ |
|
||||
| Quantile transformation | transform features using quantiles information |
|
||||
|
||||
**Log transformation** is useful when applied to skewed distributions as they tend to expand the values which fall in the range of lower magnitudes and tend to compress or reduce the values which fall in the range of higher magnitudes, which helps to make the skewed distribution as normal-like as possible. **Square root transformation** does a similar thing in this sense.
|
||||
|
||||
**Box-Cox transformation** in sklearn[^13] is another popular function belonging to the power transform family of functions. This function has a pre-requisite that the numeric values to be transformed must be positive (similar to what log transform expects). In case they are negative, shifting using a constant value helps. Mathematically, the Box-Cox transform function can be denoted as follows.
|
||||
|
||||

|
||||
|
||||
**Quantile transformation** in sklearn[^14] transforms the features to follow a uniform or a normal distribution. Therefore, for a given feature, this transformation tends to spread out the most frequent values. It also reduces the impact of (marginal) outliers: this is therefore a robust preprocessing scheme. However, this transform is non-linear. It may distort linear correlations between variables measured at the same scale but renders variables measured at different scales more directly comparable.
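A minimal sketch of these transformations, assuming `df` has a non-negative, skewed numeric column 'Fare' (the +1 shift is only there to keep the Box-Cox input strictly positive):

```python
import numpy as np
from sklearn.preprocessing import PowerTransformer, QuantileTransformer

# log(x+1)
df['Fare_log'] = np.log1p(df['Fare'])

# Box-Cox requires strictly positive input, hence the shift by a constant
boxcox = PowerTransformer(method='box-cox')
df['Fare_boxcox'] = boxcox.fit_transform(df[['Fare']] + 1)[:, 0]

# quantile transformation to a normal output distribution
quantile = QuantileTransformer(output_distribution='normal', n_quantiles=100)
df['Fare_quantile'] = quantile.fit_transform(df[['Fare']])[:, 0]
```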
|
||||
|
||||
|
||||
|
||||
We can use **Q-Q plot** to check if the variable is normally distributed (a 45 degree straight line of the values over the theoretical quantiles) after transformation.
|
||||
|
||||
Below is an example showing the effect of sklearn's Box-Cox/Yeo-Johnson/Quantile transforms mapping data from various distributions to a normal distribution.
|
||||
|
||||
<div align=center>
|
||||
|
||||

|
||||
|
||||
[img source](https://scikit-learn.org/stable/auto_examples/preprocessing/plot_map_data_to_normal.html#sphx-glr-auto-examples-preprocessing-plot-map-data-to-normal-py)
|
||||
|
||||
On “small” datasets (less than a few hundred points), the quantile transformer is prone to overfitting. The use of the power transform is then recommended.
|
||||
|
||||
|
||||
|
||||
|
||||
### 3.5 Feature Generation
|
||||
|
||||
**Definition**: Creating new features as a combination of existing ones. It's a great way to add domain knowledge to the dataset.
|
||||
|
||||
#### 3.5.1 Missing Data Derived Feature
|
||||
|
||||
As mentioned in section 2.1, we can create a new binary feature (0/1) denoting whether an observation has a missing value in the raw feature.
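A one-line sketch, assuming 'Age' is a raw feature with missing values:

```python
# 1 if the raw value is missing, 0 otherwise
df['Age_is_missing'] = df['Age'].isnull().astype(int)
```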
|
||||
|
||||
#### 3.5.2 Simple Statistical Derived Feature
|
||||
|
||||
Creating new features by performing simple statistical calculations on the raw features, including:
|
||||
|
||||
- count/sum
|
||||
- average/median/mode
|
||||
- max/min/stddev/variance/range/IQR/Coefficient of Variation
|
||||
- time span/interval
|
||||
|
||||
Take call logs for example: we can create new features like number of calls, number of calls in/out, average call duration, monthly average call duration, max call duration, etc.
|
||||
|
||||
#### 3.5.3 Feature Crossing
|
||||
|
||||
After having some simple statistical derived features, we can have them crossed together. Common dimensions used for crossing include:
|
||||
|
||||
- time
|
||||
- region
|
||||
- business types
|
||||
|
||||
Still taking call logs as an example, we can have crossed features like: number of calls during night time/day time, number of calls under different business types (banks/taxi services/travel/hospitality), number of calls during the past 3 months, etc. Many of the statistical calculations mentioned in section 3.5.2 can be used again to create more features.
|
||||
|
||||
**Note**: An open-source python framework named **Featuretools** that helps automatically generate such features can be found [here](https://github.com/Featuretools/featuretools).
|
||||
|
||||

|
||||
|
||||
Personally I haven't used it in practice. You may try it and see whether it suits industry use.
|
||||
|
||||
#### 3.5.4 Ratios and Proportions
|
||||
|
||||
These are common techniques. For example, in order to predict the future credit card sales performance of a branch, ratios like credit card sales / salesperson or credit card sales / marketing spend would be more powerful than just using the absolute number of cards sold in the branch.
|
||||
|
||||
#### 3.5.5 Cross Products between Categorical Features
|
||||
|
||||
Consider a categorical feature A, with two possible values {A1, A2}. Let B be a feature with possibilities {B1, B2}. Then, a feature-cross between A & B would take one of the following values: {(A1, B1), (A1, B2), (A2, B1), (A2, B2)}. You can basically give these ‘combinations’ any names you like. Just remember that every combination denotes a synergy between the information contained by the corresponding values of A and B.
|
||||
|
||||
This is an extremely useful technique when certain features together denote a property better than they do individually. Mathematically speaking, you are taking the cross product of all possible values of the categorical features. The concept is similar to the Feature Crossing of section 3.5.3, but this one refers particularly to the crossing of two categorical features.
|
||||
|
||||
#### 3.5.6 Polynomial Expansion
|
||||
|
||||
The cross product can also be applied to numerical features, which results in a new interaction feature between A and B. This can be done easily with sklearn's [PolynomialFeatures](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html#sklearn.preprocessing.PolynomialFeatures), which generates a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree. For example, three raw features {X1, X2, X3} with a degree of 2 generate the feature set {1, X1, X2, X3, X1X2, X1X3, X2X3, X1², X2², X3²}.
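A minimal sketch (X is assumed to be a numeric array or DataFrame with three columns):

```python
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, include_bias=True)
X_poly = poly.fit_transform(X)   # 1, X1, X2, X3, X1^2, X1*X2, X1*X3, X2^2, X2*X3, X3^2
```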
|
||||
|
||||
#### 3.5.7 Feature Learning by Trees
|
||||
|
||||
In tree-based algorithms, each sample is assigned to a particular leaf node. The decision path to each node can be seen as a new non-linear feature, and we can create N new binary features, where N equals the total number of leaf nodes in the tree or tree ensemble. The features can then be fed into other algorithms such as logistic regression.
|
||||
|
||||
The idea of using tree algorithms to generate new features was first introduced by Facebook in this [paper](http://quinonero.net/Publications/predicting-clicks-facebook.pdf).
|
||||
|
||||
The good thing about this method is that we get complex combinations of several features, which are informative (as they are constructed by the tree's learning algorithm). This saves us much time compared to doing feature crossing manually, and it is widely used for CTR (click-through rate) prediction in the online advertising industry.
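A minimal sketch of this approach (X_train, y_train, X_test are assumptions; any tree ensemble exposing `apply()` would do):

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

rf = RandomForestClassifier(n_estimators=50, max_depth=4, random_state=0)
rf.fit(X_train, y_train)

# apply() returns the leaf index each sample falls into, one column per tree
leaves_train = rf.apply(X_train)
leaves_test = rf.apply(X_test)

# one-hot encode the leaf indices and feed them to a logistic regression
encoder = OneHotEncoder(handle_unknown='ignore')
lr = LogisticRegression(solver='lbfgs')
lr.fit(encoder.fit_transform(leaves_train), y_train)
proba = lr.predict_proba(encoder.transform(leaves_test))[:, 1]
```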
|
||||
|
||||
#### 3.5.8 Feature Learning by Deep Networks
|
||||
|
||||
As we can see from all the above, manual feature generation takes a lot of effort and may not guarantee good returns, particularly when we have huge numbers of features to work with. Feature learning with trees can be seen as an early attempt at creating features automatically, and since deep learning methods came into fashion around 2016, they have also achieved some success in this area, for example **autoencoders** and **restricted Boltzmann machines**. They have been shown to automatically, and in an unsupervised or semi-supervised way, learn abstract representations of features (a compressed form) that in turn have supported state-of-the-art results in domains such as speech recognition, image classification, object recognition and other areas. However, such features have limited interpretability, and deep learning requires much more data to extract high-quality results.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 4. Feature Selection
|
||||
|
||||
**Definition**: Feature Selection is the process of selecting a subset of relevant features for use in machine learning model building.
|
||||
|
||||
It is not always true that more data leads to better results. Including irrelevant features (those unhelpful to the prediction) and redundant features (irrelevant in the presence of others) will only overwhelm the learning process and easily cause overfitting.
|
||||
|
||||
With feature selection, we can have:
|
||||
|
||||
- simplification of models to make them easier to interpret
|
||||
- shorter training times and lower computational cost
|
||||
- lower cost in data collection
|
||||
- avoid the curse of dimensionality
|
||||
- enhanced generalization by reducing overfitting
|
||||
|
||||
We should keep in mind that different feature subsets render optimal performance for different algorithms, so feature selection is not a process separate from model training. Therefore, if we are selecting features for a linear model, it is better to use selection procedures targeted at those models, like importance by regression coefficients or Lasso. And if we are selecting features for trees, it is better to use tree-derived importance.
|
||||
|
||||
|
||||
|
||||
### 4.1 Filter Method
|
||||
|
||||
Filter methods select features based on a performance measure regardless of the ML algorithm later employed.
|
||||
|
||||
Univariate filters evaluate and rank a single feature according to a certain criterion, while multivariate filters evaluate the entire feature space. Filter methods are:
|
||||
|
||||
- selecting variables regardless of the model
|
||||
- less computationally expensive
|
||||
- usually give lower prediction performance
|
||||
|
||||
As a result, filter methods are suited for a quick first-step screening and removal of irrelevant features.
|
||||
|
||||
| Method | Definition |
|
||||
| ------------------------- | ------------------------------------------------------------ |
|
||||
| Variance | removing features that show the same value for the majority/all of the observations (constant/quasi-constant features) |
|
||||
| Correlation | remove features that are highly correlated with each other |
|
||||
| Chi-Square | Compute chi-squared stats between each non-negative feature and class |
|
||||
| Mutual Information Filter | Mutual information measures how much information the presence/absence of a feature contributes to making the correct prediction on Y. |
|
||||
| Univariate ROC-AUC or MSE | builds one decision tree per feature, to predict the target, then make predictions and ranks the features according to the machine learning metric (roc-auc or mse) |
|
||||
| Information Value (IV) | a byproduct of WOE. <br>IV = Σ(Proportion of Good Outcomes - Proportion of Bad Outcomes) * WOE |
|
||||
|
||||
WOE encoding (see section 3.3.2) and IV often go hand in hand in scorecard development. Both concepts derive from logistic regression and are standard practice in the credit card industry. IV is a popular and widely used measure, as there are very convenient rules of thumb for variable selection associated with IV, shown below:
|
||||
|
||||

|
||||
|
||||
However, all these filtering methods fail to consider the interaction between features and may reduce our predictive power. Personally I only use variance and correlation to filter out absolutely unnecessary features, as sketched below.
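A minimal sketch of these two filters (X is assumed to be a numeric DataFrame; the thresholds are arbitrary):

```python
import numpy as np
from sklearn.feature_selection import VarianceThreshold

# drop constant / quasi-constant features (variance below the threshold)
selector = VarianceThreshold(threshold=0.01)
X_reduced = selector.fit_transform(X)

# drop one feature from each highly correlated pair
corr = X.corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
X_uncorrelated = X.drop(columns=to_drop)
```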
|
||||
|
||||
|
||||
|
||||
**Note**: One thing to keep in mind when using chi-square tests or univariate selection methods is that in very big datasets most of the features will show a small p_value, and therefore look highly predictive. This is in fact an effect of the sample size, so care should be taken when selecting features using these procedures. An ultra-tiny p_value does not highlight an ultra-important feature; it rather indicates that the dataset contains a very large number of samples.
|
||||
|
||||
**Note**: Correlated features do not necessarily affect model performance (trees, etc), but high dimensionality does and too many features hurt model interpretability. So it's always better to reduce correlated features.
|
||||
|
||||
|
||||
|
||||
### 4.2 Wrapper Method
|
||||
|
||||
Wrappers use a search strategy to search through the space of possible feature subsets and evaluate each subset by the quality of the performance on a ML algorithm. Practically any combination of search strategy and algorithm can be used as a wrapper. It is featured as:
|
||||
|
||||
- use ML models to score the feature subset
|
||||
- train a new model on each subset
|
||||
- very computationally expensive
|
||||
- usually provide the best performing subset for a given ML algorithm, but probably not for another
|
||||
- need an arbitrarily defined stopping criterion
|
||||
|
||||
The most common **search strategy** group is sequential search, including Forward Selection, Backward Elimination and Exhaustive Search. Randomized search is another popular choice, including evolutionary computation algorithms such as genetic algorithms, and simulated annealing.
|
||||
|
||||
Another key element in wrappers is the **stopping criterion**. When should the search stop? In general there are three options:
|
||||
|
||||
- performance increase
|
||||
- performance decrease
|
||||
- predefined number of features is reached
|
||||
|
||||
|
||||
|
||||
#### 4.2.1 Forward Selection
|
||||
|
||||
Step forward feature selection starts by evaluating all features individually and selects the one that generates the best performing algorithm, according to a pre-set evaluation criteria. In the second step, it evaluates all possible combinations of the selected feature and a second feature, and selects the pair that produce the best performing algorithm based on the same pre-set criteria.
|
||||
|
||||
The pre-set criteria can be the roc_auc for classification and the r squared for regression for example.
|
||||
|
||||
This selection procedure is called greedy because it evaluates all possible single, then double, then triple (and so on) feature combinations. It is therefore quite computationally expensive and sometimes, if the feature space is big, even infeasible.
|
||||
|
||||
There is a Python package that implements this type of feature selection: [mlxtend](https://github.com/rasbt/mlxtend).
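A minimal sketch with mlxtend's SequentialFeatureSelector (all parameter values are illustrative; X_train, y_train are assumptions):

```python
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier

sfs = SFS(RandomForestClassifier(n_estimators=10, random_state=0),
          k_features=10,        # stop once 10 features have been selected
          forward=True,         # step forward selection
          floating=False,
          scoring='roc_auc',
          cv=3)
sfs = sfs.fit(X_train, y_train)
print(sfs.k_feature_idx_)       # indices of the selected features
```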
|
||||
|
||||
#### 4.2.2 Backward Elimination
|
||||
|
||||
Step backward feature selection starts by fitting a model using all features. Then it removes one feature: the one whose removal produces the highest performing algorithm (i.e., the least statistically significant feature) for a certain evaluation criterion. In the second step, it removes a second feature, again the one whose removal produces the best performing algorithm. And it proceeds, removing feature after feature, until a certain criterion is met.
|
||||
|
||||
The pre-set criteria can be the roc_auc for classification and the r squared for regression for example.
|
||||
|
||||
#### 4.2.3 Exhaustive Feature Selection
|
||||
|
||||
In an exhaustive feature selection the best subset of features is selected, over all possible feature subsets, by optimizing a specified performance metric for a certain machine learning algorithm. For example, if the classifier is a logistic regression and the dataset consists of **4** features, the algorithm will evaluate all **15** feature combinations as follows:
|
||||
|
||||
- all possible combinations of 1 feature
|
||||
- all possible combinations of 2 features
|
||||
- all possible combinations of 3 features
|
||||
- all the 4 features
|
||||
|
||||
and select the one that results in the best performance (e.g., classification accuracy) of the logistic regression classifier.
|
||||
|
||||
This exhaustive search is very computationally expensive; in practice, because of this cost, it is rarely used.
|
||||
|
||||
#### 4.2.4 Genetic Algorithm
|
||||
|
||||
TODO
|
||||
|
||||
|
||||
|
||||
### 4.3 Embedded Method
|
||||
|
||||
Embedded methods combine the advantages of the filter and wrapper methods. A learning algorithm takes advantage of its own variable selection process and performs feature selection and classification at the same time. Common embedded methods include Lasso and various types of tree-based algorithms. They are featured as:
|
||||
|
||||
- perform feature selection as part of the model building process
|
||||
- consider interactions between features
|
||||
- less computationally expensive than wrappers, as the model is only trained once
|
||||
- usually provide the best performing subset for a given ML algorithm, but probably not for another
|
||||
|
||||
|
||||
|
||||
#### 4.3.1 Regularization with Lasso
|
||||
|
||||
Regularization consists of adding a penalty to the parameters of the machine learning model to reduce its freedom. Hence, the model will be less likely to fit the noise of the training data and thus less likely to overfit.
|
||||
|
||||
In linear model regularization, the penalty is applied over the coefficients that multiply each of the predictors. For linear models there are in general 3 types of regularization:
|
||||
|
||||
- L1 regularization (Lasso)
|
||||
- L2 regularization (Ridge)
|
||||
- L1/L2 (Elastic net)
|
||||
|
||||
Among the different types of regularization, **Lasso (L1)** has the property that it is able to shrink some of the coefficients to zero. Therefore, the corresponding features can be removed from the model.
|
||||
|
||||
For both linear and logistic regression we can use Lasso regularization to remove non-important features. Keep in mind that increasing the penalty will increase the number of features removed. Therefore, you need to monitor the penalty: not so high that it removes even important features, nor so low that it fails to remove non-important ones.
|
||||
|
||||
Having said this, if the penalty is too high and important features are removed, you should notice a drop in the performance of the algorithm and then realize that you need to decrease the regularization.
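A minimal sketch of L1-based selection with scikit-learn's SelectFromModel (a scaled X_train / y_train is assumed; C is the inverse of the penalty strength, so smaller means stronger):

```python
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

lasso_selector = SelectFromModel(
    LogisticRegression(penalty='l1', C=0.5, solver='liblinear'))
lasso_selector.fit(X_train, y_train)

# features whose coefficients were not shrunk to zero
selected = X_train.columns[lasso_selector.get_support()]
print(selected)
```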
|
||||
|
||||
Regularization is a large topic. For more information you can refer to:
|
||||
|
||||
- [Least angle and l1 penalised regression: A review](https://projecteuclid.org/download/pdfview_1/euclid.ssu/1211317636)
|
||||
- [Penalised feature selection and classification in bioinformatics](https://www.ncbi.nlm.nih.gov/pubmed/18562478)
|
||||
- [Feature selection for classification: A review](https://web.archive.org/web/20160314145552/http://www.public.asu.edu/~jtang20/publication/feature_selection_for_classification.pdf)
|
||||
|
||||
- [Machine Learning Explained: Regularization](https://www.r-bloggers.com/machine-learning-explained-regularization/)
|
||||
|
||||
|
||||
|
||||
#### 4.3.2 Random Forest Importance
|
||||
|
||||
Random forests are one of the most popular machine learning algorithms. They are so successful because they provide in general a good predictive performance, low overfitting and easy interpretability. This interpretability is given by the fact that it is straightforward to derive the importance of each variable on the tree decision. In other words, it is easy to compute how much each variable is contributing to the decision.
|
||||
|
||||
Random forest is a bagging algorithm consisting of a bunch of base estimators (decision trees), each of them built over a random extraction of observations from the dataset and a random extraction of features. Not every tree sees all the features or all the observations, and this guarantees that the trees are **de-correlated** and therefore **less prone to over-fitting.**
|
||||
|
||||
Each tree is a sequence of yes/no questions based on a single feature or a combination of features. At each split, the question divides the dataset into 2 buckets, each of them hosting observations that are more similar among themselves and different from the ones in the other bucket. Therefore, the importance of each feature is derived from how "**pure**" each of the buckets is.
|
||||
|
||||
For classification, the measure of impurity is either the **Gini impurity** or the **information gain/entropy**. For regression the measure of impurity is **variance**. Therefore, when training a tree, it is possible to compute how much each feature decreases the impurity. The more a feature decreases the impurity, the more important the feature is. In random forests, the impurity decrease from each feature can be averaged across trees to determine the final importance of the variable.
|
||||
|
||||
Selecting features by using tree-derived feature importance is a very straightforward, fast and generally accurate way of selecting good features for machine learning, in particular if you are going to build tree-based models.
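A minimal sketch (X_train, y_train are assumptions; the cut-off of 10 features is arbitrary):

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf.fit(X_train, y_train)

# rank features by impurity-based importance and keep the top ones
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
top_features = importances.sort_values(ascending=False).head(10).index
```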
|
||||
|
||||
However, correlated features will show in a tree similar and lowered importance, compared to what their importance would be if the tree was built without correlated counterparts.
|
||||
|
||||
**Limitation**
|
||||
|
||||
- correlated features show similar importance
|
||||
|
||||
- a correlated feature's importance is lower than its real importance, i.e. what it would be if the tree were built without its correlated counterparts
|
||||
|
||||
- high-cardinality variables tend to show higher importance
|
||||
|
||||
|
||||
#### 4.3.3 Gradient Boosted Trees Importance
|
||||
|
||||
Similarly to selecting features using Random Forests derived feature importance, we can select features based on the importance derived by gradient boosted trees. And we can do that in one go, or in a recursive manner, depending on how much time we have, how many features are in the dataset, and whether they are correlated or not.
|
||||
|
||||
|
||||
|
||||
### 4.4 Feature Shuffling
|
||||
|
||||
A popular method of feature selection consists of randomly shuffling the values of a specific variable and determining how that permutation affects the performance metric of the machine learning algorithm. In other words, the idea is to permute the values of each feature, one at a time, and measure how much the permutation decreases the accuracy or the roc_auc, or increases the mse, of the machine learning model. If a variable is important, that is, highly predictive, a random permutation of its values will dramatically degrade any of these metrics. Contrarily, non-important / non-predictive variables should have little to no effect on the model performance metric we are assessing.
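A minimal sketch of the idea, assuming an already-fitted classifier `model` and a hold-out set X_test, y_test, with roc_auc as the metric:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

baseline = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
drops = {}
rng = np.random.RandomState(0)

for col in X_test.columns:
    X_shuffled = X_test.copy()
    X_shuffled[col] = rng.permutation(X_shuffled[col].values)   # shuffle one feature
    score = roc_auc_score(y_test, model.predict_proba(X_shuffled)[:, 1])
    drops[col] = baseline - score   # a large drop indicates an important feature
```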
|
||||
|
||||
|
||||
|
||||
### 4.5 Hybrid Method
|
||||
|
||||
#### 4.5.1 Recursive Feature Elimination
|
||||
|
||||
This method consists of the following steps:
|
||||
|
||||
1. Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge, or the linear / logistic regression coefficients.
|
||||
2. Remove one feature -the least important- and build a machine learning algorithm utilizing the remaining features.
|
||||
|
||||
3. Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.
|
||||
4. If the metric decreases by more than an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.
|
||||
5. Repeat steps 2-4 until all features have been removed (and therefore evaluated) and the drop in performance assessed.
|
||||
|
||||
The method combines a selection process like wrappers with feature importance derived from ML models like embedded methods, hence it's called a hybrid method.
|
||||
|
||||
The difference between this method and step backward feature selection is that it does not try removing every feature first in order to determine which one to drop. It removes the least important one, based on the model-derived importance, and then assesses whether that feature should really be removed. So it evaluates each feature only once during selection, whereas step backward feature selection evaluates the removal of every remaining feature at each step.
|
||||
|
||||
This method is therefore faster than wrapper methods and generally better than embedded methods. In practice it works extremely well. It also accounts for correlations (depending on how stringent you set the arbitrary performance drop threshold). On the downside, the performance drop threshold used to decide whether a feature should be kept or removed is set arbitrarily. The smaller the threshold, the more features will be selected, and vice versa.
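scikit-learn's RFECV implements a closely related procedure (it recursively drops the least important feature and keeps the subset with the best cross-validated score rather than using an explicit drop threshold); a minimal sketch, with X_train, y_train as assumptions:

```python
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

rfecv = RFECV(RandomForestClassifier(n_estimators=50, random_state=0),
              step=1, cv=3, scoring='roc_auc')
rfecv.fit(X_train, y_train)

# boolean mask of the features kept after recursive elimination
selected = X_train.columns[rfecv.support_]
```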
|
||||
|
||||
|
||||
|
||||
**Example: Recursive Feature Elimination with Random Forests Importance**
|
||||
|
||||
As we talked about in section 4.3.2, Random Forests assign equal or similar importance to features that are highly correlated. In addition, when features are correlated, the importance assigned is lower than the importance attributed to the feature itself, should the tree be built without the correlated counterparts.
|
||||
|
||||
Therefore, instead of eliminating features based on importance **at one time** (from all initial features), we may get a better selection by removing one feature **recursively**, and recalculating the importance on each round.
|
||||
|
||||
In this situation, when a feature that is highly correlated to another one is removed, then, the importance of the remaining feature increases. This may lead to a better subset feature space selection. On the downside, building several random forests is quite time consuming, in particular if the dataset contains a high number of features.
|
||||
|
||||
#### 4.5.2 Recursive Feature Addition
|
||||
|
||||
This method consists of the following steps:
|
||||
|
||||
1. Rank the features according to their importance derived from a machine learning algorithm: it can be tree importance, or LASSO / Ridge, or the linear / logistic regression coefficients.
|
||||
2. Build a machine learning model with only 1 feature, the most important one, and calculate the model metric for performance.
|
||||
|
||||
3. Add one feature -the most important- and build a machine learning algorithm utilizing the added and any feature from previous rounds.
|
||||
|
||||
4. Calculate a performance metric of your choice: roc-auc, mse, rmse, accuracy.
|
||||
|
||||
5. If the metric increases by more than an arbitrarily set threshold, then that feature is important and should be kept. Otherwise, we can remove that feature.
|
||||
|
||||
6. Repeat steps 3-5 until all features have been added (and therefore evaluated) and the gain in performance assessed.
|
||||
|
||||
The difference between this method and step forward feature selection is analogous: it does not evaluate all remaining features at each step to determine which one to add, so it's faster than wrappers.
|
||||
|
||||
|
||||
|
||||
### 4.6 Dimensionality Reduction
|
||||
|
||||
- PCA (Principal Component Analysis)
|
||||
|
||||
- SVD (Singular Value Decomposition)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## 5. Data Leakage
|
||||
|
||||
This section is a reminder to myself, as I have made huge mistakes by not being aware of this problem. Data leakage is when information from outside the training dataset is used to create the model[^15]. The result is that you may be creating overly optimistic models that are practically useless and cannot be used in production. The model shows great results on both your training and testing data, but in fact this is not because the model truly generalizes well; it is because it uses information from the test data.
|
||||
|
||||
While it is well known that cross-validation (or at least a separate validation set) should be used when training and evaluating models, people may easily forget to do the same during the feature engineering & selection process. Keep in mind that the test dataset must not be used in any way to make choices about the model, including in feature engineering & selection.
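A minimal sketch of the leakage-free pattern: split first, then fit every transformer on the training data only (X, y and the scaler are illustrative; the same applies to imputers, encoders and selectors):

```python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

scaler = StandardScaler().fit(X_train)       # statistics learned from the train set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)     # the test set is only transformed, never fitted
```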
|
||||
|
||||
|
||||
|
||||
------
|
||||
|
||||
**Reference**
|
||||
|
||||
[^1]: http://www.simonqueenborough.info/R/basic/missing-data
|
||||
[^2]: Rubin, D. B. (1976). Inference and missing data. Biometrika 63(3): 581-592.
|
||||
[^3]: D. Hawkins. Identification of Outliers, Chapman and Hall , 1980.
|
||||
[^4]: https://www.springer.com/gp/book/9781461463955
|
||||
[^5]: https://github.com/yzhao062/pyod
|
||||
[^6]: https://docs.oracle.com/cd/E40248_01/epm.1112/cb_statistical/frameset.htm?ch07s02s10s01.html
|
||||
[^7]: https://www.academia.edu/5324493/Detecting_outliers_Do_not_use_standard_deviation_around_the_mean_use_absolute_deviation_around_the_median
|
||||
[^8]: https://www.purplemath.com/modules/boxwhisk3.htm
|
||||
[^9]: http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview
|
||||
[^10]: A Preprocessing Scheme for High-Cardinality Categorical Attributes in Classification and Prediction Problems. https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
|
||||
[^11]: https://www.aaai.org/Papers/AAAI/1992/AAAI92-019.pdf
|
||||
[^12]: http://onlinestatbook.com/2/transformations/box-cox.html
|
||||
[^13]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PowerTransformer.html#sklearn.preprocessing.PowerTransformer
|
||||
[^14]: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html#sklearn.preprocessing.QuantileTransformer
|
||||
[^15]: https://machinelearningmastery.com/data-leakage-machine-learning/
|
||||
204
README.md
Normal file
@@ -0,0 +1,204 @@
|
||||
# Feature Engineering & Feature Selection
|
||||
|
||||
## About
|
||||
|
||||
A comprehensive [guide]() for **Feature Engineering** and **Feature Selection**, with implementations and examples in Python.
|
||||
|
||||
|
||||
|
||||
## What You'll Learn
|
||||
|
||||
Not only a collection of hands-on functions, but also explanations of **Why**, **How** and **When** to adopt **Which** techniques of feature engineering in data mining.
|
||||
|
||||
- the nature and risks of the data problems we often encounter
|
||||
- explanation of the various feature engineering & selection techniques
|
||||
- the rationale for using each technique
|
||||
- pros & cons of each method
|
||||
- code & example
|
||||
|
||||
|
||||
|
||||
## Getting Started
|
||||
|
||||
This repo is mainly used as a reference for anyone who is doing feature engineering, and most of the modules are implemented with scikit-learn and packages from its community.
|
||||
|
||||
To run the demos or use the customized functions, please download the ZIP file from the repo or just copy-paste any part of the code you find helpful. They should all be very easy to understand.
|
||||
|
||||
**Required Dependencies**:
|
||||
|
||||
- Python 3.5, 3.6 or 3.7
|
||||
- numpy>=1.15
|
||||
- pandas>=0.23
|
||||
- scipy>=1.1.0
|
||||
- scikit_learn>=0.20.1
|
||||
- seaborn>=0.9.0
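If it helps, a quick and purely illustrative way to confirm that the installed versions satisfy the list above:

```python
# Print the installed versions of the required dependencies.
import numpy, pandas, scipy, sklearn, seaborn

for pkg in (numpy, pandas, scipy, sklearn, seaborn):
    print(pkg.__name__, pkg.__version__)
```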
|
||||
|
||||
|
||||
|
||||
## Table of Contents and Code Examples
|
||||
|
||||
Below is a list of methods currently implemented in the repo. The complete guide can be found [here]().
|
||||
|
||||
**1. Data Exploration**
|
||||
|
||||
1.1 Variables
|
||||
1.2 Variable Identification
|
||||
Check Data Types
|
||||
1.3 Univariate Analysis
|
||||
Descriptive Analysis
|
||||
Discrete Variable Barplot
|
||||
Discrete Variable Countplot
|
||||
Discrete Variable Boxplot
|
||||
Continuous Variable Distplot
|
||||
1.4 Bi-variate Analysis
|
||||
Scatter Plot
|
||||
Correlation Plot
|
||||
Heat Map
|
||||
|
||||
**2. Feature Cleaning**
|
||||
|
||||
2.1 Missing Values
|
||||
Missing Value Check
|
||||
Listwise Deletion
|
||||
Mean/Median/Mode Imputation
|
||||
End of distribution Imputation
|
||||
Random Imputation
|
||||
Arbitrary Value Imputation
|
||||
Add a variable to denote NA
|
||||
2.2 Outliers
|
||||
Detect by Arbitrary Boundary
|
||||
Detect by Mean & Standard Deviation
|
||||
Detect by IQR
|
||||
Detect by MAD
|
||||
Mean/Median/Mode Imputation
|
||||
Discretization
|
||||
Imputation with Arbitrary Value
|
||||
Winsorization
|
||||
Discard Outliers
|
||||
2.3 Rare Values
|
||||
Mode Imputation
|
||||
Grouping into One New Category
|
||||
2.4 High Cardinality
|
||||
Grouping Labels with Business Understanding
|
||||
Grouping Labels with Rare Occurrence into One Category
|
||||
Grouping Labels with Decision Tree
|
||||
|
||||
**3. Feature Engineering**
|
||||
|
||||
3.1 Feature Scaling
|
||||
Normalization - Standardization
|
||||
Min-Max Scaling
|
||||
Robust Scaling
|
||||
3.2 Discretize
|
||||
Equal Width Binning
|
||||
Equal Frequency Binning
|
||||
K-means Binning
|
||||
Discretization by Decision Trees
|
||||
ChiMerge
|
||||
3.3 Feature Encoding
|
||||
One-hot Encoding
|
||||
Ordinal-Encoding
|
||||
Count/frequency Encoding
|
||||
Mean Encoding
|
||||
WOE Encoding
|
||||
Target Encoding
|
||||
3.4 Feature Transformation
|
||||
Logarithmic Transformation
|
||||
Reciprocal Transformation
|
||||
Square Root Transformation
|
||||
Exponential Transformation
|
||||
Box-cox Transformation
|
||||
Quantile Transformation
|
||||
3.5 Feature Generation
|
||||
Missing Data Derived
|
||||
Simple Stats
|
||||
Crossing
|
||||
Ratio & Proportion
|
||||
Cross Product
|
||||
Polynomial
|
||||
Feature Learning by Tree
|
||||
Feature Learning by Deep Network
|
||||
|
||||
**4. Feature Selection**
|
||||
|
||||
4.1 Filter Method
|
||||
Variance
|
||||
Correlation
|
||||
Chi-Square
|
||||
Mutual Information Filter
|
||||
Univariate ROC-AUC or MSE
|
||||
Information Value (IV)
|
||||
4.2 Wrapper Method
|
||||
Forward Selection
|
||||
Backward Elimination
|
||||
Exhaustive Feature Selection
|
||||
Genetic Algorithm
|
||||
4.3 Embedded Method
|
||||
Lasso (L1)
|
||||
Random Forest Importance
|
||||
Gradient Boosted Trees Importance
|
||||
4.4 Feature Shuffling
|
||||
Random Shuffling
|
||||
4.5 Hybrid Method
|
||||
Recursive Feature Elimination
|
||||
Recursive Feature Addition
|
||||
|
||||
|
||||
|
||||
|
||||
## Motivation
|
||||
|
||||
Feature Engineering & Selection is the most essential part of building a usable machine learning project, even though hundreds of cutting-edge machine learning algorithms are emerging these days, such as deep learning and transfer learning. Indeed, as Prof. Pedro Domingos, the author of *The Master Algorithm*, says:
|
||||
|
||||
> “At the end of the day, some machine learning projects succeed and some fail. What makes the difference? Easily the most important factor is the features used.”
|
||||
>
|
||||
> — Prof. Pedro Domingos
|
||||
|
||||

|
||||
Data and features determine the upper limit of an ML project, while models and algorithms merely approach that limit. However, few materials can be found that systematically introduce the art of feature engineering, and even fewer explain the rationale behind it. This repo aims to be a practical guide to Feature Engineering & Selection.
|
||||
|
||||
|
||||
|
||||
## Key Links and Resources
|
||||
|
||||
- Udemy's Feature Engineering online course
|
||||
|
||||
https://www.udemy.com/feature-engineering-for-machine-learning/
|
||||
|
||||
- Udemy's Feature Selection online course
|
||||
|
||||
https://www.udemy.com/feature-selection-for-machine-learning
|
||||
|
||||
- JMLR Special Issue on Variable and Feature Selection
|
||||
|
||||
http://jmlr.org/papers/special/feature03.html
|
||||
|
||||
- Data Analysis Using Regression and Multilevel/Hierarchical Models, Chapter 25: Missing data
|
||||
|
||||
http://www.stat.columbia.edu/~gelman/arm/missing.pdf
|
||||
|
||||
- Data mining and the impact of missing data
|
||||
|
||||
http://core.ecu.edu/omgt/krosj/IMDSDataMining2003.pdf
|
||||
|
||||
- PyOD: A Python Toolkit for Scalable Outlier Detection
|
||||
|
||||
https://github.com/yzhao062/pyod
|
||||
|
||||
- Weight of Evidence (WoE) Introductory Overview
|
||||
|
||||
http://documentation.statsoft.com/StatisticaHelp.aspx?path=WeightofEvidence/WeightofEvidenceWoEIntroductoryOverview
|
||||
|
||||
- About Feature Scaling and Normalization
|
||||
|
||||
http://sebastianraschka.com/Articles/2014_about_feature_scaling.html
|
||||
|
||||
- Feature Generation with RF, GBDT and Xgboost
|
||||
|
||||
https://blog.csdn.net/anshuai_aw1/article/details/82983997
|
||||
|
||||
- A review of feature selection methods with applications
|
||||
|
||||
https://ieeexplore.ieee.org/iel7/7153596/7160221/07160458.pdf
|
||||
|
||||
|
||||
506
data/housing.data.txt
Normal file
@@ -0,0 +1,506 @@
|
||||
0.00632 18.00 2.310 0 0.5380 6.5750 65.20 4.0900 1 296.0 15.30 396.90 4.98 24.00
|
||||
0.02731 0.00 7.070 0 0.4690 6.4210 78.90 4.9671 2 242.0 17.80 396.90 9.14 21.60
|
||||
0.02729 0.00 7.070 0 0.4690 7.1850 61.10 4.9671 2 242.0 17.80 392.83 4.03 34.70
|
||||
0.03237 0.00 2.180 0 0.4580 6.9980 45.80 6.0622 3 222.0 18.70 394.63 2.94 33.40
|
||||
0.06905 0.00 2.180 0 0.4580 7.1470 54.20 6.0622 3 222.0 18.70 396.90 5.33 36.20
|
||||
0.02985 0.00 2.180 0 0.4580 6.4300 58.70 6.0622 3 222.0 18.70 394.12 5.21 28.70
|
||||
0.08829 12.50 7.870 0 0.5240 6.0120 66.60 5.5605 5 311.0 15.20 395.60 12.43 22.90
|
||||
0.14455 12.50 7.870 0 0.5240 6.1720 96.10 5.9505 5 311.0 15.20 396.90 19.15 27.10
|
||||
0.21124 12.50 7.870 0 0.5240 5.6310 100.00 6.0821 5 311.0 15.20 386.63 29.93 16.50
|
||||
0.17004 12.50 7.870 0 0.5240 6.0040 85.90 6.5921 5 311.0 15.20 386.71 17.10 18.90
|
||||
0.22489 12.50 7.870 0 0.5240 6.3770 94.30 6.3467 5 311.0 15.20 392.52 20.45 15.00
|
||||
0.11747 12.50 7.870 0 0.5240 6.0090 82.90 6.2267 5 311.0 15.20 396.90 13.27 18.90
|
||||
0.09378 12.50 7.870 0 0.5240 5.8890 39.00 5.4509 5 311.0 15.20 390.50 15.71 21.70
|
||||
0.62976 0.00 8.140 0 0.5380 5.9490 61.80 4.7075 4 307.0 21.00 396.90 8.26 20.40
|
||||
0.63796 0.00 8.140 0 0.5380 6.0960 84.50 4.4619 4 307.0 21.00 380.02 10.26 18.20
|
||||
0.62739 0.00 8.140 0 0.5380 5.8340 56.50 4.4986 4 307.0 21.00 395.62 8.47 19.90
|
||||
1.05393 0.00 8.140 0 0.5380 5.9350 29.30 4.4986 4 307.0 21.00 386.85 6.58 23.10
|
||||
0.78420 0.00 8.140 0 0.5380 5.9900 81.70 4.2579 4 307.0 21.00 386.75 14.67 17.50
|
||||
0.80271 0.00 8.140 0 0.5380 5.4560 36.60 3.7965 4 307.0 21.00 288.99 11.69 20.20
|
||||
0.72580 0.00 8.140 0 0.5380 5.7270 69.50 3.7965 4 307.0 21.00 390.95 11.28 18.20
|
||||
1.25179 0.00 8.140 0 0.5380 5.5700 98.10 3.7979 4 307.0 21.00 376.57 21.02 13.60
|
||||
0.85204 0.00 8.140 0 0.5380 5.9650 89.20 4.0123 4 307.0 21.00 392.53 13.83 19.60
|
||||
1.23247 0.00 8.140 0 0.5380 6.1420 91.70 3.9769 4 307.0 21.00 396.90 18.72 15.20
|
||||
0.98843 0.00 8.140 0 0.5380 5.8130 100.00 4.0952 4 307.0 21.00 394.54 19.88 14.50
|
||||
0.75026 0.00 8.140 0 0.5380 5.9240 94.10 4.3996 4 307.0 21.00 394.33 16.30 15.60
|
||||
0.84054 0.00 8.140 0 0.5380 5.5990 85.70 4.4546 4 307.0 21.00 303.42 16.51 13.90
|
||||
0.67191 0.00 8.140 0 0.5380 5.8130 90.30 4.6820 4 307.0 21.00 376.88 14.81 16.60
|
||||
0.95577 0.00 8.140 0 0.5380 6.0470 88.80 4.4534 4 307.0 21.00 306.38 17.28 14.80
|
||||
0.77299 0.00 8.140 0 0.5380 6.4950 94.40 4.4547 4 307.0 21.00 387.94 12.80 18.40
|
||||
1.00245 0.00 8.140 0 0.5380 6.6740 87.30 4.2390 4 307.0 21.00 380.23 11.98 21.00
|
||||
1.13081 0.00 8.140 0 0.5380 5.7130 94.10 4.2330 4 307.0 21.00 360.17 22.60 12.70
|
||||
1.35472 0.00 8.140 0 0.5380 6.0720 100.00 4.1750 4 307.0 21.00 376.73 13.04 14.50
|
||||
1.38799 0.00 8.140 0 0.5380 5.9500 82.00 3.9900 4 307.0 21.00 232.60 27.71 13.20
|
||||
1.15172 0.00 8.140 0 0.5380 5.7010 95.00 3.7872 4 307.0 21.00 358.77 18.35 13.10
|
||||
1.61282 0.00 8.140 0 0.5380 6.0960 96.90 3.7598 4 307.0 21.00 248.31 20.34 13.50
|
||||
0.06417 0.00 5.960 0 0.4990 5.9330 68.20 3.3603 5 279.0 19.20 396.90 9.68 18.90
|
||||
0.09744 0.00 5.960 0 0.4990 5.8410 61.40 3.3779 5 279.0 19.20 377.56 11.41 20.00
|
||||
0.08014 0.00 5.960 0 0.4990 5.8500 41.50 3.9342 5 279.0 19.20 396.90 8.77 21.00
|
||||
0.17505 0.00 5.960 0 0.4990 5.9660 30.20 3.8473 5 279.0 19.20 393.43 10.13 24.70
|
||||
0.02763 75.00 2.950 0 0.4280 6.5950 21.80 5.4011 3 252.0 18.30 395.63 4.32 30.80
|
||||
0.03359 75.00 2.950 0 0.4280 7.0240 15.80 5.4011 3 252.0 18.30 395.62 1.98 34.90
|
||||
0.12744 0.00 6.910 0 0.4480 6.7700 2.90 5.7209 3 233.0 17.90 385.41 4.84 26.60
|
||||
0.14150 0.00 6.910 0 0.4480 6.1690 6.60 5.7209 3 233.0 17.90 383.37 5.81 25.30
|
||||
0.15936 0.00 6.910 0 0.4480 6.2110 6.50 5.7209 3 233.0 17.90 394.46 7.44 24.70
|
||||
0.12269 0.00 6.910 0 0.4480 6.0690 40.00 5.7209 3 233.0 17.90 389.39 9.55 21.20
|
||||
0.17142 0.00 6.910 0 0.4480 5.6820 33.80 5.1004 3 233.0 17.90 396.90 10.21 19.30
|
||||
0.18836 0.00 6.910 0 0.4480 5.7860 33.30 5.1004 3 233.0 17.90 396.90 14.15 20.00
|
||||
0.22927 0.00 6.910 0 0.4480 6.0300 85.50 5.6894 3 233.0 17.90 392.74 18.80 16.60
|
||||
0.25387 0.00 6.910 0 0.4480 5.3990 95.30 5.8700 3 233.0 17.90 396.90 30.81 14.40
|
||||
0.21977 0.00 6.910 0 0.4480 5.6020 62.00 6.0877 3 233.0 17.90 396.90 16.20 19.40
|
||||
0.08873 21.00 5.640 0 0.4390 5.9630 45.70 6.8147 4 243.0 16.80 395.56 13.45 19.70
|
||||
0.04337 21.00 5.640 0 0.4390 6.1150 63.00 6.8147 4 243.0 16.80 393.97 9.43 20.50
|
||||
0.05360 21.00 5.640 0 0.4390 6.5110 21.10 6.8147 4 243.0 16.80 396.90 5.28 25.00
|
||||
0.04981 21.00 5.640 0 0.4390 5.9980 21.40 6.8147 4 243.0 16.80 396.90 8.43 23.40
|
||||
0.01360 75.00 4.000 0 0.4100 5.8880 47.60 7.3197 3 469.0 21.10 396.90 14.80 18.90
|
||||
0.01311 90.00 1.220 0 0.4030 7.2490 21.90 8.6966 5 226.0 17.90 395.93 4.81 35.40
|
||||
0.02055 85.00 0.740 0 0.4100 6.3830 35.70 9.1876 2 313.0 17.30 396.90 5.77 24.70
|
||||
0.01432 100.00 1.320 0 0.4110 6.8160 40.50 8.3248 5 256.0 15.10 392.90 3.95 31.60
|
||||
0.15445 25.00 5.130 0 0.4530 6.1450 29.20 7.8148 8 284.0 19.70 390.68 6.86 23.30
|
||||
0.10328 25.00 5.130 0 0.4530 5.9270 47.20 6.9320 8 284.0 19.70 396.90 9.22 19.60
|
||||
0.14932 25.00 5.130 0 0.4530 5.7410 66.20 7.2254 8 284.0 19.70 395.11 13.15 18.70
|
||||
0.17171 25.00 5.130 0 0.4530 5.9660 93.40 6.8185 8 284.0 19.70 378.08 14.44 16.00
|
||||
0.11027 25.00 5.130 0 0.4530 6.4560 67.80 7.2255 8 284.0 19.70 396.90 6.73 22.20
|
||||
0.12650 25.00 5.130 0 0.4530 6.7620 43.40 7.9809 8 284.0 19.70 395.58 9.50 25.00
|
||||
0.01951 17.50 1.380 0 0.4161 7.1040 59.50 9.2229 3 216.0 18.60 393.24 8.05 33.00
|
||||
0.03584 80.00 3.370 0 0.3980 6.2900 17.80 6.6115 4 337.0 16.10 396.90 4.67 23.50
|
||||
0.04379 80.00 3.370 0 0.3980 5.7870 31.10 6.6115 4 337.0 16.10 396.90 10.24 19.40
|
||||
0.05789 12.50 6.070 0 0.4090 5.8780 21.40 6.4980 4 345.0 18.90 396.21 8.10 22.00
|
||||
0.13554 12.50 6.070 0 0.4090 5.5940 36.80 6.4980 4 345.0 18.90 396.90 13.09 17.40
|
||||
0.12816 12.50 6.070 0 0.4090 5.8850 33.00 6.4980 4 345.0 18.90 396.90 8.79 20.90
|
||||
0.08826 0.00 10.810 0 0.4130 6.4170 6.60 5.2873 4 305.0 19.20 383.73 6.72 24.20
|
||||
0.15876 0.00 10.810 0 0.4130 5.9610 17.50 5.2873 4 305.0 19.20 376.94 9.88 21.70
|
||||
0.09164 0.00 10.810 0 0.4130 6.0650 7.80 5.2873 4 305.0 19.20 390.91 5.52 22.80
|
||||
0.19539 0.00 10.810 0 0.4130 6.2450 6.20 5.2873 4 305.0 19.20 377.17 7.54 23.40
|
||||
0.07896 0.00 12.830 0 0.4370 6.2730 6.00 4.2515 5 398.0 18.70 394.92 6.78 24.10
|
||||
0.09512 0.00 12.830 0 0.4370 6.2860 45.00 4.5026 5 398.0 18.70 383.23 8.94 21.40
|
||||
0.10153 0.00 12.830 0 0.4370 6.2790 74.50 4.0522 5 398.0 18.70 373.66 11.97 20.00
|
||||
0.08707 0.00 12.830 0 0.4370 6.1400 45.80 4.0905 5 398.0 18.70 386.96 10.27 20.80
|
||||
0.05646 0.00 12.830 0 0.4370 6.2320 53.70 5.0141 5 398.0 18.70 386.40 12.34 21.20
|
||||
0.08387 0.00 12.830 0 0.4370 5.8740 36.60 4.5026 5 398.0 18.70 396.06 9.10 20.30
|
||||
0.04113 25.00 4.860 0 0.4260 6.7270 33.50 5.4007 4 281.0 19.00 396.90 5.29 28.00
|
||||
0.04462 25.00 4.860 0 0.4260 6.6190 70.40 5.4007 4 281.0 19.00 395.63 7.22 23.90
|
||||
0.03659 25.00 4.860 0 0.4260 6.3020 32.20 5.4007 4 281.0 19.00 396.90 6.72 24.80
|
||||
0.03551 25.00 4.860 0 0.4260 6.1670 46.70 5.4007 4 281.0 19.00 390.64 7.51 22.90
|
||||
0.05059 0.00 4.490 0 0.4490 6.3890 48.00 4.7794 3 247.0 18.50 396.90 9.62 23.90
|
||||
0.05735 0.00 4.490 0 0.4490 6.6300 56.10 4.4377 3 247.0 18.50 392.30 6.53 26.60
|
||||
0.05188 0.00 4.490 0 0.4490 6.0150 45.10 4.4272 3 247.0 18.50 395.99 12.86 22.50
|
||||
0.07151 0.00 4.490 0 0.4490 6.1210 56.80 3.7476 3 247.0 18.50 395.15 8.44 22.20
|
||||
0.05660 0.00 3.410 0 0.4890 7.0070 86.30 3.4217 2 270.0 17.80 396.90 5.50 23.60
|
||||
0.05302 0.00 3.410 0 0.4890 7.0790 63.10 3.4145 2 270.0 17.80 396.06 5.70 28.70
|
||||
0.04684 0.00 3.410 0 0.4890 6.4170 66.10 3.0923 2 270.0 17.80 392.18 8.81 22.60
|
||||
0.03932 0.00 3.410 0 0.4890 6.4050 73.90 3.0921 2 270.0 17.80 393.55 8.20 22.00
|
||||
0.04203 28.00 15.040 0 0.4640 6.4420 53.60 3.6659 4 270.0 18.20 395.01 8.16 22.90
|
||||
0.02875 28.00 15.040 0 0.4640 6.2110 28.90 3.6659 4 270.0 18.20 396.33 6.21 25.00
|
||||
0.04294 28.00 15.040 0 0.4640 6.2490 77.30 3.6150 4 270.0 18.20 396.90 10.59 20.60
|
||||
0.12204 0.00 2.890 0 0.4450 6.6250 57.80 3.4952 2 276.0 18.00 357.98 6.65 28.40
|
||||
0.11504 0.00 2.890 0 0.4450 6.1630 69.60 3.4952 2 276.0 18.00 391.83 11.34 21.40
|
||||
0.12083 0.00 2.890 0 0.4450 8.0690 76.00 3.4952 2 276.0 18.00 396.90 4.21 38.70
|
||||
0.08187 0.00 2.890 0 0.4450 7.8200 36.90 3.4952 2 276.0 18.00 393.53 3.57 43.80
|
||||
0.06860 0.00 2.890 0 0.4450 7.4160 62.50 3.4952 2 276.0 18.00 396.90 6.19 33.20
|
||||
0.14866 0.00 8.560 0 0.5200 6.7270 79.90 2.7778 5 384.0 20.90 394.76 9.42 27.50
|
||||
0.11432 0.00 8.560 0 0.5200 6.7810 71.30 2.8561 5 384.0 20.90 395.58 7.67 26.50
|
||||
0.22876 0.00 8.560 0 0.5200 6.4050 85.40 2.7147 5 384.0 20.90 70.80 10.63 18.60
|
||||
0.21161 0.00 8.560 0 0.5200 6.1370 87.40 2.7147 5 384.0 20.90 394.47 13.44 19.30
|
||||
0.13960 0.00 8.560 0 0.5200 6.1670 90.00 2.4210 5 384.0 20.90 392.69 12.33 20.10
|
||||
0.13262 0.00 8.560 0 0.5200 5.8510 96.70 2.1069 5 384.0 20.90 394.05 16.47 19.50
|
||||
0.17120 0.00 8.560 0 0.5200 5.8360 91.90 2.2110 5 384.0 20.90 395.67 18.66 19.50
|
||||
0.13117 0.00 8.560 0 0.5200 6.1270 85.20 2.1224 5 384.0 20.90 387.69 14.09 20.40
|
||||
0.12802 0.00 8.560 0 0.5200 6.4740 97.10 2.4329 5 384.0 20.90 395.24 12.27 19.80
|
||||
0.26363 0.00 8.560 0 0.5200 6.2290 91.20 2.5451 5 384.0 20.90 391.23 15.55 19.40
|
||||
0.10793 0.00 8.560 0 0.5200 6.1950 54.40 2.7778 5 384.0 20.90 393.49 13.00 21.70
|
||||
0.10084 0.00 10.010 0 0.5470 6.7150 81.60 2.6775 6 432.0 17.80 395.59 10.16 22.80
|
||||
0.12329 0.00 10.010 0 0.5470 5.9130 92.90 2.3534 6 432.0 17.80 394.95 16.21 18.80
|
||||
0.22212 0.00 10.010 0 0.5470 6.0920 95.40 2.5480 6 432.0 17.80 396.90 17.09 18.70
|
||||
0.14231 0.00 10.010 0 0.5470 6.2540 84.20 2.2565 6 432.0 17.80 388.74 10.45 18.50
|
||||
0.17134 0.00 10.010 0 0.5470 5.9280 88.20 2.4631 6 432.0 17.80 344.91 15.76 18.30
|
||||
0.13158 0.00 10.010 0 0.5470 6.1760 72.50 2.7301 6 432.0 17.80 393.30 12.04 21.20
|
||||
0.15098 0.00 10.010 0 0.5470 6.0210 82.60 2.7474 6 432.0 17.80 394.51 10.30 19.20
|
||||
0.13058 0.00 10.010 0 0.5470 5.8720 73.10 2.4775 6 432.0 17.80 338.63 15.37 20.40
|
||||
0.14476 0.00 10.010 0 0.5470 5.7310 65.20 2.7592 6 432.0 17.80 391.50 13.61 19.30
|
||||
0.06899 0.00 25.650 0 0.5810 5.8700 69.70 2.2577 2 188.0 19.10 389.15 14.37 22.00
|
||||
0.07165 0.00 25.650 0 0.5810 6.0040 84.10 2.1974 2 188.0 19.10 377.67 14.27 20.30
|
||||
0.09299 0.00 25.650 0 0.5810 5.9610 92.90 2.0869 2 188.0 19.10 378.09 17.93 20.50
|
||||
0.15038 0.00 25.650 0 0.5810 5.8560 97.00 1.9444 2 188.0 19.10 370.31 25.41 17.30
|
||||
0.09849 0.00 25.650 0 0.5810 5.8790 95.80 2.0063 2 188.0 19.10 379.38 17.58 18.80
|
||||
0.16902 0.00 25.650 0 0.5810 5.9860 88.40 1.9929 2 188.0 19.10 385.02 14.81 21.40
|
||||
0.38735 0.00 25.650 0 0.5810 5.6130 95.60 1.7572 2 188.0 19.10 359.29 27.26 15.70
|
||||
0.25915 0.00 21.890 0 0.6240 5.6930 96.00 1.7883 4 437.0 21.20 392.11 17.19 16.20
|
||||
0.32543 0.00 21.890 0 0.6240 6.4310 98.80 1.8125 4 437.0 21.20 396.90 15.39 18.00
|
||||
0.88125 0.00 21.890 0 0.6240 5.6370 94.70 1.9799 4 437.0 21.20 396.90 18.34 14.30
|
||||
0.34006 0.00 21.890 0 0.6240 6.4580 98.90 2.1185 4 437.0 21.20 395.04 12.60 19.20
|
||||
1.19294 0.00 21.890 0 0.6240 6.3260 97.70 2.2710 4 437.0 21.20 396.90 12.26 19.60
|
||||
0.59005 0.00 21.890 0 0.6240 6.3720 97.90 2.3274 4 437.0 21.20 385.76 11.12 23.00
|
||||
0.32982 0.00 21.890 0 0.6240 5.8220 95.40 2.4699 4 437.0 21.20 388.69 15.03 18.40
|
||||
0.97617 0.00 21.890 0 0.6240 5.7570 98.40 2.3460 4 437.0 21.20 262.76 17.31 15.60
|
||||
0.55778 0.00 21.890 0 0.6240 6.3350 98.20 2.1107 4 437.0 21.20 394.67 16.96 18.10
|
||||
0.32264 0.00 21.890 0 0.6240 5.9420 93.50 1.9669 4 437.0 21.20 378.25 16.90 17.40
|
||||
0.35233 0.00 21.890 0 0.6240 6.4540 98.40 1.8498 4 437.0 21.20 394.08 14.59 17.10
|
||||
0.24980 0.00 21.890 0 0.6240 5.8570 98.20 1.6686 4 437.0 21.20 392.04 21.32 13.30
|
||||
0.54452 0.00 21.890 0 0.6240 6.1510 97.90 1.6687 4 437.0 21.20 396.90 18.46 17.80
|
||||
0.29090 0.00 21.890 0 0.6240 6.1740 93.60 1.6119 4 437.0 21.20 388.08 24.16 14.00
|
||||
1.62864 0.00 21.890 0 0.6240 5.0190 100.00 1.4394 4 437.0 21.20 396.90 34.41 14.40
|
||||
3.32105 0.00 19.580 1 0.8710 5.4030 100.00 1.3216 5 403.0 14.70 396.90 26.82 13.40
|
||||
4.09740 0.00 19.580 0 0.8710 5.4680 100.00 1.4118 5 403.0 14.70 396.90 26.42 15.60
|
||||
2.77974 0.00 19.580 0 0.8710 4.9030 97.80 1.3459 5 403.0 14.70 396.90 29.29 11.80
|
||||
2.37934 0.00 19.580 0 0.8710 6.1300 100.00 1.4191 5 403.0 14.70 172.91 27.80 13.80
|
||||
2.15505 0.00 19.580 0 0.8710 5.6280 100.00 1.5166 5 403.0 14.70 169.27 16.65 15.60
|
||||
2.36862 0.00 19.580 0 0.8710 4.9260 95.70 1.4608 5 403.0 14.70 391.71 29.53 14.60
|
||||
2.33099 0.00 19.580 0 0.8710 5.1860 93.80 1.5296 5 403.0 14.70 356.99 28.32 17.80
|
||||
2.73397 0.00 19.580 0 0.8710 5.5970 94.90 1.5257 5 403.0 14.70 351.85 21.45 15.40
|
||||
1.65660 0.00 19.580 0 0.8710 6.1220 97.30 1.6180 5 403.0 14.70 372.80 14.10 21.50
|
||||
1.49632 0.00 19.580 0 0.8710 5.4040 100.00 1.5916 5 403.0 14.70 341.60 13.28 19.60
|
||||
1.12658 0.00 19.580 1 0.8710 5.0120 88.00 1.6102 5 403.0 14.70 343.28 12.12 15.30
|
||||
2.14918 0.00 19.580 0 0.8710 5.7090 98.50 1.6232 5 403.0 14.70 261.95 15.79 19.40
|
||||
1.41385 0.00 19.580 1 0.8710 6.1290 96.00 1.7494 5 403.0 14.70 321.02 15.12 17.00
|
||||
3.53501 0.00 19.580 1 0.8710 6.1520 82.60 1.7455 5 403.0 14.70 88.01 15.02 15.60
|
||||
2.44668 0.00 19.580 0 0.8710 5.2720 94.00 1.7364 5 403.0 14.70 88.63 16.14 13.10
|
||||
1.22358 0.00 19.580 0 0.6050 6.9430 97.40 1.8773 5 403.0 14.70 363.43 4.59 41.30
|
||||
1.34284 0.00 19.580 0 0.6050 6.0660 100.00 1.7573 5 403.0 14.70 353.89 6.43 24.30
|
||||
1.42502 0.00 19.580 0 0.8710 6.5100 100.00 1.7659 5 403.0 14.70 364.31 7.39 23.30
|
||||
1.27346 0.00 19.580 1 0.6050 6.2500 92.60 1.7984 5 403.0 14.70 338.92 5.50 27.00
|
||||
1.46336 0.00 19.580 0 0.6050 7.4890 90.80 1.9709 5 403.0 14.70 374.43 1.73 50.00
|
||||
1.83377 0.00 19.580 1 0.6050 7.8020 98.20 2.0407 5 403.0 14.70 389.61 1.92 50.00
|
||||
1.51902 0.00 19.580 1 0.6050 8.3750 93.90 2.1620 5 403.0 14.70 388.45 3.32 50.00
|
||||
2.24236 0.00 19.580 0 0.6050 5.8540 91.80 2.4220 5 403.0 14.70 395.11 11.64 22.70
|
||||
2.92400 0.00 19.580 0 0.6050 6.1010 93.00 2.2834 5 403.0 14.70 240.16 9.81 25.00
|
||||
2.01019 0.00 19.580 0 0.6050 7.9290 96.20 2.0459 5 403.0 14.70 369.30 3.70 50.00
|
||||
1.80028 0.00 19.580 0 0.6050 5.8770 79.20 2.4259 5 403.0 14.70 227.61 12.14 23.80
|
||||
2.30040 0.00 19.580 0 0.6050 6.3190 96.10 2.1000 5 403.0 14.70 297.09 11.10 23.80
|
||||
2.44953 0.00 19.580 0 0.6050 6.4020 95.20 2.2625 5 403.0 14.70 330.04 11.32 22.30
|
||||
1.20742 0.00 19.580 0 0.6050 5.8750 94.60 2.4259 5 403.0 14.70 292.29 14.43 17.40
|
||||
2.31390 0.00 19.580 0 0.6050 5.8800 97.30 2.3887 5 403.0 14.70 348.13 12.03 19.10
|
||||
0.13914 0.00 4.050 0 0.5100 5.5720 88.50 2.5961 5 296.0 16.60 396.90 14.69 23.10
|
||||
0.09178 0.00 4.050 0 0.5100 6.4160 84.10 2.6463 5 296.0 16.60 395.50 9.04 23.60
|
||||
0.08447 0.00 4.050 0 0.5100 5.8590 68.70 2.7019 5 296.0 16.60 393.23 9.64 22.60
|
||||
0.06664 0.00 4.050 0 0.5100 6.5460 33.10 3.1323 5 296.0 16.60 390.96 5.33 29.40
|
||||
0.07022 0.00 4.050 0 0.5100 6.0200 47.20 3.5549 5 296.0 16.60 393.23 10.11 23.20
|
||||
0.05425 0.00 4.050 0 0.5100 6.3150 73.40 3.3175 5 296.0 16.60 395.60 6.29 24.60
|
||||
0.06642 0.00 4.050 0 0.5100 6.8600 74.40 2.9153 5 296.0 16.60 391.27 6.92 29.90
|
||||
0.05780 0.00 2.460 0 0.4880 6.9800 58.40 2.8290 3 193.0 17.80 396.90 5.04 37.20
|
||||
0.06588 0.00 2.460 0 0.4880 7.7650 83.30 2.7410 3 193.0 17.80 395.56 7.56 39.80
|
||||
0.06888 0.00 2.460 0 0.4880 6.1440 62.20 2.5979 3 193.0 17.80 396.90 9.45 36.20
|
||||
0.09103 0.00 2.460 0 0.4880 7.1550 92.20 2.7006 3 193.0 17.80 394.12 4.82 37.90
|
||||
0.10008 0.00 2.460 0 0.4880 6.5630 95.60 2.8470 3 193.0 17.80 396.90 5.68 32.50
|
||||
0.08308 0.00 2.460 0 0.4880 5.6040 89.80 2.9879 3 193.0 17.80 391.00 13.98 26.40
|
||||
0.06047 0.00 2.460 0 0.4880 6.1530 68.80 3.2797 3 193.0 17.80 387.11 13.15 29.60
|
||||
0.05602 0.00 2.460 0 0.4880 7.8310 53.60 3.1992 3 193.0 17.80 392.63 4.45 50.00
|
||||
0.07875 45.00 3.440 0 0.4370 6.7820 41.10 3.7886 5 398.0 15.20 393.87 6.68 32.00
|
||||
0.12579 45.00 3.440 0 0.4370 6.5560 29.10 4.5667 5 398.0 15.20 382.84 4.56 29.80
|
||||
0.08370 45.00 3.440 0 0.4370 7.1850 38.90 4.5667 5 398.0 15.20 396.90 5.39 34.90
|
||||
0.09068 45.00 3.440 0 0.4370 6.9510 21.50 6.4798 5 398.0 15.20 377.68 5.10 37.00
|
||||
0.06911 45.00 3.440 0 0.4370 6.7390 30.80 6.4798 5 398.0 15.20 389.71 4.69 30.50
|
||||
0.08664 45.00 3.440 0 0.4370 7.1780 26.30 6.4798 5 398.0 15.20 390.49 2.87 36.40
|
||||
0.02187 60.00 2.930 0 0.4010 6.8000 9.90 6.2196 1 265.0 15.60 393.37 5.03 31.10
|
||||
0.01439 60.00 2.930 0 0.4010 6.6040 18.80 6.2196 1 265.0 15.60 376.70 4.38 29.10
|
||||
0.01381 80.00 0.460 0 0.4220 7.8750 32.00 5.6484 4 255.0 14.40 394.23 2.97 50.00
|
||||
0.04011 80.00 1.520 0 0.4040 7.2870 34.10 7.3090 2 329.0 12.60 396.90 4.08 33.30
|
||||
0.04666 80.00 1.520 0 0.4040 7.1070 36.60 7.3090 2 329.0 12.60 354.31 8.61 30.30
|
||||
0.03768 80.00 1.520 0 0.4040 7.2740 38.30 7.3090 2 329.0 12.60 392.20 6.62 34.60
|
||||
0.03150 95.00 1.470 0 0.4030 6.9750 15.30 7.6534 3 402.0 17.00 396.90 4.56 34.90
|
||||
0.01778 95.00 1.470 0 0.4030 7.1350 13.90 7.6534 3 402.0 17.00 384.30 4.45 32.90
|
||||
0.03445 82.50 2.030 0 0.4150 6.1620 38.40 6.2700 2 348.0 14.70 393.77 7.43 24.10
|
||||
0.02177 82.50 2.030 0 0.4150 7.6100 15.70 6.2700 2 348.0 14.70 395.38 3.11 42.30
|
||||
0.03510 95.00 2.680 0 0.4161 7.8530 33.20 5.1180 4 224.0 14.70 392.78 3.81 48.50
|
||||
0.02009 95.00 2.680 0 0.4161 8.0340 31.90 5.1180 4 224.0 14.70 390.55 2.88 50.00
|
||||
0.13642 0.00 10.590 0 0.4890 5.8910 22.30 3.9454 4 277.0 18.60 396.90 10.87 22.60
|
||||
0.22969 0.00 10.590 0 0.4890 6.3260 52.50 4.3549 4 277.0 18.60 394.87 10.97 24.40
|
||||
0.25199 0.00 10.590 0 0.4890 5.7830 72.70 4.3549 4 277.0 18.60 389.43 18.06 22.50
|
||||
0.13587 0.00 10.590 1 0.4890 6.0640 59.10 4.2392 4 277.0 18.60 381.32 14.66 24.40
|
||||
0.43571 0.00 10.590 1 0.4890 5.3440 100.00 3.8750 4 277.0 18.60 396.90 23.09 20.00
|
||||
0.17446 0.00 10.590 1 0.4890 5.9600 92.10 3.8771 4 277.0 18.60 393.25 17.27 21.70
|
||||
0.37578 0.00 10.590 1 0.4890 5.4040 88.60 3.6650 4 277.0 18.60 395.24 23.98 19.30
|
||||
0.21719 0.00 10.590 1 0.4890 5.8070 53.80 3.6526 4 277.0 18.60 390.94 16.03 22.40
|
||||
0.14052 0.00 10.590 0 0.4890 6.3750 32.30 3.9454 4 277.0 18.60 385.81 9.38 28.10
|
||||
0.28955 0.00 10.590 0 0.4890 5.4120 9.80 3.5875 4 277.0 18.60 348.93 29.55 23.70
|
||||
0.19802 0.00 10.590 0 0.4890 6.1820 42.40 3.9454 4 277.0 18.60 393.63 9.47 25.00
|
||||
0.04560 0.00 13.890 1 0.5500 5.8880 56.00 3.1121 5 276.0 16.40 392.80 13.51 23.30
|
||||
0.07013 0.00 13.890 0 0.5500 6.6420 85.10 3.4211 5 276.0 16.40 392.78 9.69 28.70
|
||||
0.11069 0.00 13.890 1 0.5500 5.9510 93.80 2.8893 5 276.0 16.40 396.90 17.92 21.50
|
||||
0.11425 0.00 13.890 1 0.5500 6.3730 92.40 3.3633 5 276.0 16.40 393.74 10.50 23.00
|
||||
0.35809 0.00 6.200 1 0.5070 6.9510 88.50 2.8617 8 307.0 17.40 391.70 9.71 26.70
|
||||
0.40771 0.00 6.200 1 0.5070 6.1640 91.30 3.0480 8 307.0 17.40 395.24 21.46 21.70
|
||||
0.62356 0.00 6.200 1 0.5070 6.8790 77.70 3.2721 8 307.0 17.40 390.39 9.93 27.50
|
||||
0.61470 0.00 6.200 0 0.5070 6.6180 80.80 3.2721 8 307.0 17.40 396.90 7.60 30.10
|
||||
0.31533 0.00 6.200 0 0.5040 8.2660 78.30 2.8944 8 307.0 17.40 385.05 4.14 44.80
|
||||
0.52693 0.00 6.200 0 0.5040 8.7250 83.00 2.8944 8 307.0 17.40 382.00 4.63 50.00
|
||||
0.38214 0.00 6.200 0 0.5040 8.0400 86.50 3.2157 8 307.0 17.40 387.38 3.13 37.60
|
||||
0.41238 0.00 6.200 0 0.5040 7.1630 79.90 3.2157 8 307.0 17.40 372.08 6.36 31.60
|
||||
0.29819 0.00 6.200 0 0.5040 7.6860 17.00 3.3751 8 307.0 17.40 377.51 3.92 46.70
|
||||
0.44178 0.00 6.200 0 0.5040 6.5520 21.40 3.3751 8 307.0 17.40 380.34 3.76 31.50
|
||||
0.53700 0.00 6.200 0 0.5040 5.9810 68.10 3.6715 8 307.0 17.40 378.35 11.65 24.30
|
||||
0.46296 0.00 6.200 0 0.5040 7.4120 76.90 3.6715 8 307.0 17.40 376.14 5.25 31.70
|
||||
0.57529 0.00 6.200 0 0.5070 8.3370 73.30 3.8384 8 307.0 17.40 385.91 2.47 41.70
|
||||
0.33147 0.00 6.200 0 0.5070 8.2470 70.40 3.6519 8 307.0 17.40 378.95 3.95 48.30
|
||||
0.44791 0.00 6.200 1 0.5070 6.7260 66.50 3.6519 8 307.0 17.40 360.20 8.05 29.00
|
||||
0.33045 0.00 6.200 0 0.5070 6.0860 61.50 3.6519 8 307.0 17.40 376.75 10.88 24.00
|
||||
0.52058 0.00 6.200 1 0.5070 6.6310 76.50 4.1480 8 307.0 17.40 388.45 9.54 25.10
|
||||
0.51183 0.00 6.200 0 0.5070 7.3580 71.60 4.1480 8 307.0 17.40 390.07 4.73 31.50
|
||||
0.08244 30.00 4.930 0 0.4280 6.4810 18.50 6.1899 6 300.0 16.60 379.41 6.36 23.70
|
||||
0.09252 30.00 4.930 0 0.4280 6.6060 42.20 6.1899 6 300.0 16.60 383.78 7.37 23.30
|
||||
0.11329 30.00 4.930 0 0.4280 6.8970 54.30 6.3361 6 300.0 16.60 391.25 11.38 22.00
|
||||
0.10612 30.00 4.930 0 0.4280 6.0950 65.10 6.3361 6 300.0 16.60 394.62 12.40 20.10
|
||||
0.10290 30.00 4.930 0 0.4280 6.3580 52.90 7.0355 6 300.0 16.60 372.75 11.22 22.20
|
||||
0.12757 30.00 4.930 0 0.4280 6.3930 7.80 7.0355 6 300.0 16.60 374.71 5.19 23.70
|
||||
0.20608 22.00 5.860 0 0.4310 5.5930 76.50 7.9549 7 330.0 19.10 372.49 12.50 17.60
|
||||
0.19133 22.00 5.860 0 0.4310 5.6050 70.20 7.9549 7 330.0 19.10 389.13 18.46 18.50
|
||||
0.33983 22.00 5.860 0 0.4310 6.1080 34.90 8.0555 7 330.0 19.10 390.18 9.16 24.30
|
||||
0.19657 22.00 5.860 0 0.4310 6.2260 79.20 8.0555 7 330.0 19.10 376.14 10.15 20.50
|
||||
0.16439 22.00 5.860 0 0.4310 6.4330 49.10 7.8265 7 330.0 19.10 374.71 9.52 24.50
|
||||
0.19073 22.00 5.860 0 0.4310 6.7180 17.50 7.8265 7 330.0 19.10 393.74 6.56 26.20
|
||||
0.14030 22.00 5.860 0 0.4310 6.4870 13.00 7.3967 7 330.0 19.10 396.28 5.90 24.40
|
||||
0.21409 22.00 5.860 0 0.4310 6.4380 8.90 7.3967 7 330.0 19.10 377.07 3.59 24.80
|
||||
0.08221 22.00 5.860 0 0.4310 6.9570 6.80 8.9067 7 330.0 19.10 386.09 3.53 29.60
|
||||
0.36894 22.00 5.860 0 0.4310 8.2590 8.40 8.9067 7 330.0 19.10 396.90 3.54 42.80
|
||||
0.04819 80.00 3.640 0 0.3920 6.1080 32.00 9.2203 1 315.0 16.40 392.89 6.57 21.90
|
||||
0.03548 80.00 3.640 0 0.3920 5.8760 19.10 9.2203 1 315.0 16.40 395.18 9.25 20.90
|
||||
0.01538 90.00 3.750 0 0.3940 7.4540 34.20 6.3361 3 244.0 15.90 386.34 3.11 44.00
|
||||
0.61154 20.00 3.970 0 0.6470 8.7040 86.90 1.8010 5 264.0 13.00 389.70 5.12 50.00
|
||||
0.66351 20.00 3.970 0 0.6470 7.3330 100.00 1.8946 5 264.0 13.00 383.29 7.79 36.00
|
||||
0.65665 20.00 3.970 0 0.6470 6.8420 100.00 2.0107 5 264.0 13.00 391.93 6.90 30.10
|
||||
0.54011 20.00 3.970 0 0.6470 7.2030 81.80 2.1121 5 264.0 13.00 392.80 9.59 33.80
|
||||
0.53412 20.00 3.970 0 0.6470 7.5200 89.40 2.1398 5 264.0 13.00 388.37 7.26 43.10
|
||||
0.52014 20.00 3.970 0 0.6470 8.3980 91.50 2.2885 5 264.0 13.00 386.86 5.91 48.80
|
||||
0.82526 20.00 3.970 0 0.6470 7.3270 94.50 2.0788 5 264.0 13.00 393.42 11.25 31.00
|
||||
0.55007 20.00 3.970 0 0.6470 7.2060 91.60 1.9301 5 264.0 13.00 387.89 8.10 36.50
|
||||
0.76162 20.00 3.970 0 0.6470 5.5600 62.80 1.9865 5 264.0 13.00 392.40 10.45 22.80
|
||||
0.78570 20.00 3.970 0 0.6470 7.0140 84.60 2.1329 5 264.0 13.00 384.07 14.79 30.70
|
||||
0.57834 20.00 3.970 0 0.5750 8.2970 67.00 2.4216 5 264.0 13.00 384.54 7.44 50.00
|
||||
0.54050 20.00 3.970 0 0.5750 7.4700 52.60 2.8720 5 264.0 13.00 390.30 3.16 43.50
|
||||
0.09065 20.00 6.960 1 0.4640 5.9200 61.50 3.9175 3 223.0 18.60 391.34 13.65 20.70
|
||||
0.29916 20.00 6.960 0 0.4640 5.8560 42.10 4.4290 3 223.0 18.60 388.65 13.00 21.10
|
||||
0.16211 20.00 6.960 0 0.4640 6.2400 16.30 4.4290 3 223.0 18.60 396.90 6.59 25.20
|
||||
0.11460 20.00 6.960 0 0.4640 6.5380 58.70 3.9175 3 223.0 18.60 394.96 7.73 24.40
|
||||
0.22188 20.00 6.960 1 0.4640 7.6910 51.80 4.3665 3 223.0 18.60 390.77 6.58 35.20
|
||||
0.05644 40.00 6.410 1 0.4470 6.7580 32.90 4.0776 4 254.0 17.60 396.90 3.53 32.40
|
||||
0.09604 40.00 6.410 0 0.4470 6.8540 42.80 4.2673 4 254.0 17.60 396.90 2.98 32.00
|
||||
0.10469 40.00 6.410 1 0.4470 7.2670 49.00 4.7872 4 254.0 17.60 389.25 6.05 33.20
|
||||
0.06127 40.00 6.410 1 0.4470 6.8260 27.60 4.8628 4 254.0 17.60 393.45 4.16 33.10
|
||||
0.07978 40.00 6.410 0 0.4470 6.4820 32.10 4.1403 4 254.0 17.60 396.90 7.19 29.10
|
||||
0.21038 20.00 3.330 0 0.4429 6.8120 32.20 4.1007 5 216.0 14.90 396.90 4.85 35.10
|
||||
0.03578 20.00 3.330 0 0.4429 7.8200 64.50 4.6947 5 216.0 14.90 387.31 3.76 45.40
|
||||
0.03705 20.00 3.330 0 0.4429 6.9680 37.20 5.2447 5 216.0 14.90 392.23 4.59 35.40
|
||||
0.06129 20.00 3.330 1 0.4429 7.6450 49.70 5.2119 5 216.0 14.90 377.07 3.01 46.00
|
||||
0.01501 90.00 1.210 1 0.4010 7.9230 24.80 5.8850 1 198.0 13.60 395.52 3.16 50.00
|
||||
0.00906 90.00 2.970 0 0.4000 7.0880 20.80 7.3073 1 285.0 15.30 394.72 7.85 32.20
|
||||
0.01096 55.00 2.250 0 0.3890 6.4530 31.90 7.3073 1 300.0 15.30 394.72 8.23 22.00
|
||||
0.01965 80.00 1.760 0 0.3850 6.2300 31.50 9.0892 1 241.0 18.20 341.60 12.93 20.10
|
||||
0.03871 52.50 5.320 0 0.4050 6.2090 31.30 7.3172 6 293.0 16.60 396.90 7.14 23.20
|
||||
0.04590 52.50 5.320 0 0.4050 6.3150 45.60 7.3172 6 293.0 16.60 396.90 7.60 22.30
|
||||
0.04297 52.50 5.320 0 0.4050 6.5650 22.90 7.3172 6 293.0 16.60 371.72 9.51 24.80
|
||||
0.03502 80.00 4.950 0 0.4110 6.8610 27.90 5.1167 4 245.0 19.20 396.90 3.33 28.50
|
||||
0.07886 80.00 4.950 0 0.4110 7.1480 27.70 5.1167 4 245.0 19.20 396.90 3.56 37.30
|
||||
0.03615 80.00 4.950 0 0.4110 6.6300 23.40 5.1167 4 245.0 19.20 396.90 4.70 27.90
|
||||
0.08265 0.00 13.920 0 0.4370 6.1270 18.40 5.5027 4 289.0 16.00 396.90 8.58 23.90
|
||||
0.08199 0.00 13.920 0 0.4370 6.0090 42.30 5.5027 4 289.0 16.00 396.90 10.40 21.70
|
||||
0.12932 0.00 13.920 0 0.4370 6.6780 31.10 5.9604 4 289.0 16.00 396.90 6.27 28.60
|
||||
0.05372 0.00 13.920 0 0.4370 6.5490 51.00 5.9604 4 289.0 16.00 392.85 7.39 27.10
|
||||
0.14103 0.00 13.920 0 0.4370 5.7900 58.00 6.3200 4 289.0 16.00 396.90 15.84 20.30
|
||||
0.06466 70.00 2.240 0 0.4000 6.3450 20.10 7.8278 5 358.0 14.80 368.24 4.97 22.50
|
||||
0.05561 70.00 2.240 0 0.4000 7.0410 10.00 7.8278 5 358.0 14.80 371.58 4.74 29.00
|
||||
0.04417 70.00 2.240 0 0.4000 6.8710 47.40 7.8278 5 358.0 14.80 390.86 6.07 24.80
|
||||
0.03537 34.00 6.090 0 0.4330 6.5900 40.40 5.4917 7 329.0 16.10 395.75 9.50 22.00
|
||||
0.09266 34.00 6.090 0 0.4330 6.4950 18.40 5.4917 7 329.0 16.10 383.61 8.67 26.40
|
||||
0.10000 34.00 6.090 0 0.4330 6.9820 17.70 5.4917 7 329.0 16.10 390.43 4.86 33.10
|
||||
0.05515 33.00 2.180 0 0.4720 7.2360 41.10 4.0220 7 222.0 18.40 393.68 6.93 36.10
|
||||
0.05479 33.00 2.180 0 0.4720 6.6160 58.10 3.3700 7 222.0 18.40 393.36 8.93 28.40
|
||||
0.07503 33.00 2.180 0 0.4720 7.4200 71.90 3.0992 7 222.0 18.40 396.90 6.47 33.40
|
||||
0.04932 33.00 2.180 0 0.4720 6.8490 70.30 3.1827 7 222.0 18.40 396.90 7.53 28.20
|
||||
0.49298 0.00 9.900 0 0.5440 6.6350 82.50 3.3175 4 304.0 18.40 396.90 4.54 22.80
|
||||
0.34940 0.00 9.900 0 0.5440 5.9720 76.70 3.1025 4 304.0 18.40 396.24 9.97 20.30
|
||||
2.63548 0.00 9.900 0 0.5440 4.9730 37.80 2.5194 4 304.0 18.40 350.45 12.64 16.10
|
||||
0.79041 0.00 9.900 0 0.5440 6.1220 52.80 2.6403 4 304.0 18.40 396.90 5.98 22.10
|
||||
0.26169 0.00 9.900 0 0.5440 6.0230 90.40 2.8340 4 304.0 18.40 396.30 11.72 19.40
|
||||
0.26938 0.00 9.900 0 0.5440 6.2660 82.80 3.2628 4 304.0 18.40 393.39 7.90 21.60
|
||||
0.36920 0.00 9.900 0 0.5440 6.5670 87.30 3.6023 4 304.0 18.40 395.69 9.28 23.80
|
||||
0.25356 0.00 9.900 0 0.5440 5.7050 77.70 3.9450 4 304.0 18.40 396.42 11.50 16.20
|
||||
0.31827 0.00 9.900 0 0.5440 5.9140 83.20 3.9986 4 304.0 18.40 390.70 18.33 17.80
|
||||
0.24522 0.00 9.900 0 0.5440 5.7820 71.70 4.0317 4 304.0 18.40 396.90 15.94 19.80
|
||||
0.40202 0.00 9.900 0 0.5440 6.3820 67.20 3.5325 4 304.0 18.40 395.21 10.36 23.10
|
||||
0.47547 0.00 9.900 0 0.5440 6.1130 58.80 4.0019 4 304.0 18.40 396.23 12.73 21.00
|
||||
0.16760 0.00 7.380 0 0.4930 6.4260 52.30 4.5404 5 287.0 19.60 396.90 7.20 23.80
|
||||
0.18159 0.00 7.380 0 0.4930 6.3760 54.30 4.5404 5 287.0 19.60 396.90 6.87 23.10
|
||||
0.35114 0.00 7.380 0 0.4930 6.0410 49.90 4.7211 5 287.0 19.60 396.90 7.70 20.40
|
||||
0.28392 0.00 7.380 0 0.4930 5.7080 74.30 4.7211 5 287.0 19.60 391.13 11.74 18.50
|
||||
0.34109 0.00 7.380 0 0.4930 6.4150 40.10 4.7211 5 287.0 19.60 396.90 6.12 25.00
|
||||
0.19186 0.00 7.380 0 0.4930 6.4310 14.70 5.4159 5 287.0 19.60 393.68 5.08 24.60
|
||||
0.30347 0.00 7.380 0 0.4930 6.3120 28.90 5.4159 5 287.0 19.60 396.90 6.15 23.00
|
||||
0.24103 0.00 7.380 0 0.4930 6.0830 43.70 5.4159 5 287.0 19.60 396.90 12.79 22.20
|
||||
0.06617 0.00 3.240 0 0.4600 5.8680 25.80 5.2146 4 430.0 16.90 382.44 9.97 19.30
|
||||
0.06724 0.00 3.240 0 0.4600 6.3330 17.20 5.2146 4 430.0 16.90 375.21 7.34 22.60
|
||||
0.04544 0.00 3.240 0 0.4600 6.1440 32.20 5.8736 4 430.0 16.90 368.57 9.09 19.80
|
||||
0.05023 35.00 6.060 0 0.4379 5.7060 28.40 6.6407 1 304.0 16.90 394.02 12.43 17.10
|
||||
0.03466 35.00 6.060 0 0.4379 6.0310 23.30 6.6407 1 304.0 16.90 362.25 7.83 19.40
|
||||
0.05083 0.00 5.190 0 0.5150 6.3160 38.10 6.4584 5 224.0 20.20 389.71 5.68 22.20
|
||||
0.03738 0.00 5.190 0 0.5150 6.3100 38.50 6.4584 5 224.0 20.20 389.40 6.75 20.70
|
||||
0.03961 0.00 5.190 0 0.5150 6.0370 34.50 5.9853 5 224.0 20.20 396.90 8.01 21.10
|
||||
0.03427 0.00 5.190 0 0.5150 5.8690 46.30 5.2311 5 224.0 20.20 396.90 9.80 19.50
|
||||
0.03041 0.00 5.190 0 0.5150 5.8950 59.60 5.6150 5 224.0 20.20 394.81 10.56 18.50
|
||||
0.03306 0.00 5.190 0 0.5150 6.0590 37.30 4.8122 5 224.0 20.20 396.14 8.51 20.60
|
||||
0.05497 0.00 5.190 0 0.5150 5.9850 45.40 4.8122 5 224.0 20.20 396.90 9.74 19.00
|
||||
0.06151 0.00 5.190 0 0.5150 5.9680 58.50 4.8122 5 224.0 20.20 396.90 9.29 18.70
|
||||
0.01301 35.00 1.520 0 0.4420 7.2410 49.30 7.0379 1 284.0 15.50 394.74 5.49 32.70
|
||||
0.02498 0.00 1.890 0 0.5180 6.5400 59.70 6.2669 1 422.0 15.90 389.96 8.65 16.50
|
||||
0.02543 55.00 3.780 0 0.4840 6.6960 56.40 5.7321 5 370.0 17.60 396.90 7.18 23.90
|
||||
0.03049 55.00 3.780 0 0.4840 6.8740 28.10 6.4654 5 370.0 17.60 387.97 4.61 31.20
|
||||
0.03113 0.00 4.390 0 0.4420 6.0140 48.50 8.0136 3 352.0 18.80 385.64 10.53 17.50
|
||||
0.06162 0.00 4.390 0 0.4420 5.8980 52.30 8.0136 3 352.0 18.80 364.61 12.67 17.20
|
||||
0.01870 85.00 4.150 0 0.4290 6.5160 27.70 8.5353 4 351.0 17.90 392.43 6.36 23.10
|
||||
0.01501 80.00 2.010 0 0.4350 6.6350 29.70 8.3440 4 280.0 17.00 390.94 5.99 24.50
|
||||
0.02899 40.00 1.250 0 0.4290 6.9390 34.50 8.7921 1 335.0 19.70 389.85 5.89 26.60
|
||||
0.06211 40.00 1.250 0 0.4290 6.4900 44.40 8.7921 1 335.0 19.70 396.90 5.98 22.90
|
||||
0.07950 60.00 1.690 0 0.4110 6.5790 35.90 10.7103 4 411.0 18.30 370.78 5.49 24.10
|
||||
0.07244 60.00 1.690 0 0.4110 5.8840 18.50 10.7103 4 411.0 18.30 392.33 7.79 18.60
|
||||
0.01709 90.00 2.020 0 0.4100 6.7280 36.10 12.1265 5 187.0 17.00 384.46 4.50 30.10
|
||||
0.04301 80.00 1.910 0 0.4130 5.6630 21.90 10.5857 4 334.0 22.00 382.80 8.05 18.20
|
||||
0.10659 80.00 1.910 0 0.4130 5.9360 19.50 10.5857 4 334.0 22.00 376.04 5.57 20.60
|
||||
8.98296 0.00 18.100 1 0.7700 6.2120 97.40 2.1222 24 666.0 20.20 377.73 17.60 17.80
|
||||
3.84970 0.00 18.100 1 0.7700 6.3950 91.00 2.5052 24 666.0 20.20 391.34 13.27 21.70
|
||||
5.20177 0.00 18.100 1 0.7700 6.1270 83.40 2.7227 24 666.0 20.20 395.43 11.48 22.70
|
||||
4.26131 0.00 18.100 0 0.7700 6.1120 81.30 2.5091 24 666.0 20.20 390.74 12.67 22.60
|
||||
4.54192 0.00 18.100 0 0.7700 6.3980 88.00 2.5182 24 666.0 20.20 374.56 7.79 25.00
|
||||
3.83684 0.00 18.100 0 0.7700 6.2510 91.10 2.2955 24 666.0 20.20 350.65 14.19 19.90
|
||||
3.67822 0.00 18.100 0 0.7700 5.3620 96.20 2.1036 24 666.0 20.20 380.79 10.19 20.80
|
||||
4.22239 0.00 18.100 1 0.7700 5.8030 89.00 1.9047 24 666.0 20.20 353.04 14.64 16.80
|
||||
3.47428 0.00 18.100 1 0.7180 8.7800 82.90 1.9047 24 666.0 20.20 354.55 5.29 21.90
|
||||
4.55587 0.00 18.100 0 0.7180 3.5610 87.90 1.6132 24 666.0 20.20 354.70 7.12 27.50
|
||||
3.69695 0.00 18.100 0 0.7180 4.9630 91.40 1.7523 24 666.0 20.20 316.03 14.00 21.90
|
||||
13.52220 0.00 18.100 0 0.6310 3.8630 100.00 1.5106 24 666.0 20.20 131.42 13.33 23.10
|
||||
4.89822 0.00 18.100 0 0.6310 4.9700 100.00 1.3325 24 666.0 20.20 375.52 3.26 50.00
|
||||
5.66998 0.00 18.100 1 0.6310 6.6830 96.80 1.3567 24 666.0 20.20 375.33 3.73 50.00
|
||||
6.53876 0.00 18.100 1 0.6310 7.0160 97.50 1.2024 24 666.0 20.20 392.05 2.96 50.00
|
||||
9.23230 0.00 18.100 0 0.6310 6.2160 100.00 1.1691 24 666.0 20.20 366.15 9.53 50.00
|
||||
8.26725 0.00 18.100 1 0.6680 5.8750 89.60 1.1296 24 666.0 20.20 347.88 8.88 50.00
|
||||
11.10810 0.00 18.100 0 0.6680 4.9060 100.00 1.1742 24 666.0 20.20 396.90 34.77 13.80
|
||||
18.49820 0.00 18.100 0 0.6680 4.1380 100.00 1.1370 24 666.0 20.20 396.90 37.97 13.80
|
||||
19.60910 0.00 18.100 0 0.6710 7.3130 97.90 1.3163 24 666.0 20.20 396.90 13.44 15.00
|
||||
15.28800 0.00 18.100 0 0.6710 6.6490 93.30 1.3449 24 666.0 20.20 363.02 23.24 13.90
|
||||
9.82349 0.00 18.100 0 0.6710 6.7940 98.80 1.3580 24 666.0 20.20 396.90 21.24 13.30
|
||||
23.64820 0.00 18.100 0 0.6710 6.3800 96.20 1.3861 24 666.0 20.20 396.90 23.69 13.10
|
||||
17.86670 0.00 18.100 0 0.6710 6.2230 100.00 1.3861 24 666.0 20.20 393.74 21.78 10.20
|
||||
88.97620 0.00 18.100 0 0.6710 6.9680 91.90 1.4165 24 666.0 20.20 396.90 17.21 10.40
|
||||
15.87440 0.00 18.100 0 0.6710 6.5450 99.10 1.5192 24 666.0 20.20 396.90 21.08 10.90
|
||||
9.18702 0.00 18.100 0 0.7000 5.5360 100.00 1.5804 24 666.0 20.20 396.90 23.60 11.30
|
||||
7.99248 0.00 18.100 0 0.7000 5.5200 100.00 1.5331 24 666.0 20.20 396.90 24.56 12.30
|
||||
20.08490 0.00 18.100 0 0.7000 4.3680 91.20 1.4395 24 666.0 20.20 285.83 30.63 8.80
|
||||
16.81180 0.00 18.100 0 0.7000 5.2770 98.10 1.4261 24 666.0 20.20 396.90 30.81 7.20
|
||||
24.39380 0.00 18.100 0 0.7000 4.6520 100.00 1.4672 24 666.0 20.20 396.90 28.28 10.50
|
||||
22.59710 0.00 18.100 0 0.7000 5.0000 89.50 1.5184 24 666.0 20.20 396.90 31.99 7.40
|
||||
14.33370 0.00 18.100 0 0.7000 4.8800 100.00 1.5895 24 666.0 20.20 372.92 30.62 10.20
|
||||
8.15174 0.00 18.100 0 0.7000 5.3900 98.90 1.7281 24 666.0 20.20 396.90 20.85 11.50
|
||||
6.96215 0.00 18.100 0 0.7000 5.7130 97.00 1.9265 24 666.0 20.20 394.43 17.11 15.10
|
||||
5.29305 0.00 18.100 0 0.7000 6.0510 82.50 2.1678 24 666.0 20.20 378.38 18.76 23.20
|
||||
11.57790 0.00 18.100 0 0.7000 5.0360 97.00 1.7700 24 666.0 20.20 396.90 25.68 9.70
|
||||
8.64476 0.00 18.100 0 0.6930 6.1930 92.60 1.7912 24 666.0 20.20 396.90 15.17 13.80
|
||||
13.35980 0.00 18.100 0 0.6930 5.8870 94.70 1.7821 24 666.0 20.20 396.90 16.35 12.70
|
||||
8.71675 0.00 18.100 0 0.6930 6.4710 98.80 1.7257 24 666.0 20.20 391.98 17.12 13.10
|
||||
5.87205 0.00 18.100 0 0.6930 6.4050 96.00 1.6768 24 666.0 20.20 396.90 19.37 12.50
|
||||
7.67202 0.00 18.100 0 0.6930 5.7470 98.90 1.6334 24 666.0 20.20 393.10 19.92 8.50
|
||||
38.35180 0.00 18.100 0 0.6930 5.4530 100.00 1.4896 24 666.0 20.20 396.90 30.59 5.00
|
||||
9.91655 0.00 18.100 0 0.6930 5.8520 77.80 1.5004 24 666.0 20.20 338.16 29.97 6.30
|
||||
25.04610 0.00 18.100 0 0.6930 5.9870 100.00 1.5888 24 666.0 20.20 396.90 26.77 5.60
|
||||
14.23620 0.00 18.100 0 0.6930 6.3430 100.00 1.5741 24 666.0 20.20 396.90 20.32 7.20
|
||||
9.59571 0.00 18.100 0 0.6930 6.4040 100.00 1.6390 24 666.0 20.20 376.11 20.31 12.10
|
||||
24.80170 0.00 18.100 0 0.6930 5.3490 96.00 1.7028 24 666.0 20.20 396.90 19.77 8.30
|
||||
41.52920 0.00 18.100 0 0.6930 5.5310 85.40 1.6074 24 666.0 20.20 329.46 27.38 8.50
|
||||
67.92080 0.00 18.100 0 0.6930 5.6830 100.00 1.4254 24 666.0 20.20 384.97 22.98 5.00
|
||||
20.71620 0.00 18.100 0 0.6590 4.1380 100.00 1.1781 24 666.0 20.20 370.22 23.34 11.90
|
||||
11.95110 0.00 18.100 0 0.6590 5.6080 100.00 1.2852 24 666.0 20.20 332.09 12.13 27.90
|
||||
7.40389 0.00 18.100 0 0.5970 5.6170 97.90 1.4547 24 666.0 20.20 314.64 26.40 17.20
|
||||
14.43830 0.00 18.100 0 0.5970 6.8520 100.00 1.4655 24 666.0 20.20 179.36 19.78 27.50
|
||||
51.13580 0.00 18.100 0 0.5970 5.7570 100.00 1.4130 24 666.0 20.20 2.60 10.11 15.00
|
||||
14.05070 0.00 18.100 0 0.5970 6.6570 100.00 1.5275 24 666.0 20.20 35.05 21.22 17.20
|
||||
18.81100 0.00 18.100 0 0.5970 4.6280 100.00 1.5539 24 666.0 20.20 28.79 34.37 17.90
|
||||
28.65580 0.00 18.100 0 0.5970 5.1550 100.00 1.5894 24 666.0 20.20 210.97 20.08 16.30
|
||||
45.74610 0.00 18.100 0 0.6930 4.5190 100.00 1.6582 24 666.0 20.20 88.27 36.98 7.00
|
||||
18.08460 0.00 18.100 0 0.6790 6.4340 100.00 1.8347 24 666.0 20.20 27.25 29.05 7.20
|
||||
10.83420 0.00 18.100 0 0.6790 6.7820 90.80 1.8195 24 666.0 20.20 21.57 25.79 7.50
|
||||
25.94060 0.00 18.100 0 0.6790 5.3040 89.10 1.6475 24 666.0 20.20 127.36 26.64 10.40
|
||||
73.53410 0.00 18.100 0 0.6790 5.9570 100.00 1.8026 24 666.0 20.20 16.45 20.62 8.80
|
||||
11.81230 0.00 18.100 0 0.7180 6.8240 76.50 1.7940 24 666.0 20.20 48.45 22.74 8.40
|
||||
11.08740 0.00 18.100 0 0.7180 6.4110 100.00 1.8589 24 666.0 20.20 318.75 15.02 16.70
|
||||
7.02259 0.00 18.100 0 0.7180 6.0060 95.30 1.8746 24 666.0 20.20 319.98 15.70 14.20
|
||||
12.04820 0.00 18.100 0 0.6140 5.6480 87.60 1.9512 24 666.0 20.20 291.55 14.10 20.80
|
||||
7.05042 0.00 18.100 0 0.6140 6.1030 85.10 2.0218 24 666.0 20.20 2.52 23.29 13.40
|
||||
8.79212 0.00 18.100 0 0.5840 5.5650 70.60 2.0635 24 666.0 20.20 3.65 17.16 11.70
|
||||
15.86030 0.00 18.100 0 0.6790 5.8960 95.40 1.9096 24 666.0 20.20 7.68 24.39 8.30
|
||||
12.24720 0.00 18.100 0 0.5840 5.8370 59.70 1.9976 24 666.0 20.20 24.65 15.69 10.20
|
||||
37.66190 0.00 18.100 0 0.6790 6.2020 78.70 1.8629 24 666.0 20.20 18.82 14.52 10.90
|
||||
7.36711 0.00 18.100 0 0.6790 6.1930 78.10 1.9356 24 666.0 20.20 96.73 21.52 11.00
|
||||
9.33889 0.00 18.100 0 0.6790 6.3800 95.60 1.9682 24 666.0 20.20 60.72 24.08 9.50
|
||||
8.49213 0.00 18.100 0 0.5840 6.3480 86.10 2.0527 24 666.0 20.20 83.45 17.64 14.50
|
||||
10.06230 0.00 18.100 0 0.5840 6.8330 94.30 2.0882 24 666.0 20.20 81.33 19.69 14.10
|
||||
6.44405 0.00 18.100 0 0.5840 6.4250 74.80 2.2004 24 666.0 20.20 97.95 12.03 16.10
|
||||
5.58107 0.00 18.100 0 0.7130 6.4360 87.90 2.3158 24 666.0 20.20 100.19 16.22 14.30
|
||||
13.91340 0.00 18.100 0 0.7130 6.2080 95.00 2.2222 24 666.0 20.20 100.63 15.17 11.70
|
||||
11.16040 0.00 18.100 0 0.7400 6.6290 94.60 2.1247 24 666.0 20.20 109.85 23.27 13.40
|
||||
14.42080 0.00 18.100 0 0.7400 6.4610 93.30 2.0026 24 666.0 20.20 27.49 18.05 9.60
|
||||
15.17720 0.00 18.100 0 0.7400 6.1520 100.00 1.9142 24 666.0 20.20 9.32 26.45 8.70
|
||||
13.67810 0.00 18.100 0 0.7400 5.9350 87.90 1.8206 24 666.0 20.20 68.95 34.02 8.40
|
||||
9.39063 0.00 18.100 0 0.7400 5.6270 93.90 1.8172 24 666.0 20.20 396.90 22.88 12.80
|
||||
22.05110 0.00 18.100 0 0.7400 5.8180 92.40 1.8662 24 666.0 20.20 391.45 22.11 10.50
|
||||
9.72418 0.00 18.100 0 0.7400 6.4060 97.20 2.0651 24 666.0 20.20 385.96 19.52 17.10
|
||||
5.66637 0.00 18.100 0 0.7400 6.2190 100.00 2.0048 24 666.0 20.20 395.69 16.59 18.40
|
||||
9.96654 0.00 18.100 0 0.7400 6.4850 100.00 1.9784 24 666.0 20.20 386.73 18.85 15.40
|
||||
12.80230 0.00 18.100 0 0.7400 5.8540 96.60 1.8956 24 666.0 20.20 240.52 23.79 10.80
|
||||
10.67180 0.00 18.100 0 0.7400 6.4590 94.80 1.9879 24 666.0 20.20 43.06 23.98 11.80
|
||||
6.28807 0.00 18.100 0 0.7400 6.3410 96.40 2.0720 24 666.0 20.20 318.01 17.79 14.90
|
||||
9.92485 0.00 18.100 0 0.7400 6.2510 96.60 2.1980 24 666.0 20.20 388.52 16.44 12.60
|
||||
9.32909 0.00 18.100 0 0.7130 6.1850 98.70 2.2616 24 666.0 20.20 396.90 18.13 14.10
|
||||
7.52601 0.00 18.100 0 0.7130 6.4170 98.30 2.1850 24 666.0 20.20 304.21 19.31 13.00
|
||||
6.71772 0.00 18.100 0 0.7130 6.7490 92.60 2.3236 24 666.0 20.20 0.32 17.44 13.40
|
||||
5.44114 0.00 18.100 0 0.7130 6.6550 98.20 2.3552 24 666.0 20.20 355.29 17.73 15.20
|
||||
5.09017 0.00 18.100 0 0.7130 6.2970 91.80 2.3682 24 666.0 20.20 385.09 17.27 16.10
|
||||
8.24809 0.00 18.100 0 0.7130 7.3930 99.30 2.4527 24 666.0 20.20 375.87 16.74 17.80
|
||||
9.51363 0.00 18.100 0 0.7130 6.7280 94.10 2.4961 24 666.0 20.20 6.68 18.71 14.90
|
||||
4.75237 0.00 18.100 0 0.7130 6.5250 86.50 2.4358 24 666.0 20.20 50.92 18.13 14.10
|
||||
4.66883 0.00 18.100 0 0.7130 5.9760 87.90 2.5806 24 666.0 20.20 10.48 19.01 12.70
|
||||
8.20058 0.00 18.100 0 0.7130 5.9360 80.30 2.7792 24 666.0 20.20 3.50 16.94 13.50
|
||||
7.75223 0.00 18.100 0 0.7130 6.3010 83.70 2.7831 24 666.0 20.20 272.21 16.23 14.90
|
||||
6.80117 0.00 18.100 0 0.7130 6.0810 84.40 2.7175 24 666.0 20.20 396.90 14.70 20.00
|
||||
4.81213 0.00 18.100 0 0.7130 6.7010 90.00 2.5975 24 666.0 20.20 255.23 16.42 16.40
|
||||
3.69311 0.00 18.100 0 0.7130 6.3760 88.40 2.5671 24 666.0 20.20 391.43 14.65 17.70
|
||||
6.65492 0.00 18.100 0 0.7130 6.3170 83.00 2.7344 24 666.0 20.20 396.90 13.99 19.50
|
||||
5.82115 0.00 18.100 0 0.7130 6.5130 89.90 2.8016 24 666.0 20.20 393.82 10.29 20.20
|
||||
7.83932 0.00 18.100 0 0.6550 6.2090 65.40 2.9634 24 666.0 20.20 396.90 13.22 21.40
|
||||
3.16360 0.00 18.100 0 0.6550 5.7590 48.20 3.0665 24 666.0 20.20 334.40 14.13 19.90
|
||||
3.77498 0.00 18.100 0 0.6550 5.9520 84.70 2.8715 24 666.0 20.20 22.01 17.15 19.00
|
||||
4.42228 0.00 18.100 0 0.5840 6.0030 94.50 2.5403 24 666.0 20.20 331.29 21.32 19.10
|
||||
15.57570 0.00 18.100 0 0.5800 5.9260 71.00 2.9084 24 666.0 20.20 368.74 18.13 19.10
|
||||
13.07510 0.00 18.100 0 0.5800 5.7130 56.70 2.8237 24 666.0 20.20 396.90 14.76 20.10
|
||||
4.34879 0.00 18.100 0 0.5800 6.1670 84.00 3.0334 24 666.0 20.20 396.90 16.29 19.90
|
||||
4.03841 0.00 18.100 0 0.5320 6.2290 90.70 3.0993 24 666.0 20.20 395.33 12.87 19.60
|
||||
3.56868 0.00 18.100 0 0.5800 6.4370 75.00 2.8965 24 666.0 20.20 393.37 14.36 23.20
|
||||
4.64689 0.00 18.100 0 0.6140 6.9800 67.60 2.5329 24 666.0 20.20 374.68 11.66 29.80
|
||||
8.05579 0.00 18.100 0 0.5840 5.4270 95.40 2.4298 24 666.0 20.20 352.58 18.14 13.80
|
||||
6.39312 0.00 18.100 0 0.5840 6.1620 97.40 2.2060 24 666.0 20.20 302.76 24.10 13.30
|
||||
4.87141 0.00 18.100 0 0.6140 6.4840 93.60 2.3053 24 666.0 20.20 396.21 18.68 16.70
|
||||
15.02340 0.00 18.100 0 0.6140 5.3040 97.30 2.1007 24 666.0 20.20 349.48 24.91 12.00
|
||||
10.23300 0.00 18.100 0 0.6140 6.1850 96.70 2.1705 24 666.0 20.20 379.70 18.03 14.60
|
||||
14.33370 0.00 18.100 0 0.6140 6.2290 88.00 1.9512 24 666.0 20.20 383.32 13.11 21.40
|
||||
5.82401 0.00 18.100 0 0.5320 6.2420 64.70 3.4242 24 666.0 20.20 396.90 10.74 23.00
|
||||
5.70818 0.00 18.100 0 0.5320 6.7500 74.90 3.3317 24 666.0 20.20 393.07 7.74 23.70
|
||||
5.73116 0.00 18.100 0 0.5320 7.0610 77.00 3.4106 24 666.0 20.20 395.28 7.01 25.00
|
||||
2.81838 0.00 18.100 0 0.5320 5.7620 40.30 4.0983 24 666.0 20.20 392.92 10.42 21.80
|
||||
2.37857 0.00 18.100 0 0.5830 5.8710 41.90 3.7240 24 666.0 20.20 370.73 13.34 20.60
|
||||
3.67367 0.00 18.100 0 0.5830 6.3120 51.90 3.9917 24 666.0 20.20 388.62 10.58 21.20
|
||||
5.69175 0.00 18.100 0 0.5830 6.1140 79.80 3.5459 24 666.0 20.20 392.68 14.98 19.10
|
||||
4.83567 0.00 18.100 0 0.5830 5.9050 53.20 3.1523 24 666.0 20.20 388.22 11.45 20.60
|
||||
0.15086 0.00 27.740 0 0.6090 5.4540 92.70 1.8209 4 711.0 20.10 395.09 18.06 15.20
|
||||
0.18337 0.00 27.740 0 0.6090 5.4140 98.30 1.7554 4 711.0 20.10 344.05 23.97 7.00
|
||||
0.20746 0.00 27.740 0 0.6090 5.0930 98.00 1.8226 4 711.0 20.10 318.43 29.68 8.10
|
||||
0.10574 0.00 27.740 0 0.6090 5.9830 98.80 1.8681 4 711.0 20.10 390.11 18.07 13.60
|
||||
0.11132 0.00 27.740 0 0.6090 5.9830 83.50 2.1099 4 711.0 20.10 396.90 13.35 20.10
|
||||
0.17331 0.00 9.690 0 0.5850 5.7070 54.00 2.3817 6 391.0 19.20 396.90 12.01 21.80
|
||||
0.27957 0.00 9.690 0 0.5850 5.9260 42.60 2.3817 6 391.0 19.20 396.90 13.59 24.50
|
||||
0.17899 0.00 9.690 0 0.5850 5.6700 28.80 2.7986 6 391.0 19.20 393.29 17.60 23.10
|
||||
0.28960 0.00 9.690 0 0.5850 5.3900 72.90 2.7986 6 391.0 19.20 396.90 21.14 19.70
|
||||
0.26838 0.00 9.690 0 0.5850 5.7940 70.60 2.8927 6 391.0 19.20 396.90 14.10 18.30
|
||||
0.23912 0.00 9.690 0 0.5850 6.0190 65.30 2.4091 6 391.0 19.20 396.90 12.92 21.20
|
||||
0.17783 0.00 9.690 0 0.5850 5.5690 73.50 2.3999 6 391.0 19.20 395.77 15.10 17.50
|
||||
0.22438 0.00 9.690 0 0.5850 6.0270 79.70 2.4982 6 391.0 19.20 396.90 14.33 16.80
|
||||
0.06263 0.00 11.930 0 0.5730 6.5930 69.10 2.4786 1 273.0 21.00 391.99 9.67 22.40
|
||||
0.04527 0.00 11.930 0 0.5730 6.1200 76.70 2.2875 1 273.0 21.00 396.90 9.08 20.60
|
||||
0.06076 0.00 11.930 0 0.5730 6.9760 91.00 2.1675 1 273.0 21.00 396.90 5.64 23.90
|
||||
0.10959 0.00 11.930 0 0.5730 6.7940 89.30 2.3889 1 273.0 21.00 393.45 6.48 22.00
|
||||
0.04741 0.00 11.930 0 0.5730 6.0300 80.80 2.5050 1 273.0 21.00 396.90 7.88 11.90
|
||||
768
data/pima-indians-diabetes.data.csv
Normal file
@@ -0,0 +1,768 @@
|
||||
6,148,72,35,0,33.6,0.627,50,1
|
||||
1,85,66,29,0,26.6,0.351,31,0
|
||||
8,183,64,0,0,23.3,0.672,32,1
|
||||
1,89,66,23,94,28.1,0.167,21,0
|
||||
0,137,40,35,168,43.1,2.288,33,1
|
||||
5,116,74,0,0,25.6,0.201,30,0
|
||||
3,78,50,32,88,31.0,0.248,26,1
|
||||
10,115,0,0,0,35.3,0.134,29,0
|
||||
2,197,70,45,543,30.5,0.158,53,1
|
||||
8,125,96,0,0,0.0,0.232,54,1
|
||||
4,110,92,0,0,37.6,0.191,30,0
|
||||
10,168,74,0,0,38.0,0.537,34,1
|
||||
10,139,80,0,0,27.1,1.441,57,0
|
||||
1,189,60,23,846,30.1,0.398,59,1
|
||||
5,166,72,19,175,25.8,0.587,51,1
|
||||
7,100,0,0,0,30.0,0.484,32,1
|
||||
0,118,84,47,230,45.8,0.551,31,1
|
||||
7,107,74,0,0,29.6,0.254,31,1
|
||||
1,103,30,38,83,43.3,0.183,33,0
|
||||
1,115,70,30,96,34.6,0.529,32,1
|
||||
3,126,88,41,235,39.3,0.704,27,0
|
||||
8,99,84,0,0,35.4,0.388,50,0
|
||||
7,196,90,0,0,39.8,0.451,41,1
|
||||
9,119,80,35,0,29.0,0.263,29,1
|
||||
11,143,94,33,146,36.6,0.254,51,1
|
||||
10,125,70,26,115,31.1,0.205,41,1
|
||||
7,147,76,0,0,39.4,0.257,43,1
|
||||
1,97,66,15,140,23.2,0.487,22,0
|
||||
13,145,82,19,110,22.2,0.245,57,0
|
||||
5,117,92,0,0,34.1,0.337,38,0
|
||||
5,109,75,26,0,36.0,0.546,60,0
|
||||
3,158,76,36,245,31.6,0.851,28,1
|
||||
3,88,58,11,54,24.8,0.267,22,0
|
||||
6,92,92,0,0,19.9,0.188,28,0
|
||||
10,122,78,31,0,27.6,0.512,45,0
|
||||
4,103,60,33,192,24.0,0.966,33,0
|
||||
11,138,76,0,0,33.2,0.420,35,0
|
||||
9,102,76,37,0,32.9,0.665,46,1
|
||||
2,90,68,42,0,38.2,0.503,27,1
|
||||
4,111,72,47,207,37.1,1.390,56,1
|
||||
3,180,64,25,70,34.0,0.271,26,0
|
||||
7,133,84,0,0,40.2,0.696,37,0
|
||||
7,106,92,18,0,22.7,0.235,48,0
|
||||
9,171,110,24,240,45.4,0.721,54,1
|
||||
7,159,64,0,0,27.4,0.294,40,0
|
||||
0,180,66,39,0,42.0,1.893,25,1
|
||||
1,146,56,0,0,29.7,0.564,29,0
|
||||
2,71,70,27,0,28.0,0.586,22,0
|
||||
7,103,66,32,0,39.1,0.344,31,1
|
||||
7,105,0,0,0,0.0,0.305,24,0
|
||||
1,103,80,11,82,19.4,0.491,22,0
|
||||
1,101,50,15,36,24.2,0.526,26,0
|
||||
5,88,66,21,23,24.4,0.342,30,0
|
||||
8,176,90,34,300,33.7,0.467,58,1
|
||||
7,150,66,42,342,34.7,0.718,42,0
|
||||
1,73,50,10,0,23.0,0.248,21,0
|
||||
7,187,68,39,304,37.7,0.254,41,1
|
||||
0,100,88,60,110,46.8,0.962,31,0
|
||||
0,146,82,0,0,40.5,1.781,44,0
|
||||
0,105,64,41,142,41.5,0.173,22,0
|
||||
2,84,0,0,0,0.0,0.304,21,0
|
||||
8,133,72,0,0,32.9,0.270,39,1
|
||||
5,44,62,0,0,25.0,0.587,36,0
|
||||
2,141,58,34,128,25.4,0.699,24,0
|
||||
7,114,66,0,0,32.8,0.258,42,1
|
||||
5,99,74,27,0,29.0,0.203,32,0
|
||||
0,109,88,30,0,32.5,0.855,38,1
|
||||
2,109,92,0,0,42.7,0.845,54,0
|
||||
1,95,66,13,38,19.6,0.334,25,0
|
||||
4,146,85,27,100,28.9,0.189,27,0
|
||||
2,100,66,20,90,32.9,0.867,28,1
|
||||
5,139,64,35,140,28.6,0.411,26,0
|
||||
13,126,90,0,0,43.4,0.583,42,1
|
||||
4,129,86,20,270,35.1,0.231,23,0
|
||||
1,79,75,30,0,32.0,0.396,22,0
|
||||
1,0,48,20,0,24.7,0.140,22,0
|
||||
7,62,78,0,0,32.6,0.391,41,0
|
||||
5,95,72,33,0,37.7,0.370,27,0
|
||||
0,131,0,0,0,43.2,0.270,26,1
|
||||
2,112,66,22,0,25.0,0.307,24,0
|
||||
3,113,44,13,0,22.4,0.140,22,0
|
||||
2,74,0,0,0,0.0,0.102,22,0
|
||||
7,83,78,26,71,29.3,0.767,36,0
|
||||
0,101,65,28,0,24.6,0.237,22,0
|
||||
5,137,108,0,0,48.8,0.227,37,1
|
||||
2,110,74,29,125,32.4,0.698,27,0
|
||||
13,106,72,54,0,36.6,0.178,45,0
|
||||
2,100,68,25,71,38.5,0.324,26,0
|
||||
15,136,70,32,110,37.1,0.153,43,1
|
||||
1,107,68,19,0,26.5,0.165,24,0
|
||||
1,80,55,0,0,19.1,0.258,21,0
|
||||
4,123,80,15,176,32.0,0.443,34,0
|
||||
7,81,78,40,48,46.7,0.261,42,0
|
||||
4,134,72,0,0,23.8,0.277,60,1
|
||||
2,142,82,18,64,24.7,0.761,21,0
|
||||
6,144,72,27,228,33.9,0.255,40,0
|
||||
2,92,62,28,0,31.6,0.130,24,0
|
||||
1,71,48,18,76,20.4,0.323,22,0
|
||||
6,93,50,30,64,28.7,0.356,23,0
|
||||
1,122,90,51,220,49.7,0.325,31,1
|
||||
1,163,72,0,0,39.0,1.222,33,1
|
||||
1,151,60,0,0,26.1,0.179,22,0
|
||||
0,125,96,0,0,22.5,0.262,21,0
|
||||
1,81,72,18,40,26.6,0.283,24,0
|
||||
2,85,65,0,0,39.6,0.930,27,0
|
||||
1,126,56,29,152,28.7,0.801,21,0
|
||||
1,96,122,0,0,22.4,0.207,27,0
|
||||
4,144,58,28,140,29.5,0.287,37,0
|
||||
3,83,58,31,18,34.3,0.336,25,0
|
||||
0,95,85,25,36,37.4,0.247,24,1
|
||||
3,171,72,33,135,33.3,0.199,24,1
|
||||
8,155,62,26,495,34.0,0.543,46,1
|
||||
1,89,76,34,37,31.2,0.192,23,0
|
||||
4,76,62,0,0,34.0,0.391,25,0
|
||||
7,160,54,32,175,30.5,0.588,39,1
|
||||
4,146,92,0,0,31.2,0.539,61,1
|
||||
5,124,74,0,0,34.0,0.220,38,1
|
||||
5,78,48,0,0,33.7,0.654,25,0
|
||||
4,97,60,23,0,28.2,0.443,22,0
|
||||
4,99,76,15,51,23.2,0.223,21,0
|
||||
0,162,76,56,100,53.2,0.759,25,1
|
||||
6,111,64,39,0,34.2,0.260,24,0
|
||||
2,107,74,30,100,33.6,0.404,23,0
|
||||
5,132,80,0,0,26.8,0.186,69,0
|
||||
0,113,76,0,0,33.3,0.278,23,1
|
||||
1,88,30,42,99,55.0,0.496,26,1
|
||||
3,120,70,30,135,42.9,0.452,30,0
|
||||
1,118,58,36,94,33.3,0.261,23,0
|
||||
1,117,88,24,145,34.5,0.403,40,1
|
||||
0,105,84,0,0,27.9,0.741,62,1
|
||||
4,173,70,14,168,29.7,0.361,33,1
|
||||
9,122,56,0,0,33.3,1.114,33,1
|
||||
3,170,64,37,225,34.5,0.356,30,1
|
||||
8,84,74,31,0,38.3,0.457,39,0
|
||||
2,96,68,13,49,21.1,0.647,26,0
|
||||
2,125,60,20,140,33.8,0.088,31,0
|
||||
0,100,70,26,50,30.8,0.597,21,0
|
||||
0,93,60,25,92,28.7,0.532,22,0
|
||||
0,129,80,0,0,31.2,0.703,29,0
|
||||
5,105,72,29,325,36.9,0.159,28,0
|
||||
3,128,78,0,0,21.1,0.268,55,0
|
||||
5,106,82,30,0,39.5,0.286,38,0
|
||||
2,108,52,26,63,32.5,0.318,22,0
|
||||
10,108,66,0,0,32.4,0.272,42,1
|
||||
4,154,62,31,284,32.8,0.237,23,0
|
||||
0,102,75,23,0,0.0,0.572,21,0
|
||||
9,57,80,37,0,32.8,0.096,41,0
|
||||
2,106,64,35,119,30.5,1.400,34,0
|
||||
5,147,78,0,0,33.7,0.218,65,0
|
||||
2,90,70,17,0,27.3,0.085,22,0
|
||||
1,136,74,50,204,37.4,0.399,24,0
|
||||
4,114,65,0,0,21.9,0.432,37,0
|
||||
9,156,86,28,155,34.3,1.189,42,1
|
||||
1,153,82,42,485,40.6,0.687,23,0
|
||||
8,188,78,0,0,47.9,0.137,43,1
|
||||
7,152,88,44,0,50.0,0.337,36,1
|
||||
2,99,52,15,94,24.6,0.637,21,0
|
||||
1,109,56,21,135,25.2,0.833,23,0
|
||||
2,88,74,19,53,29.0,0.229,22,0
|
||||
17,163,72,41,114,40.9,0.817,47,1
|
||||
4,151,90,38,0,29.7,0.294,36,0
|
||||
7,102,74,40,105,37.2,0.204,45,0
|
||||
0,114,80,34,285,44.2,0.167,27,0
|
||||
2,100,64,23,0,29.7,0.368,21,0
|
||||
0,131,88,0,0,31.6,0.743,32,1
|
||||
6,104,74,18,156,29.9,0.722,41,1
|
||||
3,148,66,25,0,32.5,0.256,22,0
|
||||
4,120,68,0,0,29.6,0.709,34,0
|
||||
4,110,66,0,0,31.9,0.471,29,0
|
||||
3,111,90,12,78,28.4,0.495,29,0
|
||||
6,102,82,0,0,30.8,0.180,36,1
|
||||
6,134,70,23,130,35.4,0.542,29,1
|
||||
2,87,0,23,0,28.9,0.773,25,0
|
||||
1,79,60,42,48,43.5,0.678,23,0
|
||||
2,75,64,24,55,29.7,0.370,33,0
|
||||
8,179,72,42,130,32.7,0.719,36,1
|
||||
6,85,78,0,0,31.2,0.382,42,0
|
||||
0,129,110,46,130,67.1,0.319,26,1
|
||||
5,143,78,0,0,45.0,0.190,47,0
|
||||
5,130,82,0,0,39.1,0.956,37,1
|
||||
6,87,80,0,0,23.2,0.084,32,0
|
||||
0,119,64,18,92,34.9,0.725,23,0
|
||||
1,0,74,20,23,27.7,0.299,21,0
|
||||
5,73,60,0,0,26.8,0.268,27,0
|
||||
4,141,74,0,0,27.6,0.244,40,0
|
||||
7,194,68,28,0,35.9,0.745,41,1
|
||||
8,181,68,36,495,30.1,0.615,60,1
|
||||
1,128,98,41,58,32.0,1.321,33,1
|
||||
8,109,76,39,114,27.9,0.640,31,1
|
||||
5,139,80,35,160,31.6,0.361,25,1
|
||||
3,111,62,0,0,22.6,0.142,21,0
|
||||
9,123,70,44,94,33.1,0.374,40,0
|
||||
7,159,66,0,0,30.4,0.383,36,1
|
||||
11,135,0,0,0,52.3,0.578,40,1
|
||||
8,85,55,20,0,24.4,0.136,42,0
|
||||
5,158,84,41,210,39.4,0.395,29,1
|
||||
1,105,58,0,0,24.3,0.187,21,0
|
||||
3,107,62,13,48,22.9,0.678,23,1
|
||||
4,109,64,44,99,34.8,0.905,26,1
|
||||
4,148,60,27,318,30.9,0.150,29,1
|
||||
0,113,80,16,0,31.0,0.874,21,0
|
||||
1,138,82,0,0,40.1,0.236,28,0
|
||||
0,108,68,20,0,27.3,0.787,32,0
|
||||
2,99,70,16,44,20.4,0.235,27,0
|
||||
6,103,72,32,190,37.7,0.324,55,0
|
||||
5,111,72,28,0,23.9,0.407,27,0
|
||||
8,196,76,29,280,37.5,0.605,57,1
|
||||
5,162,104,0,0,37.7,0.151,52,1
|
||||
1,96,64,27,87,33.2,0.289,21,0
|
||||
7,184,84,33,0,35.5,0.355,41,1
|
||||
2,81,60,22,0,27.7,0.290,25,0
|
||||
0,147,85,54,0,42.8,0.375,24,0
|
||||
7,179,95,31,0,34.2,0.164,60,0
|
||||
0,140,65,26,130,42.6,0.431,24,1
|
||||
9,112,82,32,175,34.2,0.260,36,1
|
||||
12,151,70,40,271,41.8,0.742,38,1
|
||||
5,109,62,41,129,35.8,0.514,25,1
|
||||
6,125,68,30,120,30.0,0.464,32,0
|
||||
5,85,74,22,0,29.0,1.224,32,1
|
||||
5,112,66,0,0,37.8,0.261,41,1
|
||||
0,177,60,29,478,34.6,1.072,21,1
|
||||
2,158,90,0,0,31.6,0.805,66,1
|
||||
7,119,0,0,0,25.2,0.209,37,0
|
||||
7,142,60,33,190,28.8,0.687,61,0
|
||||
1,100,66,15,56,23.6,0.666,26,0
|
||||
1,87,78,27,32,34.6,0.101,22,0
|
||||
0,101,76,0,0,35.7,0.198,26,0
|
||||
3,162,52,38,0,37.2,0.652,24,1
|
||||
4,197,70,39,744,36.7,2.329,31,0
|
||||
0,117,80,31,53,45.2,0.089,24,0
|
||||
4,142,86,0,0,44.0,0.645,22,1
|
||||
6,134,80,37,370,46.2,0.238,46,1
|
||||
1,79,80,25,37,25.4,0.583,22,0
|
||||
4,122,68,0,0,35.0,0.394,29,0
|
||||
3,74,68,28,45,29.7,0.293,23,0
|
||||
4,171,72,0,0,43.6,0.479,26,1
|
||||
7,181,84,21,192,35.9,0.586,51,1
|
||||
0,179,90,27,0,44.1,0.686,23,1
|
||||
9,164,84,21,0,30.8,0.831,32,1
|
||||
0,104,76,0,0,18.4,0.582,27,0
|
||||
1,91,64,24,0,29.2,0.192,21,0
|
||||
4,91,70,32,88,33.1,0.446,22,0
|
||||
3,139,54,0,0,25.6,0.402,22,1
|
||||
6,119,50,22,176,27.1,1.318,33,1
|
||||
2,146,76,35,194,38.2,0.329,29,0
|
||||
9,184,85,15,0,30.0,1.213,49,1
|
||||
10,122,68,0,0,31.2,0.258,41,0
|
||||
0,165,90,33,680,52.3,0.427,23,0
|
||||
9,124,70,33,402,35.4,0.282,34,0
|
||||
1,111,86,19,0,30.1,0.143,23,0
|
||||
9,106,52,0,0,31.2,0.380,42,0
|
||||
2,129,84,0,0,28.0,0.284,27,0
|
||||
2,90,80,14,55,24.4,0.249,24,0
|
||||
0,86,68,32,0,35.8,0.238,25,0
|
||||
12,92,62,7,258,27.6,0.926,44,1
|
||||
1,113,64,35,0,33.6,0.543,21,1
|
||||
3,111,56,39,0,30.1,0.557,30,0
|
||||
2,114,68,22,0,28.7,0.092,25,0
|
||||
1,193,50,16,375,25.9,0.655,24,0
|
||||
11,155,76,28,150,33.3,1.353,51,1
|
||||
3,191,68,15,130,30.9,0.299,34,0
|
||||
3,141,0,0,0,30.0,0.761,27,1
|
||||
4,95,70,32,0,32.1,0.612,24,0
|
||||
3,142,80,15,0,32.4,0.200,63,0
|
||||
4,123,62,0,0,32.0,0.226,35,1
|
||||
5,96,74,18,67,33.6,0.997,43,0
|
||||
0,138,0,0,0,36.3,0.933,25,1
|
||||
2,128,64,42,0,40.0,1.101,24,0
|
||||
0,102,52,0,0,25.1,0.078,21,0
|
||||
2,146,0,0,0,27.5,0.240,28,1
|
||||
10,101,86,37,0,45.6,1.136,38,1
|
||||
2,108,62,32,56,25.2,0.128,21,0
|
||||
3,122,78,0,0,23.0,0.254,40,0
|
||||
1,71,78,50,45,33.2,0.422,21,0
|
||||
13,106,70,0,0,34.2,0.251,52,0
|
||||
2,100,70,52,57,40.5,0.677,25,0
|
||||
7,106,60,24,0,26.5,0.296,29,1
|
||||
0,104,64,23,116,27.8,0.454,23,0
|
||||
5,114,74,0,0,24.9,0.744,57,0
|
||||
2,108,62,10,278,25.3,0.881,22,0
|
||||
0,146,70,0,0,37.9,0.334,28,1
|
||||
10,129,76,28,122,35.9,0.280,39,0
|
||||
7,133,88,15,155,32.4,0.262,37,0
|
||||
7,161,86,0,0,30.4,0.165,47,1
|
||||
2,108,80,0,0,27.0,0.259,52,1
|
||||
7,136,74,26,135,26.0,0.647,51,0
|
||||
5,155,84,44,545,38.7,0.619,34,0
|
||||
1,119,86,39,220,45.6,0.808,29,1
|
||||
4,96,56,17,49,20.8,0.340,26,0
|
||||
5,108,72,43,75,36.1,0.263,33,0
|
||||
0,78,88,29,40,36.9,0.434,21,0
|
||||
0,107,62,30,74,36.6,0.757,25,1
|
||||
2,128,78,37,182,43.3,1.224,31,1
|
||||
1,128,48,45,194,40.5,0.613,24,1
|
||||
0,161,50,0,0,21.9,0.254,65,0
|
||||
6,151,62,31,120,35.5,0.692,28,0
|
||||
2,146,70,38,360,28.0,0.337,29,1
|
||||
0,126,84,29,215,30.7,0.520,24,0
|
||||
14,100,78,25,184,36.6,0.412,46,1
|
||||
8,112,72,0,0,23.6,0.840,58,0
|
||||
0,167,0,0,0,32.3,0.839,30,1
|
||||
2,144,58,33,135,31.6,0.422,25,1
|
||||
5,77,82,41,42,35.8,0.156,35,0
|
||||
5,115,98,0,0,52.9,0.209,28,1
|
||||
3,150,76,0,0,21.0,0.207,37,0
|
||||
2,120,76,37,105,39.7,0.215,29,0
|
||||
10,161,68,23,132,25.5,0.326,47,1
|
||||
0,137,68,14,148,24.8,0.143,21,0
|
||||
0,128,68,19,180,30.5,1.391,25,1
|
||||
2,124,68,28,205,32.9,0.875,30,1
|
||||
6,80,66,30,0,26.2,0.313,41,0
|
||||
0,106,70,37,148,39.4,0.605,22,0
|
||||
2,155,74,17,96,26.6,0.433,27,1
|
||||
3,113,50,10,85,29.5,0.626,25,0
|
||||
7,109,80,31,0,35.9,1.127,43,1
|
||||
2,112,68,22,94,34.1,0.315,26,0
|
||||
3,99,80,11,64,19.3,0.284,30,0
|
||||
3,182,74,0,0,30.5,0.345,29,1
|
||||
3,115,66,39,140,38.1,0.150,28,0
|
||||
6,194,78,0,0,23.5,0.129,59,1
|
||||
4,129,60,12,231,27.5,0.527,31,0
|
||||
3,112,74,30,0,31.6,0.197,25,1
|
||||
0,124,70,20,0,27.4,0.254,36,1
|
||||
13,152,90,33,29,26.8,0.731,43,1
|
||||
2,112,75,32,0,35.7,0.148,21,0
|
||||
1,157,72,21,168,25.6,0.123,24,0
|
||||
1,122,64,32,156,35.1,0.692,30,1
|
||||
10,179,70,0,0,35.1,0.200,37,0
|
||||
2,102,86,36,120,45.5,0.127,23,1
|
||||
6,105,70,32,68,30.8,0.122,37,0
|
||||
8,118,72,19,0,23.1,1.476,46,0
|
||||
2,87,58,16,52,32.7,0.166,25,0
|
||||
1,180,0,0,0,43.3,0.282,41,1
|
||||
12,106,80,0,0,23.6,0.137,44,0
|
||||
1,95,60,18,58,23.9,0.260,22,0
|
||||
0,165,76,43,255,47.9,0.259,26,0
|
||||
0,117,0,0,0,33.8,0.932,44,0
|
||||
5,115,76,0,0,31.2,0.343,44,1
|
||||
9,152,78,34,171,34.2,0.893,33,1
|
||||
7,178,84,0,0,39.9,0.331,41,1
|
||||
1,130,70,13,105,25.9,0.472,22,0
|
||||
1,95,74,21,73,25.9,0.673,36,0
|
||||
1,0,68,35,0,32.0,0.389,22,0
|
||||
5,122,86,0,0,34.7,0.290,33,0
|
||||
8,95,72,0,0,36.8,0.485,57,0
|
||||
8,126,88,36,108,38.5,0.349,49,0
|
||||
1,139,46,19,83,28.7,0.654,22,0
|
||||
3,116,0,0,0,23.5,0.187,23,0
|
||||
3,99,62,19,74,21.8,0.279,26,0
|
||||
5,0,80,32,0,41.0,0.346,37,1
|
||||
4,92,80,0,0,42.2,0.237,29,0
|
||||
4,137,84,0,0,31.2,0.252,30,0
|
||||
3,61,82,28,0,34.4,0.243,46,0
|
||||
1,90,62,12,43,27.2,0.580,24,0
|
||||
3,90,78,0,0,42.7,0.559,21,0
|
||||
9,165,88,0,0,30.4,0.302,49,1
|
||||
1,125,50,40,167,33.3,0.962,28,1
|
||||
13,129,0,30,0,39.9,0.569,44,1
|
||||
12,88,74,40,54,35.3,0.378,48,0
|
||||
1,196,76,36,249,36.5,0.875,29,1
|
||||
5,189,64,33,325,31.2,0.583,29,1
|
||||
5,158,70,0,0,29.8,0.207,63,0
|
||||
5,103,108,37,0,39.2,0.305,65,0
|
||||
4,146,78,0,0,38.5,0.520,67,1
|
||||
4,147,74,25,293,34.9,0.385,30,0
|
||||
5,99,54,28,83,34.0,0.499,30,0
|
||||
6,124,72,0,0,27.6,0.368,29,1
|
||||
0,101,64,17,0,21.0,0.252,21,0
|
||||
3,81,86,16,66,27.5,0.306,22,0
|
||||
1,133,102,28,140,32.8,0.234,45,1
|
||||
3,173,82,48,465,38.4,2.137,25,1
|
||||
0,118,64,23,89,0.0,1.731,21,0
|
||||
0,84,64,22,66,35.8,0.545,21,0
|
||||
2,105,58,40,94,34.9,0.225,25,0
|
||||
2,122,52,43,158,36.2,0.816,28,0
|
||||
12,140,82,43,325,39.2,0.528,58,1
|
||||
0,98,82,15,84,25.2,0.299,22,0
|
||||
1,87,60,37,75,37.2,0.509,22,0
|
||||
4,156,75,0,0,48.3,0.238,32,1
|
||||
0,93,100,39,72,43.4,1.021,35,0
|
||||
1,107,72,30,82,30.8,0.821,24,0
|
||||
0,105,68,22,0,20.0,0.236,22,0
|
||||
1,109,60,8,182,25.4,0.947,21,0
|
||||
1,90,62,18,59,25.1,1.268,25,0
|
||||
1,125,70,24,110,24.3,0.221,25,0
|
||||
1,119,54,13,50,22.3,0.205,24,0
|
||||
5,116,74,29,0,32.3,0.660,35,1
|
||||
8,105,100,36,0,43.3,0.239,45,1
|
||||
5,144,82,26,285,32.0,0.452,58,1
|
||||
3,100,68,23,81,31.6,0.949,28,0
|
||||
1,100,66,29,196,32.0,0.444,42,0
|
||||
5,166,76,0,0,45.7,0.340,27,1
|
||||
1,131,64,14,415,23.7,0.389,21,0
|
||||
4,116,72,12,87,22.1,0.463,37,0
|
||||
4,158,78,0,0,32.9,0.803,31,1
|
||||
2,127,58,24,275,27.7,1.600,25,0
|
||||
3,96,56,34,115,24.7,0.944,39,0
|
||||
0,131,66,40,0,34.3,0.196,22,1
|
||||
3,82,70,0,0,21.1,0.389,25,0
|
||||
3,193,70,31,0,34.9,0.241,25,1
|
||||
4,95,64,0,0,32.0,0.161,31,1
|
||||
6,137,61,0,0,24.2,0.151,55,0
|
||||
5,136,84,41,88,35.0,0.286,35,1
|
||||
9,72,78,25,0,31.6,0.280,38,0
|
||||
5,168,64,0,0,32.9,0.135,41,1
|
||||
2,123,48,32,165,42.1,0.520,26,0
|
||||
4,115,72,0,0,28.9,0.376,46,1
|
||||
0,101,62,0,0,21.9,0.336,25,0
|
||||
8,197,74,0,0,25.9,1.191,39,1
|
||||
1,172,68,49,579,42.4,0.702,28,1
|
||||
6,102,90,39,0,35.7,0.674,28,0
|
||||
1,112,72,30,176,34.4,0.528,25,0
|
||||
1,143,84,23,310,42.4,1.076,22,0
|
||||
1,143,74,22,61,26.2,0.256,21,0
|
||||
0,138,60,35,167,34.6,0.534,21,1
|
||||
3,173,84,33,474,35.7,0.258,22,1
|
||||
1,97,68,21,0,27.2,1.095,22,0
|
||||
4,144,82,32,0,38.5,0.554,37,1
|
||||
1,83,68,0,0,18.2,0.624,27,0
|
||||
3,129,64,29,115,26.4,0.219,28,1
|
||||
1,119,88,41,170,45.3,0.507,26,0
|
||||
2,94,68,18,76,26.0,0.561,21,0
|
||||
0,102,64,46,78,40.6,0.496,21,0
|
||||
2,115,64,22,0,30.8,0.421,21,0
|
||||
8,151,78,32,210,42.9,0.516,36,1
|
||||
4,184,78,39,277,37.0,0.264,31,1
|
||||
0,94,0,0,0,0.0,0.256,25,0
|
||||
1,181,64,30,180,34.1,0.328,38,1
|
||||
0,135,94,46,145,40.6,0.284,26,0
|
||||
1,95,82,25,180,35.0,0.233,43,1
|
||||
2,99,0,0,0,22.2,0.108,23,0
|
||||
3,89,74,16,85,30.4,0.551,38,0
|
||||
1,80,74,11,60,30.0,0.527,22,0
|
||||
2,139,75,0,0,25.6,0.167,29,0
|
||||
1,90,68,8,0,24.5,1.138,36,0
|
||||
0,141,0,0,0,42.4,0.205,29,1
|
||||
12,140,85,33,0,37.4,0.244,41,0
|
||||
5,147,75,0,0,29.9,0.434,28,0
|
||||
1,97,70,15,0,18.2,0.147,21,0
|
||||
6,107,88,0,0,36.8,0.727,31,0
|
||||
0,189,104,25,0,34.3,0.435,41,1
|
||||
2,83,66,23,50,32.2,0.497,22,0
|
||||
4,117,64,27,120,33.2,0.230,24,0
|
||||
8,108,70,0,0,30.5,0.955,33,1
|
||||
4,117,62,12,0,29.7,0.380,30,1
|
||||
0,180,78,63,14,59.4,2.420,25,1
|
||||
1,100,72,12,70,25.3,0.658,28,0
|
||||
0,95,80,45,92,36.5,0.330,26,0
|
||||
0,104,64,37,64,33.6,0.510,22,1
|
||||
0,120,74,18,63,30.5,0.285,26,0
|
||||
1,82,64,13,95,21.2,0.415,23,0
|
||||
2,134,70,0,0,28.9,0.542,23,1
|
||||
0,91,68,32,210,39.9,0.381,25,0
|
||||
2,119,0,0,0,19.6,0.832,72,0
|
||||
2,100,54,28,105,37.8,0.498,24,0
|
||||
14,175,62,30,0,33.6,0.212,38,1
|
||||
1,135,54,0,0,26.7,0.687,62,0
|
||||
5,86,68,28,71,30.2,0.364,24,0
|
||||
10,148,84,48,237,37.6,1.001,51,1
|
||||
9,134,74,33,60,25.9,0.460,81,0
|
||||
9,120,72,22,56,20.8,0.733,48,0
|
||||
1,71,62,0,0,21.8,0.416,26,0
|
||||
8,74,70,40,49,35.3,0.705,39,0
|
||||
5,88,78,30,0,27.6,0.258,37,0
|
||||
10,115,98,0,0,24.0,1.022,34,0
|
||||
0,124,56,13,105,21.8,0.452,21,0
|
||||
0,74,52,10,36,27.8,0.269,22,0
|
||||
0,97,64,36,100,36.8,0.600,25,0
|
||||
8,120,0,0,0,30.0,0.183,38,1
|
||||
6,154,78,41,140,46.1,0.571,27,0
|
||||
1,144,82,40,0,41.3,0.607,28,0
|
||||
0,137,70,38,0,33.2,0.170,22,0
|
||||
0,119,66,27,0,38.8,0.259,22,0
|
||||
7,136,90,0,0,29.9,0.210,50,0
|
||||
4,114,64,0,0,28.9,0.126,24,0
|
||||
0,137,84,27,0,27.3,0.231,59,0
|
||||
2,105,80,45,191,33.7,0.711,29,1
|
||||
7,114,76,17,110,23.8,0.466,31,0
|
||||
8,126,74,38,75,25.9,0.162,39,0
|
||||
4,132,86,31,0,28.0,0.419,63,0
|
||||
3,158,70,30,328,35.5,0.344,35,1
|
||||
0,123,88,37,0,35.2,0.197,29,0
|
||||
4,85,58,22,49,27.8,0.306,28,0
|
||||
0,84,82,31,125,38.2,0.233,23,0
|
||||
0,145,0,0,0,44.2,0.630,31,1
|
||||
0,135,68,42,250,42.3,0.365,24,1
|
||||
1,139,62,41,480,40.7,0.536,21,0
|
||||
0,173,78,32,265,46.5,1.159,58,0
|
||||
4,99,72,17,0,25.6,0.294,28,0
|
||||
8,194,80,0,0,26.1,0.551,67,0
|
||||
2,83,65,28,66,36.8,0.629,24,0
|
||||
2,89,90,30,0,33.5,0.292,42,0
|
||||
4,99,68,38,0,32.8,0.145,33,0
|
||||
4,125,70,18,122,28.9,1.144,45,1
|
||||
3,80,0,0,0,0.0,0.174,22,0
|
||||
6,166,74,0,0,26.6,0.304,66,0
|
||||
5,110,68,0,0,26.0,0.292,30,0
|
||||
2,81,72,15,76,30.1,0.547,25,0
|
||||
7,195,70,33,145,25.1,0.163,55,1
|
||||
6,154,74,32,193,29.3,0.839,39,0
|
||||
2,117,90,19,71,25.2,0.313,21,0
|
||||
3,84,72,32,0,37.2,0.267,28,0
|
||||
6,0,68,41,0,39.0,0.727,41,1
|
||||
7,94,64,25,79,33.3,0.738,41,0
|
||||
3,96,78,39,0,37.3,0.238,40,0
|
||||
10,75,82,0,0,33.3,0.263,38,0
|
||||
0,180,90,26,90,36.5,0.314,35,1
|
||||
1,130,60,23,170,28.6,0.692,21,0
|
||||
2,84,50,23,76,30.4,0.968,21,0
|
||||
8,120,78,0,0,25.0,0.409,64,0
|
||||
12,84,72,31,0,29.7,0.297,46,1
|
||||
0,139,62,17,210,22.1,0.207,21,0
|
||||
9,91,68,0,0,24.2,0.200,58,0
|
||||
2,91,62,0,0,27.3,0.525,22,0
|
||||
3,99,54,19,86,25.6,0.154,24,0
|
||||
3,163,70,18,105,31.6,0.268,28,1
|
||||
9,145,88,34,165,30.3,0.771,53,1
|
||||
7,125,86,0,0,37.6,0.304,51,0
|
||||
13,76,60,0,0,32.8,0.180,41,0
|
||||
6,129,90,7,326,19.6,0.582,60,0
|
||||
2,68,70,32,66,25.0,0.187,25,0
|
||||
3,124,80,33,130,33.2,0.305,26,0
|
||||
6,114,0,0,0,0.0,0.189,26,0
|
||||
9,130,70,0,0,34.2,0.652,45,1
|
||||
3,125,58,0,0,31.6,0.151,24,0
|
||||
3,87,60,18,0,21.8,0.444,21,0
|
||||
1,97,64,19,82,18.2,0.299,21,0
|
||||
3,116,74,15,105,26.3,0.107,24,0
|
||||
0,117,66,31,188,30.8,0.493,22,0
|
||||
0,111,65,0,0,24.6,0.660,31,0
|
||||
2,122,60,18,106,29.8,0.717,22,0
|
||||
0,107,76,0,0,45.3,0.686,24,0
|
||||
1,86,66,52,65,41.3,0.917,29,0
|
||||
6,91,0,0,0,29.8,0.501,31,0
|
||||
1,77,56,30,56,33.3,1.251,24,0
|
||||
4,132,0,0,0,32.9,0.302,23,1
|
||||
0,105,90,0,0,29.6,0.197,46,0
|
||||
0,57,60,0,0,21.7,0.735,67,0
|
||||
0,127,80,37,210,36.3,0.804,23,0
|
||||
3,129,92,49,155,36.4,0.968,32,1
|
||||
8,100,74,40,215,39.4,0.661,43,1
|
||||
3,128,72,25,190,32.4,0.549,27,1
|
||||
10,90,85,32,0,34.9,0.825,56,1
|
||||
4,84,90,23,56,39.5,0.159,25,0
|
||||
1,88,78,29,76,32.0,0.365,29,0
|
||||
8,186,90,35,225,34.5,0.423,37,1
|
||||
5,187,76,27,207,43.6,1.034,53,1
|
||||
4,131,68,21,166,33.1,0.160,28,0
|
||||
1,164,82,43,67,32.8,0.341,50,0
|
||||
4,189,110,31,0,28.5,0.680,37,0
|
||||
1,116,70,28,0,27.4,0.204,21,0
|
||||
3,84,68,30,106,31.9,0.591,25,0
|
||||
6,114,88,0,0,27.8,0.247,66,0
|
||||
1,88,62,24,44,29.9,0.422,23,0
|
||||
1,84,64,23,115,36.9,0.471,28,0
|
||||
7,124,70,33,215,25.5,0.161,37,0
|
||||
1,97,70,40,0,38.1,0.218,30,0
|
||||
8,110,76,0,0,27.8,0.237,58,0
|
||||
11,103,68,40,0,46.2,0.126,42,0
|
||||
11,85,74,0,0,30.1,0.300,35,0
|
||||
6,125,76,0,0,33.8,0.121,54,1
|
||||
0,198,66,32,274,41.3,0.502,28,1
|
||||
1,87,68,34,77,37.6,0.401,24,0
|
||||
6,99,60,19,54,26.9,0.497,32,0
|
||||
0,91,80,0,0,32.4,0.601,27,0
|
||||
2,95,54,14,88,26.1,0.748,22,0
|
||||
1,99,72,30,18,38.6,0.412,21,0
|
||||
6,92,62,32,126,32.0,0.085,46,0
|
||||
4,154,72,29,126,31.3,0.338,37,0
|
||||
0,121,66,30,165,34.3,0.203,33,1
|
||||
3,78,70,0,0,32.5,0.270,39,0
|
||||
2,130,96,0,0,22.6,0.268,21,0
|
||||
3,111,58,31,44,29.5,0.430,22,0
|
||||
2,98,60,17,120,34.7,0.198,22,0
|
||||
1,143,86,30,330,30.1,0.892,23,0
|
||||
1,119,44,47,63,35.5,0.280,25,0
|
||||
6,108,44,20,130,24.0,0.813,35,0
|
||||
2,118,80,0,0,42.9,0.693,21,1
|
||||
10,133,68,0,0,27.0,0.245,36,0
|
||||
2,197,70,99,0,34.7,0.575,62,1
|
||||
0,151,90,46,0,42.1,0.371,21,1
|
||||
6,109,60,27,0,25.0,0.206,27,0
|
||||
12,121,78,17,0,26.5,0.259,62,0
|
||||
8,100,76,0,0,38.7,0.190,42,0
|
||||
8,124,76,24,600,28.7,0.687,52,1
|
||||
1,93,56,11,0,22.5,0.417,22,0
|
||||
8,143,66,0,0,34.9,0.129,41,1
|
||||
6,103,66,0,0,24.3,0.249,29,0
|
||||
3,176,86,27,156,33.3,1.154,52,1
|
||||
0,73,0,0,0,21.1,0.342,25,0
|
||||
11,111,84,40,0,46.8,0.925,45,1
|
||||
2,112,78,50,140,39.4,0.175,24,0
|
||||
3,132,80,0,0,34.4,0.402,44,1
|
||||
2,82,52,22,115,28.5,1.699,25,0
|
||||
6,123,72,45,230,33.6,0.733,34,0
|
||||
0,188,82,14,185,32.0,0.682,22,1
|
||||
0,67,76,0,0,45.3,0.194,46,0
|
||||
1,89,24,19,25,27.8,0.559,21,0
|
||||
1,173,74,0,0,36.8,0.088,38,1
|
||||
1,109,38,18,120,23.1,0.407,26,0
|
||||
1,108,88,19,0,27.1,0.400,24,0
|
||||
6,96,0,0,0,23.7,0.190,28,0
|
||||
1,124,74,36,0,27.8,0.100,30,0
|
||||
7,150,78,29,126,35.2,0.692,54,1
|
||||
4,183,0,0,0,28.4,0.212,36,1
|
||||
1,124,60,32,0,35.8,0.514,21,0
|
||||
1,181,78,42,293,40.0,1.258,22,1
|
||||
1,92,62,25,41,19.5,0.482,25,0
|
||||
0,152,82,39,272,41.5,0.270,27,0
|
||||
1,111,62,13,182,24.0,0.138,23,0
|
||||
3,106,54,21,158,30.9,0.292,24,0
|
||||
3,174,58,22,194,32.9,0.593,36,1
|
||||
7,168,88,42,321,38.2,0.787,40,1
|
||||
6,105,80,28,0,32.5,0.878,26,0
|
||||
11,138,74,26,144,36.1,0.557,50,1
|
||||
3,106,72,0,0,25.8,0.207,27,0
|
||||
6,117,96,0,0,28.7,0.157,30,0
|
||||
2,68,62,13,15,20.1,0.257,23,0
|
||||
9,112,82,24,0,28.2,1.282,50,1
|
||||
0,119,0,0,0,32.4,0.141,24,1
|
||||
2,112,86,42,160,38.4,0.246,28,0
|
||||
2,92,76,20,0,24.2,1.698,28,0
|
||||
6,183,94,0,0,40.8,1.461,45,0
|
||||
0,94,70,27,115,43.5,0.347,21,0
|
||||
2,108,64,0,0,30.8,0.158,21,0
|
||||
4,90,88,47,54,37.7,0.362,29,0
|
||||
0,125,68,0,0,24.7,0.206,21,0
|
||||
0,132,78,0,0,32.4,0.393,21,0
|
||||
5,128,80,0,0,34.6,0.144,45,0
|
||||
4,94,65,22,0,24.7,0.148,21,0
|
||||
7,114,64,0,0,27.4,0.732,34,1
|
||||
0,102,78,40,90,34.5,0.238,24,0
|
||||
2,111,60,0,0,26.2,0.343,23,0
|
||||
1,128,82,17,183,27.5,0.115,22,0
|
||||
10,92,62,0,0,25.9,0.167,31,0
|
||||
13,104,72,0,0,31.2,0.465,38,1
|
||||
5,104,74,0,0,28.8,0.153,48,0
|
||||
2,94,76,18,66,31.6,0.649,23,0
|
||||
7,97,76,32,91,40.9,0.871,32,1
|
||||
1,100,74,12,46,19.5,0.149,28,0
|
||||
0,102,86,17,105,29.3,0.695,27,0
|
||||
4,128,70,0,0,34.3,0.303,24,0
|
||||
6,147,80,0,0,29.5,0.178,50,1
|
||||
4,90,0,0,0,28.0,0.610,31,0
|
||||
3,103,72,30,152,27.6,0.730,27,0
|
||||
2,157,74,35,440,39.4,0.134,30,0
|
||||
1,167,74,17,144,23.4,0.447,33,1
|
||||
0,179,50,36,159,37.8,0.455,22,1
|
||||
11,136,84,35,130,28.3,0.260,42,1
|
||||
0,107,60,25,0,26.4,0.133,23,0
|
||||
1,91,54,25,100,25.2,0.234,23,0
|
||||
1,117,60,23,106,33.8,0.466,27,0
|
||||
5,123,74,40,77,34.1,0.269,28,0
|
||||
2,120,54,0,0,26.8,0.455,27,0
|
||||
1,106,70,28,135,34.2,0.142,22,0
|
||||
2,155,52,27,540,38.7,0.240,25,1
|
||||
2,101,58,35,90,21.8,0.155,22,0
|
||||
1,120,80,48,200,38.9,1.162,41,0
|
||||
11,127,106,0,0,39.0,0.190,51,0
|
||||
3,80,82,31,70,34.2,1.292,27,1
|
||||
10,162,84,0,0,27.7,0.182,54,0
|
||||
1,199,76,43,0,42.9,1.394,22,1
|
||||
8,167,106,46,231,37.6,0.165,43,1
|
||||
9,145,80,46,130,37.9,0.637,40,1
|
||||
6,115,60,39,0,33.7,0.245,40,1
|
||||
1,112,80,45,132,34.8,0.217,24,0
|
||||
4,145,82,18,0,32.5,0.235,70,1
|
||||
10,111,70,27,0,27.5,0.141,40,1
|
||||
6,98,58,33,190,34.0,0.430,43,0
|
||||
9,154,78,30,100,30.9,0.164,45,0
|
||||
6,165,68,26,168,33.6,0.631,49,0
|
||||
1,99,58,10,0,25.4,0.551,21,0
|
||||
10,68,106,23,49,35.5,0.285,47,0
|
||||
3,123,100,35,240,57.3,0.880,22,0
|
||||
8,91,82,0,0,35.6,0.587,68,0
|
||||
6,195,70,0,0,30.9,0.328,31,1
|
||||
9,156,86,0,0,24.8,0.230,53,1
|
||||
0,93,60,0,0,35.3,0.263,25,0
|
||||
3,121,52,0,0,36.0,0.127,25,1
|
||||
2,101,58,17,265,24.2,0.614,23,0
|
||||
2,56,56,28,45,24.2,0.332,22,0
|
||||
0,162,76,36,0,49.6,0.364,26,1
|
||||
0,95,64,39,105,44.6,0.366,22,0
|
||||
4,125,80,0,0,32.3,0.536,27,1
|
||||
5,136,82,0,0,0.0,0.640,69,0
|
||||
2,129,74,26,205,33.2,0.591,25,0
|
||||
3,130,64,0,0,23.1,0.314,22,0
|
||||
1,107,50,19,0,28.3,0.181,29,0
|
||||
1,140,74,26,180,24.1,0.828,23,0
|
||||
1,144,82,46,180,46.1,0.335,46,1
|
||||
8,107,80,0,0,24.6,0.856,34,0
|
||||
13,158,114,0,0,42.3,0.257,44,1
|
||||
2,121,70,32,95,39.1,0.886,23,0
|
||||
7,129,68,49,125,38.5,0.439,43,1
|
||||
2,90,60,0,0,23.5,0.191,25,0
|
||||
7,142,90,24,480,30.4,0.128,43,1
|
||||
3,169,74,19,125,29.9,0.268,31,1
|
||||
0,99,0,0,0,25.0,0.253,22,0
|
||||
4,127,88,11,155,34.5,0.598,28,0
|
||||
4,118,70,0,0,44.5,0.904,26,0
|
||||
2,122,76,27,200,35.9,0.483,26,0
|
||||
6,125,78,31,0,27.6,0.565,49,1
|
||||
1,168,88,29,0,35.0,0.905,52,1
|
||||
2,129,0,0,0,38.5,0.304,41,0
|
||||
4,110,76,20,100,28.4,0.118,27,0
|
||||
6,80,80,36,0,39.8,0.177,28,0
|
||||
10,115,0,0,0,0.0,0.261,30,1
|
||||
2,127,46,21,335,34.4,0.176,22,0
|
||||
9,164,78,0,0,32.8,0.148,45,1
|
||||
2,93,64,32,160,38.0,0.674,23,1
|
||||
3,158,64,13,387,31.2,0.295,24,0
|
||||
5,126,78,27,22,29.6,0.439,40,0
|
||||
10,129,62,36,0,41.2,0.441,38,1
|
||||
0,134,58,20,291,26.4,0.352,21,0
|
||||
3,102,74,0,0,29.5,0.121,32,0
|
||||
7,187,50,33,392,33.9,0.826,34,1
|
||||
3,173,78,39,185,33.8,0.970,31,1
|
||||
10,94,72,18,0,23.1,0.595,56,0
|
||||
1,108,60,46,178,35.5,0.415,24,0
|
||||
5,97,76,27,0,35.6,0.378,52,1
|
||||
4,83,86,19,0,29.3,0.317,34,0
|
||||
1,114,66,36,200,38.1,0.289,21,0
|
||||
1,149,68,29,127,29.3,0.349,42,1
|
||||
5,117,86,30,105,39.1,0.251,42,0
|
||||
1,111,94,0,0,32.8,0.265,45,0
|
||||
4,112,78,40,0,39.4,0.236,38,0
|
||||
1,116,78,29,180,36.1,0.496,25,0
|
||||
0,141,84,26,0,32.4,0.433,22,0
|
||||
2,175,88,0,0,22.9,0.326,22,0
|
||||
2,92,52,0,0,30.1,0.141,22,0
|
||||
3,130,78,23,79,28.4,0.323,34,1
|
||||
8,120,86,0,0,28.4,0.259,22,1
|
||||
2,174,88,37,120,44.5,0.646,24,1
|
||||
2,106,56,27,165,29.0,0.426,22,0
|
||||
2,105,75,0,0,23.3,0.560,53,0
|
||||
4,95,60,32,0,35.4,0.284,28,0
|
||||
0,126,86,27,120,27.4,0.515,21,0
|
||||
8,65,72,23,0,32.0,0.600,42,0
|
||||
2,99,60,17,160,36.6,0.453,21,0
|
||||
1,102,74,0,0,39.5,0.293,42,1
|
||||
11,120,80,37,150,42.3,0.785,48,1
|
||||
3,102,44,20,94,30.8,0.400,26,0
|
||||
1,109,58,18,116,28.5,0.219,22,0
|
||||
9,140,94,0,0,32.7,0.734,45,1
|
||||
13,153,88,37,140,40.6,1.174,39,0
|
||||
12,100,84,33,105,30.0,0.488,46,0
|
||||
1,147,94,41,0,49.3,0.358,27,1
|
||||
1,81,74,41,57,46.3,1.096,32,0
|
||||
3,187,70,22,200,36.4,0.408,36,1
|
||||
6,162,62,0,0,24.3,0.178,50,1
|
||||
4,136,70,0,0,31.2,1.182,22,1
|
||||
1,121,78,39,74,39.0,0.261,28,0
|
||||
3,108,62,24,0,26.0,0.223,25,0
|
||||
0,181,88,44,510,43.3,0.222,26,1
|
||||
8,154,78,32,0,32.4,0.443,45,1
|
||||
1,128,88,39,110,36.5,1.057,37,1
|
||||
7,137,90,41,0,32.0,0.391,39,0
|
||||
0,123,72,0,0,36.3,0.258,52,1
|
||||
1,106,76,0,0,37.5,0.197,26,0
|
||||
6,190,92,0,0,35.5,0.278,66,1
|
||||
2,88,58,26,16,28.4,0.766,22,0
|
||||
9,170,74,31,0,44.0,0.403,43,1
|
||||
9,89,62,0,0,22.5,0.142,33,0
|
||||
10,101,76,48,180,32.9,0.171,63,0
|
||||
2,122,70,27,0,36.8,0.340,27,0
|
||||
5,121,72,23,112,26.2,0.245,30,0
|
||||
1,126,60,0,0,30.1,0.349,47,1
|
||||
1,93,70,31,0,30.4,0.315,23,0
|
||||
892
data/titanic.csv
Normal file
@@ -0,0 +1,892 @@
|
||||
PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
|
||||
1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
|
||||
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C
|
||||
3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S
|
||||
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S
|
||||
5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S
|
||||
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
|
||||
7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S
|
||||
8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S
|
||||
9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S
|
||||
10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C
|
||||
11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S
|
||||
12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S
|
||||
13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S
|
||||
14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S
|
||||
15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S
|
||||
16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S
|
||||
17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q
|
||||
18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S
|
||||
19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S
|
||||
20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C
|
||||
21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S
|
||||
22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S
|
||||
23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q
|
||||
24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S
|
||||
25,0,3,"Palsson, Miss. Torborg Danira",female,8,3,1,349909,21.075,,S
|
||||
26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S
|
||||
27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C
|
||||
28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S
|
||||
29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q
|
||||
30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S
|
||||
31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C
|
||||
32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C
|
||||
33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q
|
||||
34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S
|
||||
35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C
|
||||
36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S
|
||||
37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C
|
||||
38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S
|
||||
39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S
|
||||
40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C
|
||||
41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S
|
||||
42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S
|
||||
43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C
|
||||
44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C
|
||||
45,1,3,"Devaney, Miss. Margaret Delia",female,19,0,0,330958,7.8792,,Q
|
||||
46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S
|
||||
47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q
|
||||
48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q
|
||||
49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C
|
||||
50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S
|
||||
51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S
|
||||
52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S
|
||||
53,1,1,"Harper, Mrs. Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C
|
||||
54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S
|
||||
55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C
|
||||
56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S
|
||||
57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S
|
||||
58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C
|
||||
59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S
|
||||
60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S
|
||||
61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C
|
||||
62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28,
|
||||
63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S
|
||||
64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S
|
||||
65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C
|
||||
66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C
|
||||
67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S
|
||||
68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S
|
||||
69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S
|
||||
70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S
|
||||
71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S
|
||||
72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S
|
||||
73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S
|
||||
74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C
|
||||
75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S
|
||||
76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S
|
||||
77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S
|
||||
78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S
|
||||
79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S
|
||||
80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S
|
||||
81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S
|
||||
82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S
|
||||
83,1,3,"McDermott, Miss. Brigdet Delia",female,,0,0,330932,7.7875,,Q
|
||||
84,0,1,"Carrau, Mr. Francisco M",male,28,0,0,113059,47.1,,S
|
||||
85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S
|
||||
86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S
|
||||
87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S
|
||||
88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S
|
||||
89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S
|
||||
90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S
|
||||
91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S
|
||||
92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S
|
||||
93,0,1,"Chaffee, Mr. Herbert Fuller",male,46,1,0,W.E.P. 5734,61.175,E31,S
|
||||
94,0,3,"Dean, Mr. Bertram Frank",male,26,1,2,C.A. 2315,20.575,,S
|
||||
95,0,3,"Coxon, Mr. Daniel",male,59,0,0,364500,7.25,,S
|
||||
96,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.05,,S
|
||||
97,0,1,"Goldschmidt, Mr. George B",male,71,0,0,PC 17754,34.6542,A5,C
|
||||
98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C
|
||||
99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S
|
||||
100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S
|
||||
101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S
|
||||
102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S
|
||||
103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S
|
||||
104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S
|
||||
105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S
|
||||
106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S
|
||||
107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S
|
||||
108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S
|
||||
109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S
|
||||
110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q
|
||||
111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S
|
||||
112,0,3,"Zabour, Miss. Hileni",female,14.5,1,0,2665,14.4542,,C
|
||||
113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S
|
||||
114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S
|
||||
115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C
|
||||
116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S
|
||||
117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q
|
||||
118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S
|
||||
119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C
|
||||
120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S
|
||||
121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S
|
||||
122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S
|
||||
123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C
|
||||
124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S
|
||||
125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S
|
||||
126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C
|
||||
127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q
|
||||
128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S
|
||||
129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C
|
||||
130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S
|
||||
131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C
|
||||
132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S
|
||||
133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47,1,0,A/5. 3337,14.5,,S
|
||||
134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S
|
||||
135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S
|
||||
136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C
|
||||
137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S
|
||||
138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S
|
||||
139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S
|
||||
140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C
|
||||
141,0,3,"Boulos, Mrs. Joseph (Sultana)",female,,0,2,2678,15.2458,,C
|
||||
142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S
|
||||
143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S
|
||||
144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q
|
||||
145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S
|
||||
146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S
|
||||
147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S
|
||||
148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S
|
||||
149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S
|
||||
150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S
|
||||
151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S
|
||||
152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22,1,0,113776,66.6,C2,S
|
||||
153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.05,,S
|
||||
154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S
|
||||
155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S
|
||||
156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C
|
||||
157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q
|
||||
158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S
|
||||
159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S
|
||||
160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S
|
||||
161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S
|
||||
162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S
|
||||
163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S
|
||||
164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S
|
||||
165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S
|
||||
166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S
|
||||
167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S
|
||||
168,0,3,"Skoog, Mrs. William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S
|
||||
169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S
|
||||
170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S
|
||||
171,0,1,"Van der hoef, Mr. Wyckoff",male,61,0,0,111240,33.5,B19,S
|
||||
172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q
|
||||
173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S
|
||||
174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S
|
||||
175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C
|
||||
176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S
|
||||
177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S
|
||||
178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C
|
||||
179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S
|
||||
180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S
|
||||
181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S
|
||||
182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C
|
||||
183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S
|
||||
184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S
|
||||
185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S
|
||||
186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S
|
||||
187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q
|
||||
188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S
|
||||
189,0,3,"Bourke, Mr. John",male,40,1,1,364849,15.5,,Q
|
||||
190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S
|
||||
191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S
|
||||
192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S
|
||||
193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S
|
||||
194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S
|
||||
195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C
|
||||
196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C
|
||||
197,0,3,"Mernagh, Mr. Robert",male,,0,0,368703,7.75,,Q
|
||||
198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S
|
||||
199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q
|
||||
200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S
|
||||
201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S
|
||||
202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S
|
||||
203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S
|
||||
204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C
|
||||
205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S
|
||||
206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S
|
||||
207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S
|
||||
208,1,3,"Albimona, Mr. Nassef Cassem",male,26,0,0,2699,18.7875,,C
|
||||
209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q
|
||||
210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C
|
||||
211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S
|
||||
212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S
|
||||
213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S
|
||||
214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S
|
||||
215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q
|
||||
216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C
|
||||
217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S
|
||||
218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S
|
||||
219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C
|
||||
220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S
|
||||
221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S
|
||||
222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S
|
||||
223,0,3,"Green, Mr. George Henry",male,51,0,0,21440,8.05,,S
|
||||
224,0,3,"Nenkoff, Mr. Christo",male,,0,0,349234,7.8958,,S
|
||||
225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38,1,0,19943,90,C93,S
|
||||
226,0,3,"Berglund, Mr. Karl Ivar Sven",male,22,0,0,PP 4348,9.35,,S
|
||||
227,1,2,"Mellors, Mr. William John",male,19,0,0,SW/PP 751,10.5,,S
|
||||
228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S
|
||||
229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S
|
||||
230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S
|
||||
231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S
|
||||
232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S
|
||||
233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S
|
||||
234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S
|
||||
235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S
|
||||
236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S
|
||||
237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S
|
||||
238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S
|
||||
239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S
|
||||
240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S
|
||||
241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C
|
||||
242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q
|
||||
243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S
|
||||
244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S
|
||||
245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C
|
||||
246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q
|
||||
247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S
|
||||
248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S
|
||||
249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S
|
||||
250,0,2,"Carter, Rev. Ernest Courtenay",male,54,1,0,244252,26,,S
|
||||
251,0,3,"Reed, Mr. James George",male,,0,0,362316,7.25,,S
|
||||
252,0,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29,1,1,347054,10.4625,G6,S
|
||||
253,0,1,"Stead, Mr. William Thomas",male,62,0,0,113514,26.55,C87,S
|
||||
254,0,3,"Lobb, Mr. William Arthur",male,30,1,0,A/5. 3336,16.1,,S
|
||||
255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S
|
||||
256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C
|
||||
257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C
|
||||
258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S
|
||||
259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C
|
||||
260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S
|
||||
261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q
|
||||
262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S
|
||||
263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S
|
||||
264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S
|
||||
265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q
|
||||
266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S
|
||||
267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S
|
||||
268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S
|
||||
269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S
|
||||
270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S
|
||||
271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S
|
||||
272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S
|
||||
273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S
|
||||
274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C
|
||||
275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q
|
||||
276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S
|
||||
277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S
|
||||
278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S
|
||||
279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q
|
||||
280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S
|
||||
281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q
|
||||
282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S
|
||||
283,0,3,"de Pelsmaeker, Mr. Alfons",male,16,0,0,345778,9.5,,S
|
||||
284,1,3,"Dorking, Mr. Edward Arthur",male,19,0,0,A/5. 10482,8.05,,S
|
||||
285,0,1,"Smith, Mr. Richard William",male,,0,0,113056,26,A19,S
|
||||
286,0,3,"Stankovic, Mr. Ivan",male,33,0,0,349239,8.6625,,C
|
||||
287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S
|
||||
288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S
|
||||
289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S
|
||||
290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q
|
||||
291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S
|
||||
292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C
|
||||
293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C
|
||||
294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S
|
||||
295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S
|
||||
296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C
|
||||
297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C
|
||||
298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S
|
||||
299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S
|
||||
300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C
|
||||
301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q
|
||||
302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q
|
||||
303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S
|
||||
304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q
|
||||
305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S
|
||||
306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S
|
||||
307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C
|
||||
308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C
|
||||
309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C
|
||||
310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C
|
||||
311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C
|
||||
312,1,1,"Ryerson, Miss. Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C
|
||||
313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S
|
||||
314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S
|
||||
315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S
|
||||
316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S
|
||||
317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S
|
||||
318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S
|
||||
319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S
|
||||
320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C
|
||||
321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S
|
||||
322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S
|
||||
323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q
|
||||
324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S
|
||||
325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S
|
||||
326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C
|
||||
327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S
|
||||
328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S
|
||||
329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S
|
||||
330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C
|
||||
331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q
|
||||
332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S
|
||||
333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S
|
||||
334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S
|
||||
335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S
|
||||
336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S
|
||||
337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S
|
||||
338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C
|
||||
339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S
|
||||
340,0,1,"Blackwell, Mr. Stephen Weart",male,45,0,0,113784,35.5,T,S
|
||||
341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S
|
||||
342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S
|
||||
343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S
|
||||
344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S
|
||||
345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S
|
||||
346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S
|
||||
347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S
|
||||
348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S
|
||||
349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S
|
||||
350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S
|
||||
351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S
|
||||
352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S
|
||||
353,0,3,"Elias, Mr. Tannous",male,15,1,1,2695,7.2292,,C
|
||||
354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S
|
||||
355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C
|
||||
356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S
|
||||
357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S
|
||||
358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S
|
||||
359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q
|
||||
360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q
|
||||
361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S
|
||||
362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C
|
||||
363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C
|
||||
364,0,3,"Asim, Mr. Adola",male,35,0,0,SOTON/O.Q. 3101310,7.05,,S
|
||||
365,0,3,"O'Brien, Mr. Thomas",male,,1,0,370365,15.5,,Q
|
||||
366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S
|
||||
367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C
|
||||
368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C
|
||||
369,1,3,"Jermyn, Miss. Annie",female,,0,0,14313,7.75,,Q
|
||||
370,1,1,"Aubart, Mme. Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C
|
||||
371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C
|
||||
372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S
|
||||
373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S
|
||||
374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C
|
||||
375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S
|
||||
376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C
|
||||
377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S
|
||||
378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C
|
||||
379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C
|
||||
380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S
|
||||
381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C
|
||||
382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C
|
||||
383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S
|
||||
384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S
|
||||
385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S
|
||||
386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S
|
||||
387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S
|
||||
388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S
|
||||
389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q
|
||||
390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C
|
||||
391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S
|
||||
392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S
|
||||
393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S
|
||||
394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C
|
||||
395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S
|
||||
396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S
|
||||
397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S
|
||||
398,0,2,"McKane, Mr. Peter David",male,46,0,0,28403,26,,S
|
||||
399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S
|
||||
400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S
|
||||
401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S
|
||||
402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S
|
||||
403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S
|
||||
404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S
|
||||
405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S
|
||||
406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S
|
||||
407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S
|
||||
408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S
|
||||
409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S
|
||||
410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S
|
||||
411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S
|
||||
412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q
|
||||
413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q
|
||||
414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S
|
||||
415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S
|
||||
416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S
|
||||
417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S
|
||||
418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S
|
||||
419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S
|
||||
420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S
|
||||
421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C
|
||||
422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q
|
||||
423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S
|
||||
424,0,3,"Danbom, Mrs. Ernst Gilbert (Anna Sigrid Maria Brogren)",female,28,1,1,347080,14.4,,S
|
||||
425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S
|
||||
426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S
|
||||
427,1,2,"Clarke, Mrs. Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S
|
||||
428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S
|
||||
429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q
|
||||
430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S
|
||||
431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S
|
||||
432,1,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",female,,1,0,376564,16.1,,S
|
||||
433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S
|
||||
434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S
|
||||
435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S
|
||||
436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S
|
||||
437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S
|
||||
438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S
|
||||
439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S
|
||||
440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S
|
||||
441,1,2,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",female,45,1,1,F.C.C. 13529,26.25,,S
|
||||
442,0,3,"Hampe, Mr. Leon",male,20,0,0,345769,9.5,,S
|
||||
443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S
|
||||
444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S
|
||||
445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S
|
||||
446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S
|
||||
447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S
|
||||
448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S
|
||||
449,1,3,"Baclini, Miss. Marie Catherine",female,5,2,1,2666,19.2583,,C
|
||||
450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S
|
||||
451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S
|
||||
452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S
|
||||
453,0,1,"Foreman, Mr. Benjamin Laventall",male,30,0,0,113051,27.75,C111,C
|
||||
454,1,1,"Goldenberg, Mr. Samuel L",male,49,1,0,17453,89.1042,C92,C
|
||||
455,0,3,"Peduzzi, Mr. Joseph",male,,0,0,A/5 2817,8.05,,S
|
||||
456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C
|
||||
457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S
|
||||
458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S
|
||||
459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S
|
||||
460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q
|
||||
461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S
|
||||
462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S
|
||||
463,0,1,"Gee, Mr. Arthur H",male,47,0,0,111320,38.5,E63,S
|
||||
464,0,2,"Milling, Mr. Jacob Christian",male,48,0,0,234360,13,,S
|
||||
465,0,3,"Maisner, Mr. Simon",male,,0,0,A/S 2816,8.05,,S
|
||||
466,0,3,"Goncalves, Mr. Manuel Estanslas",male,38,0,0,SOTON/O.Q. 3101306,7.05,,S
|
||||
467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S
|
||||
468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S
|
||||
469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q
|
||||
470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C
|
||||
471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S
|
||||
472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S
|
||||
473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S
|
||||
474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C
|
||||
475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S
|
||||
476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S
|
||||
477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S
|
||||
478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S
|
||||
479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S
|
||||
480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S
|
||||
481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S
|
||||
482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S
|
||||
483,0,3,"Rouse, Mr. Richard Henry",male,50,0,0,A/5 3594,8.05,,S
|
||||
484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S
|
||||
485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C
|
||||
486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S
|
||||
487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S
|
||||
488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C
|
||||
489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S
|
||||
490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S
|
||||
491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S
|
||||
492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S
|
||||
493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S
|
||||
494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C
|
||||
495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S
|
||||
496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C
|
||||
497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C
|
||||
498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S
|
||||
499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S
|
||||
500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S
|
||||
501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S
|
||||
502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q
|
||||
503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q
|
||||
504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S
|
||||
505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S
|
||||
506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C
|
||||
507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S
|
||||
508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S
|
||||
509,0,3,"Olsen, Mr. Henry Margido",male,28,0,0,C 4001,22.525,,S
|
||||
510,1,3,"Lang, Mr. Fang",male,26,0,0,1601,56.4958,,S
|
||||
511,1,3,"Daly, Mr. Eugene Patrick",male,29,0,0,382651,7.75,,Q
|
||||
512,0,3,"Webber, Mr. James",male,,0,0,SOTON/OQ 3101316,8.05,,S
|
||||
513,1,1,"McGough, Mr. James Robert",male,36,0,0,PC 17473,26.2875,E25,S
|
||||
514,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",female,54,1,0,PC 17603,59.4,,C
|
||||
515,0,3,"Coleff, Mr. Satio",male,24,0,0,349209,7.4958,,S
|
||||
516,0,1,"Walker, Mr. William Anderson",male,47,0,0,36967,34.0208,D46,S
|
||||
517,1,2,"Lemore, Mrs. (Amelia Milley)",female,34,0,0,C.A. 34260,10.5,F33,S
|
||||
518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q
|
||||
519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S
|
||||
520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S
|
||||
521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S
|
||||
522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S
|
||||
523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C
|
||||
524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C
|
||||
525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C
|
||||
526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q
|
||||
527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S
|
||||
528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S
|
||||
529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S
|
||||
530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S
|
||||
531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S
|
||||
532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C
|
||||
533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C
|
||||
534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C
|
||||
535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S
|
||||
536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S
|
||||
537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S
|
||||
538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C
|
||||
539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S
|
||||
540,1,1,"Frolicher, Miss. Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C
|
||||
541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S
|
||||
542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S
|
||||
543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11,4,2,347082,31.275,,S
|
||||
544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S
|
||||
545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C
|
||||
546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S
|
||||
547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S
|
||||
548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C
|
||||
549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S
|
||||
550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S
|
||||
551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C
|
||||
552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S
|
||||
553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q
|
||||
554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C
|
||||
555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S
|
||||
556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S
|
||||
557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C
|
||||
558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C
|
||||
559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39,1,1,110413,79.65,E67,S
|
||||
560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36,1,0,345572,17.4,,S
|
||||
561,0,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.75,,Q
|
||||
562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S
|
||||
563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S
|
||||
564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S
|
||||
565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S
|
||||
566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S
|
||||
567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S
|
||||
568,0,3,"Palsson, Mrs. Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S
|
||||
569,0,3,"Doharr, Mr. Tannous",male,,0,0,2686,7.2292,,C
|
||||
570,1,3,"Jonsson, Mr. Carl",male,32,0,0,350417,7.8542,,S
|
||||
571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S
|
||||
572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S
|
||||
573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S
|
||||
574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q
|
||||
575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 20589,8.05,,S
|
||||
576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S
|
||||
577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S
|
||||
578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S
|
||||
579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C
|
||||
580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S
|
||||
581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S
|
||||
582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C
|
||||
583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S
|
||||
584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C
|
||||
585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C
|
||||
586,1,1,"Taussig, Miss. Ruth",female,18,0,2,110413,79.65,E68,S
|
||||
587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S
|
||||
588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C
|
||||
589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S
|
||||
590,0,3,"Murdlin, Mr. Joseph",male,,0,0,A./5. 3235,8.05,,S
|
||||
591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S
|
||||
592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C
|
||||
593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S
|
||||
594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q
|
||||
595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S
|
||||
596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S
|
||||
597,1,2,"Leitch, Miss. Jessie Wills",female,,0,0,248727,33,,S
|
||||
598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S
|
||||
599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C
|
||||
600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C
|
||||
601,1,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)",female,24,2,1,243847,27,,S
|
||||
602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.8958,,S
|
||||
603,0,1,"Harrington, Mr. Charles H",male,,0,0,113796,42.4,,S
|
||||
604,0,3,"Torber, Mr. Ernst William",male,44,0,0,364511,8.05,,S
|
||||
605,1,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35,0,0,111426,26.55,,C
|
||||
606,0,3,"Lindell, Mr. Edvard Bengtsson",male,36,1,0,349910,15.55,,S
|
||||
607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S
|
||||
608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S
|
||||
609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C
|
||||
610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S
|
||||
611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S
|
||||
612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S
|
||||
613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q
|
||||
614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q
|
||||
615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S
|
||||
616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S
|
||||
617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S
|
||||
618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S
|
||||
619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S
|
||||
620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S
|
||||
621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C
|
||||
622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S
|
||||
623,1,3,"Nakid, Mr. Sahid",male,20,1,1,2653,15.7417,,C
|
||||
624,0,3,"Hansen, Mr. Henry Damsgaard",male,21,0,0,350029,7.8542,,S
|
||||
625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S
|
||||
626,0,1,"Sutton, Mr. Frederick",male,61,0,0,36963,32.3208,D50,S
|
||||
627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q
|
||||
628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S
|
||||
629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S
|
||||
630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q
|
||||
631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S
|
||||
632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S
|
||||
633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C
|
||||
634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S
|
||||
635,0,3,"Skoog, Miss. Mabel",female,9,3,2,347088,27.9,,S
|
||||
636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S
|
||||
637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S
|
||||
638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S
|
||||
639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S
|
||||
640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S
|
||||
641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S
|
||||
642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C
|
||||
643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S
|
||||
644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S
|
||||
645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C
|
||||
646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C
|
||||
647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S
|
||||
648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C
|
||||
649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S
|
||||
650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S
|
||||
651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S
|
||||
652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S
|
||||
653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S
|
||||
654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q
|
||||
655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q
|
||||
656,0,2,"Hickman, Mr. Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S
|
||||
657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S
|
||||
658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q
|
||||
659,0,2,"Eitemiller, Mr. George Floyd",male,23,0,0,29751,13,,S
|
||||
660,0,1,"Newell, Mr. Arthur Webster",male,58,0,2,35273,113.275,D48,C
|
||||
661,1,1,"Frauenthal, Dr. Henry William",male,50,2,0,PC 17611,133.65,,S
|
||||
662,0,3,"Badt, Mr. Mohamed",male,40,0,0,2623,7.225,,C
|
||||
663,0,1,"Colley, Mr. Edward Pomeroy",male,47,0,0,5727,25.5875,E58,S
|
||||
664,0,3,"Coleff, Mr. Peju",male,36,0,0,349210,7.4958,,S
|
||||
665,1,3,"Lindqvist, Mr. Eino William",male,20,1,0,STON/O 2. 3101285,7.925,,S
|
||||
666,0,2,"Hickman, Mr. Lewis",male,32,2,0,S.O.C. 14879,73.5,,S
|
||||
667,0,2,"Butler, Mr. Reginald Fenton",male,25,0,0,234686,13,,S
|
||||
668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S
|
||||
669,0,3,"Cook, Mr. Jacob",male,43,0,0,A/5 3536,8.05,,S
|
||||
670,1,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",female,,1,0,19996,52,C126,S
|
||||
671,1,2,"Brown, Mrs. Thomas William Solomon (Elizabeth Catherine Ford)",female,40,1,1,29750,39,,S
|
||||
672,0,1,"Davidson, Mr. Thornton",male,31,1,0,F.C. 12750,52,B71,S
|
||||
673,0,2,"Mitchell, Mr. Henry Michael",male,70,0,0,C.A. 24580,10.5,,S
|
||||
674,1,2,"Wilhelms, Mr. Charles",male,31,0,0,244270,13,,S
|
||||
675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S
|
||||
676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S
|
||||
677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S
|
||||
678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S
|
||||
679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S
|
||||
680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C
|
||||
681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q
|
||||
682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C
|
||||
683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S
|
||||
684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S
|
||||
685,0,2,"Brown, Mr. Thomas William Solomon",male,60,1,1,29750,39,,S
|
||||
686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25,1,2,SC/Paris 2123,41.5792,,C
|
||||
687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S
|
||||
688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S
|
||||
689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S
|
||||
690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S
|
||||
691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S
|
||||
692,1,3,"Karun, Miss. Manca",female,4,0,1,349256,13.4167,,C
|
||||
693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S
|
||||
694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C
|
||||
695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S
|
||||
696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S
|
||||
697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S
|
||||
698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q
|
||||
699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C
|
||||
700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S
|
||||
701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C
|
||||
702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S
|
||||
703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C
|
||||
704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q
|
||||
705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S
|
||||
706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S
|
||||
707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S
|
||||
708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S
|
||||
709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S
|
||||
710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C
|
||||
711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C
|
||||
712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S
|
||||
713,1,1,"Taylor, Mr. Elmer Zebley",male,48,1,0,19996,52,C126,S
|
||||
714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S
|
||||
715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S
|
||||
716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S
|
||||
717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C
|
||||
718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S
|
||||
719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q
|
||||
720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S
|
||||
721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S
|
||||
722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S
|
||||
723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S
|
||||
724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S
|
||||
725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S
|
||||
726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S
|
||||
727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S
|
||||
728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q
|
||||
729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S
|
||||
730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S
|
||||
731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S
|
||||
732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C
|
||||
733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S
|
||||
734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S
|
||||
735,0,2,"Troupiansky, Mr. Moses Aaron",male,23,0,0,233639,13,,S
|
||||
736,0,3,"Williams, Mr. Leslie",male,28.5,0,0,54636,16.1,,S
|
||||
737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48,1,3,W./C. 6608,34.375,,S
|
||||
738,1,1,"Lesurer, Mr. Gustave J",male,35,0,0,PC 17755,512.3292,B101,C
|
||||
739,0,3,"Ivanoff, Mr. Kanio",male,,0,0,349201,7.8958,,S
|
||||
740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S
|
||||
741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S
|
||||
742,0,1,"Cavendish, Mr. Tyrell William",male,36,1,0,19877,78.85,C46,S
|
||||
743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C
|
||||
744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S
|
||||
745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S
|
||||
746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S
|
||||
747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S
|
||||
748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S
|
||||
749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S
|
||||
750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q
|
||||
751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S
|
||||
752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S
|
||||
753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S
|
||||
754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S
|
||||
755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S
|
||||
756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S
|
||||
757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S
|
||||
758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S
|
||||
759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S
|
||||
760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)",female,33,0,0,110152,86.5,B77,S
|
||||
761,0,3,"Garfirth, Mr. John",male,,0,0,358585,14.5,,S
|
||||
762,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41,0,0,SOTON/O2 3101272,7.125,,S
|
||||
763,1,3,"Barah, Mr. Hanna Assi",male,20,0,0,2663,7.2292,,C
|
||||
764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",female,36,1,2,113760,120,B96 B98,S
|
||||
765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S
|
||||
766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S
|
||||
767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C
|
||||
768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q
|
||||
769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q
|
||||
770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S
|
||||
771,0,3,"Lievens, Mr. Rene Aime",male,24,0,0,345781,9.5,,S
|
||||
772,0,3,"Jensen, Mr. Niels Peder",male,48,0,0,350047,7.8542,,S
|
||||
773,0,2,"Mack, Mrs. (Mary)",female,57,0,0,S.O./P.P. 3,10.5,E77,S
|
||||
774,0,3,"Elias, Mr. Dibo",male,,0,0,2674,7.225,,C
|
||||
775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S
|
||||
776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S
|
||||
777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q
|
||||
778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S
|
||||
779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q
|
||||
780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S
|
||||
781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C
|
||||
782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S
|
||||
783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S
|
||||
784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S
|
||||
785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S
|
||||
786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S
|
||||
787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S
|
||||
788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q
|
||||
789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S
|
||||
790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C
|
||||
791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q
|
||||
792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S
|
||||
793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S
|
||||
794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C
|
||||
795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S
|
||||
796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S
|
||||
797,1,1,"Leader, Dr. Alice (Farnham)",female,49,0,0,17465,25.9292,D17,S
|
||||
798,1,3,"Osman, Mrs. Mara",female,31,0,0,349244,8.6833,,S
|
||||
799,0,3,"Ibrahim Shawah, Mr. Yousseff",male,30,0,0,2685,7.2292,,C
|
||||
800,0,3,"Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S
|
||||
801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S
|
||||
802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S
|
||||
803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S
|
||||
804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C
|
||||
805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S
|
||||
806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S
|
||||
807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S
|
||||
808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S
|
||||
809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S
|
||||
810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S
|
||||
811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S
|
||||
812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S
|
||||
813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S
|
||||
814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S
|
||||
815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S
|
||||
816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S
|
||||
817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S
|
||||
818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C
|
||||
819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S
|
||||
820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S
|
||||
821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S
|
||||
822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S
|
||||
823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S
|
||||
824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S
|
||||
825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S
|
||||
826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q
|
||||
827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S
|
||||
828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C
|
||||
829,1,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q
|
||||
830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28,
|
||||
831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C
|
||||
832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S
|
||||
833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C
|
||||
834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S
|
||||
835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S
|
||||
836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C
|
||||
837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S
|
||||
838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S
|
||||
839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S
|
||||
840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C
|
||||
841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S
|
||||
842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S
|
||||
843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C
|
||||
844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C
|
||||
845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S
|
||||
846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S
|
||||
847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S
|
||||
848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C
|
||||
849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S
|
||||
850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C
|
||||
851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S
|
||||
852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S
|
||||
853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C
|
||||
854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S
|
||||
855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S
|
||||
856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S
|
||||
857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S
|
||||
858,1,1,"Daly, Mr. Peter Denis ",male,51,0,0,113055,26.55,E17,S
|
||||
859,1,3,"Baclini, Mrs. Solomon (Latifa Qurban)",female,24,0,3,2666,19.2583,,C
|
||||
860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
|
||||
861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S
|
||||
862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S
|
||||
863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S
|
||||
864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S
|
||||
865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S
|
||||
866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S
|
||||
867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C
|
||||
868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S
|
||||
869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S
|
||||
870,1,3,"Johnson, Master. Harold Theodor",male,4,1,1,347742,11.1333,,S
|
||||
871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S
|
||||
872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S
|
||||
873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S
|
||||
874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S
|
||||
875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C
|
||||
876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C
|
||||
877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S
|
||||
878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S
|
||||
879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S
|
||||
880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C
|
||||
881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S
|
||||
882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S
|
||||
883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S
|
||||
884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S
|
||||
885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S
|
||||
886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39,0,5,382652,29.125,,Q
|
||||
887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S
|
||||
888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S
|
||||
889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
|
||||
890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C
|
||||
891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q
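The rows above are the Titanic passenger records added as data/titanic.csv (891 rows with the standard fields PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin, Embarked). A minimal loading sketch, assuming the repository root as the working directory; nothing here is part of the committed files:

import pandas as pd

# read the raw Titanic data shipped with the repo (relative path is an assumption)
data = pd.read_csv('./data/titanic.csv')
print(data.shape)            # (891, 12)
print(data.isnull().sum())   # Age, Cabin and Embarked contain missing values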
207
data_exploration/explore.py
Normal file
@@ -0,0 +1,207 @@
|
||||
#import pandas as pd
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
import matplotlib.pyplot as plt
|
||||
import os
|
||||
plt.style.use('seaborn-colorblind')
|
||||
|
||||
# 2018.11.07 Created by Eamon.Zhang
|
||||
|
||||
|
||||
def get_dtypes(data,drop_col=[]):
|
||||
"""Return the dtypes for each column of a pandas Dataframe
|
||||
|
||||
Parameters
|
||||
----------
|
||||
data : pandas Dataframe
|
||||
|
||||
drop_col : list of column names to omit
|
||||
|
||||
Returns
|
||||
-------
|
||||
str_var_list, num_var_list, all_var_list
|
||||
|
||||
"""
|
||||
|
||||
name_of_col = list(data.columns)
|
||||
num_var_list = []
|
||||
str_var_list = []
|
||||
all_var_list = []
|
||||
|
||||
str_var_list = name_of_col.copy()
|
||||
for var in name_of_col:
|
||||
# check if column belongs to numeric type (np.int / np.float aliases are deprecated in newer NumPy)
if np.issubdtype(data[var].dtype, np.number):
|
||||
str_var_list.remove(var)
|
||||
num_var_list.append(var)
|
||||
# drop the omit column from list
|
||||
for var in drop_col:
|
||||
if var in str_var_list:
|
||||
str_var_list.remove(var)
|
||||
if var in num_var_list:
|
||||
num_var_list.remove(var)
|
||||
|
||||
all_var_list.extend(str_var_list)
|
||||
all_var_list.extend(num_var_list)
|
||||
return str_var_list, num_var_list, all_var_list
|
||||
|
||||
|
||||
def describe(data,output_path=None):
|
||||
"""output the general description of a pandas Dataframe
|
||||
into a csv file
|
||||
|
||||
"""
|
||||
|
||||
result = data.describe(include='all')
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'describe.csv')
|
||||
result.to_csv(output)
|
||||
print('result saved at:', str(output))
|
||||
return result
|
||||
|
||||
|
||||
def discrete_var_barplot(x,y,data,output_path=None):
|
||||
"""draw the barplot of a discrete variable x against y(target variable).
|
||||
By default the bar shows the mean value of y.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
plt.figure(figsize=(15,10))
|
||||
sns.barplot(x=x,y=y,data=data)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Barplot_'+str(x)+'_'+str(y)+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at', str(output))
|
||||
|
||||
|
||||
def discrete_var_countplot(x,data,output_path=None):
|
||||
"""draw the countplot of a discrete variable x.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
plt.figure(figsize=(15,10))
|
||||
sns.countplot(x=x,data=data)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Countplot_'+str(x)+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
|
||||
|
||||
def discrete_var_boxplot(x,y,data,output_path=None):
|
||||
"""draw the boxplot of a discrete variable x against y.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
plt.figure(figsize=(15,10))
|
||||
sns.boxplot(x=x,y=y,data=data)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Boxplot_'+str(x)+'_'+str(y)+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
|
||||
|
||||
def continuous_var_distplot(x,output_path=None,bins=None):
|
||||
"""draw the distplot of a continuous variable x.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
plt.figure(figsize=(15,10))
|
||||
sns.distplot(a=x,kde=False,bins=bins)
|
||||
if output_path is not None:
|
||||
output=os.path.join(output_path,'Distplot_'+str(x.name)+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
|
||||
|
||||
# 2018.11.28 Created by Eamon.Zhang
|
||||
|
||||
def scatter_plot(x,y,data,output_path=None):
|
||||
"""draw the scatter-plot of two variables.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
plt.figure(figsize=(15,10))
|
||||
sns.scatterplot(x=x,y=y,data=data)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Scatter_plot_'+str(x.name)+'_'+str(y.name)+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
|
||||
|
||||
def correlation_plot(data,output_path=None):
|
||||
"""draw the correlation plot between variables.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
corrmat = data.corr()
|
||||
fig, ax = plt.subplots()
|
||||
fig.set_size_inches(11,11)
|
||||
sns.heatmap(corrmat,cmap="YlGnBu",linewidths=.5,annot=True)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Corr_plot'+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
|
||||
|
||||
def heatmap(data,output_path=None,fmt='d'):
|
||||
"""draw the heatmap between 2 variables.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
|
||||
Returns
|
||||
-------
|
||||
figure saved as PNG
|
||||
"""
|
||||
|
||||
fig, ax = plt.subplots()
|
||||
fig.set_size_inches(11,11)
|
||||
sns.heatmap(data,cmap="YlGnBu",linewidths=.5,annot=True,fmt=fmt)
|
||||
if output_path is not None:
|
||||
output = os.path.join(output_path,'Heatmap'+'.png')
|
||||
plt.savefig(output)
|
||||
print('Image saved at',str(output))
|
||||
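A minimal usage sketch for the exploration helpers defined above. It assumes the repository root is on the Python path (so data_exploration is importable) and that an ./output/ directory already exists; both are assumptions, not part of the file:

import pandas as pd
from data_exploration import explore

data = pd.read_csv('./data/titanic.csv')

# split columns into string vs. numeric name lists, skipping the identifier column
str_vars, num_vars, all_vars = explore.get_dtypes(data, drop_col=['PassengerId'])

# save describe() output and a few plots; './output/' is an illustrative path
explore.describe(data, output_path='./output/')
explore.discrete_var_barplot(x='Sex', y='Survived', data=data, output_path='./output/')
explore.discrete_var_boxplot(x='Pclass', y='Fare', data=data, output_path='./output/')
explore.continuous_var_distplot(x=data['Fare'], output_path='./output/', bins=50)
explore.correlation_plot(data[num_vars], output_path='./output/')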
122
feature_cleaning/missing_data.py
Normal file
@@ -0,0 +1,122 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from warnings import warn
|
||||
|
||||
# 2018.11.07 Created by Eamon.Zhang
|
||||
|
||||
|
||||
def check_missing(data,output_path=None):
|
||||
"""
|
||||
check the total number & percentage of missing values
|
||||
per variable of a pandas Dataframe
|
||||
"""
|
||||
|
||||
result = pd.concat([data.isnull().sum(),data.isnull().mean()],axis=1)
|
||||
result = result.rename(index=str,columns={0:'total missing',1:'proportion'})
|
||||
if output_path is not None:
|
||||
result.to_csv(output_path+'missing.csv')
|
||||
print('result saved at:', output_path+'missing.csv')
|
||||
return result
|
||||
|
||||
|
||||
def drop_missing(data,axis=0):
|
||||
"""
|
||||
Listwise deletion:
|
||||
excluding all cases (listwise) that have missing values
|
||||
|
||||
Parameters
|
||||
----------
|
||||
axis: drop rows (0) or columns (1), default 0
|
||||
|
||||
Returns
|
||||
-------
|
||||
Pandas dataframe with missing cases/columns dropped
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
data_copy = data_copy.dropna(axis=axis,inplace=False)
|
||||
return data_copy
|
||||
|
||||
|
||||
def add_var_denote_NA(data,NA_col=[]):
|
||||
"""
|
||||
creating an additional variable indicating whether the data
|
||||
was missing for that observation (1) or not (0).
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in NA_col:
|
||||
if data_copy[i].isnull().sum()>0:
|
||||
data_copy[i+'_is_NA'] = np.where(data_copy[i].isnull(),1,0)
|
||||
else:
|
||||
warn("Column %s has no missing cases" % i)
|
||||
|
||||
return data_copy
|
||||
|
||||
|
||||
def impute_NA_with_arbitrary(data,impute_value,NA_col=[]):
|
||||
"""
|
||||
replacing NA with arbitrary values.
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in NA_col:
|
||||
if data_copy[i].isnull().sum()>0:
|
||||
data_copy[i+'_'+str(impute_value)] = data_copy[i].fillna(impute_value)
|
||||
else:
|
||||
warn("Column %s has no missing cases" % i)
|
||||
return data_copy
|
||||
|
||||
|
||||
def impute_NA_with_avg(data,strategy='mean',NA_col=[]):
|
||||
"""
|
||||
replacing the NA with the mean/median/most frequent value of that variable.
Note: the statistic should be computed on the training set only and then applied to the test set.
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in NA_col:
|
||||
if data_copy[i].isnull().sum()>0:
|
||||
if strategy=='mean':
|
||||
data_copy[i+'_impute_mean'] = data_copy[i].fillna(data[i].mean())
|
||||
elif strategy=='median':
|
||||
data_copy[i+'_impute_median'] = data_copy[i].fillna(data[i].median())
|
||||
elif strategy=='mode':
|
||||
data_copy[i+'_impute_mode'] = data_copy[i].fillna(data[i].mode()[0])
|
||||
else:
|
||||
warn("Column %s has no missing" % i)
|
||||
return data_copy
|
||||
|
||||
|
||||
def impute_NA_with_end_of_distribution(data,NA_col=[]):
|
||||
"""
|
||||
replacing the NA with a value at the far end of the variable's distribution,
computed as mean + 3*std.
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in NA_col:
|
||||
if data_copy[i].isnull().sum()>0:
|
||||
data_copy[i+'_impute_end_of_distri'] = data_copy[i].fillna(data[i].mean()+3*data[i].std())
|
||||
else:
|
||||
warn("Column %s has no missing" % i)
|
||||
return data_copy
|
||||
|
||||
|
||||
def impute_NA_with_random(data,NA_col=[],random_state=0):
|
||||
"""
|
||||
replacing the NA with random sampling from the pool of available observations of the variable
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in NA_col:
|
||||
if data_copy[i].isnull().sum()>0:
|
||||
data_copy[i+'_random'] = data_copy[i]
|
||||
# extract the random sample to fill the na
|
||||
random_sample = data_copy[i].dropna().sample(data_copy[i].isnull().sum(), random_state=random_state)
|
||||
random_sample.index = data_copy[data_copy[i].isnull()].index
|
||||
data_copy.loc[data_copy[i].isnull(), str(i)+'_random'] = random_sample
|
||||
else:
|
||||
warn("Column %s has no missing" % i)
|
||||
return data_copy
|
||||
|
||||
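A short usage sketch for the missing-data helpers above, applied to the Titanic Age column; the chosen column and strategies are illustrative, not prescriptive:

import pandas as pd
from feature_cleaning import missing_data as ms

data = pd.read_csv('./data/titanic.csv')   # path is an assumption

print(ms.check_missing(data))                                           # NA count & proportion per column
data = ms.add_var_denote_NA(data, NA_col=['Age'])                       # adds an Age_is_NA indicator
data = ms.impute_NA_with_avg(data, strategy='median', NA_col=['Age'])   # adds Age_impute_median
data = ms.impute_NA_with_end_of_distribution(data, NA_col=['Age'])      # adds Age_impute_end_of_distri
data = ms.impute_NA_with_random(data, NA_col=['Age'], random_state=0)   # adds Age_random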
138
feature_cleaning/outlier.py
Normal file
@@ -0,0 +1,138 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
# from warnings import warn
|
||||
|
||||
# 2018.11.07 Created by Eamon.Zhang
|
||||
|
||||
def outlier_detect_arbitrary(data,col,upper_fence,lower_fence):
|
||||
'''
|
||||
identify outliers based on arbitrary boundaries passed to the function.
|
||||
'''
|
||||
|
||||
para = (upper_fence, lower_fence)
|
||||
tmp = pd.concat([data[col]>upper_fence,data[col]<lower_fence],axis=1)
|
||||
outlier_index = tmp.any(axis=1)
|
||||
print('Num of outlier detected:',outlier_index.sum())
print('Proportion of outlier detected:',outlier_index.mean())
|
||||
return outlier_index, para
|
||||
|
||||
|
||||
|
||||
def outlier_detect_IQR(data,col,threshold=3):
|
||||
'''
outlier detection by the Interquartile Range (IQR) rule, also known as Tukey's fences.
calculate the IQR (75th quantile - 25th quantile). Any value beyond
upper bound = 75th quantile + (IQR * threshold)
lower bound = 25th quantile - (IQR * threshold)
is regarded as an outlier. Default threshold is 3.
'''
|
||||
|
||||
IQR = data[col].quantile(0.75) - data[col].quantile(0.25)
|
||||
Lower_fence = data[col].quantile(0.25) - (IQR * threshold)
|
||||
Upper_fence = data[col].quantile(0.75) + (IQR * threshold)
|
||||
para = (Upper_fence, Lower_fence)
|
||||
tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
|
||||
outlier_index = tmp.any(axis=1)
|
||||
print('Num of outlier detected:',outlier_index.sum())
print('Proportion of outlier detected:',outlier_index.mean())
|
||||
return outlier_index, para
|
||||
|
||||
|
||||
def outlier_detect_mean_std(data,col,threshold=3):
|
||||
'''
|
||||
outlier detection by Mean and Standard Deviation Method.
|
||||
If a value is a certain number (the threshold) of standard deviations away
|
||||
from the mean, that data point is identified as an outlier.
|
||||
Default threshold is 3.
|
||||
|
||||
This method can fail to detect outliers because the outliers increase the standard deviation.
|
||||
The more extreme the outlier, the more the standard deviation is affected.
|
||||
'''
|
||||
|
||||
Upper_fence = data[col].mean() + threshold * data[col].std()
|
||||
Lower_fence = data[col].mean() - threshold * data[col].std()
|
||||
para = (Upper_fence, Lower_fence)
|
||||
tmp = pd.concat([data[col]>Upper_fence,data[col]<Lower_fence],axis=1)
|
||||
outlier_index = tmp.any(axis=1)
|
||||
print('Num of outlier detected:',outlier_index.sum())
print('Proportion of outlier detected:',outlier_index.mean())
|
||||
return outlier_index, para
|
||||
|
||||
|
||||
def outlier_detect_MAD(data,col,threshold=3.5):
|
||||
"""
|
||||
outlier detection by Median and Median Absolute Deviation Method (MAD)
|
||||
The median of the residuals is calculated. Then, the difference is calculated between each historical value and this median.
|
||||
These differences are expressed as their absolute values, and a new median is calculated and multiplied by
|
||||
an empirically derived constant to yield the median absolute deviation (MAD).
|
||||
If a value is a certain number of MAD away from the median of the residuals,
|
||||
that value is classified as an outlier. The default threshold is 3 MAD.
|
||||
|
||||
This method is generally more effective than the mean and standard deviation method for detecting outliers,
|
||||
but it can be too aggressive in classifying values that are not really extremely different.
|
||||
Also, if more than 50% of the data points have the same value, MAD is computed to be 0,
|
||||
so any value different from the residual median is classified as an outlier.
|
||||
"""
|
||||
|
||||
median = data[col].median()
|
||||
median_absolute_deviation = np.median([np.abs(y - median) for y in data[col]])
|
||||
modified_z_scores = pd.Series([0.6745 * (y - median) / median_absolute_deviation for y in data[col]])
|
||||
outlier_index = np.abs(modified_z_scores) > threshold
|
||||
print('Num of outlier detected:',outlier_index.sum())
print('Proportion of outlier detected:',outlier_index.mean())
|
||||
return outlier_index
|
||||
|
||||
|
||||
# 2018.11.10 outlier treatment
|
||||
def impute_outlier_with_arbitrary(data,outlier_index,value,col=[]):
|
||||
"""
|
||||
impute outliers with arbitrary value
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
for i in col:
|
||||
data_copy.loc[outlier_index,i] = value
|
||||
return data_copy
|
||||
|
||||
|
||||
def windsorization(data,col,para,strategy='both'):
|
||||
"""
|
||||
top-coding & bottom coding (capping the maximum of a distribution at an arbitrarily set value,vice versa)
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
if strategy == 'both':
|
||||
data_copy.loc[data_copy[col]>para[0],col] = para[0]
|
||||
data_copy.loc[data_copy[col]<para[1],col] = para[1]
|
||||
elif strategy == 'top':
|
||||
data_copy.loc[data_copy[col]>para[0],col] = para[0]
|
||||
elif strategy == 'bottom':
|
||||
data_copy.loc[data_copy[col]<para[1],col] = para[1]
|
||||
return data_copy
|
||||
|
||||
|
||||
def drop_outlier(data,outlier_index):
|
||||
"""
|
||||
drop the cases that are outliers
|
||||
"""
|
||||
|
||||
data_copy = data[~outlier_index]
|
||||
return data_copy
|
||||
|
||||
|
||||
def impute_outlier_with_avg(data,col,outlier_index,strategy='mean'):
|
||||
"""
|
||||
impute outlier with mean/median/most frequent values of that variable.
|
||||
"""
|
||||
|
||||
data_copy = data.copy(deep=True)
|
||||
if strategy=='mean':
|
||||
data_copy.loc[outlier_index,col] = data_copy[col].mean()
|
||||
elif strategy=='median':
|
||||
data_copy.loc[outlier_index,col] = data_copy[col].median()
|
||||
elif strategy=='mode':
|
||||
data_copy.loc[outlier_index,col] = data_copy[col].mode()[0]
|
||||
|
||||
return data_copy
|
||||
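A short usage sketch for the outlier helpers above, flagging and then treating extreme Fare values; the column and thresholds are illustrative:

import pandas as pd
from feature_cleaning import outlier

data = pd.read_csv('./data/titanic.csv')   # path is an assumption

# flag Fare outliers with the IQR rule, then cap them at the computed fences
index, para = outlier.outlier_detect_IQR(data, col='Fare', threshold=3)
capped = outlier.windsorization(data, col='Fare', para=para, strategy='both')

# alternatively, drop the flagged rows or overwrite them with the median
dropped = outlier.drop_outlier(data, index)
imputed = outlier.impute_outlier_with_avg(data, col='Fare', outlier_index=index, strategy='median')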
247
feature_cleaning/rare_values.py
Normal file
@@ -0,0 +1,247 @@
|
||||
import pandas as pd
|
||||
# import numpy as np
|
||||
# from warnings import warn
|
||||
|
||||
# 2018.11.07 Created by Eamon.Zhang
|
||||
# 2018.11.12 change into fit() transform() format
|
||||
|
||||
class GroupingRareValues():
|
||||
"""
|
||||
Grouping the observations that show rare labels into a unique category ('rare')
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, mapping=None, cols=None, threshold=0.01):
|
||||
self.cols = cols
|
||||
self.mapping = mapping
|
||||
self._dim = None
|
||||
self.threshold = threshold
|
||||
|
||||
|
||||
def fit(self, X, y=None, **kwargs):
|
||||
"""Fit encoder according to X and y.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Training vectors, where n_samples is the number of samples
|
||||
and n_features is the number of features.
|
||||
y : array-like, shape = [n_samples]
|
||||
Target values.
|
||||
Returns
|
||||
-------
|
||||
self : encoder
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
self._dim = X.shape[1]
|
||||
|
||||
_, categories = self.grouping(
|
||||
X,
|
||||
mapping=self.mapping,
|
||||
cols=self.cols,
|
||||
threshold=self.threshold
|
||||
)
|
||||
self.mapping = categories
|
||||
return self
|
||||
|
||||
|
||||
def transform(self, X):
|
||||
"""Perform the transformation to new categorical data.
|
||||
Will use the mapping (if available) and the column list to encode the
|
||||
data.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Returns
|
||||
-------
|
||||
X : Transformed values with encoding applied.
|
||||
"""
|
||||
|
||||
if self._dim is None:
|
||||
raise ValueError('Must train encoder before it can be used to transform data.')
|
||||
|
||||
# make sure that it is the right size
|
||||
if X.shape[1] != self._dim:
|
||||
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
|
||||
|
||||
X, _ = self.grouping(
|
||||
X,
|
||||
mapping=self.mapping,
|
||||
cols=self.cols,
|
||||
threshold=self.threshold
|
||||
)
|
||||
|
||||
return X
|
||||
|
||||
|
||||
def grouping(self, X_in, threshold, mapping=None, cols=None):
|
||||
"""
|
||||
Grouping the observations that show rare labels into a unique category ('rare')
|
||||
|
||||
"""
|
||||
|
||||
X = X_in.copy(deep=True)
|
||||
|
||||
# if cols is None:
|
||||
# cols = X.columns.values
|
||||
|
||||
if mapping is not None: # transform
|
||||
mapping_out = mapping
|
||||
for i in mapping:
|
||||
column = i.get('col') # get the column name
|
||||
X[column] = X[column].map(i['mapping'])
|
||||
|
||||
# try:
|
||||
# X[column] = X[column].astype(int)
|
||||
# except ValueError as e:
|
||||
# X[column] = X[column].astype(float)
|
||||
else: # fit
|
||||
mapping_out = []
|
||||
for col in cols:
|
||||
# if util.is_category(X[col].dtype):
|
||||
# categories = X[col].cat.categories
|
||||
# else:
|
||||
temp_df = pd.Series(X[col].value_counts()/len(X))
|
||||
mapping = { k: ('rare' if k not in temp_df[temp_df >= threshold].index else k)
|
||||
for k in temp_df.index}
|
||||
|
||||
mapping = pd.Series(mapping)
|
||||
mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, )
|
||||
|
||||
return X, mapping_out
|
||||
|
||||
|
||||
|
||||
#==============================================================================
|
||||
# def rare_imputation(X_train, X_test, variable):
|
||||
#
|
||||
# # find the most frequent category
|
||||
# frequent_cat = X_train.groupby(variable)[variable].count().sort_values().tail(1).index.values[0]
|
||||
#
|
||||
# # find rare labels
|
||||
# temp = X_train.groupby([variable])[variable].count()/np.float(len(X_train))
|
||||
# rare_cat = [x for x in temp.loc[temp<0.05].index.values]
|
||||
#
|
||||
# # create new variables, with Rare labels imputed
|
||||
#
|
||||
# # by the most frequent category
|
||||
# X_train[variable+'_freq_imp'] = np.where(X_train[variable].isin(rare_cat), frequent_cat, X_train[variable])
|
||||
# X_test[variable+'_freq_imp'] = np.where(X_test[variable].isin(rare_cat), frequent_cat, X_test[variable])
|
||||
#
|
||||
# # by adding a new label 'Rare'
|
||||
# X_train[variable+'_rare_imp'] = np.where(X_train[variable].isin(rare_cat), 'Rare', X_train[variable])
|
||||
# X_test[variable+'_rare_imp'] = np.where(X_test[variable].isin(rare_cat), 'Rare', X_test[variable])
|
||||
#==============================================================================
|
||||
|
||||
# 2018.11.26 created by Eamon.Zhang
|
||||
class ModeImputation():
|
||||
"""
|
||||
Replacing the rare label by most frequent label
|
||||
|
||||
Parameters
|
||||
----------
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, mapping=None, cols=None, threshold=0.01):
|
||||
self.cols = cols
|
||||
self.mapping = mapping
|
||||
self._dim = None
|
||||
self.threshold = threshold
|
||||
|
||||
|
||||
def fit(self, X, y=None, **kwargs):
|
||||
"""Fit encoder according to X and y.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Training vectors, where n_samples is the number of samples
|
||||
and n_features is the number of features.
|
||||
y : array-like, shape = [n_samples]
|
||||
Target values.
|
||||
Returns
|
||||
-------
|
||||
self : encoder
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
self._dim = X.shape[1]
|
||||
|
||||
_, categories = self.impute_with_mode(
|
||||
X,
|
||||
mapping=self.mapping,
|
||||
cols=self.cols,
|
||||
threshold=self.threshold
|
||||
)
|
||||
self.mapping = categories
|
||||
return self
|
||||
|
||||
|
||||
def transform(self, X):
|
||||
"""Perform the transformation to new categorical data.
|
||||
Will use the mapping (if available) and the column list to encode the
|
||||
data.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Returns
|
||||
-------
|
||||
X : Transformed values with encoding applied.
|
||||
"""
|
||||
|
||||
if self._dim is None:
|
||||
raise ValueError('Must train encoder before it can be used to transform data.')
|
||||
|
||||
# make sure that it is the right size
|
||||
if X.shape[1] != self._dim:
|
||||
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
|
||||
|
||||
X, _ = self.impute_with_mode(
|
||||
X,
|
||||
mapping=self.mapping,
|
||||
cols=self.cols,
|
||||
threshold=self.threshold
|
||||
)
|
||||
|
||||
return X
|
||||
|
||||
|
||||
def impute_with_mode(self, X_in, threshold, mapping=None, cols=None):
|
||||
"""
|
||||
Replacing the labels whose frequency falls below the threshold with the most frequent label (mode)
|
||||
|
||||
"""
|
||||
|
||||
X = X_in.copy(deep=True)
|
||||
|
||||
# if cols is None:
|
||||
# cols = X.columns.values
|
||||
|
||||
if mapping is not None: # transform
|
||||
mapping_out = mapping
|
||||
for i in mapping:
|
||||
column = i.get('col') # get the column name
|
||||
X[column] = X[column].map(i['mapping'])
|
||||
|
||||
# try:
|
||||
# X[column] = X[column].astype(int)
|
||||
# except ValueError as e:
|
||||
# X[column] = X[column].astype(float)
|
||||
else: # fit
|
||||
mapping_out = []
|
||||
for col in cols:
|
||||
# if util.is_category(X[col].dtype):
|
||||
# categories = X[col].cat.categories
|
||||
# else:
|
||||
temp_df = pd.Series(X[col].value_counts()/len(X))
|
||||
mode_label = X[col].mode()[0]  # most frequent label, used to replace rare labels
mapping = { k: (mode_label if k not in temp_df[temp_df >= threshold].index else k)
for k in temp_df.index}
|
||||
|
||||
mapping = pd.Series(mapping)
|
||||
mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype}, )
|
||||
|
||||
return X, mapping_out
|
||||
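A short usage sketch for the two rare-label encoders above; the SibSp column and the 1% threshold are illustrative choices, not part of the file:

import pandas as pd
from feature_cleaning import rare_values as ra

data = pd.read_csv('./data/titanic.csv')   # path is an assumption

# group labels occurring in less than 1% of rows into a single 'rare' category
grouper = ra.GroupingRareValues(cols=['SibSp'], threshold=0.01).fit(data)
grouped = grouper.transform(data)

# or replace those rare labels with the most frequent label instead
imputer = ra.ModeImputation(cols=['SibSp'], threshold=0.01).fit(data)
imputed = imputer.transform(data)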
329
feature_engineering/discretization.py
Normal file
@@ -0,0 +1,329 @@
|
||||
import pandas as pd
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
from sklearn.model_selection import cross_val_score
|
||||
import numpy as np
|
||||
|
||||
# from warnings import warn
|
||||
|
||||
# 2018.11.17 Created by Eamon.Zhang
|
||||
# ChiMerge method modified from https://github.com/tatsumiw/ChiMerge/blob/master/ChiMerge.py
# TODO: add more constraints to the discretized result.
|
||||
class ChiMerge():
|
||||
"""
|
||||
supervised discretization using the ChiMerge method.
|
||||
|
||||
|
||||
Parameters
|
||||
----------
|
||||
confidenceVal: number
|
||||
default=3.841, correspond to p=0.05 dof=1
|
||||
num_of_bins: int
|
||||
number of bins after discretize
|
||||
col: str
|
||||
the column to be performed
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, col=None, bins=None, confidenceVal=3.841, num_of_bins=10):
|
||||
self.col = col
|
||||
self._dim = None
|
||||
self.confidenceVal = confidenceVal
|
||||
self.bins = bins
|
||||
self.num_of_bins = num_of_bins
|
||||
|
||||
|
||||
def fit(self, X, y, **kwargs):
|
||||
"""Fit encoder according to X and y.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Training vectors, where n_samples is the number of samples
|
||||
and n_features is the number of features.
|
||||
y : array-like, shape = [n_samples]
|
||||
Target values.
|
||||
Returns
|
||||
-------
|
||||
self : encoder
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
self._dim = X.shape[1]
|
||||
|
||||
_, bins = self.chimerge(
|
||||
X_in=X,
|
||||
y=y,
|
||||
confidenceVal=self.confidenceVal,
|
||||
col=self.col,
|
||||
num_of_bins=self.num_of_bins
|
||||
)
|
||||
self.bins = bins
|
||||
return self
|
||||
|
||||
|
||||
def transform(self, X):
|
||||
"""Perform the transformation to new data.
|
||||
Will use the tree model and the column list to discretize the
|
||||
column.
|
||||
Parameters
|
||||
----------
|
||||
X : array-like, shape = [n_samples, n_features]
|
||||
Returns
|
||||
-------
|
||||
X : new dataframe with discretized new column.
|
||||
"""
|
||||
|
||||
if self._dim is None:
|
||||
raise ValueError('Must train encoder before it can be used to transform data.')
|
||||
|
||||
# make sure that it is the right size
|
||||
if X.shape[1] != self._dim:
|
||||
raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim,))
|
||||
|
||||
X, _ = self.chimerge(
|
||||
X_in=X,
|
||||
col=self.col,
|
||||
bins=self.bins
|
||||
)
|
||||
|
||||
return X
|
||||
|
||||
def chimerge(self, X_in, y=None, confidenceVal=None, num_of_bins=None, col=None, bins=None):
|
||||
"""
|
||||
discretize a variable using ChiMerge
|
||||
|
||||
"""
|
||||
|
||||
X = X_in.copy(deep=True)
|
||||
|
||||
if bins is not None: # transform
|
||||
try:
|
||||
X[col+'_chimerge'] = pd.cut(X[col],bins=bins,include_lowest=True)
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
else: # fit
|
||||
try:
|
||||
# create an array which save the num of 0/1 samples of the column to be chimerge
|
||||
total_num = X.groupby([col])[y].count()
|
||||
total_num = pd.DataFrame({'total_num': total_num})
|
||||
positive_class = X.groupby([col])[y].sum()
|
||||
positive_class = pd.DataFrame({'positive_class': positive_class})
|
||||
regroup = pd.merge(total_num, positive_class, left_index=True, right_index=True,how='inner')
|
||||
regroup.reset_index(inplace=True)
|
||||
regroup['negative_class'] = regroup['total_num'] - regroup['positive_class']
|
||||
regroup = regroup.drop('total_num', axis=1)
|
||||
np_regroup = np.array(regroup)
|
||||
# merge interval that have 0 pos/neg samples
|
||||
i = 0
|
||||
while (i <= np_regroup.shape[0] - 2):
|
||||
if ((np_regroup[i, 1] == 0 and np_regroup[i + 1, 1] == 0) or ( np_regroup[i, 2] == 0 and np_regroup[i + 1, 2] == 0)):
|
||||
np_regroup[i, 1] = np_regroup[i, 1] + np_regroup[i + 1, 1] # pos
|
||||
np_regroup[i, 2] = np_regroup[i, 2] + np_regroup[i + 1, 2] # neg
|
||||
np_regroup[i, 0] = np_regroup[i + 1, 0]
|
||||
np_regroup = np.delete(np_regroup, i + 1, 0)
|
||||
i = i - 1
|
||||
i = i + 1
|
||||
# calculate chi for neighboring intervals
|
||||
# ∑[(yA-yB)²/yB]
|
||||
chi_table = np.array([])
|
||||
for i in np.arange(np_regroup.shape[0] - 1):
|
||||
chi = (np_regroup[i, 1] * np_regroup[i + 1, 2] - np_regroup[i, 2] * np_regroup[i + 1, 1]) ** 2 \
|
||||
* (np_regroup[i, 1] + np_regroup[i, 2] + np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) / \
|
||||
((np_regroup[i, 1] + np_regroup[i, 2]) * (np_regroup[i + 1, 1] + np_regroup[i + 1, 2]) * (
|
||||
np_regroup[i, 1] + np_regroup[i + 1, 1]) * (np_regroup[i, 2] + np_regroup[i + 1, 2]))
|
||||
chi_table = np.append(chi_table, chi)
|
||||
# merge intervals that have closing chi
|
||||
while (1):
|
||||
if (len(chi_table) <= (num_of_bins - 1) and min(chi_table) >= confidenceVal):
|
||||
break
|
||||
chi_min_index = np.argwhere(chi_table == min(chi_table))[0]
|
||||
np_regroup[chi_min_index, 1] = np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]
|
||||
np_regroup[chi_min_index, 2] = np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]
|
||||
np_regroup[chi_min_index, 0] = np_regroup[chi_min_index + 1, 0]
|
||||
np_regroup = np.delete(np_regroup, chi_min_index + 1, 0)
|
||||
|
||||
if (chi_min_index == np_regroup.shape[0] - 1):
|
||||
chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
|
||||
* (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
|
||||
((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
|
||||
chi_table = np.delete(chi_table, chi_min_index, axis=0)
|
||||
|
||||
else:
|
||||
chi_table[chi_min_index - 1] = (np_regroup[chi_min_index - 1, 1] * np_regroup[chi_min_index, 2] - np_regroup[chi_min_index - 1, 2] * np_regroup[chi_min_index, 1]) ** 2 \
|
||||
* (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) / \
|
||||
((np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index - 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index - 1, 1] + np_regroup[chi_min_index, 1]) * (np_regroup[chi_min_index - 1, 2] + np_regroup[chi_min_index, 2]))
|
||||
chi_table[chi_min_index] = (np_regroup[chi_min_index, 1] * np_regroup[chi_min_index + 1, 2] - np_regroup[chi_min_index, 2] * np_regroup[chi_min_index + 1, 1]) ** 2 \
|
||||
* (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) / \
|
||||
((np_regroup[chi_min_index, 1] + np_regroup[chi_min_index, 2]) * (np_regroup[chi_min_index + 1, 1] + np_regroup[chi_min_index + 1, 2]) * (np_regroup[chi_min_index, 1] + np_regroup[chi_min_index + 1, 1]) * (np_regroup[chi_min_index, 2] + np_regroup[chi_min_index + 1, 2]))
|
||||
chi_table = np.delete(chi_table, chi_min_index + 1, axis=0)
|
||||
result_data = pd.DataFrame()
|
||||
result_data['variable'] = [col] * np_regroup.shape[0]
|
||||
bins = []
|
||||
tmp = []
|
||||
for i in np.arange(np_regroup.shape[0]):
|
||||
if i == 0:
|
||||
y = '-inf' + ',' + str(np_regroup[i, 0])
|
||||
#x = np_regroup[i, 0]
|
||||
#list_temp.append(x)
|
||||
elif i == np_regroup.shape[0] - 1:
|
||||
y = str(np_regroup[i - 1, 0]) + '+'
|
||||
#x = 100000000.
|
||||
#list_temp.append(x)
|
||||
else:
|
||||
y = str(np_regroup[i - 1, 0]) + ',' + str(np_regroup[i, 0])
|
||||
#x = np_regroup[i, 0]
|
||||
#list_temp.append(x)
|
||||
bins.append(np_regroup[i - 1, 0])
|
||||
tmp.append(y)
|
||||
|
||||
#list_temp.append(df[variable].max()+0.1)
|
||||
bins.append(X[col].min()-0.1)
|
||||
|
||||
result_data['interval'] = tmp
|
||||
result_data['flag_0'] = np_regroup[:, 2]
|
||||
result_data['flag_1'] = np_regroup[:, 1]
|
||||
bins.sort(reverse=False)
|
||||
print('Interval for variable %s' % col)
|
||||
print(result_data)
|
||||
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
return X, bins
|
||||
|
||||
|
||||
|
||||
|
||||
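A brief usage sketch for ChiMerge (not part of the original file). Note that chimerge() groups X by the target column, so y is passed as the target column name and must be present in X; the data path follows the demo notebooks.

# usage sketch, assuming the Titanic demo data used elsewhere in this repo
import pandas as pd
from feature_engineering.discretization import ChiMerge

data = pd.read_csv('./data/titanic.csv', usecols=['SibSp', 'Survived'])
enc = ChiMerge(col='SibSp', num_of_bins=4).fit(X=data, y='Survived')
data_binned = enc.transform(data)   # adds a 'SibSp_chimerge' interval column
print(enc.bins)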
# 2018.11.15 Created by Eamon.Zhang
class DiscretizeByDecisionTree():
    """
    Discretisation with Decision Trees consists of using a decision tree
    to identify the optimal splitting points that would determine the bins
    or contiguous intervals:

    1. train a decision tree of limited depth (2, 3 or 4) using the variable
       we want to discretise to predict the target.
    2. the original variable values are then replaced by the
       probability returned by the tree (written to a new column).

    Parameters
    ----------
    col: str
        column to discretise
    max_depth: int or list of int
        max depth of the tree. Can be an int, or a list of ints over which
        the optimal depth is searched by cross-validation.
    """

    def __init__(self, col=None, max_depth=None, tree_model=None):
        self.col = col
        self._dim = None
        self.max_depth = max_depth
        self.tree_model = tree_model

    def fit(self, X, y, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features.
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.
        """
        self._dim = X.shape[1]

        _, tree = self.discretize(
            X_in=X,
            y=y,
            max_depth=self.max_depth,
            col=self.col,
            tree_model=self.tree_model
        )
        self.tree_model = tree
        return self

    def transform(self, X):
        """Perform the transformation to new data.

        Will use the tree model and the column name to discretize the
        column.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        X : new dataframe with the discretized new column.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim))

        X, _ = self.discretize(
            X_in=X,
            col=self.col,
            tree_model=self.tree_model
        )
        return X

    def discretize(self, X_in, y=None, max_depth=None, tree_model=None, col=None):
        """
        Discretize a variable using a DecisionTreeClassifier.
        """
        X = X_in.copy(deep=True)

        if tree_model is not None:  # transform
            X[col + '_tree_discret'] = tree_model.predict_proba(X[col].to_frame())[:, 1]

        else:  # fit
            if isinstance(max_depth, int):
                tree_model = DecisionTreeClassifier(max_depth=max_depth)
                tree_model.fit(X[col].to_frame(), y)

            elif len(max_depth) > 1:
                score_ls = []      # here I will store the roc auc
                score_std_ls = []  # here I will store the standard deviation of the roc_auc
                for tree_depth in max_depth:
                    tree_model = DecisionTreeClassifier(max_depth=tree_depth)
                    scores = cross_val_score(tree_model, X[col].to_frame(), y, cv=3, scoring='roc_auc')
                    score_ls.append(np.mean(scores))
                    score_std_ls.append(np.std(scores))
                temp = pd.concat([pd.Series(max_depth), pd.Series(score_ls), pd.Series(score_std_ls)], axis=1)
                temp.columns = ['depth', 'roc_auc_mean', 'roc_auc_std']
                print('result ROC-AUC for each depth')
                print(temp)
                max_roc = temp.roc_auc_mean.max()
                # take the first depth that reaches the best mean ROC-AUC
                optimal_depth = temp[temp.roc_auc_mean == max_roc]['depth'].values[0]
                print('optimal_depth:', optimal_depth)
                tree_model = DecisionTreeClassifier(max_depth=optimal_depth)
                tree_model.fit(X[col].to_frame(), y)
            else:
                raise ValueError('max_depth of a tree must be an integer or a list')

        return X, tree_model
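A brief usage sketch for DiscretizeByDecisionTree (not part of the original file), assuming the Titanic demo data; passing a list of depths triggers the cross-validated depth search.

# usage sketch: search tree depths 2-4 for binning 'Fare' against 'Survived'
import pandas as pd
from feature_engineering.discretization import DiscretizeByDecisionTree

data = pd.read_csv('./data/titanic.csv', usecols=['Fare', 'Survived'])
enc = DiscretizeByDecisionTree(col='Fare', max_depth=[2, 3, 4]).fit(
    X=data[['Fare']], y=data['Survived'])
data_disc = enc.transform(data[['Fare']])   # adds 'Fare_tree_discret' (leaf probabilities)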
109
feature_engineering/encoding.py
Normal file
@@ -0,0 +1,109 @@
import pandas as pd


# 2018.11.28 Created by Eamon.Zhang

class MeanEncoding():
    """
    Replace each label of a categorical variable by the mean of the target
    for that label.

    Parameters
    ----------
    cols: list of str
        columns to encode
    """

    def __init__(self, mapping=None, cols=None):
        self.cols = cols
        self.mapping = mapping
        self._dim = None

    def fit(self, X, y=None, **kwargs):
        """Fit encoder according to X and y.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples
            and n_features is the number of features. Must contain the
            target column, since the mapping is computed from X[y.name].
        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : encoder
            Returns self.
        """
        self._dim = X.shape[1]

        _, categories = self.mean_encoding(
            X,
            y,
            mapping=self.mapping,
            cols=self.cols
        )
        self.mapping = categories
        return self

    def transform(self, X):
        """Perform the transformation to new categorical data.

        Will use the mapping (if available) and the column list to encode the
        data.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        X : Transformed values with encoding applied.
        """
        if self._dim is None:
            raise ValueError('Must train encoder before it can be used to transform data.')

        # make sure that it is the right size
        if X.shape[1] != self._dim:
            raise ValueError('Unexpected input dimension %d, expected %d' % (X.shape[1], self._dim))

        X, _ = self.mean_encoding(
            X,
            mapping=self.mapping,
            cols=self.cols
        )
        return X

    def mean_encoding(self, X_in, y=None, mapping=None, cols=None):
        """
        Replace the labels of the given columns by the mean of the target
        for each label.
        """
        X = X_in.copy(deep=True)

        if mapping is not None:  # transform
            mapping_out = mapping
            for i in mapping:
                column = i.get('col')  # get the column name
                X[column] = X[column].map(i['mapping'])
        else:  # fit
            mapping_out = []
            for col in cols:
                # mean of the target for each label of the column
                mapping = X[y.name].groupby(X[col]).mean().to_dict()
                mapping = pd.Series(mapping)
                mapping_out.append({'col': col, 'mapping': mapping, 'data_type': X[col].dtype})

        return X, mapping_out
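A brief usage sketch for MeanEncoding (not part of the original file). Because mean_encoding() looks up X[y.name], the target column must be present in the frame passed to fit().

# usage sketch on the Titanic demo data
import pandas as pd
from feature_engineering.encoding import MeanEncoding

data = pd.read_csv('./data/titanic.csv', usecols=['Sex', 'Pclass', 'Survived'])
enc = MeanEncoding(cols=['Sex', 'Pclass']).fit(data, data['Survived'])
data_encoded = enc.transform(data)   # each label replaced by its mean survival rate
print(enc.mapping)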
73
feature_engineering/transformation.py
Normal file
@@ -0,0 +1,73 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pylab


# 2018.11.26 Created by Eamon.Zhang
def diagnostic_plots(df, variable):
    # function to plot a histogram and a Q-Q plot
    # side by side, for a certain variable

    plt.figure(figsize=(15, 6))
    plt.subplot(1, 2, 1)
    df[variable].hist()

    plt.subplot(1, 2, 2)
    stats.probplot(df[variable], dist="norm", plot=pylab)

    plt.show()


def log_transform(data, cols=[]):
    """
    Logarithmic transformation: log(x + 1), so zero values are allowed.
    """
    data_copy = data.copy(deep=True)
    for i in cols:
        data_copy[i + '_log'] = np.log(data_copy[i] + 1)
        print('Variable ' + i + ' Q-Q plot')
        diagnostic_plots(data_copy, str(i + '_log'))
    return data_copy


def reciprocal_transform(data, cols=[]):
    """
    Reciprocal transformation: 1/x (undefined for zero values).
    """
    data_copy = data.copy(deep=True)
    for i in cols:
        data_copy[i + '_reciprocal'] = 1 / (data_copy[i])
        print('Variable ' + i + ' Q-Q plot')
        diagnostic_plots(data_copy, str(i + '_reciprocal'))
    return data_copy


def square_root_transform(data, cols=[]):
    """
    Square root transformation
    """
    data_copy = data.copy(deep=True)
    for i in cols:
        data_copy[i + '_square_root'] = (data_copy[i]) ** (0.5)
        print('Variable ' + i + ' Q-Q plot')
        diagnostic_plots(data_copy, str(i + '_square_root'))
    return data_copy


def exp_transform(data, coef, cols=[]):
    """
    Exponential/power transformation: x ** coef
    """
    data_copy = data.copy(deep=True)
    for i in cols:
        data_copy[i + '_exp'] = (data_copy[i]) ** coef
        print('Variable ' + i + ' Q-Q plot')
        diagnostic_plots(data_copy, str(i + '_exp'))
    return data_copy
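A brief usage sketch for these transformations (not part of the original file), applied to the right-skewed 'Fare' variable; the reciprocal transform is skipped here because 'Fare' contains zeros.

# usage sketch on the Titanic demo data
import pandas as pd
from feature_engineering import transformation as tr

data = pd.read_csv('./data/titanic.csv', usecols=['Fare'])
data = tr.log_transform(data, cols=['Fare'])             # adds 'Fare_log' and shows a Q-Q plot
data = tr.square_root_transform(data, cols=['Fare'])     # adds 'Fare_square_root'
data = tr.exp_transform(data, coef=0.3, cols=['Fare'])   # power transform x**0.3, adds 'Fare_exp'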
76
feature_selection/embedded_method.py
Normal file
@@ -0,0 +1,76 @@
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier


# 2018.11.27 Created by Eamon.Zhang

def rf_importance(X_train, y_train, max_depth=10, class_weight=None, top_n=15, n_estimators=50, random_state=0):

    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                   random_state=random_state, class_weight=class_weight,
                                   n_jobs=-1)
    model.fit(X_train, y_train)
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    feat_labels = X_train.columns
    # inter-tree variability of the importance scores
    std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)

    print("Feature ranking:")
    for f in range(X_train.shape[1]):
        print("%d. feature no:%d feature name:%s (%f)" % (
            f + 1, indices[f], feat_labels[indices[f]], importances[indices[f]]))

    # plotting the top_n most important features
    indices = indices[0:top_n]
    plt.figure()
    plt.title("Feature importances top %d" % top_n)
    plt.bar(range(top_n), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(top_n), indices)
    plt.xlim([-1, top_n])
    plt.show()

    return model


def gbt_importance(X_train, y_train, max_depth=10, top_n=15, n_estimators=50, random_state=0):

    model = GradientBoostingClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                       random_state=random_state)
    model.fit(X_train, y_train)
    importances = model.feature_importances_
    indices = np.argsort(importances)[::-1]
    feat_labels = X_train.columns
    # inter-tree variability of the importance scores
    std = np.std([tree[0].feature_importances_ for tree in model.estimators_], axis=0)

    print("Feature ranking:")
    for f in range(X_train.shape[1]):
        print("%d. feature no:%d feature name:%s (%f)" % (
            f + 1, indices[f], feat_labels[indices[f]], importances[indices[f]]))

    # plotting the top_n most important features
    indices = indices[0:top_n]
    plt.figure()
    plt.title("Feature importances top %d" % top_n)
    plt.bar(range(top_n), importances[indices],
            color="r", yerr=std[indices], align="center")
    plt.xticks(range(top_n), indices)
    plt.xlim([-1, top_n])
    plt.show()

    return model
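A brief usage sketch for the importance rankings above (not part of the original file); top_n is set to the number of available features so the bar plot stays consistent.

# usage sketch on a small numeric subset of the Titanic demo data
import pandas as pd
from sklearn.model_selection import train_test_split
from feature_selection import embedded_method as em

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'SibSp', 'Fare', 'Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Survived', axis=1), data['Survived'], test_size=0.3, random_state=0)
rf_model = em.rf_importance(X_train, y_train, max_depth=4, top_n=3)
gbt_model = em.gbt_importance(X_train, y_train, max_depth=3, top_n=3)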
43
feature_selection/feature_shuffle.py
Normal file
@@ -0,0 +1,43 @@
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


# 2018.11.28 Created by Eamon.Zhang

def feature_shuffle_rf(X_train, y_train, max_depth=None, class_weight=None, top_n=15, n_estimators=50, random_state=0):

    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                   random_state=random_state, class_weight=class_weight,
                                   n_jobs=-1)
    model.fit(X_train, y_train)
    train_auc = roc_auc_score(y_train, (model.predict_proba(X_train))[:, 1])
    feature_dict = {}

    # selection logic: shuffle one feature at a time and record the drop in ROC-AUC
    for feature in X_train.columns:
        X_train_c = X_train.copy().reset_index(drop=True)
        y_train_c = y_train.copy().reset_index(drop=True)

        # shuffle individual feature
        X_train_c[feature] = X_train_c[feature].sample(frac=1, random_state=random_state).reset_index(drop=True)

        # make prediction with the shuffled feature and calculate roc-auc
        shuff_auc = roc_auc_score(y_train_c, (model.predict_proba(X_train_c))[:, 1])

        # save the drop in roc-auc
        feature_dict[feature] = (train_auc - shuff_auc)

    auc_drop = pd.Series(feature_dict).reset_index()
    auc_drop.columns = ['feature', 'auc_drop']
    auc_drop.sort_values(by=['auc_drop'], ascending=False, inplace=True)
    selected_features = auc_drop[auc_drop.auc_drop > 0]['feature']

    return auc_drop, selected_features
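A brief usage sketch for feature_shuffle_rf (not part of the original file), using a small numeric subset of the Titanic demo data.

import pandas as pd
from sklearn.model_selection import train_test_split
from feature_selection.feature_shuffle import feature_shuffle_rf

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'SibSp', 'Fare', 'Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Survived', axis=1), data['Survived'], test_size=0.3, random_state=0)
auc_drop, selected = feature_shuffle_rf(X_train, y_train, max_depth=4)
print(auc_drop)    # per-feature drop in train ROC-AUC after shuffling
print(selected)    # features whose shuffling hurts performance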
156
feature_selection/filter_method.py
Normal file
@@ -0,0 +1,156 @@
import pandas as pd
import numpy as np
from sklearn.feature_selection import mutual_info_classif, chi2
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error


# 2018.11.17 Created by Eamon.Zhang

def constant_feature_detect(data, threshold=0.98):
    """ detect features that show the same value for the
    majority/all of the observations (constant/quasi-constant features)

    Parameters
    ----------
    data : pd.Dataframe
    threshold : threshold to identify the variable as constant

    Returns
    -------
    list of variable names
    """
    data_copy = data.copy(deep=True)
    quasi_constant_feature = []
    for feature in data_copy.columns:
        # proportion of the most frequent value of the feature
        predominant = (data_copy[feature].value_counts() / float(
            len(data_copy))).sort_values(ascending=False).values[0]
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature), ' variables are found to be almost constant')
    return quasi_constant_feature


def corr_feature_detect(data, threshold=0.8):
    """ detect highly-correlated features of a Dataframe

    Parameters
    ----------
    data : pd.Dataframe
    threshold : threshold to identify a pair of variables as correlated

    Returns
    -------
    groups (DataFrames) of correlated variables
    """
    corrmat = data.corr()
    corrmat = corrmat.abs().unstack()  # absolute value of corr coef
    corrmat = corrmat.sort_values(ascending=False)
    corrmat = corrmat[corrmat >= threshold]
    corrmat = corrmat[corrmat < 1]  # remove the diagonal
    corrmat = pd.DataFrame(corrmat).reset_index()
    corrmat.columns = ['feature1', 'feature2', 'corr']

    grouped_feature_ls = []
    correlated_groups = []

    for feature in corrmat.feature1.unique():
        if feature not in grouped_feature_ls:

            # find all features correlated to a single feature
            correlated_block = corrmat[corrmat.feature1 == feature]
            grouped_feature_ls = grouped_feature_ls + list(
                correlated_block.feature2.unique()) + [feature]

            # append the block of features to the list
            correlated_groups.append(correlated_block)

    return correlated_groups
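A brief usage sketch for the two detectors above (not part of the original file); on this small numeric subset the returned lists may well be empty.

import pandas as pd
from feature_selection import filter_method as ft

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'SibSp', 'Fare', 'Survived'])
quasi_constant = ft.constant_feature_detect(data, threshold=0.98)
corr_groups = ft.corr_feature_detect(data.drop('Survived', axis=1), threshold=0.6)
for group in corr_groups:
    print(group)   # each group is a small DataFrame of correlated feature pairs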
def mutual_info(X, y, select_k=10):
    """
    Select features according to the mutual information between each feature
    and the target. select_k >= 1 keeps the top k features; 0 < select_k < 1
    keeps the top fraction of features.
    """
    if select_k >= 1:
        sel_ = SelectKBest(mutual_info_classif, k=select_k).fit(X, y)
        col = X.columns[sel_.get_support()]
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(mutual_info_classif, percentile=select_k * 100).fit(X, y)
        col = X.columns[sel_.get_support()]
    else:
        raise ValueError("select_k must be a positive number")

    return col


# 2018.11.27 edit Chi-square test
def chi_square_test(X, y, select_k=10):
    """
    Compute chi-squared stats between each non-negative feature and class.
    This score should be used to evaluate categorical variables in a classification task.
    """
    if select_k >= 1:
        sel_ = SelectKBest(chi2, k=select_k).fit(X, y)
        col = X.columns[sel_.get_support()]
    elif 0 < select_k < 1:
        sel_ = SelectPercentile(chi2, percentile=select_k * 100).fit(X, y)
        col = X.columns[sel_.get_support()]
    else:
        raise ValueError("select_k must be a positive number")

    return col


def univariate_roc_auc(X_train, y_train, X_test, y_test, threshold):
    """
    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and the mentioned feature.
    Third, it ranks the features according to the machine learning metric (roc-auc).
    It selects the highest ranked features.
    """
    roc_values = []
    for feature in X_train.columns:
        clf = DecisionTreeClassifier()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict_proba(X_test[feature].to_frame())
        roc_values.append(roc_auc_score(y_test, y_scored[:, 1]))
    roc_values = pd.Series(roc_values)
    roc_values.index = X_train.columns
    print(roc_values.sort_values(ascending=False))
    print(len(roc_values[roc_values > threshold]), 'out of the %s features are kept' % len(X_train.columns))
    keep_col = roc_values[roc_values > threshold]
    return keep_col


def univariate_mse(X_train, y_train, X_test, y_test, threshold):
    """
    First, it builds one decision tree per feature, to predict the target.
    Second, it makes predictions using the decision tree and the mentioned feature.
    Third, it ranks the features according to the machine learning metric (mse).
    It keeps the features whose univariate MSE passes the given threshold.
    """
    mse_values = []
    for feature in X_train.columns:
        clf = DecisionTreeRegressor()
        clf.fit(X_train[feature].to_frame(), y_train)
        y_scored = clf.predict(X_test[feature].to_frame())
        mse_values.append(mean_squared_error(y_test, y_scored))
    mse_values = pd.Series(mse_values)
    mse_values.index = X_train.columns
    print(mse_values.sort_values(ascending=False))
    print(len(mse_values[mse_values > threshold]), 'out of the %s features are kept' % len(X_train.columns))
    keep_col = mse_values[mse_values > threshold]
    return keep_col
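A brief usage sketch for the univariate filters above (not part of the original file); chi2 requires non-negative features, which holds for this subset.

import pandas as pd
from sklearn.model_selection import train_test_split
from feature_selection import filter_method as ft

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'SibSp', 'Fare', 'Survived'])
X = data.drop('Survived', axis=1)
y = data['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

mi_cols = ft.mutual_info(X_train, y_train, select_k=2)        # top-2 by mutual information
chi_cols = ft.chi_square_test(X_train, y_train, select_k=2)   # top-2 by chi-square score
roc_keep = ft.univariate_roc_auc(X_train, y_train, X_test, y_test, threshold=0.5)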
128
feature_selection/hybrid.py
Normal file
@@ -0,0 +1,128 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score


# 2018.12.02 Created by Eamon.Zhang

def recursive_feature_elimination_rf(X_train, y_train, X_test, y_test,
                                     tol=0.001, max_depth=None,
                                     class_weight=None,
                                     top_n=15, n_estimators=50, random_state=0):

    features_to_remove = []
    count = 1
    # initial model using all the features
    model_all_features = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                                random_state=random_state, class_weight=class_weight,
                                                n_jobs=-1)
    model_all_features.fit(X_train, y_train)
    y_pred_test = model_all_features.predict_proba(X_test)[:, 1]
    auc_score_all = roc_auc_score(y_test, y_pred_test)

    for feature in X_train.columns:
        print()
        print('testing feature: ', feature, ' which is feature ', count,
              ' out of ', len(X_train.columns))
        count += 1
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                       random_state=random_state, class_weight=class_weight,
                                       n_jobs=-1)

        # fit model with all variables minus the already removed features
        # and minus the feature to be evaluated
        model.fit(X_train.drop(features_to_remove + [feature], axis=1), y_train)
        y_pred_test = model.predict_proba(
            X_test.drop(features_to_remove + [feature], axis=1))[:, 1]
        auc_score_int = roc_auc_score(y_test, y_pred_test)
        print('New Test ROC AUC={}'.format(auc_score_int))

        # print the roc-auc of the current reference model
        print('All features Test ROC AUC={}'.format(auc_score_all))

        # determine the drop in the roc-auc
        diff_auc = auc_score_all - auc_score_int

        # compare the drop in roc-auc with the tolerance
        if diff_auc >= tol:
            print('Drop in ROC AUC={}'.format(diff_auc))
            print('keep: ', feature)
        else:
            print('Drop in ROC AUC={}'.format(diff_auc))
            print('remove: ', feature)

            # if the drop in the roc is small and we remove the
            # feature, we need to set the new roc to the one based on
            # the remaining features
            auc_score_all = auc_score_int

            # and append the feature to remove to the list
            features_to_remove.append(feature)

    print('DONE!!')
    print('total features to remove: ', len(features_to_remove))
    features_to_keep = [x for x in X_train.columns if x not in features_to_remove]
    print('total features to keep: ', len(features_to_keep))

    return features_to_keep


def recursive_feature_addition_rf(X_train, y_train, X_test, y_test,
                                  tol=0.001, max_depth=None,
                                  class_weight=None,
                                  top_n=15, n_estimators=50, random_state=0):

    features_to_keep = [X_train.columns[0]]
    count = 1
    # initial model using only the first feature
    model_one_feature = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                               random_state=random_state, class_weight=class_weight,
                                               n_jobs=-1)
    model_one_feature.fit(X_train[[X_train.columns[0]]], y_train)
    y_pred_test = model_one_feature.predict_proba(X_test[[X_train.columns[0]]])[:, 1]
    auc_score_all = roc_auc_score(y_test, y_pred_test)

    for feature in X_train.columns[1:]:
        print()
        print('testing feature: ', feature, ' which is feature ', count,
              ' out of ', len(X_train.columns))
        count += 1
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth,
                                       random_state=random_state, class_weight=class_weight,
                                       n_jobs=-1)

        # fit model with the already selected features
        # plus the feature to be evaluated
        model.fit(X_train[features_to_keep + [feature]], y_train)
        y_pred_test = model.predict_proba(
            X_test[features_to_keep + [feature]])[:, 1]
        auc_score_int = roc_auc_score(y_test, y_pred_test)
        print('New Test ROC AUC={}'.format(auc_score_int))

        # print the roc-auc of the current reference model
        print('All features Test ROC AUC={}'.format(auc_score_all))

        # determine the increase in the roc-auc
        diff_auc = auc_score_int - auc_score_all

        # compare the increase in roc-auc with the tolerance
        if diff_auc >= tol:
            # if the increase in the roc is bigger than the threshold
            # we keep the feature and re-adjust the roc-auc to the new value
            # considering the added feature
            print('Increase in ROC AUC={}'.format(diff_auc))
            print('keep: ', feature)
            auc_score_all = auc_score_int
            features_to_keep.append(feature)
        else:
            print('Increase in ROC AUC={}'.format(diff_auc))
            print('remove: ', feature)

    print('DONE!!')
    print('total features to keep: ', len(features_to_keep))

    return features_to_keep
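A brief usage sketch for the two recursive procedures above (not part of the original file), run on a small numeric subset of the Titanic demo data.

import pandas as pd
from sklearn.model_selection import train_test_split
from feature_selection import hybrid

data = pd.read_csv('./data/titanic.csv', usecols=['Pclass', 'SibSp', 'Fare', 'Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    data.drop('Survived', axis=1), data['Survived'], test_size=0.3, random_state=0)

kept_backward = hybrid.recursive_feature_elimination_rf(
    X_train, y_train, X_test, y_test, tol=0.001, max_depth=4)
kept_forward = hybrid.recursive_feature_addition_rf(
    X_train, y_train, X_test, y_test, tol=0.001, max_depth=4)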
BIN
images/001.png
Normal file
After Width: | Height: | Size: 112 KiB |
BIN
images/IV.png
Normal file
After Width: | Height: | Size: 23 KiB |
BIN
images/box-cox.png
Normal file
After Width: | Height: | Size: 2.1 KiB |
BIN
images/embedded.png
Normal file
After Width: | Height: | Size: 26 KiB |
BIN
images/featuretools.png
Normal file
After Width: | Height: | Size: 9.8 KiB |
BIN
images/filter.png
Normal file
After Width: | Height: | Size: 19 KiB |
BIN
images/scaling.png
Normal file
After Width: | Height: | Size: 143 KiB |
BIN
images/sphx_glr_plot_map_data_to_normal_001.png
Normal file
After Width: | Height: | Size: 35 KiB |
BIN
images/workflow2.png
Normal file
After Width: | Height: | Size: 20 KiB |
BIN
images/wrapper.png
Normal file
After Width: | Height: | Size: 20 KiB |
BIN
output/Barplot_Pclass_Survived.png
Normal file
After Width: | Height: | Size: 10 KiB |
BIN
output/Boxplot_Pclass_Fare.png
Normal file
After Width: | Height: | Size: 11 KiB |
BIN
output/Corr_plot.png
Normal file
After Width: | Height: | Size: 26 KiB |
BIN
output/Countplot_Pclass.png
Normal file
After Width: | Height: | Size: 9.5 KiB |
BIN
output/Distplot_Fare.png
Normal file
After Width: | Height: | Size: 10 KiB |
BIN
output/Heatmap.png
Normal file
After Width: | Height: | Size: 76 KiB |
BIN
output/Scatter_plot_Fare_Pclass.png
Normal file
After Width: | Height: | Size: 16 KiB |
12
output/describe.csv
Normal file
@@ -0,0 +1,12 @@
,Survived,Pclass,Sex,Age,SibSp,Fare
count,891.0,891.0,891,714.0,891.0,891.0
unique,,,2,,,
top,,,male,,,
freq,,,577,,,
mean,0.3838383838383838,2.308641975308642,,29.69911764705882,0.5230078563411896,32.2042079685746
std,0.4865924542648585,0.8360712409770513,,14.526497332334044,1.1027434322934275,49.693428597180905
min,0.0,1.0,,0.42,0.0,0.0
25%,0.0,2.0,,20.125,0.0,7.9104
50%,0.0,3.0,,28.0,0.0,14.4542
75%,1.0,3.0,,38.0,1.0,31.0
max,1.0,3.0,,80.0,8.0,512.3292
7
output/missing.csv
Normal file
@@ -0,0 +1,7 @@
,total missing,proportion
Survived,0,0.0
Pclass,0,0.0
Sex,0,0.0
Age,177,0.19865319865319866
SibSp,0,0.0
Fare,0,0.0