{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"# import seaborn as sns\n",
"# import matplotlib.pyplot as plt\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import roc_curve, roc_auc_score\n",
"\n",
"# plt.style.use('seaborn-colorblind')\n",
"# %matplotlib inline\n",
"#from feature_cleaning import rare_values as ra"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"use_cols = [\n",
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
" 'Survived'\n",
"]\n",
"\n",
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Fare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>22.0</td>\n",
" <td>1</td>\n",
" <td>7.2500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>38.0</td>\n",
" <td>1</td>\n",
" <td>71.2833</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>7.9250</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Sex Age SibSp Fare\n",
"0 0 3 male 22.0 1 7.2500\n",
"1 1 1 female 38.0 1 71.2833\n",
"2 1 3 female 26.0 0 7.9250"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((623, 6), (268, 6))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Note that we include target variable in the X_train \n",
"# because we need it to supervise our discretization\n",
"# this is not the standard way of using train-test-split\n",
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
" random_state=0)\n",
"X_train.shape, X_test.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Polynomial Expansion\n",
"\n",
"generate a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Pclass SibSp Pclass^2 Pclass SibSp SibSp^2\n",
"0 1.0 0.0 1.0 0.0 0.0\n",
"1 1.0 1.0 1.0 1.0 1.0\n",
"2 3.0 5.0 9.0 15.0 25.0\n",
"3 1.0 0.0 1.0 0.0 0.0\n",
"4 3.0 1.0 9.0 3.0 1.0\n",
"5 2.0 1.0 4.0 2.0 1.0\n"
]
}
],
"source": [
"# create polynomial combinations of feature 'Pclass','SibSp' with degree 2\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"pf = PolynomialFeatures(degree=2,include_bias=False).fit(X_train[['Pclass','SibSp']])\n",
"tmp = pf.transform(X_train[['Pclass','SibSp']])\n",
"X_train_copy = pd.DataFrame(tmp,columns=pf.get_feature_names(['Pclass','SibSp']))\n",
"print(X_train_copy.head(6))"
]
},
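{
"cell_type": "markdown",
"metadata": {},
"source": [
"If only the cross terms are wanted (dropping pure powers such as `Pclass^2`), `PolynomialFeatures` also accepts `interaction_only=True`. A minimal sketch; the names `pf_int` and `tmp_int` are ours, not part of the original demo:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# keep only interaction terms: Pclass, SibSp and Pclass*SibSp (no squared terms)\n",
"pf_int = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False).fit(X_train[['Pclass','SibSp']])\n",
"tmp_int = pf_int.transform(X_train[['Pclass','SibSp']])\n",
"print(pd.DataFrame(tmp_int, columns=pf_int.get_feature_names(['Pclass','SibSp'])).head())"
]
},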
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"GBDT derived feature + LR"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sample's belonging node of each base tree \n",
"' [[ 7. 7. 6. ... 4. 7. 4.]\n",
" [ 7. 7. 6. ... 14. 7. 7.]\n",
" [11. 11. 11. ... 4. 6. 11.]\n",
" ...\n",
" [10. 10. 10. ... 4. 6. 10.]\n",
" [13. 14. 13. ... 4. 7. 13.]\n",
" [ 7. 7. 6. ... 6. 7. 7.]]\n",
"AUC for GBDT derived feature + LR 0.7746130952380953\n"
]
}
],
"source": [
"from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"gbdt = GradientBoostingClassifier(n_estimators=20)\n",
"one_hot = OneHotEncoder()\n",
"\n",
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"\n",
"gbdt.fit(X_train, y_train)\n",
"\n",
"X_leaf_index = gbdt.apply(X_train)[:, :, 0] # apply return the node index on each tree \n",
"print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
"# fit one-hot encoder\n",
"one_hot.fit(X_leaf_index) \n",
"X_one_hot = one_hot.transform(X_leaf_index) \n",
"\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"lr.fit(X_one_hot,y_train)\n",
"y_pred = lr.predict_proba(\n",
" one_hot.transform(gbdt.apply(X_test)[:, :, 0]))[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for GBDT derived feature + LR\", roc_auc_score(y_test, y_pred))\n"
]
},
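{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (our addition, not in the original demo): the derived feature space has one one-hot column per distinct leaf across the 20 trees, so it is far wider than the 4 raw features."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# width of the one-hot matrix = total number of distinct leaves over all trees\n",
"print('derived one-hot feature shape:', X_one_hot.shape)\n",
"print('distinct leaves in the first 5 trees:',\n",
"      [len(np.unique(X_leaf_index[:, i])) for i in range(5)])"
]
},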
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"RandomForest derived feature + LR"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sample's belonging node of each base tree \n",
"' [[212 35 79 ... 146 60 46]\n",
" [307 165 266 ... 136 132 44]\n",
" [285 285 320 ... 301 294 300]\n",
" ...\n",
" [ 13 177 133 ... 186 169 117]\n",
" [190 296 311 ... 282 289 297]\n",
" [264 165 243 ... 152 110 314]]\n",
"AUC for RandomForest derived feature + LR 0.759672619047619\n"
]
}
],
"source": [
"rf = RandomForestClassifier(n_estimators=20)\n",
"one_hot = OneHotEncoder()\n",
"\n",
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"\n",
"rf.fit(X_train, y_train)\n",
"\n",
"X_leaf_index = rf.apply(X_train) # apply return the node index on each tree \n",
"print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
"# fit one-hot encoder\n",
"one_hot.fit(X_leaf_index) \n",
"X_one_hot = one_hot.transform(X_leaf_index) \n",
"\n",
"\n",
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"lr.fit(X_one_hot,y_train)\n",
"y_pred = lr.predict_proba(\n",
" one_hot.transform(rf.apply(X_test)))[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for RandomForest derived feature + LR\", roc_auc_score(y_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Feature Learning by Trees\n",
"GBDT derived feature + Raw feature +LR"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC for GBDT derived feature + Raw feature +LR 0.7603571428571428\n"
]
}
],
"source": [
"from scipy.sparse import hstack\n",
"\n",
"X_train_ext = hstack([one_hot.transform(gbdt.apply(X_train)[:, :, 0]), X_train])\n",
"X_test_ext = hstack([one_hot.transform(gbdt.apply(X_test)[:, :, 0]), X_test])\n",
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"lr.fit(X_train_ext,y_train)\n",
"y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for GBDT derived feature + Raw feature +LR\", roc_auc_score(y_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"RandomForest derived feature + Raw feature +LR"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC for RandomForest derived feature + Raw feature + LR 0.76\n"
]
}
],
"source": [
"X_train_ext = hstack([one_hot.transform(rf.apply(X_train)), X_train])\n",
"X_test_ext = hstack([one_hot.transform(rf.apply(X_test)), X_test])\n",
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"lr.fit(X_train_ext,y_train)\n",
"y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for RandomForest derived feature + Raw feature + LR\", roc_auc_score(y_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"Use only Raw Feature + LR"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC for RandomForest derived feature + LR 0.6988690476190476\n"
]
}
],
"source": [
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"lr.fit(X_train,y_train)\n",
"y_pred = lr.predict_proba(X_test)[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for RandomForest derived feature + LR\", roc_auc_score(y_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"\n",
"Use only Raw Feature + GBDT"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC for Raw feature + GBDT 0.7613988095238096\n"
]
}
],
"source": [
"gbdt = GradientBoostingClassifier(n_estimators=20)\n",
"\n",
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"\n",
"gbdt.fit(X_train, y_train)\n",
"y_pred = gbdt.predict_proba(X_test)[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for Raw feature + GBDT\", roc_auc_score(y_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"\n",
"Use only Raw Feature + RF\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC for Raw feature + RF 0.7235119047619047\n"
]
}
],
"source": [
"rf = RandomForestClassifier(n_estimators=20)\n",
"\n",
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"\n",
"rf.fit(X_train, y_train)\n",
"y_pred = rf.predict_proba(X_test)[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for Raw feature + RF\", roc_auc_score(y_test, y_pred))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Without tuning, we can see GBDT derived feature + LR get the best result"
]
},
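{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a wrap-up, a minimal sketch that packages the best-performing recipe (GBDT leaf indices -> one-hot -> LR) into one reusable function. The helper name `tree_feature_lr` is ours, not part of the original notebook:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tree_feature_lr(X_tr, y_tr, X_te, n_estimators=20):\n",
"    \"\"\"Fit a GBDT, one-hot encode its leaf indices, then fit LR on them.\"\"\"\n",
"    gbdt = GradientBoostingClassifier(n_estimators=n_estimators).fit(X_tr, y_tr)\n",
"    enc = OneHotEncoder(categories='auto').fit(gbdt.apply(X_tr)[:, :, 0])\n",
"    lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"    lr.fit(enc.transform(gbdt.apply(X_tr)[:, :, 0]), y_tr)\n",
"    # score the held-out set with the same encoder and trees\n",
"    return lr.predict_proba(enc.transform(gbdt.apply(X_te)[:, :, 0]))[:, 1]\n",
"\n",
"# usage: y_score = tree_feature_lr(X_train, y_train, X_test)\n",
"# print('AUC:', roc_auc_score(y_test, y_score))"
]
},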
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}