{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"# import seaborn as sns\n",
"# import matplotlib.pyplot as plt\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import roc_curve, roc_auc_score\n",
"\n",
"# plt.style.use('seaborn-colorblind')\n",
"# %matplotlib inline\n",
"#from feature_cleaning import rare_values as ra"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"use_cols = [\n",
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
" 'Survived'\n",
"]\n",
"\n",
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Survived</th>\n",
" <th>Pclass</th>\n",
" <th>Sex</th>\n",
" <th>Age</th>\n",
" <th>SibSp</th>\n",
" <th>Fare</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>male</td>\n",
" <td>22.0</td>\n",
" <td>1</td>\n",
" <td>7.2500</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>female</td>\n",
" <td>38.0</td>\n",
" <td>1</td>\n",
" <td>71.2833</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>female</td>\n",
" <td>26.0</td>\n",
" <td>0</td>\n",
" <td>7.9250</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" Survived Pclass Sex Age SibSp Fare\n",
"0 0 3 male 22.0 1 7.2500\n",
"1 1 1 female 38.0 1 71.2833\n",
"2 1 3 female 26.0 0 7.9250"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head(3)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((623, 6), (268, 6))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Note that we include target variable in the X_train \n",
"# because we need it to supervise our discretization\n",
"# this is not the standard way of using train-test-split\n",
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
" random_state=0)\n",
"X_train.shape, X_test.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Polynomial Expansion\n",
"\n",
"generate a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Pclass SibSp Pclass^2 Pclass SibSp SibSp^2\n",
"0 1.0 0.0 1.0 0.0 0.0\n",
"1 1.0 1.0 1.0 1.0 1.0\n",
"2 3.0 5.0 9.0 15.0 25.0\n",
"3 1.0 0.0 1.0 0.0 0.0\n",
"4 3.0 1.0 9.0 3.0 1.0\n",
"5 2.0 1.0 4.0 2.0 1.0\n"
]
}
],
"source": [
"# create polynomial combinations of feature 'Pclass','SibSp' with degree 2\n",
"from sklearn.preprocessing import PolynomialFeatures\n",
"pf = PolynomialFeatures(degree=2,include_bias=False).fit(X_train[['Pclass','SibSp']])\n",
"tmp = pf.transform(X_train[['Pclass','SibSp']])\n",
"X_train_copy = pd.DataFrame(tmp,columns=pf.get_feature_names(['Pclass','SibSp']))\n",
"print(X_train_copy.head(6))"
]
},
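{
"cell_type": "markdown",
"metadata": {},
"source": [
"If only the cross terms are wanted (dropping pure powers such as `Pclass^2`), `PolynomialFeatures` also accepts `interaction_only=True`. A minimal sketch; the names `pf_int` and `tmp_int` are ours, not part of the original demo:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# keep only interaction terms: Pclass, SibSp and Pclass*SibSp (no squared terms)\n",
"pf_int = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False).fit(X_train[['Pclass','SibSp']])\n",
"tmp_int = pf_int.transform(X_train[['Pclass','SibSp']])\n",
"print(pd.DataFrame(tmp_int, columns=pf_int.get_feature_names(['Pclass','SibSp'])).head())"
]
},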
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"GBDT derived feature + LR"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sample's belonging node of each base tree \n",
"' [[ 7. 7. 6. ... 4. 7. 4.]\n",
" [ 7. 7. 6. ... 14. 7. 7.]\n",
" [11. 11. 11. ... 4. 6. 11.]\n",
" ...\n",
" [10. 10. 10. ... 4. 6. 10.]\n",
" [13. 14. 13. ... 4. 7. 13.]\n",
" [ 7. 7. 6. ... 6. 7. 7.]]\n",
"AUC for GBDT derived feature + LR 0.7746130952380953\n"
]
}
],
"source": [
"from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n",
"from sklearn.preprocessing import OneHotEncoder\n",
"\n",
"gbdt = GradientBoostingClassifier(n_estimators=20)\n",
"one_hot = OneHotEncoder()\n",
"\n",
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"\n",
"gbdt.fit(X_train, y_train)\n",
"\n",
"X_leaf_index = gbdt.apply(X_train)[:, :, 0] # apply return the node index on each tree \n",
"print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
"# fit one-hot encoder\n",
"one_hot.fit(X_leaf_index) \n",
"X_one_hot = one_hot.transform(X_leaf_index) \n",
"\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"lr.fit(X_one_hot,y_train)\n",
"y_pred = lr.predict_proba(\n",
" one_hot.transform(gbdt.apply(X_test)[:, :, 0]))[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for GBDT derived feature + LR\", roc_auc_score(y_test, y_pred))\n"
]
},
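{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (our addition, not in the original demo): the derived feature space has one one-hot column per distinct leaf across the 20 trees, so it is far wider than the 4 raw features."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# width of the one-hot matrix = total number of distinct leaves over all trees\n",
"print('derived one-hot feature shape:', X_one_hot.shape)\n",
"print('distinct leaves in the first 5 trees:',\n",
"      [len(np.unique(X_leaf_index[:, i])) for i in range(5)])"
]
},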
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"RandomForest derived feature + LR"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"sample's belonging node of each base tree \n",
"' [[212 35 79 ... 146 60 46]\n",
" [307 165 266 ... 136 132 44]\n",
" [285 285 320 ... 301 294 300]\n",
" ...\n",
" [ 13 177 133 ... 186 169 117]\n",
" [190 296 311 ... 282 289 297]\n",
" [264 165 243 ... 152 110 314]]\n",
"AUC for RandomForest derived feature + LR 0.759672619047619\n"
]
}
],
"source": [
"rf = RandomForestClassifier(n_estimators=20)\n",
"one_hot = OneHotEncoder()\n",
"\n",
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"\n",
"rf.fit(X_train, y_train)\n",
"\n",
"X_leaf_index = rf.apply(X_train) # apply return the node index on each tree \n",
"print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
"# fit one-hot encoder\n",
"one_hot.fit(X_leaf_index) \n",
"X_one_hot = one_hot.transform(X_leaf_index) \n",
"\n",
"\n",
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"lr.fit(X_one_hot,y_train)\n",
"y_pred = lr.predict_proba(\n",
" one_hot.transform(rf.apply(X_test)))[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for RandomForest derived feature + LR\", roc_auc_score(y_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"## Feature Learning by Trees\n",
"GBDT derived feature + Raw feature +LR"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC for GBDT derived feature + Raw feature +LR 0.7603571428571428\n"
]
}
],
"source": [
"from scipy.sparse import hstack\n",
"\n",
"X_train_ext = hstack([one_hot.transform(gbdt.apply(X_train)[:, :, 0]), X_train])\n",
"X_test_ext = hstack([one_hot.transform(gbdt.apply(X_test)[:, :, 0]), X_test])\n",
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"lr.fit(X_train_ext,y_train)\n",
"y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for GBDT derived feature + Raw feature +LR\", roc_auc_score(y_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"RandomForest derived feature + Raw feature +LR"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC for RandomForest derived feature + Raw feature + LR 0.76\n"
]
}
],
"source": [
"X_train_ext = hstack([one_hot.transform(rf.apply(X_train)), X_train])\n",
"X_test_ext = hstack([one_hot.transform(rf.apply(X_test)), X_test])\n",
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"lr.fit(X_train_ext,y_train)\n",
"y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for RandomForest derived feature + Raw feature + LR\", roc_auc_score(y_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"Use only Raw Feature + LR"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC for RandomForest derived feature + LR 0.6988690476190476\n"
]
}
],
"source": [
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"lr.fit(X_train,y_train)\n",
"y_pred = lr.predict_proba(X_test)[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for RandomForest derived feature + LR\", roc_auc_score(y_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"\n",
"Use only Raw Feature + GBDT"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC for Raw feature + GBDT 0.7613988095238096\n"
]
}
],
"source": [
"gbdt = GradientBoostingClassifier(n_estimators=20)\n",
"\n",
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"\n",
"gbdt.fit(X_train, y_train)\n",
"y_pred = gbdt.predict_proba(X_test)[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for Raw feature + GBDT\", roc_auc_score(y_test, y_pred))\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Learning by Trees\n",
"\n",
"Use only Raw Feature + RF\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"AUC for Raw feature + RF 0.7235119047619047\n"
]
}
],
"source": [
"rf = RandomForestClassifier(n_estimators=20)\n",
"\n",
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
"\n",
"rf.fit(X_train, y_train)\n",
"y_pred = rf.predict_proba(X_test)[:,1]\n",
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
"print(\"AUC for Raw feature + RF\", roc_auc_score(y_test, y_pred))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Without tuning, we can see GBDT derived feature + LR get the best result"
]
},
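{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a wrap-up, a minimal sketch that packages the best-performing recipe (GBDT leaf indices -> one-hot -> LR) into one reusable function. The helper name `tree_feature_lr` is ours, not part of the original notebook:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def tree_feature_lr(X_tr, y_tr, X_te, n_estimators=20):\n",
"    \"\"\"Fit a GBDT, one-hot encode its leaf indices, then fit LR on them.\"\"\"\n",
"    gbdt = GradientBoostingClassifier(n_estimators=n_estimators).fit(X_tr, y_tr)\n",
"    enc = OneHotEncoder(categories='auto').fit(gbdt.apply(X_tr)[:, :, 0])\n",
"    lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
"    lr.fit(enc.transform(gbdt.apply(X_tr)[:, :, 0]), y_tr)\n",
"    # score the held-out set with the same encoder and trees\n",
"    return lr.predict_proba(enc.transform(gbdt.apply(X_te)[:, :, 0]))[:, 1]\n",
"\n",
"# usage: y_score = tree_feature_lr(X_train, y_train, X_test)\n",
"# print('AUC:', roc_auc_score(y_test, y_score))"
]
},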
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}