mirror of
https://github.com/ashishpatel26/Amazing-Feature-Engineering.git
synced 2022-05-07 18:26:02 +03:00
523 lines
15 KiB
Plaintext
523 lines
15 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"# import seaborn as sns\n",
|
||
"# import matplotlib.pyplot as plt\n",
|
||
"import os\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.metrics import roc_curve, roc_auc_score\n",
|
||
"\n",
|
||
"# plt.style.use('seaborn-colorblind')\n",
|
||
"# %matplotlib inline\n",
|
||
"#from feature_cleaning import rare_values as ra"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Load Dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"use_cols = [\n",
|
||
" 'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
|
||
" 'Survived'\n",
|
||
"]\n",
|
||
"\n",
|
||
"data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>Survived</th>\n",
|
||
" <th>Pclass</th>\n",
|
||
" <th>Sex</th>\n",
|
||
" <th>Age</th>\n",
|
||
" <th>SibSp</th>\n",
|
||
" <th>Fare</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>0</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>male</td>\n",
|
||
" <td>22.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>7.2500</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>38.0</td>\n",
|
||
" <td>1</td>\n",
|
||
" <td>71.2833</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>1</td>\n",
|
||
" <td>3</td>\n",
|
||
" <td>female</td>\n",
|
||
" <td>26.0</td>\n",
|
||
" <td>0</td>\n",
|
||
" <td>7.9250</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" Survived Pclass Sex Age SibSp Fare\n",
|
||
"0 0 3 male 22.0 1 7.2500\n",
|
||
"1 1 1 female 38.0 1 71.2833\n",
|
||
"2 1 3 female 26.0 0 7.9250"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data.head(3)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"((623, 6), (268, 6))"
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# Note that we include target variable in the X_train \n",
|
||
"# (only the feature columns are selected from it before the models below are fitted);\n",
|
||
"# this is not the standard way of using train-test-split\n",
|
||
"X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
|
||
" random_state=0)\n",
|
||
"X_train.shape, X_test.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Polynomial Expansion\n",
|
||
"\n",
|
||
"Generate a new feature set consisting of all polynomial combinations of the features with degree less than or equal to the specified degree."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
" Pclass SibSp Pclass^2 Pclass SibSp SibSp^2\n",
|
||
"0 1.0 0.0 1.0 0.0 0.0\n",
|
||
"1 1.0 1.0 1.0 1.0 1.0\n",
|
||
"2 3.0 5.0 9.0 15.0 25.0\n",
|
||
"3 1.0 0.0 1.0 0.0 0.0\n",
|
||
"4 3.0 1.0 9.0 3.0 1.0\n",
|
||
"5 2.0 1.0 4.0 2.0 1.0\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# create polynomial combinations of feature 'Pclass','SibSp' with degree 2\n",
|
||
"from sklearn.preprocessing import PolynomialFeatures\n",
|
||
"pf = PolynomialFeatures(degree=2,include_bias=False).fit(X_train[['Pclass','SibSp']])\n",
|
||
"tmp = pf.transform(X_train[['Pclass','SibSp']])\n",
|
||
"X_train_copy = pd.DataFrame(tmp,columns=pf.get_feature_names(['Pclass','SibSp']))\n",
|
||
"print(X_train_copy.head(6))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Feature Learning by Trees\n",
|
||
"GBDT derived feature + LR"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"sample's belonging node of each base tree \n",
|
||
"' [[ 7. 7. 6. ... 4. 7. 4.]\n",
|
||
" [ 7. 7. 6. ... 14. 7. 7.]\n",
|
||
" [11. 11. 11. ... 4. 6. 11.]\n",
|
||
" ...\n",
|
||
" [10. 10. 10. ... 4. 6. 10.]\n",
|
||
" [13. 14. 13. ... 4. 7. 13.]\n",
|
||
" [ 7. 7. 6. ... 6. 7. 7.]]\n",
|
||
"AUC for GBDT derived feature + LR: 0.7746130952380953\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
|
||
"If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
|
||
"In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
|
||
" warnings.warn(msg, FutureWarning)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier\n",
|
||
"from sklearn.preprocessing import OneHotEncoder\n",
|
||
"\n",
|
||
"gbdt = GradientBoostingClassifier(n_estimators=20)\n",
|
||
"one_hot = OneHotEncoder()\n",
|
||
"\n",
|
||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||
"\n",
|
||
"gbdt.fit(X_train, y_train)\n",
|
||
"\n",
|
||
"X_leaf_index = gbdt.apply(X_train)[:, :, 0] # apply return the node index on each tree \n",
|
||
"print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
|
||
"# fit one-hot encoder\n",
|
||
"one_hot.fit(X_leaf_index) \n",
|
||
"X_one_hot = one_hot.transform(X_leaf_index) \n",
|
||
"\n",
|
||
"\n",
|
||
"from sklearn.linear_model import LogisticRegression\n",
|
||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||
"lr.fit(X_one_hot,y_train)\n",
|
||
"y_pred = lr.predict_proba(\n",
|
||
" one_hot.transform(gbdt.apply(X_test)[:, :, 0]))[:,1]\n",
|
||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||
"print(\"AUC for GBDT derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Feature Learning by Trees\n",
|
||
"RandomForest derived feature + LR"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"sample's belonging node of each base tree \n",
|
||
"' [[212 35 79 ... 146 60 46]\n",
|
||
" [307 165 266 ... 136 132 44]\n",
|
||
" [285 285 320 ... 301 294 300]\n",
|
||
" ...\n",
|
||
" [ 13 177 133 ... 186 169 117]\n",
|
||
" [190 296 311 ... 282 289 297]\n",
|
||
" [264 165 243 ... 152 110 314]]\n",
|
||
"AUC for RandomForest derived feature + LR: 0.759672619047619\n"
|
||
]
|
||
},
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"C:\\ProgramData\\Anaconda3\\lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:368: FutureWarning: The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.\n",
|
||
"If you want the future behaviour and silence this warning, you can specify \"categories='auto'\".\n",
|
||
"In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.\n",
|
||
" warnings.warn(msg, FutureWarning)\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"rf = RandomForestClassifier(n_estimators=20, random_state=0)  # fixed seed (same as the split) so the leaf features and AUC are reproducible\n",
|
||
"one_hot = OneHotEncoder()\n",
|
||
"\n",
|
||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||
"\n",
|
||
"rf.fit(X_train, y_train)\n",
|
||
"\n",
|
||
"X_leaf_index = rf.apply(X_train) # apply return the node index on each tree \n",
|
||
"print(\"sample's belonging node of each base tree \\n'\",X_leaf_index)\n",
|
||
"# fit one-hot encoder\n",
|
||
"one_hot.fit(X_leaf_index) \n",
|
||
"X_one_hot = one_hot.transform(X_leaf_index) \n",
|
||
"\n",
|
||
"\n",
|
||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||
"lr.fit(X_one_hot,y_train)\n",
|
||
"y_pred = lr.predict_proba(\n",
|
||
" one_hot.transform(rf.apply(X_test)))[:,1]\n",
|
||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||
"print(\"AUC for RandomForest derived feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"source": [
|
||
"## Feature Learning by Trees\n",
|
||
"GBDT derived feature + Raw feature +LR"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"AUC for GBDT derived feature + Raw feature +LR: 0.7603571428571428\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"from scipy.sparse import hstack\n",
|
||
"\n",
|
||
"# `one_hot` was re-created and fitted on the random-forest leaf indices in the previous cell,\n",
|
||
"# so fit a dedicated encoder on the GBDT leaf indices before encoding them.\n",
|
||
"gbdt_one_hot = OneHotEncoder().fit(gbdt.apply(X_train)[:, :, 0])\n",
|
||
"X_train_ext = hstack([gbdt_one_hot.transform(gbdt.apply(X_train)[:, :, 0]), X_train])\n",
|
||
"X_test_ext = hstack([gbdt_one_hot.transform(gbdt.apply(X_test)[:, :, 0]), X_test])\n",
|
||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||
"lr.fit(X_train_ext,y_train)\n",
|
||
"y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
|
||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||
"print(\"AUC for GBDT derived feature + Raw feature +LR:\", roc_auc_score(y_test, y_pred))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Feature Learning by Trees\n",
|
||
"RandomForest derived feature + Raw feature +LR"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"AUC for RandomForest derived feature + Raw feature + LR: 0.76\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"X_train_ext = hstack([one_hot.transform(rf.apply(X_train)), X_train])\n",
|
||
"X_test_ext = hstack([one_hot.transform(rf.apply(X_test)), X_test])\n",
|
||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||
"lr.fit(X_train_ext,y_train)\n",
|
||
"y_pred = lr.predict_proba(X_test_ext)[:,1]\n",
|
||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||
"print(\"AUC for RandomForest derived feature + Raw feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Feature Learning by Trees\n",
|
||
"Use only Raw Feature + LR"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"AUC for RandomForest derived feature + LR: 0.6988690476190476\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"lr = LogisticRegression(solver='lbfgs', max_iter=1000)\n",
|
||
"lr.fit(X_train,y_train)\n",
|
||
"y_pred = lr.predict_proba(X_test)[:,1]\n",
|
||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||
"print(\"AUC for Raw feature + LR:\", roc_auc_score(y_test, y_pred))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Feature Learning by Trees\n",
|
||
"\n",
|
||
"Use only Raw Feature + GBDT"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"AUC for Raw feature + GBDT: 0.7613988095238096\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"gbdt = GradientBoostingClassifier(n_estimators=20)\n",
|
||
"\n",
|
||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||
"\n",
|
||
"gbdt.fit(X_train, y_train)\n",
|
||
"y_pred = gbdt.predict_proba(X_test)[:,1]\n",
|
||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||
"print(\"AUC for Raw feature + GBDT:\", roc_auc_score(y_test, y_pred))\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Feature Learning by Trees\n",
|
||
"\n",
|
||
"Use only Raw Feature + RF\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stdout",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"AUC for Raw feature + RF: 0.7235119047619047\n"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"rf = RandomForestClassifier(n_estimators=20, random_state=0)  # fixed seed (same as the split) so the reported AUC is reproducible\n",
|
||
"\n",
|
||
"X_train = X_train[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||
"X_test = X_test[[ 'Pclass', 'Age', 'Fare', 'SibSp']].fillna(0)\n",
|
||
"\n",
|
||
"rf.fit(X_train, y_train)\n",
|
||
"y_pred = rf.predict_proba(X_test)[:,1]\n",
|
||
"fpr_grd_lm, tpr_grd_lm, _ = roc_curve(y_test, y_pred)\n",
|
||
"print(\"AUC for Raw feature + RF:\", roc_auc_score(y_test, y_pred))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"#### Without tuning, we can see that GBDT derived feature + LR gets the best result"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.6.1"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|