mirror of
https://github.com/ashishpatel26/Amazing-Feature-Engineering.git
synced 2022-05-07 18:26:02 +03:00
549 lines
19 KiB
Plaintext
549 lines
19 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 45,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"# import seaborn as sns\n",
|
||
"# import matplotlib.pyplot as plt\n",
|
||
"import os\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n",
|
||
"from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS\n",
|
||
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
|
||
"\n",
|
||
"# plt.style.use('seaborn-colorblind')\n",
|
||
"# %matplotlib inline\n",
|
||
"# from feature_selection import filter_method as ft"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Load Dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.datasets import load_breast_cancer\n",
|
||
"data = load_breast_cancer()\n",
|
||
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
|
||
" columns= np.append(data['feature_names'], ['target']))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>mean radius</th>\n",
|
||
" <th>mean texture</th>\n",
|
||
" <th>mean perimeter</th>\n",
|
||
" <th>mean area</th>\n",
|
||
" <th>mean smoothness</th>\n",
|
||
" <th>mean compactness</th>\n",
|
||
" <th>mean concavity</th>\n",
|
||
" <th>mean concave points</th>\n",
|
||
" <th>mean symmetry</th>\n",
|
||
" <th>mean fractal dimension</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>worst texture</th>\n",
|
||
" <th>worst perimeter</th>\n",
|
||
" <th>worst area</th>\n",
|
||
" <th>worst smoothness</th>\n",
|
||
" <th>worst compactness</th>\n",
|
||
" <th>worst concavity</th>\n",
|
||
" <th>worst concave points</th>\n",
|
||
" <th>worst symmetry</th>\n",
|
||
" <th>worst fractal dimension</th>\n",
|
||
" <th>target</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>17.99</td>\n",
|
||
" <td>10.38</td>\n",
|
||
" <td>122.80</td>\n",
|
||
" <td>1001.0</td>\n",
|
||
" <td>0.11840</td>\n",
|
||
" <td>0.27760</td>\n",
|
||
" <td>0.3001</td>\n",
|
||
" <td>0.14710</td>\n",
|
||
" <td>0.2419</td>\n",
|
||
" <td>0.07871</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>17.33</td>\n",
|
||
" <td>184.60</td>\n",
|
||
" <td>2019.0</td>\n",
|
||
" <td>0.1622</td>\n",
|
||
" <td>0.6656</td>\n",
|
||
" <td>0.7119</td>\n",
|
||
" <td>0.2654</td>\n",
|
||
" <td>0.4601</td>\n",
|
||
" <td>0.11890</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>20.57</td>\n",
|
||
" <td>17.77</td>\n",
|
||
" <td>132.90</td>\n",
|
||
" <td>1326.0</td>\n",
|
||
" <td>0.08474</td>\n",
|
||
" <td>0.07864</td>\n",
|
||
" <td>0.0869</td>\n",
|
||
" <td>0.07017</td>\n",
|
||
" <td>0.1812</td>\n",
|
||
" <td>0.05667</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>23.41</td>\n",
|
||
" <td>158.80</td>\n",
|
||
" <td>1956.0</td>\n",
|
||
" <td>0.1238</td>\n",
|
||
" <td>0.1866</td>\n",
|
||
" <td>0.2416</td>\n",
|
||
" <td>0.1860</td>\n",
|
||
" <td>0.2750</td>\n",
|
||
" <td>0.08902</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>19.69</td>\n",
|
||
" <td>21.25</td>\n",
|
||
" <td>130.00</td>\n",
|
||
" <td>1203.0</td>\n",
|
||
" <td>0.10960</td>\n",
|
||
" <td>0.15990</td>\n",
|
||
" <td>0.1974</td>\n",
|
||
" <td>0.12790</td>\n",
|
||
" <td>0.2069</td>\n",
|
||
" <td>0.05999</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>25.53</td>\n",
|
||
" <td>152.50</td>\n",
|
||
" <td>1709.0</td>\n",
|
||
" <td>0.1444</td>\n",
|
||
" <td>0.4245</td>\n",
|
||
" <td>0.4504</td>\n",
|
||
" <td>0.2430</td>\n",
|
||
" <td>0.3613</td>\n",
|
||
" <td>0.08758</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>11.42</td>\n",
|
||
" <td>20.38</td>\n",
|
||
" <td>77.58</td>\n",
|
||
" <td>386.1</td>\n",
|
||
" <td>0.14250</td>\n",
|
||
" <td>0.28390</td>\n",
|
||
" <td>0.2414</td>\n",
|
||
" <td>0.10520</td>\n",
|
||
" <td>0.2597</td>\n",
|
||
" <td>0.09744</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>26.50</td>\n",
|
||
" <td>98.87</td>\n",
|
||
" <td>567.7</td>\n",
|
||
" <td>0.2098</td>\n",
|
||
" <td>0.8663</td>\n",
|
||
" <td>0.6869</td>\n",
|
||
" <td>0.2575</td>\n",
|
||
" <td>0.6638</td>\n",
|
||
" <td>0.17300</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>20.29</td>\n",
|
||
" <td>14.34</td>\n",
|
||
" <td>135.10</td>\n",
|
||
" <td>1297.0</td>\n",
|
||
" <td>0.10030</td>\n",
|
||
" <td>0.13280</td>\n",
|
||
" <td>0.1980</td>\n",
|
||
" <td>0.10430</td>\n",
|
||
" <td>0.1809</td>\n",
|
||
" <td>0.05883</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>16.67</td>\n",
|
||
" <td>152.20</td>\n",
|
||
" <td>1575.0</td>\n",
|
||
" <td>0.1374</td>\n",
|
||
" <td>0.2050</td>\n",
|
||
" <td>0.4000</td>\n",
|
||
" <td>0.1625</td>\n",
|
||
" <td>0.2364</td>\n",
|
||
" <td>0.07678</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 31 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
|
||
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
|
||
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
|
||
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
|
||
"3 11.42 20.38 77.58 386.1 0.14250 \n",
|
||
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
|
||
"\n",
|
||
" mean compactness mean concavity mean concave points mean symmetry \\\n",
|
||
"0 0.27760 0.3001 0.14710 0.2419 \n",
|
||
"1 0.07864 0.0869 0.07017 0.1812 \n",
|
||
"2 0.15990 0.1974 0.12790 0.2069 \n",
|
||
"3 0.28390 0.2414 0.10520 0.2597 \n",
|
||
"4 0.13280 0.1980 0.10430 0.1809 \n",
|
||
"\n",
|
||
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
|
||
"0 0.07871 ... 17.33 184.60 2019.0 \n",
|
||
"1 0.05667 ... 23.41 158.80 1956.0 \n",
|
||
"2 0.05999 ... 25.53 152.50 1709.0 \n",
|
||
"3 0.09744 ... 26.50 98.87 567.7 \n",
|
||
"4 0.05883 ... 16.67 152.20 1575.0 \n",
|
||
"\n",
|
||
" worst smoothness worst compactness worst concavity worst concave points \\\n",
|
||
"0 0.1622 0.6656 0.7119 0.2654 \n",
|
||
"1 0.1238 0.1866 0.2416 0.1860 \n",
|
||
"2 0.1444 0.4245 0.4504 0.2430 \n",
|
||
"3 0.2098 0.8663 0.6869 0.2575 \n",
|
||
"4 0.1374 0.2050 0.4000 0.1625 \n",
|
||
"\n",
|
||
" worst symmetry worst fractal dimension target \n",
|
||
"0 0.4601 0.11890 0.0 \n",
|
||
"1 0.2750 0.08902 0.0 \n",
|
||
"2 0.3613 0.08758 0.0 \n",
|
||
"3 0.6638 0.17300 0.0 \n",
|
||
"4 0.2364 0.07678 0.0 \n",
|
||
"\n",
|
||
"[5 rows x 31 columns]"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data.head(n=5)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"((455, 30), (114, 30))"
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
|
||
" data.target, test_size=0.2,\n",
|
||
" random_state=0)\n",
|
||
"X_train.shape, X_test.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Forward Selection\n",
|
||
" "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.4s finished\n",
|
||
"Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
|
||
"Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
|
||
"Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.3s finished\n",
|
||
"Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.0s finished\n",
|
||
"Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
|
||
"Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
|
||
"Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
|
||
"Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.4s finished\n",
|
||
"Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.1s finished\n",
|
||
"Features: 10/10"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# step forward feature selection\n",
|
||
"# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
|
||
"\n",
|
||
"sfs1 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
|
||
" k_features=10, \n",
|
||
" forward=True, \n",
|
||
" floating=False, \n",
|
||
" verbose=1,\n",
|
||
" scoring='roc_auc',\n",
|
||
" cv=3)\n",
|
||
"\n",
|
||
"sfs1 = sfs1.fit(np.array(X_train), y_train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['mean texture', 'mean perimeter', 'mean concavity',\n",
|
||
" 'mean fractal dimension', 'area error', 'compactness error',\n",
|
||
" 'worst perimeter', 'worst area', 'worst smoothness', 'worst symmetry'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"selected_feat1= X_train.columns[list(sfs1.k_feature_idx_)]\n",
|
||
"selected_feat1"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Backward Elimination"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.5s finished\n",
|
||
"Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
|
||
"Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
|
||
"Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.2s finished\n",
|
||
"Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.1s finished\n",
|
||
"Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
|
||
"Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
|
||
"Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
|
||
"Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.5s finished\n",
|
||
"Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
|
||
"[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.2s finished\n",
|
||
"Features: 10/10"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"# step backward feature selection\n",
|
||
"# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
|
||
"\n",
|
||
"sfs2 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
|
||
" k_features=10, \n",
|
||
" forward=False, \n",
|
||
" floating=False, \n",
|
||
" verbose=1,\n",
|
||
" scoring='roc_auc',\n",
|
||
" cv=3)\n",
|
||
"\n",
|
||
"sfs2 = sfs1.fit(np.array(X_train.fillna(0)), y_train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 44,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['mean area', 'mean compactness', 'texture error', 'area error',\n",
|
||
" 'compactness error', 'concavity error', 'worst texture',\n",
|
||
" 'worst perimeter', 'worst smoothness', 'worst concavity'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 44,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"selected_feat2= X_train.columns[list(sfs2.k_feature_idx_)]\n",
|
||
"selected_feat2\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Note that forward selection (SFS) and backward elimination (SBE) return different feature subsets."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Exhaustive Feature Selection"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 51,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"name": "stderr",
|
||
"output_type": "stream",
|
||
"text": [
|
||
"Features: 847/847"
|
||
]
|
||
}
|
||
],
|
||
"source": [
|
||
"efs1 = EFS(RandomForestClassifier(n_jobs=-1,n_estimators=5, random_state=0), \n",
|
||
" min_features=1,\n",
|
||
" max_features=6, \n",
|
||
" scoring='roc_auc',\n",
|
||
" print_progress=True,\n",
|
||
" cv=2)\n",
|
||
"\n",
|
||
"# in order to shorter search time for the demonstration\n",
|
||
"# we only try all possible 1,2,3,4,5,6\n",
|
||
"# feature combinations from a dataset of 10 features\n",
|
||
"\n",
|
||
"efs1 = efs1.fit(np.array(X_train[X_train.columns[0:10]].fillna(0)), y_train)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"Index(['mean radius', 'mean texture', 'mean area', 'mean smoothness',\n",
|
||
" 'mean concavity'],\n",
|
||
" dtype='object')"
|
||
]
|
||
},
|
||
"execution_count": 52,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"selected_feat3= X_train.columns[list(efs1.best_idx_)]\n",
|
||
"selected_feat3"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.6.1"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|