Files
Amazing-Feature-Engineering/4.2_Demo_Feature_Selection_Wrapper.ipynb
DESKTOP-SAT83DL\yimeng.zhang 149c80cedf 2018.12.2 First commit.
2018-12-02 21:11:32 +08:00

549 lines
19 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"# import seaborn as sns\n",
"# import matplotlib.pyplot as plt\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"from mlxtend.feature_selection import SequentialFeatureSelector as SFS\n",
"from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS\n",
"from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier\n",
"\n",
"# plt.style.use('seaborn-colorblind')\n",
"# %matplotlib inline\n",
"# from feature_selection import filter_method as ft"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.datasets import load_breast_cancer\n",
"data = load_breast_cancer()\n",
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
" columns= np.append(data['feature_names'], ['target']))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>mean radius</th>\n",
" <th>mean texture</th>\n",
" <th>mean perimeter</th>\n",
" <th>mean area</th>\n",
" <th>mean smoothness</th>\n",
" <th>mean compactness</th>\n",
" <th>mean concavity</th>\n",
" <th>mean concave points</th>\n",
" <th>mean symmetry</th>\n",
" <th>mean fractal dimension</th>\n",
" <th>...</th>\n",
" <th>worst texture</th>\n",
" <th>worst perimeter</th>\n",
" <th>worst area</th>\n",
" <th>worst smoothness</th>\n",
" <th>worst compactness</th>\n",
" <th>worst concavity</th>\n",
" <th>worst concave points</th>\n",
" <th>worst symmetry</th>\n",
" <th>worst fractal dimension</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.80</td>\n",
" <td>1001.0</td>\n",
" <td>0.11840</td>\n",
" <td>0.27760</td>\n",
" <td>0.3001</td>\n",
" <td>0.14710</td>\n",
" <td>0.2419</td>\n",
" <td>0.07871</td>\n",
" <td>...</td>\n",
" <td>17.33</td>\n",
" <td>184.60</td>\n",
" <td>2019.0</td>\n",
" <td>0.1622</td>\n",
" <td>0.6656</td>\n",
" <td>0.7119</td>\n",
" <td>0.2654</td>\n",
" <td>0.4601</td>\n",
" <td>0.11890</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.90</td>\n",
" <td>1326.0</td>\n",
" <td>0.08474</td>\n",
" <td>0.07864</td>\n",
" <td>0.0869</td>\n",
" <td>0.07017</td>\n",
" <td>0.1812</td>\n",
" <td>0.05667</td>\n",
" <td>...</td>\n",
" <td>23.41</td>\n",
" <td>158.80</td>\n",
" <td>1956.0</td>\n",
" <td>0.1238</td>\n",
" <td>0.1866</td>\n",
" <td>0.2416</td>\n",
" <td>0.1860</td>\n",
" <td>0.2750</td>\n",
" <td>0.08902</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130.00</td>\n",
" <td>1203.0</td>\n",
" <td>0.10960</td>\n",
" <td>0.15990</td>\n",
" <td>0.1974</td>\n",
" <td>0.12790</td>\n",
" <td>0.2069</td>\n",
" <td>0.05999</td>\n",
" <td>...</td>\n",
" <td>25.53</td>\n",
" <td>152.50</td>\n",
" <td>1709.0</td>\n",
" <td>0.1444</td>\n",
" <td>0.4245</td>\n",
" <td>0.4504</td>\n",
" <td>0.2430</td>\n",
" <td>0.3613</td>\n",
" <td>0.08758</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.14250</td>\n",
" <td>0.28390</td>\n",
" <td>0.2414</td>\n",
" <td>0.10520</td>\n",
" <td>0.2597</td>\n",
" <td>0.09744</td>\n",
" <td>...</td>\n",
" <td>26.50</td>\n",
" <td>98.87</td>\n",
" <td>567.7</td>\n",
" <td>0.2098</td>\n",
" <td>0.8663</td>\n",
" <td>0.6869</td>\n",
" <td>0.2575</td>\n",
" <td>0.6638</td>\n",
" <td>0.17300</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.10</td>\n",
" <td>1297.0</td>\n",
" <td>0.10030</td>\n",
" <td>0.13280</td>\n",
" <td>0.1980</td>\n",
" <td>0.10430</td>\n",
" <td>0.1809</td>\n",
" <td>0.05883</td>\n",
" <td>...</td>\n",
" <td>16.67</td>\n",
" <td>152.20</td>\n",
" <td>1575.0</td>\n",
" <td>0.1374</td>\n",
" <td>0.2050</td>\n",
" <td>0.4000</td>\n",
" <td>0.1625</td>\n",
" <td>0.2364</td>\n",
" <td>0.07678</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 31 columns</p>\n",
"</div>"
],
"text/plain": [
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
"3 11.42 20.38 77.58 386.1 0.14250 \n",
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
"\n",
" mean compactness mean concavity mean concave points mean symmetry \\\n",
"0 0.27760 0.3001 0.14710 0.2419 \n",
"1 0.07864 0.0869 0.07017 0.1812 \n",
"2 0.15990 0.1974 0.12790 0.2069 \n",
"3 0.28390 0.2414 0.10520 0.2597 \n",
"4 0.13280 0.1980 0.10430 0.1809 \n",
"\n",
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
"0 0.07871 ... 17.33 184.60 2019.0 \n",
"1 0.05667 ... 23.41 158.80 1956.0 \n",
"2 0.05999 ... 25.53 152.50 1709.0 \n",
"3 0.09744 ... 26.50 98.87 567.7 \n",
"4 0.05883 ... 16.67 152.20 1575.0 \n",
"\n",
" worst smoothness worst compactness worst concavity worst concave points \\\n",
"0 0.1622 0.6656 0.7119 0.2654 \n",
"1 0.1238 0.1866 0.2416 0.1860 \n",
"2 0.1444 0.4245 0.4504 0.2430 \n",
"3 0.2098 0.8663 0.6869 0.2575 \n",
"4 0.1374 0.2050 0.4000 0.1625 \n",
"\n",
" worst symmetry worst fractal dimension target \n",
"0 0.4601 0.11890 0.0 \n",
"1 0.2750 0.08902 0.0 \n",
"2 0.3613 0.08758 0.0 \n",
"3 0.6638 0.17300 0.0 \n",
"4 0.2364 0.07678 0.0 \n",
"\n",
"[5 rows x 31 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((455, 30), (114, 30))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
" data.target, test_size=0.2,\n",
" random_state=0)\n",
"X_train.shape, X_test.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Forward Selection\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.4s finished\n",
"Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
"Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
"Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.3s finished\n",
"Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.0s finished\n",
"Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
"Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
"Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
"Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.4s finished\n",
"Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.1s finished\n",
"Features: 10/10"
]
}
],
"source": [
"# step forward feature selection\n",
"# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
"\n",
"sfs1 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
" k_features=10, \n",
" forward=True, \n",
" floating=False, \n",
" verbose=1,\n",
" scoring='roc_auc',\n",
" cv=3)\n",
"\n",
"sfs1 = sfs1.fit(np.array(X_train), y_train)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['mean texture', 'mean perimeter', 'mean concavity',\n",
" 'mean fractal dimension', 'area error', 'compactness error',\n",
" 'worst perimeter', 'worst area', 'worst smoothness', 'worst symmetry'],\n",
" dtype='object')"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"selected_feat1= X_train.columns[list(sfs1.k_feature_idx_)]\n",
"selected_feat1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Backward Elimination"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 30 out of 30 | elapsed: 11.5s finished\n",
"Features: 1/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 29 out of 29 | elapsed: 11.2s finished\n",
"Features: 2/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 28 out of 28 | elapsed: 10.7s finished\n",
"Features: 3/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 27 out of 27 | elapsed: 10.2s finished\n",
"Features: 4/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 26 out of 26 | elapsed: 10.1s finished\n",
"Features: 5/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 25 out of 25 | elapsed: 9.6s finished\n",
"Features: 6/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 24 out of 24 | elapsed: 9.2s finished\n",
"Features: 7/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 23 out of 23 | elapsed: 8.8s finished\n",
"Features: 8/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 22 out of 22 | elapsed: 8.5s finished\n",
"Features: 9/10[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.\n",
"[Parallel(n_jobs=1)]: Done 21 out of 21 | elapsed: 8.2s finished\n",
"Features: 10/10"
]
}
],
"source": [
"# step backward feature selection\n",
"# select top 10 features based on the optimal roc_auc and RandomForest Classifier\n",
"\n",
"sfs2 = SFS(RandomForestClassifier(n_jobs=-1,n_estimators=5), \n",
" k_features=10, \n",
" forward=False, \n",
" floating=False, \n",
" verbose=1,\n",
" scoring='roc_auc',\n",
" cv=3)\n",
"\n",
"sfs2 = sfs1.fit(np.array(X_train.fillna(0)), y_train)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['mean area', 'mean compactness', 'texture error', 'area error',\n",
" 'compactness error', 'concavity error', 'worst texture',\n",
" 'worst perimeter', 'worst smoothness', 'worst concavity'],\n",
" dtype='object')"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"selected_feat2= X_train.columns[list(sfs2.k_feature_idx_)]\n",
"selected_feat2\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note that SFS and SBE return different results"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Exhaustive Feature Selection"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Features: 847/847"
]
}
],
"source": [
"efs1 = EFS(RandomForestClassifier(n_jobs=-1,n_estimators=5, random_state=0), \n",
" min_features=1,\n",
" max_features=6, \n",
" scoring='roc_auc',\n",
" print_progress=True,\n",
" cv=2)\n",
"\n",
"# in order to shorter search time for the demonstration\n",
"# we only try all possible 1,2,3,4,5,6\n",
"# feature combinations from a dataset of 10 features\n",
"\n",
"efs1 = efs1.fit(np.array(X_train[X_train.columns[0:10]].fillna(0)), y_train)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['mean radius', 'mean texture', 'mean area', 'mean smoothness',\n",
" 'mean concavity'],\n",
" dtype='object')"
]
},
"execution_count": 52,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"selected_feat3= X_train.columns[list(efs1.best_idx_)]\n",
"selected_feat3"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}