Files
Amazing-Feature-Engineering/4.4_Demo_Feature_Selection_Feature_Shuffling.ipynb
DESKTOP-SAT83DL\yimeng.zhang 149c80cedf 2018.12.2 First commit.
2018-12-02 21:11:32 +08:00

596 lines
19 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"# import seaborn as sns\n",
"# import matplotlib.pyplot as plt\n",
"import os\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_selection import SelectFromModel\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"# plt.style.use('seaborn-colorblind')\n",
"# %matplotlib inline\n",
"from feature_selection import feature_shuffle\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.datasets import load_breast_cancer\n",
"data = load_breast_cancer()\n",
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
" columns= np.append(data['feature_names'], ['target']))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>mean radius</th>\n",
" <th>mean texture</th>\n",
" <th>mean perimeter</th>\n",
" <th>mean area</th>\n",
" <th>mean smoothness</th>\n",
" <th>mean compactness</th>\n",
" <th>mean concavity</th>\n",
" <th>mean concave points</th>\n",
" <th>mean symmetry</th>\n",
" <th>mean fractal dimension</th>\n",
" <th>...</th>\n",
" <th>worst texture</th>\n",
" <th>worst perimeter</th>\n",
" <th>worst area</th>\n",
" <th>worst smoothness</th>\n",
" <th>worst compactness</th>\n",
" <th>worst concavity</th>\n",
" <th>worst concave points</th>\n",
" <th>worst symmetry</th>\n",
" <th>worst fractal dimension</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>17.99</td>\n",
" <td>10.38</td>\n",
" <td>122.80</td>\n",
" <td>1001.0</td>\n",
" <td>0.11840</td>\n",
" <td>0.27760</td>\n",
" <td>0.3001</td>\n",
" <td>0.14710</td>\n",
" <td>0.2419</td>\n",
" <td>0.07871</td>\n",
" <td>...</td>\n",
" <td>17.33</td>\n",
" <td>184.60</td>\n",
" <td>2019.0</td>\n",
" <td>0.1622</td>\n",
" <td>0.6656</td>\n",
" <td>0.7119</td>\n",
" <td>0.2654</td>\n",
" <td>0.4601</td>\n",
" <td>0.11890</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>20.57</td>\n",
" <td>17.77</td>\n",
" <td>132.90</td>\n",
" <td>1326.0</td>\n",
" <td>0.08474</td>\n",
" <td>0.07864</td>\n",
" <td>0.0869</td>\n",
" <td>0.07017</td>\n",
" <td>0.1812</td>\n",
" <td>0.05667</td>\n",
" <td>...</td>\n",
" <td>23.41</td>\n",
" <td>158.80</td>\n",
" <td>1956.0</td>\n",
" <td>0.1238</td>\n",
" <td>0.1866</td>\n",
" <td>0.2416</td>\n",
" <td>0.1860</td>\n",
" <td>0.2750</td>\n",
" <td>0.08902</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>19.69</td>\n",
" <td>21.25</td>\n",
" <td>130.00</td>\n",
" <td>1203.0</td>\n",
" <td>0.10960</td>\n",
" <td>0.15990</td>\n",
" <td>0.1974</td>\n",
" <td>0.12790</td>\n",
" <td>0.2069</td>\n",
" <td>0.05999</td>\n",
" <td>...</td>\n",
" <td>25.53</td>\n",
" <td>152.50</td>\n",
" <td>1709.0</td>\n",
" <td>0.1444</td>\n",
" <td>0.4245</td>\n",
" <td>0.4504</td>\n",
" <td>0.2430</td>\n",
" <td>0.3613</td>\n",
" <td>0.08758</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>11.42</td>\n",
" <td>20.38</td>\n",
" <td>77.58</td>\n",
" <td>386.1</td>\n",
" <td>0.14250</td>\n",
" <td>0.28390</td>\n",
" <td>0.2414</td>\n",
" <td>0.10520</td>\n",
" <td>0.2597</td>\n",
" <td>0.09744</td>\n",
" <td>...</td>\n",
" <td>26.50</td>\n",
" <td>98.87</td>\n",
" <td>567.7</td>\n",
" <td>0.2098</td>\n",
" <td>0.8663</td>\n",
" <td>0.6869</td>\n",
" <td>0.2575</td>\n",
" <td>0.6638</td>\n",
" <td>0.17300</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>20.29</td>\n",
" <td>14.34</td>\n",
" <td>135.10</td>\n",
" <td>1297.0</td>\n",
" <td>0.10030</td>\n",
" <td>0.13280</td>\n",
" <td>0.1980</td>\n",
" <td>0.10430</td>\n",
" <td>0.1809</td>\n",
" <td>0.05883</td>\n",
" <td>...</td>\n",
" <td>16.67</td>\n",
" <td>152.20</td>\n",
" <td>1575.0</td>\n",
" <td>0.1374</td>\n",
" <td>0.2050</td>\n",
" <td>0.4000</td>\n",
" <td>0.1625</td>\n",
" <td>0.2364</td>\n",
" <td>0.07678</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 31 columns</p>\n",
"</div>"
],
"text/plain": [
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
"3 11.42 20.38 77.58 386.1 0.14250 \n",
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
"\n",
" mean compactness mean concavity mean concave points mean symmetry \\\n",
"0 0.27760 0.3001 0.14710 0.2419 \n",
"1 0.07864 0.0869 0.07017 0.1812 \n",
"2 0.15990 0.1974 0.12790 0.2069 \n",
"3 0.28390 0.2414 0.10520 0.2597 \n",
"4 0.13280 0.1980 0.10430 0.1809 \n",
"\n",
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
"0 0.07871 ... 17.33 184.60 2019.0 \n",
"1 0.05667 ... 23.41 158.80 1956.0 \n",
"2 0.05999 ... 25.53 152.50 1709.0 \n",
"3 0.09744 ... 26.50 98.87 567.7 \n",
"4 0.05883 ... 16.67 152.20 1575.0 \n",
"\n",
" worst smoothness worst compactness worst concavity worst concave points \\\n",
"0 0.1622 0.6656 0.7119 0.2654 \n",
"1 0.1238 0.1866 0.2416 0.1860 \n",
"2 0.1444 0.4245 0.4504 0.2430 \n",
"3 0.2098 0.8663 0.6869 0.2575 \n",
"4 0.1374 0.2050 0.4000 0.1625 \n",
"\n",
" worst symmetry worst fractal dimension target \n",
"0 0.4601 0.11890 0.0 \n",
"1 0.2750 0.08902 0.0 \n",
"2 0.3613 0.08758 0.0 \n",
"3 0.6638 0.17300 0.0 \n",
"4 0.2364 0.07678 0.0 \n",
"\n",
"[5 rows x 31 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.head(5)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"((455, 30), (114, 30))"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
" data.target, test_size=0.2,\n",
" random_state=0)\n",
"X_train.shape, X_test.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Feature Shuffling\n",
"permute the values of each feature, one at the time, and measure how much the permutation decreases the accuracy, or the roc_auc, or the mse of the machine learning model.\n",
"If the variables are important, this is, highly predictive, a random permutation of their values will decrease dramatically any of these metrics."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"auc_drop, selected_features = feature_shuffle.feature_shuffle_rf(X_train=X_train,\n",
" y_train=y_train,\n",
" random_state=0)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>feature</th>\n",
" <th>auc_drop</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>worst perimeter</td>\n",
" <td>8.359457e-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>worst concave points</td>\n",
" <td>3.134796e-05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>worst area</td>\n",
" <td>1.110223e-16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>perimeter error</td>\n",
" <td>1.110223e-16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>mean radius</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>concavity error</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>worst symmetry</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>worst concavity</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>worst compactness</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>worst smoothness</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>worst texture</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>worst radius</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>fractal dimension error</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>symmetry error</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>concave points error</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>compactness error</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>mean texture</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>smoothness error</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>area error</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>texture error</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>radius error</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>mean fractal dimension</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>mean symmetry</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>mean concave points</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>mean concavity</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>mean compactness</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>mean smoothness</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>mean area</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>mean perimeter</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>worst fractal dimension</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" feature auc_drop\n",
"22 worst perimeter 8.359457e-05\n",
"27 worst concave points 3.134796e-05\n",
"23 worst area 1.110223e-16\n",
"12 perimeter error 1.110223e-16\n",
"0 mean radius 0.000000e+00\n",
"16 concavity error 0.000000e+00\n",
"28 worst symmetry 0.000000e+00\n",
"26 worst concavity 0.000000e+00\n",
"25 worst compactness 0.000000e+00\n",
"24 worst smoothness 0.000000e+00\n",
"21 worst texture 0.000000e+00\n",
"20 worst radius 0.000000e+00\n",
"19 fractal dimension error 0.000000e+00\n",
"18 symmetry error 0.000000e+00\n",
"17 concave points error 0.000000e+00\n",
"15 compactness error 0.000000e+00\n",
"1 mean texture 0.000000e+00\n",
"14 smoothness error 0.000000e+00\n",
"13 area error 0.000000e+00\n",
"11 texture error 0.000000e+00\n",
"10 radius error 0.000000e+00\n",
"9 mean fractal dimension 0.000000e+00\n",
"8 mean symmetry 0.000000e+00\n",
"7 mean concave points 0.000000e+00\n",
"6 mean concavity 0.000000e+00\n",
"5 mean compactness 0.000000e+00\n",
"4 mean smoothness 0.000000e+00\n",
"3 mean area 0.000000e+00\n",
"2 mean perimeter 0.000000e+00\n",
"29 worst fractal dimension 0.000000e+00"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# we select features that have auc_drop > 0\n",
"auc_drop"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"22 worst perimeter\n",
"27 worst concave points\n",
"23 worst area\n",
"12 perimeter error\n",
"Name: feature, dtype: object"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"selected_features"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}