mirror of
https://github.com/ashishpatel26/Amazing-Feature-Engineering.git
synced 2022-05-07 18:26:02 +03:00
596 lines
19 KiB
Plaintext
596 lines
19 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"import numpy as np\n",
|
||
"# import seaborn as sns\n",
|
||
"# import matplotlib.pyplot as plt\n",
|
||
"import os\n",
|
||
"from sklearn.model_selection import train_test_split\n",
|
||
"from sklearn.feature_selection import SelectFromModel\n",
|
||
"from sklearn.ensemble import RandomForestClassifier\n",
|
||
"# plt.style.use('seaborn-colorblind')\n",
|
||
"# %matplotlib inline\n",
|
||
"from feature_selection import feature_shuffle\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Load Dataset"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"from sklearn.datasets import load_breast_cancer\n",
|
||
"data = load_breast_cancer()\n",
|
||
"data = pd.DataFrame(np.c_[data['data'], data['target']],\n",
|
||
" columns= np.append(data['feature_names'], ['target']))"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>mean radius</th>\n",
|
||
" <th>mean texture</th>\n",
|
||
" <th>mean perimeter</th>\n",
|
||
" <th>mean area</th>\n",
|
||
" <th>mean smoothness</th>\n",
|
||
" <th>mean compactness</th>\n",
|
||
" <th>mean concavity</th>\n",
|
||
" <th>mean concave points</th>\n",
|
||
" <th>mean symmetry</th>\n",
|
||
" <th>mean fractal dimension</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>worst texture</th>\n",
|
||
" <th>worst perimeter</th>\n",
|
||
" <th>worst area</th>\n",
|
||
" <th>worst smoothness</th>\n",
|
||
" <th>worst compactness</th>\n",
|
||
" <th>worst concavity</th>\n",
|
||
" <th>worst concave points</th>\n",
|
||
" <th>worst symmetry</th>\n",
|
||
" <th>worst fractal dimension</th>\n",
|
||
" <th>target</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>17.99</td>\n",
|
||
" <td>10.38</td>\n",
|
||
" <td>122.80</td>\n",
|
||
" <td>1001.0</td>\n",
|
||
" <td>0.11840</td>\n",
|
||
" <td>0.27760</td>\n",
|
||
" <td>0.3001</td>\n",
|
||
" <td>0.14710</td>\n",
|
||
" <td>0.2419</td>\n",
|
||
" <td>0.07871</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>17.33</td>\n",
|
||
" <td>184.60</td>\n",
|
||
" <td>2019.0</td>\n",
|
||
" <td>0.1622</td>\n",
|
||
" <td>0.6656</td>\n",
|
||
" <td>0.7119</td>\n",
|
||
" <td>0.2654</td>\n",
|
||
" <td>0.4601</td>\n",
|
||
" <td>0.11890</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>20.57</td>\n",
|
||
" <td>17.77</td>\n",
|
||
" <td>132.90</td>\n",
|
||
" <td>1326.0</td>\n",
|
||
" <td>0.08474</td>\n",
|
||
" <td>0.07864</td>\n",
|
||
" <td>0.0869</td>\n",
|
||
" <td>0.07017</td>\n",
|
||
" <td>0.1812</td>\n",
|
||
" <td>0.05667</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>23.41</td>\n",
|
||
" <td>158.80</td>\n",
|
||
" <td>1956.0</td>\n",
|
||
" <td>0.1238</td>\n",
|
||
" <td>0.1866</td>\n",
|
||
" <td>0.2416</td>\n",
|
||
" <td>0.1860</td>\n",
|
||
" <td>0.2750</td>\n",
|
||
" <td>0.08902</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>19.69</td>\n",
|
||
" <td>21.25</td>\n",
|
||
" <td>130.00</td>\n",
|
||
" <td>1203.0</td>\n",
|
||
" <td>0.10960</td>\n",
|
||
" <td>0.15990</td>\n",
|
||
" <td>0.1974</td>\n",
|
||
" <td>0.12790</td>\n",
|
||
" <td>0.2069</td>\n",
|
||
" <td>0.05999</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>25.53</td>\n",
|
||
" <td>152.50</td>\n",
|
||
" <td>1709.0</td>\n",
|
||
" <td>0.1444</td>\n",
|
||
" <td>0.4245</td>\n",
|
||
" <td>0.4504</td>\n",
|
||
" <td>0.2430</td>\n",
|
||
" <td>0.3613</td>\n",
|
||
" <td>0.08758</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>11.42</td>\n",
|
||
" <td>20.38</td>\n",
|
||
" <td>77.58</td>\n",
|
||
" <td>386.1</td>\n",
|
||
" <td>0.14250</td>\n",
|
||
" <td>0.28390</td>\n",
|
||
" <td>0.2414</td>\n",
|
||
" <td>0.10520</td>\n",
|
||
" <td>0.2597</td>\n",
|
||
" <td>0.09744</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>26.50</td>\n",
|
||
" <td>98.87</td>\n",
|
||
" <td>567.7</td>\n",
|
||
" <td>0.2098</td>\n",
|
||
" <td>0.8663</td>\n",
|
||
" <td>0.6869</td>\n",
|
||
" <td>0.2575</td>\n",
|
||
" <td>0.6638</td>\n",
|
||
" <td>0.17300</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>20.29</td>\n",
|
||
" <td>14.34</td>\n",
|
||
" <td>135.10</td>\n",
|
||
" <td>1297.0</td>\n",
|
||
" <td>0.10030</td>\n",
|
||
" <td>0.13280</td>\n",
|
||
" <td>0.1980</td>\n",
|
||
" <td>0.10430</td>\n",
|
||
" <td>0.1809</td>\n",
|
||
" <td>0.05883</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>16.67</td>\n",
|
||
" <td>152.20</td>\n",
|
||
" <td>1575.0</td>\n",
|
||
" <td>0.1374</td>\n",
|
||
" <td>0.2050</td>\n",
|
||
" <td>0.4000</td>\n",
|
||
" <td>0.1625</td>\n",
|
||
" <td>0.2364</td>\n",
|
||
" <td>0.07678</td>\n",
|
||
" <td>0.0</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 31 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" mean radius mean texture mean perimeter mean area mean smoothness \\\n",
|
||
"0 17.99 10.38 122.80 1001.0 0.11840 \n",
|
||
"1 20.57 17.77 132.90 1326.0 0.08474 \n",
|
||
"2 19.69 21.25 130.00 1203.0 0.10960 \n",
|
||
"3 11.42 20.38 77.58 386.1 0.14250 \n",
|
||
"4 20.29 14.34 135.10 1297.0 0.10030 \n",
|
||
"\n",
|
||
" mean compactness mean concavity mean concave points mean symmetry \\\n",
|
||
"0 0.27760 0.3001 0.14710 0.2419 \n",
|
||
"1 0.07864 0.0869 0.07017 0.1812 \n",
|
||
"2 0.15990 0.1974 0.12790 0.2069 \n",
|
||
"3 0.28390 0.2414 0.10520 0.2597 \n",
|
||
"4 0.13280 0.1980 0.10430 0.1809 \n",
|
||
"\n",
|
||
" mean fractal dimension ... worst texture worst perimeter worst area \\\n",
|
||
"0 0.07871 ... 17.33 184.60 2019.0 \n",
|
||
"1 0.05667 ... 23.41 158.80 1956.0 \n",
|
||
"2 0.05999 ... 25.53 152.50 1709.0 \n",
|
||
"3 0.09744 ... 26.50 98.87 567.7 \n",
|
||
"4 0.05883 ... 16.67 152.20 1575.0 \n",
|
||
"\n",
|
||
" worst smoothness worst compactness worst concavity worst concave points \\\n",
|
||
"0 0.1622 0.6656 0.7119 0.2654 \n",
|
||
"1 0.1238 0.1866 0.2416 0.1860 \n",
|
||
"2 0.1444 0.4245 0.4504 0.2430 \n",
|
||
"3 0.2098 0.8663 0.6869 0.2575 \n",
|
||
"4 0.1374 0.2050 0.4000 0.1625 \n",
|
||
"\n",
|
||
" worst symmetry worst fractal dimension target \n",
|
||
"0 0.4601 0.11890 0.0 \n",
|
||
"1 0.2750 0.08902 0.0 \n",
|
||
"2 0.3613 0.08758 0.0 \n",
|
||
"3 0.6638 0.17300 0.0 \n",
|
||
"4 0.2364 0.07678 0.0 \n",
|
||
"\n",
|
||
"[5 rows x 31 columns]"
|
||
]
|
||
},
|
||
"execution_count": 3,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"data.head(5)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"((455, 30), (114, 30))"
|
||
]
|
||
},
|
||
"execution_count": 4,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"X_train, X_test, y_train, y_test = train_test_split(data.drop(labels=['target'], axis=1), \n",
|
||
" data.target, test_size=0.2,\n",
|
||
" random_state=0)\n",
|
||
"X_train.shape, X_test.shape"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Feature Shuffling\n",
|
||
"Permute the values of each feature, one at a time, and measure how much the permutation degrades the performance of the machine learning model (a lower accuracy or roc_auc, or a higher mse).\n",
|
||
"If a variable is important, that is, highly predictive, a random permutation of its values will dramatically worsen any of these metrics."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 17,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"auc_drop, selected_features = feature_shuffle.feature_shuffle_rf(X_train=X_train,\n",
|
||
" y_train=y_train,\n",
|
||
" random_state=0)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>feature</th>\n",
|
||
" <th>auc_drop</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>22</th>\n",
|
||
" <td>worst perimeter</td>\n",
|
||
" <td>8.359457e-05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>27</th>\n",
|
||
" <td>worst concave points</td>\n",
|
||
" <td>3.134796e-05</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>23</th>\n",
|
||
" <td>worst area</td>\n",
|
||
" <td>1.110223e-16</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>12</th>\n",
|
||
" <td>perimeter error</td>\n",
|
||
" <td>1.110223e-16</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>mean radius</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>16</th>\n",
|
||
" <td>concavity error</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>28</th>\n",
|
||
" <td>worst symmetry</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>26</th>\n",
|
||
" <td>worst concavity</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>25</th>\n",
|
||
" <td>worst compactness</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>24</th>\n",
|
||
" <td>worst smoothness</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>21</th>\n",
|
||
" <td>worst texture</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>20</th>\n",
|
||
" <td>worst radius</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>19</th>\n",
|
||
" <td>fractal dimension error</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>18</th>\n",
|
||
" <td>symmetry error</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>17</th>\n",
|
||
" <td>concave points error</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>15</th>\n",
|
||
" <td>compactness error</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>mean texture</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>14</th>\n",
|
||
" <td>smoothness error</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>13</th>\n",
|
||
" <td>area error</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>11</th>\n",
|
||
" <td>texture error</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>10</th>\n",
|
||
" <td>radius error</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>9</th>\n",
|
||
" <td>mean fractal dimension</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>8</th>\n",
|
||
" <td>mean symmetry</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>7</th>\n",
|
||
" <td>mean concave points</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>6</th>\n",
|
||
" <td>mean concavity</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>5</th>\n",
|
||
" <td>mean compactness</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>mean smoothness</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>mean area</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>mean perimeter</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>29</th>\n",
|
||
" <td>worst fractal dimension</td>\n",
|
||
" <td>0.000000e+00</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" feature auc_drop\n",
|
||
"22 worst perimeter 8.359457e-05\n",
|
||
"27 worst concave points 3.134796e-05\n",
|
||
"23 worst area 1.110223e-16\n",
|
||
"12 perimeter error 1.110223e-16\n",
|
||
"0 mean radius 0.000000e+00\n",
|
||
"16 concavity error 0.000000e+00\n",
|
||
"28 worst symmetry 0.000000e+00\n",
|
||
"26 worst concavity 0.000000e+00\n",
|
||
"25 worst compactness 0.000000e+00\n",
|
||
"24 worst smoothness 0.000000e+00\n",
|
||
"21 worst texture 0.000000e+00\n",
|
||
"20 worst radius 0.000000e+00\n",
|
||
"19 fractal dimension error 0.000000e+00\n",
|
||
"18 symmetry error 0.000000e+00\n",
|
||
"17 concave points error 0.000000e+00\n",
|
||
"15 compactness error 0.000000e+00\n",
|
||
"1 mean texture 0.000000e+00\n",
|
||
"14 smoothness error 0.000000e+00\n",
|
||
"13 area error 0.000000e+00\n",
|
||
"11 texture error 0.000000e+00\n",
|
||
"10 radius error 0.000000e+00\n",
|
||
"9 mean fractal dimension 0.000000e+00\n",
|
||
"8 mean symmetry 0.000000e+00\n",
|
||
"7 mean concave points 0.000000e+00\n",
|
||
"6 mean concavity 0.000000e+00\n",
|
||
"5 mean compactness 0.000000e+00\n",
|
||
"4 mean smoothness 0.000000e+00\n",
|
||
"3 mean area 0.000000e+00\n",
|
||
"2 mean perimeter 0.000000e+00\n",
|
||
"29 worst fractal dimension 0.000000e+00"
|
||
]
|
||
},
|
||
"execution_count": 18,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"# we select features that have auc_drop > 0; note that values around 1e-16 are floating-point noise, yet they still pass this threshold\n",
|
||
"auc_drop"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/plain": [
|
||
"22 worst perimeter\n",
|
||
"27 worst concave points\n",
|
||
"23 worst area\n",
|
||
"12 perimeter error\n",
|
||
"Name: feature, dtype: object"
|
||
]
|
||
},
|
||
"execution_count": 19,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"selected_features"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"collapsed": true
|
||
},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.6.1"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|