Amazing-Feature-Engineering/3.1_Demo_Feature_Scaling.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "# import seaborn as sns\n",
    "# import matplotlib.pyplot as plt\n",
    "import os\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# plt.style.use('seaborn-colorblind')\n",
    "# %matplotlib inline\n",
    "#from feature_cleaning import rare_values as ra"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "use_cols = [\n",
    "    'Pclass', 'Sex', 'Age', 'Fare', 'SibSp',\n",
    "    'Survived'\n",
    "]\n",
    "\n",
    "data = pd.read_csv('./data/titanic.csv', usecols=use_cols)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Survived</th>\n",
       "      <th>Pclass</th>\n",
       "      <th>Sex</th>\n",
       "      <th>Age</th>\n",
       "      <th>SibSp</th>\n",
       "      <th>Fare</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>male</td>\n",
       "      <td>22.0</td>\n",
       "      <td>1</td>\n",
       "      <td>7.2500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>female</td>\n",
       "      <td>38.0</td>\n",
       "      <td>1</td>\n",
       "      <td>71.2833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>female</td>\n",
       "      <td>26.0</td>\n",
       "      <td>0</td>\n",
       "      <td>7.9250</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Survived  Pclass     Sex   Age  SibSp     Fare\n",
       "0         0       3    male  22.0      1   7.2500\n",
       "1         1       1  female  38.0      1  71.2833\n",
       "2         1       3  female  26.0      0   7.9250"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((623, 6), (268, 6))"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Note that we include target variable in the X_train \n",
    "# because we need it to supervise our discretization\n",
    "# this is not the standard way of using train-test-split\n",
    "X_train, X_test, y_train, y_test = train_test_split(data, data.Survived, test_size=0.3,\n",
    "                                                    random_state=0)\n",
    "X_train.shape, X_test.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Normalization - Standardization (Z-score scaling)\n",
    "\n",
    "removes the mean and scales the data to unit variance.<br />z = (X - X.mean) /  std"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     Survived  Pclass     Sex   Age  SibSp     Fare  Fare_zscore\n",
      "857         1       1    male  51.0      0  26.5500    -0.122530\n",
      "52          1       1  female  49.0      1  76.7292     0.918124\n",
      "386         0       3    male   1.0      5  46.9000     0.299503\n",
      "124         0       1    male  54.0      0  77.2875     0.929702\n",
      "578         0       3  female   NaN      1  14.4583    -0.373297\n",
      "549         1       2    male   8.0      1  36.7500     0.089005\n"
     ]
    }
   ],
   "source": [
    "# add the new created feature\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "ss = StandardScaler().fit(X_train[['Fare']])\n",
    "X_train_copy = X_train.copy(deep=True)\n",
    "X_train_copy['Fare_zscore'] = ss.transform(X_train_copy[['Fare']])\n",
    "print(X_train_copy.head(6))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5.916437306188636e-17\n",
      "1.0008035356861\n"
     ]
    }
   ],
   "source": [
    "# check if it is with mean=0 std=1\n",
    "print(X_train_copy['Fare_zscore'].mean())\n",
    "print(X_train_copy['Fare_zscore'].std())\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Min-Max scaling\n",
    "transforms features by scaling each feature to a given range. Default to [0,1].<br />X_scaled = (X - X.min / (X.max - X.min)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     Survived  Pclass     Sex   Age  SibSp     Fare  Fare_minmax\n",
      "857         1       1    male  51.0      0  26.5500     0.051822\n",
      "52          1       1  female  49.0      1  76.7292     0.149765\n",
      "386         0       3    male   1.0      5  46.9000     0.091543\n",
      "124         0       1    male  54.0      0  77.2875     0.150855\n",
      "578         0       3  female   NaN      1  14.4583     0.028221\n",
      "549         1       2    male   8.0      1  36.7500     0.071731\n"
     ]
    }
   ],
   "source": [
    "# add the new created feature\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "mms = MinMaxScaler().fit(X_train[['Fare']])\n",
    "X_train_copy = X_train.copy(deep=True)\n",
    "X_train_copy['Fare_minmax'] = mms.transform(X_train_copy[['Fare']])\n",
    "print(X_train_copy.head(6))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.0\n",
      "0.0\n"
     ]
    }
   ],
   "source": [
    "# check the range of Fare_minmax\n",
    "print(X_train_copy['Fare_minmax'].max())\n",
    "print(X_train_copy['Fare_minmax'].min())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": true
   },
   "source": [
    "## Robust scaling\n",
    "removes the median and scales the data according to the quantile range (defaults to IQR)<br />X_scaled = (X - X.median) / IQR"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     Survived  Pclass     Sex   Age  SibSp     Fare  Fare_robust\n",
      "857         1       1    male  51.0      0  26.5500     0.492275\n",
      "52          1       1  female  49.0      1  76.7292     2.630973\n",
      "386         0       3    male   1.0      5  46.9000     1.359616\n",
      "124         0       1    male  54.0      0  77.2875     2.654768\n",
      "578         0       3  female   NaN      1  14.4583    -0.023088\n",
      "549         1       2    male   8.0      1  36.7500     0.927011\n"
     ]
    }
   ],
   "source": [
    "# add the new created feature\n",
    "from sklearn.preprocessing import RobustScaler\n",
    "rs = RobustScaler().fit(X_train[['Fare']])\n",
    "X_train_copy = X_train.copy(deep=True)\n",
    "X_train_copy['Fare_robust'] = rs.transform(X_train_copy[['Fare']])\n",
    "print(X_train_copy.head(6))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}