Created using Colaboratory

This commit is contained in:
George Mihaila
2018-10-22 13:08:30 -05:00
parent d121f85ed2
commit d8f87140a6

397
label_encoding.ipynb Normal file
View File

@@ -0,0 +1,397 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "label_encoding.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"[View in Colaboratory](https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/label_encoding.ipynb)"
]
},
{
"metadata": {
"id": "z5t9TWqa8lbK",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"## Data label encoding"
]
},
{
"metadata": {
"id": "kyGI-kk38hIB",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.pipeline import Pipeline\n",
"\n",
"class MultiColumnLabelEncoder:\n",
"    '''\n",
"    Label-encode several DataFrame columns in one call.\n",
"\n",
"    Every selected column goes through its own fresh LabelEncoder, so the\n",
"    integer codes of one column are unrelated to those of another.\n",
"\n",
"    NOTE: no encoder is stored. Each transform() call re-learns the mapping\n",
"    from the data it receives, so two different frames can end up with\n",
"    different codes for the same label.\n",
"    '''\n",
"\n",
"    def __init__(self, columns=None):\n",
"        # Column names to encode; None means encode every column.\n",
"        self.columns = columns\n",
"\n",
"    def fit(self, X, y=None):\n",
"        '''No-op kept so that fit(...).transform(...) can be chained; returns self.'''\n",
"        return self\n",
"\n",
"    def transform(self, X):\n",
"        '''\n",
"        Return a copy of X whose selected columns are label-encoded.\n",
"\n",
"        Columns listed in self.columns are encoded; when self.columns is\n",
"        None every column of X is encoded. X is expected to be a pandas\n",
"        DataFrame; the encoding is written to a copy made with X.copy().\n",
"        '''\n",
"        output = X.copy()\n",
"        if self.columns is not None:\n",
"            for col in self.columns:\n",
"                output[col] = LabelEncoder().fit_transform(output[col])\n",
"        else:\n",
"            # DataFrame.iteritems() was removed in pandas 2.0; items() yields\n",
"            # the same (name, Series) pairs on both old and new versions.\n",
"            for colname, col in output.items():\n",
"                output[colname] = LabelEncoder().fit_transform(col)\n",
"        return output\n",
"\n",
"    def fit_transform(self, X, y=None):\n",
"        '''Convenience wrapper: fit(X, y) followed by transform(X).'''\n",
"        return self.fit(X, y).transform(X)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "EWwTdWYk8te8",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"from IPython.display import clear_output\n",
"\n",
"print(\"Downloading toy data\")\n",
"# -nc (no-clobber): skip the download when the CSV is already present, so\n",
"# re-running this cell does not leave numbered duplicates (*.csv.1, ...).\n",
"!wget -nc https://www.openml.org/data/get_csv/1751/BayesianNetworkGenerator_breast-cancer_small.csv\n",
"# Drop wget's log (and the message above) from the cell output.\n",
"clear_output()\n",
"\n",
"# List the working directory to confirm the file is there.\n",
"!ls"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "mhqihPqw8458",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"outputId": "4c2b69e1-69cc-4bcb-b4dd-9d4c3227e33f"
},
"cell_type": "code",
"source": [
"import pandas as pd\n",
"\n",
"# Read the CSV fetched by the download cell\n",
"path_file = 'BayesianNetworkGenerator_breast-cancer_small.csv'\n",
"df = pd.read_csv(path_file)\n",
"\n",
"# Keep the column names around as a plain list\n",
"columns = df.columns.tolist()\n",
"\n",
"# Preview the first rows\n",
"df.head()"
],
"execution_count": 5,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>menopause</th>\n",
" <th>tumor-size</th>\n",
" <th>inv-nodes</th>\n",
" <th>node-caps</th>\n",
" <th>deg-malig</th>\n",
" <th>breast</th>\n",
" <th>breast-quad</th>\n",
" <th>irradiat</th>\n",
" <th>Class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>40-49</td>\n",
" <td>premeno</td>\n",
" <td>25-29</td>\n",
" <td>24-26</td>\n",
" <td>yes</td>\n",
" <td>3</td>\n",
" <td>right</td>\n",
" <td>central</td>\n",
" <td>no</td>\n",
" <td>recurrence-events</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>40-49</td>\n",
" <td>premeno</td>\n",
" <td>25-29</td>\n",
" <td>0-2</td>\n",
" <td>no</td>\n",
" <td>2</td>\n",
" <td>left</td>\n",
" <td>left_low</td>\n",
" <td>no</td>\n",
" <td>no-recurrence-events</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>60-69</td>\n",
" <td>lt40</td>\n",
" <td>25-29</td>\n",
" <td>0-2</td>\n",
" <td>no</td>\n",
" <td>2</td>\n",
" <td>right</td>\n",
" <td>left_low</td>\n",
" <td>no</td>\n",
" <td>no-recurrence-events</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>40-49</td>\n",
" <td>ge40</td>\n",
" <td>10-14</td>\n",
" <td>0-2</td>\n",
" <td>no</td>\n",
" <td>3</td>\n",
" <td>left</td>\n",
" <td>left_low</td>\n",
" <td>no</td>\n",
" <td>no-recurrence-events</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>40-49</td>\n",
" <td>premeno</td>\n",
" <td>5-9</td>\n",
" <td>0-2</td>\n",
" <td>no</td>\n",
" <td>1</td>\n",
" <td>right</td>\n",
" <td>central</td>\n",
" <td>no</td>\n",
" <td>no-recurrence-events</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age menopause tumor-size inv-nodes node-caps deg-malig breast \\\n",
"0 40-49 premeno 25-29 24-26 yes 3 right \n",
"1 40-49 premeno 25-29 0-2 no 2 left \n",
"2 60-69 lt40 25-29 0-2 no 2 right \n",
"3 40-49 ge40 10-14 0-2 no 3 left \n",
"4 40-49 premeno 5-9 0-2 no 1 right \n",
"\n",
" breast-quad irradiat Class \n",
"0 central no recurrence-events \n",
"1 left_low no no-recurrence-events \n",
"2 left_low no no-recurrence-events \n",
"3 left_low no no-recurrence-events \n",
"4 central no no-recurrence-events "
]
},
"metadata": {
"tags": []
},
"execution_count": 5
}
]
},
{
"metadata": {
"id": "AfA-mUji9Lef",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 204
},
"outputId": "de452b05-3d05-4be9-8d69-3f2b5c8d901c"
},
"cell_type": "code",
"source": [
"# Encode every listed column as integer class ids\n",
"df_enc = MultiColumnLabelEncoder(\n",
"    columns=columns,\n",
").fit_transform(df)\n",
"\n",
"# Preview the encoded rows\n",
"df_enc.head()"
],
"execution_count": 6,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>menopause</th>\n",
" <th>tumor-size</th>\n",
" <th>inv-nodes</th>\n",
" <th>node-caps</th>\n",
" <th>deg-malig</th>\n",
" <th>breast</th>\n",
" <th>breast-quad</th>\n",
" <th>irradiat</th>\n",
" <th>Class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>5</td>\n",
" <td>1</td>\n",
" <td>4</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>3</td>\n",
" <td>2</td>\n",
" <td>9</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age menopause tumor-size inv-nodes node-caps deg-malig breast \\\n",
"0 3 2 4 5 1 2 1 \n",
"1 3 2 4 0 0 1 0 \n",
"2 5 1 4 0 0 1 1 \n",
"3 3 0 1 0 0 2 0 \n",
"4 3 2 9 0 0 0 1 \n",
"\n",
" breast-quad irradiat Class \n",
"0 0 0 1 \n",
"1 1 0 0 \n",
"2 1 0 0 \n",
"3 1 0 0 \n",
"4 0 0 0 "
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
}
]
}