{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "keras_generator.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": []
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"[View in Colaboratory](https://colab.research.google.com/github/gmihaila/deep_learning_toolbox/blob/master/keras_generator.ipynb)"
]
},
{
"metadata": {
"id": "g8LvSrDRLJc9",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"### Use data Generator in Keras\n",
"\n",
"by GeorgeM."
]
},
{
"metadata": {
"id": "pEF8oMXXLIkx",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 212
},
"outputId": "3f519e62-695b-497b-8c92-3016932bd6f7"
},
"cell_type": "code",
"source": [
"!wget https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"--2018-05-28 19:23:55-- https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv\r\n",
"Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...\r\n",
"Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 23278 (23K) [text/plain]\n",
"Saving to: pima-indians-diabetes.data.csv\n",
"\n",
"pima-indians-diabet 100%[===================>] 22.73K --.-KB/s in 0.01s \n",
"\n",
"2018-05-28 19:23:55 (1.58 MB/s) - pima-indians-diabetes.data.csv saved [23278/23278]\n",
"\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "d0iOq0HYLkPu",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "2a5df017-04dc-49a9-e4ea-d35f14b74dc9"
},
"cell_type": "code",
"source": [
"ls"
],
"execution_count": 14,
"outputs": [
{
"output_type": "stream",
"text": [
"\u001b[0m\u001b[01;34mdatalab\u001b[0m/ pima-indians-diabetes.data.csv\r\n"
],
"name": "stdout"
}
]
},
{
"metadata": {
"id": "MfFKQB9oLa4n",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 52
},
"outputId": "d312ce22-099a-4d71-84db-fd93cdcc8079"
},
"cell_type": "code",
"source": [
"# Create your first MLP in Keras\n",
"from keras.models import Sequential\n",
"from keras.layers import Dense\n",
"import time\n",
"import numpy as np\n",
"\n",
"# fix random seed for reproducibility\n",
"np.random.seed(7)\n",
"# load pima indians dataset\n",
"dataset = np.loadtxt('pima-indians-diabetes.data.csv', delimiter=\",\")\n",
"# split into input (X) and output (Y) variables\n",
"X = dataset[:,0:8]\n",
"Y = dataset[:,8]\n",
"data_size = len(X)\n",
"print 'Data size: ', data_size\n",
"\n",
"data = []\n",
"for x,y in zip(X,Y):\n",
" data.append([x,y])\n",
"print 'Data created'\n",
"\n",
"# create model\n",
"model = Sequential()\n",
"model.add(Dense(12, input_dim=8, activation='relu'))\n",
"model.add(Dense(8, activation='relu'))\n",
"model.add(Dense(1, activation='sigmoid'))\n",
"# Compile model\n",
"model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\n",
"\n",
"\n",
"def gen(data, mini_batch, n_epochs):\n",
" print('Generator initiated!\\n')\n",
" # read or split data\n",
" data_size = len(data)\n",
" x = [] # input batch examples\n",
" y = [] # output batch examples\n",
" epoch = 0\n",
" \n",
" while epoch < n_epochs: # <- VERY IMPORTANT\n",
" for index, pair in enumerate(data):\n",
" # append data \n",
" x.append(pair[0])\n",
" y.append(pair[1])\n",
"\n",
" if (index+1)%mini_batch == 0:\n",
" x = np.array(x)\n",
" y = np.array(y)\n",
" # output data\n",
" yield x, y\n",
" x = []; y = []\n",
" \n",
" epoch += 1\n",
" return"
],
"execution_count": 22,
"outputs": [
{
"output_type": "stream",
"text": [
"Data size: 768\n",
"Data created\n"
],
"name": "stdout"
}
]
},
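{
"metadata": {},
"cell_type": "markdown",
"source": [
"*Added sketch (not part of the original notebook):* pull a single mini-batch from `gen` to confirm it yields `(x_batch, y_batch)` NumPy arrays of the expected shape before handing a fresh generator to `fit_generator`. The batch size of 32 is an arbitrary value chosen for illustration."
]
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Sanity check (added sketch): draw one mini-batch from a throwaway generator instance\n",
"demo_gen = gen(data, mini_batch=32, n_epochs=1)\n",
"x_batch, y_batch = next(demo_gen)\n",
"print(x_batch.shape, y_batch.shape)  # e.g. (32, 8) (32,)"
],
"execution_count": 0,
"outputs": []
},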
{
"metadata": {
"id": "XMaurYBBZFw8",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 514
},
"outputId": "f9bc62b7-ac12-4299-de0f-343e7fd7139d"
},
"cell_type": "code",
"source": [
"tr_gen = gen(data, 1, n_epochs=10)\n",
"s = time.time()\n",
"model.fit_generator(generator=tr_gen, steps_per_epoch=768, epochs=10, max_queue_size=0, workers=0, verbose=2)\n",
"e = time.time()\n",
"print '\\n\\n Generator: ',(e-s)\n",
"\n",
"\n",
"# steps_per_epoch: batches of examples/examples form Generator to finish 1 epoch\n",
"# max_queue_size: Integer. Maximum size for the generator queue. Default 10.\n",
"# workers: Integer. Maximum number of processes to spin up when using process-based threading. Default to 1. \n",
" # 0, will execute the generator on the main thread.\n",
"# epochs: number of times model sees data\n",
"\n",
"\n",
"\n",
"# Fit the model\n",
"# s = time.time()\n",
"# model.fit(X, Y, epochs=1, batch_size=1)\n",
"# e = time.time()\n",
"# print '\\n\\n Generator: ',(e-s)\n",
"\n",
"# evaluate the model\n",
"\n",
"scores = model.evaluate(X, Y)\n",
"print(\"\\n%s: %.2f%%\" % (model.metrics_names[1], scores[1]*100))\n",
"\n",
"del model"
],
"execution_count": 23,
"outputs": [
{
"output_type": "stream",
"text": [
"Epoch 1/10\n",
"Generator initiated!\n",
"\n",
" - 2s - loss: 1.7948 - acc: 0.6068\n",
"Epoch 2/10\n",
" - 1s - loss: 0.7456 - acc: 0.6302\n",
"Epoch 3/10\n",
" - 1s - loss: 0.7028 - acc: 0.6576\n",
"Epoch 4/10\n",
" - 1s - loss: 0.6732 - acc: 0.6576\n",
"Epoch 5/10\n",
" - 1s - loss: 0.6522 - acc: 0.6836\n",
"Epoch 6/10\n",
" - 1s - loss: 0.6369 - acc: 0.6732\n",
"Epoch 7/10\n",
" - 1s - loss: 0.6223 - acc: 0.6901\n",
"Epoch 8/10\n",
" - 1s - loss: 0.6152 - acc: 0.6823\n",
"Epoch 9/10\n",
" - 1s - loss: 0.6047 - acc: 0.7031\n",
"Epoch 10/10\n",
" - 1s - loss: 0.5941 - acc: 0.7044\n",
"\n",
"\n",
" Generator: 11.5527789593\n",
"768/768 [==============================] - 0s 176us/step\n",
"\n",
"acc: 68.10%\n"
],
"name": "stdout"
}
]
},
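{
"metadata": {},
"cell_type": "markdown",
"source": [
"*Added sketch (not part of the original notebook):* the same batching idea expressed with `keras.utils.Sequence`. A `Sequence` is indexed rather than iterated, so Keras can infer the number of steps per epoch from `len()` and can safely use multiple workers. The class below is a hypothetical illustration, not code from the original demo; the `fit_generator` call is left commented out because `model` is deleted at the end of the previous cell."
]
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Alternative (added sketch): index-based batching with keras.utils.Sequence\n",
"from keras.utils import Sequence\n",
"\n",
"class DiabetesSequence(Sequence):  # hypothetical helper, not in the original notebook\n",
"    def __init__(self, X, Y, batch_size):\n",
"        self.X, self.Y, self.batch_size = X, Y, batch_size\n",
"\n",
"    def __len__(self):\n",
"        # number of batches per epoch\n",
"        return int(np.ceil(len(self.X) / float(self.batch_size)))\n",
"\n",
"    def __getitem__(self, idx):\n",
"        # return one mini-batch by index\n",
"        lo = idx * self.batch_size\n",
"        hi = lo + self.batch_size\n",
"        return self.X[lo:hi], self.Y[lo:hi]\n",
"\n",
"# seq = DiabetesSequence(X, Y, batch_size=32)\n",
"# model.fit_generator(seq, epochs=10, verbose=2)  # steps per epoch inferred from len(seq)"
],
"execution_count": 0,
"outputs": []
},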
{
"metadata": {
"id": "A-KDhE9Forgg",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# prediciton generator some code\n",
"\n",
"def TestGenerator(path_data, max_queue_size, n_articles):\n",
" x = [] # input batch examples\n",
" \n",
" index = 1\n",
" epoch = 0\n",
"\n",
" with (open('%s.pickle'%path_data, \"rb\")) as openfile:\n",
" while True:\n",
" try:\n",
" ID, content, title, media, source, published = pickle.load(openfile)\n",
" x.append(content)\n",
"\n",
" if ((index)%max_queue_size == 0) or ((n_articles - index) < max_queue_size):\n",
" x = np.array(x)\n",
" # output data\n",
" yield x\n",
" x = []\n",
"\n",
" index += 1\n",
" except EOFError:\n",
" break\n",
" \n",
"test_gen = TestGenerator(path_data='test', max_queue_size=2, n_articles=n_test) \n",
"\n",
"predict = model.predict_generator(generator=test_gen, steps=5, max_queue_size=10, workers=1)\n",
"print predict[0]"
],
"execution_count": 0,
"outputs": []
}
]
}