Created using Colaboratory

2021-10-04 01:29:04 +03:00 · 2018-05-27 15:04:17 -05:00
parent bc6fc163bc
commit 8d1a11cba5
1 changed files with 106 additions and 0 deletions
--- a/keras_tokenizer_fix.ipynb
+++ b/keras_tokenizer_fix.ipynb
@@ -0,0 +1,106 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "keras_tokenizer_fix.ipynb",
+      "version": "0.3.2",
+      "provenance": [],
+      "collapsed_sections": []
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "view-in-github",
+        "colab_type": "text"
+      },
+      "source": [
+        "[View in Colaboratory](https://colab.research.google.com/github/gmihaila/deep_learning_toolbox/blob/master/keras_tokenizer_fix.ipynb)"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "NRdntd0DKAqB",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "## Probably my most useful tool: How to fix the Keras tokenizer to speed things up\n",
+        "\n",
+        "by GeorgeM"
+      ]
+    },
+    {
+      "metadata": {
+        "id": "9gQhQzeeJ-49",
+        "colab_type": "code",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 204
+        },
+        "outputId": "dbd05f30-af4f-4c2f-ab24-66c056d380dc"
+      },
+      "cell_type": "code",
+      "source": [
+        "from keras.preprocessing.text import Tokenizer\n",
+        "import string\n",
+        "\n",
+        "# Number of most frequent words to keep (word_index values start at 1).\n",
+        "n_words = 3\n",
+        "texts = [\"my name is far faraway asdasd\", \"my name is\",\"your name is\"]\n",
+        "\n",
+        "# num_words is n_words+1 so that indices 1..n_words survive texts_to_sequences.\n",
+        "tk = Tokenizer(num_words=(n_words+1), filters=string.punctuation, lower=True,\n",
+        "               split=' ', char_level=False, oov_token='<unk>')\n",
+        "tk.fit_on_texts(texts)\n",
+        "\n",
+        "print('Original text: %s\\n'%texts)\n",
+        "print('Only use top %s words\\n\\n'%n_words)\n",
+        "print(tk.word_index)\n",
+        "print('%s  <-WRONG!'%tk.texts_to_sequences(texts))\n",
+        "print('\\n')\n",
+        "\n",
+        "## **Key Step**: keep only the top n_words entries, then give the OOV token the next index.\n",
+        "top_vocab = dict((word, rank) for word, rank in tk.word_index.items() if rank <= n_words) # <= because tokenizer is 1 indexed\n",
+        "top_vocab[tk.oov_token] = n_words + 1\n",
+        "tk.word_index = top_vocab\n",
+        "\n",
+        "print(tk.word_index)\n",
+        "print('%s  <-RIGHT!'%tk.texts_to_sequences(texts))\n"
+      ],
+      "execution_count": 21,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Original text: ['my name is far faraway asdasd', 'my name is', 'your name is']\n",
+            "\n",
+            "Only use top 3 words\n",
+            "\n",
+            "\n",
+            "{'name': 1, 'far': 4, 'is': 2, 'asdasd': 6, 'faraway': 5, 'my': 3, 'your': 7, '<unk>': 8}\n",
+            "[[3, 1, 2], [3, 1, 2], [1, 2]]  <-WRONG!\n",
+            "\n",
+            "\n",
+            "{'is': 2, 'my': 3, 'name': 1, '<unk>': 4}\n",
+            "[[3, 1, 2, 4, 4, 4], [3, 1, 2], [4, 1, 2]]  <-RIGHT!\n"
+          ],
+          "name": "stdout"
+        }
+      ]
+    },
+    {
+      "metadata": {
+        "id": "bchfX1IMME5d",
+        "colab_type": "text"
+      },
+      "cell_type": "markdown",
+      "source": [
+        "Thank you, [se4u](https://github.com/se4u).\n",
+        "See also: https://github.com/keras-team/keras/issues/8092"
+      ]
+    }
+  ]
+}