mirror of
https://github.com/gmihaila/ml_things.git
synced 2021-10-04 01:29:04 +03:00
Created using Colaboratory
This commit is contained in:
106
keras_tokenizer_fix.ipynb
Normal file
106
keras_tokenizer_fix.ipynb
Normal file
@@ -0,0 +1,106 @@
|
||||
{
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 0,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"name": "keras_tokenizer_fix.ipynb",
|
||||
"version": "0.3.2",
|
||||
"provenance": [],
|
||||
"collapsed_sections": []
|
||||
}
|
||||
},
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {
|
||||
"id": "view-in-github",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"source": [
|
||||
"[View in Colaboratory](https://colab.research.google.com/github/gmihaila/deep_learning_toolbox/blob/master/keras_tokenizer_fix.ipynb)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "NRdntd0DKAqB",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"## Probably my most useful tool: How to fix the Keras tokenizer to speed things up\n",
|
||||
"\n",
|
||||
"by GeorgeM"
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "9gQhQzeeJ-49",
|
||||
"colab_type": "code",
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
"height": 204
|
||||
},
|
||||
"outputId": "dbd05f30-af4f-4c2f-ab24-66c056d380dc"
|
||||
},
|
||||
"cell_type": "code",
|
||||
"source": [
|
||||
"from keras.preprocessing.text import Tokenizer\n",
|
||||
"import string\n",
|
||||
"\n",
|
||||
"n_words = 3\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"tk = Tokenizer(num_words=(n_words+1), filters=string.punctuation, lower=True, split=' ', char_level=False, oov_token='<unk>')\n",
|
||||
"texts = [\"my name is far faraway asdasd\", \"my name is\",\"your name is\"]\n",
|
||||
"tk.fit_on_texts(texts)\n",
|
||||
"\n",
|
||||
"print('Original text: %s\\n'%texts)\n",
|
||||
"print('Only use top %s words\\n\\n'%n_words)\n",
|
||||
"\n",
|
||||
"print(tk.word_index)\n",
|
||||
"print('%s <-WRONG!'%tk.texts_to_sequences(texts))\n",
|
||||
"\n",
|
||||
"print('\\n')\n",
|
||||
"\n",
|
||||
"## **Key Step**\n",
|
||||
"tk.word_index = {e:i for e,i in tk.word_index.items() if i <= n_words} # <= because tokenizer is 1 indexed\n",
|
||||
"tk.word_index[tk.oov_token] = n_words + 1\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"print(tk.word_index)\n",
|
||||
"print('%s <-RIGHT!'%tk.texts_to_sequences(texts))\n"
|
||||
],
|
||||
"execution_count": 21,
|
||||
"outputs": [
|
||||
{
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Original text: ['my name is far faraway asdasd', 'my name is', 'your name is']\n",
|
||||
"\n",
|
||||
"Only use top 3 words\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"{'name': 1, 'far': 4, 'is': 2, 'asdasd': 6, 'faraway': 5, 'my': 3, 'your': 7, '<unk>': 8}\n",
|
||||
"[[3, 1, 2], [3, 1, 2], [1, 2]] <-WRONG!\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"{'is': 2, 'my': 3, 'name': 1, '<unk>': 4}\n",
|
||||
"[[3, 1, 2, 4, 4, 4], [3, 1, 2], [4, 1, 2]] <-RIGHT!\n"
|
||||
],
|
||||
"name": "stdout"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"metadata": {
|
||||
"id": "bchfX1IMME5d",
|
||||
"colab_type": "text"
|
||||
},
|
||||
"cell_type": "markdown",
|
||||
"source": [
|
||||
"Thank you, [se4u](https://github.com/se4u)!\n\n",
|
||||
"See also: [keras-team/keras issue #8092](https://github.com/keras-team/keras/issues/8092)"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user