Files
ml_things/universal_sentence_embedding.ipynb
2018-11-21 22:29:01 -06:00

299 lines
8.1 KiB
Plaintext

{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "universal_sentence_embedding.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/universal_sentence_embedding.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"metadata": {
"id": "tWcVKGkl-hNW",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"### Creating universal sentence embedding"
]
},
{
"metadata": {
"id": "Yh10snyf_UjR",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"### Save (cache) TF Hub modules in a specific folder\n",
"Set the `TFHUB_CACHE_DIR` environment variable before loading a module:\n",
"\n",
"`export TFHUB_CACHE_DIR=/usr/local/my_module_cache`"
]
},
{
"metadata": {
"id": "C15N2A5S_eWU",
"colab_type": "text"
},
"cell_type": "markdown",
"source": [
"[TF Module](https://www.tensorflow.org/hub/modules/google/universal-sentence-encoder-large/3)"
]
},
{
"metadata": {
"id": "MwNfhAcNzBOq",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Install gensim; its KeyedVectors class is used below to load the Word2Vec vectors.\n",
"# %pip (instead of !pip) installs into the environment of the running kernel.\n",
"%pip install -q gensim"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "6LjOOhS_x0bK",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Download Google Word2Vec.\n",
"# The download and extraction are skipped when the extracted file is already\n",
"# present, so re-running this cell neither fetches the archive a second time\n",
"# nor runs gunzip against an existing output file.\n",
"import os\n",
"\n",
"if not os.path.exists(\"GoogleNews-vectors-negative300.bin\"):\n",
"  print(\"\\n\\nDownloading...\")\n",
"  !wget https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n",
"  # Unzip\n",
"  print(\"\\n\\nUnzip...\")\n",
"  !gunzip GoogleNews-vectors-negative300.bin.gz\n",
"# Check File\n",
"print(\"Finished!\")\n",
"!ls"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "B_ReYhPA8Hq9",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Download and extract the zipped sentence-embedding module.\n",
"# -nc: keep an already-downloaded archive instead of fetching a second copy.\n",
"print(\"\\n\\nDownload ZIP...\")\n",
"!wget -nc https://www.dropbox.com/s/6ve2ssmxltagcuy/tf_sentence_emb.zip\n",
"# -n: never overwrite files extracted earlier, so unzip does not stop to ask on a re-run.\n",
"print(\"\\n\\nUnzip...\")\n",
"!unzip -n tf_sentence_emb.zip\n",
"print(\"\\n\\nFinished!\")\n",
"!ls\n"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "xTZFFnDxxv6R",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"import os\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"import tensorflow_hub as hub\n",
"from keras import backend as K\n",
"from keras import layers\n",
"from keras.models import Model\n",
"from gensim.models import KeyedVectors\n",
"\n",
"\n",
"\n",
"# Only used for testing\n",
"def CosineSim(a, b):\n",
"  \"\"\"Cosine similarity of the vectors a and b.\n",
"\n",
"  Computed as dot(a, b) divided by the product of the two Euclidean norms.\n",
"  \"\"\"\n",
"  magnitude = np.linalg.norm(a) * np.linalg.norm(b)\n",
"  return np.dot(a, b) / magnitude"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "fANZCX63z8EV",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Two example sentences compared by both embedding approaches below\n",
"sentence_1, sentence_2 = \"I am hungry\", \"I am starving\""
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "Wk8jIXBVyZdX",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Binary Word2Vec file, read with gensim's KeyedVectors\n",
"path_emb = \"GoogleNews-vectors-negative300.bin\"\n",
"\n",
"print('Loading pre-trained embedding model...')\n",
"word_emb = KeyedVectors.load_word2vec_format(fname=path_emb, binary=True)\n",
"print('Finished!\\n')"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "ipAeCNfmy572",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"def Dan(text, emb_size, emb_model):\n",
"  \"\"\"Average the embeddings of the in-vocabulary words of a sentence.\n",
"\n",
"  Only the averaging step is performed here; no further layers are applied.\n",
"\n",
"  Args:\n",
"    text: sentence, split on whitespace into words.\n",
"    emb_size: size passed to np.zeros for the accumulator (word-vector length).\n",
"    emb_model: word -> vector lookup supporting `word in emb_model` and\n",
"      `emb_model[word]` (e.g. the KeyedVectors loaded above).\n",
"\n",
"  Returns:\n",
"    Float numpy array holding the mean word vector; all zeros when no word\n",
"    of `text` is found in `emb_model`.\n",
"  \"\"\"\n",
"  emb_dan = np.zeros(emb_size, dtype=float)\n",
"  num_words = 0\n",
"  for word in text.split():\n",
"    # membership test on the model itself; the `.vocab` attribute is not\n",
"    # available on gensim >= 4.0\n",
"    if word in emb_model:\n",
"      # add each embedding\n",
"      emb_dan += np.array(emb_model[word], dtype=float)\n",
"      num_words += 1\n",
"  # average embedding; skipped when nothing matched to avoid 0/0 (NaN vector)\n",
"  if num_words > 0:\n",
"    emb_dan = emb_dan / num_words\n",
"  return emb_dan"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "bsB0wjcQz__a",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Averaged word-vector embedding of each example sentence\n",
"sentence_emb_1, sentence_emb_2 = (\n",
"  Dan(sentence, emb_size=300, emb_model=word_emb)\n",
"  for sentence in (sentence_1, sentence_2)\n",
")\n",
"\n",
"# Similarity of the two averaged embeddings (shown as the cell output)\n",
"CosineSim(sentence_emb_1, sentence_emb_2)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "K_WSCr7u0vnU",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"path_hub = \"tf_sentence_emb\"\n",
"\n",
"# load pre-trained model\n",
"embed = hub.Module(path_hub)\n",
"\n",
"\n",
"# keras model\n",
"def UniversalEmbedding(x):\n",
"  \"\"\"Embed a batch of sentences with the loaded TF Hub module.\n",
"\n",
"  x is the (batch, 1) string input of the Keras model built below. Only\n",
"  axis 1 is squeezed: tf.squeeze without an axis would also remove the batch\n",
"  axis when the batch holds a single sentence, producing a scalar instead\n",
"  of a 1-D batch of strings.\n",
"\n",
"  Returns the \"default\" output of the module's \"default\" signature.\n",
"  \"\"\"\n",
"  sentences = tf.squeeze(tf.cast(x, tf.string), axis=[1])\n",
"  return embed(sentences, signature=\"default\", as_dict=True)[\"default\"]\n",
"\n",
"\n",
"# input layer\n",
"input_text = layers.Input(shape=(1,), dtype=tf.string)\n",
"\n",
"# embedding layer output\n",
"embedding = layers.Lambda(UniversalEmbedding,\n",
"                          output_shape=(512,))(input_text)\n",
"\n",
"# Add here rest of your model if needed\n",
"\n",
"# assemble model\n",
"model = Model(inputs=[input_text], outputs=embedding)\n",
"\n",
"# compile model\n",
"model.compile(loss='categorical_crossentropy',\n",
"              optimizer='adam', metrics=['accuracy'])\n",
"\n",
"# show model architecture; summary() prints by itself and returns None,\n",
"# so wrapping it in print() would add a stray \"None\" line\n",
"model.summary()"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "-2As3hd4005u",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
"# Batch holding the two example sentences\n",
"in_txt = np.array([sentence_1, sentence_2])\n",
"emb_txt = None\n",
"\n",
"with tf.Session() as session:\n",
"  # make Keras use this session, then run the variable and table\n",
"  # initializers before predicting\n",
"  K.set_session(session)\n",
"  session.run(tf.global_variables_initializer())\n",
"  session.run(tf.tables_initializer())\n",
"  emb_txt = model.predict(in_txt)\n",
"\n",
"# one embedding row per input sentence\n",
"sentence_emb_1, sentence_emb_2 = emb_txt[0], emb_txt[1]\n",
"\n",
"CosineSim(sentence_emb_1, sentence_emb_2)"
],
"execution_count": 0,
"outputs": []
},
{
"metadata": {
"id": "HzWSDFpx009J",
"colab_type": "code",
"colab": {}
},
"cell_type": "code",
"source": [
""
],
"execution_count": 0,
"outputs": []
}
]
}