ml_things/universal_sentence_embedding.ipynb

{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "universal_sentence_embedding.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/gmihaila/machine_learning_toolbox/blob/master/universal_sentence_embedding.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "metadata": {
        "id": "tWcVKGkl-hNW",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "### Creating universal sentence embeddings"
      ]
    },
    {
      "metadata": {
        "id": "Yh10snyf_UjR",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "### Make TF Hub save (cache) downloaded modules in a specific folder\n",
        "`export TFHUB_CACHE_DIR=/usr/local/my_module_cache`"
      ]
    },
    {
      "metadata": {
        "id": "C15N2A5S_eWU",
        "colab_type": "text"
      },
      "cell_type": "markdown",
      "source": [
        "[TF Module](https://www.tensorflow.org/hub/modules/google/universal-sentence-encoder-large/3)"
      ]
    },
    {
      "metadata": {
        "id": "MwNfhAcNzBOq",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "# gensim provides KeyedVectors, used below to load the word2vec vectors\n",
        "!pip install gensim\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "6LjOOhS_x0bK",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "# Download Google Word2Vec\n",
        "# Both shell steps are skipped once the unpacked .bin is present, so\n",
        "# re-running the notebook does not fetch the large archive again and\n",
        "# gunzip is never asked to overwrite an existing output file.\n",
        "# `wget -c` resumes a partially downloaded archive instead of restarting.\n",
        "print(\"\\n\\nDownloading...\")\n",
        "!test -f GoogleNews-vectors-negative300.bin || wget -c https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n",
        "# Unzip \n",
        "print(\"\\n\\nUnzip...\")\n",
        "!test -f GoogleNews-vectors-negative300.bin || gunzip GoogleNews-vectors-negative300.bin.gz\n",
        "# Check File\n",
        "print(\"Finsihed!\")\n",
        "!ls"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "B_ReYhPA8Hq9",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "# Fetch and unpack the saved TF Hub sentence-encoder module.\n",
        "# -nc / -n keep the cell re-runnable: an archive that is already present\n",
        "# is not downloaded a second time, and files that were already extracted\n",
        "# are left untouched instead of unzip asking whether to replace them.\n",
        "print(\"\\n\\nDownload ZIP...\")\n",
        "!wget -nc https://www.dropbox.com/s/6ve2ssmxltagcuy/tf_sentence_emb.zip\n",
        "print(\"\\n\\nUnzip...\")\n",
        "!unzip -n tf_sentence_emb.zip\n",
        "print(\"\\n\\nFinished!\")\n",
        "!ls\n"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "xTZFFnDxxv6R",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "import os\n",
        "import numpy as np\n",
        "import tensorflow as tf\n",
        "import tensorflow_hub as hub\n",
        "from keras import backend as K\n",
        "from keras import layers\n",
        "from keras.models import Model\n",
        "from gensim.models import KeyedVectors\n",
        "\n",
        "\n",
        "\n",
        "# Onlye used for testing\n",
        "def CosineSim(a, b):\n",
        "    \"\"\"Takes 2 vectors a, b and returns the cosine similarity according \n",
        "      to the definition of the dot product\n",
        "    \"\"\"\n",
        "    dot_product = np.dot(a, b)\n",
        "    norm_a = np.linalg.norm(a)\n",
        "    norm_b = np.linalg.norm(b)\n",
        "\n",
        "    return dot_product / (norm_a * norm_b)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "fANZCX63z8EV",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "# Example sentence pair whose embeddings are compared in the cells below\n",
        "sentence_1 = \"I am hungry\"\n",
        "sentence_2 = \"I am starving\""
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "Wk8jIXBVyZdX",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "# File name of the unpacked word2vec binary produced by the gunzip step above\n",
        "path_emb = \"GoogleNews-vectors-negative300.bin\"\n",
        "\n",
        "print('Loading pre-trained embedding model...')\n",
        "# binary=True: the file is read as the binary word2vec format\n",
        "word_emb = KeyedVectors.load_word2vec_format(path_emb, binary=True)\n",
        "print('Finished!\\n')"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "ipAeCNfmy572",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "'''\n",
        "  Deep Averaging Network Fucntion\n",
        "'''\n",
        "\n",
        "def Dan(text, emb_size, emb_model):\n",
        "    emb_dan = np.zeros(emb_size, dtype=float)\n",
        "    num_words = 0\n",
        "    for word in text.split():\n",
        "        if word in emb_model.vocab:\n",
        "            # add each embedding\n",
        "            emb_dan += np.array(emb_model[word], dtype=float)\n",
        "            num_words += 1\n",
        "    # average embedding\n",
        "    emb_dan = emb_dan/num_words\n",
        "    return emb_dan"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "bsB0wjcQz__a",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "# Embed both example sentences with the averaged word2vec vectors,\n",
        "# then display how similar the two sentence embeddings are.\n",
        "sentence_emb_1, sentence_emb_2 = (\n",
        "    Dan(sentence, emb_size=300, emb_model=word_emb)\n",
        "    for sentence in (sentence_1, sentence_2)\n",
        ")\n",
        "\n",
        "CosineSim(sentence_emb_1, sentence_emb_2)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "K_WSCr7u0vnU",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "path_hub = \"tf_sentence_emb\"\n",
        "\n",
        "\n",
        "# keras model\n",
        "def UniversalEmbedding(x):\n",
        "    \"\"\"Embed a batch of sentences with the TF Hub module ``embed``.\n",
        "\n",
        "    ``x`` is the tensor produced by the Keras input layer defined below\n",
        "    (declared with ``shape=(1,)``, i.e. one string per row). Only axis 1\n",
        "    is squeezed so that a batch holding a single sentence stays 1-D\n",
        "    instead of collapsing to a scalar.\n",
        "    \"\"\"\n",
        "    sentences = tf.squeeze(tf.cast(x, tf.string), axis=1)\n",
        "    return embed(sentences, signature=\"default\", as_dict=True)[\"default\"]\n",
        "\n",
        "# load pre-trained model\n",
        "embed = hub.Module(path_hub)\n",
        "\n",
        "# input layer\n",
        "input_text = layers.Input(shape=(1,), dtype=tf.string)\n",
        "\n",
        "# embedding layer output\n",
        "embedding = layers.Lambda(UniversalEmbedding,\n",
        "                          output_shape=(512,))(input_text)\n",
        "\n",
        "# Add the rest of your model here if needed\n",
        "\n",
        "# assemble model\n",
        "model = Model(inputs=[input_text], outputs=embedding)\n",
        "\n",
        "# compile model\n",
        "model.compile(loss='categorical_crossentropy', \n",
        "              optimizer='adam', metrics=['accuracy'])\n",
        "\n",
        "# show model architecture; summary() prints by itself and returns None,\n",
        "# so wrapping it in print() would only add a stray \"None\" line\n",
        "model.summary()"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "-2As3hd4005u",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        "emb_txt = None\n",
        "# Both example sentences are passed to the model in a single predict() call\n",
        "in_txt = np.array([sentence_1, sentence_2])\n",
        "with tf.Session() as session:\n",
        "    # Hand this session to the Keras backend before running the model\n",
        "    K.set_session(session)\n",
        "    # Run the variable and table initialisers before predicting\n",
        "    session.run(tf.global_variables_initializer())\n",
        "    session.run(tf.tables_initializer())\n",
        "    emb_txt = model.predict(in_txt)\n",
        "    \n",
        "# NOTE: these names overwrite the averaged-word2vec embeddings computed\n",
        "# earlier in the notebook under the same names\n",
        "sentence_emb_1 = emb_txt[0]\n",
        "sentence_emb_2 = emb_txt[1]\n",
        "\n",
        "CosineSim(sentence_emb_1, sentence_emb_2)"
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "metadata": {
        "id": "HzWSDFpx009J",
        "colab_type": "code",
        "colab": {}
      },
      "cell_type": "code",
      "source": [
        ""
      ],
      "execution_count": 0,
      "outputs": []
    }
  ]
}