{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.\n\n**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**\n\nThis notebook was generated for TensorFlow 2.6."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "## The Transformer architecture"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "### Understanding self-attention"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### Generalized self-attention: the query-key-value model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "### Multi-Head attention"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "### The Transformer encoder"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Getting the data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n",
    "# Extract only once: re-extracting would restore the reviews that the\n",
    "# data-preparation cell moves from train/ into val/, duplicating them.\n",
    "!test -d aclImdb || tar -xf aclImdb_v1.tar.gz\n",
    "# -f keeps a re-run from failing once the directory is already gone.\n",
    "!rm -rf aclImdb/train/unsup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Preparing the data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "import os, pathlib, shutil, random\n",
    "from tensorflow import keras\n",
    "batch_size = 32\n",
    "base_dir = pathlib.Path(\"aclImdb\")\n",
    "val_dir = base_dir / \"val\"\n",
    "train_dir = base_dir / \"train\"\n",
    "for category in (\"neg\", \"pos\"):\n",
    "    if (val_dir / category).exists():\n",
    "        # Already split by an earlier run of this cell; splitting again would\n",
    "        # move a further 20% of the remaining training files into val/.\n",
    "        continue\n",
    "    os.makedirs(val_dir / category)\n",
    "    # os.listdir returns names in arbitrary order; sort first so the seeded\n",
    "    # shuffle selects the same validation files on every run and machine.\n",
    "    files = sorted(os.listdir(train_dir / category))\n",
    "    random.Random(1337).shuffle(files)\n",
    "    num_val_samples = int(0.2 * len(files))\n",
    "    val_files = files[-num_val_samples:]\n",
    "    for fname in val_files:\n",
    "        shutil.move(train_dir / category / fname,\n",
    "                    val_dir / category / fname)\n",
    "\n",
    "train_ds = keras.preprocessing.text_dataset_from_directory(\n",
    "    str(train_dir), batch_size=batch_size\n",
    ")\n",
    "val_ds = keras.preprocessing.text_dataset_from_directory(\n",
    "    str(val_dir), batch_size=batch_size\n",
    ")\n",
    "test_ds = keras.preprocessing.text_dataset_from_directory(\n",
    "    str(base_dir / \"test\"), batch_size=batch_size\n",
    ")\n",
    "# Inputs without labels, used only to build the vocabulary.\n",
    "text_only_train_ds = train_ds.map(lambda x, y: x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Vectorizing the data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "from tensorflow.keras.layers.experimental.preprocessing import TextVectorization\n",
    "max_length = 600\n",
    "max_tokens = 20000\n",
    "text_vectorization = TextVectorization(\n",
    "    max_tokens=max_tokens,\n",
    "    output_mode=\"int\",\n",
    "    output_sequence_length=max_length,\n",
    ")\n",
    "text_vectorization.adapt(text_only_train_ds)\n",
    "\n",
    "int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))\n",
    "int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))\n",
    "int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Transformer encoder implemented as a subclassed Layer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "from tensorflow.keras import layers\n",
    "\n",
    "class TransformerEncoder(layers.Layer):\n",
    "    \"\"\"Multi-head self-attention followed by a two-layer dense projection.\n",
    "\n",
    "    Both sub-blocks use a residual connection followed by layer normalization.\n",
    "\n",
    "    Args:\n",
    "        embed_dim: Size of the last axis of the inputs. Also used as the\n",
    "            attention `key_dim` and as the output size of the dense projection.\n",
    "        dense_dim: Number of units in the inner ReLU dense layer.\n",
    "        num_heads: Number of attention heads.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):\n",
    "        super().__init__(**kwargs)\n",
    "        # Kept as attributes so get_config() can serialize them.\n",
    "        self.embed_dim = embed_dim\n",
    "        self.dense_dim = dense_dim\n",
    "        self.num_heads = num_heads\n",
    "        self.attention = layers.MultiHeadAttention(\n",
    "            num_heads=num_heads, key_dim=embed_dim)\n",
    "        self.dense_proj = keras.Sequential(\n",
    "            [layers.Dense(dense_dim, activation=\"relu\"),\n",
    "             layers.Dense(embed_dim),]\n",
    "        )\n",
    "        self.layernorm_1 = layers.LayerNormalization()\n",
    "        self.layernorm_2 = layers.LayerNormalization()\n",
    "\n",
    "    def call(self, inputs, mask=None):\n",
    "        \"\"\"Apply self-attention and the dense projection to `inputs`.\n",
    "\n",
    "        Args:\n",
    "            inputs: Sequence tensor whose last axis has size `embed_dim`\n",
    "                (it is added to the output of the dense projection).\n",
    "            mask: Optional mask. An axis is inserted at position 1 before it\n",
    "                is handed to the attention layer as `attention_mask`.\n",
    "\n",
    "        Returns:\n",
    "            The layer-normalized output of the second residual connection.\n",
    "        \"\"\"\n",
    "        if mask is not None:\n",
    "            # Assumes a (batch, sequence) mask, giving (batch, 1, sequence).\n",
    "            mask = mask[:, tf.newaxis, :]\n",
    "        attention_output = self.attention(\n",
    "            inputs, inputs, attention_mask=mask)\n",
    "        proj_input = self.layernorm_1(inputs + attention_output)\n",
    "        proj_output = self.dense_proj(proj_input)\n",
    "        return self.layernorm_2(proj_input + proj_output)\n",
    "\n",
    "    def get_config(self):\n",
    "        \"\"\"Return the base config extended with the constructor arguments.\"\"\"\n",
    "        config = super().get_config()\n",
    "        config.update({\n",
    "            \"embed_dim\": self.embed_dim,\n",
    "            \"num_heads\": self.num_heads,\n",
    "            \"dense_dim\": self.dense_dim,\n",
    "        })\n",
    "        return config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Text classification model that combines the Transformer encoder and a pooling layer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "# Reuse the vectorizer's vocabulary size so the Embedding table stays in sync\n",
    "# with the token ids that `text_vectorization` produces.\n",
    "vocab_size = max_tokens\n",
    "embed_dim = 256\n",
    "num_heads = 2\n",
    "dense_dim = 32\n",
    "\n",
    "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n",
    "x = layers.Embedding(vocab_size, embed_dim)(inputs)\n",
    "x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)\n",
    "x = layers.GlobalMaxPooling1D()(x)\n",
    "x = layers.Dropout(0.5)(x)\n",
    "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n",
    "model = keras.Model(inputs, outputs)\n",
    "model.compile(optimizer=\"rmsprop\",\n",
    "              loss=\"binary_crossentropy\",\n",
    "              metrics=[\"accuracy\"])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Training and evaluating the Transformer encoder based model**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "callbacks = [\n",
    "    keras.callbacks.ModelCheckpoint(\"transformer_encoder.keras\",\n",
    "                                    save_best_only=True)\n",
    "]\n",
    "model.fit(int_train_ds, validation_data=int_val_ds, epochs=20, callbacks=callbacks)\n",
    "# Reload the checkpoint written by the callback; the custom layer must be\n",
    "# passed explicitly so it can be deserialized.\n",
    "model = keras.models.load_model(\n",
    "    \"transformer_encoder.keras\",\n",
    "    custom_objects={\"TransformerEncoder\": TransformerEncoder})\n",
    "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### Using positional encoding to reinject order information"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Implementing positional embedding as a subclassed layer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "class PositionalEmbedding(layers.Layer):\n",
    "    \"\"\"Sum of a token embedding and a learned position embedding.\n",
    "\n",
    "    Args:\n",
    "        sequence_length: Number of rows in the position-embedding table.\n",
    "        input_dim: Number of rows in the token-embedding table.\n",
    "        output_dim: Size of the vectors produced by both embeddings.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):\n",
    "        super().__init__(**kwargs)\n",
    "        self.token_embeddings = layers.Embedding(\n",
    "            input_dim=input_dim, output_dim=output_dim)\n",
    "        self.position_embeddings = layers.Embedding(\n",
    "            input_dim=sequence_length, output_dim=output_dim)\n",
    "        # Kept as attributes so get_config() can serialize them.\n",
    "        self.sequence_length = sequence_length\n",
    "        self.input_dim = input_dim\n",
    "        self.output_dim = output_dim\n",
    "\n",
    "    def call(self, inputs):\n",
    "        \"\"\"Embed the token ids and add the embedding of each position index.\"\"\"\n",
    "        length = tf.shape(inputs)[-1]\n",
    "        # NOTE(review): positions run from 0 to the input length, so inputs are\n",
    "        # assumed to be no longer than `sequence_length`.\n",
    "        positions = tf.range(start=0, limit=length, delta=1)\n",
    "        embedded_tokens = self.token_embeddings(inputs)\n",
    "        embedded_positions = self.position_embeddings(positions)\n",
    "        return embedded_tokens + embedded_positions\n",
    "\n",
    "    def compute_mask(self, inputs, mask=None):\n",
    "        \"\"\"Return a boolean mask that is False wherever the token id is 0.\"\"\"\n",
    "        return tf.math.not_equal(inputs, 0)\n",
    "\n",
    "    def get_config(self):\n",
    "        \"\"\"Return the base config extended with the constructor arguments.\"\"\"\n",
    "        config = super().get_config()\n",
    "        config.update({\n",
    "            \"output_dim\": self.output_dim,\n",
    "            \"sequence_length\": self.sequence_length,\n",
    "            \"input_dim\": self.input_dim,\n",
    "        })\n",
    "        return config"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### Putting it all together: a text-classification Transformer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Text classification model that combines positional embedding, the Transformer encoder, and a pooling layer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "# Derive both sizes from the vectorizer settings so the embedding tables stay\n",
    "# in sync with `max_tokens` and `output_sequence_length`.\n",
    "vocab_size = max_tokens\n",
    "sequence_length = max_length\n",
    "embed_dim = 256\n",
    "num_heads = 2\n",
    "dense_dim = 32\n",
    "\n",
    "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n",
    "x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(inputs)\n",
    "x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)\n",
    "x = layers.GlobalMaxPooling1D()(x)\n",
    "x = layers.Dropout(0.5)(x)\n",
    "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n",
    "model = keras.Model(inputs, outputs)\n",
    "model.compile(optimizer=\"rmsprop\",\n",
    "              loss=\"binary_crossentropy\",\n",
    "              metrics=[\"accuracy\"])\n",
    "model.summary()\n",
    "\n",
    "callbacks = [\n",
    "    keras.callbacks.ModelCheckpoint(\"full_transformer_encoder.keras\",\n",
    "                                    save_best_only=True)\n",
    "]\n",
    "model.fit(int_train_ds, validation_data=int_val_ds, epochs=20, callbacks=callbacks)\n",
    "# Both custom layers must be passed so the checkpoint can be deserialized.\n",
    "model = keras.models.load_model(\n",
    "    \"full_transformer_encoder.keras\",\n",
    "    custom_objects={\"TransformerEncoder\": TransformerEncoder,\n",
    "                    \"PositionalEmbedding\": PositionalEmbedding})\n",
    "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "### When to use sequence models over bag-of-words models?"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "chapter11_part03_transformer.ipynb",
   "private_outputs": false,
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}