{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.\n\n**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**\n\nThis notebook was generated for TensorFlow 2.6."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "## The Transformer architecture"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "### Understanding self-attention"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### Generalized self-attention: the query-key-value model"
   ]
  },
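  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "*Not part of the book's code:* the cell below is a minimal NumPy sketch of the scaled dot-product attention computation behind the query-key-value model, added here purely as an illustration. The array names and sizes are arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "def scaled_dot_product_attention(queries, keys, values):\n",
    "    # Attention scores: how strongly each query matches each key.\n",
    "    scores = queries @ keys.transpose(0, 2, 1)\n",
    "    # Scale by sqrt(key dimension) to keep the softmax well-behaved.\n",
    "    scores /= np.sqrt(keys.shape[-1])\n",
    "    # Softmax over the keys axis turns scores into attention weights.\n",
    "    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))\n",
    "    weights /= weights.sum(axis=-1, keepdims=True)\n",
    "    # Each output vector is a weighted average of the value vectors.\n",
    "    return weights @ values\n",
    "\n",
    "# Toy self-attention: a batch of 2 sequences, 4 tokens, 8-dim embeddings.\n",
    "x = np.random.normal(size=(2, 4, 8))\n",
    "print(scaled_dot_product_attention(x, x, x).shape)"
   ]
  },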
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "### Multi-Head attention"
   ]
  },
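  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "*Not part of the book's code:* the cell below is a quick, self-contained demonstration of Keras's built-in `MultiHeadAttention` layer (the layer the Transformer encoder implemented further down relies on), applied to a random batch only to show that self-attention preserves the sequence shape. The tensor sizes are arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow.keras import layers\n",
    "\n",
    "# A random batch of embedded tokens: 2 sequences, 4 tokens, 16-dim vectors.\n",
    "dummy_sequence = tf.random.normal(shape=(2, 4, 16))\n",
    "mha = layers.MultiHeadAttention(num_heads=2, key_dim=16)\n",
    "# Self-attention: the same sequence provides the queries, keys, and values.\n",
    "outputs = mha(dummy_sequence, dummy_sequence)\n",
    "print(outputs.shape)"
   ]
  },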
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "### The Transformer encoder"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Getting the data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n",
    "!tar -xf aclImdb_v1.tar.gz\n",
    "!rm -r aclImdb/train/unsup"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Preparing the data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "import os, pathlib, shutil, random\n",
    "from tensorflow import keras\n",
    "batch_size = 32\n",
    "base_dir = pathlib.Path(\"aclImdb\")\n",
    "val_dir = base_dir / \"val\"\n",
    "train_dir = base_dir / \"train\"\n",
    "for category in (\"neg\", \"pos\"):\n",
    "    os.makedirs(val_dir / category)\n",
    "    files = os.listdir(train_dir / category)\n",
    "    random.Random(1337).shuffle(files)\n",
    "    num_val_samples = int(0.2 * len(files))\n",
    "    val_files = files[-num_val_samples:]\n",
    "    for fname in val_files:\n",
    "        shutil.move(train_dir / category / fname,\n",
    "                    val_dir / category / fname)\n",
    "\n",
    "train_ds = keras.preprocessing.text_dataset_from_directory(\n",
    "    \"aclImdb/train\", batch_size=batch_size\n",
    ")\n",
    "val_ds = keras.preprocessing.text_dataset_from_directory(\n",
    "    \"aclImdb/val\", batch_size=batch_size\n",
    ")\n",
    "test_ds = keras.preprocessing.text_dataset_from_directory(\n",
    "    \"aclImdb/test\", batch_size=batch_size\n",
    ")\n",
    "text_only_train_ds = train_ds.map(lambda x, y: x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Vectorizing the data**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "from tensorflow.keras.layers.experimental.preprocessing import TextVectorization\n",
    "max_length = 600\n",
    "max_tokens = 20000\n",
    "text_vectorization = TextVectorization(\n",
    "    max_tokens=max_tokens,\n",
    "    output_mode=\"int\",\n",
    "    output_sequence_length=max_length,\n",
    ")\n",
    "text_vectorization.adapt(text_only_train_ds)\n",
    "\n",
    "int_train_ds = train_ds.map(lambda x, y: (text_vectorization(x), y))\n",
    "int_val_ds = val_ds.map(lambda x, y: (text_vectorization(x), y))\n",
    "int_test_ds = test_ds.map(lambda x, y: (text_vectorization(x), y))"
   ]
  },
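  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "*Not part of the book's code:* a quick sanity check that prints the shape and dtype of one vectorized batch, confirming that each review is now a length-600 sequence of integer token indices."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "# Inspect a single batch from the vectorized training dataset.\n",
    "for inputs, targets in int_train_ds.take(1):\n",
    "    print(\"inputs.shape:\", inputs.shape)    # (batch_size, max_length)\n",
    "    print(\"inputs.dtype:\", inputs.dtype)    # int64 token indices\n",
    "    print(\"targets.shape:\", targets.shape)  # (batch_size,) binary labels"
   ]
  },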
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Transformer encoder implemented as a subclassed Layer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "from tensorflow.keras import layers\n",
    "\n",
    "class TransformerEncoder(layers.Layer):\n",
    "    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):\n",
    "        super().__init__(**kwargs)\n",
    "        self.embed_dim = embed_dim\n",
    "        self.dense_dim = dense_dim\n",
    "        self.num_heads = num_heads\n",
    "        self.attention = layers.MultiHeadAttention(\n",
    "            num_heads=num_heads, key_dim=embed_dim)\n",
    "        self.dense_proj = keras.Sequential(\n",
    "            [layers.Dense(dense_dim, activation=\"relu\"),\n",
    "             layers.Dense(embed_dim),]\n",
    "        )\n",
    "        self.layernorm_1 = layers.LayerNormalization()\n",
    "        self.layernorm_2 = layers.LayerNormalization()\n",
    "\n",
    "    def call(self, inputs, mask=None):\n",
    "        if mask is not None:\n",
    "            mask = mask[:, tf.newaxis, :]\n",
    "        attention_output = self.attention(\n",
    "            inputs, inputs, attention_mask=mask)\n",
    "        proj_input = self.layernorm_1(inputs + attention_output)\n",
    "        proj_output = self.dense_proj(proj_input)\n",
    "        return self.layernorm_2(proj_input + proj_output)\n",
    "\n",
    "    def get_config(self):\n",
    "        config = super(TransformerEncoder, self).get_config()\n",
    "        config.update({\n",
    "            \"embed_dim\": self.embed_dim,\n",
    "            \"num_heads\": self.num_heads,\n",
    "            \"dense_dim\": self.dense_dim,\n",
    "        })\n",
    "        return config"
   ]
  },
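  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "*Not part of the book's code:* a quick sanity check that runs the encoder on a random batch of already-embedded vectors to confirm that the layer preserves the input shape. The sizes are arbitrary."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "# The encoder maps (batch, sequence, embed_dim) to the same shape.\n",
    "dummy_embedded = tf.random.normal(shape=(2, 10, 256))\n",
    "encoder = TransformerEncoder(embed_dim=256, dense_dim=32, num_heads=2)\n",
    "print(encoder(dummy_embedded).shape)"
   ]
  },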
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Text classification model that combines the Transformer encoder and a pooling layer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "vocab_size = 20000\n",
    "embed_dim = 256\n",
    "num_heads = 2\n",
    "dense_dim = 32\n",
    "\n",
    "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n",
    "x = layers.Embedding(vocab_size, embed_dim)(inputs)\n",
    "x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)\n",
    "x = layers.GlobalMaxPooling1D()(x)\n",
    "x = layers.Dropout(0.5)(x)\n",
    "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n",
    "model = keras.Model(inputs, outputs)\n",
    "model.compile(optimizer=\"rmsprop\",\n",
    "              loss=\"binary_crossentropy\",\n",
    "              metrics=[\"accuracy\"])\n",
    "model.summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Training and evaluating the Transformer encoder based model**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "callbacks = [\n",
    "    keras.callbacks.ModelCheckpoint(\"transformer_encoder.keras\",\n",
    "                                    save_best_only=True)\n",
    "]\n",
    "model.fit(int_train_ds, validation_data=int_val_ds, epochs=20, callbacks=callbacks)\n",
    "model = keras.models.load_model(\n",
    "    \"transformer_encoder.keras\",\n",
    "    custom_objects={\"TransformerEncoder\": TransformerEncoder})\n",
    "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### Using positional encoding to reinject order information"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Implementing positional embedding as a subclassed layer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "class PositionalEmbedding(layers.Layer):\n",
    "    def __init__(self, sequence_length, input_dim, output_dim, **kwargs):\n",
    "        super().__init__(**kwargs)\n",
    "        self.token_embeddings = layers.Embedding(\n",
    "            input_dim=input_dim, output_dim=output_dim)\n",
    "        self.position_embeddings = layers.Embedding(\n",
    "            input_dim=sequence_length, output_dim=output_dim)\n",
    "        self.sequence_length = sequence_length\n",
    "        self.input_dim = input_dim\n",
    "        self.output_dim = output_dim\n",
    "\n",
    "    def call(self, inputs):\n",
    "        length = tf.shape(inputs)[-1]\n",
    "        positions = tf.range(start=0, limit=length, delta=1)\n",
    "        embedded_tokens = self.token_embeddings(inputs)\n",
    "        embedded_positions = self.position_embeddings(positions)\n",
    "        return embedded_tokens + embedded_positions\n",
    "\n",
    "    def compute_mask(self, inputs, mask=None):\n",
    "        return tf.math.not_equal(inputs, 0)\n",
    "\n",
    "    def get_config(self):\n",
    "        config = super(PositionalEmbedding, self).get_config()\n",
    "        config.update({\n",
    "            \"output_dim\": self.output_dim,\n",
    "            \"sequence_length\": self.sequence_length,\n",
    "            \"input_dim\": self.input_dim,\n",
    "        })\n",
    "        return config"
   ]
  },
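  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "*Not part of the book's code:* a quick sanity check that applies the positional embedding layer to a toy batch of token indices, showing the output shape and the padding mask it computes (index 0 is treated as padding)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "# Two short sequences of token indices; 0 is the padding index.\n",
    "dummy_tokens = tf.constant([[5, 42, 7, 0], [3, 0, 0, 0]], dtype=\"int64\")\n",
    "pos_embedding = PositionalEmbedding(\n",
    "    sequence_length=600, input_dim=20000, output_dim=256)\n",
    "print(pos_embedding(dummy_tokens).shape)         # (2, 4, 256)\n",
    "print(pos_embedding.compute_mask(dummy_tokens))  # True where a token is not padding"
   ]
  },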
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "#### Putting it all together: a text-classification Transformer"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "**Text classification model that combines positional embedding, the Transformer encoder, and a pooling layer**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "colab_type": "code"
   },
   "outputs": [],
   "source": [
    "vocab_size = 20000\n",
    "sequence_length = 600\n",
    "embed_dim = 256\n",
    "num_heads = 2\n",
    "dense_dim = 32\n",
    "\n",
    "inputs = keras.Input(shape=(None,), dtype=\"int64\")\n",
    "x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(inputs)\n",
    "x = TransformerEncoder(embed_dim, dense_dim, num_heads)(x)\n",
    "x = layers.GlobalMaxPooling1D()(x)\n",
    "x = layers.Dropout(0.5)(x)\n",
    "outputs = layers.Dense(1, activation=\"sigmoid\")(x)\n",
    "model = keras.Model(inputs, outputs)\n",
    "model.compile(optimizer=\"rmsprop\",\n",
    "              loss=\"binary_crossentropy\",\n",
    "              metrics=[\"accuracy\"])\n",
    "model.summary()\n",
    "\n",
    "callbacks = [\n",
    "    keras.callbacks.ModelCheckpoint(\"full_transformer_encoder.keras\",\n",
    "                                    save_best_only=True)\n",
    "]\n",
    "model.fit(int_train_ds, validation_data=int_val_ds, epochs=20, callbacks=callbacks)\n",
    "model = keras.models.load_model(\n",
    "    \"full_transformer_encoder.keras\",\n",
    "    custom_objects={\"TransformerEncoder\": TransformerEncoder,\n",
    "                    \"PositionalEmbedding\": PositionalEmbedding})\n",
    "print(f\"Test acc: {model.evaluate(int_test_ds)[1]:.3f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "colab_type": "text"
   },
   "source": [
    "### When to use sequence models over bag-of-words models?"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "chapter11_part03_transformer.i",
   "private_outputs": false,
   "provenance": [],
   "toc_visible": true
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}