Created using Colaboratory

This commit is contained in:
George Mihaila
2020-02-10 00:29:42 -06:00
parent f91b268e33
commit d73fb44018

View File

@@ -0,0 +1,329 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "pytorch_transformer_setup.ipynb",
"provenance": [],
"collapsed_sections": [],
"toc_visible": true,
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/github/gmihaila/machine_learning_things/blob/master/learning_pytorch/pytorch_transformer_setup.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "o03a5jjGv40i",
"colab_type": "text"
},
"source": [
"## Test cache directory PyTorch Transformers"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "aZ0mvSclJNb5",
"colab_type": "text"
},
"source": [
"### Create directory to store torch cache"
]
},
{
"cell_type": "code",
"metadata": {
"id": "eiaZwVF4EDbG",
"colab_type": "code",
"outputId": "a6f629c7-5e64-407e-91e2-e5fea221f35f",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 35
}
},
"source": [
"%mkdir test\n",
"!ls"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"sample_data test\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "o2V3_9hwJQxs",
"colab_type": "text"
},
"source": [
"### Add environment variables"
]
},
{
"cell_type": "code",
"metadata": {
"id": "mS-PVZ27fpYl",
"colab_type": "code",
"colab": {}
},
"source": [
"# in linux use export VAR_NAME=\"value\"\n",
"'''\n",
"export PYTORCH_TRANSFORMERS_CACHE=\"/storage/scratch2/share/natural_language_processing_tools/pytorch_transformer_models\"\n",
"export PYTORCH_PRETRAINED_BERT_CACHE=\"/storage/scratch2/share/natural_language_processing_tools/pytorch_transformer_models\"\n",
"export ENV_XDG_CACHE_HOME=\"/storage/scratch2/share/natural_language_processing_tools/pytorch_transformer_models\"\n",
"export ENV_TORCH_HOME=\"/storage/scratch2/share/natural_language_processing_tools/pytorch_transformer_models\"\n",
"'''\n",
"import os\n",
"\n",
"os.environ['PYTORCH_TRANSFORMERS_CACHE'] = '/content/test'\n",
"os.environ['PYTORCH_PRETRAINED_BERT_CACHE'] = '/content/test'\n",
"os.environ['ENV_XDG_CACHE_HOME'] = '/content/test'\n",
"os.environ['ENV_TORCH_HOME'] = '/content/test'"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "SjwxRshSJUPM",
"colab_type": "text"
},
"source": [
"### Install Pytorch-Transformers"
]
},
{
"cell_type": "code",
"metadata": {
"id": "ni9ckbNqfrEs",
"colab_type": "code",
"colab": {}
},
"source": [
"from IPython.display import clear_output\n",
"\n",
"!pip install transformers\n",
"clear_output()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "RyCrh6v0Jgal",
"colab_type": "text"
},
"source": [
"### Test loading model"
]
},
{
"cell_type": "code",
"metadata": {
"id": "46-HROqifZ4l",
"colab_type": "code",
"outputId": "52ab7d0d-dbcb-438f-9a85-ae7c9ffce059",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 71
}
},
"source": [
"import torch\n",
"from transformers import *\n",
"\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
"model = BertModel.from_pretrained('bert-base-uncased')"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"100%|██████████| 231508/231508 [00:00<00:00, 1159095.57B/s]\n",
"100%|██████████| 313/313 [00:00<00:00, 64974.87B/s]\n",
"100%|██████████| 440473133/440473133 [00:11<00:00, 39267895.47B/s]\n"
],
"name": "stderr"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "0QmG_6-8Jl5Y",
"colab_type": "text"
},
"source": [
"### See if it works"
]
},
{
"cell_type": "code",
"metadata": {
"id": "lJ3SCktBfiev",
"colab_type": "code",
"outputId": "d5404a08-f198-47d3-c791-91493bdf8f25",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 125
}
},
"source": [
"!ls /content/test\n"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084\n",
"26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.json\n",
"4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c\n",
"4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c.json\n",
"aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157\n",
"aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157.json\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "q03sVEpqJp7f",
"colab_type": "text"
},
"source": [
"### Download all models from Pytorch-Transformers\n",
"\n",
"Hugging Face's [list](https://huggingface.co/pytorch-transformers/pretrained_models.html)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "tobGgYlU9rxz",
"colab_type": "code",
"outputId": "778878d7-5f56-44bd-82b7-38d01534c183",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 556
}
},
"source": [
"import torch\n",
"from transformers import *\n",
"\n",
"# PyTorch-Transformers has a unified API\n",
"# for 8 transformer architectures and 30 pretrained weights.\n",
"\n",
"# Model | Tokenizer | Pretrained weights shortcut\n",
"MODELS = [(BertModel, BertTokenizer, BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),\n",
" (OpenAIGPTModel, OpenAIGPTTokenizer, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),\n",
" (GPT2Model, GPT2Tokenizer, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),\n",
" (TransfoXLModel, TransfoXLTokenizer, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),\n",
" (XLNetModel, XLNetTokenizer, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),\n",
" (XLMModel, XLMTokenizer, XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),\n",
" (RobertaModel, RobertaTokenizer, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys()),\n",
" (DistilBertModel, DistilBertTokenizer, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())]\n",
"\n",
"n_models = 0\n",
"\n",
"# Go through each model and download it\n",
"for model_class, tokenizer_class, pretrained_weights in MODELS:\n",
" \n",
" for pretrained_weights in pretrained_weights:\n",
" print(pretrained_weights)\n",
" \n",
" # if pretrained_weights == \"xlm-mlm-enro-1024\":\n",
" # print(\"Skip \",pretrained_weights)\n",
" # continue\n",
" \n",
" # Load pretrained model/tokenizer\n",
" tokenizer = tokenizer_class.from_pretrained(pretrained_weights,)\n",
" model = model_class.from_pretrained(pretrained_weights)\n",
" n_models +=1\n",
"\n",
"print(\"%s pre-train models available!\"%n_models)\n"
],
"execution_count": 0,
"outputs": [
{
"output_type": "stream",
"text": [
"bert-base-uncased\n",
"bert-large-uncased\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"100%|██████████| 231508/231508 [00:00<00:00, 2398349.41B/s]\n",
"100%|██████████| 314/314 [00:00<00:00, 64851.85B/s]\n",
"100%|██████████| 1344997306/1344997306 [00:20<00:00, 64754896.69B/s]\n"
],
"name": "stderr"
},
{
"output_type": "stream",
"text": [
"bert-base-cased\n"
],
"name": "stdout"
},
{
"output_type": "stream",
"text": [
"100%|██████████| 213450/213450 [00:00<00:00, 2254127.79B/s]\n",
"100%|██████████| 313/313 [00:00<00:00, 151403.20B/s]\n",
" 91%|█████████▏| 398308352/435779157 [00:05<00:00, 74077564.58B/s]"
],
"name": "stderr"
},
{
"output_type": "error",
"ename": "KeyboardInterrupt",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-6-e8af17786ee6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 29\u001b[0m \u001b[0;31m# Load pretrained model/tokenizer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 30\u001b[0m \u001b[0mtokenizer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtokenizer_class\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpretrained_weights\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 31\u001b[0;31m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel_class\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfrom_pretrained\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpretrained_weights\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 32\u001b[0m \u001b[0mn_models\u001b[0m \u001b[0;34m+=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 33\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/modeling_utils.py\u001b[0m in \u001b[0;36mfrom_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, *model_args, **kwargs)\u001b[0m\n\u001b[1;32m 316\u001b[0m \u001b[0;31m# redirect to the cache, if necessary\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 317\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 318\u001b[0;31m \u001b[0mresolved_archive_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcached_path\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marchive_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_download\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforce_download\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproxies\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mproxies\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 319\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mEnvironmentError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 320\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mpretrained_model_name_or_path\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpretrained_model_archive_map\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/file_utils.py\u001b[0m in \u001b[0;36mcached_path\u001b[0;34m(url_or_filename, cache_dir, force_download, proxies)\u001b[0m\n\u001b[1;32m 174\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mparsed\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mscheme\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;34m'http'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'https'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m's3'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 175\u001b[0m \u001b[0;31m# URL, so get it from the cache (downloading if necessary)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 176\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mget_from_cache\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcache_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcache_dir\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mforce_download\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mforce_download\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproxies\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mproxies\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 177\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0mos\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpath\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mexists\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl_or_filename\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 178\u001b[0m \u001b[0;31m# File, and it exists.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/file_utils.py\u001b[0m in \u001b[0;36mget_from_cache\u001b[0;34m(url, cache_dir, force_download, proxies)\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[0ms3_get\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtemp_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproxies\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mproxies\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 301\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 302\u001b[0;31m \u001b[0mhttp_get\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0murl\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtemp_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mproxies\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mproxies\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 303\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 304\u001b[0m \u001b[0;31m# we are copying the file before closing it, so flush to avoid truncation\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/local/lib/python3.6/dist-packages/transformers/file_utils.py\u001b[0m in \u001b[0;36mhttp_get\u001b[0;34m(url, temp_file, proxies)\u001b[0m\n\u001b[1;32m 243\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mchunk\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;31m# filter out keep-alive new chunks\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 245\u001b[0;31m \u001b[0mtemp_file\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mchunk\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 246\u001b[0m \u001b[0mprogress\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/usr/lib/python3.6/tempfile.py\u001b[0m in \u001b[0;36mfunc_wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 622\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0m_functools\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwraps\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfunc\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 623\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfunc_wrapper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 624\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 625\u001b[0m \u001b[0;31m# Avoid closing the file as long as the wrapper is alive,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 626\u001b[0m \u001b[0;31m# see issue #18879.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
]
}
]
}