OpenPipe-llm/examples/classify-recipes/train.ipynb
Kyle Corbitt 40638a7848 more work
2023-08-24 23:49:44 +00:00


{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now let's get to the fun part -- training a model. I'll start by installing the dependencies."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: peft==0.5.0 in /usr/local/lib/python3.10/dist-packages (0.5.0)\n",
"\u001b[31mERROR: Could not find a version that satisfies the requirement python-dotenv==2.0.0 (from versions: 0.1.0, 0.1.2, 0.1.3, 0.1.5, 0.2.0, 0.3.0, 0.4.0, 0.5.0, 0.5.1, 0.6.0, 0.6.1, 0.6.2, 0.6.3, 0.6.4, 0.6.5, 0.7.0, 0.7.1, 0.8.0, 0.8.1, 0.8.2, 0.9.0, 0.9.1, 0.10.0, 0.10.1, 0.10.2, 0.10.3, 0.10.4, 0.10.5, 0.11.0, 0.12.0, 0.13.0, 0.14.0, 0.15.0, 0.16.0, 0.17.0, 0.17.1, 0.18.0, 0.19.0, 0.19.1, 0.19.2, 0.20.0, 0.21.0, 0.21.1, 1.0.0)\u001b[0m\u001b[31m\n",
"\u001b[0m\u001b[31mERROR: No matching distribution found for python-dotenv==2.0.0\u001b[0m\u001b[31m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.10 -m pip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n",
"fatal: destination path 'axolotl' already exists and is not an empty directory.\n",
"Obtaining file:///workspace/OpenPipe/examples/classify-recipes/axolotl\n",
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hCollecting transformers@ git+https://github.com/huggingface/transformers.git (from axolotl==0.1)\n",
" Cloning https://github.com/huggingface/transformers.git to /tmp/pip-install-ckp96ans/transformers_783779e09ad546a5be81c173eca5fd38\n",
" Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-install-ckp96ans/transformers_783779e09ad546a5be81c173eca5fd38\n",
" Resolved https://github.com/huggingface/transformers.git to commit f26099e7b5cf579f99a42bab6ddd371bf2c8d548\n",
" Installing build dependencies ... \u001b[?25ldone\n",
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
"\u001b[?25hCollecting accelerate@ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b (from axolotl==0.1)\n",
" Using cached accelerate-0.22.0.dev0-py3-none-any.whl\n",
"Requirement already satisfied: bitsandbytes>=0.41.1 in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.41.1)\n",
"Requirement already satisfied: addict in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (2.4.0)\n",
"Requirement already satisfied: fire in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.5.0)\n",
"Requirement already satisfied: PyYAML==6.0 in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (6.0)\n",
"Requirement already satisfied: datasets in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (2.14.4)\n",
"Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.1.99)\n",
"Requirement already satisfied: wandb in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.15.8)\n",
"Requirement already satisfied: einops in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.6.1)\n",
"Requirement already satisfied: xformers in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.0.21)\n",
"Requirement already satisfied: optimum in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (1.11.2)\n",
"Requirement already satisfied: hf_transfer in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.1.3)\n",
"Requirement already satisfied: colorama in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.4.6)\n",
"Requirement already satisfied: numba in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.57.1)\n",
"Requirement already satisfied: numpy==1.24.4 in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (1.24.4)\n",
"Requirement already satisfied: bert-score==0.3.13 in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.3.13)\n",
"Requirement already satisfied: evaluate==0.4.0 in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.4.0)\n",
"Requirement already satisfied: rouge-score==0.1.2 in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (0.1.2)\n",
"Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (1.11.2)\n",
"Requirement already satisfied: scikit-learn==1.2.2 in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (1.2.2)\n",
"Requirement already satisfied: pynvml in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (11.5.0)\n",
"Requirement already satisfied: flash-attn==2.0.8 in /usr/local/lib/python3.10/dist-packages (from axolotl==0.1) (2.0.8)\n",
"Requirement already satisfied: torch>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from bert-score==0.3.13->axolotl==0.1) (2.0.1+cu118)\n",
"Requirement already satisfied: pandas>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from bert-score==0.3.13->axolotl==0.1) (2.0.3)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from bert-score==0.3.13->axolotl==0.1) (2.28.1)\n",
"Requirement already satisfied: tqdm>=4.31.1 in /usr/local/lib/python3.10/dist-packages (from bert-score==0.3.13->axolotl==0.1) (4.66.1)\n",
"Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from bert-score==0.3.13->axolotl==0.1) (3.7.2)\n",
"Requirement already satisfied: packaging>=20.9 in /usr/local/lib/python3.10/dist-packages (from bert-score==0.3.13->axolotl==0.1) (23.1)\n",
"Requirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from evaluate==0.4.0->axolotl==0.1) (0.3.7)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from evaluate==0.4.0->axolotl==0.1) (3.3.0)\n",
"Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from evaluate==0.4.0->axolotl==0.1) (0.70.15)\n",
"Requirement already satisfied: fsspec[http]>=2021.05.0 in /usr/local/lib/python3.10/dist-packages (from evaluate==0.4.0->axolotl==0.1) (2023.6.0)\n",
"Requirement already satisfied: huggingface-hub>=0.7.0 in /usr/local/lib/python3.10/dist-packages (from evaluate==0.4.0->axolotl==0.1) (0.16.4)\n",
"Requirement already satisfied: responses<0.19 in /usr/local/lib/python3.10/dist-packages (from evaluate==0.4.0->axolotl==0.1) (0.18.0)\n",
"Requirement already satisfied: ninja in /usr/local/lib/python3.10/dist-packages (from flash-attn==2.0.8->axolotl==0.1) (1.11.1)\n",
"Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score==0.1.2->axolotl==0.1) (1.4.0)\n",
"Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score==0.1.2->axolotl==0.1) (3.8.1)\n",
"Requirement already satisfied: six>=1.14.0 in /usr/lib/python3/dist-packages (from rouge-score==0.1.2->axolotl==0.1) (1.16.0)\n",
"Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2->axolotl==0.1) (1.3.2)\n",
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn==1.2.2->axolotl==0.1) (3.2.0)\n",
"Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets->axolotl==0.1) (12.0.1)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets->axolotl==0.1) (3.8.5)\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers@ git+https://github.com/huggingface/transformers.git->axolotl==0.1) (3.9.0)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers@ git+https://github.com/huggingface/transformers.git->axolotl==0.1) (2023.8.8)\n",
"Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in /usr/local/lib/python3.10/dist-packages (from transformers@ git+https://github.com/huggingface/transformers.git->axolotl==0.1) (0.13.3)\n",
"Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from transformers@ git+https://github.com/huggingface/transformers.git->axolotl==0.1) (0.3.2)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate@ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b->axolotl==0.1) (5.9.5)\n",
"Requirement already satisfied: termcolor in /usr/local/lib/python3.10/dist-packages (from fire->axolotl==0.1) (2.3.0)\n",
"Requirement already satisfied: llvmlite<0.41,>=0.40.0dev0 in /usr/local/lib/python3.10/dist-packages (from numba->axolotl==0.1) (0.40.1)\n",
"Requirement already satisfied: coloredlogs in /usr/local/lib/python3.10/dist-packages (from optimum->axolotl==0.1) (15.0.1)\n",
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from optimum->axolotl==0.1) (1.11.1)\n",
"Requirement already satisfied: Click!=8.0.0,>=7.1 in /usr/local/lib/python3.10/dist-packages (from wandb->axolotl==0.1) (8.1.7)\n",
"Requirement already satisfied: GitPython!=3.1.29,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb->axolotl==0.1) (3.1.32)\n",
"Requirement already satisfied: sentry-sdk>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from wandb->axolotl==0.1) (1.29.2)\n",
"Requirement already satisfied: docker-pycreds>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from wandb->axolotl==0.1) (0.4.0)\n",
"Requirement already satisfied: pathtools in /usr/local/lib/python3.10/dist-packages (from wandb->axolotl==0.1) (0.1.2)\n",
"Requirement already satisfied: setproctitle in /usr/local/lib/python3.10/dist-packages (from wandb->axolotl==0.1) (1.3.2)\n",
"Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from wandb->axolotl==0.1) (68.0.0)\n",
"Requirement already satisfied: appdirs>=1.4.3 in /usr/local/lib/python3.10/dist-packages (from wandb->axolotl==0.1) (1.4.4)\n",
"Requirement already satisfied: protobuf!=4.21.0,<5,>=3.19.0 in /usr/local/lib/python3.10/dist-packages (from wandb->axolotl==0.1) (4.24.1)\n",
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.0.0->bert-score==0.3.13->axolotl==0.1) (4.7.1)\n",
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.0.0->bert-score==0.3.13->axolotl==0.1) (3.0)\n",
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.0.0->bert-score==0.3.13->axolotl==0.1) (3.1.2)\n",
"Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.0.0->bert-score==0.3.13->axolotl==0.1) (2.0.0)\n",
"Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.0.0->bert-score==0.3.13->axolotl==0.1) (3.25.0)\n",
"Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.0.0->bert-score==0.3.13->axolotl==0.1) (15.0.7)\n",
"Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->axolotl==0.1) (23.1.0)\n",
"Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->axolotl==0.1) (2.1.1)\n",
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->axolotl==0.1) (6.0.4)\n",
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->axolotl==0.1) (4.0.3)\n",
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->axolotl==0.1) (1.9.2)\n",
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->axolotl==0.1) (1.4.0)\n",
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->axolotl==0.1) (1.3.1)\n",
"Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.10/dist-packages (from GitPython!=3.1.29,>=1.0.0->wandb->axolotl==0.1) (4.0.10)\n",
"Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.0.1->bert-score==0.3.13->axolotl==0.1) (2.8.2)\n",
"Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.0.1->bert-score==0.3.13->axolotl==0.1) (2023.3)\n",
"Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas>=1.0.1->bert-score==0.3.13->axolotl==0.1) (2023.3)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->bert-score==0.3.13->axolotl==0.1) (3.4)\n",
"Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->bert-score==0.3.13->axolotl==0.1) (1.26.13)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->bert-score==0.3.13->axolotl==0.1) (2022.12.7)\n",
"Requirement already satisfied: humanfriendly>=9.1 in /usr/local/lib/python3.10/dist-packages (from coloredlogs->optimum->axolotl==0.1) (10.0)\n",
"Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->bert-score==0.3.13->axolotl==0.1) (1.1.0)\n",
"Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->bert-score==0.3.13->axolotl==0.1) (0.11.0)\n",
"Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->bert-score==0.3.13->axolotl==0.1) (4.42.1)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->bert-score==0.3.13->axolotl==0.1) (1.4.4)\n",
"Requirement already satisfied: pillow>=6.2.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->bert-score==0.3.13->axolotl==0.1) (9.3.0)\n",
"Requirement already satisfied: pyparsing<3.1,>=2.3.1 in /usr/lib/python3/dist-packages (from matplotlib->bert-score==0.3.13->axolotl==0.1) (2.4.7)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->optimum->axolotl==0.1) (1.2.1)\n",
"Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.10/dist-packages (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb->axolotl==0.1) (5.0.0)\n",
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.0.0->bert-score==0.3.13->axolotl==0.1) (2.1.2)\n",
"Installing collected packages: axolotl\n",
" Attempting uninstall: axolotl\n",
" Found existing installation: axolotl 0.1\n",
" Uninstalling axolotl-0.1:\n",
" Successfully uninstalled axolotl-0.1\n",
" Running setup.py develop for axolotl\n",
"Successfully installed axolotl\n",
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
"\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.1.2\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m23.2.1\u001b[0m\n",
"\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpython3.10 -m pip install --upgrade pip\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"%pip install peft==0.5.0 python-dotenv==2.0.0\n",
"\n",
"!git clone https://github.com/OpenAccess-AI-Collective/axolotl\n",
"%pip install -e \"./axolotl[flash-attn]\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note to the reader: since we'll be basing our fine-tuned model on Meta's Llama 2, you need to apply for access to the weights (which will be automatically granted). Follow the steps on [HuggingFace](https://huggingface.co/meta-llama/Llama-2-7b-hf), then create a read-only access token [here](https://huggingface.co/settings/tokens) and copy it into your .env file."
]
},
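{
"cell_type": "markdown",
"metadata": {},
"source": [
"Your .env file only needs a single line -- the token value below is a placeholder:\n",
"\n",
"```\n",
"HUGGING_FACE_HUB_TOKEN=hf_your_token_here\n",
"```"
]
},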
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Hugging Face token set: True\n"
]
}
],
"source": [
"import dotenv\n",
"import os\n",
"\n",
"dotenv.load_dotenv()\n",
"\n",
"has_token = os.getenv(\"HUGGING_FACE_HUB_TOKEN\") is not None\n",
"\n",
"print(f\"Hugging Face token set: {has_token}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"I'll use the [axolotl](https://github.com/OpenAccess-AI-Collective/axolotl) library to manage this training run. It includes a lot of neat tricks that speed up training without sacrificing quality.\n",
"\n",
"In this case I'm using 8-bit training to use less GPU RAM, and sample packing to maximize GPU utilization. You can read more about the available options at https://github.com/OpenAccess-AI-Collective/axolotl.\n",
"\n",
"The training run options are defined in [training-config.yaml](./training-config.yaml)."
]
},
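{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a rough sketch (not a copy of the real file -- the exact values live in training-config.yaml), an axolotl config enabling these options looks something like this:\n",
"\n",
"```yaml\n",
"base_model: meta-llama/Llama-2-7b-hf\n",
"load_in_8bit: true       # 8-bit weights to cut GPU RAM usage\n",
"sample_packing: true     # pack short examples together to keep the GPU busy\n",
"sequence_len: 4096\n",
"adapter: lora            # train a small LoRA adapter rather than the full model\n",
"lora_target_modules:\n",
"  - q_proj\n",
"  - v_proj\n",
"output_dir: ./models/run1\n",
"```"
]
},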
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The following values were not passed to `accelerate launch` and had defaults used instead:\n",
"\t`--num_processes` was set to a value of `1`\n",
"\t`--num_machines` was set to a value of `1`\n",
"\t`--mixed_precision` was set to a value of `'no'`\n",
"\t`--dynamo_backend` was set to a value of `'no'`\n",
"To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.\n",
"\n",
" dP dP dP\n",
" 88 88 88\n",
".d8888b. dP. .dP .d8888b. 88 .d8888b. d8888P 88\n",
"88' `88 `8bd8' 88' `88 88 88' `88 88 88\n",
"88. .88 .d88b. 88. .88 88 88. .88 88 88\n",
"`88888P8 dP' `dP `88888P' dP `88888P' dP dP\n",
"\n",
"[2023-08-24 20:18:54,867] [INFO] [axolotl.normalize_config:72] [PID:125016] GPU memory usage baseline: 0.000GB (+0.674GB misc)\u001b[39m\n",
"[2023-08-24 20:18:54,867] [INFO] [axolotl.scripts.train:189] [PID:125016] loading tokenizer... meta-llama/Llama-2-7b-hf\u001b[39m\n",
"[2023-08-24 20:18:55,078] [DEBUG] [axolotl.load_tokenizer:64] [PID:125016] EOS: 2 / </s>\u001b[39m\n",
"[2023-08-24 20:18:55,078] [DEBUG] [axolotl.load_tokenizer:65] [PID:125016] BOS: 1 / <s>\u001b[39m\n",
"[2023-08-24 20:18:55,078] [DEBUG] [axolotl.load_tokenizer:66] [PID:125016] PAD: 0 / [PAD]\u001b[39m\n",
"[2023-08-24 20:18:55,078] [DEBUG] [axolotl.load_tokenizer:67] [PID:125016] UNK: 0 / <unk>\u001b[39m\n",
"[2023-08-24 20:18:55,079] [INFO] [axolotl.load_tokenized_prepared_datasets:126] [PID:125016] Unable to find prepared dataset in data/last_run_prepared/82cd9d58e34e0db98296199248c92d0d\u001b[39m\n",
"[2023-08-24 20:18:55,079] [INFO] [axolotl.load_tokenized_prepared_datasets:127] [PID:125016] Loading raw datasets...\u001b[39m\n",
"[2023-08-24 20:18:55,079] [INFO] [axolotl.load_tokenized_prepared_datasets:132] [PID:125016] No seed provided, using default seed of 42\u001b[39m\n",
"/usr/local/lib/python3.10/dist-packages/datasets/load.py:2072: FutureWarning: 'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n",
"You can remove this warning by passing 'token=None' instead.\n",
" warnings.warn(\n",
"Downloading data files: 100%|███████████████████| 1/1 [00:00<00:00, 1909.97it/s]\n",
"Extracting data files: 100%|█████████████████████| 1/1 [00:00<00:00, 130.16it/s]\n",
"Generating train split: 4501 examples [00:00, 72594.78 examples/s]\n",
"Map (num_proc=64): 100%|███████████| 4501/4501 [00:01<00:00, 3465.17 examples/s]\n",
"[2023-08-24 20:18:58,085] [INFO] [axolotl.load_tokenized_prepared_datasets:330] [PID:125016] merging datasets\u001b[39m\n",
"[2023-08-24 20:18:58,092] [INFO] [axolotl.load_tokenized_prepared_datasets:337] [PID:125016] Saving merged prepared dataset to disk... data/last_run_prepared/82cd9d58e34e0db98296199248c92d0d\u001b[39m\n",
"Saving the dataset (1/1 shards): 100%|█| 4501/4501 [00:00<00:00, 63380.02 exampl\n",
"Filter (num_proc=255): 100%|███████| 4275/4275 [00:01<00:00, 3385.29 examples/s]\n",
"Filter (num_proc=226): 100%|██████████| 226/226 [00:01<00:00, 196.38 examples/s]\n",
"Map (num_proc=255): 100%|██████████| 4275/4275 [00:02<00:00, 1480.29 examples/s]\n",
"Map (num_proc=226): 100%|██████████████| 226/226 [00:05<00:00, 44.33 examples/s]\n",
"[2023-08-24 20:19:33,527] [INFO] [axolotl.calculate_total_num_steps:346] [PID:125016] calculating total_num_tokens\u001b[39m\n",
"[2023-08-24 20:19:33,536] [INFO] [axolotl.calculate_total_num_steps:353] [PID:125016] 📝 UPDATE CONFIG WITH: `total_num_tokens: 1514815`\u001b[39m\n",
"[2023-08-24 20:19:33,552] [INFO] [axolotl.utils.dataloader.generate_batches:181] [PID:125016] generating packed batches\u001b[39m\n",
"[2023-08-24 20:19:33,590] [INFO] [axolotl.utils.dataloader.generate_batches:187] [PID:125016] 2ae1e19cb9bd6022bcc024ba552b1341f4c424a75595ff3419969cc2f838c2ba\u001b[39m\n",
"[2023-08-24 20:19:40,094] [INFO] [axolotl.utils.dataloader.len_w_stats:293] [PID:125016] packing_efficiency_estimate: 1.0 actual packing efficiency: 0.9732312654194079\u001b[39m\n",
"[2023-08-24 20:19:40,094] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 1.0 total_num_tokens per device: 1514815\u001b[39m\n",
"[2023-08-24 20:19:40,094] [INFO] [axolotl.calculate_total_num_steps:393] [PID:125016] data_loader_len: 182\u001b[39m\n",
"[2023-08-24 20:19:40,094] [INFO] [axolotl.calculate_total_num_steps:402] [PID:125016] 📝 UPDATE CONFIG WITH: `sample_packing_eff_est: 0.98`\u001b[39m\n",
"[2023-08-24 20:19:40,094] [INFO] [axolotl.calculate_total_num_steps:410] [PID:125016] total_num_steps: 227\u001b[39m\n",
"[2023-08-24 20:19:40,094] [INFO] [axolotl.scripts.train:211] [PID:125016] loading model and (optionally) peft_config...\u001b[39m\n",
"[2023-08-24 20:19:40,114] [INFO] [axolotl.load_model:106] [PID:125016] patching with flash attention\u001b[39m\n",
"[2023-08-24 20:19:40,117] [INFO] [axolotl.load_model:147] [PID:125016] patching _expand_mask\u001b[39m\n",
"Loading checkpoint shards: 100%|██████████████████| 2/2 [00:17<00:00, 8.60s/it]\n",
"\u001b[33m[2023-08-24 20:19:58,136] [WARNING] [axolotl.load_model:337] [PID:125016] increasing model.config.max_position_embeddings to 4096\u001b[39m\n",
"[2023-08-24 20:19:58,136] [INFO] [axolotl.load_model:343] [PID:125016] GPU memory usage after model load: 6.681GB (+0.364GB cache, +1.159GB misc)\u001b[39m\n",
"[2023-08-24 20:19:58,136] [INFO] [axolotl.load_model:349] [PID:125016] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
"[2023-08-24 20:19:58,146] [INFO] [axolotl.load_lora:473] [PID:125016] found linear modules: ['k_proj', 'q_proj', 'o_proj', 'up_proj', 'down_proj', 'gate_proj', 'v_proj']\u001b[39m\n",
"trainable params: 79,953,920 || all params: 6,818,369,536 || trainable%: 1.172625208678628\n",
"[2023-08-24 20:20:53,348] [INFO] [axolotl.load_model:394] [PID:125016] GPU memory usage after adapters: 6.830GB (+1.365GB cache, +1.159GB misc)\u001b[39m\n",
"[2023-08-24 20:20:53,380] [INFO] [axolotl.scripts.train:267] [PID:125016] Compiling torch model\u001b[39m\n",
"[2023-08-24 20:20:53,544] [INFO] [axolotl.scripts.train:272] [PID:125016] Pre-saving adapter config to ./models/run1\u001b[39m\n",
"[2023-08-24 20:20:53,548] [INFO] [axolotl.scripts.train:288] [PID:125016] Starting trainer...\u001b[39m\n",
"[2023-08-24 20:20:53,747] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 1514815\u001b[39m\n",
"[2023-08-24 20:20:53,747] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 1514815\u001b[39m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mopenpipe\u001b[0m (\u001b[33mopenpipe-team\u001b[0m). Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Tracking run with wandb version 0.15.8\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Run data is saved locally in \u001b[35m\u001b[1m/workspace/OpenPipe/examples/classify-recipes/wandb/run-20230824_202055-run1\u001b[0m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Run \u001b[1m`wandb offline`\u001b[0m to turn off syncing.\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Syncing run \u001b[33mrun1\u001b[0m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: ⭐️ View project at \u001b[34m\u001b[4mhttps://wandb.ai/openpipe-team/classify-recipes\u001b[0m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run at \u001b[34m\u001b[4mhttps://wandb.ai/openpipe-team/classify-recipes/runs/run1\u001b[0m\n",
" 0%| | 0/230 [00:00<?, ?it/s][2023-08-24 20:20:56,099] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 1514815\u001b[39m\n",
"[2023-08-24 20:20:56,099] [INFO] [axolotl.utils.dataloader.generate_batches:181] [PID:125016] generating packed batches\u001b[39m\n",
"[2023-08-24 20:20:56,102] [INFO] [axolotl.utils.dataloader.generate_batches:187] [PID:125016] ac74b20cc92d80020bfc11a9bed8f0bf75dfc745b23630320c27a53a549d7cae\u001b[39m\n",
"[2023-08-24 20:20:56,106] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 1514815\u001b[39m\n",
"/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"{'loss': 1.7489, 'learning_rate': 2e-05, 'epoch': 0.02} \n",
" 0%|▏ | 1/230 [00:19<1:13:24, 19.24s/it][2023-08-24 20:21:34,307] [INFO] [axolotl.callbacks.on_step_end:96] [PID:125016] GPU memory usage while training: 7.107GB (+10.436GB cache, +1.190GB misc)\u001b[39m\n",
"{'loss': 1.7393, 'learning_rate': 4e-05, 'epoch': 0.04} \n",
"{'loss': 1.7469, 'learning_rate': 6e-05, 'epoch': 0.06} \n",
"{'loss': 1.7368, 'learning_rate': 8e-05, 'epoch': 0.09} \n",
"{'loss': 1.6956, 'learning_rate': 0.0001, 'epoch': 0.11} \n",
"{'loss': 1.6289, 'learning_rate': 0.00012, 'epoch': 0.13} \n",
"{'loss': 1.4673, 'learning_rate': 0.00014, 'epoch': 0.15} \n",
"{'loss': 1.2552, 'learning_rate': 0.00016, 'epoch': 0.17} \n",
"{'loss': 0.9807, 'learning_rate': 0.00018, 'epoch': 0.19} \n",
"{'loss': 0.7046, 'learning_rate': 0.0002, 'epoch': 0.22} \n",
"{'loss': 0.4783, 'learning_rate': 0.00019998952044849376, 'epoch': 0.24} \n",
"{'loss': 0.3099, 'learning_rate': 0.00019995808399039496, 'epoch': 0.26} \n",
"{'loss': 0.2095, 'learning_rate': 0.00019990569721450326, 'epoch': 0.28} \n",
"{'loss': 0.0851, 'learning_rate': 0.00019983237110061697, 'epoch': 0.3} \n",
"{'loss': 0.0949, 'learning_rate': 0.00019973812101723188, 'epoch': 0.32} \n",
"{'loss': 0.0496, 'learning_rate': 0.00019962296671832003, 'epoch': 0.35} \n",
"{'loss': 0.0415, 'learning_rate': 0.00019948693233918952, 'epoch': 0.37} \n",
"{'loss': 0.0405, 'learning_rate': 0.00019933004639142605, 'epoch': 0.39} \n",
"{'loss': 0.0451, 'learning_rate': 0.000199152341756917, 'epoch': 0.41} \n",
"{'loss': 0.0326, 'learning_rate': 0.00019895385568095982, 'epoch': 0.43} \n",
" 9%|███▍ | 20/230 [06:15<1:05:47, 18.80s/it][2023-08-24 20:27:11,801] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"[2023-08-24 20:27:11,810] [INFO] [axolotl.utils.dataloader.generate_batches:181] [PID:125016] generating packed batches\u001b[39m\n",
"[2023-08-24 20:27:11,810] [INFO] [axolotl.utils.dataloader.generate_batches:187] [PID:125016] c0ef04db402ba917eb072daff58b8c0ef38c662600f92eee3292e60918d59b78\u001b[39m\n",
"[2023-08-24 20:27:11,810] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"/usr/local/lib/python3.10/dist-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"[2023-08-24 20:27:13,176] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 20:27:13,176] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"[2023-08-24 20:27:13,177] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 0%| | 0/8 [00:00<?, ?it/s]\u001b[A[2023-08-24 20:27:14,581] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 20:27:14,582] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 25%|███████████▎ | 2/8 [00:01<00:04, 1.42it/s]\u001b[A[2023-08-24 20:27:16,012] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 20:27:16,013] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 38%|████████████████▉ | 3/8 [00:02<00:05, 1.01s/it]\u001b[A[2023-08-24 20:27:17,381] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 20:27:17,381] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 50%|██████████████████████▌ | 4/8 [00:04<00:04, 1.14s/it]\u001b[A[2023-08-24 20:27:18,789] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 20:27:18,789] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 62%|████████████████████████████▏ | 5/8 [00:05<00:03, 1.23s/it]\u001b[A[2023-08-24 20:27:20,178] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 20:27:20,178] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 75%|█████████████████████████████████▊ | 6/8 [00:07<00:02, 1.29s/it]\u001b[A[2023-08-24 20:27:21,602] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 20:27:21,602] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 88%|███████████████████████████████████████▍ | 7/8 [00:08<00:01, 1.33s/it]\u001b[A[2023-08-24 20:27:22,986] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 20:27:22,986] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" \u001b[A\n",
"\u001b[A{'eval_loss': 0.03450942039489746, 'eval_runtime': 11.2098, 'eval_samples_per_second': 20.161, 'eval_steps_per_second': 10.08, 'epoch': 0.43}\n",
" 9%|███▍ | 20/230 [06:26<1:05:47, 18.80s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.35s/it]\u001b[A\n",
"{'loss': 0.0336, 'learning_rate': 0.00019873462976445553, 'epoch': 0.45} \u001b[A\n",
"{'loss': 0.0329, 'learning_rate': 0.00019849470995518992, 'epoch': 0.48} \n",
"{'loss': 0.0317, 'learning_rate': 0.0001982341465382029, 'epoch': 0.5} \n",
"{'loss': 0.0319, 'learning_rate': 0.00019795299412524945, 'epoch': 0.52} \n",
"{'loss': 0.0258, 'learning_rate': 0.00019765131164335345, 'epoch': 0.54} \n",
"{'loss': 0.024, 'learning_rate': 0.000197329162322457, 'epoch': 0.56} \n",
"{'loss': 0.0251, 'learning_rate': 0.00019698661368216817, 'epoch': 0.58} \n",
"{'loss': 0.025, 'learning_rate': 0.00019662373751760934, 'epoch': 0.61} \n",
"{'loss': 0.0258, 'learning_rate': 0.00019624060988436966, 'epoch': 0.63} \n",
"{'loss': 0.0225, 'learning_rate': 0.0001958373110825644, 'epoch': 0.65} \n",
"{'loss': 0.0252, 'learning_rate': 0.00019541392564000488, 'epoch': 0.67} \n",
"{'loss': 0.0233, 'learning_rate': 0.00019497054229448223, 'epoch': 0.69} \n",
"{'loss': 0.0231, 'learning_rate': 0.0001945072539751685, 'epoch': 0.71} \n",
"{'loss': 0.0208, 'learning_rate': 0.00019402415778313977, 'epoch': 0.74} \n",
"{'loss': 0.0221, 'learning_rate': 0.00019352135497102463, 'epoch': 0.76} \n",
"{'loss': 0.0251, 'learning_rate': 0.0001929989509217824, 'epoch': 0.78} \n",
"{'loss': 0.0192, 'learning_rate': 0.0001924570551266159, 'epoch': 0.8} \n",
"{'loss': 0.021, 'learning_rate': 0.00019189578116202307, 'epoch': 0.82} \n",
"{'loss': 0.017, 'learning_rate': 0.00019131524666599233, 'epoch': 0.84} \n",
"{'loss': 0.0235, 'learning_rate': 0.00019071557331334669, 'epoch': 0.86} \n",
"\u001b[A{'eval_loss': 0.018499867990612984, 'eval_runtime': 11.1626, 'eval_samples_per_second': 20.246, 'eval_steps_per_second': 10.123, 'epoch': 0.86}\n",
" 17%|███████▎ | 40/230 [12:53<59:13, 18.70s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.34s/it]\u001b[A\n",
"{'loss': 0.0207, 'learning_rate': 0.0001900968867902419, 'epoch': 0.89} \u001b[A\n",
"{'loss': 0.0188, 'learning_rate': 0.00018945931676782373, 'epoch': 0.91} \n",
"{'loss': 0.0169, 'learning_rate': 0.0001888029968750498, 'epoch': 0.93} \n",
"{'loss': 0.0176, 'learning_rate': 0.00018812806467068268, 'epoch': 0.95} \n",
"{'loss': 0.0162, 'learning_rate': 0.00018743466161445823, 'epoch': 0.97} \n",
"{'loss': 0.0204, 'learning_rate': 0.00018672293303743738, 'epoch': 0.99} \n",
" 20%|████████▍ | 46/230 [14:46<59:16, 19.33s/it][2023-08-24 20:35:47,036] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 1514815\u001b[39m\n",
"[2023-08-24 20:35:47,036] [INFO] [axolotl.utils.dataloader.generate_batches:181] [PID:125016] generating packed batches\u001b[39m\n",
"[2023-08-24 20:35:47,038] [INFO] [axolotl.utils.dataloader.generate_batches:187] [PID:125016] 6076f9186c2a908489c30feee8b4739eb4ac652346e4a07ad9ea9efc2cefc22f\u001b[39m\n",
"[2023-08-24 20:35:47,040] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 1514815\u001b[39m\n",
"{'loss': 0.0158, 'learning_rate': 0.00018599302811154572, 'epoch': 1.02} \n",
"{'loss': 0.0182, 'learning_rate': 0.00018524509981830852, 'epoch': 1.04} \n",
"{'loss': 0.0185, 'learning_rate': 0.00018447930491678733, 'epoch': 1.06} \n",
"{'loss': 0.0169, 'learning_rate': 0.00018369580391072433, 'epoch': 1.08} \n",
"{'loss': 0.0167, 'learning_rate': 0.00018289476101490256, 'epoch': 1.1} \n",
"{'loss': 0.0177, 'learning_rate': 0.00018207634412072764, 'epoch': 1.12} \n",
"{'loss': 0.0196, 'learning_rate': 0.00018124072476103956, 'epoch': 1.15} \n",
"{'loss': 0.0165, 'learning_rate': 0.00018038807807416068, 'epoch': 1.17} \n",
"{'loss': 0.0148, 'learning_rate': 0.00017951858276718844, 'epoch': 1.19} \n",
"{'loss': 0.0149, 'learning_rate': 0.00017863242107853995, 'epoch': 1.21} \n",
"{'loss': 0.0161, 'learning_rate': 0.0001777297787397563, 'epoch': 1.23} \n",
"{'loss': 0.0165, 'learning_rate': 0.00017681084493657525, 'epoch': 1.25} \n",
"{'loss': 0.0169, 'learning_rate': 0.0001758758122692791, 'epoch': 1.28} \n",
"{'loss': 0.0183, 'learning_rate': 0.00017492487671232784, 'epoch': 1.3} \n",
"\u001b[A{'eval_loss': 0.016062971204519272, 'eval_runtime': 11.1551, 'eval_samples_per_second': 20.26, 'eval_steps_per_second': 10.13, 'epoch': 1.3}\n",
" 26%|██████████▉ | 60/230 [19:19<52:41, 18.59s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.34s/it]\u001b[A\n",
"{'loss': 0.0161, 'learning_rate': 0.00017395823757328444, 'epoch': 1.32} \n",
"{'loss': 0.0153, 'learning_rate': 0.00017297609745104184, 'epoch': 1.34} \n",
"{'loss': 0.018, 'learning_rate': 0.0001719786621933599, 'epoch': 1.36} \n",
"{'loss': 0.0128, 'learning_rate': 0.00017096614085372185, 'epoch': 1.38} \n",
"{'loss': 0.0156, 'learning_rate': 0.00016993874564751822, 'epoch': 1.41} \n",
"{'loss': 0.0139, 'learning_rate': 0.00016889669190756868, 'epoch': 1.43} \n",
"{'loss': 0.0205, 'learning_rate': 0.00016784019803899, 'epoch': 1.45} \n",
"{'loss': 0.0151, 'learning_rate': 0.0001667694854734204, 'epoch': 1.47} \n",
"{'loss': 0.0148, 'learning_rate': 0.0001656847786226095, 'epoch': 1.49} \n",
"{'loss': 0.0142, 'learning_rate': 0.00016458630483138356, 'epoch': 1.51} \n",
"{'loss': 0.0159, 'learning_rate': 0.00016347429432999602, 'epoch': 1.54} \n",
"{'loss': 0.018, 'learning_rate': 0.00016234898018587337, 'epoch': 1.56} \n",
"{'loss': 0.0149, 'learning_rate': 0.0001612105982547663, 'epoch': 1.58} \n",
"{'loss': 0.0147, 'learning_rate': 0.00016005938713131642, 'epoch': 1.6} \n",
"{'loss': 0.0138, 'learning_rate': 0.00015889558809904902, 'epoch': 1.62} \n",
"{'loss': 0.0146, 'learning_rate': 0.00015771944507980207, 'epoch': 1.64} \n",
"{'loss': 0.0156, 'learning_rate': 0.00015653120458260263, 'epoch': 1.66} \n",
"{'loss': 0.0159, 'learning_rate': 0.00015533111565200044, 'epoch': 1.69} \n",
"{'loss': 0.0152, 'learning_rate': 0.0001541194298158708, 'epoch': 1.71} \n",
"{'loss': 0.0179, 'learning_rate': 0.00015289640103269625, 'epoch': 1.73} \n",
"\u001b[A{'eval_loss': 0.015303226187825203, 'eval_runtime': 11.157, 'eval_samples_per_second': 20.256, 'eval_steps_per_second': 10.128, 'epoch': 1.73}\n",
" 35%|██████████████▌ | 80/230 [25:47<46:59, 18.80s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.34s/it]\u001b[A\n",
"{'loss': 0.016, 'learning_rate': 0.00015166228563833934, 'epoch': 1.75} \u001b[A\n",
"{'loss': 0.0149, 'learning_rate': 0.00015041734229231688, 'epoch': 1.77} \n",
"{'loss': 0.0159, 'learning_rate': 0.00014916183192358718, 'epoch': 1.79} \n",
"{'loss': 0.0181, 'learning_rate': 0.00014789601767586173, 'epoch': 1.82} \n",
"{'loss': 0.0126, 'learning_rate': 0.00014662016485245274, 'epoch': 1.84} \n",
"{'loss': 0.0195, 'learning_rate': 0.00014533454086066772, 'epoch': 1.86} \n",
"{'loss': 0.0134, 'learning_rate': 0.00014403941515576344, 'epoch': 1.88} \n",
"{'loss': 0.0131, 'learning_rate': 0.00014273505918447054, 'epoch': 1.9} \n",
"{'loss': 0.0126, 'learning_rate': 0.00014142174632810072, 'epoch': 1.92} \n",
"{'loss': 0.013, 'learning_rate': 0.0001400997518452484, 'epoch': 1.95} \n",
"{'loss': 0.0177, 'learning_rate': 0.00013876935281409907, 'epoch': 1.97} \n",
"{'loss': 0.0148, 'learning_rate': 0.00013743082807435615, 'epoch': 1.99} \n",
"{'loss': 0.0145, 'learning_rate': 0.00013608445816879866, 'epoch': 2.01} \n",
"{'loss': 0.0151, 'learning_rate': 0.00013473052528448201, 'epoch': 2.03} \n",
"{'loss': 0.0128, 'learning_rate': 0.00013336931319359426, 'epoch': 2.05} \n",
"{'loss': 0.0163, 'learning_rate': 0.00013200110719397968, 'epoch': 2.08} \n",
"{'loss': 0.016, 'learning_rate': 0.00013062619404934317, 'epoch': 2.1} \n",
"{'loss': 0.015, 'learning_rate': 0.00012924486192914705, 'epoch': 2.12} \n",
"{'loss': 0.0124, 'learning_rate': 0.00012785740034821329, 'epoch': 2.14} \n",
"{'loss': 0.0134, 'learning_rate': 0.00012646410010604397, 'epoch': 2.16} \n",
"\u001b[A{'eval_loss': 0.01423712819814682, 'eval_runtime': 11.165, 'eval_samples_per_second': 20.242, 'eval_steps_per_second': 10.121, 'epoch': 2.16}\n",
" 43%|█████████████████▊ | 100/230 [32:13<40:34, 18.73s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.34s/it]\u001b[A\n",
"{'loss': 0.0148, 'learning_rate': 0.00012506525322587207, 'epoch': 2.18} \u001b[A\n",
"{'loss': 0.0163, 'learning_rate': 0.0001236611528934562, 'epoch': 2.21} \n",
"{'loss': 0.0145, 'learning_rate': 0.00012225209339563145, 'epoch': 2.23} \n",
"{'loss': 0.0166, 'learning_rate': 0.00012083837005862946, 'epoch': 2.25} \n",
"{'loss': 0.0115, 'learning_rate': 0.00011942027918618074, 'epoch': 2.27} \n",
"{'loss': 0.0107, 'learning_rate': 0.0001179981179974121, 'epoch': 2.29} \n",
"{'loss': 0.012, 'learning_rate': 0.00011657218456455206, 'epoch': 2.31} \n",
"{'loss': 0.0129, 'learning_rate': 0.00011514277775045768, 'epoch': 2.34} \n",
"{'loss': 0.0118, 'learning_rate': 0.00011371019714597562, 'epoch': 2.36} \n",
"{'loss': 0.0138, 'learning_rate': 0.00011227474300715055, 'epoch': 2.38} \n",
"{'loss': 0.013, 'learning_rate': 0.00011083671619229408, 'epoch': 2.4} \n",
"{'loss': 0.0119, 'learning_rate': 0.00010939641809892767, 'epoch': 2.42} \n",
"{'loss': 0.0139, 'learning_rate': 0.00010795415060061243, 'epoch': 2.44} \n",
"{'loss': 0.0159, 'learning_rate': 0.00010651021598367906, 'epoch': 2.46} \n",
"{'loss': 0.0143, 'learning_rate': 0.00010506491688387127, 'epoch': 2.49} \n",
"{'loss': 0.0154, 'learning_rate': 0.00010361855622291637, 'epoch': 2.51} \n",
"{'loss': 0.0131, 'learning_rate': 0.00010217143714503508, 'epoch': 2.53} \n",
"{'loss': 0.0124, 'learning_rate': 0.00010072386295340572, 'epoch': 2.55} \n",
"{'loss': 0.0127, 'learning_rate': 9.927613704659429e-05, 'epoch': 2.57} \n",
"{'loss': 0.0141, 'learning_rate': 9.782856285496495e-05, 'epoch': 2.59} \n",
"\u001b[A{'eval_loss': 0.01440380234271288, 'eval_runtime': 11.1537, 'eval_samples_per_second': 20.262, 'eval_steps_per_second': 10.131, 'epoch': 2.59}\n",
" 52%|█████████████████████▍ | 120/230 [38:39<34:16, 18.69s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.34s/it]\u001b[A\n",
"{'loss': 0.0133, 'learning_rate': 9.638144377708367e-05, 'epoch': 2.62} \n",
"{'loss': 0.0151, 'learning_rate': 9.493508311612874e-05, 'epoch': 2.64} \n",
"{'loss': 0.0136, 'learning_rate': 9.348978401632101e-05, 'epoch': 2.66} \n",
"{'loss': 0.0129, 'learning_rate': 9.204584939938762e-05, 'epoch': 2.68} \n",
"{'loss': 0.0141, 'learning_rate': 9.060358190107234e-05, 'epoch': 2.7} \n",
"{'loss': 0.0131, 'learning_rate': 8.916328380770595e-05, 'epoch': 2.72} \n",
"{'loss': 0.0154, 'learning_rate': 8.772525699284946e-05, 'epoch': 2.75} \n",
"{'loss': 0.0119, 'learning_rate': 8.628980285402439e-05, 'epoch': 2.77} \n",
"{'loss': 0.0104, 'learning_rate': 8.485722224954237e-05, 'epoch': 2.79} \n",
"{'loss': 0.013, 'learning_rate': 8.342781543544798e-05, 'epoch': 2.81} \n",
"{'loss': 0.0112, 'learning_rate': 8.200188200258791e-05, 'epoch': 2.83} \n",
"{'loss': 0.0112, 'learning_rate': 8.057972081381927e-05, 'epoch': 2.85} \n",
"{'loss': 0.0127, 'learning_rate': 7.916162994137056e-05, 'epoch': 2.88} \n",
"{'loss': 0.0149, 'learning_rate': 7.774790660436858e-05, 'epoch': 2.9} \n",
"{'loss': 0.0178, 'learning_rate': 7.633884710654383e-05, 'epoch': 2.92} \n",
"{'loss': 0.0119, 'learning_rate': 7.493474677412794e-05, 'epoch': 2.94} \n",
"{'loss': 0.0137, 'learning_rate': 7.353589989395604e-05, 'epoch': 2.96} \n",
"{'loss': 0.0145, 'learning_rate': 7.214259965178674e-05, 'epoch': 2.98} \n",
" 60%|████████████████████████▌ | 138/230 [44:18<28:53, 18.84s/it][2023-08-24 21:05:28,422] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 1514815\u001b[39m\n",
"[2023-08-24 21:05:28,423] [INFO] [axolotl.utils.dataloader.generate_batches:181] [PID:125016] generating packed batches\u001b[39m\n",
"[2023-08-24 21:05:28,424] [INFO] [axolotl.utils.dataloader.generate_batches:187] [PID:125016] b60ca8a353f86b08d0005489b946fc3b062142d53d2ef59949adfba0b078763f\u001b[39m\n",
"[2023-08-24 21:05:28,427] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 1514815\u001b[39m\n",
"{'loss': 0.0133, 'learning_rate': 7.075513807085299e-05, 'epoch': 3.01} \n",
"{'loss': 0.0128, 'learning_rate': 6.937380595065685e-05, 'epoch': 3.03} \n",
" 61%|████████████████████████▉ | 140/230 [44:55<28:12, 18.81s/it][2023-08-24 21:05:51,811] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"[2023-08-24 21:05:51,819] [INFO] [axolotl.utils.dataloader.generate_batches:181] [PID:125016] generating packed batches\u001b[39m\n",
"[2023-08-24 21:05:51,820] [INFO] [axolotl.utils.dataloader.generate_batches:187] [PID:125016] c0ef04db402ba917eb072daff58b8c0ef38c662600f92eee3292e60918d59b78\u001b[39m\n",
"[2023-08-24 21:05:51,820] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\u001b[A{'eval_loss': 0.01361126359552145, 'eval_runtime': 11.1696, 'eval_samples_per_second': 20.233, 'eval_steps_per_second': 10.117, 'epoch': 3.03}\n",
" 61%|████████████████████████▉ | 140/230 [45:06<28:12, 18.81s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.34s/it]\u001b[A\n",
"{'loss': 0.0129, 'learning_rate': 6.799889280602031e-05, 'epoch': 3.05} \u001b[A\n",
"{'loss': 0.0133, 'learning_rate': 6.663068680640574e-05, 'epoch': 3.07} \n",
"{'loss': 0.0142, 'learning_rate': 6.526947471551798e-05, 'epoch': 3.09} \n",
"{'loss': 0.0118, 'learning_rate': 6.391554183120138e-05, 'epoch': 3.11} \n",
"{'loss': 0.0144, 'learning_rate': 6.25691719256439e-05, 'epoch': 3.14} \n",
"{'loss': 0.0142, 'learning_rate': 6.123064718590099e-05, 'epoch': 3.16} \n",
"{'loss': 0.013, 'learning_rate': 5.9900248154751616e-05, 'epoch': 3.18} \n",
"{'loss': 0.0123, 'learning_rate': 5.857825367189931e-05, 'epoch': 3.2} \n",
"{'loss': 0.0109, 'learning_rate': 5.7264940815529485e-05, 'epoch': 3.22} \n",
"{'loss': 0.0107, 'learning_rate': 5.596058484423656e-05, 'epoch': 3.24} \n",
"{'loss': 0.0104, 'learning_rate': 5.46654591393323e-05, 'epoch': 3.26} \n",
"{'loss': 0.0139, 'learning_rate': 5.337983514754723e-05, 'epoch': 3.29} \n",
"{'loss': 0.0138, 'learning_rate': 5.2103982324138244e-05, 'epoch': 3.31} \n",
"{'loss': 0.0155, 'learning_rate': 5.083816807641284e-05, 'epoch': 3.33} \n",
"{'loss': 0.0129, 'learning_rate': 4.958265770768316e-05, 'epoch': 3.35} \n",
"{'loss': 0.0143, 'learning_rate': 4.833771436166069e-05, 'epoch': 3.37} \n",
"{'loss': 0.0146, 'learning_rate': 4.710359896730379e-05, 'epoch': 3.39} \n",
"{'loss': 0.0112, 'learning_rate': 4.5880570184129215e-05, 'epoch': 3.42} \n",
"{'loss': 0.0105, 'learning_rate': 4.466888434799958e-05, 'epoch': 3.44} \n",
"{'loss': 0.0121, 'learning_rate': 4.34687954173974e-05, 'epoch': 3.46} \n",
" 70%|████████████████████████████▌ | 160/230 [51:23<21:52, 18.75s/it][2023-08-24 21:12:19,255] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\u001b[A{'eval_loss': 0.013194510713219643, 'eval_runtime': 11.1763, 'eval_samples_per_second': 20.221, 'eval_steps_per_second': 10.111, 'epoch': 3.46}\n",
" 70%|████████████████████████████▌ | 160/230 [51:34<21:52, 18.75s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.34s/it]\u001b[A\n",
"{'loss': 0.0126, 'learning_rate': 4.2280554920197936e-05, 'epoch': 3.48} \u001b[A\n",
"{'loss': 0.0139, 'learning_rate': 4.1104411900951015e-05, 'epoch': 3.5} \n",
"{'loss': 0.0094, 'learning_rate': 3.994061286868361e-05, 'epoch': 3.52} \n",
"{'loss': 0.0144, 'learning_rate': 3.878940174523371e-05, 'epoch': 3.55} \n",
"{'loss': 0.0119, 'learning_rate': 3.7651019814126654e-05, 'epoch': 3.57} \n",
"{'loss': 0.0128, 'learning_rate': 3.652570567000402e-05, 'epoch': 3.59} \n",
"{'loss': 0.0141, 'learning_rate': 3.541369516861648e-05, 'epoch': 3.61} \n",
"{'loss': 0.0118, 'learning_rate': 3.431522137739049e-05, 'epoch': 3.63} \n",
"{'loss': 0.0142, 'learning_rate': 3.323051452657961e-05, 'epoch': 3.65} \n",
"{'loss': 0.0149, 'learning_rate': 3.215980196101002e-05, 'epoch': 3.68} \n",
"{'loss': 0.0106, 'learning_rate': 3.110330809243134e-05, 'epoch': 3.7} \n",
"{'loss': 0.0112, 'learning_rate': 3.0061254352481804e-05, 'epoch': 3.72} \n",
"{'loss': 0.0103, 'learning_rate': 2.9033859146278197e-05, 'epoch': 3.74} \n",
"{'loss': 0.0133, 'learning_rate': 2.8021337806640135e-05, 'epoch': 3.76} \n",
"{'loss': 0.0138, 'learning_rate': 2.702390254895819e-05, 'epoch': 3.78} \n",
"{'loss': 0.0095, 'learning_rate': 2.6041762426715566e-05, 'epoch': 3.81} \n",
"{'loss': 0.0152, 'learning_rate': 2.5075123287672175e-05, 'epoch': 3.83} \n",
"{'loss': 0.0126, 'learning_rate': 2.4124187730720917e-05, 'epoch': 3.85} \n",
"{'loss': 0.0106, 'learning_rate': 2.3189155063424782e-05, 'epoch': 3.87} \n",
"{'loss': 0.0135, 'learning_rate': 2.2270221260243673e-05, 'epoch': 3.89} \n",
" 78%|████████████████████████████████ | 180/230 [57:52<15:42, 18.85s/it][2023-08-24 21:18:48,116] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\u001b[A{'eval_loss': 0.013087373226881027, 'eval_runtime': 11.1759, 'eval_samples_per_second': 20.222, 'eval_steps_per_second': 10.111, 'epoch': 3.89}\n",
" 78%|████████████████████████████████ | 180/230 [58:03<15:42, 18.85s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.34s/it]\u001b[A\n",
"{'loss': 0.0128, 'learning_rate': 2.1367578921460074e-05, 'epoch': 3.91} \n",
"{'loss': 0.0103, 'learning_rate': 2.0481417232811573e-05, 'epoch': 3.94} \n",
"{'loss': 0.0099, 'learning_rate': 1.961192192583934e-05, 'epoch': 3.96} \n",
"{'loss': 0.0112, 'learning_rate': 1.8759275238960473e-05, 'epoch': 3.98} \n",
"{'loss': 0.0132, 'learning_rate': 1.7923655879272393e-05, 'epoch': 4.0} \n",
" 80%|████████████████████████████████▉ | 185/230 [59:38<14:45, 19.68s/it][2023-08-24 21:20:34,361] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 1514815\u001b[39m\n",
"[2023-08-24 21:20:34,361] [INFO] [axolotl.utils.dataloader.generate_batches:181] [PID:125016] generating packed batches\u001b[39m\n",
"[2023-08-24 21:20:34,363] [INFO] [axolotl.utils.dataloader.generate_batches:187] [PID:125016] b61d2abf3bc15c84376d0af3386cd5fac907d76f1a3fd6fec08d54c6b52d49cb\u001b[39m\n",
"[2023-08-24 21:20:34,365] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 1514815\u001b[39m\n",
"{'loss': 0.0102, 'learning_rate': 1.7105238985097472e-05, 'epoch': 4.02} \n",
"{'loss': 0.0113, 'learning_rate': 1.6304196089275658e-05, 'epoch': 4.04} \n",
"{'loss': 0.0117, 'learning_rate': 1.5520695083212678e-05, 'epoch': 4.06} \n",
"{'loss': 0.012, 'learning_rate': 1.4754900181691467e-05, 'epoch': 4.09} \n",
"{'loss': 0.0144, 'learning_rate': 1.4006971888454323e-05, 'epoch': 4.11} \n",
"{'loss': 0.0102, 'learning_rate': 1.3277066962562645e-05, 'epoch': 4.13} \n",
"{'loss': 0.0132, 'learning_rate': 1.2565338385541792e-05, 'epoch': 4.15} \n",
"{'loss': 0.013, 'learning_rate': 1.1871935329317363e-05, 'epoch': 4.17} \n",
"{'loss': 0.0137, 'learning_rate': 1.1197003124950222e-05, 'epoch': 4.19} \n",
"{'loss': 0.0114, 'learning_rate': 1.0540683232176307e-05, 'epoch': 4.22} \n",
"{'loss': 0.0119, 'learning_rate': 9.903113209758096e-06, 'epoch': 4.24} \n",
"{'loss': 0.0126, 'learning_rate': 9.284426686653303e-06, 'epoch': 4.26} \n",
"{'loss': 0.0137, 'learning_rate': 8.68475333400769e-06, 'epoch': 4.28} \n",
"{'loss': 0.0132, 'learning_rate': 8.10421883797694e-06, 'epoch': 4.3} \n",
"{'loss': 0.0111, 'learning_rate': 7.542944873384106e-06, 'epoch': 4.32} \n",
" 87%|█████████████████████████████████▉ | 200/230 [1:04:20<09:26, 18.88s/it][2023-08-24 21:25:17,008] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\u001b[A{'eval_loss': 0.013082703575491905, 'eval_runtime': 11.1685, 'eval_samples_per_second': 20.235, 'eval_steps_per_second': 10.118, 'epoch': 4.32}\n",
" 87%|█████████████████████████████████▉ | 200/230 [1:04:32<09:26, 18.88s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.34s/it]\u001b[A\n",
"{'loss': 0.0109, 'learning_rate': 7.0010490782176145e-06, 'epoch': 4.35} \u001b[A\n",
"{'loss': 0.0151, 'learning_rate': 6.4786450289753715e-06, 'epoch': 4.37} \n",
"{'loss': 0.016, 'learning_rate': 5.975842216860239e-06, 'epoch': 4.39} \n",
"{'loss': 0.0119, 'learning_rate': 5.492746024831541e-06, 'epoch': 4.41} \n",
"{'loss': 0.0132, 'learning_rate': 5.029457705517793e-06, 'epoch': 4.43} \n",
"{'loss': 0.0109, 'learning_rate': 4.586074359995119e-06, 'epoch': 4.45} \n",
"{'loss': 0.01, 'learning_rate': 4.162688917435631e-06, 'epoch': 4.48} \n",
"{'loss': 0.0125, 'learning_rate': 3.7593901156303566e-06, 'epoch': 4.5} \n",
"{'loss': 0.0137, 'learning_rate': 3.3762624823906573e-06, 'epoch': 4.52} \n",
"{'loss': 0.0122, 'learning_rate': 3.0133863178318232e-06, 'epoch': 4.54} \n",
"{'loss': 0.0125, 'learning_rate': 2.6708376775430033e-06, 'epoch': 4.56} \n",
"{'loss': 0.0132, 'learning_rate': 2.3486883566465777e-06, 'epoch': 4.58} \n",
"{'loss': 0.0118, 'learning_rate': 2.0470058747505516e-06, 'epoch': 4.61} \n",
"{'loss': 0.0124, 'learning_rate': 1.7658534617971067e-06, 'epoch': 4.63} \n",
"{'loss': 0.0121, 'learning_rate': 1.5052900448100815e-06, 'epoch': 4.65} \n",
"{'loss': 0.0145, 'learning_rate': 1.2653702355444608e-06, 'epoch': 4.67} \n",
"{'loss': 0.0133, 'learning_rate': 1.0461443190402099e-06, 'epoch': 4.69} \n",
"{'loss': 0.0132, 'learning_rate': 8.476582430830049e-07, 'epoch': 4.71} \n",
"{'loss': 0.0106, 'learning_rate': 6.699536085739588e-07, 'epoch': 4.74} \n",
"{'loss': 0.0106, 'learning_rate': 5.130676608104845e-07, 'epoch': 4.76} \n",
" 96%|█████████████████████████████████████▎ | 220/230 [1:10:47<03:08, 18.85s/it][2023-08-24 21:31:44,045] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
" 38%|████████████████▉ | 3/8 [00:02<00:05, 1.00s/it]\u001b[A[2023-08-24 21:31:49,595] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 21:31:49,596] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 50%|██████████████████████▌ | 4/8 [00:04<00:04, 1.13s/it]\u001b[A[2023-08-24 21:31:50,994] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 21:31:50,994] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 62%|████████████████████████████▏ | 5/8 [00:05<00:03, 1.23s/it]\u001b[A[2023-08-24 21:31:52,376] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 21:31:52,377] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 75%|█████████████████████████████████▊ | 6/8 [00:06<00:02, 1.28s/it]\u001b[A[2023-08-24 21:31:53,798] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 21:31:53,799] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" 88%|███████████████████████████████████████▍ | 7/8 [00:08<00:01, 1.32s/it]\u001b[A[2023-08-24 21:31:55,183] [INFO] [accelerate.accelerator.log:60] [PID:125016] The used dataset had no length, returning gathered tensors. You should drop the remainder yourself.\n",
"[2023-08-24 21:31:55,183] [INFO] [axolotl.utils.dataloader._len_est:262] [PID:125016] packing_efficiency_estimate: 0.98 total_num_tokens per device: 79978\u001b[39m\n",
"\n",
" \u001b[A\n",
"\u001b[A{'eval_loss': 0.01321522518992424, 'eval_runtime': 11.1631, 'eval_samples_per_second': 20.245, 'eval_steps_per_second': 10.123, 'epoch': 4.76}\n",
" 96%|█████████████████████████████████████▎ | 220/230 [1:10:59<03:08, 18.85s/it]\n",
"100%|█████████████████████████████████████████████| 8/8 [00:09<00:00, 1.34s/it]\u001b[A\n",
"{'loss': 0.013, 'learning_rate': 3.7703328167999485e-07, 'epoch': 4.78} \u001b[A\n",
"{'loss': 0.0115, 'learning_rate': 2.6187898276813784e-07, 'epoch': 4.8} \n",
"{'loss': 0.0091, 'learning_rate': 1.6762889938303217e-07, 'epoch': 4.82} \n",
"{'loss': 0.0112, 'learning_rate': 9.430278549675819e-08, 'epoch': 4.84} \n",
"{'loss': 0.0111, 'learning_rate': 4.191600960505859e-08, 'epoch': 4.86} \n",
"{'loss': 0.0109, 'learning_rate': 1.0479551506259456e-08, 'epoch': 4.89} \n",
"{'loss': 0.0148, 'learning_rate': 0.0, 'epoch': 4.91} \n",
"{'loss': 0.0152, 'learning_rate': 1.0479551506270558e-08, 'epoch': 4.93} \n",
"{'loss': 0.0119, 'learning_rate': 4.191600960505859e-08, 'epoch': 4.95} \n",
"{'loss': 0.0092, 'learning_rate': 9.430278549675819e-08, 'epoch': 4.97} \n",
"{'train_runtime': 4450.2443, 'train_samples_per_second': 4.803, 'train_steps_per_second': 0.052, 'train_loss': 0.08349336267084531, 'epoch': 4.97}\n",
"100%|███████████████████████████████████████| 230/230 [1:14:07<00:00, 19.34s/it]\n",
"[2023-08-24 21:35:04,013] [INFO] [axolotl.scripts.train:303] [PID:125016] Training Completed!!! Saving pre-trained model to ./models/run1\u001b[39m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Waiting for W&B process to finish... \u001b[32m(success).\u001b[0m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: \n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Run history:\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss █▃▂▂▁▁▁▁▁▁▁\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime █▂▁▁▂▁▃▄▄▃▂\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second ▁▇██▇█▆▅▅▆▇\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second ▁▇██▇█▆▅▅▆▇\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step ▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate ▂▅██████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/loss ██▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/total_flos ▁\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/train_loss ▁\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/train_runtime ▁\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/train_samples_per_second ▁\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/train_steps_per_second ▁\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: \n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Run summary:\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: eval/loss 0.01322\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: eval/runtime 11.1631\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: eval/samples_per_second 20.245\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: eval/steps_per_second 10.123\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/epoch 4.97\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/global_step 230\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/learning_rate 0.0\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/loss 0.0092\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/total_flos 2.966052920056873e+17\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/train_loss 0.08349\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/train_runtime 4450.2443\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/train_samples_per_second 4.803\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: train/train_steps_per_second 0.052\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: \n",
"\u001b[34m\u001b[1mwandb\u001b[0m: 🚀 View run \u001b[33mrun1\u001b[0m at: \u001b[34m\u001b[4mhttps://wandb.ai/openpipe-team/classify-recipes/runs/run1\u001b[0m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: ️⚡ View job at \u001b[34m\u001b[4mhttps://wandb.ai/openpipe-team/classify-recipes/jobs/QXJ0aWZhY3RDb2xsZWN0aW9uOjkyNjYwODUw/version_details/v0\u001b[0m\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Synced 5 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)\n",
"\u001b[34m\u001b[1mwandb\u001b[0m: Find logs at: \u001b[35m\u001b[1m./wandb/run-20230824_202055-run1/logs\u001b[0m\n",
"\u001b[0m"
]
}
],
"source": [
"!accelerate launch ./axolotl/scripts/finetune.py training-config.yaml"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Sweet! I now have a new directory `./models/run1`. This contains my trained model, which I can use to classify more recipes.\n",
"\n",
"There's one more step though. I trained the model using [LoRA](https://huggingface.co/docs/peft/conceptual_guides/lora), a memory-efficient training method. But the inference library we'll use for testing doesn't yet support LoRA adapters directly, so we need to \"merge\" the adapter into the base weights, producing a standard Llama2-shaped model. I've defined a small helper called `merge_lora_model` to do that, which I'll use below."
]
},
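{
"cell_type": "markdown",
"metadata": {},
"source": [
"Under the hood, merging a LoRA adapter comes down to a few `peft` calls. Here's a minimal sketch of the idea -- the base-model name and paths are illustrative placeholders, and the real logic lives in `merge_lora_model` in `utils.py`:\n",
"\n",
"```python\n",
"# Hypothetical sketch of a LoRA merge; see utils.merge_lora_model for the actual helper.\n",
"from transformers import AutoModelForCausalLM\n",
"from peft import PeftModel\n",
"\n",
"# Load the full-precision base model (placeholder model name).\n",
"base = AutoModelForCausalLM.from_pretrained(\"meta-llama/Llama-2-7b-hf\")\n",
"\n",
"# Attach the trained LoRA adapter weights on top of the base model.\n",
"model = PeftModel.from_pretrained(base, \"./models/run1\")\n",
"\n",
"# Fold the adapter weights into the base weights and drop the adapter layers.\n",
"merged = model.merge_and_unload()\n",
"\n",
"# The result is a standard Llama2-shaped checkpoint any inference library can load.\n",
"merged.save_pretrained(\"./models/run1/merged\")\n",
"```"
]
},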
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Merging model (this could take a while)\n",
"Loading base model\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "13b36646399a45eab184327f17165046",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading PEFT model\n",
"Running merge_and_unload\n",
"Model saved to ./models/run1/merged\n",
"Final model saved to './models/run1/merged'\n"
]
}
],
"source": [
"from utils import merge_lora_model\n",
"\n",
"print(\"Merging model (this could take a while)\")\n",
"final_model_dir = merge_lora_model(\"training-config.yaml\")\n",
"print(f\"Final model saved to '{final_model_dir}'\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Ok, I have a model, but is it actually any good? I'll run some evaluations in [./evaluate.ipynb](./evaluate.ipynb) to check."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}