{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": [],
      "machine_shape": "hm",
      "gpuType": "V100",
      "authorship_tag": "ABX9TyPNl/WKBYXOzuJCP/puYm6d",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    },
    "accelerator": "GPU"
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/mlabonne/llm-course/blob/main/Fine_tune_Llama_2_in_Google_Colab.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Fine-tune Llama 2 in Google Colab\n",
        "> 🗣️ Large Language Model Course\n",
        "\n",
        "❤️ Created by [@maximelabonne](), based on Younes Belkada's [GitHub Gist](https://gist.github.com/younesbelkada/9f7f75c94bdc1981c8ca5cc937d4a4da).\n",
        "\n",
        "This notebook runs on a T4 GPU with high RAM. (Last update: 23 Jul 2023)\n"
      ],
      "metadata": {
        "id": "OSHlAbqzDFDq"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "GLXwJqbjtPho"
      },
      "outputs": [],
      "source": [
        "!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "import os\n",
        "import torch\n",
        "from datasets import load_dataset\n",
        "from transformers import (\n",
        "    AutoModelForCausalLM,\n",
        "    AutoTokenizer,\n",
        "    BitsAndBytesConfig,\n",
        "    HfArgumentParser,\n",
        "    TrainingArguments,\n",
        "    pipeline,\n",
        "    logging,\n",
        ")\n",
        "from peft import LoraConfig, PeftModel\n",
        "from trl import SFTTrainer"
      ],
      "metadata": {
        "id": "nAMzy_0FtaUZ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# The model that you want to train from the Hugging Face hub\n",
        "model_name = \"daryl149/llama-2-7b-chat-hf\"\n",
        "\n",
        "# The instruction dataset to use\n",
        "dataset_name = \"mlabonne/guanaco-llama2-1k\"\n",
        "\n",
        "# Fine-tuned model name\n",
        "new_model = \"llama-2-7b-guanaco\"\n",
        "\n",
        "################################################################################\n",
        "# QLoRA parameters\n",
        "################################################################################\n",
        "\n",
        "# LoRA attention dimension\n",
        "lora_r = 64\n",
        "\n",
        "# Alpha parameter for LoRA scaling\n",
        "lora_alpha = 16\n",
        "\n",
        "# Dropout probability for LoRA layers\n",
        "lora_dropout = 0.1\n",
        "\n",
        "################################################################################\n",
        "# bitsandbytes parameters\n",
        "################################################################################\n",
        "\n",
        "# Activate 4-bit precision base model loading\n",
        "use_4bit = True\n",
        "\n",
        "# Compute dtype for 4-bit base models\n",
        "bnb_4bit_compute_dtype = \"float16\"\n",
        "\n",
        "# Quantization type (fp4 or nf4)\n",
        "bnb_4bit_quant_type = \"nf4\"\n",
        "\n",
        "# Activate nested quantization for 4-bit base models (double quantization)\n",
        "use_nested_quant = False\n",
        "\n",
        "################################################################################\n",
        "# TrainingArguments parameters\n",
        "################################################################################\n",
        "\n",
        "# Output directory where the model predictions and checkpoints will be stored\n",
        "output_dir = \"./results\"\n",
        "\n",
        "# Number of training epochs\n",
        "num_train_epochs = 1\n",
        "\n",
        "# Enable fp16/bf16 training (set bf16 to True with an A100)\n",
        "fp16 = False\n",
        "bf16 = False\n",
        "\n",
        "# Batch size per GPU for training\n",
        "per_device_train_batch_size = 4\n",
        "\n",
        "# Batch size per GPU for evaluation\n",
        "per_device_eval_batch_size = 4\n",
        "\n",
        "# Number of update steps to accumulate the gradients for\n",
        "gradient_accumulation_steps = 1\n",
        "\n",
        "# Enable gradient checkpointing\n",
        "gradient_checkpointing = True\n",
        "\n",
        "# Maximum gradient norm (gradient clipping)\n",
        "max_grad_norm = 0.3\n",
        "\n",
        "# Initial learning rate (AdamW optimizer)\n",
        "learning_rate = 2e-4\n",
        "\n",
        "# Weight decay to apply to all layers except bias/LayerNorm weights\n",
        "weight_decay = 0.001\n",
        "\n",
        "# Optimizer to use\n",
        "optim = \"paged_adamw_32bit\"\n",
        "\n",
        "# Learning rate schedule (constant is a bit better than cosine)\n",
        "lr_scheduler_type = \"constant\"\n",
        "\n",
        "# Number of training steps (overrides num_train_epochs)\n",
        "max_steps = -1\n",
        "\n",
        "# Ratio of steps for a linear warmup (from 0 to learning rate)\n",
        "warmup_ratio = 0.03\n",
        "\n",
        "# Group sequences into batches with the same length\n",
        "# Saves memory and speeds up training considerably\n",
        "group_by_length = True\n",
        "\n",
        "# Save a checkpoint every X update steps\n",
        "save_steps = 10\n",
        "\n",
        "# Log every X update steps\n",
        "logging_steps = 1\n",
        "\n",
        "################################################################################\n",
        "# SFT parameters\n",
        "################################################################################\n",
        "\n",
        "# Maximum sequence length to use\n",
        "max_seq_length = None\n",
        "\n",
        "# Pack multiple short examples in the same input sequence to increase efficiency\n",
        "packing = False\n",
        "\n",
        "# Load the entire model on GPU 0\n",
        "device_map = {\"\": 0}"
      ],
      "metadata": {
        "id": "ib_We3NLtj2E"
      },
      "execution_count": null,
      "outputs": []
    },
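    {
      "cell_type": "markdown",
      "source": [
        "As a quick sanity check on the values above: in PEFT, the LoRA update is scaled by `lora_alpha / lora_r`, so this configuration applies a factor of 16 / 64 = 0.25 to the adapter output."
      ],
      "metadata": {}
    },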
    {
      "cell_type": "code",
      "source": [
        "# Load dataset (you can process it here)\n",
        "dataset = load_dataset(dataset_name, split=\"train\")\n",
        "\n",
        "# Load tokenizer and model with QLoRA configuration\n",
        "compute_dtype = getattr(torch, bnb_4bit_compute_dtype)\n",
        "\n",
        "bnb_config = BitsAndBytesConfig(\n",
        "    load_in_4bit=use_4bit,\n",
        "    bnb_4bit_quant_type=bnb_4bit_quant_type,\n",
        "    bnb_4bit_compute_dtype=compute_dtype,\n",
        "    bnb_4bit_use_double_quant=use_nested_quant,\n",
        ")\n",
        "\n",
        "# Check GPU compatibility with bfloat16\n",
        "if compute_dtype == torch.float16 and use_4bit:\n",
        "    major, _ = torch.cuda.get_device_capability()\n",
        "    if major >= 8:\n",
        "        print(\"=\" * 80)\n",
        "        print(\"Your GPU supports bfloat16: accelerate training with bf16=True\")\n",
        "        print(\"=\" * 80)\n",
        "\n",
        "# Load base model\n",
        "model = AutoModelForCausalLM.from_pretrained(\n",
        "    model_name,\n",
        "    quantization_config=bnb_config,\n",
        "    device_map=device_map\n",
        ")\n",
        "model.config.use_cache = False\n",
        "model.config.pretraining_tp = 1\n",
        "\n",
        "# Load LoRA configuration\n",
        "peft_config = LoraConfig(\n",
        "    lora_alpha=lora_alpha,\n",
        "    lora_dropout=lora_dropout,\n",
        "    r=lora_r,\n",
        "    bias=\"none\",\n",
        "    task_type=\"CAUSAL_LM\",\n",
        ")\n",
        "\n",
        "# Load LLaMA tokenizer\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "tokenizer.padding_side = \"right\" # Fix weird overflow issue with fp16 training\n",
        "\n",
        "# Set training parameters\n",
        "training_arguments = TrainingArguments(\n",
        "    output_dir=output_dir,\n",
        "    num_train_epochs=num_train_epochs,\n",
        "    per_device_train_batch_size=per_device_train_batch_size,\n",
        "    gradient_accumulation_steps=gradient_accumulation_steps,\n",
        "    optim=optim,\n",
        "    save_steps=save_steps,\n",
        "    logging_steps=logging_steps,\n",
        "    learning_rate=learning_rate,\n",
        "    weight_decay=weight_decay,\n",
        "    fp16=fp16,\n",
        "    bf16=bf16,\n",
        "    max_grad_norm=max_grad_norm,\n",
        "    max_steps=max_steps,\n",
        "    warmup_ratio=warmup_ratio,\n",
        "    group_by_length=group_by_length,\n",
        "    lr_scheduler_type=lr_scheduler_type,\n",
        "    report_to=\"tensorboard\"\n",
        ")\n",
        "\n",
        "# Set supervised fine-tuning parameters\n",
        "trainer = SFTTrainer(\n",
        "    model=model,\n",
        "    train_dataset=dataset,\n",
        "    peft_config=peft_config,\n",
        "    dataset_text_field=\"text\",\n",
        "    max_seq_length=max_seq_length,\n",
        "    tokenizer=tokenizer,\n",
        "    args=training_arguments,\n",
        "    packing=packing,\n",
        ")\n",
        "\n",
        "# Train model\n",
        "trainer.train()\n",
        "\n",
        "# Save trained model\n",
        "trainer.model.save_pretrained(output_dir)"
      ],
      "metadata": {
        "id": "OJXpOgBFuSrc"
      },
      "execution_count": null,
      "outputs": []
    },
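    {
      "cell_type": "markdown",
      "source": [
        "Training metrics are reported to TensorBoard (`report_to` is set to `tensorboard` above). The next cell is an optional sketch, not part of the original workflow, for browsing those logs from Colab; it assumes the default `runs/` subdirectory that `transformers` creates inside `./results`."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Optional sketch: visualize the TensorBoard logs written during training\n",
        "# Assumes the default logging directory <output_dir>/runs, i.e. ./results/runs here\n",
        "%load_ext tensorboard\n",
        "%tensorboard --logdir results/runs"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },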
    {
      "cell_type": "code",
      "source": [
        "logging.set_verbosity(logging.CRITICAL)\n",
        "pipe = pipeline(task=\"text-generation\", model=model, tokenizer=tokenizer, max_length=200)\n",
        "result = pipe(\"Tell me a joke\")\n",
        "print(result[0]['generated_text'])"
      ],
      "metadata": {
        "id": "frlSLPin4IJ4"
      },
      "execution_count": null,
      "outputs": []
    },
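    {
      "cell_type": "markdown",
      "source": [
        "The examples in `mlabonne/guanaco-llama2-1k` are formatted with Llama 2's `[INST]` chat template, so inference prompts generally work better when wrapped the same way. The next cell is a small sketch of that idea; the prompt string is only illustrative."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Sketch: wrap the prompt in Llama 2's [INST] template to match the training data format\n",
        "# The prompt below is only an example\n",
        "prompt = \"What is a large language model?\"\n",
        "result = pipe(f\"<s>[INST] {prompt} [/INST]\")\n",
        "print(result[0]['generated_text'])"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },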
    {
      "cell_type": "markdown",
      "source": [
        "There is a problem with the VRAM here: it is not fully released despite `del model` and emptying the cache. You probably need to restart the notebook, re-execute the first three cells, and then execute this one. Please contact me if you have a fix!"
      ],
      "metadata": {
        "id": "6WjzALHtSfdb"
      }
    },
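    {
      "cell_type": "markdown",
      "source": [
        "For reference, the next cell is a sketch (not part of the original flow) of the cleanup that is usually attempted before reloading the base model in FP16: delete the large objects, run the garbage collector, and empty the CUDA cache. As noted above, a runtime restart is often still needed."
      ],
      "metadata": {}
    },
    {
      "cell_type": "code",
      "source": [
        "# Sketch of the usual VRAM cleanup before reloading the base model in FP16\n",
        "# In practice this may not free enough memory and a runtime restart can still be required\n",
        "import gc\n",
        "\n",
        "del model\n",
        "del pipe\n",
        "del trainer\n",
        "gc.collect()\n",
        "torch.cuda.empty_cache()"
      ],
      "metadata": {},
      "execution_count": null,
      "outputs": []
    },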
    {
      "cell_type": "code",
      "source": [
        "# Reload model in FP16 and merge it with LoRA weights\n",
        "base_model = AutoModelForCausalLM.from_pretrained(\n",
        "    model_name,\n",
        "    low_cpu_mem_usage=True,\n",
        "    return_dict=True,\n",
        "    torch_dtype=torch.float16,\n",
        "    device_map=device_map,\n",
        ")\n",
        "model = PeftModel.from_pretrained(base_model, output_dir)\n",
        "model = model.merge_and_unload()\n",
        "\n",
        "# Reload tokenizer to save it\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "tokenizer.padding_side = \"right\""
      ],
      "metadata": {
        "id": "QQn30cRtAZ-P"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "!huggingface-cli login\n",
        "\n",
        "model.push_to_hub(new_model, use_temp_dir=False)\n",
        "tokenizer.push_to_hub(new_model, use_temp_dir=False)"
      ],
      "metadata": {
        "id": "x-xPb-_qB0dz"
      },
      "execution_count": null,
      "outputs": []
    }
  ]
}