FinGPT/fingpt/FinGPT-v3/training_8bit/train.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# only for WSL\n",
"import os \n",
"os.environ[\"PATH\"] = f\"{os.environ['PATH']}:/usr/local/cuda/bin\"\n",
"os.environ['LD_LIBRARY_PATH'] = \"/usr/lib/wsl/lib:/usr/local/cuda/lib64\""
]
},
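{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (added as a sketch, not part of the original run): confirm that PyTorch can actually see the GPU configured above before the 6B model is loaded."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: verify that the CUDA runtime configured above is visible to PyTorch.\n",
"import torch\n",
"print(\"torch:\", torch.__version__)\n",
"print(\"CUDA available:\", torch.cuda.is_available())\n",
"if torch.cuda.is_available():\n",
"    print(\"device:\", torch.cuda.get_device_name(0))"
]
},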
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2023-07-15 02:35:25,700] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
"\n",
"===================================BUG REPORT===================================\n",
"Welcome to bitsandbytes. For bug reports, please run\n",
"\n",
"python -m bitsandbytes\n",
"\n",
" and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
"================================================================================\n",
"bin /home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/libbitsandbytes_cuda120.so\n",
"CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so\n",
"CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
"CUDA SETUP: Detected CUDA version 120\n",
"CUDA SETUP: Loading binary /home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/libbitsandbytes_cuda120.so...\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/cuda_setup/main.py:149: UserWarning: /home/oliverwang15/miniconda3/envs/fingpt did not contain ['libcudart.so', 'libcudart.so.11.0', 'libcudart.so.12.0'] as expected! Searching further paths...\n",
" warn(msg)\n"
]
}
],
"source": [
"from typing import List, Dict, Optional\n",
"\n",
"import datasets\n",
"import torch\n",
"from loguru import logger\n",
"from datasets import load_dataset\n",
"from transformers import (\n",
" AutoModel,\n",
" AutoTokenizer,\n",
" TrainingArguments,\n",
" Trainer,\n",
" BitsAndBytesConfig\n",
")\n",
"from peft import (\n",
" TaskType,\n",
" LoraConfig,\n",
" get_peft_model,\n",
" set_peft_model_state_dict,\n",
" prepare_model_for_kbit_training,\n",
" prepare_model_for_int8_training,\n",
")\n",
"from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"training_args = TrainingArguments(\n",
" output_dir='./finetuned_model', # saved model path\n",
" logging_steps = 500,\n",
" # max_steps=10000,\n",
" num_train_epochs = 2,\n",
" per_device_train_batch_size=4,\n",
" gradient_accumulation_steps=8,\n",
" learning_rate=1e-4,\n",
" weight_decay=0.01,\n",
" warmup_steps=1000,\n",
" save_steps=500,\n",
" fp16=True,\n",
" # bf16=True,\n",
" torch_compile = False,\n",
" load_best_model_at_end = True,\n",
" evaluation_strategy=\"steps\",\n",
" remove_unused_columns=False,\n",
"\n",
" )"
]
},
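{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick note on the arguments above (small sketch added for clarity): with `per_device_train_batch_size=4` and `gradient_accumulation_steps=8`, each optimizer step sees an effective batch of 32 samples on a single GPU."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Effective batch size per optimizer step (assuming a single GPU, as in this notebook).\n",
"effective_batch_size = (\n",
"    training_args.per_device_train_batch_size\n",
"    * training_args.gradient_accumulation_steps\n",
")\n",
"print(\"Effective batch size:\", effective_batch_size)  # 4 * 8 = 32"
]
},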
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# # Quantization\n",
"# q_config = BitsAndBytesConfig(load_in_4bit=True,\n",
"# bnb_4bit_quant_type='nf4',\n",
"# bnb_4bit_use_double_quant=True,\n",
"# bnb_4bit_compute_dtype=torch.float16\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "278d0501a0f94fdabe335bc98e0af31b",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Loading checkpoint shards: 0%| | 0/7 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/peft/utils/other.py:102: FutureWarning: prepare_model_for_int8_training is deprecated and will be removed in a future version. Use prepare_model_for_kbit_training instead.\n",
" warnings.warn(\n"
]
}
],
"source": [
"# Load tokenizer & model\n",
"model_name = \"THUDM/chatglm2-6b\"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
"model = AutoModel.from_pretrained(\n",
" model_name, \n",
" # quantization_config=q_config,\n",
" load_in_8bit = True,\n",
" trust_remote_code=True, \n",
" device='cuda'\n",
" )\n",
"model = prepare_model_for_int8_training(model, use_gradient_checkpointing=True)"
]
},
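{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional check (a sketch; it assumes the remote-code ChatGLM2 class inherits `PreTrainedModel.get_memory_footprint` from transformers): a rough estimate of the memory taken by the 8-bit weights."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rough memory footprint of the 8-bit quantized weights, in GB.\n",
"print(f\"Model memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB\")"
]
},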
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def print_trainable_parameters(model):\n",
" \"\"\"\n",
" Prints the number of trainable parameters in the model.\n",
" \"\"\"\n",
" trainable_params = 0\n",
" all_param = 0\n",
" for _, param in model.named_parameters():\n",
" all_param += param.numel()\n",
" if param.requires_grad:\n",
" trainable_params += param.numel()\n",
" print(\n",
" f\"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 1949696 || all params: 6245533696 || trainable%: 0.031217444255383614\n"
]
}
],
"source": [
"# LoRA\n",
"target_modules = TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING['chatglm']\n",
"lora_config = LoraConfig(\n",
" task_type=TaskType.CAUSAL_LM,\n",
" inference_mode=False,\n",
" r=8,\n",
" lora_alpha=32,\n",
" lora_dropout=0.1,\n",
" target_modules=target_modules,\n",
" bias='none',\n",
")\n",
"model = get_peft_model(model, lora_config)\n",
"print_trainable_parameters(model)"
]
},
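{
"cell_type": "markdown",
"metadata": {},
"source": [
"Small inspection cell (added as a sketch): print which modules the PEFT mapping resolved for `chatglm`, i.e. where the LoRA adapters were injected (for ChatGLM this is typically the fused attention projection)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Show the modules the LoRA adapters were attached to.\n",
"print(\"LoRA target modules:\", target_modules)"
]
},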
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"resume_from_checkpoint = None\n",
"if resume_from_checkpoint is not None:\n",
" checkpoint_name = os.path.join(resume_from_checkpoint, 'pytorch_model.bin')\n",
" if not os.path.exists(checkpoint_name):\n",
" checkpoint_name = os.path.join(\n",
" resume_from_checkpoint, 'adapter_model.bin'\n",
" )\n",
" resume_from_checkpoint = False\n",
" if os.path.exists(checkpoint_name):\n",
" logger.info(f'Restarting from {checkpoint_name}')\n",
" adapters_weights = torch.load(checkpoint_name)\n",
" set_peft_model_state_dict(model, adapters_weights)\n",
" else:\n",
" logger.info(f'Checkpoint {checkpoint_name} not found')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"trainable params: 1,949,696 || all params: 6,245,533,696 || trainable%: 0.031217444255383614\n"
]
}
],
"source": [
"model.print_trainable_parameters()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# load data\n",
"dataset = datasets.load_from_disk(\"../data/dataset_new\")\n",
"dataset = dataset.train_test_split(0.2, shuffle=True, seed = 42)"
]
},
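{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick look at the splits (added as a sketch): the collator defined in the next cell expects each example to carry `input_ids` and `seq_len`, so it is worth checking one record before training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Inspect the splits and one training example; the collator defined below\n",
"# expects every example to provide \"input_ids\" and \"seq_len\".\n",
"print(dataset)\n",
"example = dataset[\"train\"][0]\n",
"print({k: (v[:10] if isinstance(v, list) else v) for k, v in example.items()})"
]
},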
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"class ModifiedTrainer(Trainer):\n",
" def compute_loss(self, model, inputs, return_outputs=False):\n",
" return model(\n",
" input_ids=inputs[\"input_ids\"],\n",
" labels=inputs[\"labels\"],\n",
" ).loss\n",
"\n",
" def prediction_step(self, model: torch.nn.Module, inputs, prediction_loss_only: bool, ignore_keys = None):\n",
" with torch.no_grad():\n",
" res = model(\n",
" input_ids=inputs[\"input_ids\"].to(model.device),\n",
" labels=inputs[\"labels\"].to(model.device),\n",
" ).loss\n",
" return (res, None, None)\n",
"\n",
" def save_model(self, output_dir=None, _internal_call=False):\n",
" from transformers.trainer import TRAINING_ARGS_NAME\n",
"\n",
" os.makedirs(output_dir, exist_ok=True)\n",
" torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))\n",
" saved_params = {\n",
" k: v.to(\"cpu\") for k, v in self.model.named_parameters() if v.requires_grad\n",
" }\n",
" torch.save(saved_params, os.path.join(output_dir, \"adapter_model.bin\"))\n",
"\n",
"def data_collator(features: list) -> dict:\n",
" len_ids = [len(feature[\"input_ids\"]) for feature in features]\n",
" longest = max(len_ids)\n",
" input_ids = []\n",
" labels_list = []\n",
" for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):\n",
" ids = feature[\"input_ids\"]\n",
" seq_len = feature[\"seq_len\"]\n",
" labels = (\n",
" [tokenizer.pad_token_id] * (seq_len - 1) + ids[(seq_len - 1) :] + [tokenizer.pad_token_id] * (longest - ids_l)\n",
" )\n",
" ids = ids + [tokenizer.pad_token_id] * (longest - ids_l)\n",
" _ids = torch.LongTensor(ids)\n",
" labels_list.append(torch.LongTensor(labels))\n",
" input_ids.append(_ids)\n",
" input_ids = torch.stack(input_ids)\n",
" labels = torch.stack(labels_list)\n",
" return {\n",
" \"input_ids\": input_ids,\n",
" \"labels\": labels,\n",
" }\n"
]
},
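{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch with toy features (not from the real dataset): it shows how `data_collator` right-pads `input_ids` to the longest sequence in the batch and replaces the prompt part of the labels (the first `seq_len - 1` tokens) with the pad token, keeping only the response tokens as targets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Toy demonstration of the collator: right-pad input_ids to the longest sequence\n",
"# and replace the prompt part (first seq_len - 1 tokens) of the labels with the pad token.\n",
"toy_features = [\n",
"    {\"input_ids\": [11, 12, 13, 14, 15], \"seq_len\": 3},\n",
"    {\"input_ids\": [21, 22, 23], \"seq_len\": 2},\n",
"]\n",
"toy_batch = data_collator(toy_features)\n",
"print(toy_batch[\"input_ids\"])\n",
"print(toy_batch[\"labels\"])"
]
},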
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from torch.utils.tensorboard import SummaryWriter\n",
"from transformers.integrations import TensorBoardCallback"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"You are adding a <class 'transformers.integrations.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The currentlist of callbacks is\n",
":DefaultFlowCallback\n",
"TensorBoardCallback\n",
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/transformers/optimization.py:411: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning\n",
" warnings.warn(\n",
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...\n",
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='3838' max='3838' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [3838/3838 7:41:31, Epoch 1/2]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Step</th>\n",
" <th>Training Loss</th>\n",
" <th>Validation Loss</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>500</td>\n",
" <td>9.616700</td>\n",
" <td>5.970341</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1000</td>\n",
" <td>5.773300</td>\n",
" <td>5.637556</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1500</td>\n",
" <td>5.606400</td>\n",
" <td>5.624945</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2000</td>\n",
" <td>5.598000</td>\n",
" <td>5.620837</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2500</td>\n",
" <td>5.594000</td>\n",
" <td>5.618898</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3000</td>\n",
" <td>5.593100</td>\n",
" <td>5.617580</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3500</td>\n",
" <td>5.591000</td>\n",
" <td>5.616888</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
"/home/oliverwang15/miniconda3/envs/fingpt/lib/python3.9/site-packages/bitsandbytes-0.40.0.post4-py3.9.egg/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.float32 to float16 during quantization\n",
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n"
]
}
],
"source": [
"# Train\n",
"writer = SummaryWriter()\n",
"trainer = ModifiedTrainer(\n",
" model=model, \n",
" args=training_args, # Trainer args\n",
" train_dataset=dataset[\"train\"], # Training set\n",
" eval_dataset=dataset[\"test\"], # Testing set\n",
" data_collator=data_collator, # Data Collator\n",
" callbacks=[TensorBoardCallback(writer)],\n",
")\n",
"trainer.train()\n",
"writer.close()\n",
"# save model\n",
"model.save_pretrained(training_args.output_dir)"
]
},
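{
"cell_type": "markdown",
"metadata": {},
"source": [
"A sketch of how the saved adapter could be used afterwards (not executed here, and best run in a fresh session, since a second copy of the base model would not fit next to the training model): reload ChatGLM2 in 8-bit, attach the LoRA weights saved to `training_args.output_dir`, and query it through ChatGLM2's `chat` interface. The prompt text is only an illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: reload the base model and the saved LoRA adapter for inference.\n",
"from peft import PeftModel\n",
"\n",
"base_model = AutoModel.from_pretrained(\n",
"    model_name,\n",
"    load_in_8bit=True,\n",
"    trust_remote_code=True,\n",
"    device='cuda'\n",
")\n",
"ft_model = PeftModel.from_pretrained(base_model, training_args.output_dir)\n",
"ft_model.eval()\n",
"\n",
"prompt = (\n",
"    \"What is the sentiment of this news? \"\n",
"    \"Please choose an answer from {negative/neutral/positive}: \"\n",
"    \"Apple shares rise after strong quarterly earnings.\"\n",
")\n",
"with torch.no_grad():\n",
"    response, history = ft_model.chat(tokenizer, prompt, history=[])\n",
"print(response)"
]
},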
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## GPU Trining MEM: 46.7%\n",
"## GPU Evaling MEM: 83.5%"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "fingpt",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.17"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}