Mirror of https://github.com/ParisNeo/lollms-webui.git (synced 2023-09-17 23:29:16 +03:00)

Commit: added training code
app.py (26 changes)
@@ -877,6 +877,32 @@ class LoLLMsWebUI(LoLLMsAPPI):
         return jsonify(models)
 
+    def train(self):
+        form_data = request.form
+
+        # Create and populate the config file
+        config = {
+            'model_name': form_data['model_name'],
+            'tokenizer_name': form_data['tokenizer_name'],
+            'dataset_path': form_data['dataset_path'],
+            'max_length': form_data['max_length'],
+            'batch_size': form_data['batch_size'],
+            'lr': form_data['lr'],
+            'num_epochs': form_data['num_epochs'],
+            'output_dir': form_data['output_dir'],
+        }
+
+        with open('train/configs/train/local_cfg.yaml', 'w') as f:
+            yaml.dump(config, f)
+
+        # Trigger the train.py script
+        # Place your code here to run the train.py script with the created config file
+        # accelerate launch --dynamo_backend=inductor --num_processes=8 --num_machines=1 --machine_rank=0 --deepspeed_multinode_launcher standard --mixed_precision=bf16 --use_deepspeed --deepspeed_config_file=configs/deepspeed/ds_config_gptj.json train.py --config configs/train/finetune_gptj.yaml
+
+        subprocess.check_call(["accelerate", "launch", "--dynamo_backend=inductor", "--num_processes=8", "--num_machines=1", "--machine_rank=0", "--deepspeed_multinode_launcher", "standard", "--mixed_precision=bf16", "--use_deepspeed", "--deepspeed_config_file=train/configs/deepspeed/ds_config_gptj.json", "train/train.py", "--config", "train/configs/train/local_cfg.yaml"])
+
+        return jsonify({'message': 'Training started'})
+
     def get_config(self):
         return jsonify(self.config.to_dict())
 
train/.gitignore (new file, 2 lines)
@@ -0,0 +1,2 @@
+output
+!output/.keep
train/configs/deepspeed/ds_config.yaml (new file, 48 lines)
@@ -0,0 +1,48 @@
+{
+  "train_batch_size": "auto",
+  "gradient_accumulation_steps": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "fp16": {
+    "enabled": "auto",
+    "min_loss_scale": 1,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "initial_scale_power": 32
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "gradient_clipping": 1,
+  "zero_optimization": {
+    "stage": 2,
+    "offload_param": {
+      "device": "none"
+    },
+    "offload_optimizer": {
+      "device": "none"
+    },
+    "allgather_partitions": true,
+    "allgather_bucket_size": 5e8,
+    "contiguous_gradients": true
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "betas": [
+        0.9,
+        0.999
+      ],
+      "eps": 1e-08
+    }
+  },
+  "scheduler": {
+    "type": "WarmupLR",
+    "params": {
+      "warmup_min_lr": 0,
+      "warmup_max_lr": "auto",
+      "warmup_num_steps": "auto",
+      "warmup_type": "linear"
+    }
+  }
+}
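A note on this config: despite the .yaml extension, the body above is plain JSON, so it can be read with a JSON loader as well as a YAML one. A minimal sketch follows (the path assumes the repository root as the working directory).

# Sketch: load the DeepSpeed config added above. The file content is valid JSON,
# so the standard json module is enough here.
import json

with open("train/configs/deepspeed/ds_config.yaml") as f:
    ds_cfg = json.load(f)

print(ds_cfg["zero_optimization"]["stage"])  # -> 2
print(ds_cfg["optimizer"]["type"])           # -> AdamW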
							
								
								
									
train/configs/train/.gitignore (new file, 1 line)
@@ -0,0 +1 @@
+local_cfg.yaml
train/configs/train/finetune.yaml (new file, 29 lines)
@@ -0,0 +1,29 @@
+# model/tokenizer
+model_name: # add model here
+tokenizer_name: # add model here
+gradient_checkpointing: true
+save_name: # CHANGE
+
+# dataset
+streaming: false
+num_proc: 64
+dataset_path: # update
+max_length: 1024
+batch_size: 32
+
+# train dynamics
+lr: 5.0e-5
+eval_every: 800
+eval_steps: 100
+save_every: 800
+output_dir: # CHANGE
+checkpoint: null
+lora: false
+warmup_steps: 100
+num_epochs: 2
+
+# logging
+wandb: true
+wandb_entity: # update
+wandb_project_name: # update
+seed: 42
train/configs/train/finetune_lora.yaml (new file, 31 lines)
@@ -0,0 +1,31 @@
+# model/tokenizer
+model_name: # update
+tokenizer_name: # update
+gradient_checkpointing: false
+save_name: # CHANGE
+
+# dataset
+streaming: false
+num_proc: 64
+dataset_path: # CHANGE
+max_length: 1024
+batch_size: 4
+
+# train dynamics
+lr: 5.0e-5
+min_lr: 0
+weight_decay: 0.0
+eval_every: 2000
+eval_steps: 100
+save_every: 2000
+output_dir: # CHANGE
+checkpoint: null
+lora: true
+warmup_steps: 100
+num_epochs: 2
+
+# logging
+wandb: true
+wandb_entity: # update
+wandb_project_name: # update
+seed: 42
train/configs/train/finetune_lora_ airoboros-7b-gpt4.yaml (new file, 31 lines)
@@ -0,0 +1,31 @@
+# model/tokenizer
+model_name: jondurbin/airoboros-7b-gpt4 # update
+tokenizer_name: jondurbin/airoboros-7b-gpt4 # update
+gradient_checkpointing: false
+save_name: parisneo-7b_gpt42_lora # CHANGE
+
+# dataset
+streaming: false
+num_proc: 64
+dataset_path: # CHANGE
+max_length: 1024
+batch_size: 4
+
+# train dynamics
+lr: 5.0e-5
+min_lr: 0
+weight_decay: 0.0
+eval_every: 2000
+eval_steps: 100
+save_every: 2000
+output_dir: output # CHANGE
+checkpoint: null
+lora: true
+warmup_steps: 100
+num_epochs: 2
+
+# logging
+wandb: false # update if you want to use weights and biases
+wandb_entity: # update
+wandb_project_name: # update
+seed: 42
train/requirements.txt (new file, 15 lines)
@@ -0,0 +1,15 @@
+accelerate
+datasets
+torchmetrics
+evaluate
+transformers>=4.28.0
+wandb
+pip
+peft
+nodelist-inflator
+deepspeed
+sentencepiece
+jsonlines
+nomic
+scikit-learn
+matplotlib
train/train.py (new file, 233 lines)
@@ -0,0 +1,233 @@
+import os
+from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler, LlamaForCausalLM
+import torch
+from torch.optim import AdamW
+from argparse import ArgumentParser
+from read import read_config
+from accelerate import Accelerator
+from accelerate.utils import DummyScheduler, DummyOptim, set_seed
+from peft import get_peft_model, LoraConfig, TaskType
+from data import load_data
+from torchmetrics import MeanMetric
+from tqdm import tqdm
+import wandb
+
+torch.backends.cuda.matmul.allow_tf32 = True
+
+def format_metrics(metrics, split, prefix=""):
+    log = f"[{split}]" + prefix
+    log += " ".join([f"{key}: {value:.4f}" for key, value in metrics.items()])
+
+    return log
+
+
+def evaluate(model, val_dataloader):
+    model.eval()
+    val_loss = MeanMetric(nan_strategy="error").to(model.device)
+
+    with torch.no_grad():
+        for batch in tqdm(val_dataloader):
+            loss = model(**batch).loss
+
+            loss_values = accelerator.gather_for_metrics({"loss": loss.detach()})
+
+            val_loss.update(loss_values["loss"])
+
+    return val_loss
+
+
+def train(accelerator, config):
+    set_seed(config['seed'])
+
+    accelerator.print(config)
+    accelerator.print(f"Using {accelerator.num_processes} GPUs")
+
+    tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length'])
+    # if no pad token, set it to eos
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+
+    with accelerator.main_process_first():
+        train_dataloader, val_dataloader = load_data(config, tokenizer)
+
+
+    checkpoint = config["gradient_checkpointing"]
+    model = AutoModelForCausalLM.from_pretrained(config["model_name"],
+                                                    use_cache=False if checkpoint else True,
+                                                    trust_remote_code=True)
+    if checkpoint:
+        model.gradient_checkpointing_enable()
+
+    if config["lora"]:
+        peft_config = LoraConfig(
+            # should R be configurable?
+            task_type=TaskType.CAUSAL_LM, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1
+        )
+        model = get_peft_model(model, peft_config)
+        model.print_trainable_parameters()
+
+    optimizer_cls = (
+        AdamW
+        if accelerator.state.deepspeed_plugin is None
+        or "optimizer" not in accelerator.state.deepspeed_plugin.deepspeed_config
+        else DummyOptim
+    )
+
+    # karpathy doesn't decay embedding, maybe we should exclude
+    # https://github.com/karpathy/minGPT/commit/bbbdac74fa9b2e55574d70056163ffbae42310c1#diff-2075fa9c224b395be5bda85544dd36572b59c76c54562819eadadbf268602834R157s
+    optimizer = optimizer_cls(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
+
+    if accelerator.state.deepspeed_plugin is not None:
+        gradient_accumulation_steps = accelerator.state.deepspeed_plugin.deepspeed_config[
+            "gradient_accumulation_steps"
+        ]
+
+    # decay to min_lr instead of 0
+    lr_ratio = config["min_lr"] / config["lr"]
+    accelerator.print(f"Len of train_dataloader: {len(train_dataloader)}")
+    total_num_steps = (len(train_dataloader) / gradient_accumulation_steps) * config["num_epochs"]
+    # instead of decaying to zero, decay to ratio of min_lr / lr
+    total_num_steps += int(total_num_steps * lr_ratio) + config["warmup_steps"]
+    accelerator.print(f"Total training steps: {total_num_steps}")
+
+    # Creates Dummy Scheduler if `scheduler` was specified in the config file else creates `args.lr_scheduler_type` Scheduler
+    if (
+        accelerator.state.deepspeed_plugin is None
+        or "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
+    ):
+        scheduler = get_scheduler(
+            name="cosine",
+            optimizer=optimizer,
+            num_warmup_steps=config["warmup_steps"] * accelerator.num_processes,
+            num_training_steps=total_num_steps,
+        )
+    else:
+        scheduler = DummyScheduler(
+            optimizer, total_num_steps=config["warmup_steps"], warmup_num_steps=config["warmup_steps"]
+        )
+
+    model, optimizer, train_dataloader, val_dataloader, scheduler = accelerator.prepare(
+            model, optimizer, train_dataloader, val_dataloader, scheduler
+    )
+
+    # setup for saving training states in case preemption
+    accelerator.register_for_checkpointing(scheduler)
+
+    if config["checkpoint"]:
+        accelerator.load_state(config["checkpoint"])
+        accelerator.print(f"Resumed from checkpoint: {config['checkpoint']}")
+        path = os.path.basename(config["train_args"]["resume_from_checkpoint"])
+        training_difference = os.path.splitext(path)[0]
+        resume_step = int(training_difference.replace("step_", ""))
+        accelerator.skip_first_batches(train_dataloader, resume_step)
+        accelerator.print(f"Resuming from step {resume_step}")
+
+
+    # log gradients
+    if accelerator.is_main_process and config["wandb"]:
+        wandb.watch(model, log_freq=config["log_grads_every"], log="all")
+
+    for epoch in range(config["num_epochs"]):
+        train_loss = MeanMetric(nan_strategy="error").to(model.device)
+        for step, batch in enumerate(tqdm(train_dataloader)):
+            model.train()
+            outputs = model(**batch)
+            loss = outputs.loss
+
+            # gather loss before backprop in case of gradient accumulation
+            loss_values = accelerator.gather_for_metrics({"loss": loss.detach().float()})
+            train_loss.update(loss_values["loss"])
+
+            loss = loss / gradient_accumulation_steps
+            accelerator.backward(loss)
+            # get gradient norm of all params
+
+            # log LR in case something weird happens
+            if step > 0 and step % (config["eval_every"] // 10) == 0:
+                if config["wandb"]:
+                    curr_step = step + epoch * len(train_dataloader)
+                    accelerator.log({"lr": scheduler.get_last_lr()[0]}, step=curr_step)
+
+            if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
+                optimizer.step()
+                scheduler.step()
+                optimizer.zero_grad()
+
+
+            if step > 0 and step % config["save_every"] == 0:
+                curr_step = step + epoch * len(train_dataloader)
+                accelerator.save_state(f"{config['output_dir']}/step_{curr_step}")
+
+            if step > 0 and (step % config["eval_every"] == 0 or step == len(train_dataloader) - 1):
+                val_loss = evaluate(model, val_dataloader)
+
+                log_train = {
+                        "train_loss": train_loss.compute()
+                    }
+                log_val = {
+                    "val_loss": val_loss.compute()
+                }
+
+                if config["wandb"]:
+                    curr_step = step + epoch * len(train_dataloader)
+                    accelerator.log({**log_train, **log_val}, step=curr_step)
+
+                accelerator.print(f"Current LR: {scheduler.get_last_lr()[0]}")
+                accelerator.print(format_metrics(log_train, "train", f" step {step} "))
+                accelerator.print(format_metrics(log_val, "val", f" step {step} "))
+
+                train_loss.reset()
+
+        accelerator.print(f"Epoch {epoch} finished")
+        accelerator.print(f"Pushing to HF hub")
+        accelerator.wait_for_everyone()
+        unwrapped_model = accelerator.unwrap_model(model)
+        try:
+            if accelerator.is_main_process:
+                unwrapped_model.push_to_hub(config["save_name"] + f"-epoch_{epoch}", private=True)
+
+        except Exception as e:
+            accelerator.print(e)
+            accelerator.print(f"Failed to push to hub")
+
+        unwrapped_model.save_pretrained(
+            f"{config['output_dir']}/epoch_{epoch}",
+            is_main_process=accelerator.is_main_process,
+            save_function=accelerator.save,
+            state_dict=accelerator.get_state_dict(model),
+        )
+
+    accelerator.wait_for_everyone()
+    unwrapped_model = accelerator.unwrap_model(model)
+    unwrapped_model.save_pretrained(
+        f"{config['output_dir']}/final",
+        is_main_process=accelerator.is_main_process,
+        save_function=accelerator.save,
+        state_dict=accelerator.get_state_dict(model),
+    )
+
+    accelerator.end_training()
+
+
+
+if __name__ == "__main__":
+    # parse arguments by reading in a config
+    parser = ArgumentParser()
+    parser.add_argument("--config", type=str, default="config.yaml")
+
+    args = parser.parse_args()
+
+    config = read_config(args.config)
+
+    if config["wandb"]:
+        accelerator = Accelerator(log_with="wandb")
+        accelerator.init_trackers(
+            project_name=config["wandb_project_name"],
+            config=config,
+            init_kwargs={"wandb": {"entity": config["wandb_entity"]}},
+        )
+    else:
+        accelerator = Accelerator()
+
+    train(accelerator, config=config)
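Note that train.py imports read_config from a read module and load_data from a data module, neither of which is included in this commit. For orientation only, here is a minimal sketch of what read_config is assumed to do, namely load the YAML training config into a plain dict; the actual implementation may differ.

# Assumed behaviour of read.read_config (the real read.py is not part of this commit):
# parse a YAML training config such as train/configs/train/finetune.yaml into a dict.
import yaml

def read_config(path):
    with open(path, "r") as f:
        return yaml.safe_load(f)

config = read_config("train/configs/train/finetune.yaml")
print(config["num_epochs"])  # -> 2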
							
								
								
									
web/dist/assets/index-488cca87.css (1 change)
File diff suppressed because one or more lines are too long
								
								
									
web/dist/assets/index-54621153.css (new file, 1 line)
File diff suppressed because one or more lines are too long
							
								
								
									
web/dist/index.html (4 changes)
@@ -6,8 +6,8 @@
     
     <meta name="viewport" content="width=device-width, initial-scale=1.0">
     <title>GPT4All - WEBUI</title>
-    <script type="module" crossorigin src="/assets/index-0344eb9b.js"></script>
-    <link rel="stylesheet" href="/assets/index-488cca87.css">
+    <script type="module" crossorigin src="/assets/index-f5f472ed.js"></script>
+    <link rel="stylesheet" href="/assets/index-54621153.css">
   </head>
   <body>
     <div id="app"></div>

@@ -1,16 +1,159 @@
 <template>
-    <div>
-        Training
+    <div class="container overflow-y-scroll flex flex-col no-scrollbar shadow-lg p-10 pt-0">
+      <form @submit.prevent="submitForm" class="max-w-md mx-auto">
+        <!-- Model/Tokenizer -->
+        <div class="mb-4">
+          <label for="model_name" class="text-sm">Model Name:</label>
+          <input
+            type="text"
+            id="model_name"
+            v-model="model_name"
+            required
+            class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
+          >
+        </div>
+        <div class="mb-4">
+          <label for="tokenizer_name" class="text-sm">Tokenizer Name:</label>
+          <input
+            type="text"
+            id="tokenizer_name"
+            v-model="tokenizer_name"
+            required
+            class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
+          >
         </div>
-</template>
   
-<script>
-export default {
-    setup () {
+        <!-- Dataset -->
+        <div class="mb-4">
+          <label for="dataset_path" class="text-sm">Dataset:</label>
+          <input
+            type="file"
+            id="dataset_path"
+            ref="dataset_path"
+            accept=".parquet"
+            v-on:change="selectDatasetPath"
+            class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
+          >
+          <p class="mt-2 text-xs">Selected File: {{ selectedDatasetPath }}</p>
+        </div>
+        <div class="mb-4">
+          <label for="max_length" class="text-sm">Max Length:</label>
+          <input
+            type="number"
+            id="max_length"
+            v-model.number="max_length"
+            required
+            class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
+          >
+        </div>
+        <div class="mb-4">
+          <label for="batch_size" class="text-sm">Batch Size:</label>
+          <input
+            type="number"
+            id="batch_size"
+            v-model.number="batch_size"
+            required
+            class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
+          >
+        </div>
   
+        <!-- Train Dynamics -->
+        <div class="mb-4">
+          <label for="lr" class="text-sm">Learning Rate:</label>
+          <input
+            type="number"
+            id="lr"
+            v-model.number="lr"
+            required
+            class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
+          >
+        </div>
+        <div class="mb-4">
+          <label for="num_epochs" class="text-sm">Number of Epochs:</label>
+          <input
+            type="number"
+            id="num_epochs"
+            v-model.number="num_epochs"
+            required
+            class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
+          >
+        </div>
   
-        return {}
+        <!-- Logging -->
+        <div class="mb-4">
+        <label for="output_dir" class="text-sm">Output Directory:</label>
+        <input
+          type="text"
+          id="output_dir"
+          v-model="selectedFolder"
+          class="w-full mt-1 px-2 py-1 border border-gray-300 rounded"
+          placeholder="Enter or select the output folder"
+        >
+        <input
+          type="file"
+          id="folder_selector"
+          ref="folder_selector"
+          style="display: none"
+          webkitdirectory
+          v-on:change="selectOutputDirectory"
+        >
+        <button type="button" @click="openFolderSelector" class="bg-blue-500 text-white px-4 py-2 rounded">Select Folder</button>
+      </div>
+  
+        <button type="submit" class="bg-blue-500 text-white px-4 py-2 rounded">Train LLM</button>
+      </form>
+    </div>
+  </template>
+  
+  <script>
+  export default {
+    data() {
+        return {
+            model_name: 'jondurbin/airoboros-7b-gpt4',
+            tokenizer_name: 'jondurbin/airoboros-7b-gpt4',
+            dataset_path: '',
+            max_length: 1024,
+            batch_size: 4,
+            lr: 5.0e-5,
+            num_epochs: 2,
+            selectedFolder: '',
+            selectedDatasetPath: '',
+        };
+    },
+    methods: {
+      submitForm() {
+        const formData = {
+          model_name: this.model_name,
+          tokenizer_name: this.tokenizer_name,
+          dataset_path: this.selectedDatasetPath,
+          max_length: this.max_length,
+          batch_size: this.batch_size,
+          lr: this.lr,
+          num_epochs: this.num_epochs,
+          output_dir: this.selectedFolder,
+        };
+  
+        // Send the form data to the backend
+        // ...
+      },
+      openFolderSelector() {
+      this.$refs.folder_selector.click();
+      },
+      selectOutputDirectory(event) {
+        console.log("here")
+        const folderPath = event.target.files[0]?.path;
+        console.log(folderPath)
+        if (folderPath) {
+            this.selectedFolder = folderPath;
         }
-}
-</script>
+      },
+      selectDatasetPath(event) {
+        const files = event.target.files;
+        if (files.length > 0) {
+            this.selectedDatasetPath = files[0].webkitRelativePath;
+        }
+      },
+    },
+  };
+  </script>
   