@@ -48,7 +48,11 @@ print(f"Max sequence length: {tokenizer.model_max_length}")
 # ----------------------------
 # [2/7] Quantization config (QLoRA)
 # ----------------------------
-print(f"{80 * '_'}\n[2/7] Configuring 4-bit quantization (BitsAndBytes)...")
+print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (optimized QLoRA)...")
+assert torch.cuda.is_available(), "CUDA GPU not detected!"
+print(f"Using GPU: {torch.cuda.get_device_name(0)}")
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -56,17 +60,16 @@ bnb_config = BitsAndBytesConfig(
     bnb_4bit_use_double_quant=True,
 )
 print("4-bit NF4 quantization configured.")
-print("Loading model...")
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
-    device_map="auto",
+    device_map="cuda",  # 🔥 SAFE: keeps every layer on the GPU; "auto" can offload to CPU, which breaks 4-bit training
     quantization_config=bnb_config,
     dtype=torch.float16,
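+    # fp16 applies to the modules that stay unquantized (e.g. norms and lm_head);
+    # the compute dtype of the 4-bit matmuls is set separately via bnb_4bit_compute_dtype.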
     low_cpu_mem_usage=True,
     trust_remote_code=True,
 )
-print("Model loaded successfully.")
+print("Model loaded successfully in 4-bit mode on GPU.")
 # ----------------------------
 # [3/7] Prepare model for k-bit training
@@ -119,8 +122,7 @@ print("Formatting dataset for Ukrainian → French translation...")
 def format_prompt(example):
     return {
-        "text": (
-            "<|user|>\n"
+        "text": ("<|user|>\n"
             "Translate the following Ukrainian text into French.\n"
             f"Ukrainian: {example['text']}\n"
             "<|assistant|>\n"
@@ -154,13 +156,13 @@ training_args = TrainingArguments(
     save_steps=500,
     save_total_limit=2,
     report_to="none",
+    dataloader_pin_memory=False,
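+    # skip pinned host memory for dataloader batches; saves host RAM at a small host-to-GPU copy-speed cost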
 )
 print("Training arguments ready.")
 print(f"Output directory: {OUTPUT_DIR}")
 print(f"Epochs: {training_args.num_train_epochs}")
-print(
-    f"Effective batch size: "
+print(f"Effective batch size: "
     f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}"
 )
@@ -171,7 +173,7 @@ print("Initializing SFTTrainer...")
 trainer = SFTTrainer(
     model=model,
     train_dataset=dataset["train"],
-    tokenizer=tokenizer,
+    processing_class=tokenizer,
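+    # recent TRL releases expect the tokenizer via `processing_class`; the old `tokenizer` keyword is deprecated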
     args=training_args,
 )
 print("Trainer initialized.")
@@ -181,12 +183,16 @@ print("Trainer initialized.")
 # ----------------------------
 print(f"{80 * '_'}\n[7/7] Starting training...")
 try:
-    trainer.train(resume_from_checkpoint=True)
+    train_output = trainer.train(resume_from_checkpoint=True)
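+    # resume_from_checkpoint=True raises a ValueError when OUTPUT_DIR holds no checkpoint;
+    # the except branch below falls back to a fresh run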
 except Exception as e:
     print("No checkpoint found or resume failed, starting fresh training.")
     print(f"Reason: {e}")
-    trainer.train()
+    train_output = trainer.train()
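+# trainer.train() returns a transformers TrainOutput named tuple: (global_step, training_loss, metrics)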
| print("\n=== Training summary ===") | |||
| print(f"Global steps: {train_output.global_step}") | |||
| print(f"Training loss: {train_output.training_loss}") | |||
| print(f"Metrics: {train_output.metrics}") | |||
| print("Training completed successfully.") | |||
| # ---------------------------- | |||