import os

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from trl import SFTTrainer

# ----------------------------
# Environment safety (Windows + AMP fixes)
# ----------------------------
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"  # ✅ disable AMP completely
os.environ["TORCH_AMP_DISABLE"] = "1"            # ✅ disable GradScaler
os.environ["CUDA_VISIBLE_DEVICES"] = "0"         # optional: force first GPU

# ----------------------------
# Global configuration
# ----------------------------
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
DATA_FILE = "paires_clean.json"
MAX_SEQ_LENGTH = 512  # kept small to fit RTX 4080 SUPER VRAM

print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")

# ----------------------------
# [1/7] Tokenizer
# ----------------------------
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_SEQ_LENGTH

print("Tokenizer loaded.")
print(f"Pad token id: {tokenizer.pad_token_id}")
print(f"Max sequence length: {tokenizer.model_max_length}")

# ----------------------------
# [2/7] Load model in 4-bit (QLoRA)
# ----------------------------
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")

assert torch.cuda.is_available(), "CUDA GPU not detected!"
print(f"Using GPU: {torch.cuda.get_device_name(0)}")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # fp16 internally
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# Align model special tokens with the tokenizer
model.config.pad_token_id = tokenizer.pad_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id

print("Model loaded successfully in 4-bit mode on GPU.")

# ----------------------------
# [3/7] Prepare model for k-bit training
# ----------------------------
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")

model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.config.use_cache = False  # important with gradient checkpointing + QLoRA

print("Model prepared for k-bit training.")

# ----------------------------
# [4/7] LoRA configuration
# ----------------------------
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.02,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("LoRA adapters successfully attached.")

# ----------------------------
# [5/7] Dataset loading & formatting
# ----------------------------
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")

dataset = load_dataset("json", data_files=DATA_FILE)
print(f"Dataset loaded with {len(dataset['train'])} samples.")

print("Formatting dataset for Ukrainian → French translation...")

def format_prompt(example):
    return {
        "text": (
            "<|im_start|>user\n"
            "Translate the following Ukrainian text into French.\n"
            f"Ukrainian: {example['text']}\n"
            "<|im_end|>\n"
            "<|im_start|>assistant\n"
            f"{example['translation']}"
            "<|im_end|>"
        )
    }

dataset = dataset.map(
    format_prompt,
    remove_columns=dataset["train"].column_names,
)

print("Dataset formatting completed.")
print("Example prompt:\n")
print(dataset["train"][0]["text"])

# ----------------------------
# [6/7] Training arguments
# ----------------------------
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    num_train_epochs=3,
    max_steps=1000,  # caps training; overrides num_train_epochs when > 0
    fp16=False,  # ⚠ disable AMP
    bf16=False,  # ⚠ disable BF16
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    dataloader_pin_memory=False,
    max_grad_norm=0.0,  # 0.0 disables gradient clipping (avoids AMP-related clipping issues)
)

print("Training arguments ready.")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Epochs: {training_args.num_train_epochs} (capped at {training_args.max_steps} steps)")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

# ----------------------------
# [7/7] Trainer
# ----------------------------
print(f"{80 * '_'}\n[7/7] Initializing SFTTrainer...")

# The formatted "text" column matches TRL's default dataset_text_field.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    args=training_args,
)

print("Trainer initialized.")

# ----------------------------
# Training
# ----------------------------
print(f"{80 * '_'}\nStarting training...")

checkpoint_exists = False
if os.path.exists(OUTPUT_DIR):
    checkpoint_exists = any(
        d.startswith("checkpoint-") for d in os.listdir(OUTPUT_DIR)
    )

if checkpoint_exists:
    print("Checkpoint found → resuming training")
    train_output = trainer.train(resume_from_checkpoint=True)
else:
    print("No checkpoint found → starting fresh training")
    train_output = trainer.train()

print("\n=== Training summary ===")
print(f"Global steps: {train_output.global_step}")
print(f"Training loss: {train_output.training_loss}")
print(f"Metrics: {train_output.metrics}")
print("Training completed successfully.")

# ----------------------------
# Save LoRA adapter and tokenizer
# ----------------------------
print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")

trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("\n=== Fine-tuning finished ===")
print(f"LoRA adapter saved in: {OUTPUT_DIR}")
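
# ----------------------------
# Optional: quick inference check (sketch)
# ----------------------------
# A minimal sketch for trying the saved adapter on one sentence. It is NOT part
# of the original training run: the helper name, sample sentence, and generation
# settings are illustrative assumptions. It reloads the 4-bit base model,
# attaches the adapter from OUTPUT_DIR via peft.PeftModel, and mirrors the
# ChatML prompt built in format_prompt(). Run it in a fresh process so two
# copies of the 7B model are not held in VRAM at once.
def quick_translation_check(ukrainian_text: str) -> str:
    """Illustrative helper (not called automatically): translate one sentence
    with the base model plus the saved LoRA adapter."""
    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        quantization_config=bnb_config,
        trust_remote_code=True,
    )
    ft_model = PeftModel.from_pretrained(base, OUTPUT_DIR)
    ft_model.eval()

    # Same ChatML layout as the training prompts, without the assistant answer.
    prompt = (
        "<|im_start|>user\n"
        "Translate the following Ukrainian text into French.\n"
        f"Ukrainian: {ukrainian_text}\n"
        "<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(ft_model.device)
    with torch.no_grad():
        output = ft_model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens (the French translation).
    return tokenizer.decode(
        output[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )

# Example call (commented out to keep the script's behaviour unchanged):
# print(quick_translation_check("Доброго ранку, як справи?"))  # "Good morning, how are you?"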