import os

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from trl import SFTTrainer

# ----------------------------
# Environment safety (Windows)
# ----------------------------
os.environ["TORCHDYNAMO_DISABLE"] = "1"

# ----------------------------
# Global configuration
# ----------------------------
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
DATA_FILE = "paires_clean.json"
MAX_SEQ_LENGTH = 1024

print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")

# ----------------------------
# [1/7] Tokenizer
# ----------------------------
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_SEQ_LENGTH

print("Tokenizer loaded.")
print(f"Pad token id: {tokenizer.pad_token_id}")
print(f"Max sequence length: {tokenizer.model_max_length}")

# ----------------------------
# [2/7] Quantization config (QLoRA)
# ----------------------------
print(f"{80 * '_'}\n[2/7] Configuring 4-bit quantization (BitsAndBytes)...")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

print("4-bit NF4 quantization configured.")
print("Loading model...")

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    dtype=torch.float16,
    trust_remote_code=True,
)

print("Model loaded successfully.")

# ----------------------------
# [3/7] Prepare model for k-bit training
# ----------------------------
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")

model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)

print("Model prepared for k-bit training.")
print("Gradient checkpointing enabled (non-reentrant).")

# ----------------------------
# [4/7] LoRA configuration
# ----------------------------
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")

lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.02,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("LoRA adapters successfully attached.")

# ----------------------------
# [5/7] Dataset loading & formatting
# ----------------------------
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")

dataset = load_dataset("json", data_files=DATA_FILE)

print(f"Dataset loaded with {len(dataset['train'])} samples.")
print("Formatting dataset for Ukrainian → French translation...")


def format_prompt(example):
    return {
        "text": (
            "<|user|>\n"
            "Translate the following Ukrainian text into French.\n"
            f"Ukrainian: {example['text']}\n"
            "<|assistant|>\n"
            f"{example['translation']}"
        )
    }


dataset = dataset.map(
    format_prompt,
    remove_columns=dataset["train"].column_names,
)

print("Dataset formatting completed.")
print("Example prompt:\n")
print(dataset["train"][0]["text"])

# ----------------------------
# [6/7] Training arguments
# ----------------------------
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=1e-4,
    num_train_epochs=3,
    fp16=False,
    bf16=False,
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
)

print("Training arguments ready.")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Epochs: {training_args.num_train_epochs}")
print(
    f"Effective batch size: "
    f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}"
)

# ----------------------------
# Trainer
# ----------------------------
print("Initializing SFTTrainer...")

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    args=training_args,
)

print("Trainer initialized.")

# ----------------------------
# [7/7] Training
# ----------------------------
print(f"{80 * '_'}\n[7/7] Starting training...")

try:
    trainer.train(resume_from_checkpoint=True)
except Exception as e:
    print("No checkpoint found or resume failed, starting fresh training.")
    print(f"Reason: {e}")
    trainer.train()

print("Training completed successfully.")

# ----------------------------
# Save LoRA adapter
# ----------------------------
print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")

trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print("\n=== Fine-tuning finished ===")
print(f"LoRA adapter saved in: {OUTPUT_DIR}")
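
# ----------------------------
# Optional sanity check (added sketch, not part of the original training flow):
# reuse the in-memory PEFT model and the same prompt layout as format_prompt()
# to translate one hypothetical sample sentence. Remove or adapt as needed.
# ----------------------------
SAMPLE_UK = "Добрий день, як справи?"  # hypothetical example input

eval_prompt = (
    "<|user|>\n"
    "Translate the following Ukrainian text into French.\n"
    f"Ukrainian: {SAMPLE_UK}\n"
    "<|assistant|>\n"
)

model.eval()
inputs = tokenizer(eval_prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    generated = model.generate(**inputs, max_new_tokens=128)

# Decode only the newly generated tokens (the French translation).
print(
    tokenizer.decode(
        generated[0][inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )
)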