@@ -15,9 +15,12 @@ from peft import (
 from trl import SFTTrainer

 # ----------------------------
-# Environment safety (Windows)
+# Environment safety (Windows + AMP fixes)
 # ----------------------------
 os.environ["TORCHDYNAMO_DISABLE"] = "1"
+os.environ["ACCELERATE_MIXED_PRECISION"] = "no"  # ✅ disable AMP completely
+os.environ["TORCH_AMP_DISABLE"] = "1"            # ✅ disable GradScaler
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"         # optional: force first GPU

 # ----------------------------
 # Global configuration
@@ -25,7 +28,7 @@ os.environ["TORCHDYNAMO_DISABLE"] = "1"
 MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
 OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
 DATA_FILE = "paires_clean.json"
-MAX_SEQ_LENGTH = 1024
+MAX_SEQ_LENGTH = 512  # reduced to fit RTX 4080 SUPER VRAM

 print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
@@ -35,54 +38,50 @@ print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
 print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_NAME,
-    trust_remote_code=True
+    trust_remote_code=True,
 )
 tokenizer.pad_token = tokenizer.eos_token
 tokenizer.model_max_length = MAX_SEQ_LENGTH
 print("Tokenizer loaded.")
 print(f"Pad token id: {tokenizer.pad_token_id}")
 print(f"Max sequence length: {tokenizer.model_max_length}")

 # ----------------------------
-# [2/7] Quantization config (QLoRA)
+# [2/7] Load model in 4-bit (QLoRA)
 # ----------------------------
-print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (optimized QLoRA)...")
+print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
 assert torch.cuda.is_available(), "CUDA GPU not detected!"
 print(f"Using GPU: {torch.cuda.get_device_name(0)}")

 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_compute_dtype=torch.float16,  # fp16 internally
     bnb_4bit_use_double_quant=True,
 )

 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
-    device_map="cuda",  # 🔥 SAFE
+    device_map="auto",
     quantization_config=bnb_config,
     low_cpu_mem_usage=True,
     trust_remote_code=True,
 )

 # Align model tokens with tokenizer
 model.config.pad_token_id = tokenizer.pad_token_id
 model.config.bos_token_id = tokenizer.bos_token_id
 model.config.eos_token_id = tokenizer.eos_token_id
 print("Model loaded successfully in 4-bit mode on GPU.")

 # ----------------------------
 # [3/7] Prepare model for k-bit training
 # ----------------------------
 print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
 model = prepare_model_for_kbit_training(model)
-model.gradient_checkpointing_enable(
-    gradient_checkpointing_kwargs={"use_reentrant": False}
-)
+model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
 model.config.use_cache = False  # Important with gradient checkpointing + QLoRA
 print("Model prepared for k-bit training.")
 print("Gradient checkpointing enabled (non-reentrant).")

 # ----------------------------
 # [4/7] LoRA configuration
@@ -104,10 +103,8 @@ lora_config = LoraConfig(
         "down_proj",
     ],
 )

 model = get_peft_model(model, lora_config)
 model.print_trainable_parameters()
 print("LoRA adapters successfully attached.")

 # ----------------------------
@@ -115,18 +112,19 @@ print("LoRA adapters successfully attached.")
 # ----------------------------
 print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
 dataset = load_dataset("json", data_files=DATA_FILE)
 print(f"Dataset loaded with {len(dataset['train'])} samples.")
 print("Formatting dataset for Ukrainian → French translation...")

 def format_prompt(example):
     return {
-        "text": ("<|user|>\n"
+        "text": (
+            "<|im_start|>user\n"
             "Translate the following Ukrainian text into French.\n"
             f"Ukrainian: {example['text']}\n"
-            "<|assistant|>\n"
+            "<|im_end|>\n"
+            "<|im_start|>assistant\n"
             f"{example['translation']}"
+            "<|im_end|>"
         )
     }
@@ -134,7 +132,6 @@ dataset = dataset.map(
     format_prompt,
     remove_columns=dataset["train"].column_names
 )
 print("Dataset formatting completed.")
 print("Example prompt:\n")
 print(dataset["train"][0]["text"])
@@ -146,46 +143,49 @@ print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
 training_args = TrainingArguments(
     output_dir=OUTPUT_DIR,
     per_device_train_batch_size=1,
-    gradient_accumulation_steps=8,
+    gradient_accumulation_steps=16,
     learning_rate=1e-4,
     num_train_epochs=3,
-    fp16=False,
-    bf16=False,
+    max_steps=1000,
+    fp16=False,  # ⚠ disable AMP
+    bf16=False,  # ⚠ disable BF16
     optim="paged_adamw_32bit",
     logging_steps=10,
     save_steps=500,
     save_total_limit=2,
     report_to="none",
     dataloader_pin_memory=False,
     max_grad_norm=0.0,  # avoid AMP gradient clipping
 )
 print("Training arguments ready.")
 print(f"Output directory: {OUTPUT_DIR}")
 print(f"Epochs: {training_args.num_train_epochs}")
-print(f"Effective batch size: "
-    f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}"
-)
+print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

 # ----------------------------
-# Trainer
+# [7/7] Trainer
 # ----------------------------
-print("Initializing SFTTrainer...")
+print(f"{80 * '_'}\nInitializing SFTTrainer...")
 trainer = SFTTrainer(
     model=model,
     train_dataset=dataset["train"],
     processing_class=tokenizer,
     args=training_args,
 )
 print("Trainer initialized.")

 # ----------------------------
-# [7/7] Training
+# Training
 # ----------------------------
-print(f"{80 * '_'}\n[7/7] Starting training...")
-checkpoint_exists = any(
+print(f"{80 * '_'}\nStarting training...")
+checkpoint_exists = False
+if os.path.exists(OUTPUT_DIR):
+    checkpoint_exists = any(
         d.startswith("checkpoint-")
         for d in os.listdir(OUTPUT_DIR)
-) if os.path.exists(OUTPUT_DIR) else False
+    )

 if checkpoint_exists:
     print("Checkpoint found → resuming training")
@@ -194,7 +194,6 @@ else:
     print("No checkpoint found → starting fresh training")

 train_output = trainer.train()

 print("\n=== Training summary ===")
 print(f"Global steps: {train_output.global_step}")
 print(f"Training loss: {train_output.training_loss}")
@@ -202,11 +201,10 @@ print(f"Metrics: {train_output.metrics}")
 print("Training completed successfully.")

 # ----------------------------
-# Save LoRA adapter
+# Save LoRA adapter and tokenizer
 # ----------------------------
 print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")
 trainer.model.save_pretrained(OUTPUT_DIR)
 tokenizer.save_pretrained(OUTPUT_DIR)

 print("\n=== Fine-tuning finished ===")
 print(f"LoRA adapter saved in: {OUTPUT_DIR}")