Python script for translating a long text

finetunning.py 6.3KB

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from trl import SFTTrainer

# ----------------------------
# Environment safety (Windows + AMP fixes)
# ----------------------------
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"  # ✅ disable AMP completely
os.environ["TORCH_AMP_DISABLE"] = "1"            # ✅ disable GradScaler
os.environ["CUDA_VISIBLE_DEVICES"] = "0"         # optional: force first GPU

# ----------------------------
# Global configuration
# ----------------------------
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
DATA_FILE = "paires_clean.json"
MAX_SEQ_LENGTH = 512  # Reduce for RTX 4080 SUPER VRAM

print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")

# ----------------------------
# [1/7] Tokenizer
# ----------------------------
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_SEQ_LENGTH
print("Tokenizer loaded.")
print(f"Pad token id: {tokenizer.pad_token_id}")
print(f"Max sequence length: {tokenizer.model_max_length}")

# ----------------------------
# [2/7] Load model in 4-bit (QLoRA)
# ----------------------------
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
assert torch.cuda.is_available(), "CUDA GPU not detected!"
print(f"Using GPU: {torch.cuda.get_device_name(0)}")
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # fp16 internally
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)
# Align model tokens with tokenizer
model.config.pad_token_id = tokenizer.pad_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
print("Model loaded successfully in 4-bit mode on GPU.")

# ----------------------------
# [3/7] Prepare model for k-bit training
# ----------------------------
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.config.use_cache = False  # Important with gradient checkpointing + QLoRA
print("Model prepared for k-bit training.")

# ----------------------------
# [4/7] LoRA configuration
# ----------------------------
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.02,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("LoRA adapters successfully attached.")

# ----------------------------
# [5/7] Dataset loading & formatting
# ----------------------------
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
dataset = load_dataset("json", data_files=DATA_FILE)
print(f"Dataset loaded with {len(dataset['train'])} samples.")
print("Formatting dataset for Ukrainian → French translation...")

def format_prompt(example):
    return {
        "text": (
            "<|im_start|>user\n"
            "Translate the following Ukrainian text into French.\n"
            f"Ukrainian: {example['text']}\n"
            "<|im_end|>\n"
            "<|im_start|>assistant\n"
            f"{example['translation']}"
            "<|im_end|>"
        )
    }

dataset = dataset.map(
    format_prompt,
    remove_columns=dataset["train"].column_names
)
print("Dataset formatting completed.")
print("Example prompt:\n")
print(dataset["train"][0]["text"])
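
# NOTE (illustrative assumption, not taken from the repository): format_prompt
# reads the fields 'text' (Ukrainian source) and 'translation' (French target),
# so paires_clean.json is expected to hold records shaped roughly like:
#   {"text": "Добрий день, як справи?", "translation": "Bonjour, comment allez-vous ?"}
#   {"text": "Дякую за допомогу.", "translation": "Merci pour votre aide."}
# The example sentences above are placeholders for the real training pairs.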

# ----------------------------
# [6/7] Training arguments
# ----------------------------
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    num_train_epochs=3,
    max_steps=1000,
    fp16=False,  # ⚠ disable AMP
    bf16=False,  # ⚠ disable BF16
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    dataloader_pin_memory=False,
    max_grad_norm=0.0,  # avoid AMP gradient clipping
)
print("Training arguments ready.")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

# ----------------------------
# [7/7] Trainer
# ----------------------------
print(f"{80 * '_'}\nInitializing SFTTrainer...")
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    args=training_args,
)
print("Trainer initialized.")

# ----------------------------
# Training
# ----------------------------
print(f"{80 * '_'}\nStarting training...")
checkpoint_exists = False
if os.path.exists(OUTPUT_DIR):
    checkpoint_exists = any(
        d.startswith("checkpoint-")
        for d in os.listdir(OUTPUT_DIR)
    )
if checkpoint_exists:
    print("Checkpoint found → resuming training")
    train_output = trainer.train(resume_from_checkpoint=True)
else:
    print("No checkpoint found → starting fresh training")
    train_output = trainer.train()

print("\n=== Training summary ===")
print(f"Global steps: {train_output.global_step}")
print(f"Training loss: {train_output.training_loss}")
print(f"Metrics: {train_output.metrics}")
print("Training completed successfully.")

# ----------------------------
# Save LoRA adapter and tokenizer
# ----------------------------
print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("\n=== Fine-tuning finished ===")
print(f"LoRA adapter saved in: {OUTPUT_DIR}")