Python script for translating a long text

finetunning.py 6.3KB

import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from trl import SFTTrainer

# ----------------------------
# Environment safety (Windows + AMP fixes)
# ----------------------------
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"  # ✅ disable AMP completely
os.environ["TORCH_AMP_DISABLE"] = "1"  # ✅ disable GradScaler
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # optional: force first GPU

# ----------------------------
# Global configuration
# ----------------------------
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
DATA_FILE = "paires_clean.json"
MAX_SEQ_LENGTH = 512  # reduced to fit RTX 4080 SUPER VRAM

print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")

# ----------------------------
# [1/7] Tokenizer
# ----------------------------
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_SEQ_LENGTH
print("Tokenizer loaded.")
print(f"Pad token id: {tokenizer.pad_token_id}")
print(f"Max sequence length: {tokenizer.model_max_length}")
# ----------------------------
# [2/7] Load model in 4-bit (QLoRA)
# ----------------------------
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
assert torch.cuda.is_available(), "CUDA GPU not detected!"
print(f"Using GPU: {torch.cuda.get_device_name(0)}")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # fp16 internally
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# Align model tokens with tokenizer
model.config.pad_token_id = tokenizer.pad_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
print("Model loaded successfully in 4-bit mode on GPU.")

# ----------------------------
# [3/7] Prepare model for k-bit training
# ----------------------------
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.config.use_cache = False  # important with gradient checkpointing + QLoRA
print("Model prepared for k-bit training.")

# ----------------------------
# [4/7] LoRA configuration
# ----------------------------
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.02,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("LoRA adapters successfully attached.")
# ----------------------------
# [5/7] Dataset loading & formatting
# ----------------------------
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
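# Assumed structure of paires_clean.json, inferred from format_prompt below,
# which reads example["text"] (Ukrainian source) and example["translation"] (French):
#   [
#     {"text": "<Ukrainian sentence>", "translation": "<French sentence>"},
#     ...
#   ]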
dataset = load_dataset("json", data_files=DATA_FILE)
print(f"Dataset loaded with {len(dataset['train'])} samples.")
print("Formatting dataset for Ukrainian → French translation...")

def format_prompt(example):
    return {
        "text": (
            "<|im_start|>user\n"
            "Translate the following Ukrainian text into French.\n"
            f"Ukrainian: {example['text']}\n"
            "<|im_end|>\n"
            "<|im_start|>assistant\n"
            f"{example['translation']}"
            "<|im_end|>"
        )
    }
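# Note: the hand-written template above mirrors Qwen's ChatML chat format
# (<|im_start|> / <|im_end|> markers). An equivalent, untested alternative would be
# to let the tokenizer build the string itself:
#   tokenizer.apply_chat_template(
#       [
#           {"role": "user", "content": "Translate the following Ukrainian text into French.\nUkrainian: " + example["text"]},
#           {"role": "assistant", "content": example["translation"]},
#       ],
#       tokenize=False,
#   )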
dataset = dataset.map(
    format_prompt,
    remove_columns=dataset["train"].column_names
)
print("Dataset formatting completed.")
print("Example prompt:\n")
print(dataset["train"][0]["text"])
# ----------------------------
# [6/7] Training arguments
# ----------------------------
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    num_train_epochs=3,
    max_steps=1000,  # when > 0, max_steps takes precedence over num_train_epochs
    fp16=False,  # ⚠ disable AMP
    bf16=False,  # ⚠ disable BF16
    optim="paged_adamw_32bit",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    dataloader_pin_memory=False,
    max_grad_norm=0.0,  # avoid AMP gradient clipping
)
print("Training arguments ready.")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
# ----------------------------
# [7/7] Trainer
# ----------------------------
print(f"{80 * '_'}\nInitializing SFTTrainer...")
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    args=training_args,
)
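# Note: recent trl releases default to training on a "text" column, which is what
# format_prompt produces. Depending on the installed trl version, you may need to
# pass the tokenizer explicitly (tokenizer=... in older releases, processing_class=...
# in newer ones) and/or set dataset_text_field="text" (via SFTConfig in newer versions).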
print("Trainer initialized.")

# ----------------------------
# Training
# ----------------------------
print(f"{80 * '_'}\nStarting training...")
checkpoint_exists = False
if os.path.exists(OUTPUT_DIR):
    checkpoint_exists = any(
        d.startswith("checkpoint-")
        for d in os.listdir(OUTPUT_DIR)
    )

if checkpoint_exists:
    print("Checkpoint found → resuming training")
    train_output = trainer.train(resume_from_checkpoint=True)
else:
    print("No checkpoint found → starting fresh training")
    train_output = trainer.train()

print("\n=== Training summary ===")
print(f"Global steps: {train_output.global_step}")
print(f"Training loss: {train_output.training_loss}")
print(f"Metrics: {train_output.metrics}")
print("Training completed successfully.")

# ----------------------------
# Save LoRA adapter and tokenizer
# ----------------------------
print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("\n=== Fine-tuning finished ===")
print(f"LoRA adapter saved in: {OUTPUT_DIR}")
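
For reference, a minimal inference sketch to run as a separate script once training has finished. It reloads the base model in 4-bit, attaches the LoRA adapter saved in OUTPUT_DIR, and translates one Ukrainian sentence using the same ChatML-style prompt as during training; the example sentence is only a placeholder.

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
ADAPTER_DIR = "./qwen2.5-7b-uk-fr-lora"

# Tokenizer was saved alongside the adapter at the end of training
tokenizer = AutoTokenizer.from_pretrained(ADAPTER_DIR, trust_remote_code=True)

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
    ),
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(base_model, ADAPTER_DIR)  # attach the LoRA adapter
model.eval()

# Same prompt layout as in format_prompt, stopping at the assistant turn
prompt = (
    "<|im_start|>user\n"
    "Translate the following Ukrainian text into French.\n"
    "Ukrainian: Доброго ранку!\n"
    "<|im_end|>\n"
    "<|im_start|>assistant\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=256)

# Decode only the newly generated tokens (the French translation)
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))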