Python script for translating a long text

validation.py 2.5KB
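The script expects validation.jsonl to contain one JSON object per line, with a "text" field holding the Ukrainian source and a "translation" field holding the French reference (these are the keys the evaluation loop below reads). A minimal example file, with invented placeholder sentences:

{"text": "Доброго ранку!", "translation": "Bonjour !"}
{"text": "Дякую за допомогу.", "translation": "Merci pour votre aide."}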

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu

# ----------------------------
# Configuration
# ----------------------------
MODEL_DIR = "./qwen2.5-7b-uk-fr-lora"  # directory where you saved the LoRA model
VALIDATION_FILE = "validation.jsonl"   # small test subset (5-50 sentences)
MAX_INPUT_LENGTH = 1024
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("=== Loading model and tokenizer ===")
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_INPUT_LENGTH

model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)
model.eval()
print("Model loaded.")

# ----------------------------
# Load validation dataset
# ----------------------------
print("Loading validation dataset...")
dataset = load_dataset("json", data_files=VALIDATION_FILE)
examples = dataset["train"]  # small subset
print(f"{len(examples)} examples loaded for testing.")

# ----------------------------
# Function to generate a translation
# ----------------------------
def translate(text):
    prompt = f"Translate the following Ukrainian text into French:\nUkrainian: {text}\nFrench:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LENGTH).to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,  # deterministic (greedy) decoding
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )
    # Decode only the newly generated tokens, so the prompt never appears
    # in the result (more robust than stripping it via string replacement,
    # which can fail when decoding does not reproduce the prompt verbatim).
    gen_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

# ----------------------------
# Test all examples and compute BLEU
# ----------------------------
print("Generating translations...")
references = []
hypotheses = []
for i, example in enumerate(examples):
    src_text = example["text"]
    ref_text = example["translation"]
    pred_text = translate(src_text)
    print(f"\n[{i+1}] Source: {src_text}")
    print(f" Reference: {ref_text}")
    print(f" Prediction: {pred_text}")
    # Prepare for BLEU (tokenized by whitespace)
    references.append([ref_text.split()])
    hypotheses.append(pred_text.split())

# Compute corpus-level BLEU
bleu_score = corpus_bleu(references, hypotheses)
print(f"\n=== Corpus BLEU score: {bleu_score:.4f} ===")
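Note: on a validation set this small, corpus_bleu can report 0.0 as soon as one n-gram order (typically 4-grams) has no matches at all. A minimal variant using NLTK's built-in smoothing, reusing the references and hypotheses lists built above, avoids this:

from nltk.translate.bleu_score import SmoothingFunction

smooth = SmoothingFunction().method1  # replaces zero n-gram precision counts with a small epsilon
bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smooth)
print(f"\n=== Smoothed corpus BLEU score: {bleu_score:.4f} ===")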