Python script for translating a long text

validation.py 2.5KB
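The script below reads VALIDATION_FILE as JSON Lines, one object per line with a "text" field (Ukrainian source) and a "translation" field (French reference), as used in the evaluation loop. A minimal sketch for producing such a file; the two sample sentence pairs are placeholders:

import json

# Each line of validation.jsonl is one JSON object with the two fields
# the validation script reads: "text" (source) and "translation" (reference).
samples = [
    {"text": "Добрий день!", "translation": "Bonjour !"},
    {"text": "Як справи?", "translation": "Comment ça va ?"},
]
with open("validation.jsonl", "w", encoding="utf-8") as f:
    for s in samples:
        f.write(json.dumps(s, ensure_ascii=False) + "\n")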

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu

# ----------------------------
# Configuration
# ----------------------------
MODEL_DIR = "./qwen2.5-7b-uk-fr-lora"  # folder where you saved the LoRA
VALIDATION_FILE = "validation.jsonl"   # small test subset (5-50 sentences)
MAX_INPUT_LENGTH = 1024
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print("=== Loading model and tokenizer ===")
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_INPUT_LENGTH

model = AutoModelForCausalLM.from_pretrained(
    MODEL_DIR,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)
model.eval()
print("Model loaded.")

# ----------------------------
# Load validation dataset
# ----------------------------
print("Loading validation dataset...")
dataset = load_dataset("json", data_files=VALIDATION_FILE)
examples = dataset["train"]  # small subset
print(f"{len(examples)} examples loaded for testing.")

# ----------------------------
# Function to generate a translation
# ----------------------------
def translate(text):
    prompt = f"Translate the following Ukrainian text into French:\nUkrainian: {text}\nFrench:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_INPUT_LENGTH).to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            do_sample=False,  # deterministic (greedy decoding)
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    # Decode only the newly generated tokens, skipping the prompt tokens;
    # this is more robust than string-replacing the prompt out of the
    # decoded output, which can fail when detokenization alters whitespace.
    generated = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()

# ----------------------------
# Test all examples and compute BLEU
# ----------------------------
print("Generating translations...")
references = []
hypotheses = []
for i, example in enumerate(examples):
    src_text = example["text"]
    ref_text = example["translation"]
    pred_text = translate(src_text)
    print(f"\n[{i+1}] Source: {src_text}")
    print(f"    Reference: {ref_text}")
    print(f"    Prediction: {pred_text}")
    # Prepare for BLEU (tokenized on whitespace)
    references.append([ref_text.split()])
    hypotheses.append(pred_text.split())

# Compute corpus-level BLEU
bleu_score = corpus_bleu(references, hypotheses)
print(f"\n=== Corpus BLEU score: {bleu_score:.4f} ===")
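Note: on a validation set this small (5-50 sentences), higher-order n-gram matches are often absent and corpus_bleu can collapse to 0.0. A hedged variant using NLTK's built-in smoothing (SmoothingFunction lives in nltk.translate.bleu_score alongside corpus_bleu):

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# method1 adds a small constant to zero n-gram counts so a tiny
# validation set does not yield a degenerate BLEU of exactly 0.0.
smooth = SmoothingFunction().method1
bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smooth)
print(f"Smoothed corpus BLEU: {bleu_score:.4f}")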