Python script for translating a long text

cleanDataSet.py 3.6KB

import json
import unicodedata
import re
from collections import OrderedDict

# ----------------------------
# Configuration
# ----------------------------
INPUT_FILE = "paires.json"
OUTPUT_FILE = "paires_clean.json"
MIN_TOKENS = 5
MAX_TOKENS = 200
MIN_QUALITY_SCORE = 0.60
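
# The loader below expects INPUT_FILE to be in JSON Lines format: one object
# per line with "text" (source) and "translation" (target) fields. Judging by
# the FR/UK hint in length_ratio_score, pairs are presumably French→Ukrainian.
# Illustrative line (made-up content):
#   {"text": "Le chat dort sur le canapé.", "translation": "Кіт спить на дивані."}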

print("=== Dataset cleaning + quality scoring started ===")

# ----------------------------
# Normalization helpers
# ----------------------------
def normalize_text(text: str) -> str:
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    return text


def token_count(text: str) -> int:
    return len(text.split())
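
# Quick hand-checked illustration of the helpers above:
#   normalize_text("  L’été\u00A0arrive   vite ")  ->  "L'été arrive vite"
#   (NFKC folds the no-break space, the regex collapses whitespace runs,
#   and the curly apostrophe becomes a straight quote)
#   token_count("L'été arrive vite")  ->  3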

# ----------------------------
# Quality scoring
# ----------------------------
def length_ratio_score(src_len, tgt_len):
    """
    Ideal ratio FR/UK ≈ 0.9 – 1.3
    """
    ratio = tgt_len / max(src_len, 1)
    if ratio < 0.5 or ratio > 2.0:
        return 0.0
    elif 0.75 <= ratio <= 1.5:
        return 1.0
    else:
        return max(0.0, 1.0 - abs(ratio - 1.1))
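
# Hand-checked values for length_ratio_score:
#   src_len=10, tgt_len=12 -> ratio 1.2, inside [0.75, 1.5]   -> 1.0
#   src_len=10, tgt_len=18 -> ratio 1.8 -> 1.0 - |1.8 - 1.1|  -> 0.3
#   src_len=10, tgt_len=25 -> ratio 2.5, outside [0.5, 2.0]   -> 0.0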

def lexical_density_score(text):
    """
    Penalize very repetitive or trivial translations
    """
    tokens = text.split()
    if not tokens:
        return 0.0
    unique_ratio = len(set(tokens)) / len(tokens)
    return min(1.0, unique_ratio * 1.5)


def quality_score(src, tgt):
    src_len = token_count(src)
    tgt_len = token_count(tgt)
    l_score = length_ratio_score(src_len, tgt_len)
    d_score = lexical_density_score(tgt)
    return 0.7 * l_score + 0.3 * d_score
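
# Worked example for the combined score (made-up pair, arithmetic by hand):
# a 5-token source with a 6-token, all-distinct target gives
#   0.7 * length_ratio_score(5, 6) + 0.3 * min(1.0, (6/6) * 1.5) = 0.7 + 0.3 = 1.0
# while a fully repetitive 5-token target such as "так так так так так" gives
#   0.7 * 1.0 + 0.3 * min(1.0, (1/5) * 1.5) = 0.7 + 0.09 = 0.79
# Note that whenever the length score is 1.0, the total is at least 0.7, which
# clears MIN_QUALITY_SCORE: the density term can never reject a pair on its own.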

# ----------------------------
# Load + clean + score
# ----------------------------
unique_sources = OrderedDict()
stats = {
    "total": 0,
    "removed_length": 0,
    "removed_duplicates": 0,
    "removed_quality": 0,
}

with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:  # tolerate blank lines instead of crashing in json.loads
            continue
        stats["total"] += 1
        item = json.loads(line)
        src = normalize_text(item["text"])
        tgt = normalize_text(item["translation"])
        src_len = token_count(src)
        tgt_len = token_count(tgt)
        # Length filtering
        if not (MIN_TOKENS <= src_len <= MAX_TOKENS):
            stats["removed_length"] += 1
            continue
        if not (MIN_TOKENS <= tgt_len <= MAX_TOKENS):
            stats["removed_length"] += 1
            continue
        # Deduplication (first occurrence of a source sentence wins)
        if src in unique_sources:
            stats["removed_duplicates"] += 1
            continue
        # Quality score
        q_score = quality_score(src, tgt)
        if q_score < MIN_QUALITY_SCORE:
            stats["removed_quality"] += 1
            continue
        unique_sources[src] = {
            "translation": tgt,
            "quality_score": round(q_score, 3)
        }

# ----------------------------
# Report
# ----------------------------
print(f"Total lines processed: {stats['total']}")
print(f"Removed (length): {stats['removed_length']}")
print(f"Removed (duplicates): {stats['removed_duplicates']}")
print(f"Removed (quality): {stats['removed_quality']}")
print(f"Final kept pairs: {len(unique_sources)}")

# ----------------------------
# Save cleaned dataset
# ----------------------------
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for src, data in unique_sources.items():
        json.dump(
            {
                "text": src,
                "translation": data["translation"],
                "quality_score": data["quality_score"],
            },
            f,
            ensure_ascii=False
        )
        f.write("\n")

print("=== Cleaning completed ===")
print(f"Clean dataset saved to: {OUTPUT_FILE}")