Python script for translating a long text

cleanDataSet.py (3.6 KB)

import json
import unicodedata
import re
from collections import OrderedDict

# ----------------------------
# Configuration
# ----------------------------
INPUT_FILE = "paires.json"
OUTPUT_FILE = "paires_clean.json"
MIN_TOKENS = 5
MAX_TOKENS = 200
MIN_QUALITY_SCORE = 0.60

print("=== Dataset cleaning + quality scoring started ===")
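# NOTE: despite the .json extension, INPUT_FILE is read line by line below,
# i.e. it is expected to be JSON Lines, one {"text": ..., "translation": ...}
# object per line.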
# ----------------------------
# Normalization helpers
# ----------------------------
def normalize_text(text: str) -> str:
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    return text


def token_count(text: str) -> int:
    return len(text.split())
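# Illustrative example: NFKC folds a non-breaking space into a plain space,
# the regex collapses runs of whitespace, and curly quotes become ASCII:
#   normalize_text("Bonjour\u00a0 le  “monde”")  ->  'Bonjour le "monde"'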
# ----------------------------
# Quality scoring
# ----------------------------
def length_ratio_score(src_len, tgt_len):
    """
    Ideal ratio FR/UK ≈ 0.9 – 1.3
    """
    ratio = tgt_len / max(src_len, 1)
    if ratio < 0.5 or ratio > 2.0:
        return 0.0
    elif 0.75 <= ratio <= 1.5:
        return 1.0
    else:
        return max(0.0, 1.0 - abs(ratio - 1.1))
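# Resulting shape: hard zero outside [0.5, 2.0], full score on the
# 0.75–1.5 plateau, and a linear falloff centred on ratio ≈ 1.1 in between
# (e.g. ratio 1.8 scores max(0, 1 - 0.7) = 0.3).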
def lexical_density_score(text):
    """
    Penalize very repetitive or trivial translations
    """
    tokens = text.split()
    if not tokens:
        return 0.0
    unique_ratio = len(set(tokens)) / len(tokens)
    return min(1.0, unique_ratio * 1.5)


def quality_score(src, tgt):
    src_len = token_count(src)
    tgt_len = token_count(tgt)
    l_score = length_ratio_score(src_len, tgt_len)
    d_score = lexical_density_score(tgt)
    return 0.7 * l_score + 0.3 * d_score
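# Worked example (hypothetical pair): a 10-token source with an 18-token
# target gives ratio 1.8, so l_score = 0.3; if 12 of the 18 target tokens
# are unique, d_score = min(1.0, (12 / 18) * 1.5) = 1.0, and the overall
# score is 0.7 * 0.3 + 0.3 * 1.0 = 0.51 < MIN_QUALITY_SCORE, so the pair
# is dropped.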
# ----------------------------
# Load + clean + score
# ----------------------------
unique_sources = OrderedDict()
stats = {
    "total": 0,
    "removed_length": 0,
    "removed_duplicates": 0,
    "removed_quality": 0,
}
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if not line:
            continue  # skip blank lines rather than crash in json.loads
        stats["total"] += 1
        item = json.loads(line)
        src = normalize_text(item["text"])
        tgt = normalize_text(item["translation"])
        src_len = token_count(src)
        tgt_len = token_count(tgt)
        # Length filtering
        if not (MIN_TOKENS <= src_len <= MAX_TOKENS):
            stats["removed_length"] += 1
            continue
        if not (MIN_TOKENS <= tgt_len <= MAX_TOKENS):
            stats["removed_length"] += 1
            continue
        # Deduplication (first occurrence of a source wins)
        if src in unique_sources:
            stats["removed_duplicates"] += 1
            continue
        # Quality score
        q_score = quality_score(src, tgt)
        if q_score < MIN_QUALITY_SCORE:
            stats["removed_quality"] += 1
            continue
        unique_sources[src] = {
            "translation": tgt,
            "quality_score": round(q_score, 3)
        }
# ----------------------------
# Report
# ----------------------------
print(f"Total lines processed: {stats['total']}")
print(f"Removed (length): {stats['removed_length']}")
print(f"Removed (duplicates): {stats['removed_duplicates']}")
print(f"Removed (quality): {stats['removed_quality']}")
print(f"Final kept pairs: {len(unique_sources)}")
# ----------------------------
# Save cleaned dataset
# ----------------------------
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for src, data in unique_sources.items():
        json.dump(
            {
                "text": src,
                "translation": data["translation"],
                "quality_score": data["quality_score"],
            },
            f,
            ensure_ascii=False
        )
        f.write("\n")

print("=== Cleaning completed ===")
print(f"Clean dataset saved to: {OUTPUT_FILE}")
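A minimal sketch of the expected round trip, assuming paires.json is JSON Lines with one FR→UK pair per line (the sentences below are invented for illustration):

Input line (paires.json):
    {"text": "Le chat dort tranquillement sur le canapé du salon.", "translation": "Кіт спокійно спить на дивані у вітальні."}

Output line (paires_clean.json):
    {"text": "Le chat dort tranquillement sur le canapé du salon.", "translation": "Кіт спокійно спить на дивані у вітальні.", "quality_score": 1.0}

Here both sides pass the 5–200 token window, the 7/9 length ratio lands on the full-score plateau, and all target tokens are unique, so the pair is kept with a score of 1.0.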