Script Python permettant de traduire un long texte
Ви не можете вибрати більше 25 тем. Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. import PyPDF2
  2. import requests
  3. import json
  4. import os
  5. from datetime import datetime
  6. # Configuration
  7. PDF_PATH = "TaniaBorecMemoir(Ukr).pdf"
  8. OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
  9. OLLAMA_URL = "http://localhost:11434/api/generate"
  10. TARGET_LANGUAGE = "français"
  11. CHECKPOINT_FILE = "checkpoint.json"
  12. TEMP_OUTPUT_TXT = "output_temp.txt"
  13. FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f" ({TARGET_LANGUAGE.upper()[:2]})_V2.pdf")
  14. FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f" ({TARGET_LANGUAGE.upper()[:2]})_V2.txt")
  15. # Charge ou initialise le checkpoint
  16. def load_checkpoint():
  17. if os.path.exists(CHECKPOINT_FILE):
  18. with open(CHECKPOINT_FILE, "r") as f:
  19. return json.load(f)
  20. return {"last_processed_index": -1, "results": {}}
  21. # Sauvegarde le checkpoint
  22. def save_checkpoint(last_index, results):
  23. with open(CHECKPOINT_FILE, "w") as f:
  24. json.dump({"last_processed_index": last_index, "results": results}, f)
  25. # Sauvegarde les résultats temporaires dans un fichier TXT
  26. def save_temp_results(results):
  27. with open(TEMP_OUTPUT_TXT, "w", encoding="utf-8") as f:
  28. for idx, translation in results.items():
  29. f.write(f"Paragraphe {idx}:\n{translation}\n\n")
  30. # Extraction du texte du PDF (inchangée)
  31. def extract_text_from_pdf(pdf_path):
  32. text_by_page = []
  33. with open(pdf_path, "rb") as file:
  34. reader = PyPDF2.PdfReader(file)
  35. for page in reader.pages:
  36. text = page.extract_text()
  37. text_by_page.append(text)
  38. return text_by_page
  39. # Découpage en paragraphes (inchangé)
  40. def split_pages_in_paragraphs(pages_text):
  41. import re
  42. full_text = "\n".join(pages_text)
  43. full_text = re.sub(r'(?<![.!?])\n+(?![.!?])', ' ', full_text)
  44. paragraphs = re.split(r'(?<=[.!?])\s*\n+', full_text.strip())
  45. paragraphs = [re.sub(r'\s+', ' ', p).strip() for p in paragraphs if p.strip()]
  46. return paragraphs
  47. # Envoi à Ollama (inchangé)
  48. def send_to_ollama(text, target_lang=TARGET_LANGUAGE, model=OLLAMA_MODEL):
  49. full_prompt = f"\n\nTraduis le texte suivant de l'ukrainien vers le {target_lang} :\n{text}"
  50. payload = {"model": model, "prompt": full_prompt, "stream": False}
  51. response = requests.post(OLLAMA_URL, data=json.dumps(payload))
  52. if response.status_code == 200:
  53. return response.json()["response"]
  54. else:
  55. raise Exception(f"Erreur Ollama: {response.text}")
  56. # Création du PDF final (inchangée)
  57. def create_pdf_from_results(results, output_path):
  58. from reportlab.lib.pagesizes import letter
  59. from reportlab.lib.units import inch
  60. from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
  61. from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
  62. from reportlab.lib.enums import TA_JUSTIFY
  63. from reportlab.pdfbase import pdfmetrics
  64. from reportlab.pdfbase.ttfonts import TTFont
  65. doc = SimpleDocTemplate(output_path, pagesize=letter)
  66. story = []
  67. styles = getSampleStyleSheet()
  68. body_style = styles["BodyText"]
  69. for idx, translation in results.items():
  70. story.append(Paragraph(translation, body_style))
  71. doc.build(story)
  72. print(f"PDF final généré : {output_path}")
  73. # Fonction principale
  74. def main():
  75. # Charge le checkpoint
  76. checkpoint = load_checkpoint()
  77. last_index = checkpoint["last_processed_index"]
  78. results = checkpoint["results"]
  79. # Extraction des paragraphes
  80. pages = extract_text_from_pdf(PDF_PATH)
  81. paragraphs = split_pages_in_paragraphs(pages)
  82. # Traitement des paragraphes
  83. batch_size = 3
  84. for i in range(last_index + 1, len(paragraphs), batch_size):
  85. batch = paragraphs[i:i + batch_size]
  86. paragraph_cumul = "\n".join(batch)
  87. print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
  88. try:
  89. result = send_to_ollama(paragraph_cumul)
  90. print(f"{result}")
  91. results[i] = result
  92. save_checkpoint(i, results) # Sauvegarde le checkpoint
  93. save_temp_results(results) # Sauvegarde les résultats temporaires
  94. except Exception as e:
  95. print(f"Erreur : {e}")
  96. continue
  97. # Génération des fichiers finaux
  98. save_temp_results(results)
  99. create_pdf_from_results(results, FINAL_OUTPUT_PDF)
  100. os.rename(TEMP_OUTPUT_TXT, FINAL_OUTPUT_TXT)
  101. print("Traduction terminée !")
  102. if __name__ == "__main__":
  103. main()