Script Python permettant de traduire un long texte
Vous ne pouvez pas sélectionner plus de 25 sujets Les noms de sujets doivent commencer par une lettre ou un nombre, peuvent contenir des tirets ('-') et peuvent comporter jusqu'à 35 caractères.

main.py 4.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. import PyPDF2
  2. import requests
  3. import json
  4. import os
  5. from datetime import datetime
  6. # Configuration
  7. PDF_PATH = "TaniaBorecMemoir(Ukr).pdf"
  8. OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
  9. OLLAMA_URL = "http://localhost:11434/api/generate"
  10. TARGET_LANGUAGE = "français"
  11. CHECKPOINT_FILE = "checkpoint.json"
  12. TEMP_OUTPUT_TXT = "output_temp.txt"
  13. FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f" ({TARGET_LANGUAGE.upper()[:2]})_V2.pdf")
  14. FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f" ({TARGET_LANGUAGE.upper()[:2]})_V2.txt")
  15. # Charge ou initialise le checkpoint
  16. def load_checkpoint():
  17. if os.path.exists(CHECKPOINT_FILE):
  18. with open(CHECKPOINT_FILE, "r") as f:
  19. return json.load(f)
  20. return {"last_processed_index": -1, "results": {}}
  21. # Sauvegarde le checkpoint
  22. def save_checkpoint(last_index, results):
  23. with open(CHECKPOINT_FILE, "w") as f:
  24. json.dump({"last_processed_index": last_index, "results": results}, f)
  25. # Sauvegarde les résultats temporaires dans un fichier TXT
  26. def save_temp_results(results):
  27. with open(TEMP_OUTPUT_TXT, "w", encoding="utf-8") as f:
  28. for idx, translation in results.items():
  29. f.write(f"Paragraphe {idx}:\n{translation}\n\n")
  30. # Extraction du texte du PDF (inchangée)
  31. def extract_text_from_pdf(pdf_path):
  32. text_by_page = []
  33. with open(pdf_path, "rb") as file:
  34. reader = PyPDF2.PdfReader(file)
  35. for page in reader.pages:
  36. text = page.extract_text()
  37. text_by_page.append(text)
  38. return text_by_page
  39. # Découpage en paragraphes (inchangé)
  40. def split_pages_in_paragraphs(pages_text):
  41. import re
  42. full_text = "\n".join(pages_text)
  43. full_text = re.sub(r'(?<![.!?])\n+(?![.!?])', ' ', full_text)
  44. paragraphs = re.split(r'(?<=[.!?])\s*\n+', full_text.strip())
  45. paragraphs = [re.sub(r'\s+', ' ', p).strip() for p in paragraphs if p.strip()]
  46. return paragraphs
  47. # Envoi à Ollama (inchangé)
  48. def send_to_ollama(text, target_lang=TARGET_LANGUAGE, model=OLLAMA_MODEL):
  49. full_prompt = f"\n\nTraduis le texte suivant de l'ukrainien vers le {target_lang} :\n{text}"
  50. payload = {"model": model, "prompt": full_prompt, "stream": False}
  51. response = requests.post(OLLAMA_URL, data=json.dumps(payload))
  52. if response.status_code == 200:
  53. return response.json()["response"]
  54. else:
  55. raise Exception(f"Erreur Ollama: {response.text}")
  56. # Création du PDF final (inchangée)
  57. def create_pdf_from_results(results, output_path):
  58. from reportlab.lib.pagesizes import letter
  59. from reportlab.lib.units import inch
  60. from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
  61. from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
  62. from reportlab.lib.enums import TA_JUSTIFY
  63. from reportlab.pdfbase import pdfmetrics
  64. from reportlab.pdfbase.ttfonts import TTFont
  65. doc = SimpleDocTemplate(output_path, pagesize=letter)
  66. story = []
  67. styles = getSampleStyleSheet()
  68. body_style = styles["BodyText"]
  69. for idx, translation in results.items():
  70. story.append(Paragraph(translation, body_style))
  71. doc.build(story)
  72. print(f"PDF final généré : {output_path}")
  73. # Fonction principale
  74. def main():
  75. # Charge le checkpoint
  76. checkpoint = load_checkpoint()
  77. last_index = checkpoint["last_processed_index"]
  78. results = checkpoint["results"]
  79. # Extraction des paragraphes
  80. pages = extract_text_from_pdf(PDF_PATH)
  81. paragraphs = split_pages_in_paragraphs(pages)
  82. # Traitement des paragraphes
  83. batch_size = 3
  84. for i in range(last_index + 1, len(paragraphs), batch_size):
  85. batch = paragraphs[i:i + batch_size]
  86. paragraph_cumul = "\n".join(batch)
  87. print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
  88. try:
  89. result = send_to_ollama(paragraph_cumul)
  90. print(f"{result}")
  91. results[i] = result
  92. save_checkpoint(i, results) # Sauvegarde le checkpoint
  93. save_temp_results(results) # Sauvegarde les résultats temporaires
  94. except Exception as e:
  95. print(f"Erreur : {e}")
  96. continue
  97. # Génération des fichiers finaux
  98. save_temp_results(results)
  99. create_pdf_from_results(results, FINAL_OUTPUT_PDF)
  100. os.rename(TEMP_OUTPUT_TXT, FINAL_OUTPUT_TXT)
  101. print("Traduction terminée !")
# Run the translation only when this file is executed as a script.
if __name__ == "__main__":
    main()