| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121 |
- import PyPDF2
- import requests
- import json
- import os
- from datetime import datetime
-
# Configuration
PDF_PATH = "TaniaBorecMemoir(Ukr).pdf"  # source PDF whose text is translated
OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"  # model name sent in each request
OLLAMA_URL = "http://localhost:11434/api/generate"  # endpoint the prompts are posted to
TARGET_LANGUAGE = "français"  # inserted verbatim into the translation prompt
CHECKPOINT_FILE = "checkpoint.json"  # resumable progress (index + translations)
TEMP_OUTPUT_TXT = "output_temp.txt"  # rewritten after every translated batch

# Final outputs reuse the source file name, tagged with the first two
# upper-cased letters of the target language and a version marker.
_LANGUAGE_TAG = TARGET_LANGUAGE.upper()[:2]
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf", f" ({_LANGUAGE_TAG})_V2.pdf")
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf", f" ({_LANGUAGE_TAG})_V2.txt")
-
def load_checkpoint():
    """Load the saved translation state, or start a fresh one.

    Returns:
        The object decoded from the JSON file at CHECKPOINT_FILE when that
        file exists; otherwise a new state whose "last_processed_index" is
        -1 and whose "results" mapping is empty.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        # Nothing saved yet: -1 means no batch has been processed.
        return {"last_processed_index": -1, "results": {}}

    with open(CHECKPOINT_FILE, "r") as checkpoint_fh:
        state = json.load(checkpoint_fh)
    return state
-
def save_checkpoint(last_index, results):
    """Persist the translation progress to CHECKPOINT_FILE.

    The state is first written to a sibling temporary file and then moved
    over the real checkpoint with ``os.replace``. If the process is
    interrupted while writing, the previous checkpoint is still intact
    instead of being left as truncated, unreadable JSON.

    Args:
        last_index: Value stored under "last_processed_index".
        results: Mapping stored under "results"; must be JSON-serializable.
    """
    tmp_path = CHECKPOINT_FILE + ".tmp"
    with open(tmp_path, "w") as f:
        json.dump({"last_processed_index": last_index, "results": results}, f)
    # Swap the fully written file into place in a single step.
    os.replace(tmp_path, CHECKPOINT_FILE)
-
def save_temp_results(results):
    """Rewrite TEMP_OUTPUT_TXT with every translation collected so far.

    Each entry of ``results`` is written as a "Paragraphe <key>:" header
    line, the translated text, and a blank line. The file is opened in
    write mode with UTF-8 encoding, so previous content is replaced.

    Args:
        results: Mapping of index to translated text.
    """
    entries = (
        f"Paragraphe {idx}:\n{translation}\n\n"
        for idx, translation in results.items()
    )
    with open(TEMP_OUTPUT_TXT, "w", encoding="utf-8") as out:
        out.writelines(entries)
-
def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one item per page, in page order, each being the value
        returned by ``extract_text()`` for that page.
    """
    with open(pdf_path, "rb") as pdf_file:
        # Extraction happens while the file is still open, since the reader
        # is built on top of the open file object.
        return [page.extract_text() for page in PyPDF2.PdfReader(pdf_file).pages]
-
def split_pages_in_paragraphs(pages_text):
    """Merge page texts and split the result into paragraphs.

    Pages are joined with a newline. A run of newlines is turned into a
    space when the character before it and the character after it are both
    something other than '.', '!' or '?'. The text is then cut at newlines
    that follow one of those punctuation marks, and whitespace inside each
    piece is collapsed to single spaces.

    Args:
        pages_text: Iterable of strings, one per page.

    Returns:
        A list of non-empty paragraph strings, in document order.
    """
    import re

    merged = "\n".join(pages_text)
    # Undo line wrapping inside sentences.
    unwrapped = re.sub(r'(?<![.!?])\n+(?![.!?])', ' ', merged)

    paragraphs = []
    for chunk in re.split(r'(?<=[.!?])\s*\n+', unwrapped.strip()):
        if not chunk.strip():
            continue  # drop whitespace-only pieces
        paragraphs.append(re.sub(r'\s+', ' ', chunk).strip())
    return paragraphs
-
def send_to_ollama(text, target_lang=TARGET_LANGUAGE, model=OLLAMA_MODEL, timeout=600):
    """Ask the Ollama server to translate ``text`` and return its answer.

    Args:
        text: Ukrainian text to translate.
        target_lang: Language name inserted into the prompt.
        model: Name of the Ollama model to query.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits forever on a stalled server. Pass
            ``None`` to restore the unbounded wait.

    Returns:
        The "response" field of the JSON body returned by the server.

    Raises:
        RuntimeError: If the server answers with a status other than 200.
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    full_prompt = f"\n\nTraduis le texte suivant de l'ukrainien vers le {target_lang} :\n{text}"
    payload = {"model": model, "prompt": full_prompt, "stream": False}
    # json= serializes the payload and sets the JSON Content-Type header.
    response = requests.post(OLLAMA_URL, json=payload, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(f"Erreur Ollama: {response.text}")
    return response.json()["response"]
-
def create_pdf_from_results(results, output_path):
    """Build the final PDF, one ReportLab paragraph per translated batch.

    ReportLab's ``Paragraph`` interprets its text as XML-like markup, so
    the raw translation is escaped first; otherwise a '<', '>' or '&'
    produced by the model is parsed as markup. Newlines are converted to
    ``<br/>`` so that line breaks inside a batch remain visible.

    Args:
        results: Mapping of index to translated text, rendered in the
            mapping's iteration order.
        output_path: Path of the PDF file to create.
    """
    from xml.sax.saxutils import escape

    from reportlab.lib.pagesizes import letter
    from reportlab.lib.styles import getSampleStyleSheet
    from reportlab.platypus import SimpleDocTemplate, Paragraph

    doc = SimpleDocTemplate(output_path, pagesize=letter)
    body_style = getSampleStyleSheet()["BodyText"]

    story = []
    for translation in results.values():
        # Escape before adding markup, so only our own <br/> tags are parsed.
        markup = escape(translation).replace("\n", "<br/>")
        story.append(Paragraph(markup, body_style))

    doc.build(story)
    print(f"PDF final généré : {output_path}")
-
def main():
    """Translate the PDF batch by batch, resuming from the checkpoint.

    Steps: load the checkpoint, extract and split the PDF text, translate
    the remaining paragraphs in batches of three (saving the checkpoint and
    the temporary TXT after each success), then write the final PDF and
    move the temporary TXT to its final name.

    A batch that fails is reported and skipped. If a later batch then
    succeeds, the checkpoint moves past the failed one and it is not
    retried on the next run.
    """
    # Load the checkpoint
    checkpoint = load_checkpoint()
    last_index = checkpoint["last_processed_index"]
    results = checkpoint["results"]

    # Extract the paragraphs
    pages = extract_text_from_pdf(PDF_PATH)
    paragraphs = split_pages_in_paragraphs(pages)
    total = len(paragraphs)

    # Process the paragraphs
    batch_size = 3
    # The checkpoint stores the START index of the last translated batch, so
    # the next batch begins one full batch later. Resuming at last_index + 1
    # would re-translate the tail of that batch and shift all later batches.
    start = 0 if last_index < 0 else last_index + batch_size
    for i in range(start, total, batch_size):
        batch = paragraphs[i:i + batch_size]
        paragraph_cumul = "\n".join(batch)

        print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, total)} / {total}")

        try:
            result = send_to_ollama(paragraph_cumul)
            print(f"{result}")
            # JSON turns keys into strings when the checkpoint is reloaded;
            # storing strings from the start keeps one key type across runs.
            results[str(i)] = result
            save_checkpoint(i, results)  # Save the checkpoint
            save_temp_results(results)  # Save the temporary results
        except Exception as e:
            # Best effort: report the failure and move on to the next batch.
            print(f"Erreur : {e}")

    # Generate the final files
    save_temp_results(results)
    create_pdf_from_results(results, FINAL_OUTPUT_PDF)
    # os.replace overwrites an existing final TXT on every platform, whereas
    # os.rename fails on Windows when the destination already exists.
    os.replace(TEMP_OUTPUT_TXT, FINAL_OUTPUT_TXT)
    print("Traduction terminée !")
-
# Run the translation pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|