# Alex / python.traduction
import json
import os
from datetime import datetime
from xml.sax.saxutils import escape

import PyPDF2
import requests

# Configuration
# Input document and translation target.
PDF_PATH = "TaniaBorecMemoir(Ukr).pdf"
TARGET_LANGUAGE = "français"

# Ollama endpoint and the model used for the translation requests.
OLLAMA_URL = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"

# Working files written while the translation is in progress.
CHECKPOINT_FILE = "checkpoint.json"
TEMP_OUTPUT_TXT = "output_temp.txt"

# Final output names: the input name with " (<first two letters of the
# upper-cased target language>)_V2" inserted before the extension.
_OUTPUT_SUFFIX = f" ({TARGET_LANGUAGE.upper()[:2]})_V2"
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf", _OUTPUT_SUFFIX + ".pdf")
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf", _OUTPUT_SUFFIX + ".txt")

# Load the checkpoint, or start from a fresh one
def load_checkpoint():
    """Return the saved translation state, or an empty state if none exists.

    Returns:
        The dict parsed from CHECKPOINT_FILE when that file exists; otherwise
        ``{"last_processed_index": -1, "results": {}}``.
    """
    if not os.path.exists(CHECKPOINT_FILE):
        # No previous run to resume: -1 means "nothing processed yet".
        return {"last_processed_index": -1, "results": {}}

    with open(CHECKPOINT_FILE, "r") as checkpoint_fp:
        state = json.load(checkpoint_fp)
    return state

# Save the checkpoint
def save_checkpoint(last_index, results):
    """Persist the translation state to CHECKPOINT_FILE atomically.

    The state is first written to a sibling temporary file and then moved
    over CHECKPOINT_FILE with ``os.replace``. If the process is interrupted
    while writing, the previous checkpoint is left intact instead of being
    truncated into unreadable JSON.

    Args:
        last_index: Value stored under ``"last_processed_index"``.
        results: Mapping stored under ``"results"``; must be JSON-serialisable.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, "w") as f:
        json.dump({"last_processed_index": last_index, "results": results}, f)
    # Same-directory rename: swaps the complete new file in one step.
    os.replace(tmp_path, CHECKPOINT_FILE)

# Save the intermediate results to a TXT file
def save_temp_results(results):
    """Rewrite TEMP_OUTPUT_TXT with every translation collected so far.

    Each entry of ``results`` is written as a ``Paragraphe <key>:`` header
    line, the translation text, and a blank line.

    Args:
        results: Mapping of result key to translated text.
    """
    with open(TEMP_OUTPUT_TXT, "w", encoding="utf-8") as out:
        out.writelines(
            f"Paragraphe {key}:\n{text}\n\n" for key, text in results.items()
        )
# Extract the text of the PDF
def extract_text_from_pdf(pdf_path):
    """Return the extracted text of each page of the PDF at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A list with one entry per page, in page order, holding whatever
        ``page.extract_text()`` returned for that page.
    """
    with open(pdf_path, "rb") as pdf_stream:
        # Pages must be read while the file is still open.
        return [
            pdf_page.extract_text()
            for pdf_page in PyPDF2.PdfReader(pdf_stream).pages
        ]

# Split the text into paragraphs
def split_pages_in_paragraphs(pages_text):
    """Join the page texts and cut the result into paragraphs.

    Args:
        pages_text: Iterable of page strings.

    Returns:
        A list of non-empty paragraph strings, each with internal whitespace
        runs collapsed to a single space and no leading/trailing whitespace.
    """
    import re

    joined = "\n".join(pages_text)

    # A run of newlines that is neither preceded nor followed by '.', '!' or
    # '?' is treated as a line wrap inside a sentence and becomes a space.
    unwrapped = re.sub(r'(?<![.!?])\n+(?![.!?])', ' ', joined)

    # Remaining newlines that follow sentence-ending punctuation mark the
    # paragraph boundaries.
    paragraphs = []
    for chunk in re.split(r'(?<=[.!?])\s*\n+', unwrapped.strip()):
        if not chunk.strip():
            continue
        paragraphs.append(re.sub(r'\s+', ' ', chunk).strip())
    return paragraphs

# Send the text to Ollama
def send_to_ollama(text, target_lang=TARGET_LANGUAGE, model=OLLAMA_MODEL):
    """Ask the Ollama model to translate ``text`` and return its answer.

    Args:
        text: Text to translate.
        target_lang: Language name inserted into the prompt.
        model: Name of the Ollama model to query.

    Returns:
        The ``"response"`` field of the JSON body returned by OLLAMA_URL.

    Raises:
        Exception: If the HTTP status code is not 200; the message carries
            the response body.
    """
    prompt = f"\n\nTraduis le texte suivant de l'ukrainien vers le {target_lang} :\n{text}"
    body = json.dumps({"model": model, "prompt": prompt, "stream": False})

    # NOTE: no timeout is passed, so this call waits until the server answers.
    reply = requests.post(OLLAMA_URL, data=body)

    if reply.status_code != 200:
        raise Exception(f"Erreur Ollama: {reply.text}")
    return reply.json()["response"]

# Build the final PDF
def create_pdf_from_results(results, output_path):
    """Write every translation in ``results`` to a PDF at ``output_path``.

    Each translation becomes one ReportLab ``Paragraph`` in the "BodyText"
    sample style, in the iteration order of ``results``.

    Args:
        results: Mapping of result key to translated text; only the values
            are used.
        output_path: Path of the PDF file to create.
    """
    from reportlab.lib.pagesizes import letter
    from reportlab.lib.styles import getSampleStyleSheet
    from reportlab.platypus import Paragraph, SimpleDocTemplate

    doc = SimpleDocTemplate(output_path, pagesize=letter)
    body_style = getSampleStyleSheet()["BodyText"]

    story = []
    for translation in results.values():
        # Paragraph parses its text as XML-like markup: escape '&', '<' and
        # '>' so plain model output cannot be misread as (broken) tags, then
        # turn newlines into explicit breaks so they are not collapsed.
        markup = escape(translation).replace("\n", "<br/>")
        story.append(Paragraph(markup, body_style))

    doc.build(story)
    print(f"PDF final généré : {output_path}")

# Main entry point
def main():
    """Translate the PDF in batches of paragraphs, resuming from the checkpoint.

    Steps: load the checkpoint, extract and split the PDF text, send each
    batch of paragraphs to Ollama, persist progress after every successful
    batch, then write the final PDF and TXT outputs.

    A batch whose processing raises is reported and skipped so the remaining
    batches are still attempted; the skipped ranges are listed at the end.
    """
    # Load the checkpoint
    checkpoint = load_checkpoint()
    last_index = checkpoint["last_processed_index"]
    results = checkpoint["results"]

    # Extract the paragraphs
    pages = extract_text_from_pdf(PDF_PATH)
    paragraphs = split_pages_in_paragraphs(pages)
    total = len(paragraphs)

    # Process the paragraphs
    batch_size = 3
    failed_ranges = []
    for i in range(last_index + 1, total, batch_size):
        batch = paragraphs[i:i + batch_size]
        last_in_batch = i + len(batch) - 1
        paragraph_cumul = "\n".join(batch)

        print(f"{15 * '-'} Traduction des paragraphes {i+1} à {last_in_batch + 1} / {total}")

        try:
            result = send_to_ollama(paragraph_cumul)
            print(f"{result}")
            # Keys read back from the JSON checkpoint are strings, so new
            # keys are stored as strings too to keep a single key type.
            results[str(i)] = result
            # Record the index of the LAST paragraph of the batch: the loop
            # resumes at last_index + 1, so recording the batch start would
            # send the rest of this batch to the model a second time.
            save_checkpoint(last_in_batch, results)
            save_temp_results(results)   # Save the intermediate results
        except Exception as e:
            print(f"Erreur : {e}")
            failed_ranges.append(f"{i+1}-{last_in_batch + 1}")
            continue

    if failed_ranges:
        # Make skipped batches visible instead of silently missing from the output.
        print(f"Attention : paragraphes non traduits : {', '.join(failed_ranges)}")

    # Generate the final files
    save_temp_results(results)
    create_pdf_from_results(results, FINAL_OUTPUT_PDF)
    # os.replace overwrites an existing final TXT on every platform, whereas
    # os.rename raises on Windows when the destination already exists.
    os.replace(TEMP_OUTPUT_TXT, FINAL_OUTPUT_TXT)
    print("Traduction terminée !")

# Run the translation only when this file is executed as a script.
if __name__ == "__main__":
    main()