traducteur de texte long

2026-01-04 19:38:02 +01:00
commit db7a60bc12
4 changed files with 151 additions and 0 deletions
@@ -0,0 +1,36 @@
+# Traduction PDF avec Ollama
+
+Ce projet permet de traduire un document PDF page par page en utilisant un modèle LLM local (Ollama) optimisé pour la traduction.
+
+---
+
+## Prérequis
+
+- **Python** (version 3.8 ou supérieure)
+- **Ollama** installé et en cours d'exécution sur ta machine (en mode "serveur")
+- Un **document PDF** à traduire
+- Un modèle LLM spécialisé dans la traduction avec un contexte long.
+
+---
+
+## Création d'un modèle LLM de traduction avec Ollama
+En partant de zongwei/gemma3-translator:4b, voici le fichier de customisation (`Modelfile`) :
+```shell
+FROM zongwei/gemma3-translator:4b
+PARAMETER	temperature    0.1
+PARAMETER	num_ctx	131072
+SYSTEM """
+Tu es un traducteur professionnel spécialisé dans la traduction de texte ukrainien vers le français.
+Traduis fidèlement et naturellement en respectant l'intonation originale utilisée par l'auteur du texte.
+Tu ne dois pas interpréter les pensées ou les réflexions de l'auteur.
+Tu dois toujours répondre en français.
+"""
+```
+Il faut ensuite compiler le modèle avec la commande :
+```
+ollama create traductionUkrainienVersFrancais -f .\Modelfile
+```
+
+## Installation
+
+
@@ -0,0 +1,108 @@
+import json
+import os
+
+import PyPDF2
+import requests
+from reportlab.lib.enums import TA_JUSTIFY
+from reportlab.lib.pagesizes import letter
+from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
+from reportlab.lib.units import inch
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
+
+# Configuration
+PDF_PATH = "TaniaBorecMemoir(Ukr).pdf"  # Fichier original
+OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
+OLLAMA_URL = "http://localhost:11434/api/generate"  # URL par défaut d'Ollama
+OUTPUT_PDF_PATH = PDF_PATH.replace(".pdf", " (FR).pdf")  # Chemin du PDF de sortie
+
+def extract_text_from_pdf(pdf_path):
+    """Extrait le texte page par page d'un PDF."""
+    text_by_page = []
+    with open(pdf_path, "rb") as file:
+        reader = PyPDF2.PdfReader(file)
+        for page in reader.pages:
+            text_by_page.append(page.extract_text())
+    return text_by_page
+
+def send_to_ollama(prompt, model=OLLAMA_MODEL, context_size=128000):
+    """Envoie une requête à Ollama et retourne la réponse."""
+    payload = {
+        "model": model,
+        "prompt": prompt,
+        "stream": False,
+        "options": {"num_ctx": context_size}
+    }
+    response = requests.post(OLLAMA_URL, data=json.dumps(payload))
+    if response.status_code == 200:
+        return response.json()["response"]
+    else:
+        raise Exception(f"Erreur Ollama: {response.text}")
+
+def create_pdf_from_results(results, output_path):
+    """Crée un PDF à partir des résultats de traduction."""
+    doc = SimpleDocTemplate(output_path, pagesize=letter, topMargin=inch, bottomMargin=inch)
+    story = []
+    
+    # Style personnalisé
+    styles = getSampleStyleSheet()
+    title_style = ParagraphStyle(
+        'CustomTitle',
+        parent=styles['Heading1'],
+        fontSize=16,
+        textColor='#1f4788',
+        spaceAfter=0.3*inch,
+        alignment=TA_JUSTIFY
+    )
+    
+    page_style = ParagraphStyle(
+        'PageHeading',
+        parent=styles['Heading2'],
+        fontSize=12,
+        textColor='#1f4788',
+        spaceAfter=0.2*inch,
+        spaceBefore=0.2*inch
+    )
+    
+    body_style = ParagraphStyle(
+        'CustomBody',
+        parent=styles['BodyText'],
+        fontSize=11,
+        alignment=TA_JUSTIFY,
+        spaceAfter=0.2*inch
+    )
+    
+    # Titre
+    story.append(Paragraph("Traduction - Ukrainien vers Français", title_style))
+    story.append(Spacer(1, 0.2*inch))
+    
+    # Contenu
+    for page_num, translation in results.items():
+        story.append(Paragraph(f"Page {page_num}", page_style))
+        story.append(Paragraph(translation, body_style))
+        story.append(Spacer(1, 0.1*inch))
+    
+    # Construction du PDF
+    doc.build(story)
+    print(f"PDF généré avec succès : {output_path}")
+
+def main():
+    # Extraction du texte
+    pages = extract_text_from_pdf(PDF_PATH)
+    print(f"Nombre de pages extraites : {len(pages)}")
+
+    # Dictionnaire pour stocker les résultats
+    results = {}
+    
+    # Traitement page par page
+    for i, page_text in enumerate(pages, start=1):
+        print(f"Traitement de la page {i}...")
+        prompt = f"Traduis le texte suivant de l'ukrainien vers le français : {page_text}"
+        try:
+            result = send_to_ollama(prompt)
+            results[i] = result
+        except Exception as e:
+            results[i] = f"Erreur lors du traitement de la page {i} : {e}"
+    
+    # Création du PDF avec tous les résultats
+    create_pdf_from_results(results, OUTPUT_PDF_PATH)
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,7 @@
+certifi==2026.1.4
+charset-normalizer==3.4.4
+idna==3.11
+PyPDF2==3.0.1
+requests==2.32.5
+reportlab==4.0.9
+urllib3==2.6.2