Formatage des paragraphes

2026-01-04 23:31:20 +01:00 · 2026-01-04 23:31:20 +01:00 · 4284cd357e
commit 4284cd357e
parent db20e54f5d
2 changed files with 17 additions and 36 deletions
--- a/1
+++ b/1
@@ -4,6 +4,7 @@ PARAMETER	num_ctx	131072
 SYSTEM """
 You are a professional translator specialising in translating Ukrainian text into English.
 Translate accurately and naturally, respecting the original intonation used by the author of the text.
+You must always answer in french.
 You must not interpret the author's thoughts or reflections.
 Do not add any text before or after the text provided.
 """
--- a/main.py
+++ b/main.py
@@ -17,9 +17,6 @@ OLLAMA_URL = "http://localhost:11434/api/generate"  # URL par défaut d'Ollama
 TARGET_LANGUAGE = "français"  # Langue cible (ex: "français", "anglais", "allemand", "espagnol", etc.)
 OUTPUT_PDF_PATH = PDF_PATH.replace(".pdf", f" ({TARGET_LANGUAGE.upper()[:2]}).pdf")  # Chemin du PDF de sortie

-# Prompt système personnalisé (instructions pour le LLM)
-SYSTEM_PROMPT = """"""
-
 def extract_parameters_from_template(template_str):
    """Extrait les paramètres du modèle à partir du template."""
    import re
@@ -133,46 +130,28 @@ def extract_text_from_pdf(pdf_path):
    return text_by_page

 def merge_paragraphs_across_pages(pages_text):
-    """Divise le texte en chunks raisonnables pour la traduction."""
+    """
+    Divise le texte en paragraphes en détectant un point suivi d'un saut de ligne ou d'un retour à la ligne.
+    Conserve les sauts de ligne à l'intérieur des paragraphes.
+    """
    import re
-    
+
    # Concatène tout le texte
    full_text = "\n".join(pages_text)
-    
-    # Essaie d'abord de diviser par les doubles sauts de ligne
-    paragraphs = re.split(r'\n\s*\n+', full_text.strip())
-    
-    # Si on obtient seulement un paragraphe, on divise par une taille maximale
-    if len(paragraphs) == 1:
-        print("Aucune séparation par double saut de ligne détectée. Division par taille...")
-        # Divise par les phrases (points suivis d'un espace)
-        sentences = re.split(r'(?<=[.!?])\s+', full_text.strip())
-        
-        # Regroupe les phrases en chunks d'environ 1500 caractères
-        max_chunk_size = 1500
-        paragraphs = []
-        current_chunk = ""
-        
-        for sentence in sentences:
-            if len(current_chunk) + len(sentence) < max_chunk_size:
-                current_chunk += (" " + sentence) if current_chunk else sentence
-            else:
-                if current_chunk:
-                    paragraphs.append(current_chunk)
-                current_chunk = sentence
-        
-        if current_chunk:
-            paragraphs.append(current_chunk)
-    else:
-        # Normalise les sauts de ligne internes
-        paragraphs = [re.sub(r'\n+', ' ', p.strip()) for p in paragraphs if p.strip()]
-    
+
+    # Divise le texte en paragraphes : un point suivi d'un saut de ligne ou d'un retour à la ligne
+    paragraphs = re.split(r'(?<=[.!?])\s*\n+', full_text.strip())
+
+    # Conserve les sauts de ligne à l'intérieur des paragraphes
+    paragraphs = [p.strip() for p in paragraphs if p.strip()]
+
    return paragraphs

-def send_to_ollama(text, target_lang=TARGET_LANGUAGE, model=OLLAMA_MODEL, context_size=128000, system_prompt=SYSTEM_PROMPT):
+
+def send_to_ollama(text, target_lang=TARGET_LANGUAGE, model=OLLAMA_MODEL, context_size=128000):
    """Envoie une requête à Ollama et retourne la réponse traduite."""
    # Construit le prompt avec les instructions système et la demande de traduction
-    full_prompt = f"{system_prompt}\n\nTraduis le texte suivant de l'ukrainien vers le {target_lang} :\n{text}"
+    full_prompt = f"\n\nTraduis le texte suivant de l'ukrainien vers le {target_lang} :\n{text}"
    payload = {
        "model": model,
        "prompt": full_prompt,
@@ -282,6 +261,7 @@ def main():
    
    # Traitement des paragraphes complets
    for i, paragraph_text in enumerate(paragraphs, start=1):
+        if( i > 8 ): break
        print(f"{15 * '-'} Traduction du paragraphe {i}/{len(paragraphs)}...")
        try:
            result = send_to_ollama(paragraph_text, target_lang=TARGET_LANGUAGE)