Compare commits: defcc38435...main (21 commits)

Commits: dc66ac9520, 8b45028101, ef4515adcc, 7aea840821, 71e595a966, d5313fb143, 51e114b1ee, fa3ad61dd7, 70e4932cd0, aee2716a41, bf7949d8c3, a4296d012e, c5d372e98d, 8d2e5ac021, adca297850, 83b2eccd07, 8dfb2b81e0, 4ed1ffa226, 182e6e7a98, 50f5bef7f1, 9c3ac3f977
.gitignore (vendored, +4 lines)

```diff
@@ -1,2 +1,6 @@
 output_temp.txt
 checkpoint.json
+Traduction/Modelfile
+.env
+Traduction/*.pdf
+Traduction/*.txt
```
Finetunning/.gitignore (vendored, new file, +2 lines)

```
# Generated models
qwen2.5*/
```
Finetunning/Paires-de-phrases-en-ukrainien-francais-2026-01-06.tsv (new file, +33778 lines): diff suppressed because it is too large.
Finetunning/cleanDataSet.py (new file, +144 lines)

```python
import json
import unicodedata
import re
from collections import OrderedDict

# ----------------------------
# Configuration
# ----------------------------
INPUT_FILE = "paires.json"
OUTPUT_FILE = "paires_clean.json"

MIN_TOKENS = 5
MAX_TOKENS = 200
MIN_QUALITY_SCORE = 0.60

print("=== Dataset cleaning + quality scoring started ===")

# ----------------------------
# Normalization helpers
# ----------------------------
def normalize_text(text: str) -> str:
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
    return text


def token_count(text: str) -> int:
    return len(text.split())


# ----------------------------
# Quality scoring
# ----------------------------
def length_ratio_score(src_len, tgt_len):
    """
    Ideal ratio FR/UK ≈ 0.9 – 1.3
    """
    ratio = tgt_len / max(src_len, 1)

    if ratio < 0.5 or ratio > 2.0:
        return 0.0
    elif 0.75 <= ratio <= 1.5:
        return 1.0
    else:
        return max(0.0, 1.0 - abs(ratio - 1.1))


def lexical_density_score(text):
    """
    Penalize very repetitive or trivial translations
    """
    tokens = text.split()
    if not tokens:
        return 0.0
    unique_ratio = len(set(tokens)) / len(tokens)
    return min(1.0, unique_ratio * 1.5)


def quality_score(src, tgt):
    src_len = token_count(src)
    tgt_len = token_count(tgt)

    l_score = length_ratio_score(src_len, tgt_len)
    d_score = lexical_density_score(tgt)

    return 0.7 * l_score + 0.3 * d_score


# ----------------------------
# Load + clean + score
# ----------------------------
unique_sources = OrderedDict()

stats = {
    "total": 0,
    "removed_length": 0,
    "removed_duplicates": 0,
    "removed_quality": 0,
}

with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        stats["total"] += 1
        item = json.loads(line)

        src = normalize_text(item["text"])
        tgt = normalize_text(item["translation"])

        src_len = token_count(src)
        tgt_len = token_count(tgt)

        # Length filtering
        if not (MIN_TOKENS <= src_len <= MAX_TOKENS):
            stats["removed_length"] += 1
            continue

        if not (MIN_TOKENS <= tgt_len <= MAX_TOKENS):
            stats["removed_length"] += 1
            continue

        # Deduplication
        if src in unique_sources:
            stats["removed_duplicates"] += 1
            continue

        # Quality score
        q_score = quality_score(src, tgt)
        if q_score < MIN_QUALITY_SCORE:
            stats["removed_quality"] += 1
            continue

        unique_sources[src] = {
            "translation": tgt,
            "quality_score": round(q_score, 3)
        }

# ----------------------------
# Report
# ----------------------------
print(f"Total lines processed: {stats['total']}")
print(f"Removed (length): {stats['removed_length']}")
print(f"Removed (duplicates): {stats['removed_duplicates']}")
print(f"Removed (quality): {stats['removed_quality']}")
print(f"Final kept pairs: {len(unique_sources)}")

# ----------------------------
# Save cleaned dataset
# ----------------------------
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for src, data in unique_sources.items():
        json.dump(
            {
                "text": src,
                "translation": data["translation"],
                "quality_score": data["quality_score"],
            },
            f,
            ensure_ascii=False
        )
        f.write("\n")

print("=== Cleaning completed ===")
print(f"Clean dataset saved to: {OUTPUT_FILE}")
```
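To see what the scoring does on one concrete pair, a minimal sketch, worked out by hand from the functions above (the sample pair is taken from validation.jsonl):

```python
# Minimal sketch: score one pair by hand using the functions above.
src = "Я чищу зуби двічі на день."                   # 6 tokens
tgt = "Je me brosse les dents deux fois par jour."   # 9 tokens

# length_ratio_score: ratio = 9 / 6 = 1.5, inside [0.75, 1.5] -> 1.0
# lexical_density_score: 9 unique tokens out of 9 -> min(1.0, 1.0 * 1.5) = 1.0
# quality_score = 0.7 * 1.0 + 0.3 * 1.0 = 1.0, well above MIN_QUALITY_SCORE = 0.60
print(quality_score(src, tgt))  # 1.0
```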
Finetunning/finetunning.py (new file, +210 lines)

```python
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from trl import SFTTrainer

# ----------------------------
# Environment safety (Windows + AMP fixes)
# ----------------------------
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["ACCELERATE_MIXED_PRECISION"] = "no"  # ✅ disable AMP completely
os.environ["TORCH_AMP_DISABLE"] = "1"            # ✅ disable GradScaler
os.environ["CUDA_VISIBLE_DEVICES"] = "0"         # optional: force first GPU

# ----------------------------
# Global configuration
# ----------------------------
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
DATA_FILE = "paires_clean.json"
MAX_SEQ_LENGTH = 512  # reduced to fit RTX 4080 SUPER VRAM

print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")

# ----------------------------
# [1/7] Tokenizer
# ----------------------------
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_SEQ_LENGTH
print("Tokenizer loaded.")
print(f"Pad token id: {tokenizer.pad_token_id}")
print(f"Max sequence length: {tokenizer.model_max_length}")

# ----------------------------
# [2/7] Load model in 4-bit (QLoRA)
# ----------------------------
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
assert torch.cuda.is_available(), "CUDA GPU not detected!"
print(f"Using GPU: {torch.cuda.get_device_name(0)}")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # fp16 internally
    bnb_4bit_use_double_quant=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
)

# Align model tokens with tokenizer
model.config.pad_token_id = tokenizer.pad_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
print("Model loaded successfully in 4-bit mode on GPU.")

# ----------------------------
# [3/7] Prepare model for k-bit training
# ----------------------------
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.config.use_cache = False  # important with gradient checkpointing + QLoRA
print("Model prepared for k-bit training.")

# ----------------------------
# [4/7] LoRA configuration
# ----------------------------
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.02,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("LoRA adapters successfully attached.")

# ----------------------------
# [5/7] Dataset loading & formatting
# ----------------------------
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
dataset = load_dataset("json", data_files=DATA_FILE)
print(f"Dataset loaded with {len(dataset['train'])} samples.")

print("Formatting dataset for Ukrainian → French translation...")
def format_prompt(example):
    return {
        "text": (
            "<|im_start|>user\n"
            "Translate the following Ukrainian text into French.\n"
            f"Ukrainian: {example['text']}\n"
            "<|im_end|>\n"
            "<|im_start|>assistant\n"
            f"{example['translation']}"
            "<|im_end|>"
        )
    }

dataset = dataset.map(
    format_prompt,
    remove_columns=dataset["train"].column_names
)
print("Dataset formatting completed.")
print("Example prompt:\n")
print(dataset["train"][0]["text"])

# ----------------------------
# [6/7] Training arguments
# ----------------------------
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    learning_rate=1e-4,
    num_train_epochs=2,
    max_steps=1000,  # note: max_steps takes precedence over num_train_epochs

    fp16=False,  # ⚠ disable AMP
    bf16=False,  # ⚠ disable BF16

    optim="paged_adamw_32bit",
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    report_to="none",

    dataloader_pin_memory=False,
    max_grad_norm=0.0,  # avoid AMP gradient clipping
)
print("Training arguments ready.")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")

# ----------------------------
# [7/7] Trainer
# ----------------------------
print(f"{80 * '_'}\nInitializing SFTTrainer...")
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    args=training_args,
)
print("Trainer initialized.")

# ----------------------------
# Training
# ----------------------------
print(f"{80 * '_'}\nStarting training...")
checkpoint_exists = False
if os.path.exists(OUTPUT_DIR):
    checkpoint_exists = any(
        d.startswith("checkpoint-")
        for d in os.listdir(OUTPUT_DIR)
    )

if checkpoint_exists:
    print("Checkpoint found → resuming training")
    train_output = trainer.train(resume_from_checkpoint=True)
else:
    print("No checkpoint found → starting fresh training")
    train_output = trainer.train()

print("\n=== Training summary ===")
print(f"Global steps: {train_output.global_step}")
print(f"Training loss: {train_output.training_loss}")
print(f"Metrics: {train_output.metrics}")
print("Training completed successfully.")

# ----------------------------
# Save LoRA adapter and tokenizer
# ----------------------------
print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("\n=== Fine-tuning finished ===")
print(f"LoRA adapter saved in: {OUTPUT_DIR}")
```
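For reference, this is what one training sample looks like once rendered by `format_prompt` (the pair is taken from validation.jsonl; the ChatML markers are the ones hard-coded above):

```text
<|im_start|>user
Translate the following Ukrainian text into French.
Ukrainian: Сніг розтав.
<|im_end|>
<|im_start|>assistant
La neige a fondu.<|im_end|>
```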
Finetunning/mergeLora.py (new file, +68 lines)

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# ----------------------------
# Configuration
# ----------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
LORA_DIR = "./qwen2.5-7b-uk-fr-lora"      # directory produced by the fine-tuning step
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-merged"  # final merged model

DTYPE = torch.float16  # GGUF-friendly
DEVICE = "cpu"         # merge on CPU (stable and safe)

print("=== LoRA merge script started ===")

# ----------------------------
# Load base model
# ----------------------------
print(f"{80 * '_'}\n[1/4] Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=DTYPE,
    device_map=DEVICE,
    trust_remote_code=True,
)
print("Base model loaded.")

# ----------------------------
# Load tokenizer
# ----------------------------
print(f"{80 * '_'}\n[2/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

# ----------------------------
# Load LoRA adapter
# ----------------------------
print(f"{80 * '_'}\n[3/4] Loading LoRA adapter...")
model = PeftModel.from_pretrained(
    base_model,
    LORA_DIR,
)
print("LoRA adapter loaded.")

# ----------------------------
# Merge LoRA into base model
# ----------------------------
print(f"{80 * '_'}\n[4/4] Merging LoRA into base model...")
model = model.merge_and_unload()
print("LoRA successfully merged.")

# ----------------------------
# Save merged model
# ----------------------------
print("Saving merged model...")
model.save_pretrained(
    OUTPUT_DIR,
    safe_serialization=True,
)
tokenizer.save_pretrained(OUTPUT_DIR)

print("=== Merge completed successfully ===")
print(f"Merged model saved in: {OUTPUT_DIR}")
```
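As a quick smoke test of the merged checkpoint, a minimal sketch (paths as configured above; the prompt wording and generation settings are illustrative, and running a 7B model on CPU is slow):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the merged model back and generate one translation.
tok = AutoTokenizer.from_pretrained("./qwen2.5-7b-uk-fr-merged", trust_remote_code=True)
m = AutoModelForCausalLM.from_pretrained("./qwen2.5-7b-uk-fr-merged", trust_remote_code=True)

prompt = "Translate the following Ukrainian text into French.\nUkrainian: Сніг розтав.\n"
out = m.generate(**tok(prompt, return_tensors="pt"), max_new_tokens=64)
print(tok.decode(out[0], skip_special_tokens=True))
```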
Finetunning/paires.json (new file, +33773 lines): diff suppressed because it is too large.

Finetunning/paires_clean.json (new file, +9349 lines): diff suppressed because it is too large.
Finetunning/tsv2json.py (new file, +35 lines)

```python
import json
from collections import defaultdict

# Input and output file paths
input_file = "Paires-de-phrases-en-ukrainien-francais-2026-01-06.tsv"  # replace with your path
output_file = "paires.json"  # output file

# Dictionary of unique pairs (key = Ukrainian sentence, value = list of translations)
unique_pairs = defaultdict(list)

# Read the input file
with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        # Split the line into columns (tab separator)
        parts = line.strip().split("\t")
        if len(parts) >= 4:  # column index 3 is read below, so at least 4 columns are required
            uk_text = parts[1]  # Ukrainian text
            fr_text = parts[3]  # French translation
            # Add the pair to the dictionary (avoids duplicates)
            if fr_text not in unique_pairs[uk_text]:
                unique_pairs[uk_text].append(fr_text)

# Write the output JSONL file
with open(output_file, "w", encoding="utf-8") as f_out:
    for uk_text, fr_translations in unique_pairs.items():
        # Write every distinct translation (one JSONL entry per pair)
        for fr_text in fr_translations:
            # Create a JSONL entry
            entry = {
                "text": uk_text,
                "translation": fr_text
            }
            f_out.write(json.dumps(entry, ensure_ascii=False) + "\n")

print(f"Fichier JSONL généré : {output_file}")
```
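To illustrate the column layout this script assumes (sentence IDs in columns 0 and 2, text in columns 1 and 3; the IDs below are made up):

```text
# Hypothetical input row (tab-separated):
12345	Сніг розтав.	67890	La neige a fondu.

# Resulting JSONL entry in paires.json:
{"text": "Сніг розтав.", "translation": "La neige a fondu."}
```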
Finetunning/validation.jsonl (new file, +30 lines)

```
{"text": "Як би ти не намагався, ти не вивчиш англійську за два-три місяці.", "translation": "Quels que soient tes efforts, tu ne pourras pas apprendre l’anglais en deux-trois mois."}
{"text": "Поки я не подзвонив, він не прийшов.", "translation": "Il n’est pas venu avant que je ne l’appelle."}
{"text": "У всесвіті багато галактик.", "translation": "Il y a beaucoup de galaxies dans l'univers."}
{"text": "Вона приймає душ щоранку.", "translation": "Elle prend une douche chaque matin."}
{"text": "У Майка є декілька друзів у Флориді.", "translation": "Mike a quelques amis en Floride."}
{"text": "Я зустрінуся з тобою в неділю о третій.", "translation": "On se voit dimanche à trois heures."}
{"text": "Я сказав собі: «Це гарна ідея».", "translation": "Je me suis dit : « C’est une bonne idée. »"}
{"text": "Ми збиралися пробути там біля двох тижнів.", "translation": "Nous avions l’intention de rester là près de deux semaines."}
{"text": "Я чищу зуби двічі на день.", "translation": "Je me brosse les dents deux fois par jour."}
{"text": "Він ніжно поклав руку на її плече.", "translation": "Il posa la main gentiment sur son épaule."}
{"text": "Сьогодні жахливо холодно.", "translation": "Il fait horriblement froid aujourd'hui."}
{"text": "У цю суму включено податки.", "translation": "Cette somme inclut les taxes."}
{"text": "Ця школа була заснована в 1650 році.", "translation": "Cette école fut fondée en 1650."}
{"text": "Я випадково знайшов цей ресторан.", "translation": "J'ai trouvé ce restaurant par hasard."}
{"text": "Я не хотів нікого образити.", "translation": "Je ne voulais vexer personne."}
{"text": "Цей сад найкраще виглядає весною.", "translation": "Ce parc est plus joli au printemps."}
{"text": "Цей сир виготовлено з овечого молока.", "translation": "Ce fromage est fait avec du lait de chèvre."}
{"text": "Він спить як немовля.", "translation": "Il dort comme un bébé."}
{"text": "Гора вкрита снігом.", "translation": "La montagne est recouverte de neige."}
{"text": "Я попав під дощ і промок.", "translation": "J’ai été pris sous la pluie, et suis tout trempé."}
{"text": "Прошу, дайте мені ще один шанс.", "translation": "Je vous en prie, donnez-moi encore une chance."}
{"text": "Я все сказав.", "translation": "J’ai tout dit."}
{"text": "Не забувай нас!", "translation": "Ne nous oublie pas !"}
{"text": "Випало багато снігу.", "translation": "Beaucoup de neige est tombée."}
{"text": "Йде сніг.", "translation": "Il est en train de neiger."}
{"text": "Може піти сніг.", "translation": "Il neigera peut-être."}
{"text": "У нас у січні йде сніг.", "translation": "Chez nous, il neige en janvier."}
{"text": "Сніг розтав.", "translation": "La neige a fondu."}
{"text": "Наша компанія планує побудувати новий хімічний завод у Росії.", "translation": "Notre entreprise a le projet de construire une nouvelle usine chimique en Russie."}
{"text": "Франція воювала з Росією.", "translation": "La France fut en guerre avec la Russie."}
```
Finetunning/validation.py (new file, +170 lines)

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
from peft import PeftModel
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu

# ----------------------------
# Configuration
# ----------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"      # base model
LORA_DIR = "./qwen2.5-7b-uk-fr-lora-2epoch"  # fine-tuned LoRA
VALIDATION_FILE = "validation.jsonl"         # small validation subset
MAX_INPUT_LENGTH = 1024
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# List of prompts to test
PROMPTS_TO_TEST = [
    {
        "name": "Prompt de base",
        "prompt": "Traduis la phrase ukrainienne suivante en français: {text}"
    },
    {
        "name": "Prompt spécialisé mémoires",
        "prompt": (
            "Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.\n"
            "- Garde le style narratif et les tournures orales de l'auteur.\n"
            "- Respecte les règles de traduction suivantes :\n\n"
            "Règles strictes :\n"
            "1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).\n"
            "2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.\n\n"
            "Voici la phrase à traduire :\nUkrainien : {text}\nFrançais :"
        )
    },
    {
        "name": "Prompt détaillé",
        "prompt": (
            "Tu es un expert en traduction littéraire spécialisé dans les textes historiques ukrainiens.\n"
            "Règles à suivre absolument :\n"
            "1. Conserve tous les noms propres et toponymes dans leur forme originale\n"
            "2. Préserve le style et le registre de l'auteur original\n"
            "3. Ajoute des notes entre crochets pour expliquer les références culturelles si nécessaire\n"
            "4. Traduis de manière naturelle en français tout en restant fidèle au texte source\n\n"
            "Texte à traduire :\nUkrainien : {text}\nTraduction française :"
        )
    },
    {
        "name": "Prompt minimaliste",
        "prompt": "Traduction fidèle de l'ukrainien vers le français : {text}"
    }
]

print("=== Loading tokenizer and model ===")

# ----------------------------
# Load tokenizer
# ----------------------------
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_INPUT_LENGTH

# ----------------------------
# Load base model directly on GPU
# ----------------------------
print(f"{80 * '_'}\nLoading base model on GPU...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map={"": 0},  # all on GPU
    trust_remote_code=True
)

# ----------------------------
# Apply LoRA adapter
# ----------------------------
print(f"{80 * '_'}\nApplying LoRA adapter...")
model = PeftModel.from_pretrained(base_model, LORA_DIR)
model.eval()
model.to(DEVICE)  # ensure everything is on the GPU
print("Model ready for validation.")

# ----------------------------
# Load validation dataset
# ----------------------------
print(f"{80 * '_'}\nLoading validation dataset...")
dataset = load_dataset("json", data_files=VALIDATION_FILE)
examples = dataset["train"]
print(f"{len(examples)} examples loaded for testing.")

# ----------------------------
# Translation function
# ----------------------------
@torch.inference_mode()
def translate(text, prompt_template):
    prompt = prompt_template.format(text=text)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_LENGTH
    ).to(DEVICE)

    # Use a GenerationConfig to avoid warnings
    generation_config = GenerationConfig.from_model_config(model.config)
    generation_config.max_new_tokens = 256
    generation_config.do_sample = False

    outputs = model.generate(
        **inputs,
        generation_config=generation_config
    )

    result = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract the translation part of the output
    if "Français :" in result:
        translation_part = result.split("Français :")[-1].strip()
    elif "Traduction française :" in result:
        translation_part = result.split("Traduction française :")[-1].strip()
    else:
        translation_part = result.split(text)[-1].strip()

    return translation_part

# ----------------------------
# Evaluate all prompts and select best BLEU
# ----------------------------
best_bleu = 0
best_prompt = None
all_results = {}

print(f"{80 * '_'}\nTesting all prompts and computing BLEU scores...")

for prompt_config in PROMPTS_TO_TEST:
    print(f"\n{80 * '='}\nTesting prompt: {prompt_config['name']}\n{80 * '='}")
    references = []
    hypotheses = []

    for i, example in enumerate(examples):
        src_text = example["text"]
        ref_text = example["translation"]
        pred_text = translate(src_text, prompt_config["prompt"])

        print(f"\n[{i+1}] Source: {src_text}")
        print(f"    Reference: {ref_text}")
        print(f"    Prediction: {pred_text}")

        references.append([ref_text.split()])
        hypotheses.append(pred_text.split())

    bleu_score = corpus_bleu(references, hypotheses) * 100
    print(f"\n=== Corpus BLEU score for '{prompt_config['name']}': {bleu_score:.4f} ===")

    all_results[prompt_config["name"]] = bleu_score

    if bleu_score > best_bleu:
        best_bleu = bleu_score
        best_prompt = prompt_config

# ----------------------------
# Display results
# ----------------------------
print(f"\n{80 * '='}\nFINAL RESULTS\n{80 * '='}")
for prompt_name, score in all_results.items():
    print(f"{prompt_name}: {score:.4f}")

print(f"\nBEST PROMPT: {best_prompt['name']} with BLEU score: {best_bleu:.4f}")
print(f"Prompt content:\n{best_prompt['prompt']}")
```
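One caveat: with only 30 short sentences, `corpus_bleu` without smoothing returns 0.0 as soon as one n-gram order has no match anywhere in the corpus. If that happens, NLTK's smoothing is the usual fix; a minimal variant, assuming the same `references` and `hypotheses` lists as above:

```python
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

# method1 assigns a small count to zero n-gram matches so a short
# corpus does not collapse the whole score to 0.0
smoothie = SmoothingFunction().method1
bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smoothie) * 100
```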
Modelfile (deleted, -16 lines)

```diff
@@ -1,16 +0,0 @@
-FROM qwen2.5:14b
-PARAMETER temperature 0.2
-PARAMETER num_ctx 8192
-
-SYSTEM """
-Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.
-- Utilise le glossaire fourni pour les noms de lieux et termes historiques.
-- Garde le style narratif et les tournures orales de l'auteur.
-Règles strictes :
-1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire : "[Lemberg en 1910]").
-2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l’auteur.
-3. **Pour les termes historiques** (ex. : "powiat"), utilise le terme français standard ou ajoute une note explicative.
-4. **Ne traduis pas** les mots en russe/allemand/polonais intégrés au texte (ex. : citations, noms officiels).
-5. **Structure** : Garde les sauts de ligne et la mise en page originale.
-6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles (ex. : "[Note : ville alors sous domination autrichienne]").
-"""
```
README.md (127 lines changed)

````diff
@@ -9,29 +9,6 @@ This project translates a PDF document page by page using an LLM
 - **Python** (version 3.8 or higher)
 - **Ollama** installed and running on your machine (in "server" mode)
 - A **PDF document** to translate
-- An LLM specialized in translation with a long context.
-
----
-
-## Creating a translation LLM with Ollama
-Starting from the LLM [zongwei/gemma3-translator:4b](https://ollama.com/zongwei/gemma3-translator), we create a model optimized for translation with Ollama.
-For info: there is no need to download it first; it is fetched automatically when the command runs.
-
-```shell
-FROM zongwei/gemma3-translator:4b
-PARAMETER temperature 0.1
-PARAMETER num_ctx 131072
-SYSTEM """
-You are a professional translator specialising in translating Ukrainian text into English.
-Translate accurately and naturally, respecting the original intonation used by the author of the text.
-You must not interpret the author's thoughts or reflections.
-Do not add any text before or after the text provided.
-"""
-```
-The model must then be built with the command:
-```
-ollama create traductionUkrainienVersFrancais -f .\Modelfile
-```
-
 ## Installation
@@ -48,7 +25,12 @@ ollama create traductionUkrainienVersFrancais -f .\Modelfile
 pip install -r requirements.txt
 ```
 
-3. **Place your PDF file** in the project directory with the name configured in `main.py` (default: `TaniaBorecMemoir(Ukr).pdf`)
+Then run:
+```bash
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
+```
+
+3. **Place your PDF file** in the project's `Traduction` directory with the name configured in `main.py` (default: `TaniaBorecMemoir(Ukr).pdf`)
 
 ---
@@ -61,27 +43,6 @@ ollama create traductionUkrainienVersFrancais -f .\Modelfile
 ollama serve
 ```
 
-2. **Check that the translation model is available**
-```bash
-ollama list
-```
-You should see `traductionUkrainienVersFrancais` in the list.
-
-If it is not there, build it as described above (section "Creating a translation LLM with Ollama")
-
-3. **Place your PDF** in the same directory as the `main.py` script
-### Script parameters
-`PDF_PATH` = "TaniaBorecMemoir(Ukr).pdf" <- The name of the PDF file to translate.
-`OLLAMA_MODEL` = "mitmul/plamo-2-translate:latest" <- The model name
-`OLLAMA_URL` = "http://localhost:11434/api/generate" <- Ollama's default URL
-`TARGET_LANGUAGE` = "français" <- Target language (e.g. "français", "anglais", "allemand", "espagnol", etc.)
-`SYSTEM_PROMPT` = """You are a professional translator specialising in Ukrainian text translation.
-Translate accurately and naturally, respecting the original intonation used by the author of the text.
-You must not interpret the author's thoughts or reflections.
-Do not add any text before or after the text provided.
-Preserve the layout and structure of the original text."""
-
-
 ### Running
 
 1. **Launch the translation script**
@@ -120,18 +81,74 @@ You can modify the following parameters in `main.py`:
 
 ---
 
-## Troubleshooting
-
-### Error: "Connection refused to Ollama"
-- Check that Ollama is running with `ollama serve`
-- Check that the URL is correct (default: `http://localhost:11434`)
-
-### Error: "Model not found"
-- Run: `ollama create traductionUkrainienVersFrancais -f .\Modelfile`
-
-### The text is not split into paragraphs properly
-- The script automatically detects double line breaks
-- If absent, it splits by sentences and regroups them into chunks of 1500 characters
-
----
+## Fine-tuning
+/!\ Experimental !!!
+
+Fine-tuning gives a better translation. It is a long process in computation time, but it yields a more precise translation.
+
+The principle is as follows:
+
+```
+1️⃣ Training dataset (paires.json)
+   ↓
+2️⃣ Cleaned dataset (cleanDataSet.py -> paires_clean.json)
+   ↓
+3️⃣ LoRA fine-tuning (finetunning.py)
+   ↓
+4️⃣ Validation / BLEU evaluation (validation.py)
+   ↓
+5️⃣ Merge of LoRA + base model (mergeLora.py)
+   ↓
+6️⃣ Conversion to GGUF (llama.cpp convert_hf_to_gguf.py)
+   ↓
+7️⃣ Ollama (final inference)
+```
+
+### Cleaning the dataset
+Run the script ```python cleanDataSet.py```
+
+### Validation
+Run the script ```python validation.py```
+
+The script tests several prompts and returns the one with the best BLEU score.
+
+That prompt should then be copied into the Modelfile.
+
+### Merge
+Run the script ```python mergeLora.py```
+
+### Conversion to GGUF
+From the project root (and still inside the venv), clone the llama.cpp project:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+pip install -r requirements.txt
+```
+
+Then run the command (/!\ it takes about 10 minutes):
+```bash
+python convert_hf_to_gguf.py ../Finetunning/qwen2.5-7b-uk-fr-merged --outfile qwen2.5-7b-uk-fr.gguf --outtype q8_0
+```
+
+Check:
+```bash
+./main -m qwen2.5-7b-uk-fr.gguf -p "Translate into French: Привіт світ"
+```
+To make this new model usable by Ollama, create a Modelfile that points at the GGUF file and register it with `ollama create` (see the next section).
+
+## Using the fine-tuned model for translation
+Create a Modelfile:
+```
+FROM ./qwen2.5-7b-uk-fr.gguf
+
+PARAMETER temperature 0.1
+PARAMETER top_p 0.95
+PARAMETER num_ctx 4096
+
+SYSTEM """
+You are a professional Ukrainian to French translator.
+Produce faithful, literal translations.
+"""
+```
````
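Assuming the Modelfile above sits next to the GGUF file, registering and trying the model would look like this (the model name is illustrative):

```bash
ollama create qwen2.5-uk-fr -f ./Modelfile
ollama run qwen2.5-uk-fr "Сніг розтав."
```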
Binary file not shown.
Traduction/Modelfile (new file, +19 lines)

```
FROM translategemma:12b
PARAMETER temperature 0.2
PARAMETER num_ctx 8192

SYSTEM """
You are a professional Ukrainian (uk) to French (fr) translator. Your goal is to accurately convey the meaning and nuances of the original Ukrainian text while adhering to French grammar, vocabulary, and cultural sensitivities.
Produce only the French translation, without any additional explanations or commentary. Please translate the following Ukrainian text into French:


{TEXT}

Règles strictes :
1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l’auteur.
3. **Pour les termes historiques** (ex. : "powiat"), utilise le terme français standard et ajoute une note explicative.
4. **Conserve les citations** russes/allemandes/polonaises intégrées au texte (mais ajoute une note de fin de paragraphe entre [ ] en la traduisant et en précisant la langue d'origine).
5. **Structure** : Garde les sauts de ligne et la mise en page originale.
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique existe (ex. : "[Note : le contexte]").
"""
```
main.py (modified)

```diff
@@ -3,23 +3,22 @@ import requests
 import json
 from reportlab.lib.pagesizes import letter
 from reportlab.lib.units import inch
-from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Flowable
+from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
 from reportlab.lib.enums import TA_JUSTIFY
 from reportlab.pdfbase import pdfmetrics
 from reportlab.pdfbase.ttfonts import TTFont
-import os
+import os, time
 
 # Configuration
-DEBUG = True
-PDF_PATH = "TaniaBorecMemoir(Ukr).pdf"
+PDF_PATH = "Traduction/TaniaBorecMemoir(Ukr).pdf"
 OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
 OLLAMA_URL = "http://localhost:11434/api/generate"
 TARGET_LANGUAGE = "français"
-CHECKPOINT_FILE = "checkpoint.json"
-TEMP_OUTPUT_TXT = "output_temp.txt"
-FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V6.pdf")
-FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V6.txt")
+CHECKPOINT_FILE = "Traduction/checkpoint.json"
+TEMP_OUTPUT_TXT = "Traduction/output_temp.txt"
+FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V10.pdf")
+FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V10.txt")
 
 DEBUG = True
 
@@ -174,7 +173,6 @@
         return json.load(f)
     return {"last_processed_index": -1, "results": {}}
 
-# Save the checkpoint
 # Save the checkpoint
 def save_checkpoint(last_index, results):
     # Sort the keys of the results dictionary
@@ -341,6 +339,7 @@
     print(f"Batches manquants détectés : {missing_batches}")
 
     # Translate the missing paragraphs
+    temps_cumule = 0.0
     for i in missing_batches:
         batch = paragraphs[i:i + batch_size]
         paragraph_cumul = "\n".join(batch)
@@ -348,13 +347,24 @@
         print(f"{15 * '-'} Traduction des paragraphes manquants {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
 
         try:
+            debut_chrono = time.time()
             result = send_to_ollama(paragraph_cumul)
+            fin_chrono = time.time()
+            temps_paragraphe = fin_chrono - debut_chrono
+            temps_cumule += temps_paragraphe
+
+            # Convert to minutes and seconds
+            minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
+            minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
+
             print(f"{result}")
             results[str(i)] = result
             save_checkpoint(len(paragraphs), results)  # Update to the last batch index
             save_temp_results(results)
         except Exception as e:
             print(f"Erreur lors de la traduction du paragraphe {i}: {e}")
+        print(f"  Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
+        print(f"  Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")
 
     # Process the remaining paragraphs
     for i in range(last_index + 1, len(paragraphs), batch_size):
@@ -364,7 +374,16 @@
         print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
 
         try:
+            debut_chrono = time.time()
             result = send_to_ollama(paragraph_cumul)
+            fin_chrono = time.time()
+            temps_paragraphe = fin_chrono - debut_chrono
+            temps_cumule += temps_paragraphe
+
+            # Convert to minutes and seconds
+            minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
+            minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
+
             print(f"{result}")
             results[str(i)] = result
             save_checkpoint(i + batch_size - 1, results)
@@ -372,6 +391,9 @@
         except Exception as e:
             print(f"Erreur : {e}")
             continue
+        print(f"  Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
+        print(f"  Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")
+
 
     save_temp_results(results)
     create_pdf_from_results(results, FINAL_OUTPUT_PDF)
```
llama.cpp (submodule added at e463bbdf65)
requirements.txt (modified)

```diff
@@ -1,8 +1,17 @@
-certifi==2026.1.4
-charset-normalizer==3.4.4
-idna==3.11
-pillow==12.1.0
-PyPDF2==3.0.1
-reportlab==4.4.7
-requests==2.32.5
-urllib3==2.6.2
+certifi
+charset-normalizer
+idna
+pillow
+PyPDF2
+reportlab
+requests
+urllib3
+
+torch
+transformers
+datasets
+peft
+bitsandbytes
+accelerate
+trl
+nltk
```
run.bat (modified)

```diff
@@ -11,18 +11,19 @@ REM Path to the Python virtual environment (relative to the current directory)
 set VENV_PATH=%CURRENT_DIR%\venv
 
 REM Path to your main script (relative to the current directory)
-set MAIN_SCRIPT_PATH=%CURRENT_DIR%\main.py
+set MAIN_SCRIPT_PATH=%CURRENT_DIR%\Traduction\main.py
 
 REM Activate the Python virtual environment
 call %VENV_PATH%\Scripts\activate.bat
 
 REM Build the LLM model for Ollama
-ollama create traductionUkrainienVersFrancais -f .\Modelfile
+echo Compilation du modèle LLM pour Ollama
+ollama create traductionUkrainienVersFrancais -f .\Traduction\Modelfile
 
 :: 1. Check whether the ollama.exe process is running
 tasklist | find "ollama.exe" >nul
 if %ERRORLEVEL% equ 0 (
-    echo [OK] Le processus Ollama est en cours d'execution.
+    echo [OK] Le processus Ollama est bien en cours d'execution.
 ) else (
     echo [ERREUR] Ollama n'est pas lancé.
     pause
@@ -39,6 +40,7 @@ if %ERRORLEVEL% neq 0 (
 )
 
 REM Run the main script
+echo Lancement du script principal de traduction
 python %MAIN_SCRIPT_PATH%
 
 endlocal
```