Compare commits
12 Commits
182e6e7a98
...
51e114b1ee
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
51e114b1ee | ||
|
|
fa3ad61dd7 | ||
|
|
70e4932cd0 | ||
|
|
aee2716a41 | ||
|
|
bf7949d8c3 | ||
|
|
a4296d012e | ||
|
|
c5d372e98d | ||
|
|
8d2e5ac021 | ||
|
|
adca297850 | ||
|
|
83b2eccd07 | ||
|
|
8dfb2b81e0 | ||
|
|
4ed1ffa226 |
2
Finetunning/.gitignore
vendored
Normal file
2
Finetunning/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
# Les modèles générés
|
||||
qwen2.5*/
|
||||
|
Can't render this file because it is too large.
|
144
Finetunning/cleanDataSet.py
Normal file
144
Finetunning/cleanDataSet.py
Normal file
@ -0,0 +1,144 @@
|
||||
import json
|
||||
import unicodedata
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
|
||||
# ----------------------------
# Configuration
# ----------------------------
INPUT_FILE = "paires.json"         # raw sentence pairs, one JSON object per line
OUTPUT_FILE = "paires_clean.json"  # cleaned, deduplicated, scored output

MIN_TOKENS = 5            # reject pairs with fewer tokens on either side
MAX_TOKENS = 200          # reject pairs with more tokens on either side
MIN_QUALITY_SCORE = 0.60  # minimum blended quality score required to keep a pair

print("=== Dataset cleaning + quality scoring started ===")
|
||||
|
||||
# ----------------------------
|
||||
# Normalization helpers
|
||||
# ----------------------------
|
||||
# Maps typographic (curly) quotes to their plain ASCII equivalents.
_QUOTE_TABLE = str.maketrans({"’": "'", "‘": "'", "“": '"', "”": '"'})


def normalize_text(text: str) -> str:
    """Return *text* NFKC-normalized, whitespace-collapsed, with ASCII quotes."""
    normalized = unicodedata.normalize("NFKC", text)
    normalized = re.sub(r"\s+", " ", normalized).strip()
    return normalized.translate(_QUOTE_TABLE)
|
||||
|
||||
|
||||
def token_count(text: str) -> int:
    """Return the number of whitespace-delimited tokens in *text*."""
    tokens = text.split()
    return len(tokens)
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# Quality scoring
|
||||
# ----------------------------
|
||||
def length_ratio_score(src_len, tgt_len):
    """Score how plausible the target/source token-length ratio is.

    The ideal FR/UK ratio is roughly 0.9 – 1.3.  Ratios outside [0.5, 2.0]
    are rejected outright; ratios in [0.75, 1.5] are considered perfect;
    anything in between decays linearly with distance from 1.1.
    """
    ratio = tgt_len / max(src_len, 1)  # guard against a zero-length source

    if not 0.5 <= ratio <= 2.0:
        return 0.0
    if 0.75 <= ratio <= 1.5:
        return 1.0
    return max(0.0, 1.0 - abs(ratio - 1.1))
|
||||
|
||||
|
||||
def lexical_density_score(text):
    """Penalize very repetitive or trivial translations.

    Uses the type/token ratio (distinct tokens over total tokens), boosted
    by 1.5 and capped at 1.0, so moderately varied text still scores 1.0.
    """
    tokens = text.split()
    if not tokens:
        return 0.0
    diversity = len(set(tokens)) / len(tokens)
    return min(1.0, diversity * 1.5)
|
||||
|
||||
|
||||
def quality_score(src, tgt):
    """Blend the pair's quality signals into one score in [0, 1].

    Length-ratio plausibility carries 70% of the weight; lexical density
    of the target carries the remaining 30%.
    """
    l_score = length_ratio_score(token_count(src), token_count(tgt))
    d_score = lexical_density_score(tgt)
    return 0.7 * l_score + 0.3 * d_score
|
||||
|
||||
|
||||
# ----------------------------
# Load + clean + score
# ----------------------------
# Keyed by normalized source sentence; insertion order is preserved so the
# output keeps the input file's ordering.
unique_sources = OrderedDict()

stats = {
    "total": 0,
    "removed_length": 0,
    "removed_duplicates": 0,
    "removed_quality": 0,
}

with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at EOF) instead of
        # crashing inside json.loads with a JSONDecodeError.
        if not line.strip():
            continue
        stats["total"] += 1
        item = json.loads(line)

        src = normalize_text(item["text"])
        tgt = normalize_text(item["translation"])

        src_len = token_count(src)
        tgt_len = token_count(tgt)

        # Length filtering: both sides must fall inside [MIN_TOKENS, MAX_TOKENS].
        if not (MIN_TOKENS <= src_len <= MAX_TOKENS) or not (
            MIN_TOKENS <= tgt_len <= MAX_TOKENS
        ):
            stats["removed_length"] += 1
            continue

        # Deduplication on the normalized source sentence: first occurrence wins.
        if src in unique_sources:
            stats["removed_duplicates"] += 1
            continue

        # Quality gate.
        q_score = quality_score(src, tgt)
        if q_score < MIN_QUALITY_SCORE:
            stats["removed_quality"] += 1
            continue

        unique_sources[src] = {
            "translation": tgt,
            "quality_score": round(q_score, 3),
        }

# ----------------------------
# Report
# ----------------------------
print(f"Total lines processed: {stats['total']}")
print(f"Removed (length): {stats['removed_length']}")
print(f"Removed (duplicates): {stats['removed_duplicates']}")
print(f"Removed (quality): {stats['removed_quality']}")
print(f"Final kept pairs: {len(unique_sources)}")

# ----------------------------
# Save cleaned dataset (JSON Lines, one pair per line)
# ----------------------------
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for src, data in unique_sources.items():
        json.dump(
            {
                "text": src,
                "translation": data["translation"],
                "quality_score": data["quality_score"],
            },
            f,
            ensure_ascii=False,
        )
        f.write("\n")

print("=== Cleaning completed ===")
print(f"Clean dataset saved to: {OUTPUT_FILE}")
|
||||
@ -1,9 +1,11 @@
|
||||
import os
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForCausalLM,
|
||||
TrainingArguments,
|
||||
BitsAndBytesConfig,
|
||||
)
|
||||
from peft import (
|
||||
LoraConfig,
|
||||
@ -11,50 +13,85 @@ from peft import (
|
||||
prepare_model_for_kbit_training,
|
||||
)
|
||||
from trl import SFTTrainer
|
||||
import os
|
||||
|
||||
# ----------------------------
|
||||
# Environment safety (Windows)
|
||||
# ----------------------------
|
||||
os.environ["TORCHDYNAMO_DISABLE"] = "1"
|
||||
|
||||
# ----------------------------
|
||||
# Model configuration
|
||||
# Global configuration
|
||||
# ----------------------------
|
||||
MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct"
|
||||
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
|
||||
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
|
||||
DATA_FILE = "paires_clean.json"
|
||||
MAX_SEQ_LENGTH = 1024
|
||||
|
||||
print("=== Starting fine-tuning script ===")
|
||||
print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
|
||||
|
||||
# ----------------------------
|
||||
# [1/7] Tokenizer
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
MODEL_NAME,
|
||||
trust_remote_code=True
|
||||
)
|
||||
|
||||
# Ensure padding token is defined
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
tokenizer.model_max_length = 1024
|
||||
tokenizer.model_max_length = MAX_SEQ_LENGTH
|
||||
|
||||
print("Tokenizer loaded and configured.")
|
||||
|
||||
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_NAME,
|
||||
load_in_4bit=True,
|
||||
device_map="auto",
|
||||
torch_dtype=torch.float16, # OK for weights
|
||||
trust_remote_code=True,
|
||||
)
|
||||
print("Model loaded.")
|
||||
|
||||
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
print("Model prepared for k-bit training.")
|
||||
print("Tokenizer loaded.")
|
||||
print(f"Pad token id: {tokenizer.pad_token_id}")
|
||||
print(f"Max sequence length: {tokenizer.model_max_length}")
|
||||
|
||||
# ----------------------------
|
||||
# LoRA configuration
|
||||
# [2/7] Quantization config (QLoRA)
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (optimized QLoRA)...")
|
||||
|
||||
assert torch.cuda.is_available(), "CUDA GPU not detected!"
|
||||
print(f"Using GPU: {torch.cuda.get_device_name(0)}")
|
||||
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_NAME,
|
||||
device_map="cuda", # 🔥 SAFE
|
||||
quantization_config=bnb_config,
|
||||
low_cpu_mem_usage=True,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
print("Model loaded successfully in 4-bit mode on GPU.")
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# [3/7] Prepare model for k-bit training
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
|
||||
model.gradient_checkpointing_enable(
|
||||
gradient_checkpointing_kwargs={"use_reentrant": False}
|
||||
)
|
||||
|
||||
print("Model prepared for k-bit training.")
|
||||
print("Gradient checkpointing enabled (non-reentrant).")
|
||||
|
||||
# ----------------------------
|
||||
# [4/7] LoRA configuration
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
|
||||
lora_config = LoraConfig(
|
||||
r=16,
|
||||
lora_alpha=32,
|
||||
lora_dropout=0.05,
|
||||
r=32,
|
||||
lora_alpha=64,
|
||||
lora_dropout=0.02,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM",
|
||||
target_modules=[
|
||||
@ -70,57 +107,64 @@ lora_config = LoraConfig(
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
model.print_trainable_parameters()
|
||||
print("LoRA adapters attached to the model.")
|
||||
|
||||
print("LoRA adapters successfully attached.")
|
||||
|
||||
# ----------------------------
|
||||
# Dataset loading
|
||||
# [5/7] Dataset loading & formatting
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
|
||||
dataset = load_dataset(
|
||||
"json",
|
||||
data_files="traductions.json"
|
||||
)
|
||||
dataset = load_dataset("json", data_files=DATA_FILE)
|
||||
|
||||
print(f"Dataset loaded with {len(dataset['train'])} samples.")
|
||||
|
||||
print("Formatting dataset for Ukrainian → French translation...")
|
||||
|
||||
def format_prompt(example):
|
||||
prompt = (
|
||||
"Translate the following Ukrainian text into French.\n\n"
|
||||
f"Ukrainian: {example['text']}\n"
|
||||
f"French: {example['translation']}"
|
||||
)
|
||||
return {"text": prompt}
|
||||
return {
|
||||
"text": ("<|user|>\n"
|
||||
"Translate the following Ukrainian text into French.\n"
|
||||
f"Ukrainian: {example['text']}\n"
|
||||
"<|assistant|>\n"
|
||||
f"{example['translation']}"
|
||||
)
|
||||
}
|
||||
|
||||
dataset = dataset.map(
|
||||
format_prompt,
|
||||
remove_columns=dataset["train"].column_names
|
||||
)
|
||||
|
||||
dataset = dataset.map(format_prompt, remove_columns=dataset["train"].column_names)
|
||||
print("Dataset formatting completed.")
|
||||
print("Example prompt:\n")
|
||||
print(dataset["train"][0]["text"])
|
||||
|
||||
# ----------------------------
|
||||
# Training arguments
|
||||
# [6/7] Training arguments
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./qwen-uk-fr-lora",
|
||||
output_dir=OUTPUT_DIR,
|
||||
per_device_train_batch_size=1,
|
||||
gradient_accumulation_steps=8,
|
||||
learning_rate=2e-4,
|
||||
learning_rate=1e-4,
|
||||
num_train_epochs=3,
|
||||
|
||||
fp16=False,
|
||||
bf16=False,
|
||||
|
||||
optim="paged_adamw_32bit",
|
||||
logging_steps=10,
|
||||
save_steps=500,
|
||||
save_total_limit=2,
|
||||
|
||||
# Use 32-bit optimizer
|
||||
optim="paged_adamw_32bit",
|
||||
|
||||
report_to="none",
|
||||
dataloader_pin_memory=False,
|
||||
)
|
||||
|
||||
|
||||
print("Training arguments ready.")
|
||||
print(f"Output directory: {OUTPUT_DIR}")
|
||||
print(f"Epochs: {training_args.num_train_epochs}")
|
||||
print(f"Effective batch size: "
|
||||
f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}"
|
||||
)
|
||||
|
||||
# ----------------------------
|
||||
# Trainer
|
||||
@ -135,18 +179,34 @@ trainer = SFTTrainer(
|
||||
print("Trainer initialized.")
|
||||
|
||||
# ----------------------------
|
||||
# Train
|
||||
# [7/7] Training
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[7/7] Starting training...")
|
||||
trainer.train()
|
||||
checkpoint_exists = any(
|
||||
d.startswith("checkpoint-")
|
||||
for d in os.listdir(OUTPUT_DIR)
|
||||
) if os.path.exists(OUTPUT_DIR) else False
|
||||
|
||||
if checkpoint_exists:
|
||||
print("Checkpoint found → resuming training")
|
||||
train_output = trainer.train(resume_from_checkpoint=True)
|
||||
else:
|
||||
print("No checkpoint found → starting fresh training")
|
||||
train_output = trainer.train()
|
||||
|
||||
|
||||
print("\n=== Training summary ===")
|
||||
print(f"Global steps: {train_output.global_step}")
|
||||
print(f"Training loss: {train_output.training_loss}")
|
||||
print(f"Metrics: {train_output.metrics}")
|
||||
print("Training completed successfully.")
|
||||
|
||||
# ----------------------------
|
||||
# Save LoRA adapter
|
||||
# ----------------------------
|
||||
print("Saving LoRA adapter and tokenizer...")
|
||||
trainer.model.save_pretrained("./qwen-uk-fr-lora")
|
||||
tokenizer.save_pretrained("./qwen-uk-fr-lora")
|
||||
print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")
|
||||
trainer.model.save_pretrained(OUTPUT_DIR)
|
||||
tokenizer.save_pretrained(OUTPUT_DIR)
|
||||
|
||||
print("=== Fine-tuning finished ===")
|
||||
print("LoRA adapter saved in ./qwen-uk-fr-lora")
|
||||
print("\n=== Fine-tuning finished ===")
|
||||
print(f"LoRA adapter saved in: {OUTPUT_DIR}")
|
||||
|
||||
68
Finetunning/mergeLora.py
Normal file
68
Finetunning/mergeLora.py
Normal file
@ -0,0 +1,68 @@
|
||||
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# ----------------------------
# Configuration
# ----------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
LORA_DIR = "./qwen2.5-7b-uk-fr-lora"      # adapter directory produced by fine-tuning
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-merged"  # final merged model

DTYPE = torch.float16  # GGUF-friendly
DEVICE = "cpu"         # merging on CPU is slower but stable and safe

print("=== LoRA merge script started ===")

# ----------------------------
# [1/4] Base model
# ----------------------------
print(f"{80 * '_'}\n[1/4] Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=DTYPE,
    device_map=DEVICE,
    trust_remote_code=True,
)
print("Base model loaded.")

# ----------------------------
# [2/4] Tokenizer
# ----------------------------
print(f"{80 * '_'}\n[2/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # make sure a padding token exists
print("Tokenizer loaded.")

# ----------------------------
# [3/4] LoRA adapter
# ----------------------------
print(f"{80 * '_'}\n[3/4] Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, LORA_DIR)
print("LoRA adapter loaded.")

# ----------------------------
# [4/4] Fold the adapter weights into the base model
# ----------------------------
print(f"{80 * '_'}\n[4/4] Merging LoRA into base model...")
model = model.merge_and_unload()
print("LoRA successfully merged.")

# Persist the merged model and tokenizer side by side so the output
# directory is directly consumable by GGUF conversion.
print("Saving merged model...")
model.save_pretrained(OUTPUT_DIR, safe_serialization=True)
tokenizer.save_pretrained(OUTPUT_DIR)

print("=== Merge completed successfully ===")
print(f"Merged model saved in: {OUTPUT_DIR}")
|
||||
@ -11,13 +11,11 @@
|
||||
{"text": "Як на мене, то наразі помовчу.", "translation": "En ce qui me concerne, je n’ai pour le moment rien à dire."}
|
||||
{"text": "Мій дядько вчора помер від раку шлунку.", "translation": "Mon oncle est mort hier d’un cancer à l’estomac."}
|
||||
{"text": "Я не знаю, що ще можна зробити.", "translation": "Je ne sais plus quoi faire."}
|
||||
{"text": "Я навчився жити без неї.", "translation": "J’ai appris à vivre sans elle."}
|
||||
{"text": "Я навчився жити без неї.", "translation": "J'ai appris à vivre sans elle."}
|
||||
{"text": "Справді?", "translation": "Vraiment ?"}
|
||||
{"text": "Справді?", "translation": "C'est vrai ?"}
|
||||
{"text": "Справді?", "translation": "Vrai ?"}
|
||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J’ai toujours préféré les personnages mystérieux."}
|
||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J'ai toujours préféré les personnages mystérieux."}
|
||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J'ai toujours plus apprécié les personnages mystérieux."}
|
||||
{"text": "Тобі краще поспати.", "translation": "Tu ferais mieux de dormir."}
|
||||
{"text": "Обдумай це.", "translation": "Penses-y."}
|
||||
@ -69,7 +67,6 @@
|
||||
{"text": "Ця традиція зародилася в Китаї.", "translation": "Cette tradition est née en Chine."}
|
||||
{"text": "У Японії є дипломатичні стосунки з Китаєм.", "translation": "Le Japon a des relations diplomatiques avec la Chine."}
|
||||
{"text": "Він повернувся з Китаю.", "translation": "Il est revenu de Chine."}
|
||||
{"text": "Він повернувся з Китаю.", "translation": "Il est reparti de Chine."}
|
||||
{"text": "Він повернувся з Китаю.", "translation": "Il revint de Chine."}
|
||||
{"text": "Він повернувся з Китаю.", "translation": "Il est rentré de Chine."}
|
||||
{"text": "Він написав книжку про порцеляну.", "translation": "Il a écrit un livre sur la porcelaine."}
|
||||
@ -125,7 +122,6 @@
|
||||
{"text": "Я кохаю тебе.", "translation": "Je t'aime !"}
|
||||
{"text": "З днем народження!", "translation": "Bon anniversaire !"}
|
||||
{"text": "З днем народження!", "translation": "Joyeux anniversaire !"}
|
||||
{"text": "З днем народження!", "translation": "Joyeux anniversaire."}
|
||||
{"text": "Кожному своє.", "translation": "À chacun son goût."}
|
||||
{"text": "Кожному своє.", "translation": "Chacun son truc."}
|
||||
{"text": "Скільки це коштує?", "translation": "Ça coûte combien ?"}
|
||||
@ -171,7 +167,6 @@
|
||||
{"text": "В інтернеті мало сайтів татарською мовою.", "translation": "Il y a peu de sites en langue Tatar sur Internet."}
|
||||
{"text": "Удачі на іспиті!", "translation": "Bonne chance pour ton examen !"}
|
||||
{"text": "Де ти живеш?", "translation": "Où habites-tu ?"}
|
||||
{"text": "Де ти живеш?", "translation": "Où résides-tu ?"}
|
||||
{"text": "Де ти живеш?", "translation": "Tu habites où ?"}
|
||||
{"text": "Де ти живеш?", "translation": "Où demeures-tu ?"}
|
||||
{"text": "Де ти живеш?", "translation": "Où vis-tu ?"}
|
||||
@ -2,7 +2,7 @@ import json
|
||||
from collections import defaultdict
|
||||
|
||||
# Chemin vers ton fichier d'entrée et de sortie
|
||||
input_file = "Paires de phrases en ukrainien-français - 2026-01-06.tsv" # Remplace par ton chemin
|
||||
input_file = "Paires-de-phrases-en-ukrainien-francais-2026-01-06.tsv" # Remplace par ton chemin
|
||||
output_file = "paires.json" # Fichier de sortie
|
||||
|
||||
# Dictionnaire pour stocker les paires uniques (clé = phrase ukrainienne, valeur = liste de traductions)
|
||||
|
||||
30
Finetunning/validation.jsonl
Normal file
30
Finetunning/validation.jsonl
Normal file
@ -0,0 +1,30 @@
|
||||
{"text": "Як би ти не намагався, ти не вивчиш англійську за два-три місяці.", "translation": "Quels que soient tes efforts, tu ne pourras pas apprendre l’anglais en deux-trois mois."}
|
||||
{"text": "Поки я не подзвонив, він не прийшов.", "translation": "Il n’est pas venu avant que je ne l’appelle."}
|
||||
{"text": "У всесвіті багато галактик.", "translation": "Il y a beaucoup de galaxies dans l'univers."}
|
||||
{"text": "Вона приймає душ щоранку.", "translation": "Elle prend une douche chaque matin."}
|
||||
{"text": "У Майка є декілька друзів у Флориді.", "translation": "Mike a quelques amis en Floride."}
|
||||
{"text": "Я зустрінуся з тобою в неділю о третій.", "translation": "On se voit dimanche à trois heures."}
|
||||
{"text": "Я сказав собі: «Це гарна ідея».", "translation": "Je me suis dit : « C’est une bonne idée. »"}
|
||||
{"text": "Ми збиралися пробути там біля двох тижнів.", "translation": "Nous avions l’intention de rester là près de deux semaines."}
|
||||
{"text": "Я чищу зуби двічі на день.", "translation": "Je me brosse les dents deux fois par jour."}
|
||||
{"text": "Він ніжно поклав руку на її плече.", "translation": "Il posa la main gentiment sur son épaule."}
|
||||
{"text": "Сьогодні жахливо холодно.", "translation": "Il fait horriblement froid aujourd'hui."}
|
||||
{"text": "У цю суму включено податки.", "translation": "Cette somme inclut les taxes."}
|
||||
{"text": "Ця школа була заснована в 1650 році.", "translation": "Cette école fut fondée en 1650."}
|
||||
{"text": "Я випадково знайшов цей ресторан.", "translation": "J'ai trouvé ce restaurant par hasard."}
|
||||
{"text": "Я не хотів нікого образити.", "translation": "Je ne voulais vexer personne."}
|
||||
{"text": "Цей сад найкраще виглядає весною.", "translation": "Ce parc est plus joli au printemps."}
|
||||
{"text": "Цей сир виготовлено з овечого молока.", "translation": "Ce fromage est fait avec du lait de brebis."}
|
||||
{"text": "Він спить як немовля.", "translation": "Il dort comme un bébé."}
|
||||
{"text": "Гора вкрита снігом.", "translation": "La montagne est recouverte de neige."}
|
||||
{"text": "Я попав під дощ і промок.", "translation": "J’ai été pris sous la pluie, et suis tout trempé."}
|
||||
{"text": "Прошу, дайте мені ще один шанс.", "translation": "Je vous en prie, donnez-moi encore une chance."}
|
||||
{"text": "Я все сказав.", "translation": "J’ai tout dit."}
|
||||
{"text": "Не забувай нас!", "translation": "Ne nous oublie pas !"}
|
||||
{"text": "Випало багато снігу.", "translation": "Beaucoup de neige est tombée."}
|
||||
{"text": "Йде сніг.", "translation": "Il est en train de neiger."}
|
||||
{"text": "Може піти сніг.", "translation": "Il neigera peut-être."}
|
||||
{"text": "У нас у січні йде сніг.", "translation": "Chez nous, il neige en janvier."}
|
||||
{"text": "Сніг розтав.", "translation": "La neige a fondu."}
|
||||
{"text": "Наша компанія планує побудувати новий хімічний завод у Росії.", "translation": "Notre entreprise a le projet de construire une nouvelle usine chimique en Russie."}
|
||||
{"text": "Франція воювала з Росією.", "translation": "La France fut en guerre avec la Russie."}
|
||||
170
Finetunning/validation.py
Normal file
170
Finetunning/validation.py
Normal file
@ -0,0 +1,170 @@
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
|
||||
from peft import PeftModel
|
||||
from datasets import load_dataset
|
||||
from nltk.translate.bleu_score import corpus_bleu
|
||||
|
||||
# ----------------------------
# Configuration
# ----------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"  # base model
LORA_DIR = "./qwen2.5-7b-uk-fr-lora"     # fine-tuned LoRA
VALIDATION_FILE = "validation.jsonl"     # small validation subset
MAX_INPUT_LENGTH = 1024
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Candidate prompt templates to benchmark; each template must contain a
# single "{text}" placeholder for the Ukrainian source sentence.
PROMPTS_TO_TEST = [
    {
        "name": "Prompt de base",
        "prompt": "Traduis la phrase ukrainienne suivante en français: {text}",
    },
    {
        "name": "Prompt spécialisé mémoires",
        "prompt": (
            "Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.\n"
            "- Garde le style narratif et les tournures orales de l'auteur.\n"
            "- Respecte les règles de traduction suivantes :\n\n"
            "Règles strictes :\n"
            "1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).\n"
            "2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.\n\n"
            "Voici la phrase à traduire :\nUkrainien : {text}\nFrançais :"
        ),
    },
    {
        "name": "Prompt détaillé",
        "prompt": (
            "Tu es un expert en traduction littéraire spécialisé dans les textes historiques ukrainiens.\n"
            "Règles à suivre absolument :\n"
            "1. Conserve tous les noms propres et toponymes dans leur forme originale\n"
            "2. Préserve le style et le registre de l'auteur original\n"
            "3. Ajoute des notes entre crochets pour expliquer les références culturelles si nécessaire\n"
            "4. Traduis de manière naturelle en français tout en restant fidèle au texte source\n\n"
            "Texte à traduire :\nUkrainien : {text}\nTraduction française :"
        ),
    },
    {
        "name": "Prompt minimaliste",
        "prompt": "Traduction fidèle de l'ukrainien vers le français : {text}",
    },
]
|
||||
|
||||
print("=== Loading tokenizer and model ===")

# ----------------------------
# Tokenizer
# ----------------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token      # reuse EOS as the padding token
tokenizer.model_max_length = MAX_INPUT_LENGTH  # cap encoded sequence length

# ----------------------------
# Base model, loaded straight onto the GPU
# ----------------------------
print(f"{80 * '_'}\nLoading base model on GPU...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map={"": 0},  # place every module on GPU 0
    trust_remote_code=True,
)

# ----------------------------
# Attach the fine-tuned LoRA adapter
# ----------------------------
print(f"{80 * '_'}\nApplying LoRA adapter...")
model = PeftModel.from_pretrained(base_model, LORA_DIR)
model.eval()
model.to(DEVICE)  # make sure every parameter lives on the target device
print("Model ready for validation.")

# ----------------------------
# Validation dataset
# ----------------------------
print(f"{80 * '_'}\nLoading validation dataset...")
dataset = load_dataset("json", data_files=VALIDATION_FILE)
examples = dataset["train"]
print(f"{len(examples)} examples loaded for testing.")
|
||||
|
||||
# ----------------------------
|
||||
# Translation function
|
||||
# ----------------------------
|
||||
@torch.inference_mode()
def translate(text, prompt_template):
    """Generate a French translation of *text* using *prompt_template*.

    The template must contain a ``{text}`` placeholder.  Decoding is greedy
    (no sampling) and capped at 256 new tokens.  The translation is the part
    of the decoded output following the translation marker; when no marker is
    present we fall back to splitting on the source text itself.
    """
    filled_prompt = prompt_template.format(text=text)
    encoded = tokenizer(
        filled_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
    ).to(DEVICE)

    # An explicit GenerationConfig avoids deprecation warnings from generate().
    gen_cfg = GenerationConfig.from_model_config(model.config)
    gen_cfg.max_new_tokens = 256
    gen_cfg.do_sample = False

    generated = model.generate(**encoded, generation_config=gen_cfg)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Extract only the translated portion of the decoded text.
    for marker in ("Français :", "Traduction française :"):
        if marker in decoded:
            return decoded.split(marker)[-1].strip()
    return decoded.split(text)[-1].strip()
|
||||
|
||||
# ----------------------------
# Evaluate all prompts and select best BLEU
# ----------------------------
best_bleu = 0.0
best_prompt = None
all_results = {}

print(f"{80 * '_'}\nTesting all prompts and computing BLEU scores...")

for prompt_config in PROMPTS_TO_TEST:
    print(f"\n{80 * '='}\nTesting prompt: {prompt_config['name']}\n{80 * '='}")
    references = []
    hypotheses = []

    for i, example in enumerate(examples):
        src_text = example["text"]
        ref_text = example["translation"]
        pred_text = translate(src_text, prompt_config["prompt"])

        print(f"\n[{i+1}] Source: {src_text}")
        print(f" Reference: {ref_text}")
        print(f" Prediction: {pred_text}")

        # corpus_bleu expects a list of reference token lists per hypothesis.
        references.append([ref_text.split()])
        hypotheses.append(pred_text.split())

    bleu_score = corpus_bleu(references, hypotheses) * 100
    print(f"\n=== Corpus BLEU score for '{prompt_config['name']}': {bleu_score:.4f} ===")

    all_results[prompt_config["name"]] = bleu_score

    # "best_prompt is None or" guarantees best_prompt is set after the first
    # prompt even when every BLEU score is 0 — otherwise the report below
    # would crash with a TypeError on best_prompt['name'].
    if best_prompt is None or bleu_score > best_bleu:
        best_bleu = bleu_score
        best_prompt = prompt_config

# ----------------------------
# Display results
# ----------------------------
print(f"\n{80 * '='}\nFINAL RESULTS\n{80 * '='}")
for prompt_name, score in all_results.items():
    print(f"{prompt_name}: {score:.4f}")

print(f"\nBEST PROMPT: {best_prompt['name']} with BLEU score: {best_bleu:.4f}")
print(f"Prompt content:\n{best_prompt['prompt']}")
|
||||
70
README.md
70
README.md
@ -80,3 +80,73 @@ Vous pouvez modifier les paramètres suivants dans `main.py` :
|
||||
- `OUTPUT_PDF_PATH` : Chemin et nom du fichier PDF de sortie (généré automatiquement)
|
||||
|
||||
---
|
||||
|
||||
## Fine-tuning
|
||||
Le fine-tuning permet d'obtenir une meilleure traduction. C'est un processus long en temps de calcul, mais il permet une traduction plus précise.
|
||||
|
||||
Le principe est le suivant :
|
||||
|
||||
```
|
||||
1️⃣ Dataset d’entraînement (paires.json)
        ↓
2️⃣ Dataset nettoyé (cleanDataSet.py → paires_clean.json)
        ↓
3️⃣ Fine-tuning LoRA (finetuning.py)
        ↓
4️⃣ Validation / Évaluation BLEU (validation.py)
        ↓
5️⃣ Merge LoRA + modèle de base (mergeLora.py)
        ↓
6️⃣ Conversion en GGUF (llama.cpp : convert_hf_to_gguf.py)
        ↓
7️⃣ Ollama (inférence finale)
|
||||
|
||||
```
|
||||
|
||||
### Nettoyage du dataset
|
||||
Exécuter le script ```python cleanDataSet.py```
|
||||
|
||||
### Validation
|
||||
Exécuter le script ```python validation.py```
|
||||
|
||||
Le script teste plusieurs prompts et renvoie celui qui obtient le meilleur score BLEU.
|
||||
|
||||
Il faut ensuite copier ce prompt dans le fichier ModelFile.
|
||||
|
||||
### Merge
|
||||
Exécuter le script ```python mergeLora.py```
|
||||
|
||||
### Conversion en GGUF
|
||||
En étant à la racine du projet (et toujours dans le venv), cloner le projet llama.cpp
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Et lancer la commande (/!\ ça prend environ 10 minutes) :
|
||||
```bash
|
||||
python convert_hf_to_gguf.py ../Finetunning/qwen2.5-7b-uk-fr-merged --outfile qwen2.5-7b-uk-fr.gguf --outtype q8_0
|
||||
```
|
||||
|
||||
Vérification :
|
||||
```bash
|
||||
./main -m qwen2.5-7b-uk-fr.gguf -p "Translate into French: Привіт світ"
|
||||
```
|
||||
Pour que ce nouveau modèle soit exploitable par ollama, il faut TODO
|
||||
|
||||
## Utilisation du modèle fine-tunné pour la traduction
|
||||
Créer un Modelfile :
|
||||
```
|
||||
FROM ./qwen2.5-7b-uk-fr.gguf
|
||||
|
||||
PARAMETER temperature 0.1
|
||||
PARAMETER top_p 0.95
|
||||
PARAMETER num_ctx 4096
|
||||
|
||||
SYSTEM """
|
||||
You are a professional Ukrainian to French translator.
|
||||
Produce faithful, literal translations.
|
||||
"""
|
||||
|
||||
```
|
||||
@ -3,14 +3,15 @@ PARAMETER temperature 0.2
|
||||
PARAMETER num_ctx 8192
|
||||
|
||||
SYSTEM """
|
||||
|
||||
Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.
|
||||
- Utilise le glossaire fourni pour les noms de lieux et termes historiques.
|
||||
- Garde le style narratif et les tournures orales de l'auteur.
|
||||
- Respecte les règles de traduction suivantes :
|
||||
Règles strictes :
|
||||
1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).
|
||||
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l’auteur.
|
||||
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.
|
||||
3. **Pour les termes historiques** (ex. : "powiat"), utilise le terme français standard et ajoute une note explicative.
|
||||
4. **Conserve les citations** russe/allemand/polonais intégrés au texte (mais ajoute une note de fin de paragraphe entre [ ] en la traduisant et en précisant la langue d'origine.
|
||||
5. **Structure** : Garde les sauts de ligne et la mise en page originale.
|
||||
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique existe (ex. : "[Note : le context]").
|
||||
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique exist.
|
||||
"""
|
||||
1674
Traduction/TaniaBorecMemoir(Ukr) (FR) V1.pdf
Normal file
1674
Traduction/TaniaBorecMemoir(Ukr) (FR) V1.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1169
Traduction/TaniaBorecMemoir(Ukr) (FR).txt
Normal file
1169
Traduction/TaniaBorecMemoir(Ukr) (FR).txt
Normal file
File diff suppressed because it is too large
Load Diff
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V1.pdf
Normal file
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V1.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V2.pdf
Normal file
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V2.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1623
Traduction/TaniaBorecMemoir(Ukr) (FR)_V3.pdf
Normal file
1623
Traduction/TaniaBorecMemoir(Ukr) (FR)_V3.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1032
Traduction/TaniaBorecMemoir(Ukr)(FR)_V1.txt
Normal file
1032
Traduction/TaniaBorecMemoir(Ukr)(FR)_V1.txt
Normal file
File diff suppressed because it is too large
Load Diff
1038
Traduction/TaniaBorecMemoir(Ukr)(FR)_V2.txt
Normal file
1038
Traduction/TaniaBorecMemoir(Ukr)(FR)_V2.txt
Normal file
File diff suppressed because it is too large
Load Diff
1030
Traduction/TaniaBorecMemoir(Ukr)(FR)_V3.txt
Normal file
1030
Traduction/TaniaBorecMemoir(Ukr)(FR)_V3.txt
Normal file
File diff suppressed because it is too large
Load Diff
1664
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.pdf
Normal file
1664
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1119
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.txt
Normal file
1119
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.txt
Normal file
File diff suppressed because it is too large
Load Diff
1628
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.pdf
Normal file
1628
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1107
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.txt
Normal file
1107
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.txt
Normal file
File diff suppressed because it is too large
Load Diff
1704
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.pdf
Normal file
1704
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1209
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.txt
Normal file
1209
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.txt
Normal file
File diff suppressed because it is too large
Load Diff
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.pdf
Normal file
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.txt
Normal file
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.txt
Normal file
File diff suppressed because it is too large
Load Diff
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.pdf
Normal file
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.txt
Normal file
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -3,23 +3,23 @@ import requests
|
||||
import json
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.lib.units import inch
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Flowable
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||
from reportlab.lib.enums import TA_JUSTIFY
|
||||
from reportlab.pdfbase import pdfmetrics
|
||||
from reportlab.pdfbase.ttfonts import TTFont
|
||||
import os
|
||||
import os, time
|
||||
|
||||
# Configuration
|
||||
DEBUG = True
|
||||
PDF_PATH = "Traduction\TaniaBorecMemoir(Ukr).pdf"
|
||||
PDF_PATH = "Traduction/TaniaBorecMemoir(Ukr).pdf"
|
||||
OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
TARGET_LANGUAGE = "français"
|
||||
CHECKPOINT_FILE = "Traduction\checkpoint.json"
|
||||
TEMP_OUTPUT_TXT = "Traduction\output_temp.txt"
|
||||
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V8.pdf")
|
||||
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V8.txt")
|
||||
CHECKPOINT_FILE = "Traduction/checkpoint.json"
|
||||
TEMP_OUTPUT_TXT = "Traduction/output_temp.txt"
|
||||
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.pdf")
|
||||
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.txt")
|
||||
|
||||
DEBUG = True
|
||||
|
||||
@ -341,6 +341,7 @@ def main():
|
||||
print(f"Batches manquants détectés : {missing_batches}")
|
||||
|
||||
# Traduction des paragraphes manquants
|
||||
temps_cumule = 0.0
|
||||
for i in missing_batches:
|
||||
batch = paragraphs[i:i + batch_size]
|
||||
paragraph_cumul = "\n".join(batch)
|
||||
@ -348,13 +349,24 @@ def main():
|
||||
print(f"{15 * '-'} Traduction des paragraphes manquants {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
||||
|
||||
try:
|
||||
debut_chrono = time.time()
|
||||
result = send_to_ollama(paragraph_cumul)
|
||||
fin_chrono = time.time()
|
||||
temps_paragraphe = fin_chrono - debut_chrono
|
||||
temps_cumule += temps_paragraphe
|
||||
|
||||
# Conversion en minutes et secondes
|
||||
minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
|
||||
minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
|
||||
|
||||
print(f"{result}")
|
||||
results[str(i)] = result
|
||||
save_checkpoint(len(paragraphs), results) # Met à jour le dernier indice du batch
|
||||
save_temp_results(results)
|
||||
except Exception as e:
|
||||
print(f"Erreur lors de la traduction du paragraphe {i}: {e}")
|
||||
print(f" Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
|
||||
print(f" Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")
|
||||
|
||||
# Traitement des paragraphes suivants
|
||||
for i in range(last_index + 1, len(paragraphs), batch_size):
|
||||
@ -364,7 +376,16 @@ def main():
|
||||
print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
||||
|
||||
try:
|
||||
debut_chrono = time.time()
|
||||
result = send_to_ollama(paragraph_cumul)
|
||||
fin_chrono = time.time()
|
||||
temps_paragraphe = fin_chrono - debut_chrono
|
||||
temps_cumule += temps_paragraphe
|
||||
|
||||
# Conversion en minutes et secondes
|
||||
minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
|
||||
minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
|
||||
|
||||
print(f"{result}")
|
||||
results[str(i)] = result
|
||||
save_checkpoint(i + batch_size - 1, results)
|
||||
@ -372,6 +393,9 @@ def main():
|
||||
except Exception as e:
|
||||
print(f"Erreur : {e}")
|
||||
continue
|
||||
print(f" Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
|
||||
print(f" Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")
|
||||
|
||||
|
||||
save_temp_results(results)
|
||||
create_pdf_from_results(results, FINAL_OUTPUT_PDF)
|
||||
|
||||
@ -14,3 +14,4 @@ peft
|
||||
bitsandbytes
|
||||
accelerate
|
||||
trl
|
||||
nltk
|
||||
2
run.bat
2
run.bat
@ -17,6 +17,7 @@ REM Activer l'environnement virtuel Python
|
||||
call %VENV_PATH%\Scripts\activate.bat
|
||||
|
||||
REM Lancer la compilation du modèle LLM pour Ollama
|
||||
echo Compilation du modèle LLM pour Ollama
|
||||
ollama create traductionUkrainienVersFrancais -f .\Traduction\Modelfile
|
||||
|
||||
:: 1. Vérifie si le processus ollama.exe est en cours d'exécution
|
||||
@ -39,6 +40,7 @@ if %ERRORLEVEL% neq 0 (
|
||||
)
|
||||
|
||||
REM Exécuter le script principal
|
||||
echo Lancement du script principal de traduction
|
||||
python %MAIN_SCRIPT_PATH%
|
||||
|
||||
endlocal
|
||||
Loading…
x
Reference in New Issue
Block a user