Compare commits

..

6 Commits

Author SHA1 Message Date
Alex
dc66ac9520 utilisation du LLM translategemma:12b 2026-02-11 17:29:59 +01:00
Alex
8b45028101 Suppression des fichiers PDF et TXT dans le répertoire Traduction 2026-02-11 17:11:46 +01:00
Alex
ef4515adcc remove pdf files 2026-02-11 17:07:57 +01:00
Alex
7aea840821 optimisation pour traduction plus fluide 2026-02-11 17:06:07 +01:00
Alex
71e595a966 fine tuning 2026-02-11 17:02:29 +01:00
Alex
d5313fb143 version fonctionnelle 2026-01-15 19:11:55 +01:00
27 changed files with 9411 additions and 25160 deletions

4
.gitignore vendored
View File

@@ -1,2 +1,6 @@
output_temp.txt
checkpoint.json
Traduction/Modelfile
.env
Traduction/*.pdf
Traduction/*.txt

View File

@@ -15,9 +15,12 @@ from peft import (
from trl import SFTTrainer
# ----------------------------
# Environment safety (Windows)
# Environment safety (Windows + AMP fixes)
# ----------------------------
os.environ["TORCHDYNAMO_DISABLE"] = "1"
os.environ["ACCELERATE_MIXED_PRECISION"] = "no" # ✅ disable AMP completely
os.environ["TORCH_AMP_DISABLE"] = "1" # ✅ disable GradScaler
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # optional: force first GPU
# ----------------------------
# Global configuration
@@ -25,7 +28,7 @@ os.environ["TORCHDYNAMO_DISABLE"] = "1"
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
DATA_FILE = "paires_clean.json"
MAX_SEQ_LENGTH = 1024
MAX_SEQ_LENGTH = 512 # Reduce for RTX 4080 SUPER VRAM
print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
@@ -35,54 +38,50 @@ print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
MODEL_NAME,
trust_remote_code=True
trust_remote_code=True,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = MAX_SEQ_LENGTH
print("Tokenizer loaded.")
print(f"Pad token id: {tokenizer.pad_token_id}")
print(f"Max sequence length: {tokenizer.model_max_length}")
# ----------------------------
# [2/7] Quantization config (QLoRA)
# [2/7] Load model in 4-bit (QLoRA)
# ----------------------------
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (optimized QLoRA)...")
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
assert torch.cuda.is_available(), "CUDA GPU not detected!"
print(f"Using GPU: {torch.cuda.get_device_name(0)}")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_compute_dtype=torch.float16, # fp16 internally
bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
device_map="cuda", # 🔥 SAFE
device_map="auto",
quantization_config=bnb_config,
low_cpu_mem_usage=True,
trust_remote_code=True,
)
# Align model tokens with tokenizer
model.config.pad_token_id = tokenizer.pad_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
print("Model loaded successfully in 4-bit mode on GPU.")
# ----------------------------
# [3/7] Prepare model for k-bit training
# ----------------------------
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable(
gradient_checkpointing_kwargs={"use_reentrant": False}
)
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
model.config.use_cache = False # Important with gradient checkpointing + QLoRA
print("Model prepared for k-bit training.")
print("Gradient checkpointing enabled (non-reentrant).")
# ----------------------------
# [4/7] LoRA configuration
@@ -104,10 +103,8 @@ lora_config = LoraConfig(
"down_proj",
],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("LoRA adapters successfully attached.")
# ----------------------------
@@ -115,18 +112,19 @@ print("LoRA adapters successfully attached.")
# ----------------------------
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
dataset = load_dataset("json", data_files=DATA_FILE)
print(f"Dataset loaded with {len(dataset['train'])} samples.")
print("Formatting dataset for Ukrainian → French translation...")
def format_prompt(example):
return {
"text": ("<|user|>\n"
"text": (
"<|im_start|>user\n"
"Translate the following Ukrainian text into French.\n"
f"Ukrainian: {example['text']}\n"
"<|assistant|>\n"
"<|im_end|>\n"
"<|im_start|>assistant\n"
f"{example['translation']}"
"<|im_end|>"
)
}
@@ -134,7 +132,6 @@ dataset = dataset.map(
format_prompt,
remove_columns=dataset["train"].column_names
)
print("Dataset formatting completed.")
print("Example prompt:\n")
print(dataset["train"][0]["text"])
@@ -146,46 +143,49 @@ print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
training_args = TrainingArguments(
output_dir=OUTPUT_DIR,
per_device_train_batch_size=1,
gradient_accumulation_steps=8,
gradient_accumulation_steps=16,
learning_rate=1e-4,
num_train_epochs=3,
fp16=False,
bf16=False,
num_train_epochs=2,
max_steps=1000,
fp16=False, # ⚠ disable AMP
bf16=False, # ⚠ disable BF16
optim="paged_adamw_32bit",
logging_steps=10,
save_steps=500,
save_total_limit=2,
report_to="none",
dataloader_pin_memory=False,
)
dataloader_pin_memory=False,
max_grad_norm=0.0, # avoid AMP gradient clipping
)
print("Training arguments ready.")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Effective batch size: "
f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}"
)
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
# ----------------------------
# Trainer
# [7/7] Trainer
# ----------------------------
print("Initializing SFTTrainer...")
print(f"{80 * '_'}\nInitializing SFTTrainer...")
trainer = SFTTrainer(
model=model,
train_dataset=dataset["train"],
processing_class=tokenizer,
args=training_args,
)
print("Trainer initialized.")
# ----------------------------
# [7/7] Training
# Training
# ----------------------------
print(f"{80 * '_'}\n[7/7] Starting training...")
checkpoint_exists = any(
print(f"{80 * '_'}\nStarting training...")
checkpoint_exists = False
if os.path.exists(OUTPUT_DIR):
checkpoint_exists = any(
d.startswith("checkpoint-")
for d in os.listdir(OUTPUT_DIR)
) if os.path.exists(OUTPUT_DIR) else False
)
if checkpoint_exists:
print("Checkpoint found → resuming training")
@@ -194,7 +194,6 @@ else:
print("No checkpoint found → starting fresh training")
train_output = trainer.train()
print("\n=== Training summary ===")
print(f"Global steps: {train_output.global_step}")
print(f"Training loss: {train_output.training_loss}")
@@ -202,11 +201,10 @@ print(f"Metrics: {train_output.metrics}")
print("Training completed successfully.")
# ----------------------------
# Save LoRA adapter
# Save LoRA adapter and tokenizer
# ----------------------------
print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")
trainer.model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("\n=== Fine-tuning finished ===")
print(f"LoRA adapter saved in: {OUTPUT_DIR}")

File diff suppressed because it is too large Load Diff

View File

@@ -8,7 +8,7 @@ from nltk.translate.bleu_score import corpus_bleu
# Configuration
# ----------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct" # base model
LORA_DIR = "./qwen2.5-7b-uk-fr-lora" # fine-tuned LoRA
LORA_DIR = "./qwen2.5-7b-uk-fr-lora-2epoch" # fine-tuned LoRA
VALIDATION_FILE = "validation.jsonl" # small validation subset
MAX_INPUT_LENGTH = 1024
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

View File

@@ -27,7 +27,7 @@ pip install -r requirements.txt
Puis faire :
```bash
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
```
3. **Placer votre fichier PDF** dans le répertoire `Traduction` du projet avec le nom configuré dans `main.py` (par défaut : `TaniaBorecMemoir(Ukr).pdf`)
@@ -82,6 +82,8 @@ Vous pouvez modifier les paramètres suivants dans `main.py` :
---
## Fine-tuning
/!\ Expérimental !!!
Le fine-tuning permet d'avoir une meilleure traduction. C'est un processus long en temps de calcul, mais il permet une traduction plus précise.
Le principe est le suivant :

View File

@@ -1,17 +1,19 @@
FROM qwen2.5:14b
FROM translategemma:12b
PARAMETER temperature 0.2
PARAMETER num_ctx 8192
SYSTEM """
You are a professional Ukrainian (uk) to french (fr) translator. Your goal is to accurately convey the meaning and nuances of the original Ukrainian text while adhering to french grammar, vocabulary, and cultural sensitivities.
Produce only the french translation, without any additional explanations or commentary. Please translate the following Ukrainian text into french:
{TEXT}
Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.
- Garde le style narratif et les tournures orales de l'auteur.
- Respecte les règles de traduction suivantes :
Règles strictes :
1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.
3. **Pour les termes historiques** (ex. : "powiat"), utilise le terme français standard et ajoute une note explicative.
4. **Conserve les citations** en russe/allemand/polonais intégrées au texte (mais ajoute une note de fin de paragraphe entre [ ] en les traduisant et en précisant la langue d'origine).
5. **Structure** : Garde les sauts de ligne et la mise en page originale.
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique exist.
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique existe (ex. : "[Note : le context]").
"""

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@@ -11,15 +11,14 @@ from reportlab.pdfbase.ttfonts import TTFont
import os, time
# Configuration
DEBUG = True
PDF_PATH = "Traduction/TaniaBorecMemoir(Ukr).pdf"
OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
OLLAMA_URL = "http://localhost:11434/api/generate"
TARGET_LANGUAGE = "français"
CHECKPOINT_FILE = "Traduction/checkpoint.json"
TEMP_OUTPUT_TXT = "Traduction/output_temp.txt"
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.pdf")
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.txt")
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V10.pdf")
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V10.txt")
DEBUG = True
@@ -174,7 +173,6 @@ def load_checkpoint():
return json.load(f)
return {"last_processed_index": -1, "results": {}}
# Sauvegarde le checkpoint
# Sauvegarde le checkpoint
def save_checkpoint(last_index, results):
# Trier les clés du dictionnaire results

1
llama.cpp Submodule

Submodule llama.cpp added at e463bbdf65