3 changed files with 185 additions and 80 deletions
@@ -0,0 +1,152 @@
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
from trl import SFTTrainer
import os
os.environ["TORCHDYNAMO_DISABLE"] = "1"
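# TORCHDYNAMO_DISABLE=1 turns off TorchDynamo/torch.compile graph capture;
# presumably set here to sidestep compile issues with the quantized + PEFT stack.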

# ----------------------------
# Model configuration
# ----------------------------
MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct"

print("=== Starting fine-tuning script ===")

print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

# Ensure padding token is defined
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = 1024
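# Reusing the EOS token as padding is the usual fallback for causal LMs without a
# dedicated pad token; 1024 is meant to cap per-example sequence length (and memory),
# so raise it if the translation pairs run longer.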

print("Tokenizer loaded and configured.")

print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
# Newer transformers versions prefer 4-bit loading via a BitsAndBytesConfig passed
# as quantization_config (the bare load_in_4bit kwarg is deprecated).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # fp16 compute for the 4-bit matmuls
)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,  # OK for weights
    trust_remote_code=True,
)
print("Model loaded.")

print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
model = prepare_model_for_kbit_training(model)
print("Model prepared for k-bit training.")

# ----------------------------
# LoRA configuration
# ----------------------------
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("LoRA adapters attached to the model.")

# ----------------------------
# Dataset loading
# ----------------------------
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
dataset = load_dataset(
    "json",
    data_files="traductions.json"
)
print(f"Dataset loaded with {len(dataset['train'])} samples.")

print("Formatting dataset for Ukrainian → French translation...")

def format_prompt(example):
    prompt = (
        "Translate the following Ukrainian text into French.\n\n"
        f"Ukrainian: {example['text']}\n"
        f"French: {example['translation']}"
    )
    return {"text": prompt}

dataset = dataset.map(format_prompt, remove_columns=dataset["train"].column_names)
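# After map() with remove_columns, only the new "text" column remains, which is the
# field SFTTrainer reads by default. Depending on the installed trl version you may
# also want to append tokenizer.eos_token to each prompt so the model learns to stop.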
print("Dataset formatting completed.")

# ----------------------------
# Training arguments
# ----------------------------
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
training_args = TrainingArguments(
    output_dir="./qwen-uk-fr-lora",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    num_train_epochs=3,

    fp16=False,
    bf16=False,

    logging_steps=10,
    save_steps=500,
    save_total_limit=2,

    # Use 32-bit optimizer
    optim="paged_adamw_32bit",

    report_to="none",
)

print("Training arguments ready.")

# ----------------------------
# Trainer
# ----------------------------
print("Initializing SFTTrainer...")
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    processing_class=tokenizer,
    args=training_args,
)
print("Trainer initialized.")

# ----------------------------
# Train
# ----------------------------
print(f"{80 * '_'}\n[7/7] Starting training...")
trainer.train()
print("Training completed successfully.")

# ----------------------------
# Save LoRA adapter
# ----------------------------
print("Saving LoRA adapter and tokenizer...")
trainer.model.save_pretrained("./qwen-uk-fr-lora")
tokenizer.save_pretrained("./qwen-uk-fr-lora")
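# Only the LoRA adapter weights and config are written here, not the full 14B model;
# at inference the adapter is loaded on top of the base model (e.g. via PEFT) or
# merged into it.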

print("=== Fine-tuning finished ===")
print("LoRA adapter saved in ./qwen-uk-fr-lora")
@@ -1,8 +1,16 @@
certifi==2026.1.4
charset-normalizer==3.4.4
idna==3.11
pillow==12.1.0
PyPDF2==3.0.1
reportlab==4.4.7
requests==2.32.5
urllib3==2.6.2
certifi
charset-normalizer
idna
pillow
PyPDF2
reportlab
requests
urllib3

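# QLoRA fine-tuning stack for the translation script: PyTorch, Hugging Face model,
# dataset, PEFT and TRL libraries, plus bitsandbytes for 4-bit quantization.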
torch
transformers
datasets
peft
bitsandbytes
accelerate
trl