Compare commits

..

6 Commits

Author SHA1 Message Date
Alex
dc66ac9520 use the translategemma:12b LLM 2026-02-11 17:29:59 +01:00
Alex
8b45028101 Remove the PDF and TXT files in the Traduction directory 2026-02-11 17:11:46 +01:00
Alex
ef4515adcc remove pdf files 2026-02-11 17:07:57 +01:00
Alex
7aea840821 optimization for a smoother translation 2026-02-11 17:06:07 +01:00
Alex
71e595a966 fine-tuning 2026-02-11 17:02:29 +01:00
Alex
d5313fb143 working version 2026-01-15 19:11:55 +01:00
27 changed files with 9411 additions and 25160 deletions

.gitignore

@@ -1,2 +1,6 @@
 output_temp.txt
 checkpoint.json
+Traduction/Modelfile
+.env
+Traduction/*.pdf
+Traduction/*.txt


@@ -15,9 +15,12 @@ from peft import (
 from trl import SFTTrainer
 
 # ----------------------------
-# Environment safety (Windows)
+# Environment safety (Windows + AMP fixes)
 # ----------------------------
 os.environ["TORCHDYNAMO_DISABLE"] = "1"
+os.environ["ACCELERATE_MIXED_PRECISION"] = "no"  # ✅ disable AMP completely
+os.environ["TORCH_AMP_DISABLE"] = "1"  # ✅ disable GradScaler
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # optional: force first GPU
 
 # ----------------------------
 # Global configuration
@@ -25,7 +28,7 @@ os.environ["TORCHDYNAMO_DISABLE"] = "1"
 MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
 OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
 DATA_FILE = "paires_clean.json"
-MAX_SEQ_LENGTH = 1024
+MAX_SEQ_LENGTH = 512  # Reduce for RTX 4080 SUPER VRAM
 
 print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
@@ -35,54 +38,50 @@ print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
 print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_NAME,
-    trust_remote_code=True
+    trust_remote_code=True,
 )
 tokenizer.pad_token = tokenizer.eos_token
 tokenizer.model_max_length = MAX_SEQ_LENGTH
 print("Tokenizer loaded.")
 print(f"Pad token id: {tokenizer.pad_token_id}")
 print(f"Max sequence length: {tokenizer.model_max_length}")
 
 # ----------------------------
-# [2/7] Quantization config (QLoRA)
+# [2/7] Load model in 4-bit (QLoRA)
 # ----------------------------
-print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (optimized QLoRA)...")
+print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
 assert torch.cuda.is_available(), "CUDA GPU not detected!"
 print(f"Using GPU: {torch.cuda.get_device_name(0)}")
 
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_compute_dtype=torch.float16,  # fp16 internally
     bnb_4bit_use_double_quant=True,
 )
 
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
-    device_map="cuda",  # 🔥 SAFE
+    device_map="auto",
     quantization_config=bnb_config,
     low_cpu_mem_usage=True,
     trust_remote_code=True,
 )
+
+# Align model tokens with tokenizer
+model.config.pad_token_id = tokenizer.pad_token_id
+model.config.bos_token_id = tokenizer.bos_token_id
+model.config.eos_token_id = tokenizer.eos_token_id
+
 print("Model loaded successfully in 4-bit mode on GPU.")
 
 # ----------------------------
 # [3/7] Prepare model for k-bit training
 # ----------------------------
 print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
 model = prepare_model_for_kbit_training(model)
-model.gradient_checkpointing_enable(
-    gradient_checkpointing_kwargs={"use_reentrant": False}
-)
+model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
+model.config.use_cache = False  # Important with gradient checkpointing + QLoRA
 print("Model prepared for k-bit training.")
-print("Gradient checkpointing enabled (non-reentrant).")
 # ----------------------------
 # [4/7] LoRA configuration
@@ -104,10 +103,8 @@ lora_config = LoraConfig(
         "down_proj",
     ],
 )
 model = get_peft_model(model, lora_config)
 model.print_trainable_parameters()
 print("LoRA adapters successfully attached.")
 
 # ----------------------------
@@ -115,18 +112,19 @@ print("LoRA adapters successfully attached.")
 # ----------------------------
 print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
 dataset = load_dataset("json", data_files=DATA_FILE)
 print(f"Dataset loaded with {len(dataset['train'])} samples.")
 
 print("Formatting dataset for Ukrainian → French translation...")
 
 def format_prompt(example):
     return {
-        "text": ("<|user|>\n"
+        "text": (
+            "<|im_start|>user\n"
             "Translate the following Ukrainian text into French.\n"
             f"Ukrainian: {example['text']}\n"
-            "<|assistant|>\n"
+            "<|im_end|>\n"
+            "<|im_start|>assistant\n"
             f"{example['translation']}"
+            "<|im_end|>"
         )
     }
@@ -134,7 +132,6 @@ dataset = dataset.map(
     format_prompt,
     remove_columns=dataset["train"].column_names
 )
 print("Dataset formatting completed.")
 print("Example prompt:\n")
 print(dataset["train"][0]["text"])
@@ -146,46 +143,49 @@ print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
 training_args = TrainingArguments(
     output_dir=OUTPUT_DIR,
     per_device_train_batch_size=1,
-    gradient_accumulation_steps=8,
+    gradient_accumulation_steps=16,
     learning_rate=1e-4,
-    num_train_epochs=3,
-    fp16=False,
-    bf16=False,
+    num_train_epochs=2,
+    max_steps=1000,
+    fp16=False,  # ⚠ disable AMP
+    bf16=False,  # ⚠ disable BF16
     optim="paged_adamw_32bit",
     logging_steps=10,
     save_steps=500,
     save_total_limit=2,
     report_to="none",
     dataloader_pin_memory=False,
+    max_grad_norm=0.0,  # avoid AMP gradient clipping
 )
 
 print("Training arguments ready.")
 print(f"Output directory: {OUTPUT_DIR}")
 print(f"Epochs: {training_args.num_train_epochs}")
-print(f"Effective batch size: "
-      f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}"
-)
+print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
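As a sanity check on the printed value (a sketch; single-GPU assumption, matching the script's `CUDA_VISIBLE_DEVICES = "0"`):

```python
# Arithmetic behind the "Effective batch size" print above.
# Note: when num_train_epochs and max_steps are both set, the HF Trainer
# stops at max_steps, so max_steps=1000 caps this run.
per_device_train_batch_size = 1
gradient_accumulation_steps = 16
max_steps = 1000

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps
print(effective_batch_size)              # 16
print(effective_batch_size * max_steps)  # 16000 samples processed in total
```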
 # ----------------------------
-# Trainer
+# [7/7] Trainer
 # ----------------------------
-print("Initializing SFTTrainer...")
+print(f"{80 * '_'}\nInitializing SFTTrainer...")
 trainer = SFTTrainer(
     model=model,
     train_dataset=dataset["train"],
+    processing_class=tokenizer,
     args=training_args,
 )
 print("Trainer initialized.")
 
 # ----------------------------
-# [7/7] Training
+# Training
 # ----------------------------
-print(f"{80 * '_'}\n[7/7] Starting training...")
+print(f"{80 * '_'}\nStarting training...")
 
-checkpoint_exists = any(
-    d.startswith("checkpoint-")
-    for d in os.listdir(OUTPUT_DIR)
-) if os.path.exists(OUTPUT_DIR) else False
+checkpoint_exists = False
+if os.path.exists(OUTPUT_DIR):
+    checkpoint_exists = any(
+        d.startswith("checkpoint-")
+        for d in os.listdir(OUTPUT_DIR)
+    )
 
 if checkpoint_exists:
     print("Checkpoint found → resuming training")
@@ -194,7 +194,6 @@ else:
     print("No checkpoint found → starting fresh training")
 
 train_output = trainer.train()
 
 print("\n=== Training summary ===")
 print(f"Global steps: {train_output.global_step}")
 print(f"Training loss: {train_output.training_loss}")
@@ -202,11 +201,10 @@ print(f"Metrics: {train_output.metrics}")
 print("Training completed successfully.")
 
 # ----------------------------
-# Save LoRA adapter
+# Save LoRA adapter and tokenizer
 # ----------------------------
 print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")
 trainer.model.save_pretrained(OUTPUT_DIR)
 tokenizer.save_pretrained(OUTPUT_DIR)
 
 print("\n=== Fine-tuning finished ===")
 print(f"LoRA adapter saved in: {OUTPUT_DIR}")

File diff suppressed because it is too large


@@ -8,7 +8,7 @@ from nltk.translate.bleu_score import corpus_bleu
 # Configuration
 # ----------------------------
 BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"  # base model
-LORA_DIR = "./qwen2.5-7b-uk-fr-lora"  # fine-tuned LoRA
+LORA_DIR = "./qwen2.5-7b-uk-fr-lora-2epoch"  # fine-tuned LoRA
 VALIDATION_FILE = "validation.jsonl"  # small validation subset
 MAX_INPUT_LENGTH = 1024
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"


@@ -27,7 +27,7 @@ pip install -r requirements.txt
 Then run:
 ```bash
-pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
+pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
 ```
 3. **Place your PDF file** in the project's `Traduction` directory, under the name configured in `main.py` (default: `TaniaBorecMemoir(Ukr).pdf`)
@@ -82,6 +82,8 @@ You can modify the following parameters in `main.py`:
 ---
 ## Fine-tuning
+
+/!\ Experimental!
 Fine-tuning gives a better translation. It is a long process in compute time, but allows a more precise translation.
 The principle is as follows:


@@ -1,17 +1,19 @@
-FROM qwen2.5:14b
+FROM translategemma:12b
 PARAMETER temperature 0.2
 PARAMETER num_ctx 8192
 
 SYSTEM """
-Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.
-- Garde le style narratif et les tournures orales de l'auteur.
-- Respecte les règles de traduction suivantes :
+You are a professional Ukrainian (uk) to french (fr) translator. Your goal is to accurately convey the meaning and nuances of the original Ukrainian text while adhering to french grammar, vocabulary, and cultural sensitivities.
+Produce only the french translation, without any additional explanations or commentary. Please translate the following Ukrainian text into french:
+{TEXT}
 
 Règles strictes :
 1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).
 2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.
 3. **Pour les termes historiques** (ex. : "powiat"), utilise le terme français standard et ajoute une note explicative.
 4. **Conserve les citations** russe/allemand/polonais intégrées au texte (mais ajoute une note de fin de paragraphe entre [ ] en la traduisant et en précisant la langue d'origine).
 5. **Structure** : Garde les sauts de ligne et la mise en page originale.
-6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique exist.
+6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique existe (ex. : "[Note : le contexte]").
 """

File diff suppressed because it is too large (18 more files)

Binary file not shown.


@@ -11,15 +11,14 @@ from reportlab.pdfbase.ttfonts import TTFont
 import os, time
 
 # Configuration
-DEBUG = True
 PDF_PATH = "Traduction/TaniaBorecMemoir(Ukr).pdf"
 OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
 OLLAMA_URL = "http://localhost:11434/api/generate"
 TARGET_LANGUAGE = "français"
 CHECKPOINT_FILE = "Traduction/checkpoint.json"
 TEMP_OUTPUT_TXT = "Traduction/output_temp.txt"
-FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.pdf")
-FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.txt")
+FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V10.pdf")
+FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V10.txt")
 DEBUG = True
@@ -174,7 +173,6 @@ def load_checkpoint():
         return json.load(f)
     return {"last_processed_index": -1, "results": {}}
 
-# Save the checkpoint
 # Save the checkpoint
 def save_checkpoint(last_index, results):
     # Sort the keys of the results dictionary
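For context, a hedged completion of save_checkpoint built only from what this hunk shows: the schema mirrors load_checkpoint's default above and the key sorting follows the comment; numeric-string keys and the JSON layout are assumptions, not the repository's actual code:

```python
# Hedged sketch of save_checkpoint (assumptions: numeric-string keys,
# pretty-printed JSON). The schema matches load_checkpoint's default above.
import json

CHECKPOINT_FILE = "Traduction/checkpoint.json"

def save_checkpoint(last_index, results):
    # Sort the result keys numerically so the checkpoint stays readable
    ordered = {k: results[k] for k in sorted(results, key=int)}
    with open(CHECKPOINT_FILE, "w", encoding="utf-8") as f:
        json.dump({"last_processed_index": last_index, "results": ordered},
                  f, ensure_ascii=False, indent=2)
```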

llama.cpp (submodule)

Submodule llama.cpp added at e463bbdf65
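The submodule is presumably vendored to convert the fine-tuned model into a format Ollama can serve. A hedged sketch of that conversion, assuming a merged HF checkpoint as in the merge sketch above; convert_hf_to_gguf.py is llama.cpp's stock converter, and the paths and output name are hypothetical:

```python
# Hedged sketch: convert a merged HF checkpoint to GGUF with the llama.cpp
# submodule so Ollama can serve it. Paths and output name are hypothetical.
import subprocess

subprocess.run(
    [
        "python", "llama.cpp/convert_hf_to_gguf.py",
        "./qwen2.5-7b-uk-fr-merged",           # merged model dir
        "--outfile", "qwen2.5-7b-uk-fr.gguf",  # hypothetical output
        "--outtype", "f16",
    ],
    check=True,
)
```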