diff --git a/Finetunning/cleanDataSet.py b/Finetunning/cleanDataSet.py
new file mode 100644
index 0000000..ce2e7e0
--- /dev/null
+++ b/Finetunning/cleanDataSet.py
@@ -0,0 +1,144 @@
+import json
+import unicodedata
+import re
+from collections import OrderedDict
+
+# ----------------------------
+# Configuration
+# ----------------------------
+INPUT_FILE = "paires.json"
+OUTPUT_FILE = "paires_clean.json"
+
+MIN_TOKENS = 5
+MAX_TOKENS = 200
+MIN_QUALITY_SCORE = 0.60
+
+print("=== Dataset cleaning + quality scoring started ===")
+
+# ----------------------------
+# Normalization helpers
+# ----------------------------
+def normalize_text(text: str) -> str:
+    text = unicodedata.normalize("NFKC", text)
+    text = re.sub(r"\s+", " ", text).strip()
+    text = text.replace("’", "'").replace("‘", "'").replace("“", '"').replace("”", '"')
+    return text
+
+
+def token_count(text: str) -> int:
+    return len(text.split())
+
+
+# ----------------------------
+# Quality scoring
+# ----------------------------
+def length_ratio_score(src_len, tgt_len):
+    """
+    Score the FR/UK length ratio: full score inside 0.75–1.5,
+    zero outside 0.5–2.0, linear falloff in between (ideal ≈ 0.9–1.3).
+    """
+    ratio = tgt_len / max(src_len, 1)
+
+    if ratio < 0.5 or ratio > 2.0:
+        return 0.0
+    elif 0.75 <= ratio <= 1.5:
+        return 1.0
+    else:
+        return max(0.0, 1.0 - abs(ratio - 1.1))
+
+
+def lexical_density_score(text):
+    """
+    Penalize very repetitive or trivial translations.
+    """
+    tokens = text.split()
+    if not tokens:
+        return 0.0
+    unique_ratio = len(set(tokens)) / len(tokens)
+    return min(1.0, unique_ratio * 1.5)
+
+
+def quality_score(src, tgt):
+    src_len = token_count(src)
+    tgt_len = token_count(tgt)
+
+    l_score = length_ratio_score(src_len, tgt_len)
+    d_score = lexical_density_score(tgt)
+
+    return 0.7 * l_score + 0.3 * d_score
+
+
+# ----------------------------
+# Load + clean + score (input is JSON Lines: one pair object per line)
+# ----------------------------
+unique_sources = OrderedDict()
+
+stats = {
+    "total": 0,
+    "removed_length": 0,
+    "removed_duplicates": 0,
+    "removed_quality": 0,
+}
+
+with open(INPUT_FILE, "r", encoding="utf-8") as f:
+    for line in f:
+        stats["total"] += 1
+        item = json.loads(line)
+
+        src = normalize_text(item["text"])
+        tgt = normalize_text(item["translation"])
+
+        src_len = token_count(src)
+        tgt_len = token_count(tgt)
+
+        # Length filtering
+        if not (MIN_TOKENS <= src_len <= MAX_TOKENS):
+            stats["removed_length"] += 1
+            continue
+
+        if not (MIN_TOKENS <= tgt_len <= MAX_TOKENS):
+            stats["removed_length"] += 1
+            continue
+
+        # Deduplication on the normalized source sentence
+        if src in unique_sources:
+            stats["removed_duplicates"] += 1
+            continue
+
+        # Quality score
+        q_score = quality_score(src, tgt)
+        if q_score < MIN_QUALITY_SCORE:
+            stats["removed_quality"] += 1
+            continue
+
+        unique_sources[src] = {
+            "translation": tgt,
+            "quality_score": round(q_score, 3)
+        }
+
+# ----------------------------
+# Report
+# ----------------------------
+print(f"Total lines processed: {stats['total']}")
+print(f"Removed (length): {stats['removed_length']}")
+print(f"Removed (duplicates): {stats['removed_duplicates']}")
+print(f"Removed (quality): {stats['removed_quality']}")
+print(f"Final kept pairs: {len(unique_sources)}")
+
+# ----------------------------
+# Save cleaned dataset
+# ----------------------------
+with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
+    for src, data in unique_sources.items():
+        json.dump(
+            {
+                "text": src,
+                "translation": data["translation"],
+                "quality_score": data["quality_score"],
+            },
+            f,
+            ensure_ascii=False
+        )
+        f.write("\n")
+
+print("=== Cleaning completed ===")
+print(f"Clean dataset saved to: {OUTPUT_FILE}")
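To make the 0.7/0.3 weighting in `quality_score` concrete, here is a worked example. The sentence pair is invented for illustration, and the helpers are inlined as hand arithmetic rather than imported, since importing `cleanDataSet.py` directly would run the whole cleaning pass.

```python
# Worked example of the combined quality score above (sample pair is invented).
src = "Доброго ранку, як справи сьогодні?"          # 5 whitespace tokens
tgt = "Bonjour, comment allez-vous aujourd'hui ?"   # 5 whitespace tokens

ratio = 5 / 5                        # FR/UK = 1.0, inside the 0.75–1.5 plateau
l_score = 1.0                        # length_ratio_score -> 1.0
d_score = min(1.0, (5 / 5) * 1.5)    # all 5 target tokens unique -> 1.0
q = 0.7 * l_score + 0.3 * d_score
print(q)                             # 1.0 >= MIN_QUALITY_SCORE (0.60): pair is kept
```

Both sentences are exactly at `MIN_TOKENS`, so the pair also passes the length filter; a target half or double the source length would instead score 0.0 on length and be dropped.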
diff --git a/Finetunning/finetunning.py b/Finetunning/finetunning.py
index cf8f687..ba5e796 100644
--- a/Finetunning/finetunning.py
+++ b/Finetunning/finetunning.py
@@ -5,6 +5,7 @@ from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
     TrainingArguments,
+    BitsAndBytesConfig
 )
 from peft import (
     LoraConfig,
@@ -19,76 +20,86 @@ from trl import SFTTrainer
 os.environ["TORCHDYNAMO_DISABLE"] = "1"
 
 # ----------------------------
-# Model configuration
+# Global configuration
 # ----------------------------
 MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
+OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
+DATA_FILE = "paires_clean.json"
+MAX_SEQ_LENGTH = 1024
 
-print(f"=== Starting fine-tuning script {MODEL_NAME} ===")
+print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
 
+# ----------------------------
+# [1/7] Tokenizer
+# ----------------------------
 print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(
     MODEL_NAME,
     trust_remote_code=True
 )
 
-# Ensure padding is defined
 tokenizer.pad_token = tokenizer.eos_token
-tokenizer.model_max_length = 1024
+tokenizer.model_max_length = MAX_SEQ_LENGTH
 
-print("Tokenizer loaded and configured.")
+print("Tokenizer loaded.")
+print(f"Pad token id: {tokenizer.pad_token_id}")
+print(f"Max sequence length: {tokenizer.model_max_length}")
 
+# ----------------------------
+# [2/7] Model loading (QLoRA)
+# ----------------------------
 print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_NAME,
     load_in_4bit=True,
     device_map="auto",
-    torch_dtype=torch.float16,  # weights in fp16, gradients fp32
+    dtype=torch.float16,  # "dtype" is the newer spelling of "torch_dtype" in recent transformers
     trust_remote_code=True,
 )
 print("Model loaded.")
 
+# ----------------------------
+# [3/7] Prepare model for k-bit training
+# ----------------------------
 print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
 model = prepare_model_for_kbit_training(model)
 
-# Fix future PyTorch checkpointing behavior
 model.gradient_checkpointing_enable(
     gradient_checkpointing_kwargs={"use_reentrant": False}
 )
 
 print("Model prepared for k-bit training.")
+print("Gradient checkpointing enabled (non-reentrant).")
 
 # ----------------------------
-# LoRA configuration
+# [4/7] LoRA configuration
 # ----------------------------
 print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
 lora_config = LoraConfig(
-    r=16,
-    lora_alpha=32,
-    lora_dropout=0.05,
+    r=32,
+    lora_alpha=64,
+    lora_dropout=0.02,
     bias="none",
     task_type="CAUSAL_LM",
     target_modules=[
-        "q_proj",
-        "k_proj",
-        "v_proj",
-        "o_proj",
-        "gate_proj",
-        "up_proj",
-        "down_proj",
+        "q_proj", "k_proj", "v_proj", "o_proj",
+        "gate_proj", "up_proj", "down_proj"
     ],
 )
 
 model = get_peft_model(model, lora_config)
 model.print_trainable_parameters()
-print("LoRA adapters attached.")
+
+print("LoRA adapters successfully attached.")
 
 # ----------------------------
-# Dataset loading
+# [5/7] Dataset loading & formatting
 # ----------------------------
 print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
 dataset = load_dataset(
     "json",
-    data_files="paires.json"
+    data_files=DATA_FILE
 )
 
 print(f"Dataset loaded with {len(dataset['train'])} samples.")
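Note that the newly imported `BitsAndBytesConfig` is not used by the code in this diff: the model is still loaded with the bare `load_in_4bit=True` flag, which recent transformers releases deprecate in favor of an explicit quantization config. A minimal sketch of the equivalent call follows; the NF4 and double-quantization settings are common QLoRA defaults assumed here, not values taken from this repository.

```python
# Hedged sketch: routing the 4-bit options through the (currently unused)
# BitsAndBytesConfig import instead of the bare load_in_4bit flag.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",             # QLoRA's NormalFloat4 quantization
    bnb_4bit_use_double_quant=True,        # also quantize the quantization constants
    bnb_4bit_compute_dtype=torch.float16,  # matmuls run in fp16
)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-7B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
```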
"<|user|>\n" + "Translate the following Ukrainian text into French.\n" + f"Ukrainian: {example['text']}\n" + "<|assistant|>\n" + f"{example['translation']}" + ) + } dataset = dataset.map( format_prompt, @@ -109,27 +123,31 @@ dataset = dataset.map( ) print("Dataset formatting completed.") +print(f"Example prompt:\n{dataset['train'][0]['text']}") # ---------------------------- -# Training arguments (AMP OFF) +# [6/7] Training arguments # ---------------------------- print(f"{80 * '_'}\n[6/7] Initializing training arguments...") training_args = TrainingArguments( - output_dir="./qwen2.5-7b-uk-fr-lora", + output_dir=OUTPUT_DIR, per_device_train_batch_size=1, gradient_accumulation_steps=8, - learning_rate=2e-4, - num_train_epochs=2, # 2 epochs usually enough for translation + learning_rate=1e-4, + num_train_epochs=3, fp16=False, bf16=False, + optim="paged_adamw_32bit", logging_steps=10, save_steps=500, save_total_limit=2, - optim="paged_adamw_32bit", report_to="none", ) print("Training arguments ready.") +print(f"Output directory: {OUTPUT_DIR}") +print(f"Epochs: {training_args.num_train_epochs}") +print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}") # ---------------------------- # Trainer @@ -138,27 +156,30 @@ print("Initializing SFTTrainer...") trainer = SFTTrainer( model=model, train_dataset=dataset["train"], - processing_class=tokenizer, + tokenizer=tokenizer, args=training_args, ) print("Trainer initialized.") # ---------------------------- -# Train +# [7/7] Training # ---------------------------- print(f"{80 * '_'}\n[7/7] Starting training...") try: trainer.train(resume_from_checkpoint=True) -except: +except Exception as e: + print("No checkpoint found or resume failed, starting fresh training.") + print(f"Reason: {e}") trainer.train() + print("Training completed successfully.") # ---------------------------- # Save LoRA adapter # ---------------------------- -print("Saving LoRA adapter and tokenizer...") -trainer.model.save_pretrained("./qwen2.5-7b-uk-fr-lora") -tokenizer.save_pretrained("./qwen2.5-7b-uk-fr-lora") +print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...") +trainer.model.save_pretrained(OUTPUT_DIR) +tokenizer.save_pretrained(OUTPUT_DIR) -print("=== Fine-tuning finished ===") -print("LoRA adapter saved in ./qwen2.5-7b-uk-fr-lora") +print("\n=== Fine-tuning finished ===") +print(f"LoRA adapter saved in: {OUTPUT_DIR}") diff --git a/README.md b/README.md index 03266a5..a963476 100644 --- a/README.md +++ b/README.md @@ -89,9 +89,11 @@ Le principe est le suivant : ``` 1️⃣ Dataset d’entraînement (pairs.json) ↓ +1️⃣ Dataset nettoyé ( cleanDataSet.py -> pairs_clean.json) + ↓ 2️⃣ Fine-tuning LoRA (finetuning.py) ↓ -3️⃣ Validation / Évaluation (validation.py) +3️⃣ Validation / Évaluation BLEU (validation.py) ↓ 4️⃣ Merge LoRA + modèle de base (mergeLora.py) ↓ @@ -100,6 +102,10 @@ Le principe est le suivant : 6️⃣ Ollama (inférence finale) ``` + +### Nettoyage du dataset +Executer le script ```python cleanDataSet.py``` + ### Validation Executer le script ```python validation.py``` @@ -111,7 +117,7 @@ Il faut ensuite copier ce prompt dans le fichier ModelFile. Executer le script ```python mergeLora.py``` ### Conversion en GGUF -En étant à la racine du projet (et toujorus dans le venv), cloner le projet llama.cpp +En étant à la racine du projet (et toujours dans le venv), cloner le projet llama.cpp ```bash git clone https://github.com/ggerganov/llama.cpp cd llama.cpp