Compare commits
12 Commits
182e6e7a98
...
51e114b1ee
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
51e114b1ee | ||
|
|
fa3ad61dd7 | ||
|
|
70e4932cd0 | ||
|
|
aee2716a41 | ||
|
|
bf7949d8c3 | ||
|
|
a4296d012e | ||
|
|
c5d372e98d | ||
|
|
8d2e5ac021 | ||
|
|
adca297850 | ||
|
|
83b2eccd07 | ||
|
|
8dfb2b81e0 | ||
|
|
4ed1ffa226 |
2
Finetunning/.gitignore
vendored
Normal file
2
Finetunning/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
# Les modèles générés
|
||||||
|
qwen2.5*/
|
||||||
|
Can't render this file because it is too large.
|
144
Finetunning/cleanDataSet.py
Normal file
144
Finetunning/cleanDataSet.py
Normal file
@ -0,0 +1,144 @@
|
|||||||
|
import json
|
||||||
|
import unicodedata
|
||||||
|
import re
|
||||||
|
from collections import OrderedDict
|
||||||
|
|
||||||
|
# ----------------------------
# Configuration
# ----------------------------
INPUT_FILE = "paires.json"
OUTPUT_FILE = "paires_clean.json"

# Length bounds (whitespace-separated tokens) and minimum acceptable quality.
MIN_TOKENS = 5
MAX_TOKENS = 200
MIN_QUALITY_SCORE = 0.60

print("=== Dataset cleaning + quality scoring started ===")
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# Normalization helpers
|
||||||
|
# ----------------------------
|
||||||
|
def normalize_text(text: str) -> str:
    """Canonicalize a sentence: NFKC form, collapsed whitespace, ASCII quotes.

    Visually identical sentences then compare equal, which matters because
    the normalized source string is later used as a deduplication key.
    """
    normalized = unicodedata.normalize("NFKC", text)
    normalized = re.sub(r"\s+", " ", normalized).strip()
    # Map typographic (curly) quotes to their plain ASCII equivalents.
    for fancy, plain in (("’", "'"), ("‘", "'"), ("“", '"'), ("”", '"')):
        normalized = normalized.replace(fancy, plain)
    return normalized
|
||||||
|
|
||||||
|
|
||||||
|
def token_count(text: str) -> int:
    """Return the number of whitespace-separated tokens in *text*."""
    tokens = text.split()
    return len(tokens)
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# Quality scoring
|
||||||
|
# ----------------------------
|
||||||
|
def length_ratio_score(src_len, tgt_len):
    """Score how plausible the target/source length ratio is.

    Ideal FR/UK ratio ≈ 0.9 – 1.3.  Returns 1.0 inside [0.75, 1.5],
    0.0 outside [0.5, 2.0], and a linearly decaying value in between.
    """
    # max(src_len, 1) guards against division by zero on empty sources.
    ratio = tgt_len / max(src_len, 1)

    if not 0.5 <= ratio <= 2.0:
        return 0.0
    if 0.75 <= ratio <= 1.5:
        return 1.0
    # Borderline zone: decay linearly with distance from the ideal 1.1.
    return max(0.0, 1.0 - abs(ratio - 1.1))
|
||||||
|
|
||||||
|
|
||||||
|
def lexical_density_score(text):
    """Penalize very repetitive or trivial translations.

    The score is the type/token ratio scaled by 1.5 and capped at 1.0;
    an empty string scores 0.0.
    """
    tokens = text.split()
    if not tokens:
        return 0.0
    diversity = len(set(tokens)) / len(tokens)
    return min(1.0, diversity * 1.5)
|
||||||
|
|
||||||
|
|
||||||
|
def quality_score(src, tgt):
    """Combined quality of a (source, translation) pair in [0, 1].

    Weighted blend: 70% length-ratio plausibility, 30% lexical
    diversity of the target side.
    """
    l_score = length_ratio_score(token_count(src), token_count(tgt))
    d_score = lexical_density_score(tgt)
    return 0.7 * l_score + 0.3 * d_score
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------
# Load + clean + score
# ----------------------------
unique_sources = OrderedDict()

stats = {
    "total": 0,
    "removed_length": 0,
    "removed_duplicates": 0,
    "removed_quality": 0,
}

# Input is expected to be JSON Lines: one {"text", "translation"} object per line.
with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        stats["total"] += 1
        item = json.loads(line)

        # Canonical forms are used both for filtering and as dedup keys.
        src = normalize_text(item["text"])
        tgt = normalize_text(item["translation"])

        src_len = token_count(src)
        tgt_len = token_count(tgt)

        # Length filtering: both sides must fall within the token bounds.
        if not (MIN_TOKENS <= src_len <= MAX_TOKENS) or not (MIN_TOKENS <= tgt_len <= MAX_TOKENS):
            stats["removed_length"] += 1
            continue

        # Deduplication: the first translation seen for a source wins.
        if src in unique_sources:
            stats["removed_duplicates"] += 1
            continue

        # Quality gate.
        q_score = quality_score(src, tgt)
        if q_score < MIN_QUALITY_SCORE:
            stats["removed_quality"] += 1
            continue

        unique_sources[src] = {
            "translation": tgt,
            "quality_score": round(q_score, 3),
        }
|
||||||
|
|
||||||
|
# ----------------------------
# Report
# ----------------------------
for label, value in (
    ("Total lines processed", stats["total"]),
    ("Removed (length)", stats["removed_length"]),
    ("Removed (duplicates)", stats["removed_duplicates"]),
    ("Removed (quality)", stats["removed_quality"]),
    ("Final kept pairs", len(unique_sources)),
):
    print(f"{label}: {value}")
|
||||||
|
|
||||||
|
# ----------------------------
# Save cleaned dataset
# ----------------------------
# Write JSON Lines: one object per line, non-ASCII preserved as-is.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for src, data in unique_sources.items():
        record = {
            "text": src,
            "translation": data["translation"],
            "quality_score": data["quality_score"],
        }
        f.write(json.dumps(record, ensure_ascii=False))
        f.write("\n")

print(f"=== Cleaning completed ===")
print(f"Clean dataset saved to: {OUTPUT_FILE}")
|
||||||
@ -1,9 +1,11 @@
|
|||||||
|
import os
|
||||||
import torch
|
import torch
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from transformers import (
|
from transformers import (
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
AutoModelForCausalLM,
|
AutoModelForCausalLM,
|
||||||
TrainingArguments,
|
TrainingArguments,
|
||||||
|
BitsAndBytesConfig,
|
||||||
)
|
)
|
||||||
from peft import (
|
from peft import (
|
||||||
LoraConfig,
|
LoraConfig,
|
||||||
@ -11,50 +13,85 @@ from peft import (
|
|||||||
prepare_model_for_kbit_training,
|
prepare_model_for_kbit_training,
|
||||||
)
|
)
|
||||||
from trl import SFTTrainer
|
from trl import SFTTrainer
|
||||||
import os
|
|
||||||
|
# ----------------------------
|
||||||
|
# Environment safety (Windows)
|
||||||
|
# ----------------------------
|
||||||
os.environ["TORCHDYNAMO_DISABLE"] = "1"
|
os.environ["TORCHDYNAMO_DISABLE"] = "1"
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# Model configuration
|
# Global configuration
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct"
|
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
|
||||||
|
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
|
||||||
|
DATA_FILE = "paires_clean.json"
|
||||||
|
MAX_SEQ_LENGTH = 1024
|
||||||
|
|
||||||
print("=== Starting fine-tuning script ===")
|
print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# [1/7] Tokenizer
|
||||||
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
|
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
MODEL_NAME,
|
MODEL_NAME,
|
||||||
trust_remote_code=True
|
trust_remote_code=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# Ensure padding token is defined
|
|
||||||
tokenizer.pad_token = tokenizer.eos_token
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
tokenizer.model_max_length = 1024
|
tokenizer.model_max_length = MAX_SEQ_LENGTH
|
||||||
|
|
||||||
print("Tokenizer loaded and configured.")
|
print("Tokenizer loaded.")
|
||||||
|
print(f"Pad token id: {tokenizer.pad_token_id}")
|
||||||
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
|
print(f"Max sequence length: {tokenizer.model_max_length}")
|
||||||
model = AutoModelForCausalLM.from_pretrained(
|
|
||||||
MODEL_NAME,
|
|
||||||
load_in_4bit=True,
|
|
||||||
device_map="auto",
|
|
||||||
torch_dtype=torch.float16, # OK for weights
|
|
||||||
trust_remote_code=True,
|
|
||||||
)
|
|
||||||
print("Model loaded.")
|
|
||||||
|
|
||||||
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
|
|
||||||
model = prepare_model_for_kbit_training(model)
|
|
||||||
print("Model prepared for k-bit training.")
|
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# LoRA configuration
|
# [2/7] Quantization config (QLoRA)
|
||||||
|
# ----------------------------
|
||||||
|
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (optimized QLoRA)...")
|
||||||
|
|
||||||
|
assert torch.cuda.is_available(), "CUDA GPU not detected!"
|
||||||
|
print(f"Using GPU: {torch.cuda.get_device_name(0)}")
|
||||||
|
|
||||||
|
bnb_config = BitsAndBytesConfig(
|
||||||
|
load_in_4bit=True,
|
||||||
|
bnb_4bit_quant_type="nf4",
|
||||||
|
bnb_4bit_compute_dtype=torch.float16,
|
||||||
|
bnb_4bit_use_double_quant=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
MODEL_NAME,
|
||||||
|
device_map="cuda", # 🔥 SAFE
|
||||||
|
quantization_config=bnb_config,
|
||||||
|
low_cpu_mem_usage=True,
|
||||||
|
trust_remote_code=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Model loaded successfully in 4-bit mode on GPU.")
|
||||||
|
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# [3/7] Prepare model for k-bit training
|
||||||
|
# ----------------------------
|
||||||
|
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
|
||||||
|
model = prepare_model_for_kbit_training(model)
|
||||||
|
|
||||||
|
model.gradient_checkpointing_enable(
|
||||||
|
gradient_checkpointing_kwargs={"use_reentrant": False}
|
||||||
|
)
|
||||||
|
|
||||||
|
print("Model prepared for k-bit training.")
|
||||||
|
print("Gradient checkpointing enabled (non-reentrant).")
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# [4/7] LoRA configuration
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
|
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
|
||||||
lora_config = LoraConfig(
|
lora_config = LoraConfig(
|
||||||
r=16,
|
r=32,
|
||||||
lora_alpha=32,
|
lora_alpha=64,
|
||||||
lora_dropout=0.05,
|
lora_dropout=0.02,
|
||||||
bias="none",
|
bias="none",
|
||||||
task_type="CAUSAL_LM",
|
task_type="CAUSAL_LM",
|
||||||
target_modules=[
|
target_modules=[
|
||||||
@ -70,57 +107,64 @@ lora_config = LoraConfig(
|
|||||||
|
|
||||||
model = get_peft_model(model, lora_config)
|
model = get_peft_model(model, lora_config)
|
||||||
model.print_trainable_parameters()
|
model.print_trainable_parameters()
|
||||||
print("LoRA adapters attached to the model.")
|
|
||||||
|
print("LoRA adapters successfully attached.")
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# Dataset loading
|
# [5/7] Dataset loading & formatting
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
|
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
|
||||||
dataset = load_dataset(
|
dataset = load_dataset("json", data_files=DATA_FILE)
|
||||||
"json",
|
|
||||||
data_files="traductions.json"
|
|
||||||
)
|
|
||||||
print(f"Dataset loaded with {len(dataset['train'])} samples.")
|
print(f"Dataset loaded with {len(dataset['train'])} samples.")
|
||||||
|
|
||||||
print("Formatting dataset for Ukrainian → French translation...")
|
print("Formatting dataset for Ukrainian → French translation...")
|
||||||
|
|
||||||
def format_prompt(example):
|
def format_prompt(example):
|
||||||
prompt = (
|
return {
|
||||||
"Translate the following Ukrainian text into French.\n\n"
|
"text": ("<|user|>\n"
|
||||||
|
"Translate the following Ukrainian text into French.\n"
|
||||||
f"Ukrainian: {example['text']}\n"
|
f"Ukrainian: {example['text']}\n"
|
||||||
f"French: {example['translation']}"
|
"<|assistant|>\n"
|
||||||
|
f"{example['translation']}"
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
dataset = dataset.map(
|
||||||
|
format_prompt,
|
||||||
|
remove_columns=dataset["train"].column_names
|
||||||
)
|
)
|
||||||
return {"text": prompt}
|
|
||||||
|
|
||||||
dataset = dataset.map(format_prompt, remove_columns=dataset["train"].column_names)
|
|
||||||
print("Dataset formatting completed.")
|
print("Dataset formatting completed.")
|
||||||
|
print("Example prompt:\n")
|
||||||
|
print(dataset["train"][0]["text"])
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# Training arguments
|
# [6/7] Training arguments
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
|
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
|
||||||
training_args = TrainingArguments(
|
training_args = TrainingArguments(
|
||||||
output_dir="./qwen-uk-fr-lora",
|
output_dir=OUTPUT_DIR,
|
||||||
per_device_train_batch_size=1,
|
per_device_train_batch_size=1,
|
||||||
gradient_accumulation_steps=8,
|
gradient_accumulation_steps=8,
|
||||||
learning_rate=2e-4,
|
learning_rate=1e-4,
|
||||||
num_train_epochs=3,
|
num_train_epochs=3,
|
||||||
|
|
||||||
fp16=False,
|
fp16=False,
|
||||||
bf16=False,
|
bf16=False,
|
||||||
|
optim="paged_adamw_32bit",
|
||||||
logging_steps=10,
|
logging_steps=10,
|
||||||
save_steps=500,
|
save_steps=500,
|
||||||
save_total_limit=2,
|
save_total_limit=2,
|
||||||
|
|
||||||
# Use 32-bit optimizer
|
|
||||||
optim="paged_adamw_32bit",
|
|
||||||
|
|
||||||
report_to="none",
|
report_to="none",
|
||||||
|
dataloader_pin_memory=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
print("Training arguments ready.")
|
print("Training arguments ready.")
|
||||||
|
print(f"Output directory: {OUTPUT_DIR}")
|
||||||
|
print(f"Epochs: {training_args.num_train_epochs}")
|
||||||
|
print(f"Effective batch size: "
|
||||||
|
f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}"
|
||||||
|
)
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# Trainer
|
# Trainer
|
||||||
@ -135,18 +179,34 @@ trainer = SFTTrainer(
|
|||||||
print("Trainer initialized.")
|
print("Trainer initialized.")
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# Train
|
# [7/7] Training
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\n[7/7] Starting training...")
|
print(f"{80 * '_'}\n[7/7] Starting training...")
|
||||||
trainer.train()
|
checkpoint_exists = any(
|
||||||
|
d.startswith("checkpoint-")
|
||||||
|
for d in os.listdir(OUTPUT_DIR)
|
||||||
|
) if os.path.exists(OUTPUT_DIR) else False
|
||||||
|
|
||||||
|
if checkpoint_exists:
|
||||||
|
print("Checkpoint found → resuming training")
|
||||||
|
train_output = trainer.train(resume_from_checkpoint=True)
|
||||||
|
else:
|
||||||
|
print("No checkpoint found → starting fresh training")
|
||||||
|
train_output = trainer.train()
|
||||||
|
|
||||||
|
|
||||||
|
print("\n=== Training summary ===")
|
||||||
|
print(f"Global steps: {train_output.global_step}")
|
||||||
|
print(f"Training loss: {train_output.training_loss}")
|
||||||
|
print(f"Metrics: {train_output.metrics}")
|
||||||
print("Training completed successfully.")
|
print("Training completed successfully.")
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# Save LoRA adapter
|
# Save LoRA adapter
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print("Saving LoRA adapter and tokenizer...")
|
print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")
|
||||||
trainer.model.save_pretrained("./qwen-uk-fr-lora")
|
trainer.model.save_pretrained(OUTPUT_DIR)
|
||||||
tokenizer.save_pretrained("./qwen-uk-fr-lora")
|
tokenizer.save_pretrained(OUTPUT_DIR)
|
||||||
|
|
||||||
print("=== Fine-tuning finished ===")
|
print("\n=== Fine-tuning finished ===")
|
||||||
print("LoRA adapter saved in ./qwen-uk-fr-lora")
|
print(f"LoRA adapter saved in: {OUTPUT_DIR}")
|
||||||
|
|||||||
68
Finetunning/mergeLora.py
Normal file
68
Finetunning/mergeLora.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# ----------------------------
# Configuration
# ----------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
LORA_DIR = "./qwen2.5-7b-uk-fr-lora"      # directory produced by the fine-tuning run
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-merged"  # final merged model

DTYPE = torch.float16  # GGUF-friendly
DEVICE = "cpu"         # merge on CPU (stable, safe)

print("=== LoRA merge script started ===")

# [1/4] Load the frozen base model in fp16 on CPU.
print(f"{80 * '_'}\n[1/4] Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=DTYPE,
    device_map=DEVICE,
    trust_remote_code=True,
)
print("Base model loaded.")

# [2/4] Load the matching tokenizer and give it a pad token.
print(f"{80 * '_'}\n[2/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

# [3/4] Attach the trained LoRA adapter on top of the base weights.
print(f"{80 * '_'}\n[3/4] Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, LORA_DIR)
print("LoRA adapter loaded.")

# [4/4] Fold the adapter weights into the base model and drop the PEFT wrappers.
print(f"{80 * '_'}\n[4/4] Merging LoRA into base model...")
model = model.merge_and_unload()
print("LoRA successfully merged.")

# Persist the merged model (safetensors) and the tokenizer side by side.
print("Saving merged model...")
model.save_pretrained(
    OUTPUT_DIR,
    safe_serialization=True,
)
tokenizer.save_pretrained(OUTPUT_DIR)

print("=== Merge completed successfully ===")
print(f"Merged model saved in: {OUTPUT_DIR}")
|
||||||
@ -11,13 +11,11 @@
|
|||||||
{"text": "Як на мене, то наразі помовчу.", "translation": "En ce qui me concerne, je n’ai pour le moment rien à dire."}
|
{"text": "Як на мене, то наразі помовчу.", "translation": "En ce qui me concerne, je n’ai pour le moment rien à dire."}
|
||||||
{"text": "Мій дядько вчора помер від раку шлунку.", "translation": "Mon oncle est mort hier d’un cancer à l’estomac."}
|
{"text": "Мій дядько вчора помер від раку шлунку.", "translation": "Mon oncle est mort hier d’un cancer à l’estomac."}
|
||||||
{"text": "Я не знаю, що ще можна зробити.", "translation": "Je ne sais plus quoi faire."}
|
{"text": "Я не знаю, що ще можна зробити.", "translation": "Je ne sais plus quoi faire."}
|
||||||
{"text": "Я навчився жити без неї.", "translation": "J’ai appris à vivre sans elle."}
|
|
||||||
{"text": "Я навчився жити без неї.", "translation": "J'ai appris à vivre sans elle."}
|
{"text": "Я навчився жити без неї.", "translation": "J'ai appris à vivre sans elle."}
|
||||||
{"text": "Справді?", "translation": "Vraiment ?"}
|
{"text": "Справді?", "translation": "Vraiment ?"}
|
||||||
{"text": "Справді?", "translation": "C'est vrai ?"}
|
{"text": "Справді?", "translation": "C'est vrai ?"}
|
||||||
{"text": "Справді?", "translation": "Vrai ?"}
|
{"text": "Справді?", "translation": "Vrai ?"}
|
||||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J’ai toujours préféré les personnages mystérieux."}
|
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J’ai toujours préféré les personnages mystérieux."}
|
||||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J'ai toujours préféré les personnages mystérieux."}
|
|
||||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J'ai toujours plus apprécié les personnages mystérieux."}
|
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J'ai toujours plus apprécié les personnages mystérieux."}
|
||||||
{"text": "Тобі краще поспати.", "translation": "Tu ferais mieux de dormir."}
|
{"text": "Тобі краще поспати.", "translation": "Tu ferais mieux de dormir."}
|
||||||
{"text": "Обдумай це.", "translation": "Penses-y."}
|
{"text": "Обдумай це.", "translation": "Penses-y."}
|
||||||
@ -69,7 +67,6 @@
|
|||||||
{"text": "Ця традиція зародилася в Китаї.", "translation": "Cette tradition est née en Chine."}
|
{"text": "Ця традиція зародилася в Китаї.", "translation": "Cette tradition est née en Chine."}
|
||||||
{"text": "У Японії є дипломатичні стосунки з Китаєм.", "translation": "Le Japon a des relations diplomatiques avec la Chine."}
|
{"text": "У Японії є дипломатичні стосунки з Китаєм.", "translation": "Le Japon a des relations diplomatiques avec la Chine."}
|
||||||
{"text": "Він повернувся з Китаю.", "translation": "Il est revenu de Chine."}
|
{"text": "Він повернувся з Китаю.", "translation": "Il est revenu de Chine."}
|
||||||
{"text": "Він повернувся з Китаю.", "translation": "Il est reparti de Chine."}
|
|
||||||
{"text": "Він повернувся з Китаю.", "translation": "Il revint de Chine."}
|
{"text": "Він повернувся з Китаю.", "translation": "Il revint de Chine."}
|
||||||
{"text": "Він повернувся з Китаю.", "translation": "Il est rentré de Chine."}
|
{"text": "Він повернувся з Китаю.", "translation": "Il est rentré de Chine."}
|
||||||
{"text": "Він написав книжку про порцеляну.", "translation": "Il a écrit un livre sur la porcelaine."}
|
{"text": "Він написав книжку про порцеляну.", "translation": "Il a écrit un livre sur la porcelaine."}
|
||||||
@ -125,7 +122,6 @@
|
|||||||
{"text": "Я кохаю тебе.", "translation": "Je t'aime !"}
|
{"text": "Я кохаю тебе.", "translation": "Je t'aime !"}
|
||||||
{"text": "З днем народження!", "translation": "Bon anniversaire !"}
|
{"text": "З днем народження!", "translation": "Bon anniversaire !"}
|
||||||
{"text": "З днем народження!", "translation": "Joyeux anniversaire !"}
|
{"text": "З днем народження!", "translation": "Joyeux anniversaire !"}
|
||||||
{"text": "З днем народження!", "translation": "Joyeux anniversaire."}
|
|
||||||
{"text": "Кожному своє.", "translation": "À chacun son goût."}
|
{"text": "Кожному своє.", "translation": "À chacun son goût."}
|
||||||
{"text": "Кожному своє.", "translation": "Chacun son truc."}
|
{"text": "Кожному своє.", "translation": "Chacun son truc."}
|
||||||
{"text": "Скільки це коштує?", "translation": "Ça coûte combien ?"}
|
{"text": "Скільки це коштує?", "translation": "Ça coûte combien ?"}
|
||||||
@ -171,7 +167,6 @@
|
|||||||
{"text": "В інтернеті мало сайтів татарською мовою.", "translation": "Il y a peu de sites en langue Tatar sur Internet."}
|
{"text": "В інтернеті мало сайтів татарською мовою.", "translation": "Il y a peu de sites en langue Tatar sur Internet."}
|
||||||
{"text": "Удачі на іспиті!", "translation": "Bonne chance pour ton examen !"}
|
{"text": "Удачі на іспиті!", "translation": "Bonne chance pour ton examen !"}
|
||||||
{"text": "Де ти живеш?", "translation": "Où habites-tu ?"}
|
{"text": "Де ти живеш?", "translation": "Où habites-tu ?"}
|
||||||
{"text": "Де ти живеш?", "translation": "Où résides-tu ?"}
|
|
||||||
{"text": "Де ти живеш?", "translation": "Tu habites où ?"}
|
{"text": "Де ти живеш?", "translation": "Tu habites où ?"}
|
||||||
{"text": "Де ти живеш?", "translation": "Où demeures-tu ?"}
|
{"text": "Де ти живеш?", "translation": "Où demeures-tu ?"}
|
||||||
{"text": "Де ти живеш?", "translation": "Où vis-tu ?"}
|
{"text": "Де ти живеш?", "translation": "Où vis-tu ?"}
|
||||||
@ -2,7 +2,7 @@ import json
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
# Chemin vers ton fichier d'entrée et de sortie
|
# Chemin vers ton fichier d'entrée et de sortie
|
||||||
input_file = "Paires de phrases en ukrainien-français - 2026-01-06.tsv" # Remplace par ton chemin
|
input_file = "Paires-de-phrases-en-ukrainien-francais-2026-01-06.tsv" # Remplace par ton chemin
|
||||||
output_file = "paires.json" # Fichier de sortie
|
output_file = "paires.json" # Fichier de sortie
|
||||||
|
|
||||||
# Dictionnaire pour stocker les paires uniques (clé = phrase ukrainienne, valeur = liste de traductions)
|
# Dictionnaire pour stocker les paires uniques (clé = phrase ukrainienne, valeur = liste de traductions)
|
||||||
|
|||||||
30
Finetunning/validation.jsonl
Normal file
30
Finetunning/validation.jsonl
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
{"text": "Як би ти не намагався, ти не вивчиш англійську за два-три місяці.", "translation": "Quels que soient tes efforts, tu ne pourras pas apprendre l’anglais en deux-trois mois."}
|
||||||
|
{"text": "Поки я не подзвонив, він не прийшов.", "translation": "Il n’est pas venu avant que je ne l’appelle."}
|
||||||
|
{"text": "У всесвіті багато галактик.", "translation": "Il y a beaucoup de galaxies dans l'univers."}
|
||||||
|
{"text": "Вона приймає душ щоранку.", "translation": "Elle prend une douche chaque matin."}
|
||||||
|
{"text": "У Майка є декілька друзів у Флориді.", "translation": "Mike a quelques amis en Floride."}
|
||||||
|
{"text": "Я зустрінуся з тобою в неділю о третій.", "translation": "On se voit dimanche à trois heures."}
|
||||||
|
{"text": "Я сказав собі: «Це гарна ідея».", "translation": "Je me suis dit : « C’est une bonne idée. »"}
|
||||||
|
{"text": "Ми збиралися пробути там біля двох тижнів.", "translation": "Nous avions l’intention de rester là près de deux semaines."}
|
||||||
|
{"text": "Я чищу зуби двічі на день.", "translation": "Je me brosse les dents deux fois par jour."}
|
||||||
|
{"text": "Він ніжно поклав руку на її плече.", "translation": "Il posa la main gentiment sur son épaule."}
|
||||||
|
{"text": "Сьогодні жахливо холодно.", "translation": "Il fait horriblement froid aujourd'hui."}
|
||||||
|
{"text": "У цю суму включено податки.", "translation": "Cette somme inclut les taxes."}
|
||||||
|
{"text": "Ця школа була заснована в 1650 році.", "translation": "Cette école fut fondée en 1650."}
|
||||||
|
{"text": "Я випадково знайшов цей ресторан.", "translation": "J'ai trouvé ce restaurant par hasard."}
|
||||||
|
{"text": "Я не хотів нікого образити.", "translation": "Je ne voulais vexer personne."}
|
||||||
|
{"text": "Цей сад найкраще виглядає весною.", "translation": "Ce parc est plus joli au printemps."}
|
||||||
|
{"text": "Цей сир виготовлено з овечого молока.", "translation": "Ce fromage est fait avec du lait de brebis."}
|
||||||
|
{"text": "Він спить як немовля.", "translation": "Il dort comme un bébé."}
|
||||||
|
{"text": "Гора вкрита снігом.", "translation": "La montagne est recouverte de neige."}
|
||||||
|
{"text": "Я попав під дощ і промок.", "translation": "J’ai été pris sous la pluie, et suis tout trempé."}
|
||||||
|
{"text": "Прошу, дайте мені ще один шанс.", "translation": "Je vous en prie, donnez-moi encore une chance."}
|
||||||
|
{"text": "Я все сказав.", "translation": "J’ai tout dit."}
|
||||||
|
{"text": "Не забувай нас!", "translation": "Ne nous oublie pas !"}
|
||||||
|
{"text": "Випало багато снігу.", "translation": "Beaucoup de neige est tombée."}
|
||||||
|
{"text": "Йде сніг.", "translation": "Il est en train de neiger."}
|
||||||
|
{"text": "Може піти сніг.", "translation": "Il neigera peut-être."}
|
||||||
|
{"text": "У нас у січні йде сніг.", "translation": "Chez nous, il neige en janvier."}
|
||||||
|
{"text": "Сніг розтав.", "translation": "La neige a fondu."}
|
||||||
|
{"text": "Наша компанія планує побудувати новий хімічний завод у Росії.", "translation": "Notre entreprise a le projet de construire une nouvelle usine chimique en Russie."}
|
||||||
|
{"text": "Франція воювала з Росією.", "translation": "La France fut en guerre avec la Russie."}
|
||||||
170
Finetunning/validation.py
Normal file
170
Finetunning/validation.py
Normal file
@ -0,0 +1,170 @@
|
|||||||
|
import torch
|
||||||
|
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
|
||||||
|
from peft import PeftModel
|
||||||
|
from datasets import load_dataset
|
||||||
|
from nltk.translate.bleu_score import corpus_bleu
|
||||||
|
|
||||||
|
# ----------------------------
# Configuration
# ----------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"   # base model
LORA_DIR = "./qwen2.5-7b-uk-fr-lora"      # fine-tuned LoRA
VALIDATION_FILE = "validation.jsonl"      # small validation subset
MAX_INPUT_LENGTH = 1024
# Run on GPU when available, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
||||||
|
|
||||||
|
# Liste des prompts à tester
|
||||||
|
PROMPTS_TO_TEST = [
|
||||||
|
{
|
||||||
|
"name": "Prompt de base",
|
||||||
|
"prompt": "Traduis la phrase ukrainienne suivante en français: {text}"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Prompt spécialisé mémoires",
|
||||||
|
"prompt": (
|
||||||
|
"Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.\n"
|
||||||
|
"- Garde le style narratif et les tournures orales de l'auteur.\n"
|
||||||
|
"- Respecte les règles de traduction suivantes :\n\n"
|
||||||
|
"Règles strictes :\n"
|
||||||
|
"1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).\n"
|
||||||
|
"2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.\n\n"
|
||||||
|
"Voici la phrase à traduire :\nUkrainien : {text}\nFrançais :"
|
||||||
|
)
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Prompt détaillé",
|
||||||
|
"prompt": (
|
||||||
|
"Tu es un expert en traduction littéraire spécialisé dans les textes historiques ukrainiens.\n"
|
||||||
|
"Règles à suivre absolument :\n"
|
||||||
|
"1. Conserve tous les noms propres et toponymes dans leur forme originale\n"
|
||||||
|
"2. Préserve le style et le registre de l'auteur original\n"
|
||||||
|
"3. Ajoute des notes entre crochets pour expliquer les références culturelles si nécessaire\n"
|
||||||
|
"4. Traduis de manière naturelle en français tout en restant fidèle au texte source\n\n"
|
||||||
|
"Texte à traduire :\nUkrainien : {text}\nTraduction française :"
|
||||||
|
)
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "Prompt minimaliste",
|
||||||
|
"prompt": "Traduction fidèle de l'ukrainien vers le français : {text}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
|
||||||
|
print("=== Loading tokenizer and model ===")
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# Load tokenizer
|
||||||
|
# ----------------------------
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
|
BASE_MODEL,
|
||||||
|
trust_remote_code=True
|
||||||
|
)
|
||||||
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
|
tokenizer.model_max_length = MAX_INPUT_LENGTH
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# Load base model directly on GPU
|
||||||
|
# ----------------------------
|
||||||
|
print(f"{80 * '_'}\nLoading base model on GPU...")
|
||||||
|
base_model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
BASE_MODEL,
|
||||||
|
torch_dtype=torch.float16,
|
||||||
|
device_map={"": 0}, # all on GPU
|
||||||
|
trust_remote_code=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# Apply LoRA adapter
|
||||||
|
# ----------------------------
|
||||||
|
print(f"{80 * '_'}\nApplying LoRA adapter...")
|
||||||
|
model = PeftModel.from_pretrained(base_model, LORA_DIR)
|
||||||
|
model.eval()
|
||||||
|
model.to(DEVICE) # ensure everything on GPU
|
||||||
|
print("Model ready for validation.")
|
||||||
|
|
||||||
|
# ----------------------------
|
||||||
|
# Load validation dataset
|
||||||
|
# ----------------------------
|
||||||
|
print(f"{80 * '_'}\nLoading validation dataset...")
|
||||||
|
dataset = load_dataset("json", data_files=VALIDATION_FILE)
|
||||||
|
examples = dataset["train"]
|
||||||
|
print(f"{len(examples)} examples loaded for testing.")
|
||||||
|
|
||||||
|
# ----------------------------
# Translation function
# ----------------------------
@torch.inference_mode()
def translate(text, prompt_template):
    """Translate Ukrainian *text* to French with the LoRA fine-tuned model.

    Parameters
    ----------
    text : str
        Ukrainian source text to translate.
    prompt_template : str
        Prompt with a ``{text}`` placeholder for the source text.

    Returns
    -------
    str
        The model's French translation, stripped of the prompt.
    """
    prompt = prompt_template.format(text=text)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_LENGTH
    ).to(DEVICE)

    # Use a GenerationConfig to avoid deprecation warnings about ad-hoc
    # generate() keyword arguments.
    generation_config = GenerationConfig.from_model_config(model.config)
    generation_config.max_new_tokens = 256
    generation_config.do_sample = False  # greedy decoding for reproducibility
    # Avoid the "pad_token_id not set" warning during generation.
    generation_config.pad_token_id = tokenizer.pad_token_id

    outputs = model.generate(
        **inputs,
        generation_config=generation_config
    )

    # BUG FIX: decode only the newly generated tokens instead of the whole
    # sequence. The previous code decoded prompt + answer and stripped the
    # prompt with string splits; the fallback `result.split(text)[-1]` broke
    # whenever tokenization did not round-trip the source text exactly.
    prompt_len = inputs["input_ids"].shape[1]
    generated = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

    # Safety net: if the model echoes a prompt marker, keep only the part
    # after the last marker (mirrors the legacy extraction behavior).
    for marker in ("Français :", "Traduction française :"):
        if marker in generated:
            return generated.split(marker)[-1].strip()
    return generated.strip()
|
||||||
|
|
||||||
|
# ----------------------------
# Evaluate all prompts and select best BLEU
# ----------------------------
best_bleu = 0
best_prompt = None
all_results = {}  # prompt name -> corpus BLEU score (0-100 scale)

print(f"{80 * '_'}\nTesting all prompts and computing BLEU scores...")

for prompt_config in PROMPTS_TO_TEST:
    print(f"\n{80 * '='}\nTesting prompt: {prompt_config['name']}\n{80 * '='}")
    # corpus_bleu expects a list of reference *lists* per hypothesis,
    # each reference being a token list — hence the extra nesting below.
    references = []
    hypotheses = []

    for i, example in enumerate(examples):
        src_text = example["text"]
        ref_text = example["translation"]
        pred_text = translate(src_text, prompt_config["prompt"])

        print(f"\n[{i+1}] Source: {src_text}")
        print(f" Reference: {ref_text}")
        print(f" Prediction: {pred_text}")

        references.append([ref_text.split()])
        hypotheses.append(pred_text.split())

    bleu_score = corpus_bleu(references, hypotheses) * 100
    print(f"\n=== Corpus BLEU score for '{prompt_config['name']}': {bleu_score:.4f} ===")

    all_results[prompt_config["name"]] = bleu_score

    # BUG FIX: with the previous strict `bleu_score > best_bleu` test and
    # best_bleu initialized to 0, a run where every prompt scored 0.0 left
    # best_prompt = None, and the final report crashed with a TypeError on
    # best_prompt['name']. Always keep the first prompt as a baseline.
    if best_prompt is None or bleu_score > best_bleu:
        best_bleu = bleu_score
        best_prompt = prompt_config

# ----------------------------
# Display results
# ----------------------------
print(f"\n{80 * '='}\nFINAL RESULTS\n{80 * '='}")
for prompt_name, score in all_results.items():
    print(f"{prompt_name}: {score:.4f}")

print(f"\nBEST PROMPT: {best_prompt['name']} with BLEU score: {best_bleu:.4f}")
print(f"Prompt content:\n{best_prompt['prompt']}")
|
||||||
70
README.md
70
README.md
@ -80,3 +80,73 @@ Vous pouvez modifier les paramètres suivants dans `main.py` :
|
|||||||
- `OUTPUT_PDF_PATH` : Chemin et nom du fichier PDF de sortie (généré automatiquement)
|
- `OUTPUT_PDF_PATH` : Chemin et nom du fichier PDF de sortie (généré automatiquement)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Fine-tuning
|
||||||
|
Le fine-tuning permet d'avoir une meilleure traduction. C'est un processus long en temps de calcul, mais il permet une traduction plus précise.
|
||||||
|
|
||||||
|
Le principe est le suivant :
|
||||||
|
|
||||||
|
```
|
||||||
|
1️⃣ Dataset d’entraînement (pairs.json)
|
||||||
|
↓
|
||||||
|
1️⃣ Dataset nettoyé ( cleanDataSet.py -> pairs_clean.json)
|
||||||
|
↓
|
||||||
|
2️⃣ Fine-tuning LoRA (finetuning.py)
|
||||||
|
↓
|
||||||
|
3️⃣ Validation / Évaluation BLEU (validation.py)
|
||||||
|
↓
|
||||||
|
4️⃣ Merge LoRA + modèle de base (mergeLora.py)
|
||||||
|
↓
|
||||||
|
5️⃣ Conversion en GGUF ()
|
||||||
|
↓
|
||||||
|
6️⃣ Ollama (inférence finale)
|
||||||
|
|
||||||
|
```
|
||||||
|
|
||||||
|
### Nettoyage du dataset
|
||||||
|
Exécuter le script ```python cleanDataSet.py```
|
||||||
|
|
||||||
|
### Validation
|
||||||
|
Executer le script ```python validation.py```
|
||||||
|
|
||||||
|
Le script teste plusieurs prompts et renvoie celui avec le meilleur score BLEU.
|
||||||
|
|
||||||
|
Il faut ensuite copier ce prompt dans le fichier ModelFile.
|
||||||
|
|
||||||
|
### Merge
|
||||||
|
Executer le script ```python mergeLora.py```
|
||||||
|
|
||||||
|
### Conversion en GGUF
|
||||||
|
En étant à la racine du projet (et toujours dans le venv), cloner le projet llama.cpp
|
||||||
|
```bash
|
||||||
|
git clone https://github.com/ggerganov/llama.cpp
|
||||||
|
cd llama.cpp
|
||||||
|
pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
Et lancer la commande (/!\ ça prend environ 10 minutes) :
|
||||||
|
```bash
|
||||||
|
python convert_hf_to_gguf.py ../Finetunning/qwen2.5-7b-uk-fr-merged --outfile qwen2.5-7b-uk-fr.gguf --outtype q8_0
|
||||||
|
```
|
||||||
|
|
||||||
|
Vérification :
|
||||||
|
```bash
|
||||||
|
./main -m qwen2.5-7b-uk-fr.gguf -p "Translate into French: Привіт світ"
|
||||||
|
```
|
||||||
|
Pour que ce nouveau modèle soit exploitable par ollama, il faut TODO
|
||||||
|
|
||||||
|
## Utilisation du modèle fine-tunné pour la traduction
|
||||||
|
Créer un Modelfile :
|
||||||
|
```
|
||||||
|
FROM ./qwen2.5-7b-uk-fr.gguf
|
||||||
|
|
||||||
|
PARAMETER temperature 0.1
|
||||||
|
PARAMETER top_p 0.95
|
||||||
|
PARAMETER num_ctx 4096
|
||||||
|
|
||||||
|
SYSTEM """
|
||||||
|
You are a professional Ukrainian to French translator.
|
||||||
|
Produce faithful, literal translations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
```
|
||||||
@ -3,14 +3,15 @@ PARAMETER temperature 0.2
|
|||||||
PARAMETER num_ctx 8192
|
PARAMETER num_ctx 8192
|
||||||
|
|
||||||
SYSTEM """
|
SYSTEM """
|
||||||
|
|
||||||
Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.
|
Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.
|
||||||
- Utilise le glossaire fourni pour les noms de lieux et termes historiques.
|
|
||||||
- Garde le style narratif et les tournures orales de l'auteur.
|
- Garde le style narratif et les tournures orales de l'auteur.
|
||||||
|
- Respecte les règles de traduction suivantes :
|
||||||
Règles strictes :
|
Règles strictes :
|
||||||
1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).
|
1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).
|
||||||
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l’auteur.
|
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.
|
||||||
3. **Pour les termes historiques** (ex. : "powiat"), utilise le terme français standard et ajoute une note explicative.
|
3. **Pour les termes historiques** (ex. : "powiat"), utilise le terme français standard et ajoute une note explicative.
|
||||||
4. **Conserve les citations** russe/allemand/polonais intégrés au texte (mais ajoute une note de fin de paragraphe entre [ ] en la traduisant et en précisant la langue d'origine.
|
4. **Conserve les citations** russe/allemand/polonais intégrés au texte (mais ajoute une note de fin de paragraphe entre [ ] en la traduisant et en précisant la langue d'origine.
|
||||||
5. **Structure** : Garde les sauts de ligne et la mise en page originale.
|
5. **Structure** : Garde les sauts de ligne et la mise en page originale.
|
||||||
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique existe (ex. : "[Note : le context]").
|
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique exist.
|
||||||
"""
|
"""
|
||||||
1674
Traduction/TaniaBorecMemoir(Ukr) (FR) V1.pdf
Normal file
1674
Traduction/TaniaBorecMemoir(Ukr) (FR) V1.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1169
Traduction/TaniaBorecMemoir(Ukr) (FR).txt
Normal file
1169
Traduction/TaniaBorecMemoir(Ukr) (FR).txt
Normal file
File diff suppressed because it is too large
Load Diff
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V1.pdf
Normal file
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V1.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V2.pdf
Normal file
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V2.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1623
Traduction/TaniaBorecMemoir(Ukr) (FR)_V3.pdf
Normal file
1623
Traduction/TaniaBorecMemoir(Ukr) (FR)_V3.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1032
Traduction/TaniaBorecMemoir(Ukr)(FR)_V1.txt
Normal file
1032
Traduction/TaniaBorecMemoir(Ukr)(FR)_V1.txt
Normal file
File diff suppressed because it is too large
Load Diff
1038
Traduction/TaniaBorecMemoir(Ukr)(FR)_V2.txt
Normal file
1038
Traduction/TaniaBorecMemoir(Ukr)(FR)_V2.txt
Normal file
File diff suppressed because it is too large
Load Diff
1030
Traduction/TaniaBorecMemoir(Ukr)(FR)_V3.txt
Normal file
1030
Traduction/TaniaBorecMemoir(Ukr)(FR)_V3.txt
Normal file
File diff suppressed because it is too large
Load Diff
1664
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.pdf
Normal file
1664
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1119
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.txt
Normal file
1119
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.txt
Normal file
File diff suppressed because it is too large
Load Diff
1628
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.pdf
Normal file
1628
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1107
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.txt
Normal file
1107
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.txt
Normal file
File diff suppressed because it is too large
Load Diff
1704
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.pdf
Normal file
1704
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1209
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.txt
Normal file
1209
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.txt
Normal file
File diff suppressed because it is too large
Load Diff
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.pdf
Normal file
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.txt
Normal file
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.txt
Normal file
File diff suppressed because it is too large
Load Diff
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.pdf
Normal file
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.txt
Normal file
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -3,23 +3,23 @@ import requests
|
|||||||
import json
|
import json
|
||||||
from reportlab.lib.pagesizes import letter
|
from reportlab.lib.pagesizes import letter
|
||||||
from reportlab.lib.units import inch
|
from reportlab.lib.units import inch
|
||||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Flowable
|
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
||||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||||
from reportlab.lib.enums import TA_JUSTIFY
|
from reportlab.lib.enums import TA_JUSTIFY
|
||||||
from reportlab.pdfbase import pdfmetrics
|
from reportlab.pdfbase import pdfmetrics
|
||||||
from reportlab.pdfbase.ttfonts import TTFont
|
from reportlab.pdfbase.ttfonts import TTFont
|
||||||
import os
|
import os, time
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
DEBUG = True
|
DEBUG = True
|
||||||
PDF_PATH = "Traduction\TaniaBorecMemoir(Ukr).pdf"
|
PDF_PATH = "Traduction/TaniaBorecMemoir(Ukr).pdf"
|
||||||
OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
|
OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
|
||||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||||
TARGET_LANGUAGE = "français"
|
TARGET_LANGUAGE = "français"
|
||||||
CHECKPOINT_FILE = "Traduction\checkpoint.json"
|
CHECKPOINT_FILE = "Traduction/checkpoint.json"
|
||||||
TEMP_OUTPUT_TXT = "Traduction\output_temp.txt"
|
TEMP_OUTPUT_TXT = "Traduction/output_temp.txt"
|
||||||
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V8.pdf")
|
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.pdf")
|
||||||
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V8.txt")
|
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.txt")
|
||||||
|
|
||||||
DEBUG = True
|
DEBUG = True
|
||||||
|
|
||||||
@ -341,6 +341,7 @@ def main():
|
|||||||
print(f"Batches manquants détectés : {missing_batches}")
|
print(f"Batches manquants détectés : {missing_batches}")
|
||||||
|
|
||||||
# Traduction des paragraphes manquants
|
# Traduction des paragraphes manquants
|
||||||
|
temps_cumule = 0.0
|
||||||
for i in missing_batches:
|
for i in missing_batches:
|
||||||
batch = paragraphs[i:i + batch_size]
|
batch = paragraphs[i:i + batch_size]
|
||||||
paragraph_cumul = "\n".join(batch)
|
paragraph_cumul = "\n".join(batch)
|
||||||
@ -348,13 +349,24 @@ def main():
|
|||||||
print(f"{15 * '-'} Traduction des paragraphes manquants {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
print(f"{15 * '-'} Traduction des paragraphes manquants {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
debut_chrono = time.time()
|
||||||
result = send_to_ollama(paragraph_cumul)
|
result = send_to_ollama(paragraph_cumul)
|
||||||
|
fin_chrono = time.time()
|
||||||
|
temps_paragraphe = fin_chrono - debut_chrono
|
||||||
|
temps_cumule += temps_paragraphe
|
||||||
|
|
||||||
|
# Conversion en minutes et secondes
|
||||||
|
minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
|
||||||
|
minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
|
||||||
|
|
||||||
print(f"{result}")
|
print(f"{result}")
|
||||||
results[str(i)] = result
|
results[str(i)] = result
|
||||||
save_checkpoint(len(paragraphs), results) # Met à jour le dernier indice du batch
|
save_checkpoint(len(paragraphs), results) # Met à jour le dernier indice du batch
|
||||||
save_temp_results(results)
|
save_temp_results(results)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Erreur lors de la traduction du paragraphe {i}: {e}")
|
print(f"Erreur lors de la traduction du paragraphe {i}: {e}")
|
||||||
|
print(f" Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
|
||||||
|
print(f" Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")
|
||||||
|
|
||||||
# Traitement des paragraphes suivants
|
# Traitement des paragraphes suivants
|
||||||
for i in range(last_index + 1, len(paragraphs), batch_size):
|
for i in range(last_index + 1, len(paragraphs), batch_size):
|
||||||
@ -364,7 +376,16 @@ def main():
|
|||||||
print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
debut_chrono = time.time()
|
||||||
result = send_to_ollama(paragraph_cumul)
|
result = send_to_ollama(paragraph_cumul)
|
||||||
|
fin_chrono = time.time()
|
||||||
|
temps_paragraphe = fin_chrono - debut_chrono
|
||||||
|
temps_cumule += temps_paragraphe
|
||||||
|
|
||||||
|
# Conversion en minutes et secondes
|
||||||
|
minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
|
||||||
|
minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
|
||||||
|
|
||||||
print(f"{result}")
|
print(f"{result}")
|
||||||
results[str(i)] = result
|
results[str(i)] = result
|
||||||
save_checkpoint(i + batch_size - 1, results)
|
save_checkpoint(i + batch_size - 1, results)
|
||||||
@ -372,6 +393,9 @@ def main():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Erreur : {e}")
|
print(f"Erreur : {e}")
|
||||||
continue
|
continue
|
||||||
|
print(f" Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
|
||||||
|
print(f" Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")
|
||||||
|
|
||||||
|
|
||||||
save_temp_results(results)
|
save_temp_results(results)
|
||||||
create_pdf_from_results(results, FINAL_OUTPUT_PDF)
|
create_pdf_from_results(results, FINAL_OUTPUT_PDF)
|
||||||
|
|||||||
@ -14,3 +14,4 @@ peft
|
|||||||
bitsandbytes
|
bitsandbytes
|
||||||
accelerate
|
accelerate
|
||||||
trl
|
trl
|
||||||
|
nltk
|
||||||
2
run.bat
2
run.bat
@ -17,6 +17,7 @@ REM Activer l'environnement virtuel Python
|
|||||||
call %VENV_PATH%\Scripts\activate.bat
|
call %VENV_PATH%\Scripts\activate.bat
|
||||||
|
|
||||||
REM Lancer la compilation du modèle LLM pour Ollama
|
REM Lancer la compilation du modèle LLM pour Ollama
|
||||||
|
echo Compilation du modèle LLM pour Ollama
|
||||||
ollama create traductionUkrainienVersFrancais -f .\Traduction\Modelfile
|
ollama create traductionUkrainienVersFrancais -f .\Traduction\Modelfile
|
||||||
|
|
||||||
:: 1. Vérifie si le processus ollama.exe est en cours d'exécution
|
:: 1. Vérifie si le processus ollama.exe est en cours d'exécution
|
||||||
@ -39,6 +40,7 @@ if %ERRORLEVEL% neq 0 (
|
|||||||
)
|
)
|
||||||
|
|
||||||
REM Exécuter le script principal
|
REM Exécuter le script principal
|
||||||
|
echo Lancement du script principal de traduction
|
||||||
python %MAIN_SCRIPT_PATH%
|
python %MAIN_SCRIPT_PATH%
|
||||||
|
|
||||||
endlocal
|
endlocal
|
||||||
Loading…
x
Reference in New Issue
Block a user