Compare commits
No commits in common. "51e114b1ee91f88067c23c6343b0c238eede8a90" and "182e6e7a9803a4dbac084505acc2a9b1e4b52c1a" have entirely different histories.
51e114b1ee
...
182e6e7a98
2
Finetunning/.gitignore
vendored
2
Finetunning/.gitignore
vendored
@ -1,2 +0,0 @@
|
|||||||
# Les modèles générés
|
|
||||||
qwen2.5*/
|
|
||||||
|
Can't render this file because it is too large.
|
@ -1,144 +0,0 @@
|
|||||||
import json
|
|
||||||
import unicodedata
|
|
||||||
import re
|
|
||||||
from collections import OrderedDict
|
|
||||||
|
|
||||||
# ----------------------------
# Configuration
# ----------------------------
# Input/output paths for the JSONL translation-pair dataset.
INPUT_FILE = "paires.json"
OUTPUT_FILE = "paires_clean.json"

# Filtering thresholds: per-side token-count bounds and the minimum
# composite quality score a pair must reach to be kept.
MIN_TOKENS = 5
MAX_TOKENS = 200
MIN_QUALITY_SCORE = 0.60

print("=== Dataset cleaning + quality scoring started ===")
|
|
||||||
|
|
||||||
# ----------------------------
|
|
||||||
# Normalization helpers
|
|
||||||
# ----------------------------
|
|
||||||
def normalize_text(text: str) -> str:
    """Return *text* NFKC-normalized, whitespace-collapsed, with ASCII quotes."""
    # Unicode compatibility normalization first, then squeeze every run of
    # whitespace down to a single space and trim the ends.
    normalized = unicodedata.normalize("NFKC", text)
    normalized = re.sub(r"\s+", " ", normalized).strip()
    # Map typographic quotes to their plain ASCII equivalents.
    for fancy, plain in (("’", "'"), ("‘", "'"), ("“", '"'), ("”", '"')):
        normalized = normalized.replace(fancy, plain)
    return normalized
|
|
||||||
|
|
||||||
|
|
||||||
def token_count(text: str) -> int:
    """Count whitespace-separated tokens in *text*."""
    tokens = text.split()
    return len(tokens)
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------
|
|
||||||
# Quality scoring
|
|
||||||
# ----------------------------
|
|
||||||
def length_ratio_score(src_len, tgt_len):
    """Score how plausible the target/source token-length ratio is.

    The ideal FR/UK ratio is roughly 0.9-1.3. Ratios outside [0.5, 2.0]
    score 0.0, ratios inside [0.75, 1.5] score 1.0, and everything else
    decays linearly with distance from 1.1 (clamped at 0.0).
    """
    # Guard against division by zero on an empty source.
    ratio = tgt_len / max(src_len, 1)
    if not (0.5 <= ratio <= 2.0):
        return 0.0
    if 0.75 <= ratio <= 1.5:
        return 1.0
    return max(0.0, 1.0 - abs(ratio - 1.1))
|
|
||||||
|
|
||||||
|
|
||||||
def lexical_density_score(text):
    """Score lexical variety to penalize very repetitive or trivial strings.

    Returns the type/token ratio scaled by 1.5 and capped at 1.0, or 0.0
    for an empty / whitespace-only string.
    """
    words = text.split()
    if not words:
        return 0.0
    distinct_fraction = len(set(words)) / len(words)
    return min(1.0, distinct_fraction * 1.5)
|
|
||||||
|
|
||||||
|
|
||||||
def quality_score(src, tgt):
    """Composite pair quality: 0.7 * length-ratio score + 0.3 * lexical density."""
    ratio_component = length_ratio_score(token_count(src), token_count(tgt))
    density_component = lexical_density_score(tgt)
    return 0.7 * ratio_component + 0.3 * density_component
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------
# Load + clean + score
# ----------------------------
# Kept pairs, keyed by normalized source text; insertion order is preserved
# so the output file keeps the input ordering of first occurrences.
unique_sources = OrderedDict()

# Running tally of how many lines were processed and why lines were dropped.
stats = {
    "total": 0,
    "removed_length": 0,
    "removed_duplicates": 0,
    "removed_quality": 0,
}

with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at EOF) instead of
        # crashing in json.loads on an empty string.
        if not line.strip():
            continue
        stats["total"] += 1
        item = json.loads(line)

        src = normalize_text(item["text"])
        tgt = normalize_text(item["translation"])

        src_len = token_count(src)
        tgt_len = token_count(tgt)

        # Length filtering: both sides must fall inside the token bounds.
        if not (MIN_TOKENS <= src_len <= MAX_TOKENS
                and MIN_TOKENS <= tgt_len <= MAX_TOKENS):
            stats["removed_length"] += 1
            continue

        # Deduplication on the normalized source sentence (first one wins).
        if src in unique_sources:
            stats["removed_duplicates"] += 1
            continue

        # Composite quality gate (length ratio + lexical density).
        q_score = quality_score(src, tgt)
        if q_score < MIN_QUALITY_SCORE:
            stats["removed_quality"] += 1
            continue

        unique_sources[src] = {
            "translation": tgt,
            "quality_score": round(q_score, 3),
        }
|
|
||||||
|
|
||||||
# ----------------------------
# Report
# ----------------------------
print(f"Total lines processed: {stats['total']}")
print(f"Removed (length): {stats['removed_length']}")
print(f"Removed (duplicates): {stats['removed_duplicates']}")
print(f"Removed (quality): {stats['removed_quality']}")
print(f"Final kept pairs: {len(unique_sources)}")

# ----------------------------
# Save cleaned dataset
# ----------------------------
# One JSON object per line (JSONL); non-ASCII characters written as-is.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for src, data in unique_sources.items():
        record = {
            "text": src,
            "translation": data["translation"],
            "quality_score": data["quality_score"],
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"=== Cleaning completed ===")
print(f"Clean dataset saved to: {OUTPUT_FILE}")
|
|
||||||
@ -1,11 +1,9 @@
|
|||||||
import os
|
|
||||||
import torch
|
import torch
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
from transformers import (
|
from transformers import (
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
AutoModelForCausalLM,
|
AutoModelForCausalLM,
|
||||||
TrainingArguments,
|
TrainingArguments,
|
||||||
BitsAndBytesConfig,
|
|
||||||
)
|
)
|
||||||
from peft import (
|
from peft import (
|
||||||
LoraConfig,
|
LoraConfig,
|
||||||
@ -13,85 +11,50 @@ from peft import (
|
|||||||
prepare_model_for_kbit_training,
|
prepare_model_for_kbit_training,
|
||||||
)
|
)
|
||||||
from trl import SFTTrainer
|
from trl import SFTTrainer
|
||||||
|
import os
|
||||||
# ----------------------------
|
|
||||||
# Environment safety (Windows)
|
|
||||||
# ----------------------------
|
|
||||||
os.environ["TORCHDYNAMO_DISABLE"] = "1"
|
os.environ["TORCHDYNAMO_DISABLE"] = "1"
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# Global configuration
|
# Model configuration
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
|
MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct"
|
||||||
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
|
|
||||||
DATA_FILE = "paires_clean.json"
|
|
||||||
MAX_SEQ_LENGTH = 1024
|
|
||||||
|
|
||||||
print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
|
print("=== Starting fine-tuning script ===")
|
||||||
|
|
||||||
# ----------------------------
|
|
||||||
# [1/7] Tokenizer
|
|
||||||
# ----------------------------
|
|
||||||
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
|
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
MODEL_NAME,
|
MODEL_NAME,
|
||||||
trust_remote_code=True
|
trust_remote_code=True
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Ensure padding token is defined
|
||||||
tokenizer.pad_token = tokenizer.eos_token
|
tokenizer.pad_token = tokenizer.eos_token
|
||||||
tokenizer.model_max_length = MAX_SEQ_LENGTH
|
tokenizer.model_max_length = 1024
|
||||||
|
|
||||||
print("Tokenizer loaded.")
|
print("Tokenizer loaded and configured.")
|
||||||
print(f"Pad token id: {tokenizer.pad_token_id}")
|
|
||||||
print(f"Max sequence length: {tokenizer.model_max_length}")
|
|
||||||
|
|
||||||
# ----------------------------
|
|
||||||
# [2/7] Quantization config (QLoRA)
|
|
||||||
# ----------------------------
|
|
||||||
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (optimized QLoRA)...")
|
|
||||||
|
|
||||||
assert torch.cuda.is_available(), "CUDA GPU not detected!"
|
|
||||||
print(f"Using GPU: {torch.cuda.get_device_name(0)}")
|
|
||||||
|
|
||||||
bnb_config = BitsAndBytesConfig(
|
|
||||||
load_in_4bit=True,
|
|
||||||
bnb_4bit_quant_type="nf4",
|
|
||||||
bnb_4bit_compute_dtype=torch.float16,
|
|
||||||
bnb_4bit_use_double_quant=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
|
||||||
model = AutoModelForCausalLM.from_pretrained(
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
MODEL_NAME,
|
MODEL_NAME,
|
||||||
device_map="cuda", # 🔥 SAFE
|
load_in_4bit=True,
|
||||||
quantization_config=bnb_config,
|
device_map="auto",
|
||||||
low_cpu_mem_usage=True,
|
torch_dtype=torch.float16, # OK for weights
|
||||||
trust_remote_code=True,
|
trust_remote_code=True,
|
||||||
)
|
)
|
||||||
|
print("Model loaded.")
|
||||||
|
|
||||||
print("Model loaded successfully in 4-bit mode on GPU.")
|
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------
|
|
||||||
# [3/7] Prepare model for k-bit training
|
|
||||||
# ----------------------------
|
|
||||||
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
|
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
|
||||||
model = prepare_model_for_kbit_training(model)
|
model = prepare_model_for_kbit_training(model)
|
||||||
|
|
||||||
model.gradient_checkpointing_enable(
|
|
||||||
gradient_checkpointing_kwargs={"use_reentrant": False}
|
|
||||||
)
|
|
||||||
|
|
||||||
print("Model prepared for k-bit training.")
|
print("Model prepared for k-bit training.")
|
||||||
print("Gradient checkpointing enabled (non-reentrant).")
|
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# [4/7] LoRA configuration
|
# LoRA configuration
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
|
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
|
||||||
lora_config = LoraConfig(
|
lora_config = LoraConfig(
|
||||||
r=32,
|
r=16,
|
||||||
lora_alpha=64,
|
lora_alpha=32,
|
||||||
lora_dropout=0.02,
|
lora_dropout=0.05,
|
||||||
bias="none",
|
bias="none",
|
||||||
task_type="CAUSAL_LM",
|
task_type="CAUSAL_LM",
|
||||||
target_modules=[
|
target_modules=[
|
||||||
@ -107,64 +70,57 @@ lora_config = LoraConfig(
|
|||||||
|
|
||||||
model = get_peft_model(model, lora_config)
|
model = get_peft_model(model, lora_config)
|
||||||
model.print_trainable_parameters()
|
model.print_trainable_parameters()
|
||||||
|
print("LoRA adapters attached to the model.")
|
||||||
print("LoRA adapters successfully attached.")
|
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# [5/7] Dataset loading & formatting
|
# Dataset loading
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
|
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
|
||||||
dataset = load_dataset("json", data_files=DATA_FILE)
|
dataset = load_dataset(
|
||||||
|
"json",
|
||||||
|
data_files="traductions.json"
|
||||||
|
)
|
||||||
print(f"Dataset loaded with {len(dataset['train'])} samples.")
|
print(f"Dataset loaded with {len(dataset['train'])} samples.")
|
||||||
|
|
||||||
print("Formatting dataset for Ukrainian → French translation...")
|
print("Formatting dataset for Ukrainian → French translation...")
|
||||||
|
|
||||||
def format_prompt(example):
|
def format_prompt(example):
|
||||||
return {
|
prompt = (
|
||||||
"text": ("<|user|>\n"
|
"Translate the following Ukrainian text into French.\n\n"
|
||||||
"Translate the following Ukrainian text into French.\n"
|
f"Ukrainian: {example['text']}\n"
|
||||||
f"Ukrainian: {example['text']}\n"
|
f"French: {example['translation']}"
|
||||||
"<|assistant|>\n"
|
)
|
||||||
f"{example['translation']}"
|
return {"text": prompt}
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
dataset = dataset.map(
|
|
||||||
format_prompt,
|
|
||||||
remove_columns=dataset["train"].column_names
|
|
||||||
)
|
|
||||||
|
|
||||||
|
dataset = dataset.map(format_prompt, remove_columns=dataset["train"].column_names)
|
||||||
print("Dataset formatting completed.")
|
print("Dataset formatting completed.")
|
||||||
print("Example prompt:\n")
|
|
||||||
print(dataset["train"][0]["text"])
|
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# [6/7] Training arguments
|
# Training arguments
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
|
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
|
||||||
training_args = TrainingArguments(
|
training_args = TrainingArguments(
|
||||||
output_dir=OUTPUT_DIR,
|
output_dir="./qwen-uk-fr-lora",
|
||||||
per_device_train_batch_size=1,
|
per_device_train_batch_size=1,
|
||||||
gradient_accumulation_steps=8,
|
gradient_accumulation_steps=8,
|
||||||
learning_rate=1e-4,
|
learning_rate=2e-4,
|
||||||
num_train_epochs=3,
|
num_train_epochs=3,
|
||||||
|
|
||||||
fp16=False,
|
fp16=False,
|
||||||
bf16=False,
|
bf16=False,
|
||||||
optim="paged_adamw_32bit",
|
|
||||||
logging_steps=10,
|
logging_steps=10,
|
||||||
save_steps=500,
|
save_steps=500,
|
||||||
save_total_limit=2,
|
save_total_limit=2,
|
||||||
|
|
||||||
|
# Use 32-bit optimizer
|
||||||
|
optim="paged_adamw_32bit",
|
||||||
|
|
||||||
report_to="none",
|
report_to="none",
|
||||||
dataloader_pin_memory=False,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
print("Training arguments ready.")
|
print("Training arguments ready.")
|
||||||
print(f"Output directory: {OUTPUT_DIR}")
|
|
||||||
print(f"Epochs: {training_args.num_train_epochs}")
|
|
||||||
print(f"Effective batch size: "
|
|
||||||
f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# Trainer
|
# Trainer
|
||||||
@ -179,34 +135,18 @@ trainer = SFTTrainer(
|
|||||||
print("Trainer initialized.")
|
print("Trainer initialized.")
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# [7/7] Training
|
# Train
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\n[7/7] Starting training...")
|
print(f"{80 * '_'}\n[7/7] Starting training...")
|
||||||
checkpoint_exists = any(
|
trainer.train()
|
||||||
d.startswith("checkpoint-")
|
|
||||||
for d in os.listdir(OUTPUT_DIR)
|
|
||||||
) if os.path.exists(OUTPUT_DIR) else False
|
|
||||||
|
|
||||||
if checkpoint_exists:
|
|
||||||
print("Checkpoint found → resuming training")
|
|
||||||
train_output = trainer.train(resume_from_checkpoint=True)
|
|
||||||
else:
|
|
||||||
print("No checkpoint found → starting fresh training")
|
|
||||||
train_output = trainer.train()
|
|
||||||
|
|
||||||
|
|
||||||
print("\n=== Training summary ===")
|
|
||||||
print(f"Global steps: {train_output.global_step}")
|
|
||||||
print(f"Training loss: {train_output.training_loss}")
|
|
||||||
print(f"Metrics: {train_output.metrics}")
|
|
||||||
print("Training completed successfully.")
|
print("Training completed successfully.")
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# Save LoRA adapter
|
# Save LoRA adapter
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")
|
print("Saving LoRA adapter and tokenizer...")
|
||||||
trainer.model.save_pretrained(OUTPUT_DIR)
|
trainer.model.save_pretrained("./qwen-uk-fr-lora")
|
||||||
tokenizer.save_pretrained(OUTPUT_DIR)
|
tokenizer.save_pretrained("./qwen-uk-fr-lora")
|
||||||
|
|
||||||
print("\n=== Fine-tuning finished ===")
|
print("=== Fine-tuning finished ===")
|
||||||
print(f"LoRA adapter saved in: {OUTPUT_DIR}")
|
print("LoRA adapter saved in ./qwen-uk-fr-lora")
|
||||||
|
|||||||
@ -1,68 +0,0 @@
|
|||||||
import torch
|
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
|
||||||
from peft import PeftModel
|
|
||||||
|
|
||||||
# ----------------------------
# Configuration
# ----------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
LORA_DIR = "./qwen2.5-7b-uk-fr-lora"      # adapter directory produced by fine-tuning
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-merged"  # final merged model

DTYPE = torch.float16  # GGUF-friendly dtype
DEVICE = "cpu"         # merge on CPU (stable and safe)

print("=== LoRA merge script started ===")
|
|
||||||
|
|
||||||
# ----------------------------
# Load base model
# ----------------------------
print(f"{80 * '_'}\n[1/4] Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL, torch_dtype=DTYPE, device_map=DEVICE, trust_remote_code=True,
)
print("Base model loaded.")

# ----------------------------
# Load tokenizer
# ----------------------------
print(f"{80 * '_'}\n[2/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
# Use EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer loaded.")

# ----------------------------
# Load LoRA adapter
# ----------------------------
print(f"{80 * '_'}\n[3/4] Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, LORA_DIR)
print("LoRA adapter loaded.")

# ----------------------------
# Merge LoRA into base model
# ----------------------------
print(f"{80 * '_'}\n[4/4] Merging LoRA into base model...")
# Fold the adapter weights into the base weights and drop the PEFT wrappers.
model = model.merge_and_unload()
print("LoRA successfully merged.")

# ----------------------------
# Save merged model
# ----------------------------
print("Saving merged model...")
model.save_pretrained(OUTPUT_DIR, safe_serialization=True)
tokenizer.save_pretrained(OUTPUT_DIR)

print("=== Merge completed successfully ===")
print(f"Merged model saved in: {OUTPUT_DIR}")
|
|
||||||
@ -11,11 +11,13 @@
|
|||||||
{"text": "Як на мене, то наразі помовчу.", "translation": "En ce qui me concerne, je n’ai pour le moment rien à dire."}
|
{"text": "Як на мене, то наразі помовчу.", "translation": "En ce qui me concerne, je n’ai pour le moment rien à dire."}
|
||||||
{"text": "Мій дядько вчора помер від раку шлунку.", "translation": "Mon oncle est mort hier d’un cancer à l’estomac."}
|
{"text": "Мій дядько вчора помер від раку шлунку.", "translation": "Mon oncle est mort hier d’un cancer à l’estomac."}
|
||||||
{"text": "Я не знаю, що ще можна зробити.", "translation": "Je ne sais plus quoi faire."}
|
{"text": "Я не знаю, що ще можна зробити.", "translation": "Je ne sais plus quoi faire."}
|
||||||
|
{"text": "Я навчився жити без неї.", "translation": "J’ai appris à vivre sans elle."}
|
||||||
{"text": "Я навчився жити без неї.", "translation": "J'ai appris à vivre sans elle."}
|
{"text": "Я навчився жити без неї.", "translation": "J'ai appris à vivre sans elle."}
|
||||||
{"text": "Справді?", "translation": "Vraiment ?"}
|
{"text": "Справді?", "translation": "Vraiment ?"}
|
||||||
{"text": "Справді?", "translation": "C'est vrai ?"}
|
{"text": "Справді?", "translation": "C'est vrai ?"}
|
||||||
{"text": "Справді?", "translation": "Vrai ?"}
|
{"text": "Справді?", "translation": "Vrai ?"}
|
||||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J’ai toujours préféré les personnages mystérieux."}
|
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J’ai toujours préféré les personnages mystérieux."}
|
||||||
|
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J'ai toujours préféré les personnages mystérieux."}
|
||||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J'ai toujours plus apprécié les personnages mystérieux."}
|
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J'ai toujours plus apprécié les personnages mystérieux."}
|
||||||
{"text": "Тобі краще поспати.", "translation": "Tu ferais mieux de dormir."}
|
{"text": "Тобі краще поспати.", "translation": "Tu ferais mieux de dormir."}
|
||||||
{"text": "Обдумай це.", "translation": "Penses-y."}
|
{"text": "Обдумай це.", "translation": "Penses-y."}
|
||||||
@ -67,6 +69,7 @@
|
|||||||
{"text": "Ця традиція зародилася в Китаї.", "translation": "Cette tradition est née en Chine."}
|
{"text": "Ця традиція зародилася в Китаї.", "translation": "Cette tradition est née en Chine."}
|
||||||
{"text": "У Японії є дипломатичні стосунки з Китаєм.", "translation": "Le Japon a des relations diplomatiques avec la Chine."}
|
{"text": "У Японії є дипломатичні стосунки з Китаєм.", "translation": "Le Japon a des relations diplomatiques avec la Chine."}
|
||||||
{"text": "Він повернувся з Китаю.", "translation": "Il est revenu de Chine."}
|
{"text": "Він повернувся з Китаю.", "translation": "Il est revenu de Chine."}
|
||||||
|
{"text": "Він повернувся з Китаю.", "translation": "Il est reparti de Chine."}
|
||||||
{"text": "Він повернувся з Китаю.", "translation": "Il revint de Chine."}
|
{"text": "Він повернувся з Китаю.", "translation": "Il revint de Chine."}
|
||||||
{"text": "Він повернувся з Китаю.", "translation": "Il est rentré de Chine."}
|
{"text": "Він повернувся з Китаю.", "translation": "Il est rentré de Chine."}
|
||||||
{"text": "Він написав книжку про порцеляну.", "translation": "Il a écrit un livre sur la porcelaine."}
|
{"text": "Він написав книжку про порцеляну.", "translation": "Il a écrit un livre sur la porcelaine."}
|
||||||
@ -122,6 +125,7 @@
|
|||||||
{"text": "Я кохаю тебе.", "translation": "Je t'aime !"}
|
{"text": "Я кохаю тебе.", "translation": "Je t'aime !"}
|
||||||
{"text": "З днем народження!", "translation": "Bon anniversaire !"}
|
{"text": "З днем народження!", "translation": "Bon anniversaire !"}
|
||||||
{"text": "З днем народження!", "translation": "Joyeux anniversaire !"}
|
{"text": "З днем народження!", "translation": "Joyeux anniversaire !"}
|
||||||
|
{"text": "З днем народження!", "translation": "Joyeux anniversaire."}
|
||||||
{"text": "Кожному своє.", "translation": "À chacun son goût."}
|
{"text": "Кожному своє.", "translation": "À chacun son goût."}
|
||||||
{"text": "Кожному своє.", "translation": "Chacun son truc."}
|
{"text": "Кожному своє.", "translation": "Chacun son truc."}
|
||||||
{"text": "Скільки це коштує?", "translation": "Ça coûte combien ?"}
|
{"text": "Скільки це коштує?", "translation": "Ça coûte combien ?"}
|
||||||
@ -167,6 +171,7 @@
|
|||||||
{"text": "В інтернеті мало сайтів татарською мовою.", "translation": "Il y a peu de sites en langue Tatar sur Internet."}
|
{"text": "В інтернеті мало сайтів татарською мовою.", "translation": "Il y a peu de sites en langue Tatar sur Internet."}
|
||||||
{"text": "Удачі на іспиті!", "translation": "Bonne chance pour ton examen !"}
|
{"text": "Удачі на іспиті!", "translation": "Bonne chance pour ton examen !"}
|
||||||
{"text": "Де ти живеш?", "translation": "Où habites-tu ?"}
|
{"text": "Де ти живеш?", "translation": "Où habites-tu ?"}
|
||||||
|
{"text": "Де ти живеш?", "translation": "Où résides-tu ?"}
|
||||||
{"text": "Де ти живеш?", "translation": "Tu habites où ?"}
|
{"text": "Де ти живеш?", "translation": "Tu habites où ?"}
|
||||||
{"text": "Де ти живеш?", "translation": "Où demeures-tu ?"}
|
{"text": "Де ти живеш?", "translation": "Où demeures-tu ?"}
|
||||||
{"text": "Де ти живеш?", "translation": "Où vis-tu ?"}
|
{"text": "Де ти живеш?", "translation": "Où vis-tu ?"}
|
||||||
@ -2,7 +2,7 @@ import json
|
|||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
|
|
||||||
# Chemin vers ton fichier d'entrée et de sortie
|
# Chemin vers ton fichier d'entrée et de sortie
|
||||||
input_file = "Paires-de-phrases-en-ukrainien-francais-2026-01-06.tsv" # Remplace par ton chemin
|
input_file = "Paires de phrases en ukrainien-français - 2026-01-06.tsv" # Remplace par ton chemin
|
||||||
output_file = "paires.json" # Fichier de sortie
|
output_file = "paires.json" # Fichier de sortie
|
||||||
|
|
||||||
# Dictionnaire pour stocker les paires uniques (clé = phrase ukrainienne, valeur = liste de traductions)
|
# Dictionnaire pour stocker les paires uniques (clé = phrase ukrainienne, valeur = liste de traductions)
|
||||||
|
|||||||
@ -1,30 +0,0 @@
|
|||||||
{"text": "Як би ти не намагався, ти не вивчиш англійську за два-три місяці.", "translation": "Quels que soient tes efforts, tu ne pourras pas apprendre l’anglais en deux-trois mois."}
|
|
||||||
{"text": "Поки я не подзвонив, він не прийшов.", "translation": "Il n’est pas venu avant que je ne l’appelle."}
|
|
||||||
{"text": "У всесвіті багато галактик.", "translation": "Il y a beaucoup de galaxies dans l'univers."}
|
|
||||||
{"text": "Вона приймає душ щоранку.", "translation": "Elle prend une douche chaque matin."}
|
|
||||||
{"text": "У Майка є декілька друзів у Флориді.", "translation": "Mike a quelques amis en Floride."}
|
|
||||||
{"text": "Я зустрінуся з тобою в неділю о третій.", "translation": "On se voit dimanche à trois heures."}
|
|
||||||
{"text": "Я сказав собі: «Це гарна ідея».", "translation": "Je me suis dit : « C’est une bonne idée. »"}
|
|
||||||
{"text": "Ми збиралися пробути там біля двох тижнів.", "translation": "Nous avions l’intention de rester là près de deux semaines."}
|
|
||||||
{"text": "Я чищу зуби двічі на день.", "translation": "Je me brosse les dents deux fois par jour."}
|
|
||||||
{"text": "Він ніжно поклав руку на її плече.", "translation": "Il posa la main gentiment sur son épaule."}
|
|
||||||
{"text": "Сьогодні жахливо холодно.", "translation": "Il fait horriblement froid aujourd'hui."}
|
|
||||||
{"text": "У цю суму включено податки.", "translation": "Cette somme inclut les taxes."}
|
|
||||||
{"text": "Ця школа була заснована в 1650 році.", "translation": "Cette école fut fondée en 1650."}
|
|
||||||
{"text": "Я випадково знайшов цей ресторан.", "translation": "J'ai trouvé ce restaurant par hasard."}
|
|
||||||
{"text": "Я не хотів нікого образити.", "translation": "Je ne voulais vexer personne."}
|
|
||||||
{"text": "Цей сад найкраще виглядає весною.", "translation": "Ce parc est plus joli au printemps."}
|
|
||||||
{"text": "Цей сир виготовлено з овечого молока.", "translation": "Ce fromage est fait avec du lait de chèvre."}
|
|
||||||
{"text": "Він спить як немовля.", "translation": "Il dort comme un bébé."}
|
|
||||||
{"text": "Гора вкрита снігом.", "translation": "La montagne est recouverte de neige."}
|
|
||||||
{"text": "Я попав під дощ і промок.", "translation": "J’ai été pris sous la pluie, et suis tout trempé."}
|
|
||||||
{"text": "Прошу, дайте мені ще один шанс.", "translation": "Je vous en prie, donnez-moi encore une chance."}
|
|
||||||
{"text": "Я все сказав.", "translation": "J’ai tout dit."}
|
|
||||||
{"text": "Не забувай нас!", "translation": "Ne nous oublie pas !"}
|
|
||||||
{"text": "Випало багато снігу.", "translation": "Beaucoup de neige est tombée."}
|
|
||||||
{"text": "Йде сніг.", "translation": "Il est en train de neiger."}
|
|
||||||
{"text": "Може піти сніг.", "translation": "Il neigera peut-être."}
|
|
||||||
{"text": "У нас у січні йде сніг.", "translation": "Chez nous, il neige en janvier."}
|
|
||||||
{"text": "Сніг розтав.", "translation": "La neige a fondu."}
|
|
||||||
{"text": "Наша компанія планує побудувати новий хімічний завод у Росії.", "translation": "Notre entreprise a le projet de construire une nouvelle usine chimique en Russie."}
|
|
||||||
{"text": "Франція воювала з Росією.", "translation": "La France fut en guerre avec la Russie."}
|
|
||||||
@ -1,170 +0,0 @@
|
|||||||
import torch
|
|
||||||
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
|
|
||||||
from peft import PeftModel
|
|
||||||
from datasets import load_dataset
|
|
||||||
from nltk.translate.bleu_score import corpus_bleu
|
|
||||||
|
|
||||||
# ----------------------------
|
|
||||||
# Configuration
|
|
||||||
# ----------------------------
|
|
||||||
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct" # base model
|
|
||||||
LORA_DIR = "./qwen2.5-7b-uk-fr-lora" # fine-tuned LoRA
|
|
||||||
VALIDATION_FILE = "validation.jsonl" # small validation subset
|
|
||||||
MAX_INPUT_LENGTH = 1024
|
|
||||||
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
|
||||||
|
|
||||||
# Liste des prompts à tester
|
|
||||||
PROMPTS_TO_TEST = [
|
|
||||||
{
|
|
||||||
"name": "Prompt de base",
|
|
||||||
"prompt": "Traduis la phrase ukrainienne suivante en français: {text}"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Prompt spécialisé mémoires",
|
|
||||||
"prompt": (
|
|
||||||
"Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.\n"
|
|
||||||
"- Garde le style narratif et les tournures orales de l'auteur.\n"
|
|
||||||
"- Respecte les règles de traduction suivantes :\n\n"
|
|
||||||
"Règles strictes :\n"
|
|
||||||
"1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).\n"
|
|
||||||
"2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.\n\n"
|
|
||||||
"Voici la phrase à traduire :\nUkrainien : {text}\nFrançais :"
|
|
||||||
)
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Prompt détaillé",
|
|
||||||
"prompt": (
|
|
||||||
"Tu es un expert en traduction littéraire spécialisé dans les textes historiques ukrainiens.\n"
|
|
||||||
"Règles à suivre absolument :\n"
|
|
||||||
"1. Conserve tous les noms propres et toponymes dans leur forme originale\n"
|
|
||||||
"2. Préserve le style et le registre de l'auteur original\n"
|
|
||||||
"3. Ajoute des notes entre crochets pour expliquer les références culturelles si nécessaire\n"
|
|
||||||
"4. Traduis de manière naturelle en français tout en restant fidèle au texte source\n\n"
|
|
||||||
"Texte à traduire :\nUkrainien : {text}\nTraduction française :"
|
|
||||||
)
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "Prompt minimaliste",
|
|
||||||
"prompt": "Traduction fidèle de l'ukrainien vers le français : {text}"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
print("=== Loading tokenizer and model ===")
|
|
||||||
|
|
||||||
# ----------------------------
|
|
||||||
# Load tokenizer
|
|
||||||
# ----------------------------
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
|
||||||
BASE_MODEL,
|
|
||||||
trust_remote_code=True
|
|
||||||
)
|
|
||||||
tokenizer.pad_token = tokenizer.eos_token
|
|
||||||
tokenizer.model_max_length = MAX_INPUT_LENGTH
|
|
||||||
|
|
||||||
# ----------------------------
|
|
||||||
# Load base model directly on GPU
|
|
||||||
# ----------------------------
|
|
||||||
print(f"{80 * '_'}\nLoading base model on GPU...")
|
|
||||||
base_model = AutoModelForCausalLM.from_pretrained(
|
|
||||||
BASE_MODEL,
|
|
||||||
torch_dtype=torch.float16,
|
|
||||||
device_map={"": 0}, # all on GPU
|
|
||||||
trust_remote_code=True
|
|
||||||
)
|
|
||||||
|
|
||||||
# ----------------------------
|
|
||||||
# Apply LoRA adapter
|
|
||||||
# ----------------------------
|
|
||||||
print(f"{80 * '_'}\nApplying LoRA adapter...")
|
|
||||||
model = PeftModel.from_pretrained(base_model, LORA_DIR)
|
|
||||||
model.eval()
|
|
||||||
model.to(DEVICE) # ensure everything on GPU
|
|
||||||
print("Model ready for validation.")
|
|
||||||
|
|
||||||
# ----------------------------
|
|
||||||
# Load validation dataset
|
|
||||||
# ----------------------------
|
|
||||||
print(f"{80 * '_'}\nLoading validation dataset...")
|
|
||||||
dataset = load_dataset("json", data_files=VALIDATION_FILE)
|
|
||||||
examples = dataset["train"]
|
|
||||||
print(f"{len(examples)} examples loaded for testing.")
|
|
||||||
|
|
||||||
# ----------------------------
# Translation function
# ----------------------------
@torch.inference_mode()
def translate(text, prompt_template):
    """Translate Ukrainian ``text`` to French with the LoRA-tuned model.

    Args:
        text: Source sentence/paragraph in Ukrainian.
        prompt_template: Template string containing a ``{text}`` placeholder.

    Returns:
        The generated French translation, stripped of the prompt and of any
        re-emitted label markers.
    """
    prompt = prompt_template.format(text=text)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_LENGTH
    ).to(DEVICE)

    # Use a GenerationConfig (instead of loose generate() kwargs) to avoid
    # transformers deprecation warnings.
    generation_config = GenerationConfig.from_model_config(model.config)
    generation_config.max_new_tokens = 256
    generation_config.do_sample = False  # deterministic decoding for evaluation

    outputs = model.generate(
        **inputs,
        generation_config=generation_config
    )

    # Decode ONLY the newly generated tokens. The previous approach decoded
    # prompt + generation and then split the string on the source text, which
    # silently returned the wrong span when the prompt had been truncated or
    # the source text did not reappear verbatim in the decode.
    prompt_length = inputs["input_ids"].shape[1]
    result = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    # Safety net: some prompts make the model re-emit a label before the
    # translation; keep only what follows the last known marker.
    for marker in ("Français :", "Traduction française :"):
        if marker in result:
            result = result.split(marker)[-1]
            break

    return result.strip()
|
|
||||||
|
|
||||||
# ----------------------------
# Evaluate all prompts and select best BLEU
# ----------------------------
best_bleu = 0.0
best_prompt = None
all_results = {}

print(f"{80 * '_'}\nTesting all prompts and computing BLEU scores...")

for prompt_config in PROMPTS_TO_TEST:
    print(f"\n{80 * '='}\nTesting prompt: {prompt_config['name']}\n{80 * '='}")
    references = []   # one [reference-token-list] per example (corpus_bleu shape)
    hypotheses = []   # model-output tokens per example

    for i, example in enumerate(examples):
        src_text = example["text"]
        ref_text = example["translation"]
        pred_text = translate(src_text, prompt_config["prompt"])

        print(f"\n[{i+1}] Source: {src_text}")
        print(f"  Reference: {ref_text}")
        print(f"  Prediction: {pred_text}")

        references.append([ref_text.split()])
        hypotheses.append(pred_text.split())

    bleu_score = corpus_bleu(references, hypotheses) * 100
    print(f"\n=== Corpus BLEU score for '{prompt_config['name']}': {bleu_score:.4f} ===")

    all_results[prompt_config["name"]] = bleu_score

    if bleu_score > best_bleu:
        best_bleu = bleu_score
        best_prompt = prompt_config

# ----------------------------
# Display results
# ----------------------------
print(f"\n{80 * '='}\nFINAL RESULTS\n{80 * '='}")
for prompt_name, score in all_results.items():
    print(f"{prompt_name}: {score:.4f}")

# Guard against best_prompt being None (empty prompt list, or every BLEU
# score equal to 0): the unconditional subscript would raise a TypeError.
if best_prompt is not None:
    print(f"\nBEST PROMPT: {best_prompt['name']} with BLEU score: {best_bleu:.4f}")
    print(f"Prompt content:\n{best_prompt['prompt']}")
else:
    print("\nNo best prompt selected (no prompts tested or all BLEU scores were 0).")
|
|
||||||
70
README.md
70
README.md
@ -80,73 +80,3 @@ Vous pouvez modifier les paramètres suivants dans `main.py` :
|
|||||||
- `OUTPUT_PDF_PATH` : Chemin et nom du fichier PDF de sortie (généré automatiquement)
|
- `OUTPUT_PDF_PATH` : Chemin et nom du fichier PDF de sortie (généré automatiquement)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Fine-tuning
|
|
||||||
Le fine-tuning permet d'avoir une meilleure traduction. C'est un processus long en temps de calcul, mais il permet une traduction plus précise.
|
|
||||||
|
|
||||||
Le principe est le suivant :
|
|
||||||
|
|
||||||
```
|
|
||||||
1️⃣ Dataset d’entraînement (pairs.json)
|
|
||||||
↓
|
|
||||||
1️⃣ Dataset nettoyé ( cleanDataSet.py -> pairs_clean.json)
|
|
||||||
↓
|
|
||||||
2️⃣ Fine-tuning LoRA (finetuning.py)
|
|
||||||
↓
|
|
||||||
3️⃣ Validation / Évaluation BLEU (validation.py)
|
|
||||||
↓
|
|
||||||
4️⃣ Merge LoRA + modèle de base (mergeLora.py)
|
|
||||||
↓
|
|
||||||
5️⃣ Conversion en GGUF ()
|
|
||||||
↓
|
|
||||||
6️⃣ Ollama (inférence finale)
|
|
||||||
|
|
||||||
```
|
|
||||||
|
|
||||||
### Nettoyage du dataset
|
|
||||||
Executer le script ```python cleanDataSet.py```
|
|
||||||
|
|
||||||
### Validation
|
|
||||||
Executer le script ```python validation.py```
|
|
||||||
|
|
||||||
Le script teste plusieurs prompts et renvoie celui avec le meilleur score BLEU.
|
|
||||||
|
|
||||||
Il faut ensuite copier ce prompt dans le fichier ModelFile.
|
|
||||||
|
|
||||||
### Merge
|
|
||||||
Executer le script ```python mergeLora.py```
|
|
||||||
|
|
||||||
### Conversion en GGUF
|
|
||||||
En étant à la racine du projet (et toujours dans le venv), cloner le projet llama.cpp
|
|
||||||
```bash
|
|
||||||
git clone https://github.com/ggerganov/llama.cpp
|
|
||||||
cd llama.cpp
|
|
||||||
pip install -r requirements.txt
|
|
||||||
```
|
|
||||||
|
|
||||||
Et lancer la commande (/!\ ça prend environ 10 minutes) :
|
|
||||||
```bash
|
|
||||||
python convert_hf_to_gguf.py ../Finetunning/qwen2.5-7b-uk-fr-merged --outfile qwen2.5-7b-uk-fr.gguf --outtype q8_0
|
|
||||||
```
|
|
||||||
|
|
||||||
Vérification :
|
|
||||||
```bash
|
|
||||||
./main -m qwen2.5-7b-uk-fr.gguf -p "Translate into French: Привіт світ"
|
|
||||||
```
|
|
||||||
Pour que ce nouveau modèle soit exploitable par ollama, il faut TODO
|
|
||||||
|
|
||||||
## Utilisation du modèle fine-tunné pour la traduction
|
|
||||||
Créer un Modelfile :
|
|
||||||
```
|
|
||||||
FROM ./qwen2.5-7b-uk-fr.gguf
|
|
||||||
|
|
||||||
PARAMETER temperature 0.1
|
|
||||||
PARAMETER top_p 0.95
|
|
||||||
PARAMETER num_ctx 4096
|
|
||||||
|
|
||||||
SYSTEM """
|
|
||||||
You are a professional Ukrainian to French translator.
|
|
||||||
Produce faithful, literal translations.
|
|
||||||
"""
|
|
||||||
|
|
||||||
```
|
|
||||||
@ -3,15 +3,14 @@ PARAMETER temperature 0.2
|
|||||||
PARAMETER num_ctx 8192
|
PARAMETER num_ctx 8192
|
||||||
|
|
||||||
SYSTEM """
|
SYSTEM """
|
||||||
|
|
||||||
Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.
|
Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.
|
||||||
|
- Utilise le glossaire fourni pour les noms de lieux et termes historiques.
|
||||||
- Garde le style narratif et les tournures orales de l'auteur.
|
- Garde le style narratif et les tournures orales de l'auteur.
|
||||||
- Respecte les règles de traduction suivantes :
|
|
||||||
Règles strictes :
|
Règles strictes :
|
||||||
1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).
|
1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).
|
||||||
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.
|
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l’auteur.
|
||||||
3. **Pour les termes historiques** (ex. : "powiat"), utilise le terme français standard et ajoute une note explicative.
|
3. **Pour les termes historiques** (ex. : "powiat"), utilise le terme français standard et ajoute une note explicative.
|
||||||
4. **Conserve les citations** russe/allemand/polonais intégrés au texte (mais ajoute une note de fin de paragraphe entre [ ] en la traduisant et en précisant la langue d'origine.
|
4. **Conserve les citations** russe/allemand/polonais intégrés au texte (mais ajoute une note de fin de paragraphe entre [ ] en la traduisant et en précisant la langue d'origine.
|
||||||
5. **Structure** : Garde les sauts de ligne et la mise en page originale.
|
5. **Structure** : Garde les sauts de ligne et la mise en page originale.
|
||||||
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique existe.
|
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique existe (ex. : "[Note : le context]").
|
||||||
"""
|
"""
|
||||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -3,23 +3,23 @@ import requests
|
|||||||
import json
|
import json
|
||||||
from reportlab.lib.pagesizes import letter
|
from reportlab.lib.pagesizes import letter
|
||||||
from reportlab.lib.units import inch
|
from reportlab.lib.units import inch
|
||||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Flowable
|
||||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||||
from reportlab.lib.enums import TA_JUSTIFY
|
from reportlab.lib.enums import TA_JUSTIFY
|
||||||
from reportlab.pdfbase import pdfmetrics
|
from reportlab.pdfbase import pdfmetrics
|
||||||
from reportlab.pdfbase.ttfonts import TTFont
|
from reportlab.pdfbase.ttfonts import TTFont
|
||||||
import os, time
|
import os
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
DEBUG = True
|
DEBUG = True
|
||||||
PDF_PATH = "Traduction/TaniaBorecMemoir(Ukr).pdf"
|
PDF_PATH = "Traduction\TaniaBorecMemoir(Ukr).pdf"
|
||||||
OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
|
OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
|
||||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||||
TARGET_LANGUAGE = "français"
|
TARGET_LANGUAGE = "français"
|
||||||
CHECKPOINT_FILE = "Traduction/checkpoint.json"
|
CHECKPOINT_FILE = "Traduction\checkpoint.json"
|
||||||
TEMP_OUTPUT_TXT = "Traduction/output_temp.txt"
|
TEMP_OUTPUT_TXT = "Traduction\output_temp.txt"
|
||||||
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.pdf")
|
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V8.pdf")
|
||||||
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.txt")
|
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V8.txt")
|
||||||
|
|
||||||
DEBUG = True
|
DEBUG = True
|
||||||
|
|
||||||
@ -341,7 +341,6 @@ def main():
|
|||||||
print(f"Batches manquants détectés : {missing_batches}")
|
print(f"Batches manquants détectés : {missing_batches}")
|
||||||
|
|
||||||
# Traduction des paragraphes manquants
|
# Traduction des paragraphes manquants
|
||||||
temps_cumule = 0.0
|
|
||||||
for i in missing_batches:
|
for i in missing_batches:
|
||||||
batch = paragraphs[i:i + batch_size]
|
batch = paragraphs[i:i + batch_size]
|
||||||
paragraph_cumul = "\n".join(batch)
|
paragraph_cumul = "\n".join(batch)
|
||||||
@ -349,24 +348,13 @@ def main():
|
|||||||
print(f"{15 * '-'} Traduction des paragraphes manquants {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
print(f"{15 * '-'} Traduction des paragraphes manquants {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
debut_chrono = time.time()
|
|
||||||
result = send_to_ollama(paragraph_cumul)
|
result = send_to_ollama(paragraph_cumul)
|
||||||
fin_chrono = time.time()
|
|
||||||
temps_paragraphe = fin_chrono - debut_chrono
|
|
||||||
temps_cumule += temps_paragraphe
|
|
||||||
|
|
||||||
# Conversion en minutes et secondes
|
|
||||||
minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
|
|
||||||
minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
|
|
||||||
|
|
||||||
print(f"{result}")
|
print(f"{result}")
|
||||||
results[str(i)] = result
|
results[str(i)] = result
|
||||||
save_checkpoint(len(paragraphs), results) # Met à jour le dernier indice du batch
|
save_checkpoint(len(paragraphs), results) # Met à jour le dernier indice du batch
|
||||||
save_temp_results(results)
|
save_temp_results(results)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Erreur lors de la traduction du paragraphe {i}: {e}")
|
print(f"Erreur lors de la traduction du paragraphe {i}: {e}")
|
||||||
print(f" Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
|
|
||||||
print(f" Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")
|
|
||||||
|
|
||||||
# Traitement des paragraphes suivants
|
# Traitement des paragraphes suivants
|
||||||
for i in range(last_index + 1, len(paragraphs), batch_size):
|
for i in range(last_index + 1, len(paragraphs), batch_size):
|
||||||
@ -376,16 +364,7 @@ def main():
|
|||||||
print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
debut_chrono = time.time()
|
|
||||||
result = send_to_ollama(paragraph_cumul)
|
result = send_to_ollama(paragraph_cumul)
|
||||||
fin_chrono = time.time()
|
|
||||||
temps_paragraphe = fin_chrono - debut_chrono
|
|
||||||
temps_cumule += temps_paragraphe
|
|
||||||
|
|
||||||
# Conversion en minutes et secondes
|
|
||||||
minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
|
|
||||||
minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
|
|
||||||
|
|
||||||
print(f"{result}")
|
print(f"{result}")
|
||||||
results[str(i)] = result
|
results[str(i)] = result
|
||||||
save_checkpoint(i + batch_size - 1, results)
|
save_checkpoint(i + batch_size - 1, results)
|
||||||
@ -393,9 +372,6 @@ def main():
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Erreur : {e}")
|
print(f"Erreur : {e}")
|
||||||
continue
|
continue
|
||||||
print(f" Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
|
|
||||||
print(f" Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")
|
|
||||||
|
|
||||||
|
|
||||||
save_temp_results(results)
|
save_temp_results(results)
|
||||||
create_pdf_from_results(results, FINAL_OUTPUT_PDF)
|
create_pdf_from_results(results, FINAL_OUTPUT_PDF)
|
||||||
|
|||||||
@ -14,4 +14,3 @@ peft
|
|||||||
bitsandbytes
|
bitsandbytes
|
||||||
accelerate
|
accelerate
|
||||||
trl
|
trl
|
||||||
nltk
|
|
||||||
2
run.bat
2
run.bat
@ -17,7 +17,6 @@ REM Activer l'environnement virtuel Python
|
|||||||
call %VENV_PATH%\Scripts\activate.bat
|
call %VENV_PATH%\Scripts\activate.bat
|
||||||
|
|
||||||
REM Lancer la compilation du modèle LLM pour Ollama
|
REM Lancer la compilation du modèle LLM pour Ollama
|
||||||
echo Compilation du modèle LLM pour Ollama
|
|
||||||
ollama create traductionUkrainienVersFrancais -f .\Traduction\Modelfile
|
ollama create traductionUkrainienVersFrancais -f .\Traduction\Modelfile
|
||||||
|
|
||||||
:: 1. Vérifie si le processus ollama.exe est en cours d'exécution
|
:: 1. Vérifie si le processus ollama.exe est en cours d'exécution
|
||||||
@ -40,7 +39,6 @@ if %ERRORLEVEL% neq 0 (
|
|||||||
)
|
)
|
||||||
|
|
||||||
REM Exécuter le script principal
|
REM Exécuter le script principal
|
||||||
echo Lancement du script principal de traduction
|
|
||||||
python %MAIN_SCRIPT_PATH%
|
python %MAIN_SCRIPT_PATH%
|
||||||
|
|
||||||
endlocal
|
endlocal
|
||||||
Loading…
x
Reference in New Issue
Block a user