Compare commits
12 Commits
182e6e7a98
...
51e114b1ee
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
51e114b1ee | ||
|
|
fa3ad61dd7 | ||
|
|
70e4932cd0 | ||
|
|
aee2716a41 | ||
|
|
bf7949d8c3 | ||
|
|
a4296d012e | ||
|
|
c5d372e98d | ||
|
|
8d2e5ac021 | ||
|
|
adca297850 | ||
|
|
83b2eccd07 | ||
|
|
8dfb2b81e0 | ||
|
|
4ed1ffa226 |
2
Finetunning/.gitignore
vendored
Normal file
2
Finetunning/.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
||||
# Les modèles générés
|
||||
qwen2.5*/
|
||||
|
Can't render this file because it is too large.
|
144
Finetunning/cleanDataSet.py
Normal file
144
Finetunning/cleanDataSet.py
Normal file
@ -0,0 +1,144 @@
|
||||
import json
|
||||
import unicodedata
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
|
||||
# ----------------------------
# Configuration
# ----------------------------
INPUT_FILE = "paires.json"         # raw sentence pairs, one JSON object per line
OUTPUT_FILE = "paires_clean.json"  # cleaned, deduplicated, scored output

MIN_TOKENS = 5            # reject pairs with fewer tokens on either side
MAX_TOKENS = 200          # reject pairs with more tokens on either side
MIN_QUALITY_SCORE = 0.60  # minimum blended quality score required to keep a pair

print("=== Dataset cleaning + quality scoring started ===")
|
||||
|
||||
# ----------------------------
|
||||
# Normalization helpers
|
||||
# ----------------------------
|
||||
# Maps typographic (curly) quotes to their plain ASCII equivalents.
_QUOTE_TABLE = str.maketrans({"’": "'", "‘": "'", "“": '"', "”": '"'})


def normalize_text(text: str) -> str:
    """Return *text* NFKC-normalized, whitespace-collapsed, with ASCII quotes."""
    normalized = unicodedata.normalize("NFKC", text)
    normalized = re.sub(r"\s+", " ", normalized).strip()
    return normalized.translate(_QUOTE_TABLE)
|
||||
|
||||
|
||||
def token_count(text: str) -> int:
    """Return the number of whitespace-delimited tokens in *text*."""
    tokens = text.split()
    return len(tokens)
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# Quality scoring
|
||||
# ----------------------------
|
||||
def length_ratio_score(src_len, tgt_len):
    """Score how plausible the target/source token-length ratio is.

    The ideal FR/UK ratio is roughly 0.9 – 1.3.  Ratios outside [0.5, 2.0]
    are rejected outright; ratios in [0.75, 1.5] are considered perfect;
    anything in between decays linearly with distance from 1.1.
    """
    ratio = tgt_len / max(src_len, 1)  # guard against a zero-length source

    if not 0.5 <= ratio <= 2.0:
        return 0.0
    if 0.75 <= ratio <= 1.5:
        return 1.0
    return max(0.0, 1.0 - abs(ratio - 1.1))
|
||||
|
||||
|
||||
def lexical_density_score(text):
    """Penalize very repetitive or trivial translations.

    Uses the type/token ratio (distinct tokens over total tokens), boosted
    by 1.5 and capped at 1.0, so moderately varied text still scores 1.0.
    """
    tokens = text.split()
    if not tokens:
        return 0.0
    diversity = len(set(tokens)) / len(tokens)
    return min(1.0, diversity * 1.5)
|
||||
|
||||
|
||||
def quality_score(src, tgt):
    """Blend the pair's quality signals into one score in [0, 1].

    Length-ratio plausibility carries 70% of the weight; lexical density
    of the target carries the remaining 30%.
    """
    l_score = length_ratio_score(token_count(src), token_count(tgt))
    d_score = lexical_density_score(tgt)
    return 0.7 * l_score + 0.3 * d_score
|
||||
|
||||
|
||||
# ----------------------------
# Load + clean + score
# ----------------------------
# Keyed by normalized source sentence; insertion order is preserved so the
# output keeps the input file's ordering.
unique_sources = OrderedDict()

stats = {
    "total": 0,
    "removed_length": 0,
    "removed_duplicates": 0,
    "removed_quality": 0,
}

with open(INPUT_FILE, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at EOF) instead of
        # crashing inside json.loads with a JSONDecodeError.
        if not line.strip():
            continue
        stats["total"] += 1
        item = json.loads(line)

        src = normalize_text(item["text"])
        tgt = normalize_text(item["translation"])

        src_len = token_count(src)
        tgt_len = token_count(tgt)

        # Length filtering: both sides must fall inside [MIN_TOKENS, MAX_TOKENS].
        if not (MIN_TOKENS <= src_len <= MAX_TOKENS) or not (
            MIN_TOKENS <= tgt_len <= MAX_TOKENS
        ):
            stats["removed_length"] += 1
            continue

        # Deduplication on the normalized source sentence: first occurrence wins.
        if src in unique_sources:
            stats["removed_duplicates"] += 1
            continue

        # Quality gate.
        q_score = quality_score(src, tgt)
        if q_score < MIN_QUALITY_SCORE:
            stats["removed_quality"] += 1
            continue

        unique_sources[src] = {
            "translation": tgt,
            "quality_score": round(q_score, 3),
        }

# ----------------------------
# Report
# ----------------------------
print(f"Total lines processed: {stats['total']}")
print(f"Removed (length): {stats['removed_length']}")
print(f"Removed (duplicates): {stats['removed_duplicates']}")
print(f"Removed (quality): {stats['removed_quality']}")
print(f"Final kept pairs: {len(unique_sources)}")

# ----------------------------
# Save cleaned dataset (JSON Lines, one pair per line)
# ----------------------------
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    for src, data in unique_sources.items():
        json.dump(
            {
                "text": src,
                "translation": data["translation"],
                "quality_score": data["quality_score"],
            },
            f,
            ensure_ascii=False,
        )
        f.write("\n")

print("=== Cleaning completed ===")
print(f"Clean dataset saved to: {OUTPUT_FILE}")
|
||||
@ -1,9 +1,11 @@
|
||||
import os
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
AutoModelForCausalLM,
|
||||
TrainingArguments,
|
||||
BitsAndBytesConfig,
|
||||
)
|
||||
from peft import (
|
||||
LoraConfig,
|
||||
@ -11,50 +13,85 @@ from peft import (
|
||||
prepare_model_for_kbit_training,
|
||||
)
|
||||
from trl import SFTTrainer
|
||||
import os
|
||||
|
||||
# ----------------------------
|
||||
# Environment safety (Windows)
|
||||
# ----------------------------
|
||||
os.environ["TORCHDYNAMO_DISABLE"] = "1"
|
||||
|
||||
# ----------------------------
|
||||
# Model configuration
|
||||
# Global configuration
|
||||
# ----------------------------
|
||||
MODEL_NAME = "Qwen/Qwen2.5-14B-Instruct"
|
||||
MODEL_NAME = "Qwen/Qwen2.5-7B-Instruct"
|
||||
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-lora"
|
||||
DATA_FILE = "paires_clean.json"
|
||||
MAX_SEQ_LENGTH = 1024
|
||||
|
||||
print("=== Starting fine-tuning script ===")
|
||||
print(f"\n=== Starting fine-tuning script for {MODEL_NAME} ===\n")
|
||||
|
||||
# ----------------------------
|
||||
# [1/7] Tokenizer
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[1/7] Loading tokenizer...")
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
MODEL_NAME,
|
||||
trust_remote_code=True
|
||||
)
|
||||
|
||||
# Ensure padding token is defined
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
tokenizer.model_max_length = 1024
|
||||
tokenizer.model_max_length = MAX_SEQ_LENGTH
|
||||
|
||||
print("Tokenizer loaded and configured.")
|
||||
|
||||
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_NAME,
|
||||
load_in_4bit=True,
|
||||
device_map="auto",
|
||||
torch_dtype=torch.float16, # OK for weights
|
||||
trust_remote_code=True,
|
||||
)
|
||||
print("Model loaded.")
|
||||
|
||||
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
print("Model prepared for k-bit training.")
|
||||
print("Tokenizer loaded.")
|
||||
print(f"Pad token id: {tokenizer.pad_token_id}")
|
||||
print(f"Max sequence length: {tokenizer.model_max_length}")
|
||||
|
||||
# ----------------------------
|
||||
# LoRA configuration
|
||||
# [2/7] Quantization config (QLoRA)
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (optimized QLoRA)...")
|
||||
|
||||
assert torch.cuda.is_available(), "CUDA GPU not detected!"
|
||||
print(f"Using GPU: {torch.cuda.get_device_name(0)}")
|
||||
|
||||
bnb_config = BitsAndBytesConfig(
|
||||
load_in_4bit=True,
|
||||
bnb_4bit_quant_type="nf4",
|
||||
bnb_4bit_compute_dtype=torch.float16,
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
MODEL_NAME,
|
||||
device_map="cuda", # 🔥 SAFE
|
||||
quantization_config=bnb_config,
|
||||
low_cpu_mem_usage=True,
|
||||
trust_remote_code=True,
|
||||
)
|
||||
|
||||
print("Model loaded successfully in 4-bit mode on GPU.")
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# [3/7] Prepare model for k-bit training
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[3/7] Preparing model for k-bit training...")
|
||||
model = prepare_model_for_kbit_training(model)
|
||||
|
||||
model.gradient_checkpointing_enable(
|
||||
gradient_checkpointing_kwargs={"use_reentrant": False}
|
||||
)
|
||||
|
||||
print("Model prepared for k-bit training.")
|
||||
print("Gradient checkpointing enabled (non-reentrant).")
|
||||
|
||||
# ----------------------------
|
||||
# [4/7] LoRA configuration
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[4/7] Configuring LoRA adapters...")
|
||||
lora_config = LoraConfig(
|
||||
r=16,
|
||||
lora_alpha=32,
|
||||
lora_dropout=0.05,
|
||||
r=32,
|
||||
lora_alpha=64,
|
||||
lora_dropout=0.02,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM",
|
||||
target_modules=[
|
||||
@ -70,57 +107,64 @@ lora_config = LoraConfig(
|
||||
|
||||
model = get_peft_model(model, lora_config)
|
||||
model.print_trainable_parameters()
|
||||
print("LoRA adapters attached to the model.")
|
||||
|
||||
print("LoRA adapters successfully attached.")
|
||||
|
||||
# ----------------------------
|
||||
# Dataset loading
|
||||
# [5/7] Dataset loading & formatting
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
|
||||
dataset = load_dataset(
|
||||
"json",
|
||||
data_files="traductions.json"
|
||||
)
|
||||
dataset = load_dataset("json", data_files=DATA_FILE)
|
||||
|
||||
print(f"Dataset loaded with {len(dataset['train'])} samples.")
|
||||
|
||||
print("Formatting dataset for Ukrainian → French translation...")
|
||||
|
||||
def format_prompt(example):
|
||||
prompt = (
|
||||
"Translate the following Ukrainian text into French.\n\n"
|
||||
f"Ukrainian: {example['text']}\n"
|
||||
f"French: {example['translation']}"
|
||||
)
|
||||
return {"text": prompt}
|
||||
return {
|
||||
"text": ("<|user|>\n"
|
||||
"Translate the following Ukrainian text into French.\n"
|
||||
f"Ukrainian: {example['text']}\n"
|
||||
"<|assistant|>\n"
|
||||
f"{example['translation']}"
|
||||
)
|
||||
}
|
||||
|
||||
dataset = dataset.map(
|
||||
format_prompt,
|
||||
remove_columns=dataset["train"].column_names
|
||||
)
|
||||
|
||||
dataset = dataset.map(format_prompt, remove_columns=dataset["train"].column_names)
|
||||
print("Dataset formatting completed.")
|
||||
print("Example prompt:\n")
|
||||
print(dataset["train"][0]["text"])
|
||||
|
||||
# ----------------------------
|
||||
# Training arguments
|
||||
# [6/7] Training arguments
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[6/7] Initializing training arguments...")
|
||||
training_args = TrainingArguments(
|
||||
output_dir="./qwen-uk-fr-lora",
|
||||
output_dir=OUTPUT_DIR,
|
||||
per_device_train_batch_size=1,
|
||||
gradient_accumulation_steps=8,
|
||||
learning_rate=2e-4,
|
||||
learning_rate=1e-4,
|
||||
num_train_epochs=3,
|
||||
|
||||
fp16=False,
|
||||
bf16=False,
|
||||
|
||||
optim="paged_adamw_32bit",
|
||||
logging_steps=10,
|
||||
save_steps=500,
|
||||
save_total_limit=2,
|
||||
|
||||
# Use 32-bit optimizer
|
||||
optim="paged_adamw_32bit",
|
||||
|
||||
report_to="none",
|
||||
dataloader_pin_memory=False,
|
||||
)
|
||||
|
||||
|
||||
print("Training arguments ready.")
|
||||
print(f"Output directory: {OUTPUT_DIR}")
|
||||
print(f"Epochs: {training_args.num_train_epochs}")
|
||||
print(f"Effective batch size: "
|
||||
f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}"
|
||||
)
|
||||
|
||||
# ----------------------------
|
||||
# Trainer
|
||||
@ -135,18 +179,34 @@ trainer = SFTTrainer(
|
||||
print("Trainer initialized.")
|
||||
|
||||
# ----------------------------
|
||||
# Train
|
||||
# [7/7] Training
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[7/7] Starting training...")
|
||||
trainer.train()
|
||||
checkpoint_exists = any(
|
||||
d.startswith("checkpoint-")
|
||||
for d in os.listdir(OUTPUT_DIR)
|
||||
) if os.path.exists(OUTPUT_DIR) else False
|
||||
|
||||
if checkpoint_exists:
|
||||
print("Checkpoint found → resuming training")
|
||||
train_output = trainer.train(resume_from_checkpoint=True)
|
||||
else:
|
||||
print("No checkpoint found → starting fresh training")
|
||||
train_output = trainer.train()
|
||||
|
||||
|
||||
print("\n=== Training summary ===")
|
||||
print(f"Global steps: {train_output.global_step}")
|
||||
print(f"Training loss: {train_output.training_loss}")
|
||||
print(f"Metrics: {train_output.metrics}")
|
||||
print("Training completed successfully.")
|
||||
|
||||
# ----------------------------
|
||||
# Save LoRA adapter
|
||||
# ----------------------------
|
||||
print("Saving LoRA adapter and tokenizer...")
|
||||
trainer.model.save_pretrained("./qwen-uk-fr-lora")
|
||||
tokenizer.save_pretrained("./qwen-uk-fr-lora")
|
||||
print(f"{80 * '_'}\nSaving LoRA adapter and tokenizer...")
|
||||
trainer.model.save_pretrained(OUTPUT_DIR)
|
||||
tokenizer.save_pretrained(OUTPUT_DIR)
|
||||
|
||||
print("=== Fine-tuning finished ===")
|
||||
print("LoRA adapter saved in ./qwen-uk-fr-lora")
|
||||
print("\n=== Fine-tuning finished ===")
|
||||
print(f"LoRA adapter saved in: {OUTPUT_DIR}")
|
||||
|
||||
68
Finetunning/mergeLora.py
Normal file
68
Finetunning/mergeLora.py
Normal file
@ -0,0 +1,68 @@
|
||||
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# ----------------------------
# Configuration
# ----------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"
LORA_DIR = "./qwen2.5-7b-uk-fr-lora"      # adapter directory produced by fine-tuning
OUTPUT_DIR = "./qwen2.5-7b-uk-fr-merged"  # final merged model

DTYPE = torch.float16  # GGUF-friendly
DEVICE = "cpu"         # merging on CPU is slower but stable and safe

print("=== LoRA merge script started ===")

# ----------------------------
# [1/4] Base model
# ----------------------------
print(f"{80 * '_'}\n[1/4] Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=DTYPE,
    device_map=DEVICE,
    trust_remote_code=True,
)
print("Base model loaded.")

# ----------------------------
# [2/4] Tokenizer
# ----------------------------
print(f"{80 * '_'}\n[2/4] Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # make sure a padding token exists
print("Tokenizer loaded.")

# ----------------------------
# [3/4] LoRA adapter
# ----------------------------
print(f"{80 * '_'}\n[3/4] Loading LoRA adapter...")
model = PeftModel.from_pretrained(base_model, LORA_DIR)
print("LoRA adapter loaded.")

# ----------------------------
# [4/4] Fold the adapter weights into the base model
# ----------------------------
print(f"{80 * '_'}\n[4/4] Merging LoRA into base model...")
model = model.merge_and_unload()
print("LoRA successfully merged.")

# Persist the merged model and tokenizer side by side so the output
# directory is directly consumable by GGUF conversion.
print("Saving merged model...")
model.save_pretrained(OUTPUT_DIR, safe_serialization=True)
tokenizer.save_pretrained(OUTPUT_DIR)

print("=== Merge completed successfully ===")
print(f"Merged model saved in: {OUTPUT_DIR}")
|
||||
@ -11,13 +11,11 @@
|
||||
{"text": "Як на мене, то наразі помовчу.", "translation": "En ce qui me concerne, je n’ai pour le moment rien à dire."}
|
||||
{"text": "Мій дядько вчора помер від раку шлунку.", "translation": "Mon oncle est mort hier d’un cancer à l’estomac."}
|
||||
{"text": "Я не знаю, що ще можна зробити.", "translation": "Je ne sais plus quoi faire."}
|
||||
{"text": "Я навчився жити без неї.", "translation": "J’ai appris à vivre sans elle."}
|
||||
{"text": "Я навчився жити без неї.", "translation": "J'ai appris à vivre sans elle."}
|
||||
{"text": "Справді?", "translation": "Vraiment ?"}
|
||||
{"text": "Справді?", "translation": "C'est vrai ?"}
|
||||
{"text": "Справді?", "translation": "Vrai ?"}
|
||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J’ai toujours préféré les personnages mystérieux."}
|
||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J'ai toujours préféré les personnages mystérieux."}
|
||||
{"text": "Мені завжди більше подобалися загадкові персонажі.", "translation": "J'ai toujours plus apprécié les personnages mystérieux."}
|
||||
{"text": "Тобі краще поспати.", "translation": "Tu ferais mieux de dormir."}
|
||||
{"text": "Обдумай це.", "translation": "Penses-y."}
|
||||
@ -69,7 +67,6 @@
|
||||
{"text": "Ця традиція зародилася в Китаї.", "translation": "Cette tradition est née en Chine."}
|
||||
{"text": "У Японії є дипломатичні стосунки з Китаєм.", "translation": "Le Japon a des relations diplomatiques avec la Chine."}
|
||||
{"text": "Він повернувся з Китаю.", "translation": "Il est revenu de Chine."}
|
||||
{"text": "Він повернувся з Китаю.", "translation": "Il est reparti de Chine."}
|
||||
{"text": "Він повернувся з Китаю.", "translation": "Il revint de Chine."}
|
||||
{"text": "Він повернувся з Китаю.", "translation": "Il est rentré de Chine."}
|
||||
{"text": "Він написав книжку про порцеляну.", "translation": "Il a écrit un livre sur la porcelaine."}
|
||||
@ -125,7 +122,6 @@
|
||||
{"text": "Я кохаю тебе.", "translation": "Je t'aime !"}
|
||||
{"text": "З днем народження!", "translation": "Bon anniversaire !"}
|
||||
{"text": "З днем народження!", "translation": "Joyeux anniversaire !"}
|
||||
{"text": "З днем народження!", "translation": "Joyeux anniversaire."}
|
||||
{"text": "Кожному своє.", "translation": "À chacun son goût."}
|
||||
{"text": "Кожному своє.", "translation": "Chacun son truc."}
|
||||
{"text": "Скільки це коштує?", "translation": "Ça coûte combien ?"}
|
||||
@ -171,7 +167,6 @@
|
||||
{"text": "В інтернеті мало сайтів татарською мовою.", "translation": "Il y a peu de sites en langue Tatar sur Internet."}
|
||||
{"text": "Удачі на іспиті!", "translation": "Bonne chance pour ton examen !"}
|
||||
{"text": "Де ти живеш?", "translation": "Où habites-tu ?"}
|
||||
{"text": "Де ти живеш?", "translation": "Où résides-tu ?"}
|
||||
{"text": "Де ти живеш?", "translation": "Tu habites où ?"}
|
||||
{"text": "Де ти живеш?", "translation": "Où demeures-tu ?"}
|
||||
{"text": "Де ти живеш?", "translation": "Où vis-tu ?"}
|
||||
@ -2,7 +2,7 @@ import json
|
||||
from collections import defaultdict
|
||||
|
||||
# Chemin vers ton fichier d'entrée et de sortie
|
||||
input_file = "Paires de phrases en ukrainien-français - 2026-01-06.tsv" # Remplace par ton chemin
|
||||
input_file = "Paires-de-phrases-en-ukrainien-francais-2026-01-06.tsv" # Remplace par ton chemin
|
||||
output_file = "paires.json" # Fichier de sortie
|
||||
|
||||
# Dictionnaire pour stocker les paires uniques (clé = phrase ukrainienne, valeur = liste de traductions)
|
||||
|
||||
30
Finetunning/validation.jsonl
Normal file
30
Finetunning/validation.jsonl
Normal file
@ -0,0 +1,30 @@
|
||||
{"text": "Як би ти не намагався, ти не вивчиш англійську за два-три місяці.", "translation": "Quels que soient tes efforts, tu ne pourras pas apprendre l’anglais en deux-trois mois."}
|
||||
{"text": "Поки я не подзвонив, він не прийшов.", "translation": "Il n’est pas venu avant que je ne l’appelle."}
|
||||
{"text": "У всесвіті багато галактик.", "translation": "Il y a beaucoup de galaxies dans l'univers."}
|
||||
{"text": "Вона приймає душ щоранку.", "translation": "Elle prend une douche chaque matin."}
|
||||
{"text": "У Майка є декілька друзів у Флориді.", "translation": "Mike a quelques amis en Floride."}
|
||||
{"text": "Я зустрінуся з тобою в неділю о третій.", "translation": "On se voit dimanche à trois heures."}
|
||||
{"text": "Я сказав собі: «Це гарна ідея».", "translation": "Je me suis dit : « C’est une bonne idée. »"}
|
||||
{"text": "Ми збиралися пробути там біля двох тижнів.", "translation": "Nous avions l’intention de rester là près de deux semaines."}
|
||||
{"text": "Я чищу зуби двічі на день.", "translation": "Je me brosse les dents deux fois par jour."}
|
||||
{"text": "Він ніжно поклав руку на її плече.", "translation": "Il posa la main gentiment sur son épaule."}
|
||||
{"text": "Сьогодні жахливо холодно.", "translation": "Il fait horriblement froid aujourd'hui."}
|
||||
{"text": "У цю суму включено податки.", "translation": "Cette somme inclut les taxes."}
|
||||
{"text": "Ця школа була заснована в 1650 році.", "translation": "Cette école fut fondée en 1650."}
|
||||
{"text": "Я випадково знайшов цей ресторан.", "translation": "J'ai trouvé ce restaurant par hasard."}
|
||||
{"text": "Я не хотів нікого образити.", "translation": "Je ne voulais vexer personne."}
|
||||
{"text": "Цей сад найкраще виглядає весною.", "translation": "Ce parc est plus joli au printemps."}
|
||||
{"text": "Цей сир виготовлено з овечого молока.", "translation": "Ce fromage est fait avec du lait de brebis."}
|
||||
{"text": "Він спить як немовля.", "translation": "Il dort comme un bébé."}
|
||||
{"text": "Гора вкрита снігом.", "translation": "La montagne est recouverte de neige."}
|
||||
{"text": "Я попав під дощ і промок.", "translation": "J’ai été pris sous la pluie, et suis tout trempé."}
|
||||
{"text": "Прошу, дайте мені ще один шанс.", "translation": "Je vous en prie, donnez-moi encore une chance."}
|
||||
{"text": "Я все сказав.", "translation": "J’ai tout dit."}
|
||||
{"text": "Не забувай нас!", "translation": "Ne nous oublie pas !"}
|
||||
{"text": "Випало багато снігу.", "translation": "Beaucoup de neige est tombée."}
|
||||
{"text": "Йде сніг.", "translation": "Il est en train de neiger."}
|
||||
{"text": "Може піти сніг.", "translation": "Il neigera peut-être."}
|
||||
{"text": "У нас у січні йде сніг.", "translation": "Chez nous, il neige en janvier."}
|
||||
{"text": "Сніг розтав.", "translation": "La neige a fondu."}
|
||||
{"text": "Наша компанія планує побудувати новий хімічний завод у Росії.", "translation": "Notre entreprise a le projet de construire une nouvelle usine chimique en Russie."}
|
||||
{"text": "Франція воювала з Росією.", "translation": "La France fut en guerre avec la Russie."}
|
||||
170
Finetunning/validation.py
Normal file
170
Finetunning/validation.py
Normal file
@ -0,0 +1,170 @@
|
||||
import torch
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig
|
||||
from peft import PeftModel
|
||||
from datasets import load_dataset
|
||||
from nltk.translate.bleu_score import corpus_bleu
|
||||
|
||||
# ----------------------------
# Configuration
# ----------------------------
BASE_MODEL = "Qwen/Qwen2.5-7B-Instruct"  # base model
LORA_DIR = "./qwen2.5-7b-uk-fr-lora"     # fine-tuned LoRA
VALIDATION_FILE = "validation.jsonl"     # small validation subset
MAX_INPUT_LENGTH = 1024
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Candidate prompt templates to benchmark; each template must contain a
# single "{text}" placeholder for the Ukrainian source sentence.
PROMPTS_TO_TEST = [
    {
        "name": "Prompt de base",
        "prompt": "Traduis la phrase ukrainienne suivante en français: {text}",
    },
    {
        "name": "Prompt spécialisé mémoires",
        "prompt": (
            "Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.\n"
            "- Garde le style narratif et les tournures orales de l'auteur.\n"
            "- Respecte les règles de traduction suivantes :\n\n"
            "Règles strictes :\n"
            "1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).\n"
            "2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.\n\n"
            "Voici la phrase à traduire :\nUkrainien : {text}\nFrançais :"
        ),
    },
    {
        "name": "Prompt détaillé",
        "prompt": (
            "Tu es un expert en traduction littéraire spécialisé dans les textes historiques ukrainiens.\n"
            "Règles à suivre absolument :\n"
            "1. Conserve tous les noms propres et toponymes dans leur forme originale\n"
            "2. Préserve le style et le registre de l'auteur original\n"
            "3. Ajoute des notes entre crochets pour expliquer les références culturelles si nécessaire\n"
            "4. Traduis de manière naturelle en français tout en restant fidèle au texte source\n\n"
            "Texte à traduire :\nUkrainien : {text}\nTraduction française :"
        ),
    },
    {
        "name": "Prompt minimaliste",
        "prompt": "Traduction fidèle de l'ukrainien vers le français : {text}",
    },
]
|
||||
|
||||
print("=== Loading tokenizer and model ===")

# ----------------------------
# Tokenizer
# ----------------------------
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token      # reuse EOS as the padding token
tokenizer.model_max_length = MAX_INPUT_LENGTH  # cap encoded sequence length

# ----------------------------
# Base model, loaded straight onto the GPU
# ----------------------------
print(f"{80 * '_'}\nLoading base model on GPU...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map={"": 0},  # place every module on GPU 0
    trust_remote_code=True,
)

# ----------------------------
# Attach the fine-tuned LoRA adapter
# ----------------------------
print(f"{80 * '_'}\nApplying LoRA adapter...")
model = PeftModel.from_pretrained(base_model, LORA_DIR)
model.eval()
model.to(DEVICE)  # make sure every parameter lives on the target device
print("Model ready for validation.")

# ----------------------------
# Validation dataset
# ----------------------------
print(f"{80 * '_'}\nLoading validation dataset...")
dataset = load_dataset("json", data_files=VALIDATION_FILE)
examples = dataset["train"]
print(f"{len(examples)} examples loaded for testing.")
|
||||
|
||||
# ----------------------------
|
||||
# Translation function
|
||||
# ----------------------------
|
||||
@torch.inference_mode()
def translate(text, prompt_template):
    """Generate a French translation of *text* using *prompt_template*.

    The template must contain a ``{text}`` placeholder.  Decoding is greedy
    (no sampling) and capped at 256 new tokens.  The translation is the part
    of the decoded output following the translation marker; when no marker is
    present we fall back to splitting on the source text itself.
    """
    filled_prompt = prompt_template.format(text=text)
    encoded = tokenizer(
        filled_prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_LENGTH,
    ).to(DEVICE)

    # An explicit GenerationConfig avoids deprecation warnings from generate().
    gen_cfg = GenerationConfig.from_model_config(model.config)
    gen_cfg.max_new_tokens = 256
    gen_cfg.do_sample = False

    generated = model.generate(**encoded, generation_config=gen_cfg)
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)

    # Extract only the translated portion of the decoded text.
    for marker in ("Français :", "Traduction française :"):
        if marker in decoded:
            return decoded.split(marker)[-1].strip()
    return decoded.split(text)[-1].strip()
|
||||
|
||||
# ----------------------------
# Evaluate all prompts and select best BLEU
# ----------------------------
best_bleu = 0.0
best_prompt = None
all_results = {}

print(f"{80 * '_'}\nTesting all prompts and computing BLEU scores...")

for prompt_config in PROMPTS_TO_TEST:
    print(f"\n{80 * '='}\nTesting prompt: {prompt_config['name']}\n{80 * '='}")
    references = []
    hypotheses = []

    for i, example in enumerate(examples):
        src_text = example["text"]
        ref_text = example["translation"]
        pred_text = translate(src_text, prompt_config["prompt"])

        print(f"\n[{i+1}] Source: {src_text}")
        print(f" Reference: {ref_text}")
        print(f" Prediction: {pred_text}")

        # corpus_bleu expects a list of reference token lists per hypothesis.
        references.append([ref_text.split()])
        hypotheses.append(pred_text.split())

    bleu_score = corpus_bleu(references, hypotheses) * 100
    print(f"\n=== Corpus BLEU score for '{prompt_config['name']}': {bleu_score:.4f} ===")

    all_results[prompt_config["name"]] = bleu_score

    # "best_prompt is None or" guarantees best_prompt is set after the first
    # prompt even when every BLEU score is 0 — otherwise the report below
    # would crash with a TypeError on best_prompt['name'].
    if best_prompt is None or bleu_score > best_bleu:
        best_bleu = bleu_score
        best_prompt = prompt_config

# ----------------------------
# Display results
# ----------------------------
print(f"\n{80 * '='}\nFINAL RESULTS\n{80 * '='}")
for prompt_name, score in all_results.items():
    print(f"{prompt_name}: {score:.4f}")

print(f"\nBEST PROMPT: {best_prompt['name']} with BLEU score: {best_bleu:.4f}")
print(f"Prompt content:\n{best_prompt['prompt']}")
|
||||
70
README.md
70
README.md
@ -80,3 +80,73 @@ Vous pouvez modifier les paramètres suivants dans `main.py` :
|
||||
- `OUTPUT_PDF_PATH` : Chemin et nom du fichier PDF de sortie (généré automatiquement)
|
||||
|
||||
---
|
||||
|
||||
## Fine-tuning
|
||||
Le fine-tuning permet d'obtenir une meilleure traduction. C'est un processus long en temps de calcul, mais il permet une traduction plus précise.
|
||||
|
||||
Le principe est le suivant :
|
||||
|
||||
```
|
||||
1️⃣ Dataset d’entraînement (paires.json)
        ↓
2️⃣ Dataset nettoyé (cleanDataSet.py → paires_clean.json)
        ↓
3️⃣ Fine-tuning LoRA (finetuning.py)
        ↓
4️⃣ Validation / Évaluation BLEU (validation.py)
        ↓
5️⃣ Merge LoRA + modèle de base (mergeLora.py)
        ↓
6️⃣ Conversion en GGUF (llama.cpp : convert_hf_to_gguf.py)
        ↓
7️⃣ Ollama (inférence finale)
|
||||
|
||||
```
|
||||
|
||||
### Nettoyage du dataset
|
||||
Exécuter le script ```python cleanDataSet.py```
|
||||
|
||||
### Validation
|
||||
Exécuter le script ```python validation.py```
|
||||
|
||||
Le script teste plusieurs prompts et renvoie celui qui obtient le meilleur score BLEU.
|
||||
|
||||
Il faut ensuite copier ce prompt dans le fichier ModelFile.
|
||||
|
||||
### Merge
|
||||
Exécuter le script ```python mergeLora.py```
|
||||
|
||||
### Conversion en GGUF
|
||||
En étant à la racine du projet (et toujours dans le venv), cloner le projet llama.cpp
|
||||
```bash
|
||||
git clone https://github.com/ggerganov/llama.cpp
|
||||
cd llama.cpp
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
Et lancer la commande (/!\ ça prend environ 10 minutes) :
|
||||
```bash
|
||||
python convert_hf_to_gguf.py ../Finetunning/qwen2.5-7b-uk-fr-merged --outfile qwen2.5-7b-uk-fr.gguf --outtype q8_0
|
||||
```
|
||||
|
||||
Vérification :
|
||||
```bash
|
||||
./main -m qwen2.5-7b-uk-fr.gguf -p "Translate into French: Привіт світ"
|
||||
```
|
||||
Pour que ce nouveau modèle soit exploitable par ollama, il faut TODO
|
||||
|
||||
## Utilisation du modèle fine-tunné pour la traduction
|
||||
Créer un Modelfile :
|
||||
```
|
||||
FROM ./qwen2.5-7b-uk-fr.gguf
|
||||
|
||||
PARAMETER temperature 0.1
|
||||
PARAMETER top_p 0.95
|
||||
PARAMETER num_ctx 4096
|
||||
|
||||
SYSTEM """
|
||||
You are a professional Ukrainian to French translator.
|
||||
Produce faithful, literal translations.
|
||||
"""
|
||||
|
||||
```
|
||||
@ -3,14 +3,15 @@ PARAMETER temperature 0.2
|
||||
PARAMETER num_ctx 8192
|
||||
|
||||
SYSTEM """
|
||||
|
||||
Tu es un traducteur spécialisé dans les mémoires ukrainiennes des années 1910.
|
||||
- Utilise le glossaire fourni pour les noms de lieux et termes historiques.
|
||||
- Garde le style narratif et les tournures orales de l'auteur.
|
||||
- Respecte les règles de traduction suivantes :
|
||||
Règles strictes :
|
||||
1. **Conserve tous les noms de lieux** dans leur forme originale (ex. : Львів → Lviv, mais ajoute une note si nécessaire entre [ ]).
|
||||
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l’auteur.
|
||||
2. **Respecte le style narratif** : garde les tournures orales et les expressions propres à l'auteur.
|
||||
3. **Pour les termes historiques** (ex. : "powiat"), utilise le terme français standard et ajoute une note explicative.
|
||||
4. **Conserve les citations** russe/allemand/polonais intégrés au texte (mais ajoute une note de fin de paragraphe entre [ ] en la traduisant et en précisant la langue d'origine.
|
||||
5. **Structure** : Garde les sauts de ligne et la mise en page originale.
|
||||
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique existe (ex. : "[Note : le context]").
|
||||
6. **Notes du traducteur** : Ajoute entre crochets [ ] les explications contextuelles si un contexte historique exist.
|
||||
"""
|
||||
1674
Traduction/TaniaBorecMemoir(Ukr) (FR) V1.pdf
Normal file
1674
Traduction/TaniaBorecMemoir(Ukr) (FR) V1.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1169
Traduction/TaniaBorecMemoir(Ukr) (FR).txt
Normal file
1169
Traduction/TaniaBorecMemoir(Ukr) (FR).txt
Normal file
File diff suppressed because it is too large
Load Diff
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V1.pdf
Normal file
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V1.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V2.pdf
Normal file
1637
Traduction/TaniaBorecMemoir(Ukr) (FR)_V2.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1623
Traduction/TaniaBorecMemoir(Ukr) (FR)_V3.pdf
Normal file
1623
Traduction/TaniaBorecMemoir(Ukr) (FR)_V3.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1032
Traduction/TaniaBorecMemoir(Ukr)(FR)_V1.txt
Normal file
1032
Traduction/TaniaBorecMemoir(Ukr)(FR)_V1.txt
Normal file
File diff suppressed because it is too large
Load Diff
1038
Traduction/TaniaBorecMemoir(Ukr)(FR)_V2.txt
Normal file
1038
Traduction/TaniaBorecMemoir(Ukr)(FR)_V2.txt
Normal file
File diff suppressed because it is too large
Load Diff
1030
Traduction/TaniaBorecMemoir(Ukr)(FR)_V3.txt
Normal file
1030
Traduction/TaniaBorecMemoir(Ukr)(FR)_V3.txt
Normal file
File diff suppressed because it is too large
Load Diff
1664
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.pdf
Normal file
1664
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1119
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.txt
Normal file
1119
Traduction/TaniaBorecMemoir(Ukr)(FR)_V4.txt
Normal file
File diff suppressed because it is too large
Load Diff
1628
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.pdf
Normal file
1628
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1107
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.txt
Normal file
1107
Traduction/TaniaBorecMemoir(Ukr)(FR)_V5.txt
Normal file
File diff suppressed because it is too large
Load Diff
1704
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.pdf
Normal file
1704
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1209
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.txt
Normal file
1209
Traduction/TaniaBorecMemoir(Ukr)(FR)_V6.txt
Normal file
File diff suppressed because it is too large
Load Diff
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.pdf
Normal file
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.txt
Normal file
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V7.txt
Normal file
File diff suppressed because it is too large
Load Diff
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.pdf
Normal file
1702
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.pdf
Normal file
File diff suppressed because it is too large
Load Diff
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.txt
Normal file
1214
Traduction/TaniaBorecMemoir(Ukr)(FR)_V8.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -3,23 +3,23 @@ import requests
|
||||
import json
|
||||
from reportlab.lib.pagesizes import letter
|
||||
from reportlab.lib.units import inch
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Flowable
|
||||
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
||||
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
|
||||
from reportlab.lib.enums import TA_JUSTIFY
|
||||
from reportlab.pdfbase import pdfmetrics
|
||||
from reportlab.pdfbase.ttfonts import TTFont
|
||||
import os
|
||||
import os, time
|
||||
|
||||
# Configuration
|
||||
DEBUG = True
|
||||
PDF_PATH = "Traduction\TaniaBorecMemoir(Ukr).pdf"
|
||||
PDF_PATH = "Traduction/TaniaBorecMemoir(Ukr).pdf"
|
||||
OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
|
||||
OLLAMA_URL = "http://localhost:11434/api/generate"
|
||||
TARGET_LANGUAGE = "français"
|
||||
CHECKPOINT_FILE = "Traduction\checkpoint.json"
|
||||
TEMP_OUTPUT_TXT = "Traduction\output_temp.txt"
|
||||
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V8.pdf")
|
||||
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V8.txt")
|
||||
CHECKPOINT_FILE = "Traduction/checkpoint.json"
|
||||
TEMP_OUTPUT_TXT = "Traduction/output_temp.txt"
|
||||
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.pdf")
|
||||
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.txt")
|
||||
|
||||
DEBUG = True
|
||||
|
||||
@ -341,6 +341,7 @@ def main():
|
||||
print(f"Batches manquants détectés : {missing_batches}")
|
||||
|
||||
# Traduction des paragraphes manquants
|
||||
temps_cumule = 0.0
|
||||
for i in missing_batches:
|
||||
batch = paragraphs[i:i + batch_size]
|
||||
paragraph_cumul = "\n".join(batch)
|
||||
@ -348,13 +349,24 @@ def main():
|
||||
print(f"{15 * '-'} Traduction des paragraphes manquants {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
||||
|
||||
try:
|
||||
debut_chrono = time.time()
|
||||
result = send_to_ollama(paragraph_cumul)
|
||||
fin_chrono = time.time()
|
||||
temps_paragraphe = fin_chrono - debut_chrono
|
||||
temps_cumule += temps_paragraphe
|
||||
|
||||
# Conversion en minutes et secondes
|
||||
minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
|
||||
minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
|
||||
|
||||
print(f"{result}")
|
||||
results[str(i)] = result
|
||||
save_checkpoint(len(paragraphs), results) # Met à jour le dernier indice du batch
|
||||
save_temp_results(results)
|
||||
except Exception as e:
|
||||
print(f"Erreur lors de la traduction du paragraphe {i}: {e}")
|
||||
print(f" Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
|
||||
print(f" Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")
|
||||
|
||||
# Traitement des paragraphes suivants
|
||||
for i in range(last_index + 1, len(paragraphs), batch_size):
|
||||
@ -364,7 +376,16 @@ def main():
|
||||
print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")
|
||||
|
||||
try:
|
||||
debut_chrono = time.time()
|
||||
result = send_to_ollama(paragraph_cumul)
|
||||
fin_chrono = time.time()
|
||||
temps_paragraphe = fin_chrono - debut_chrono
|
||||
temps_cumule += temps_paragraphe
|
||||
|
||||
# Conversion en minutes et secondes
|
||||
minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
|
||||
minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
|
||||
|
||||
print(f"{result}")
|
||||
results[str(i)] = result
|
||||
save_checkpoint(i + batch_size - 1, results)
|
||||
@ -372,6 +393,9 @@ def main():
|
||||
except Exception as e:
|
||||
print(f"Erreur : {e}")
|
||||
continue
|
||||
print(f" Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
|
||||
print(f" Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")
|
||||
|
||||
|
||||
save_temp_results(results)
|
||||
create_pdf_from_results(results, FINAL_OUTPUT_PDF)
|
||||
|
||||
@ -14,3 +14,4 @@ peft
|
||||
bitsandbytes
|
||||
accelerate
|
||||
trl
|
||||
nltk
|
||||
2
run.bat
2
run.bat
@ -17,6 +17,7 @@ REM Activer l'environnement virtuel Python
|
||||
call %VENV_PATH%\Scripts\activate.bat
|
||||
|
||||
REM Lancer la compilation du modèle LLM pour Ollama
|
||||
echo Compilation du modèle LLM pour Ollama
|
||||
ollama create traductionUkrainienVersFrancais -f .\Traduction\Modelfile
|
||||
|
||||
:: 1. Vérifie si le processus ollama.exe est en cours d'exécution
|
||||
@ -39,6 +40,7 @@ if %ERRORLEVEL% neq 0 (
|
||||
)
|
||||
|
||||
REM Exécuter le script principal
|
||||
echo Lancement du script principal de traduction
|
||||
python %MAIN_SCRIPT_PATH%
|
||||
|
||||
endlocal
|
||||
Loading…
x
Reference in New Issue
Block a user