Alex 1 settimana fa
parent
commit
70e4932cd0
3 ha cambiato i file con 62 aggiunte e 21 eliminazioni
  1. 29
    14
      Finetunning/finetunning.py
  2. 31
    7
      Traduction/main.py
  3. 2
    0
      run.bat

+ 29
- 14
Finetunning/finetunning.py Vedi File

AutoTokenizer, AutoTokenizer,
AutoModelForCausalLM, AutoModelForCausalLM,
TrainingArguments, TrainingArguments,
BitsAndBytesConfig
BitsAndBytesConfig,
) )
from peft import ( from peft import (
LoraConfig, LoraConfig,
print(f"Max sequence length: {tokenizer.model_max_length}") print(f"Max sequence length: {tokenizer.model_max_length}")


# ---------------------------- # ----------------------------
# [2/7] Model loading (QLoRA)
# [2/7] Quantization config (QLoRA)
# ---------------------------- # ----------------------------
print(f"{80 * '_'}\n[2/7] Loading model in 4-bit mode (QLoRA)...")
print(f"{80 * '_'}\n[2/7] Configuring 4-bit quantization (BitsAndBytes)...")
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16,
bnb_4bit_use_double_quant=True,
)

print("4-bit NF4 quantization configured.")

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained( model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME, MODEL_NAME,
load_in_4bit=True,
device_map="auto", device_map="auto",
quantization_config=bnb_config,
dtype=torch.float16, dtype=torch.float16,
trust_remote_code=True, trust_remote_code=True,
) )
print("Model loaded.")
print("Model loaded successfully.")


# ---------------------------- # ----------------------------
# [3/7] Prepare model for k-bit training # [3/7] Prepare model for k-bit training
bias="none", bias="none",
task_type="CAUSAL_LM", task_type="CAUSAL_LM",
target_modules=[ target_modules=[
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj"
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
], ],

) )


model = get_peft_model(model, lora_config) model = get_peft_model(model, lora_config)
# [5/7] Dataset loading & formatting # [5/7] Dataset loading & formatting
# ---------------------------- # ----------------------------
print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...") print(f"{80 * '_'}\n[5/7] Loading dataset from JSON file...")
dataset = load_dataset(
"json",
data_files=DATA_FILE
)
dataset = load_dataset("json", data_files=DATA_FILE)


print(f"Dataset loaded with {len(dataset['train'])} samples.") print(f"Dataset loaded with {len(dataset['train'])} samples.")


) )


print("Dataset formatting completed.") print("Dataset formatting completed.")
print(f"Example prompt:\n{dataset['train'][0]['text']}")
print("Example prompt:\n")
print(dataset["train"][0]["text"])


# ---------------------------- # ----------------------------
# [6/7] Training arguments # [6/7] Training arguments
print("Training arguments ready.") print("Training arguments ready.")
print(f"Output directory: {OUTPUT_DIR}") print(f"Output directory: {OUTPUT_DIR}")
print(f"Epochs: {training_args.num_train_epochs}") print(f"Epochs: {training_args.num_train_epochs}")
print(f"Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(
f"Effective batch size: "
f"{training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}"
)


# ---------------------------- # ----------------------------
# Trainer # Trainer

+ 31
- 7
Traduction/main.py Vedi File

import json import json
from reportlab.lib.pagesizes import letter from reportlab.lib.pagesizes import letter
from reportlab.lib.units import inch from reportlab.lib.units import inch
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Flowable
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_JUSTIFY from reportlab.lib.enums import TA_JUSTIFY
from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont from reportlab.pdfbase.ttfonts import TTFont
import os
import os, time


# Configuration # Configuration
DEBUG = True DEBUG = True
PDF_PATH = "Traduction\TaniaBorecMemoir(Ukr).pdf"
PDF_PATH = "Traduction/TaniaBorecMemoir(Ukr).pdf"
OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest" OLLAMA_MODEL = "traductionUkrainienVersFrancais:latest"
OLLAMA_URL = "http://localhost:11434/api/generate" OLLAMA_URL = "http://localhost:11434/api/generate"
TARGET_LANGUAGE = "français" TARGET_LANGUAGE = "français"
CHECKPOINT_FILE = "Traduction\checkpoint.json"
TEMP_OUTPUT_TXT = "Traduction\output_temp.txt"
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V8.pdf")
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V8.txt")
CHECKPOINT_FILE = "Traduction/checkpoint.json"
TEMP_OUTPUT_TXT = "Traduction/output_temp.txt"
FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.pdf")
FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf",f"({TARGET_LANGUAGE.upper()[:2]})_V9.txt")


DEBUG = True DEBUG = True


print(f"Batches manquants détectés : {missing_batches}") print(f"Batches manquants détectés : {missing_batches}")


# Traduction des paragraphes manquants # Traduction des paragraphes manquants
temps_cumule = 0.0
for i in missing_batches: for i in missing_batches:
batch = paragraphs[i:i + batch_size] batch = paragraphs[i:i + batch_size]
paragraph_cumul = "\n".join(batch) paragraph_cumul = "\n".join(batch)
print(f"{15 * '-'} Traduction des paragraphes manquants {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}") print(f"{15 * '-'} Traduction des paragraphes manquants {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")


try: try:
debut_chrono = time.time()
result = send_to_ollama(paragraph_cumul) result = send_to_ollama(paragraph_cumul)
fin_chrono = time.time()
temps_paragraphe = fin_chrono - debut_chrono
temps_cumule += temps_paragraphe
# Conversion en minutes et secondes
minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)
print(f"{result}") print(f"{result}")
results[str(i)] = result results[str(i)] = result
save_checkpoint(len(paragraphs), results) # Met à jour le dernier indice du batch save_checkpoint(len(paragraphs), results) # Met à jour le dernier indice du batch
save_temp_results(results) save_temp_results(results)
except Exception as e: except Exception as e:
print(f"Erreur lors de la traduction du paragraphe {i}: {e}") print(f"Erreur lors de la traduction du paragraphe {i}: {e}")
print(f" Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
print(f" Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")


# Traitement des paragraphes suivants # Traitement des paragraphes suivants
for i in range(last_index + 1, len(paragraphs), batch_size): for i in range(last_index + 1, len(paragraphs), batch_size):
print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}") print(f"{15 * '-'} Traduction des paragraphes {i+1} à {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")


try: try:
debut_chrono = time.time()
result = send_to_ollama(paragraph_cumul) result = send_to_ollama(paragraph_cumul)
fin_chrono = time.time()
temps_paragraphe = fin_chrono - debut_chrono
temps_cumule += temps_paragraphe
# Conversion en minutes et secondes
minutes_paragraphe, secondes_paragraphe = divmod(temps_paragraphe, 60)
minutes_cumule, secondes_cumule = divmod(temps_cumule, 60)

print(f"{result}") print(f"{result}")
results[str(i)] = result results[str(i)] = result
save_checkpoint(i + batch_size - 1, results) save_checkpoint(i + batch_size - 1, results)
except Exception as e: except Exception as e:
print(f"Erreur : {e}") print(f"Erreur : {e}")
continue continue
print(f" Temps de traduction : {int(minutes_paragraphe)} min {secondes_paragraphe:.2f} sec")
print(f" Temps cumulé : {int(minutes_cumule)} min {secondes_cumule:.2f} sec")



save_temp_results(results) save_temp_results(results)
create_pdf_from_results(results, FINAL_OUTPUT_PDF) create_pdf_from_results(results, FINAL_OUTPUT_PDF)

+ 2
- 0
run.bat Vedi File

call %VENV_PATH%\Scripts\activate.bat call %VENV_PATH%\Scripts\activate.bat


REM Lancer la compilation du modèle LLM pour Ollama REM Lancer la compilation du modèle LLM pour Ollama
echo Compilation du modèle LLM pour Ollama
ollama create traductionUkrainienVersFrancais -f .\Traduction\Modelfile ollama create traductionUkrainienVersFrancais -f .\Traduction\Modelfile


:: 1. Vérifie si le processus ollama.exe est en cours d'exécution :: 1. Vérifie si le processus ollama.exe est en cours d'exécution
) )


REM Exécuter le script principal REM Exécuter le script principal
echo Lancement du script principal de traduction
python %MAIN_SCRIPT_PATH% python %MAIN_SCRIPT_PATH%


endlocal endlocal

Loading…
Annulla
Salva