TARGET_LANGUAGE = "français"

CHECKPOINT_FILE = "checkpoint.json"

TEMP_OUTPUT_TXT = "output_temp.txt"

FINAL_OUTPUT_PDF = PDF_PATH.replace(".pdf", f"({TARGET_LANGUAGE.upper()[:2]})_V5.pdf")

FINAL_OUTPUT_TXT = PDF_PATH.replace(".pdf", f"({TARGET_LANGUAGE.upper()[:2]})_V5.txt")

DEBUG = True
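
# Note on the derived names (illustrative, assuming PDF_PATH is defined earlier in the script,
# e.g. "book.pdf"): TARGET_LANGUAGE.upper()[:2] yields "FR", so the outputs become
# "book(FR)_V5.pdf" and "book(FR)_V5.txt".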


# Load the checkpoint; if no checkpoint file exists yet, fall back to a fresh state
# (the FileNotFoundError guard is a minimal reconstruction of the missing guard lines)
def load_checkpoint():
    try:
        with open(CHECKPOINT_FILE, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        return {"last_processed_index": -1, "results": {}}


# Save the checkpoint
def save_checkpoint(last_index, results):
    # Sort the keys of the results dictionary numerically
    sorted_results = {key: results[key] for key in sorted(results.keys(), key=int)}
    with open(CHECKPOINT_FILE, "w") as f:
        # Indent with 4 spaces so the JSON stays readable
        json.dump({"last_processed_index": last_index, "results": sorted_results}, f, indent=4)
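
# For reference, checkpoint.json then looks roughly like this (indices and text are illustrative;
# keys are the string index of the first paragraph of each translated batch):
#
# {
#     "last_processed_index": 9,
#     "results": {
#         "0": "…translated text for paragraphs 0-4…",
#         "5": "…translated text for paragraphs 5-9…"
#     }
# }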


# Save the intermediate results to a TXT file
# (the body below is a minimal sketch: write the translated batches in paragraph order)
def save_temp_results(results):
    with open(TEMP_OUTPUT_TXT, "w", encoding="utf-8") as f:
        for key in sorted(results.keys(), key=int):
            f.write(results[key] + "\n")


# Main function
def main():
    # Load the checkpoint
    checkpoint = load_checkpoint()
    last_index = checkpoint["last_processed_index"]
    results = checkpoint["results"]

    # Extract the paragraphs from the source PDF
    pages = extract_text_from_pdf(PDF_PATH)
    paragraphs = split_pages_in_paragraphs(pages)

    # Process the paragraphs in batches
    batch_size = 5

    # All expected batch start indices (stepping by batch_size)
    expected_batch_indices = list(range(0, len(paragraphs), batch_size))

    # Batch start indices already present in results
    present_batch_indices = set()
    for key in results.keys():
        batch_start = (int(key) // batch_size) * batch_size  # Round down to the start of the batch
        present_batch_indices.add(batch_start)

    # Find the batches that should already have been translated but are missing
    missing_batches = [i for i in expected_batch_indices if i not in present_batch_indices and i <= last_index]

    # Show the missing batches (for debugging)
    print(f"Missing batches detected: {missing_batches}")
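
    # Worked example (illustrative numbers): with 12 paragraphs and batch_size = 5,
    # expected_batch_indices = [0, 5, 10]; a result key "7" rounds down to batch_start 5.
    # A batch is only reported as missing when its start index is <= last_index, i.e. it lies
    # in the range a previous run already claimed to have processed.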

    # Re-translate the missing batches
    for i in missing_batches:
        batch = paragraphs[i:i + batch_size]
        paragraph_cumul = "\n".join(batch)

        print(f"{15 * '-'} Translating missing paragraphs {i + 1} to {min(i + batch_size, len(paragraphs))} / {len(paragraphs)}")

        try:
            result = send_to_ollama(paragraph_cumul)
            print(f"{result}")
            results[str(i)] = result
            save_checkpoint(len(paragraphs), results)  # Update the last batch index
            save_temp_results(results)
        except Exception as e:
            print(f"Error while translating paragraph {i}: {e}")

    # Process the remaining paragraphs, starting right after the last checkpointed index
    for i in range(last_index + 1, len(paragraphs), batch_size):
        batch = paragraphs[i:i + batch_size]
        paragraph_cumul = "\n".join(batch)

        try:
            result = send_to_ollama(paragraph_cumul)
            print(f"{result}")
            results[str(i)] = result
            save_checkpoint(i + batch_size - 1, results)  # Checkpoint the last paragraph index of this batch
            save_temp_results(results)  # Save the intermediate results
        except Exception as e:
            print(f"Error: {e}")
            continue

    # Generate the final output files
    save_temp_results(results)
    create_pdf_from_results(results, FINAL_OUTPUT_PDF)
    create_txt_from_results(results, FINAL_OUTPUT_TXT)
    print("Translation complete!")


if __name__ == "__main__":
    main()
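

# Usage note: run the script directly (the __main__ guard above). An interrupted run can simply
# be relaunched: it resumes from checkpoint.json, first re-translating any batches detected as
# missing, then continuing from the last processed index.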