meilleure gestion de la reprise du training

This commit is contained in:
Alex 2026-01-15 17:03:18 +01:00
parent fa3ad61dd7
commit 51e114b1ee
1 changed file with 10 additions and 4 deletions

View File

@@ -182,13 +182,19 @@ print("Trainer initialized.")
# [7/7] Training # [7/7] Training
# ---------------------------- # ----------------------------
print(f"{80 * '_'}\n[7/7] Starting training...") print(f"{80 * '_'}\n[7/7] Starting training...")
try: checkpoint_exists = any(
d.startswith("checkpoint-")
for d in os.listdir(OUTPUT_DIR)
) if os.path.exists(OUTPUT_DIR) else False
if checkpoint_exists:
print("Checkpoint found → resuming training")
train_output = trainer.train(resume_from_checkpoint=True) train_output = trainer.train(resume_from_checkpoint=True)
except Exception as e: else:
print("No checkpoint found or resume failed, starting fresh training.") print("No checkpoint found → starting fresh training")
print(f"Reason: {e}")
train_output = trainer.train() train_output = trainer.train()
print("\n=== Training summary ===") print("\n=== Training summary ===")
print(f"Global steps: {train_output.global_step}") print(f"Global steps: {train_output.global_step}")
print(f"Training loss: {train_output.training_loss}") print(f"Training loss: {train_output.training_loss}")