meilleure gestion de la reprise du training
This commit is contained in:
parent
fa3ad61dd7
commit
51e114b1ee
@ -182,13 +182,19 @@ print("Trainer initialized.")
|
||||
# [7/7] Training
|
||||
# ----------------------------
|
||||
print(f"{80 * '_'}\n[7/7] Starting training...")
|
||||
try:
|
||||
checkpoint_exists = any(
|
||||
d.startswith("checkpoint-")
|
||||
for d in os.listdir(OUTPUT_DIR)
|
||||
) if os.path.exists(OUTPUT_DIR) else False
|
||||
|
||||
if checkpoint_exists:
|
||||
print("Checkpoint found → resuming training")
|
||||
train_output = trainer.train(resume_from_checkpoint=True)
|
||||
except Exception as e:
|
||||
print("No checkpoint found or resume failed, starting fresh training.")
|
||||
print(f"Reason: {e}")
|
||||
else:
|
||||
print("No checkpoint found → starting fresh training")
|
||||
train_output = trainer.train()
|
||||
|
||||
|
||||
print("\n=== Training summary ===")
|
||||
print(f"Global steps: {train_output.global_step}")
|
||||
print(f"Training loss: {train_output.training_loss}")
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user