meilleure gestion de la reprise du training
This commit is contained in:
parent
fa3ad61dd7
commit
51e114b1ee
|
|
@@ -182,13 +182,19 @@ print("Trainer initialized.")
|
||||||
# [7/7] Training
|
# [7/7] Training
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
print(f"{80 * '_'}\n[7/7] Starting training...")
|
print(f"{80 * '_'}\n[7/7] Starting training...")
|
||||||
try:
|
checkpoint_exists = any(
|
||||||
|
d.startswith("checkpoint-")
|
||||||
|
for d in os.listdir(OUTPUT_DIR)
|
||||||
|
) if os.path.exists(OUTPUT_DIR) else False
|
||||||
|
|
||||||
|
if checkpoint_exists:
|
||||||
|
print("Checkpoint found → resuming training")
|
||||||
train_output = trainer.train(resume_from_checkpoint=True)
|
train_output = trainer.train(resume_from_checkpoint=True)
|
||||||
except Exception as e:
|
else:
|
||||||
print("No checkpoint found or resume failed, starting fresh training.")
|
print("No checkpoint found → starting fresh training")
|
||||||
print(f"Reason: {e}")
|
|
||||||
train_output = trainer.train()
|
train_output = trainer.train()
|
||||||
|
|
||||||
|
|
||||||
print("\n=== Training summary ===")
|
print("\n=== Training summary ===")
|
||||||
print(f"Global steps: {train_output.global_step}")
|
print(f"Global steps: {train_output.global_step}")
|
||||||
print(f"Training loss: {train_output.training_loss}")
|
print(f"Training loss: {train_output.training_loss}")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue