|
|
|
@@ -182,13 +182,19 @@ print("Trainer initialized.") |
|
|
|
# [7/7] Training |
|
|
|
# [7/7] Training |
|
|
|
# ---------------------------- |
|
|
|
# ---------------------------- |
|
|
|
print(f"{80 * '_'}\n[7/7] Starting training...") |
|
|
|
print(f"{80 * '_'}\n[7/7] Starting training...") |
|
|
|
try: |
|
|
|
checkpoint_exists = any( |
|
|
|
|
|
|
|
d.startswith("checkpoint-") |
|
|
|
|
|
|
|
for d in os.listdir(OUTPUT_DIR) |
|
|
|
|
|
|
|
) if os.path.exists(OUTPUT_DIR) else False |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if checkpoint_exists: |
|
|
|
|
|
|
|
print("Checkpoint found → resuming training") |
|
|
|
train_output = trainer.train(resume_from_checkpoint=True) |
|
|
|
train_output = trainer.train(resume_from_checkpoint=True) |
|
|
|
except Exception as e: |
|
|
|
else: |
|
|
|
print("No checkpoint found or resume failed, starting fresh training.") |
|
|
|
print("No checkpoint found → starting fresh training") |
|
|
|
print(f"Reason: {e}") |
|
|
|
|
|
|
|
train_output = trainer.train() |
|
|
|
train_output = trainer.train() |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
print("\n=== Training summary ===") |
|
|
|
print("\n=== Training summary ===") |
|
|
|
print(f"Global steps: {train_output.global_step}") |
|
|
|
print(f"Global steps: {train_output.global_step}") |
|
|
|
print(f"Training loss: {train_output.training_loss}") |
|
|
|
print(f"Training loss: {train_output.training_loss}") |
|
|
|
|