# [7/7] Training
# ----------------------------
# Resume from an existing checkpoint in OUTPUT_DIR when one is present;
# otherwise (or when resuming fails) run a fresh training. Exactly one
# successful trainer.train() call produces `train_output`, so the summary
# below always has a result to report.
print(f"{80 * '_'}\n[7/7] Starting training...")

# A checkpoint is considered present when OUTPUT_DIR contains any entry whose
# name starts with "checkpoint-". isdir() (rather than exists()) keeps
# os.listdir() from raising when OUTPUT_DIR is missing or is not a directory.
# NOTE(review): assumes OUTPUT_DIR is the directory the trainer writes its
# checkpoints to — confirm against the trainer's configuration.
checkpoint_exists = os.path.isdir(OUTPUT_DIR) and any(
    entry.startswith("checkpoint-") for entry in os.listdir(OUTPUT_DIR)
)

train_output = None
if checkpoint_exists:
    print("Checkpoint found → resuming training")
    try:
        # Only the resume attempt is guarded, so a failure here cannot be
        # confused with an error from the checkpoint scan above.
        train_output = trainer.train(resume_from_checkpoint=True)
    except Exception as e:
        # Deliberate best-effort fallback: report why resuming failed and
        # let the fresh run below take over instead of aborting the script.
        print("No checkpoint found or resume failed, starting fresh training.")
        print(f"Reason: {e}")
else:
    print("No checkpoint found → starting fresh training")

# Fresh training runs only when no resumed run produced a result; this avoids
# training a second time after a successful resume.
if train_output is None:
    train_output = trainer.train()

print("\n=== Training summary ===")
print(f"Global steps: {train_output.global_step}")
print(f"Training loss: {train_output.training_loss}")