# Example invocations of the `reward-kit agent-eval` command, one flag (or
# flag combination) per example. All examples use the same sample task
# directory (./flight_task, plus ./complex_task for multi-registry use).
#
# NOTE(review): this file is reference material, not a runnable script —
# executing it would launch every evaluation below in sequence. Copy the
# line you need instead.

# Specify the model directly (overrides the model set via environment variable)
reward-kit agent-eval --task-dir ./flight_task --model openai/gpt-4o
# Write run artifacts to a custom output directory instead of the default
reward-kit agent-eval --task-dir ./flight_task --output-dir ./my_runs
# Disable the simulated user (use the task's static initial messages only)
reward-kit agent-eval --task-dir ./flight_task --no-sim-user
# Use test mode, which does not require API keys
reward-kit agent-eval --task-dir ./flight_task --test-mode
# Use a mock model response while in test mode (no real model calls)
reward-kit agent-eval --task-dir ./flight_task --test-mode --mock-response
# Run in debug mode with verbose output
reward-kit agent-eval --task-dir ./flight_task --debug
# Limit the number of tasks to evaluate (here: at most 2)
reward-kit agent-eval --task-dir ./flight_task --max-tasks 2
# Run only specific tasks, selected by comma-separated task IDs
reward-kit agent-eval --task-dir ./flight_task --task-ids flight.booking.001,flight.booking.002
# Override the tool registry used for the task with a custom module
reward-kit agent-eval --task-dir ./flight_task --registry-override my_custom_tools.flight_tools
# Use multiple tool registries, as comma-separated name=module pairs
reward-kit agent-eval --task-dir ./complex_task --registries flight=flight_tools,hotel=hotel_tools
# Specify the evaluator (module.attribute path to the reward function)
reward-kit agent-eval --task-dir ./flight_task --evaluator flight_reward.success_evaluator