# Source code for pettingzoo.utils.average_total_reward

from __future__ import annotations

import random

import numpy as np

from pettingzoo.utils.env import AECEnv



# [docs]
def average_total_reward(
    env: AECEnv, max_episodes: int = 100, max_steps: int = 10000000000
) -> float:
    """Calculates the average total reward over the episodes for AEC environments.

    Runs an env object with random actions until either max_episodes or
    max_steps is reached. The step budget is only checked between episodes,
    so an episode that has started always runs to completion and the total
    number of steps may exceed max_steps.
    Reward is summed across all agents, making it unsuited for use in zero-sum
    games.

    When the current agent's observation is a dict containing an
    ``"action_mask"`` entry, the random action is drawn only from the indices
    where the mask is non-zero; otherwise it is sampled from the agent's
    action space.

    Args:
        env: The AEC environment to roll out. It is reset at the start of
            every episode.
        max_episodes: Maximum number of episodes to run.
        max_steps: Step budget; no new episode is started once this many
            steps have been taken.

    Returns:
        The summed reward divided by the number of episodes that were run,
        or 0.0 if no episode was run (e.g. ``max_episodes <= 0`` or
        ``max_steps <= 0``).
    """
    total_reward = 0
    total_steps = 0
    num_episodes = 0

    for episode in range(max_episodes):
        if total_steps >= max_steps:
            break

        env.reset()
        for agent in env.agent_iter():
            # Because we call env.last() this function only works with AEC envs.
            # The observation must be requested here: per the AEC API,
            # last(observe=False) yields no observation, which would make the
            # action-mask branch below unreachable.
            obs, reward, termination, truncation, _ = env.last()
            total_reward += reward
            total_steps += 1
            if termination or truncation:
                # Finished agents must be stepped with a None action.
                action = None
            elif isinstance(obs, dict) and "action_mask" in obs:
                actions = np.flatnonzero(obs["action_mask"]).tolist()
                # Narrowing for type checkers: tolist() on a 1-D array is a list.
                assert not isinstance(actions, int)
                action = random.choice(actions)
            else:
                action = env.action_space(agent).sample()
            env.step(action)

        num_episodes = episode + 1

    # Guard against dividing by zero when no episode could be run.
    average = total_reward / num_episodes if num_episodes else 0.0
    print("Average total reward", average)

    return average