Source code for pfrl.experiments.train_agent_batch

import logging
import os
from collections import deque

import numpy as np

from pfrl.experiments.evaluator import Evaluator, save_agent


def train_agent_batch(
    agent,
    env,
    steps,
    outdir,
    checkpoint_freq=None,
    log_interval=None,
    max_episode_len=None,
    step_offset=0,
    evaluator=None,
    successful_score=None,
    step_hooks=(),
    return_window_size=100,
    logger=None,
):
    """Train an agent in a batch environment.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        outdir (str): Path to the directory to output things.
        checkpoint_freq (int): Frequency at which agents are stored.
        log_interval (int): Interval of logging.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to estimate
            the average returns of the current agent.
        successful_score (float): Finish training if the mean score is greater
            than or equal to this value, if not None.
        step_hooks (Sequence): Sequence of callable objects that accepts
            (env, agent, step) as arguments. They are called every step.
            See pfrl.experiments.hooks.
        logger (logging.Logger): Logger used in this function.

    Returns:
        List of evaluation episode stats dict.
    """

    logger = logger or logging.getLogger(__name__)
    recent_returns = deque(maxlen=return_window_size)

    num_envs = env.num_envs
    episode_r = np.zeros(num_envs, dtype=np.float64)
    episode_idx = np.zeros(num_envs, dtype="i")
    episode_len = np.zeros(num_envs, dtype="i")

    # o_0, r_0
    obss = env.reset()

    t = step_offset
    if hasattr(agent, "t"):
        agent.t = step_offset

    eval_stats_history = []  # List of evaluation episode stats dict
    try:
        while True:
            # a_t
            actions = agent.batch_act(obss)
            # o_{t+1}, r_{t+1}
            obss, rs, dones, infos = env.step(actions)
            episode_r += rs
            episode_len += 1

            # Compute mask for done and reset
            if max_episode_len is None:
                resets = np.zeros(num_envs, dtype=bool)
            else:
                resets = episode_len == max_episode_len
            resets = np.logical_or(
                resets, [info.get("needs_reset", False) for info in infos]
            )
            # Agent observes the consequences
            agent.batch_observe(obss, rs, dones, resets)

            # Make mask. 0 if done/reset, 1 if pass
            end = np.logical_or(resets, dones)
            not_end = np.logical_not(end)

            # For episodes that end, do the following:
            #   1. increment the episode count
            #   2. record the return
            #   3. clear the record of rewards
            #   4. clear the record of the number of steps
            #   5. reset the env to start a new episode
            # 3-5 are skipped when training is already finished.
            episode_idx += end
            recent_returns.extend(episode_r[end])

            for _ in range(num_envs):
                t += 1
                if checkpoint_freq and t % checkpoint_freq == 0:
                    save_agent(agent, t, outdir, logger, suffix="_checkpoint")

                for hook in step_hooks:
                    hook(env, agent, t)

            if (
                log_interval is not None
                and t >= log_interval
                and t % log_interval < num_envs
            ):
                logger.info(
                    "outdir:{} step:{} episode:{} last_R: {} average_R:{}".format(  # NOQA
                        outdir,
                        t,
                        np.sum(episode_idx),
                        recent_returns[-1] if recent_returns else np.nan,
                        np.mean(recent_returns) if recent_returns else np.nan,
                    )
                )
                logger.info("statistics: {}".format(agent.get_statistics()))
            if evaluator:
                eval_score = evaluator.evaluate_if_necessary(
                    t=t, episodes=np.sum(episode_idx)
                )
                if eval_score is not None:
                    eval_stats = dict(agent.get_statistics())
                    eval_stats["eval_score"] = eval_score
                    eval_stats_history.append(eval_stats)
                    if (
                        successful_score is not None
                        and evaluator.max_score >= successful_score
                    ):
                        break
            if t >= steps:
                break

            # Start new episodes if needed
            episode_r[end] = 0
            episode_len[end] = 0
            obss = env.reset(not_end)
    except (Exception, KeyboardInterrupt):
        # Save the current model before being killed
        save_agent(agent, t, outdir, logger, suffix="_except")
        env.close()
        if evaluator:
            evaluator.env.close()
        raise
    else:
        # Save the final model
        save_agent(agent, t, outdir, logger, suffix="_finish")

    return eval_stats_history
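For context, a minimal usage sketch of this function (not part of the module source): a vectorized environment supplies batched observations so that agent.batch_act and agent.batch_observe see batches of size num_envs, and every entry of step_hooks is called with (env, agent, step). The make_single_env factory and the hook body are hypothetical placeholders; only MultiprocessVectorEnv and the train_agent_batch signature come from PFRL itself.

    import functools

    import pfrl
    from pfrl.experiments import train_agent_batch


    def print_step_hook(env, agent, step):
        # A step hook must accept (env, agent, step); it is called once for
        # every environment step counted across the whole batch.
        if step % 10000 == 0:
            print("training step", step)


    def make_batch_env(make_single_env, num_envs=8):
        # `make_single_env` is a user-supplied factory (hypothetical here)
        # returning a fresh Gym environment. MultiprocessVectorEnv runs the
        # copies in subprocesses and exposes the batched step/reset interface
        # that train_agent_batch expects.
        return pfrl.envs.MultiprocessVectorEnv(
            [functools.partial(make_single_env, seed=i) for i in range(num_envs)]
        )


    # With `agent` and `make_single_env` defined elsewhere:
    # eval_stats_history = train_agent_batch(
    #     agent,
    #     make_batch_env(make_single_env),
    #     steps=10 ** 6,
    #     outdir="results",
    #     log_interval=1000,
    #     checkpoint_freq=100000,
    #     step_hooks=[print_step_hook],
    # )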
def train_agent_batch_with_evaluation(
    agent,
    env,
    steps,
    eval_n_steps,
    eval_n_episodes,
    eval_interval,
    outdir,
    checkpoint_freq=None,
    max_episode_len=None,
    step_offset=0,
    eval_max_episode_len=None,
    return_window_size=100,
    eval_env=None,
    log_interval=None,
    successful_score=None,
    step_hooks=(),
    evaluation_hooks=(),
    save_best_so_far_agent=True,
    use_tensorboard=False,
    logger=None,
):
    """Train an agent while regularly evaluating it.

    Args:
        agent: Agent to train.
        env: Environment to train the agent against.
        steps (int): Number of total time steps for training.
        eval_n_steps (int): Number of timesteps at each evaluation phase.
        eval_n_episodes (int): Number of episodes at each evaluation phase.
        eval_interval (int): Interval of evaluation.
        outdir (str): Path to the directory to output things.
        log_interval (int): Interval of logging.
        checkpoint_freq (int): Frequency with which to store networks.
        max_episode_len (int): Maximum episode length.
        step_offset (int): Time step from which training starts.
        return_window_size (int): Number of training episodes used to estimate
            the average returns of the current agent.
        eval_max_episode_len (int or None): Maximum episode length of
            evaluation runs. If set to None, max_episode_len is used instead.
        eval_env: Environment used for evaluation.
        successful_score (float): Finish training if the mean score is greater
            than or equal to this value, if not None.
        step_hooks (Sequence): Sequence of callable objects that accepts
            (env, agent, step) as arguments. They are called every step.
            See pfrl.experiments.hooks.
        evaluation_hooks (Sequence): Sequence of
            pfrl.experiments.evaluation_hooks.EvaluationHook objects. They are
            called after each evaluation.
        save_best_so_far_agent (bool): If set to True, after each evaluation,
            if the score (= mean return of evaluation episodes) exceeds
            the best-so-far score, the current agent is saved.
        use_tensorboard (bool): Additionally log eval stats to tensorboard.
        logger (logging.Logger): Logger used in this function.

    Returns:
        agent: Trained agent.
        eval_stats_history: List of evaluation episode stats dict.
    """

    logger = logger or logging.getLogger(__name__)

    for hook in evaluation_hooks:
        if not hook.support_train_agent_batch:
            raise ValueError(
                "{} does not support train_agent_batch_with_evaluation().".format(hook)
            )

    os.makedirs(outdir, exist_ok=True)

    if eval_env is None:
        eval_env = env

    if eval_max_episode_len is None:
        eval_max_episode_len = max_episode_len

    evaluator = Evaluator(
        agent=agent,
        n_steps=eval_n_steps,
        n_episodes=eval_n_episodes,
        eval_interval=eval_interval,
        outdir=outdir,
        max_episode_len=eval_max_episode_len,
        env=eval_env,
        step_offset=step_offset,
        evaluation_hooks=evaluation_hooks,
        save_best_so_far_agent=save_best_so_far_agent,
        use_tensorboard=use_tensorboard,
        logger=logger,
    )

    eval_stats_history = train_agent_batch(
        agent,
        env,
        steps,
        outdir,
        checkpoint_freq=checkpoint_freq,
        max_episode_len=max_episode_len,
        step_offset=step_offset,
        evaluator=evaluator,
        successful_score=successful_score,
        return_window_size=return_window_size,
        log_interval=log_interval,
        step_hooks=step_hooks,
        logger=logger,
    )

    return agent, eval_stats_history
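A hedged end-to-end sketch of the evaluation-wrapped entry point, assuming the classic Gym API (env.seed, 4-tuple step) that PFRL's vector envs target. The environment choice, network sizes, and hyperparameters below are illustrative placeholders rather than recommended settings; note that exactly one of eval_n_steps and eval_n_episodes must be None.

    import gym
    import torch

    import pfrl
    from pfrl.experiments import train_agent_batch_with_evaluation


    def make_env(seed=0):
        env = gym.make("CartPole-v0")
        env.seed(seed)
        return env


    def make_batch_env(num_envs=4):
        # SerialVectorEnv steps several env copies in a single process, which
        # is enough for a small sketch like this one.
        return pfrl.envs.SerialVectorEnv([make_env(seed=i) for i in range(num_envs)])


    sample_env = make_env()
    obs_size = sample_env.observation_space.low.size
    n_actions = sample_env.action_space.n

    # A small fully connected Q-network with PFRL's discrete-action head.
    q_func = torch.nn.Sequential(
        torch.nn.Linear(obs_size, 50),
        torch.nn.ReLU(),
        torch.nn.Linear(50, n_actions),
        pfrl.q_functions.DiscreteActionValueHead(),
    )
    agent = pfrl.agents.DoubleDQN(
        q_func,
        torch.optim.Adam(q_func.parameters(), eps=1e-2),
        pfrl.replay_buffers.ReplayBuffer(capacity=10 ** 5),
        gamma=0.99,
        explorer=pfrl.explorers.ConstantEpsilonGreedy(
            epsilon=0.3, random_action_func=sample_env.action_space.sample
        ),
        replay_start_size=500,
        target_update_interval=100,
        gpu=-1,  # run on CPU
    )

    agent, eval_stats_history = train_agent_batch_with_evaluation(
        agent=agent,
        env=make_batch_env(),
        eval_env=make_batch_env(),
        steps=10 ** 5,
        eval_n_steps=None,
        eval_n_episodes=10,
        eval_interval=10000,
        outdir="results",
        log_interval=1000,
    )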