Source code for pfrl.agents.reinforce

import warnings
from logging import getLogger

import numpy as np
import torch

import pfrl
from pfrl import agent
from pfrl.utils import clip_l2_grad_norm_
from pfrl.utils.mode_of_distribution import mode_of_distribution
from pfrl.utils.recurrent import one_step_forward

[docs]class REINFORCE(agent.AttributeSavingMixin, agent.Agent): """William's episodic REINFORCE. Args: model (Policy): Model to train. It must be a callable that accepts observations as input and return action distributions (Distribution). optimizer (torch.optim.Optimizer): optimizer used to train the model gpu (int): GPU device id if not None nor negative beta (float): Weight coefficient for the entropy regularizaiton term. phi (callable): Feature extractor function act_deterministically (bool): If set true, choose most probable actions in act method. batchsize (int): Number of episodes used for each update backward_separately (bool): If set true, call backward separately for each episode and accumulate only gradients. average_entropy_decay (float): Decay rate of average entropy. Used only to record statistics. batch_states (callable): Method which makes a batch of observations. default is `pfrl.utils.batch_states` recurrent (bool): If set to True, `model` is assumed to implement `pfrl.nn.Recurrent` and update in a recurrent manner. max_grad_norm (float or None): Maximum L2 norm of the gradient used for gradient clipping. If set to None, the gradient is not clipped. logger (logging.Logger): Logger to be used. """ saved_attributes = ("model", "optimizer") def __init__( self, model, optimizer, gpu=None, beta=0, phi=lambda x: x, batchsize=1, act_deterministically=False, average_entropy_decay=0.999, backward_separately=False, batch_states=pfrl.utils.batch_states, recurrent=False, max_grad_norm=None, logger=None, ): self.model = model if gpu is not None and gpu >= 0: assert torch.cuda.is_available() self.device = torch.device("cuda:{}".format(gpu)) else: self.device = torch.device("cpu") self.optimizer = optimizer self.beta = beta self.phi = phi self.batchsize = batchsize self.backward_separately = backward_separately self.act_deterministically = act_deterministically self.average_entropy_decay = average_entropy_decay self.batch_states = batch_states self.recurrent = recurrent self.max_grad_norm = max_grad_norm self.logger = logger or getLogger(__name__) # Recurrent states of the model self.train_recurrent_states = None self.test_recurrent_states = None # Statistics self.average_entropy = 0 self.t = 0 self.reward_sequences = [[]] self.log_prob_sequences = [[]] self.entropy_sequences = [[]] self.n_backward = 0 def act(self, obs): if return self._act_train(obs) else: return self._act_eval(obs) def observe(self, obs, reward, done, reset): if self._observe_train(obs, reward, done, reset) else: self._observe_eval(obs, reward, done, reset) def _act_train(self, obs): batch_obs = self.batch_states([obs], self.device, self.phi) if self.recurrent: action_distrib, self.train_recurrent_states = one_step_forward( self.model, batch_obs, self.train_recurrent_states ) else: action_distrib = self.model(batch_obs) batch_action = action_distrib.sample() # Save values used to compute losses self.log_prob_sequences[-1].append(action_distrib.log_prob(batch_action)) self.entropy_sequences[-1].append(action_distrib.entropy()) action = batch_action.cpu().numpy()[0] self.logger.debug("t:%s a:%s", self.t, action) # Update stats self.average_entropy += (1 - self.average_entropy_decay) * ( float(action_distrib.entropy()) - self.average_entropy ) return action def _observe_train(self, obs, reward, done, reset): self.reward_sequences[-1].append(reward) self.t += 1 if done or reset: if not done: warnings.warn( "Since REINFORCE supports episodic environments only, " "reset=True with done=False will throw away the last episode." ) self.reward_sequences[-1] = [] self.log_prob_sequences[-1] = [] self.entropy_sequences[-1] = [] elif done: self.reward_sequences[-1].append(reward) if self.backward_separately: self.accumulate_grad() if self.n_backward == self.batchsize: self.update_with_accumulated_grad() else: if len(self.reward_sequences) == self.batchsize: self.batch_update() else: # Prepare for the next episode self.reward_sequences.append([]) self.log_prob_sequences.append([]) self.entropy_sequences.append([]) self.train_recurrent_states = None def _act_eval(self, obs): with torch.no_grad(): batch_obs = self.batch_states([obs], self.device, self.phi) if self.recurrent: action_distrib, self.test_recurrent_states = one_step_forward( self.model, batch_obs, self.test_recurrent_states ) else: action_distrib = self.model(batch_obs) if self.act_deterministically: return mode_of_distribution(action_distrib).cpu().numpy()[0] else: return action_distrib.sample().cpu().numpy()[0] def _observe_eval(self, obs, reward, done, reset): if done or reset: self.test_recurrent_states = None def accumulate_grad(self): if self.n_backward == 0: self.optimizer.zero_grad() # Compute losses losses = [] for r_seq, log_prob_seq, ent_seq in zip( self.reward_sequences, self.log_prob_sequences, self.entropy_sequences ): assert len(r_seq) - 1 == len(log_prob_seq) == len(ent_seq) # Convert rewards into returns (=sum of future rewards) R_seq = np.cumsum(list(reversed(r_seq[1:])))[::-1] for R, log_prob, entropy in zip(R_seq, log_prob_seq, ent_seq): loss = -R * log_prob - self.beta * entropy losses.append(loss) total_loss = torch.stack(losses).sum() / self.batchsize total_loss.backward() self.reward_sequences = [[]] self.log_prob_sequences = [[]] self.entropy_sequences = [[]] self.n_backward += 1 def batch_update(self): assert len(self.reward_sequences) == self.batchsize assert len(self.log_prob_sequences) == self.batchsize assert len(self.entropy_sequences) == self.batchsize # Update the model assert self.n_backward == 0 self.accumulate_grad() if self.max_grad_norm is not None: clip_l2_grad_norm_(self.model.parameters(), self.max_grad_norm) self.optimizer.step() self.n_backward = 0 def update_with_accumulated_grad(self): assert self.n_backward == self.batchsize if self.max_grad_norm is not None: clip_l2_grad_norm_(self.model.parameters(), self.max_grad_norm) self.optimizer.step() self.n_backward = 0 def get_statistics(self): return [ ("average_entropy", self.average_entropy), ]