Source code for pfrl.action_value

import warnings
from abc import ABCMeta, abstractmethod, abstractproperty

import torch
import torch.nn.functional as F
from torch.distributions.utils import lazy_property


class ActionValue(object, metaclass=ABCMeta):
    """Struct that holds state-fixed Q-functions and their subproducts.

    Every operation it supports is done in a batch manner.
    """

    @abstractproperty
    def greedy_actions(self):
        """Get argmax_a Q(s,a)."""
        raise NotImplementedError()

    @abstractproperty
    def max(self):
        """Evaluate max_a Q(s,a)."""
        raise NotImplementedError()
    @abstractmethod
    def evaluate_actions(self, actions):
        """Evaluate Q(s,a) with a = given actions."""
        raise NotImplementedError()
    @abstractproperty
    def params(self):
        """Learnable parameters of this action value.

        Returns:
            tuple of torch.Tensor
        """
        raise NotImplementedError()

    def __getitem__(self, i) -> "ActionValue":
        """ActionValue is expected to be indexable."""
        raise NotImplementedError()
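# Illustrative sketch (not part of the upstream module): a minimal concrete
# subclass showing the contract the abstract base above imposes. The name
# _ConstantActionValue and its behaviour are hypothetical, chosen only to make
# the required members visible; the real implementations follow below.
class _ConstantActionValue(ActionValue):
    """Toy ActionValue whose single dummy action always has a fixed value."""

    def __init__(self, values):
        self.values = values  # shape (batch_size,)

    @property
    def greedy_actions(self):
        # Only one (dummy) action exists, so it is always greedy.
        return torch.zeros_like(self.values, dtype=torch.long)

    @property
    def max(self):
        return self.values

    def evaluate_actions(self, actions):
        return self.values

    @property
    def params(self):
        return (self.values,)

    def __getitem__(self, i):
        return _ConstantActionValue(self.values[i])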
class DiscreteActionValue(ActionValue):
    """Q-function output for discrete action space.

    Args:
        q_values (torch.Tensor): Array of Q values whose shape is
            (batchsize, n_actions)
    """

    def __init__(self, q_values, q_values_formatter=lambda x: x):
        assert isinstance(q_values, torch.Tensor)
        self.device = q_values.device
        self.q_values = q_values
        self.n_actions = q_values.shape[1]
        self.q_values_formatter = q_values_formatter

    @lazy_property
    def greedy_actions(self):
        return self.q_values.detach().argmax(dim=1).int()

    @lazy_property
    def max(self):
        index = self.greedy_actions.long().unsqueeze(1)
        return self.q_values.gather(dim=1, index=index).flatten()

    def evaluate_actions(self, actions):
        index = actions.long().unsqueeze(1)
        return self.q_values.gather(dim=1, index=index).flatten()

    def compute_advantage(self, actions):
        return self.evaluate_actions(actions) - self.max

    def compute_double_advantage(self, actions, argmax_actions):
        return self.evaluate_actions(actions) - self.evaluate_actions(argmax_actions)

    def compute_expectation(self, beta):
        return torch.sum(F.softmax(beta * self.q_values, dim=1) * self.q_values, dim=1)

    def __repr__(self):
        return "DiscreteActionValue greedy_actions:{} q_values:{}".format(
            self.greedy_actions.detach().cpu().numpy(),
            self.q_values_formatter(self.q_values.detach().cpu().numpy()),
        )

    @property
    def params(self):
        return (self.q_values,)

    def __getitem__(self, i):
        return DiscreteActionValue(
            self.q_values[i], q_values_formatter=self.q_values_formatter
        )
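# Illustrative usage sketch (not part of the upstream module): how a batch of
# Q-values is typically consumed for greedy action selection and Q(s, a)
# evaluation. The tensor values below are assumptions chosen for the example.
def _example_discrete_action_value():
    q_values = torch.tensor([[1.0, 3.0, 2.0], [0.5, -1.0, 0.0]])
    av = DiscreteActionValue(q_values)
    greedy = av.greedy_actions          # tensor([1, 0], dtype=torch.int32)
    best_q = av.max                     # tensor([3.0000, 0.5000])
    taken = torch.tensor([2, 1])
    q_sa = av.evaluate_actions(taken)   # tensor([ 2., -1.])
    adv = av.compute_advantage(taken)   # q_sa - best_q
    return greedy, best_q, q_sa, adv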
class DistributionalDiscreteActionValue(ActionValue):
    """Distributional Q-function output for discrete action space.

    Args:
        q_dist (torch.Tensor): Probabilities of atoms. Its shape must be
            (batchsize, n_actions, n_atoms).
        z_values (torch.Tensor): Values represented by atoms.
            Its shape must be (n_atoms,).
    """

    def __init__(self, q_dist, z_values, q_values_formatter=lambda x: x):
        assert isinstance(q_dist, torch.Tensor)
        assert isinstance(z_values, torch.Tensor)
        assert q_dist.ndim == 3
        assert z_values.ndim == 1
        assert q_dist.shape[2] == z_values.shape[0]
        self.z_values = z_values
        q_scaled = q_dist * self.z_values[None, None, ...]
        self.q_values = q_scaled.sum(dim=2)
        self.q_dist = q_dist
        self.n_actions = q_dist.shape[1]
        self.q_values_formatter = q_values_formatter

    @lazy_property
    def greedy_actions(self):
        return self.q_values.argmax(dim=1).detach()

    @lazy_property
    def max(self):
        return torch.gather(self.q_values, 1, self.greedy_actions[:, None])[:, 0]

    @lazy_property
    def max_as_distribution(self):
        """Return the return distributions of the greedy actions.

        Returns:
            torch.Tensor: Return distributions. Its shape will be
                (batch_size, n_atoms).
        """
        return self.q_dist[
            torch.arange(self.q_values.shape[0]), self.greedy_actions.detach()
        ]

    def evaluate_actions(self, actions):
        return torch.gather(self.q_values, 1, actions[:, None])[:, 0]

    def evaluate_actions_as_distribution(self, actions):
        """Return the return distributions of given actions.

        Args:
            actions (torch.Tensor): Array of action indices.
                Its shape must be (batch_size,).

        Returns:
            torch.Tensor: Return distributions. Its shape will be
                (batch_size, n_atoms).
        """
        return self.q_dist[torch.arange(self.q_values.shape[0]), actions]

    def compute_advantage(self, actions):
        return self.evaluate_actions(actions) - self.max

    def compute_double_advantage(self, actions, argmax_actions):
        return self.evaluate_actions(actions) - self.evaluate_actions(argmax_actions)

    def compute_expectation(self, beta):
        return (F.softmax(beta * self.q_values, dim=1) * self.q_values).sum(dim=1)

    def __repr__(self):
        return "DistributionalDiscreteActionValue greedy_actions:{} q_values:{}".format(  # NOQA
            self.greedy_actions.detach(),
            self.q_values_formatter(self.q_values.detach()),
        )

    @property
    def params(self):
        return (self.q_dist,)

    def __getitem__(self, i):
        return DistributionalDiscreteActionValue(
            self.q_dist[i],
            self.z_values,
            q_values_formatter=self.q_values_formatter,
        )


class QuantileDiscreteActionValue(DiscreteActionValue):
    """Quantile action value for discrete actions.

    Args:
        quantiles (torch.Tensor): (batch_size, n_taus, n_actions)
        q_values_formatter (callable):
    """

    def __init__(self, quantiles, q_values_formatter=lambda x: x):
        assert quantiles.ndim == 3
        self.quantiles = quantiles
        self.n_actions = quantiles.shape[2]
        self.q_values_formatter = q_values_formatter

    @lazy_property
    def q_values(self):
        return self.quantiles.mean(1)

    def evaluate_actions_as_quantiles(self, actions):
        """Return the return quantiles of given actions.

        Args:
            actions (torch.Tensor or ndarray): Array of action indices.
                Its shape must be (batch_size,).

        Returns:
            torch.Tensor: Return quantiles. Its shape will be
                (batch_size, n_taus).
        """
        return self.quantiles[
            torch.arange(self.quantiles.shape[0], dtype=torch.long), :, actions.long()
        ]

    def __repr__(self):
        return "QuantileDiscreteActionValue greedy_actions:{} q_values:{}".format(  # NOQA
            self.greedy_actions.detach().cpu().numpy(),
            self.q_values_formatter(self.q_values.detach().cpu().numpy()),
        )

    @property
    def params(self):
        return (self.quantiles,)

    def __getitem__(self, i):
        return QuantileDiscreteActionValue(
            quantiles=self.quantiles[i],
            q_values_formatter=self.q_values_formatter,
        )
class QuadraticActionValue(ActionValue):
    """Q-function output for continuous action space.

    See: http://arxiv.org/abs/1603.00748

    Define a Q(s,a) with A(s,a) in a quadratic form.

    Q(s,a) = V(s) + A(s,a)
    A(s,a) = -1/2 (u - mu(s))^T P(s) (u - mu(s))

    Args:
        mu (torch.Tensor): mu(s), actions that maximize A(s,a)
        mat (torch.Tensor): P(s), coefficient matrices of A(s,a).
            It must be positive definite.
        v (torch.Tensor): V(s), values of s
        min_action (ndarray): minimum action, not batched
        max_action (ndarray): maximum action, not batched
    """

    def __init__(self, mu, mat, v, min_action=None, max_action=None):
        self.mu = mu
        self.mat = mat
        self.v = v
        self.device = mu.device

        if isinstance(min_action, (int, float)):
            min_action = [min_action]
        if min_action is None:
            self.min_action = None
        else:
            self.min_action = torch.as_tensor(min_action).to(self.device).float()

        if isinstance(max_action, (int, float)):
            max_action = [max_action]
        if max_action is None:
            self.max_action = None
        else:
            self.max_action = torch.as_tensor(max_action).to(self.device).float()

        self.batch_size = self.mu.shape[0]

    @lazy_property
    def greedy_actions(self):
        a = self.mu
        if self.min_action is not None:
            a = torch.max(self.min_action.unsqueeze(0).expand_as(a), a)
        if self.max_action is not None:
            a = torch.min(self.max_action.unsqueeze(0).expand_as(a), a)
        return a

    @lazy_property
    def max(self):
        if self.min_action is None and self.max_action is None:
            return self.v.reshape(self.batch_size)
        else:
            return self.evaluate_actions(self.greedy_actions)

    def evaluate_actions(self, actions):
        u_minus_mu = actions - self.mu
        a = (
            -0.5
            * torch.matmul(
                torch.matmul(u_minus_mu[:, None, :], self.mat), u_minus_mu[:, :, None]
            )[:, 0, 0]
        )
        return a + self.v.reshape(self.batch_size)

    def compute_advantage(self, actions):
        return self.evaluate_actions(actions) - self.max

    def compute_double_advantage(self, actions, argmax_actions):
        return self.evaluate_actions(actions) - self.evaluate_actions(argmax_actions)

    def __repr__(self):
        return "QuadraticActionValue greedy_actions:{} v:{}".format(
            self.greedy_actions.detach().cpu().numpy(), self.v.detach().cpu().numpy()
        )

    @property
    def params(self):
        return (self.mu, self.mat, self.v)

    def __getitem__(self, i):
        return QuadraticActionValue(
            self.mu[i],
            self.mat[i],
            self.v[i],
            min_action=self.min_action,
            max_action=self.max_action,
        )
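# Illustrative usage sketch (not part of the upstream module): a 1-D quadratic
# Q-function with Q(s, a) = V(s) - 1/2 (a - mu)^T P (a - mu). The numbers below
# are assumptions chosen so the arithmetic is easy to follow by hand.
def _example_quadratic_action_value():
    mu = torch.tensor([[0.5]])        # action that maximizes A(s, a)
    mat = torch.tensor([[[2.0]]])     # positive definite P(s)
    v = torch.tensor([[3.0]])         # state value V(s)
    av = QuadraticActionValue(mu, mat, v, min_action=-1.0, max_action=1.0)
    greedy = av.greedy_actions        # tensor([[0.5000]]), mu clipped to [-1, 1]
    best_q = av.max                   # tensor([3.]) since A(s, mu) = 0
    # A = -0.5 * (1.5 - 0.5) * 2.0 * (1.5 - 0.5) = -1.0, so Q = 3.0 - 1.0 = 2.0
    q = av.evaluate_actions(torch.tensor([[1.5]]))
    return greedy, best_q, q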
class SingleActionValue(ActionValue):
    """ActionValue that can evaluate only a single action."""

    def __init__(self, evaluator, maximizer=None):
        self.evaluator = evaluator
        self.maximizer = maximizer

    @lazy_property
    def greedy_actions(self):
        return self.maximizer()

    @lazy_property
    def max(self):
        return self.evaluator(self.greedy_actions)

    def evaluate_actions(self, actions):
        return self.evaluator(actions)

    def compute_advantage(self, actions):
        return self.evaluator(actions) - self.max

    def compute_double_advantage(self, actions, argmax_actions):
        return self.evaluate_actions(actions) - self.evaluate_actions(argmax_actions)

    def __repr__(self):
        return "SingleActionValue"

    @property
    def params(self):
        warnings.warn(
            "SingleActionValue has no learnable parameters until it"
            " is evaluated on some action. If you want to draw a computation"
            " graph that outputs SingleActionValue, use the variable returned"
            " by its method such as evaluate_actions instead."
        )
        return ()

    def __getitem__(self, i):
        raise NotImplementedError
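# Illustrative usage sketch (not part of the upstream module): SingleActionValue
# wraps closures, e.g. a critic that can only be evaluated at explicitly given
# actions while a separate policy supplies the maximizing action. The toy
# Q(s, a) = -||a||^2 below is an assumption chosen for the example; its
# maximizer is the zero action.
def _example_single_action_value():
    def evaluator(actions):
        return -(actions ** 2).sum(dim=1)

    def maximizer():
        return torch.zeros(2, 3)

    av = SingleActionValue(evaluator, maximizer=maximizer)
    greedy = av.greedy_actions                      # zeros of shape (2, 3)
    best_q = av.max                                 # tensor([0., 0.])
    q = av.evaluate_actions(torch.ones(2, 3))       # tensor([-3., -3.])
    adv = av.compute_advantage(torch.ones(2, 3))    # tensor([-3., -3.])
    return greedy, best_q, q, adv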