Source code for xuance.tensorflow.policies.core

import numpy as np
from gymnasium.spaces import Discrete
from xuance.common import Sequence, Optional, Callable, Union, Dict
from xuance.tensorflow import tf, tk, Module, Tensor
from xuance.tensorflow.utils import mlp_block, gru_block, lstm_block, ModuleType
from xuance.tensorflow.utils import CategoricalDistribution, DiagGaussianDistribution, ActivatedDiagGaussianDistribution


class BasicQhead(Module):
    """
    Fully connected Q-value head for discrete action spaces.

    The head is a stack of ``mlp_block`` layers, one block per entry of
    ``hidden_sizes``, followed by a final block with ``n_actions`` outputs
    and no normalization or activation.

    Args:
        state_dim (int): The input state dimension.
        n_actions (int): The number of discrete actions.
        hidden_sizes: List of hidden units for fully connected layers.
        normalize (Optional[ModuleType]): Normalization layer forwarded to each hidden ``mlp_block``.
        initialize (Optional[Callable[..., Tensor]]): The parameters initializer.
        activation (Optional[ModuleType]): The activation function for each hidden layer.
    """

    def __init__(self,
                 state_dim: int,
                 n_actions: int,
                 hidden_sizes: Sequence[int],
                 normalize: Optional[tk.layers.Layer] = None,
                 initialize: Optional[tk.initializers.Initializer] = None,
                 activation: Optional[tk.layers.Layer] = None):
        super().__init__()
        self.state_dim = state_dim
        self.n_actions = n_actions

        stack = []
        current_shape = (state_dim,)
        # Hidden blocks: each mlp_block returns (layers, output_shape).
        for width in hidden_sizes:
            block, current_shape = mlp_block(current_shape[0], width, normalize, activation, initialize)
            stack.extend(block)
        # Output block: one unit per action, no normalization / activation.
        output_block = mlp_block(current_shape[0], n_actions, None, None, initialize)[0]
        stack.extend(output_block)
        self.model = tk.Sequential(stack)

    @tf.function
    def call(self, x: Union[Tensor, np.ndarray], **kwargs):
        """
        Returns the output of the Q network.

        The input is flattened to 2-D over its last axis, passed through the
        network, and reshaped back so that only the last axis changes
        (to ``n_actions``).

        Parameters:
            x (Union[Tensor, np.ndarray]): The input tensor.

        Returns:
            The Q-values, with the leading dimensions of ``x`` and a last
            dimension of size ``n_actions``.
        """
        full_shape = x.shape
        flat_inputs = tf.reshape(x, (-1, full_shape[-1]))
        flat_q = self.model(flat_inputs)
        target_shape = full_shape[:-1] + (self.n_actions, )
        return tf.reshape(flat_q, target_shape)
class DuelQhead(Module):
    """
    Dueling Q-value head with separate state-value and advantage streams.

    Both streams are built from ``mlp_block`` layers whose hidden widths are
    ``h // 2`` for each ``h`` in ``hidden_sizes``. The value stream ends in a
    single unit and the advantage stream in ``n_actions`` units; the final
    blocks of both streams are created without normalization, activation or
    a custom initializer.

    Args:
        state_dim (int): The input state dimension.
        n_actions (int): The number of discrete actions.
        hidden_sizes: List of hidden units for fully connected layers.
        normalize (Optional[ModuleType]): Normalization layer forwarded to each hidden ``mlp_block``.
        initialize (Optional[Callable[..., Tensor]]): The parameters initializer (hidden layers only).
        activation (Optional[ModuleType]): The activation function for each hidden layer.
    """

    def __init__(self,
                 state_dim: int,
                 n_actions: int,
                 hidden_sizes: Sequence[int],
                 normalize: Optional[tk.layers.Layer] = None,
                 initialize: Optional[tk.initializers.Initializer] = None,
                 activation: Optional[tk.layers.Layer] = None):
        super().__init__()

        def build_stream(output_units):
            # Build one stream: half-width hidden blocks plus a plain output block.
            stream = []
            shape = (state_dim,)
            for width in hidden_sizes:
                block, shape = mlp_block(shape[0], width // 2, normalize, activation, initialize)
                stream.extend(block)
            stream.extend(mlp_block(shape[0], output_units, None, None, None)[0])
            return stream

        # Layers are created value-stream first, then advantage-stream.
        value_layers = build_stream(1)
        advantage_layers = build_stream(n_actions)
        self.a_model = tk.Sequential(advantage_layers)
        self.v_model = tk.Sequential(value_layers)

    @tf.function
    def call(self, x: Union[Tensor, np.ndarray], **kwargs):
        """
        Returns the dueling Q-values.

        Parameters:
            x (Union[Tensor, np.ndarray]): The input tensor.

        Returns:
            q: The dueling Q-values, ``v + (a - mean(a))`` with the mean taken over the last axis.
        """
        state_value = self.v_model(x)
        advantage = self.a_model(x)
        # Centre the advantages over the action axis before adding the state value.
        advantage_mean = tf.reduce_mean(advantage, axis=-1, keepdims=True)
        return state_value + (advantage - advantage_mean)
class C51Qhead(Module):
    """
    Distributional Q-value head that outputs a probability vector over atoms for every action.

    Args:
        state_dim (int): The input state dimension.
        n_actions (int): The number of discrete actions.
        atom_num (int): The number of atoms.
        hidden_sizes: List of hidden units for fully connected layers.
        normalize (Optional[ModuleType]): Normalization layer forwarded to each hidden ``mlp_block``.
        initialize (Optional[Callable[..., Tensor]]): The parameters initializer.
        activation (Optional[ModuleType]): The activation function for each hidden layer.
    """

    def __init__(self,
                 state_dim: int,
                 n_actions: int,
                 atom_num: int,
                 hidden_sizes: Sequence[int],
                 normalize: Optional[tk.layers.Layer] = None,
                 initialize: Optional[tk.initializers.Initializer] = None,
                 activation: Optional[tk.layers.Layer] = None):
        super().__init__()
        self.action_dim = n_actions
        self.atom_num = atom_num

        stack = []
        current_shape = (state_dim,)
        for width in hidden_sizes:
            block, current_shape = mlp_block(current_shape[0], width, normalize, activation, initialize)
            stack.extend(block)
        # One logit per (action, atom) pair; reshaped in ``call``.
        output_block = mlp_block(current_shape[0], n_actions * atom_num, None, None, initialize)[0]
        stack.extend(output_block)
        self.model = tk.Sequential(stack)

    @tf.function
    def call(self, x: Union[Tensor, np.ndarray], **kwargs):
        """
        Returns the per-action distributions over atoms.

        Parameters:
            x (Union[Tensor, np.ndarray]): The input tensor.

        Returns:
            dist_probs: Softmax probabilities of shape ``[-1, action_dim, atom_num]``,
                normalized over the atom axis.
        """
        flat_logits = self.model(x)
        per_action_logits = tf.reshape(flat_logits, [-1, self.action_dim, self.atom_num])
        return tf.nn.softmax(per_action_logits, axis=-1)
class QRDQNhead(Module):
    """
    Quantile head for the QRDQN policy: outputs ``atom_num`` values for every action.

    Args:
        state_dim (int): The input state dimension.
        n_actions (int): The number of discrete actions.
        atom_num (int): The number of atoms (quantiles per action).
        hidden_sizes: List of hidden units for fully connected layers.
        normalize (Optional[ModuleType]): Normalization layer forwarded to each hidden ``mlp_block``.
        initialize (Optional[Callable[..., Tensor]]): The parameters initializer (hidden layers only).
        activation (Optional[ModuleType]): The activation function for each hidden layer.
    """

    def __init__(self,
                 state_dim: int,
                 n_actions: int,
                 atom_num: int,
                 hidden_sizes: Sequence[int],
                 normalize: Optional[tk.layers.Layer] = None,
                 initialize: Optional[tk.initializers.Initializer] = None,
                 activation: Optional[tk.layers.Layer] = None):
        super().__init__()
        self.action_dim = n_actions
        self.atom_num = atom_num

        stack = []
        current_shape = (state_dim,)
        for width in hidden_sizes:
            block, current_shape = mlp_block(current_shape[0], width, normalize, activation, initialize)
            stack.extend(block)
        # The output block is created without a custom initializer.
        output_block = mlp_block(current_shape[0], n_actions * atom_num, None, None, None)[0]
        stack.extend(output_block)
        self.model = tk.Sequential(stack)

    @tf.function
    def call(self, x: Union[Tensor, np.ndarray], **kwargs):
        """
        Returns the quantiles of the distribution.

        Parameters:
            x (Union[Tensor, np.ndarray]): The input tensor.

        Returns:
            quantiles: The network output reshaped to ``[-1, action_dim, atom_num]``.
        """
        flat_output = self.model(x)
        return tf.reshape(flat_output, [-1, self.action_dim, self.atom_num])
class BasicRecurrent(Module):
    """
    Build a recurrent neural network followed by a linear head to calculate Q values.

    Keyword Args:
        rnn (str): Either ``"GRU"`` or ``"LSTM"``.
        input_dim (int): Size of the last axis of the input.
        recurrent_hidden_size (int): Hidden size of the recurrent layer.
        recurrent_layer_N (int): Forwarded to ``gru_block`` / ``lstm_block``.
        dropout: Forwarded to ``gru_block`` / ``lstm_block``.
        initialize: Forwarded to ``gru_block`` / ``lstm_block``.
        action_dim (int): Number of outputs of the fully connected head.

    Raises:
        KeyError: If a required keyword argument is missing.
        ValueError: If ``rnn`` is neither ``"GRU"`` nor ``"LSTM"``.
    """

    def __init__(self, **kwargs):
        super(BasicRecurrent, self).__init__()
        self.lstm = False
        if kwargs["rnn"] == "GRU":
            output, _ = gru_block(kwargs["input_dim"],
                                  kwargs["recurrent_hidden_size"],
                                  kwargs["recurrent_layer_N"],
                                  kwargs["dropout"],
                                  kwargs["initialize"])
        elif kwargs["rnn"] == "LSTM":
            self.lstm = True
            output, _ = lstm_block(kwargs["input_dim"],
                                   kwargs["recurrent_hidden_size"],
                                   kwargs["recurrent_layer_N"],
                                   kwargs["dropout"],
                                   kwargs["initialize"])
        else:
            # Raising a plain string is itself a TypeError in Python 3;
            # raise a real exception that carries the intended message.
            raise ValueError("Unknown recurrent module!")
        self.rnn_layer = output
        fc_layer = mlp_block(kwargs["recurrent_hidden_size"], kwargs["action_dim"], None, None, None)[0]
        self.output_dim = kwargs["action_dim"]
        self.model = tk.Sequential(fc_layer)
        self.rnn_layer.build(input_shape=(None, None, kwargs["input_dim"]))

    @tf.function
    def call(self, x: Union[Tensor, np.ndarray], **kwargs):
        """
        Returns the rnn hidden states and Q-values via RNN networks.

        Parameters:
            x (Union[Tensor, np.ndarray]): The input sequence tensor
                (presumably ``[batch, time, input_dim]``, matching the
                ``build`` call in ``__init__`` - confirm against callers).

        Returns:
            ``(hn, cn, q)`` when the recurrent layer is an LSTM, otherwise
            ``(hn, q)``. ``q`` has the leading dimensions of the recurrent
            output and a last dimension of size ``output_dim``.
        """
        # ``self.lstm`` is a Python bool, so this branch is resolved at trace time.
        if self.lstm:
            rnn_output, hn, cn = self.rnn_layer(x)
        else:
            rnn_output, hn = self.rnn_layer(x)
            cn = None

        # Feed the *recurrent output* (not the raw input) to the FC head.
        rnn_output_shape = rnn_output.shape
        fc_input = tf.reshape(rnn_output, [-1, rnn_output_shape[-1]])
        fc_output = self.model(fc_input)
        q_values = tf.reshape(fc_output, rnn_output_shape[:-1] + (self.output_dim, ))

        if self.lstm:
            return hn, cn, q_values
        return hn, q_values
class ActorNet(Module):
    """
    The actor network for deterministic policy, which outputs activated continuous actions directly.

    Args:
        state_dim (int): The input state dimension.
        action_dim (int): The dimension of continuous action space.
        hidden_sizes (Sequence[int]): List of hidden units for fully connected layers.
        normalize (Optional[ModuleType]): Normalization layer forwarded to each hidden ``mlp_block``.
        initialize (Optional[Callable[..., Tensor]]): The parameters initializer.
        activation (Optional[ModuleType]): The activation function for each hidden layer.
        activation_action (Optional[ModuleType]): The activation of final layer to bound the actions.
    """

    def __init__(self,
                 state_dim: int,
                 action_dim: int,
                 hidden_sizes: Sequence[int],
                 normalize: Optional[ModuleType] = None,
                 initialize: Optional[Callable[..., Tensor]] = None,
                 activation: Optional[tk.layers.Layer] = None,
                 activation_action: Optional[tk.layers.Layer] = None):
        super(ActorNet, self).__init__()
        layers = []
        input_shape = (state_dim,)
        for h in hidden_sizes:
            mlp, input_shape = mlp_block(input_shape[0], h, normalize, activation, initialize)
            layers.extend(mlp)
        layers.extend(mlp_block(input_shape[0], action_dim, None, activation_action, initialize)[0])
        self.model = tk.Sequential(layers)

    @tf.function
    def call(self, x: Union[Tensor, np.ndarray], avail_actions: Optional[Tensor] = None, **kwargs):
        """
        Returns the output of the actor.

        Parameters:
            x (Union[Tensor, np.ndarray]): The input tensor.
            avail_actions (Optional[Tensor]): The actions mask values when use actions mask, default is None.
                Entries equal to 0 mark positions whose output is replaced by ``-1e10``.

        Returns:
            The network output, with masked positions set to ``-1e10`` when a mask is given.
        """
        logits = self.model(x)
        if avail_actions is not None:
            # TensorFlow tensors are immutable, so item assignment is not
            # available; select the masked value with tf.where instead.
            unavailable = tf.equal(tf.cast(avail_actions, logits.dtype), 0)
            masked_value = tf.constant(-1e10, dtype=logits.dtype)
            logits = tf.where(unavailable, masked_value, logits)
        return logits
class CategoricalActorNet(Module):
    """
    The actor network for categorical policy, which outputs logits over all discrete actions.

    Args:
        state_dim (int): The input state dimension.
        action_dim (int): The number of outputs (one logit per discrete action).
        hidden_sizes (Sequence[int]): List of hidden units for fully connected layers.
        normalize (Optional[ModuleType]): Normalization layer forwarded to each hidden ``mlp_block``.
        initialize (Optional[Callable[..., Tensor]]): The parameters initializer.
        activation (Optional[ModuleType]): The activation function for each hidden layer.
    """

    def __init__(self,
                 state_dim: int,
                 action_dim: int,
                 hidden_sizes: Sequence[int],
                 normalize: Optional[ModuleType] = None,
                 initialize: Optional[Callable[..., Tensor]] = None,
                 activation: Optional[ModuleType] = None):
        super(CategoricalActorNet, self).__init__()
        layers = []
        input_shape = (state_dim,)
        for h in hidden_sizes:
            mlp, input_shape = mlp_block(input_shape[0], h, normalize, activation, initialize)
            layers.extend(mlp)
        layers.extend(mlp_block(input_shape[0], action_dim, None, None, initialize)[0])
        self.model = tk.Sequential(layers)
        self.dist = CategoricalDistribution(action_dim)

    @tf.function
    def call(self, x: Union[Tensor, np.ndarray], avail_actions: Optional[Tensor] = None, **kwargs):
        """
        Returns the logits over all discrete actions.

        Parameters:
            x (Union[Tensor, np.ndarray]): The input tensor.
            avail_actions (Optional[Tensor]): The actions mask values when use actions mask, default is None.
                Entries equal to 0 mark actions whose logit is replaced by ``-1e10``.

        Returns:
            logits: The (optionally masked) logits. Use ``distribution(logits)``
                to obtain the categorical distribution.
        """
        logits = self.model(x)
        if avail_actions is not None:
            # TensorFlow tensors are immutable, so item assignment is not
            # available; select the masked value with tf.where instead.
            unavailable = tf.equal(tf.cast(avail_actions, logits.dtype), 0)
            masked_value = tf.constant(-1e10, dtype=logits.dtype)
            logits = tf.where(unavailable, masked_value, logits)
        return logits

    def distribution(self, logits: Tensor):
        """
        Load ``logits`` into the shared distribution object and return it.

        Parameters:
            logits (Tensor): The logits passed to ``self.dist.set_param``.

        Returns:
            self.dist: The ``CategoricalDistribution`` instance, updated in place.
        """
        self.dist.set_param(logits=logits)
        return self.dist
class CategoricalActorNet_SAC(CategoricalActorNet):
    """
    The categorical actor network used for SAC with discrete actions.

    This subclass adds no behaviour of its own; construction is delegated
    unchanged to ``CategoricalActorNet``.

    Args:
        state_dim (int): The input state dimension.
        action_dim (int): The number of outputs (one logit per discrete action).
        hidden_sizes (Sequence[int]): List of hidden units for fully connected layers.
        normalize (Optional[ModuleType]): Normalization layer forwarded to the parent class.
        initialize (Optional[Callable[..., Tensor]]): The parameters initializer.
        activation (Optional[ModuleType]): The activation function for each hidden layer.
    """

    def __init__(self,
                 state_dim: int,
                 action_dim: int,
                 hidden_sizes: Sequence[int],
                 normalize: Optional[ModuleType] = None,
                 initialize: Optional[Callable[..., Tensor]] = None,
                 activation: Optional[ModuleType] = None):
        # Forward every argument positionally, in the parent's parameter order.
        parent_args = (state_dim, action_dim, hidden_sizes, normalize, initialize, activation)
        super().__init__(*parent_args)
class GaussianActorNet(Module):
    """
    The actor network for Gaussian policy: a mean network plus a state-independent log-std weight.

    Args:
        state_dim (int): The input state dimension.
        action_dim (int): The dimension of continuous action space.
        hidden_sizes (Sequence[int]): List of hidden units for fully connected layers.
        normalize (Optional[ModuleType]): Normalization layer forwarded to each hidden ``mlp_block``.
        initialize (Optional[Callable[..., Tensor]]): The parameters initializer.
        activation (Optional[ModuleType]): The activation function for each hidden layer.
        activation_action (Optional[ModuleType]): The activation of final layer to bound the actions.
    """

    def __init__(self,
                 state_dim: int,
                 action_dim: int,
                 hidden_sizes: Sequence[int],
                 normalize: Optional[ModuleType] = None,
                 initialize: Optional[Callable[..., Tensor]] = None,
                 activation: Optional[ModuleType] = None,
                 activation_action: Optional[ModuleType] = None):
        super().__init__()
        stack = []
        current_shape = (state_dim,)
        for width in hidden_sizes:
            block, current_shape = mlp_block(current_shape[0], width, normalize, activation, initialize)
            stack.extend(block)
        mean_block = mlp_block(current_shape[0], action_dim, None, activation_action, initialize)[0]
        stack.extend(mean_block)
        self.mu = tk.Sequential(stack)
        # Trainable log standard deviation, one entry per action dimension, initialised to -1.
        log_std_init = tf.keras.initializers.Constant(-1.0)
        self.logstd = self.add_weight(name="log_of_std",
                                      shape=(action_dim,),
                                      initializer=log_std_init,
                                      trainable=True)
        self.dist = DiagGaussianDistribution(action_dim)

    @tf.function
    def call(self, x: Union[Tensor, np.ndarray], **kwargs):
        """
        Returns the parameters of the Gaussian over the continuous action space.

        Parameters:
            x (Union[Tensor, np.ndarray]): The input tensor.

        Returns:
            mu_: The mean produced by the mean network.
            std_: The standard deviation, ``exp(logstd)``.
        """
        mean = self.mu(x)
        std = tf.math.exp(self.logstd)
        return mean, std

    def distribution(self, mu: Tensor, std: Tensor):
        """
        Load ``mu`` and ``std`` into the shared distribution object and return it.

        Parameters:
            mu (Tensor): The mean passed to ``self.dist.set_param``.
            std (Tensor): The standard deviation passed to ``self.dist.set_param``.

        Returns:
            self.dist: The ``DiagGaussianDistribution`` instance, updated in place.
        """
        self.dist.set_param(mu=mu, std=std)
        return self.dist
class CriticNet(Module):
    """
    The critic network: a fully connected network with a single scalar output.

    Args:
        input_dim (int): The input dimension.
        hidden_sizes (Sequence[int]): List of hidden units for fully connected layers.
        normalize (Optional[ModuleType]): Normalization layer forwarded to each hidden ``mlp_block``.
        initialize (Optional[Callable[..., Tensor]]): The parameters initializer.
        activation (Optional[ModuleType]): The activation function for each hidden layer.
    """

    def __init__(self,
                 input_dim: int,
                 hidden_sizes: Sequence[int],
                 normalize: Optional[ModuleType] = None,
                 initialize: Optional[Callable[..., Tensor]] = None,
                 activation: Optional[tk.layers.Layer] = None):
        super().__init__()
        stack = []
        current_shape = (input_dim,)
        for width in hidden_sizes:
            block, current_shape = mlp_block(current_shape[0], width, normalize, activation, initialize)
            stack.extend(block)
        # Single-unit output block without normalization / activation.
        value_block = mlp_block(current_shape[0], 1, None, None, initialize)[0]
        stack.extend(value_block)
        self.model = tk.Sequential(stack)

    @tf.function
    def call(self, x: Union[Tensor, np.ndarray], **kwargs):
        """
        Returns the output of the critic network.

        Parameters:
            x (Union[Tensor, np.ndarray]): The input tensor.

        Returns:
            The network output, with a last dimension of size 1.
        """
        value = self.model(x)
        return value
class GaussianActorNet_SAC(Module):
    """
    The actor network for Gaussian policy in SAC, with state-dependent mean and log-std heads.

    A shared stack of hidden ``mlp_block`` layers feeds two separate dense
    heads, one for the mean and one for the log standard deviation.

    Args:
        state_dim (int): The input state dimension.
        action_dim (int): The dimension of continuous action space.
        hidden_sizes (Sequence[int]): List of hidden units for fully connected layers.
            ``hidden_sizes[-1]`` is read, so the sequence must be non-empty.
        normalize (Optional[ModuleType]): Normalization layer forwarded to each hidden ``mlp_block``.
        initialize (Optional[Callable[..., Tensor]]): The parameters initializer (hidden layers only).
        activation (Optional[ModuleType]): The activation function for each hidden layer.
        activation_action (Optional[ModuleType]): Passed to ``ActivatedDiagGaussianDistribution``.
    """

    def __init__(self,
                 state_dim: int,
                 action_dim: int,
                 hidden_sizes: Sequence[int],
                 normalize: Optional[ModuleType] = None,
                 initialize: Optional[Callable[..., Tensor]] = None,
                 activation: Optional[ModuleType] = None,
                 activation_action: Optional[ModuleType] = None):
        super().__init__()
        self.activation_action = activation_action

        trunk = []
        current_shape = (state_dim,)
        for width in hidden_sizes:
            block, current_shape = mlp_block(current_shape[0], width, normalize, activation, initialize)
            trunk.extend(block)
        self.out = tk.Sequential(trunk)

        # Both heads read the last hidden layer's features.
        head_input_shape = (hidden_sizes[-1],)
        self.mu = tk.layers.Dense(units=action_dim, activation=None, input_shape=head_input_shape)
        self.log_std = tk.layers.Dense(units=action_dim, activation=None, input_shape=head_input_shape)
        self.dist = ActivatedDiagGaussianDistribution(action_dim, activation_action)

    @tf.function
    def call(self, x: Union[Tensor, np.ndarray], **kwargs):
        """
        Returns the parameters of the Gaussian over the continuous action space.

        Parameters:
            x (Union[Tensor, np.ndarray]): The input tensor.

        Returns:
            mu_: The mean from the mean head.
            std_: The standard deviation, ``exp`` of the log-std head output clipped to ``[-20, 2]``.
        """
        features = self.out(x)
        mean = self.mu(features)
        raw_log_std = self.log_std(features)
        # Clip the log-std before exponentiating.
        clipped_log_std = tf.clip_by_value(raw_log_std, -20, 2)
        return mean, tf.exp(clipped_log_std)

    def distribution(self, mu: Tensor, std: Tensor):
        """
        Load ``mu`` and ``std`` into the shared distribution object and return it.

        Parameters:
            mu (Tensor): The mean passed to ``self.dist.set_param``.
            std (Tensor): The standard deviation passed to ``self.dist.set_param``.

        Returns:
            self.dist: The ``ActivatedDiagGaussianDistribution`` instance, updated in place.
        """
        self.dist.set_param(mu=mu, std=std)
        return self.dist
class VDN_mixer(Module):
    """
    The value decomposition networks mixer. (Additivity)

    The mixer has no parameters: the total value is the sum of the
    individual values along axis 1.
    """

    def __init__(self):
        super().__init__()

    @tf.function
    def call(self, values_n, states=None, **kwargs):
        """
        Returns the sum of ``values_n`` over axis 1.

        Parameters:
            values_n: The individual values (axis 1 is reduced).
            states: Unused; accepted for interface parity with the other mixers.
        """
        total = tf.math.reduce_sum(values_n, axis=1)
        return total
class QMIX_mixer(Module):
    """
    The QMIX mixer. (Monotonicity)

    Hyper-networks map the global state to the weights and biases of a
    two-layer mixing network; the weights pass through ``tf.abs`` so they
    are non-negative.

    Args:
        dim_state (int): The dimension of global state.
        dim_hidden (int): The size of each hidden layer of the mixing network.
        dim_hypernet_hidden (int): The size of each hidden layer for the hyper networks.
        n_agents (int): The number of agents.
    """

    def __init__(self, dim_state, dim_hidden, dim_hypernet_hidden, n_agents):
        super().__init__()
        self.dim_state = dim_state
        self.dim_hidden = dim_hidden
        self.dim_hypernet_hidden = dim_hypernet_hidden
        self.n_agents = n_agents

        def two_layer_hypernet(out_units):
            # state -> ReLU hidden layer -> linear layer with ``out_units`` outputs.
            return tk.Sequential([
                tk.layers.Dense(units=self.dim_hypernet_hidden,
                                activation=tk.layers.Activation('relu'),
                                input_shape=(self.dim_state,)),
                tk.layers.Dense(units=out_units,
                                input_shape=(self.dim_hypernet_hidden,)),
            ])

        # Weights of the first and second mixing layers.
        self.hyper_w_1 = two_layer_hypernet(self.dim_hidden * self.n_agents)
        self.hyper_w_2 = two_layer_hypernet(self.dim_hidden)
        # Bias of the first mixing layer is a single linear map of the state.
        self.hyper_b_1 = tk.layers.Dense(units=self.dim_hidden, input_shape=(self.dim_state,))
        # Bias of the second mixing layer uses a two-layer hyper network.
        self.hyper_b_2 = two_layer_hypernet(1)

    @tf.function
    def call(self, values_n, states=None, **kwargs):
        """
        Returns the total Q-values for multi-agent team.

        Parameters:
            values_n: The individual values for agents in team.
            states: The global states.

        Returns:
            q_tot: The total Q-values for the multi-agent team, shape ``[-1, 1]``.
        """
        flat_states = tf.reshape(states, [-1, self.dim_state])
        individual_qs = tf.reshape(values_n, [-1, 1, self.n_agents])

        # First mixing layer: non-negative weights, ELU activation.
        weights_1 = tf.reshape(tf.abs(self.hyper_w_1(flat_states)),
                               [-1, self.n_agents, self.dim_hidden])
        bias_1 = tf.reshape(self.hyper_b_1(flat_states), [-1, 1, self.dim_hidden])
        mixed_hidden = tf.nn.elu(tf.linalg.matmul(individual_qs, weights_1) + bias_1)

        # Second mixing layer: non-negative weights, linear output.
        weights_2 = tf.reshape(tf.abs(self.hyper_w_2(flat_states)),
                               [-1, self.dim_hidden, 1])
        bias_2 = tf.reshape(self.hyper_b_2(flat_states), [-1, 1, 1])
        mixed_out = tf.linalg.matmul(mixed_hidden, weights_2) + bias_2

        return tf.reshape(mixed_out, [-1, 1])
class QMIX_FF_mixer(Module):
    """
    The feedforward mixer without the constraints of monotonicity.

    Args:
        dim_state (int): The dimension of the global state.
        dim_hidden (int): The width of the hidden layers.
        n_agents (int): The number of agents.
    """

    def __init__(self,
                 dim_state: int = 0,
                 dim_hidden: int = 32,
                 n_agents: int = 1):
        super().__init__()
        self.dim_state = dim_state
        self.dim_hidden = dim_hidden
        self.n_agents = n_agents
        # The main network sees the agents' values concatenated with the state.
        self.dim_input = self.n_agents + self.dim_state

        main_layers = [
            tk.layers.Dense(self.dim_hidden, input_shape=(self.dim_input,)),
            tk.layers.ReLU(),
            tk.layers.Dense(self.dim_hidden, input_shape=(self.dim_hidden,)),
            tk.layers.ReLU(),
            tk.layers.Dense(self.dim_hidden, input_shape=(self.dim_hidden,)),
            tk.layers.ReLU(),
            tk.layers.Dense(1, input_shape=(self.dim_hidden,)),
        ]
        self.ff_net = tk.Sequential(main_layers)

        bias_layers = [
            tk.layers.Dense(self.dim_hidden, input_shape=(self.dim_state,)),
            tk.layers.ReLU(),
            tk.layers.Dense(1, input_shape=(self.dim_hidden,)),
        ]
        self.ff_net_bias = tk.Sequential(bias_layers)

    @tf.function
    def call(self, values_n, states=None, **kwargs):
        """
        Returns the feedforward total Q-values.

        Parameters:
            values_n: The individual Q-values.
            states: The global states.

        Returns:
            q_tot: The main network output plus the state-dependent bias, shape ``[-1, 1]``.
        """
        flat_states = tf.reshape(states, [-1, self.dim_state])
        individual_qs = tf.reshape(values_n, [-1, self.n_agents])
        mixer_inputs = tf.concat([individual_qs, flat_states], axis=-1)
        mixed = self.ff_net(mixer_inputs)
        state_bias = self.ff_net_bias(flat_states)
        return tf.reshape(mixed + state_bias, [-1, 1])
class QTRAN_base(Module):
    """
    The basic QTRAN module.

    Args:
        dim_state (int): The dimension of the global state.
        action_space (Dict[str, Discrete]): The action space for all agents.
            Its values are iterated in ``__init__``, so it must be provided.
        dim_hidden (int): The dimension of the hidden layers.
        n_agents (int): The number of agents.
        dim_utility_hidden (int): The dimension of the utility hidden states.
        use_parameter_sharing (bool): Whether to use parameters sharing trick.
    """

    def __init__(self,
                 dim_state: int = 0,
                 action_space: Dict[str, Discrete] = None,
                 dim_hidden: int = 32,
                 n_agents: int = 1,
                 dim_utility_hidden: int = 1,
                 use_parameter_sharing: bool = False):
        super().__init__()
        self.dim_state = dim_state
        self.action_space = action_space
        self.n_actions_list = [space.n for space in action_space.values()]
        self.n_actions_max = max(self.n_actions_list)
        self.dim_hidden = dim_hidden
        self.n_agents = n_agents
        self.use_parameter_sharing = use_parameter_sharing

        # Joint Q input: state + encoded (hidden state, one-hot action).
        self.dim_q_input = self.dim_state + dim_utility_hidden + self.n_actions_max
        self.dim_v_input = self.dim_state

        q_layers = [
            tk.layers.Dense(self.dim_hidden, input_shape=(self.dim_q_input,)),
            tk.layers.ReLU(),
            tk.layers.Dense(self.dim_hidden, input_shape=(self.dim_hidden,)),
            tk.layers.ReLU(),
            tk.layers.Dense(1, input_shape=(self.dim_hidden,)),
        ]
        self.Q_jt = tf.keras.Sequential(q_layers)

        v_layers = [
            tk.layers.Dense(self.dim_hidden, input_shape=(self.dim_v_input,)),
            tk.layers.ReLU(),
            tk.layers.Dense(self.dim_hidden, input_shape=(self.dim_hidden,)),
            tk.layers.ReLU(),
            tk.layers.Dense(1, input_shape=(self.dim_hidden,)),
        ]
        self.V_jt = tf.keras.Sequential(v_layers)

        # The encoder keeps the (hidden state + action) width unchanged.
        self.dim_ae_input = dim_utility_hidden + self.n_actions_max
        encoder_layers = [
            tk.layers.Dense(self.dim_ae_input, input_shape=(self.dim_ae_input,)),
            tk.layers.ReLU(),
            tk.layers.Dense(self.dim_ae_input, input_shape=(self.dim_ae_input,)),
        ]
        self.action_encoding = tf.keras.Sequential(encoder_layers)

    @tf.function
    def call(self, states: Tensor, hidden_state_inputs: Tensor, actions_onehot: Tensor, **kwargs):
        """
        Calculating the joint Q and V values.

        Parameters:
            states (Tensor): The global states.
            hidden_state_inputs (Tensor): The joint hidden states inputs for QTRAN network.
            actions_onehot (Tensor): The joint onehot actions for QTRAN network.

        Returns:
            q_jt (Tensor): The evaluated joint Q values.
            v_jt (Tensor): The evaluated joint V values.
        """
        joint_input = tf.concat([hidden_state_inputs, actions_onehot], axis=-1)
        joint_shape = joint_input.shape

        # Encode every (hidden state, action) vector independently.
        flat_input = tf.reshape(joint_input, (-1, joint_shape[-1]))
        flat_encoding = self.action_encoding(flat_input)
        encoding = tf.reshape(flat_encoding, joint_shape[:-1] + [self.dim_ae_input, ])

        # Regroup by agent and sum the encodings across agents (axis 1).
        per_agent_encoding = tf.reshape(encoding, [-1, self.n_agents, self.dim_ae_input])
        summed_encoding = tf.reduce_sum(per_agent_encoding, axis=1)

        q_inputs = tf.concat([states, summed_encoding], axis=-1)
        q_jt = self.Q_jt(q_inputs)
        v_jt = self.V_jt(states)
        return q_jt, v_jt
class QTRAN_alt(Module):
    """
    The QTRAN-alt module, which outputs one row of joint Q values per agent.

    Compared with ``QTRAN_base`` in this file, the joint Q network also
    receives a one-hot agent id and outputs ``n_actions_max`` values.

    Parameters:
        dim_state (int): The dimension of the global state.
        action_space (Dict[str, Discrete]): The action space for all agents.
            Its values are iterated in ``__init__``, so it must be provided.
        dim_hidden (int): The dimension of the hidden layers.
        n_agents (int): The number of agents.
        dim_utility_hidden (int): The dimension of the utility hidden states.
        use_parameter_sharing (bool): Whether to use parameters sharing trick.
    """

    def __init__(self,
                 dim_state: int = 0,
                 action_space: Dict[str, Discrete] = None,
                 dim_hidden: int = 32,
                 n_agents: int = 1,
                 dim_utility_hidden: int = 1,
                 use_parameter_sharing: bool = False):
        super().__init__()
        self.dim_state = dim_state
        self.action_space = action_space
        self.n_actions_list = [space.n for space in action_space.values()]
        self.n_actions_max = max(self.n_actions_list)
        self.dim_hidden = dim_hidden
        self.n_agents = n_agents
        self.use_parameter_sharing = use_parameter_sharing

        # Joint Q input: state + encoded (hidden state, action) + one-hot agent id.
        self.dim_q_input = self.dim_state + dim_utility_hidden + self.n_actions_max + self.n_agents
        self.dim_v_input = self.dim_state

        q_layers = [
            tk.layers.Dense(self.dim_hidden, input_shape=(self.dim_q_input,)),
            tk.layers.ReLU(),
            tk.layers.Dense(self.dim_hidden),
            tk.layers.ReLU(),
            tk.layers.Dense(self.n_actions_max),
        ]
        self.Q_jt = tf.keras.Sequential(q_layers)

        v_layers = [
            tk.layers.Dense(self.dim_hidden, input_shape=(self.dim_v_input,)),
            tk.layers.ReLU(),
            tk.layers.Dense(self.dim_hidden),
            tk.layers.ReLU(),
            tk.layers.Dense(1),
        ]
        self.V_jt = tf.keras.Sequential(v_layers)

        self.dim_ae_input = dim_utility_hidden + self.n_actions_max
        encoder_layers = [
            tk.layers.Dense(self.dim_ae_input, input_shape=(self.dim_ae_input,)),
            tk.layers.ReLU(),
            tk.layers.Dense(self.dim_ae_input),
        ]
        self.action_encoding = tf.keras.Sequential(encoder_layers)

    @tf.function
    def call(self, states: Tensor, hidden_state_inputs: Tensor, actions_onehot: Tensor, **kwargs):
        """
        Calculating the joint Q and V values.

        Parameters:
            states (Tensor): The global states.
            hidden_state_inputs (Tensor): The joint hidden states inputs for QTRAN network.
            actions_onehot (Tensor): The joint onehot actions for QTRAN network.

        Returns:
            q_jt (Tensor): The evaluated joint Q values (one row per agent).
            v_jt (Tensor): The evaluated joint V values.
        """
        joint_input = tf.concat([hidden_state_inputs, actions_onehot], axis=-1)
        joint_shape = joint_input.shape

        # Encode every (hidden state, action) vector independently.
        flat_input = tf.reshape(joint_input, (-1, joint_shape[-1]))
        flat_encoding = self.action_encoding(flat_input)
        encoding = tf.reshape(flat_encoding, joint_shape[:-1] + [self.dim_ae_input, ])

        # The static batch size is read from the input shape.
        batch_size = joint_shape[0]
        encoding_dim = self.dim_ae_input

        identity = tf.eye(self.n_agents, dtype=tf.float32)
        off_diagonal = 1.0 - identity
        tiled_agent_ids = tf.tile(identity[tf.newaxis, :, :],
                                  [batch_size, 1, 1])  # [bs, n_agents, n_agents]
        tiled_masks = tf.tile(off_diagonal[tf.newaxis, :, :, tf.newaxis],
                              [batch_size, 1, 1, encoding_dim])
        tiled_encoding = tf.tile(encoding[:, :, tf.newaxis, :],
                                 [1, 1, self.n_agents, 1])

        # Zero the diagonal (i == j) entries, then reduce over axis 2.
        # NOTE(review): the encoding is tiled along axis 2, so this sum equals
        # (n_agents - 1) * encoding[:, i] rather than a sum over the *other*
        # agents' encodings - confirm this is the intended computation.
        masked_encoding = tiled_encoding * tiled_masks
        reduced_encoding = tf.reduce_sum(masked_encoding, axis=2)

        tiled_states = tf.tile(states[:, None, :], [1, self.n_agents, 1])
        q_inputs = tf.concat([tiled_states, reduced_encoding, tiled_agent_ids], axis=-1)
        q_inputs_shape = q_inputs.shape

        flat_q_inputs = tf.reshape(q_inputs, (-1, q_inputs_shape[-1]))
        flat_q = self.Q_jt(flat_q_inputs)
        q_jt = tf.reshape(flat_q, q_inputs_shape[:-1] + [flat_q.shape[-1], ])

        v_jt = self.V_jt(states)
        return q_jt, v_jt