diff --git a/xuance/mindspore/agents/multi_agent_rl/isac_agents.py b/xuance/mindspore/agents/multi_agent_rl/isac_agents.py index 0380ef45..46e46dc7 100644 --- a/xuance/mindspore/agents/multi_agent_rl/isac_agents.py +++ b/xuance/mindspore/agents/multi_agent_rl/isac_agents.py @@ -81,11 +81,11 @@ def action(self, if self.use_parameter_sharing: key = self.model_keys[0] - actions[key] = tf.reshape(actions[key], [batch_size, self.n_agents, -1]).numpy() + actions[key] = actions[key].reshape([batch_size, self.n_agents, -1]).asnumpy() actions_dict = [{k: actions[key][e, i] for i, k in enumerate(self.agent_keys)} for e in range(batch_size)] else: for key in self.agent_keys: - actions[key] = tf.reshape(actions[key], [batch_size, -1]).numpy() + actions[key] = actions[key].reshape([batch_size, -1]).asnumpy() actions_dict = [{k: actions[k][i] for k in self.agent_keys} for i in range(batch_size)] return {"hidden_state": hidden_state, "actions": actions_dict} diff --git a/xuance/mindspore/learners/multi_agent_rl/isac_learner.py b/xuance/mindspore/learners/multi_agent_rl/isac_learner.py index 2badb0ce..d6ea1e4e 100644 --- a/xuance/mindspore/learners/multi_agent_rl/isac_learner.py +++ b/xuance/mindspore/learners/multi_agent_rl/isac_learner.py @@ -1,101 +1,148 @@ """ Independent Soft Actor-critic (ISAC) -Implementation: Pytorch -Creator: Kun Jiang (kjiang@seu.edu.cn) +Implementation: MindSpore """ -from xuance.mindspore import ms, Module, Tensor, optim +from mindspore.nn import MSELoss +from xuance.mindspore import ms, Module, Tensor, optim, ops from xuance.mindspore.learners import LearnerMAS +from xuance.mindspore.utils import clip_grads from xuance.common import List from argparse import Namespace class ISAC_Learner(LearnerMAS): - class ActorNetWithLossCell(Module): - def __init__(self, backbone, n_agents, alpha): - super(ISAC_Learner.ActorNetWithLossCell, self).__init__() - self._backbone = backbone - self.n_agents = n_agents - self.alpha = alpha - - def construct(self, bs, o, ids, agt_mask): - _, actions_dist_mu = self._backbone(o, ids) - actions_eval = self._backbone.actor_net.sample(actions_dist_mu) - log_pi_a = self._backbone.actor_net.log_prob(actions_eval, actions_dist_mu) - log_pi_a = ms.ops.expand_dims(log_pi_a, axis=-1) - loss_a = -(self._backbone.critic_for_train(o, actions_eval, ids) - self.alpha * log_pi_a * agt_mask).sum() / agt_mask.sum() - return loss_a - - class CriticNetWithLossCell(Module): - def __init__(self, backbone): - super(ISAC_Learner.CriticNetWithLossCell, self).__init__() - self._backbone = backbone - - def construct(self, o, acts, ids, agt_mask, tar_q): - q_eval = self._backbone.critic_for_train(o, acts, ids) - td_error = (q_eval - tar_q) * agt_mask - loss_c = (td_error ** 2).sum() / agt_mask.sum() - return loss_c - def __init__(self, config: Namespace, model_keys: List[str], agent_keys: List[str], policy: Module): - self.gamma = gamma - self.tau = config.tau - self.alpha = config.alpha - self.sync_frequency = sync_frequency - self.mse_loss = nn.MSELoss() super(ISAC_Learner, self).__init__(config, model_keys, agent_keys, policy) self.optimizer = { - 'actor': optimizer[0], - 'critic': optimizer[1] - } + key: { + 'actor': optim.Adam(params=self.policy.parameters_actor[key], lr=self.config.learning_rate_actor, + eps=1e-5), + 'critic': optim.Adam(params=self.policy.parameters_critic[key], lr=self.config.learning_rate_critic, + eps=1e-5)} + for key in self.model_keys} self.scheduler = { - 'actor': scheduler[0], - 'critic': scheduler[1] - } - # define mindspore trainers - 
self.actor_loss_net = self.ActorNetWithLossCell(policy, self.n_agents, self.alpha) - self.actor_train = nn.TrainOneStepCell(self.actor_loss_net, self.optimizer['actor']) - self.actor_train.set_train() - self.critic_loss_net = self.CriticNetWithLossCell(policy) - self.critic_train = nn.TrainOneStepCell(self.critic_loss_net, self.optimizer['critic']) - self.critic_train.set_train() + key: {'actor': optim.lr_scheduler.LinearLR(self.optimizer[key]['actor'], start_factor=1.0, + end_factor=0.5, total_iters=self.config.running_steps), + 'critic': optim.lr_scheduler.LinearLR(self.optimizer[key]['critic'], start_factor=1.0, + end_factor=0.5, total_iters=self.config.running_steps)} + for key in self.model_keys} + self.gamma = config.gamma + self.tau = config.tau + self.alpha = {key: config.alpha for key in self.model_keys} + self.mse_loss = MSELoss() + self._ones = ops.Ones() + self.use_automatic_entropy_tuning = config.use_automatic_entropy_tuning + if self.use_automatic_entropy_tuning: + self.target_entropy = {key: -policy.action_space[key].shape[-1] for key in self.model_keys} + self.log_alpha = {key: ms.Parameter(self._ones(1, ms.float32)) for key in self.model_keys} + self.alpha = {key: ops.exp(self.log_alpha[key]) for key in self.model_keys} + self.alpha_optimizer = {key: optim.Adam(params=[self.log_alpha[key]], lr=config.learning_rate_actor) + for key in self.model_keys} + # Get gradient function + self.grad_fn_alpha = {key: ms.value_and_grad(self.forward_fn_alpha, None, + self.alpha_optimizer[key].parameters, has_aux=True) + for key in self.model_keys} + # Get gradient function + self.grad_fn_actor = {key: ms.value_and_grad(self.forward_fn_actor, None, + self.optimizer[key]['actor'].parameters, has_aux=True) + for key in self.model_keys} + self.grad_fn_critic = {key: ms.value_and_grad(self.forward_fn_critic, None, + self.optimizer[key]['critic'].parameters, has_aux=True) + for key in self.model_keys} + + def forward_fn_alpha(self, log_pi_eval_i, key): + alpha_loss = -(self.log_alpha[key] * ops.stop_gradient((log_pi_eval_i + self.target_entropy[key]))).mean() + return alpha_loss, self.log_alpha[key] + + def forward_fn_actor(self, obs, ids, mask_values, agent_key): + _, actions_eval, log_pi_eval = self.policy(observation=obs, agent_ids=ids) + _, _, policy_q_1, policy_q_2 = self.policy.Qpolicy(observation=obs, actions=actions_eval, agent_ids=ids, + agent_key=agent_key) + log_pi_eval_i = log_pi_eval[agent_key].reshape(-1) + policy_q = ops.minimum(policy_q_1[agent_key], policy_q_2[agent_key]).reshape(-1) + loss_a = ((self.alpha[agent_key] * log_pi_eval_i - policy_q) * mask_values).sum() / mask_values.sum() + return loss_a, log_pi_eval[agent_key], policy_q + + def forward_fn_critic(self, obs, actions, ids, mask_values, backup, agent_key): + _, _, action_q_1, action_q_2 = self.policy.Qaction(observation=obs, actions=actions, agent_ids=ids) + action_q_1_i, action_q_2_i = action_q_1[agent_key].reshape(-1), action_q_2[agent_key].reshape(-1) + td_error_1, td_error_2 = action_q_1_i - ops.stop_gradient(backup), action_q_2_i - ops.stop_gradient(backup) + td_error_1 *= mask_values + td_error_2 *= mask_values + loss_c = ((td_error_1 ** 2).sum() + (td_error_2 ** 2).sum()) / mask_values.sum() + return loss_c, action_q_1_i, action_q_2_i def update(self, sample): self.iterations += 1 - obs = Tensor(sample['obs']) - actions = Tensor(sample['actions']) - obs_next = Tensor(sample['obs_next']) - rewards = Tensor(sample['rewards']) - terminals = Tensor(sample['terminals']).view(-1, self.n_agents, 1) - agent_mask = 
Tensor(sample['agent_mask']).view(-1, self.n_agents, 1) - batch_size = obs.shape[0] - IDs = ops.broadcast_to(self.expand_dims(self.eye(self.n_agents, self.n_agents, ms.float32), 0), - (batch_size, -1, -1)) + info = {} - actions_next_dist_mu = self.policy.target_actor(obs_next, IDs) - actions_next = self.policy.target_actor_net.sample(actions_next_dist_mu) - log_pi_a_next = self.policy.target_actor_net.log_prob(actions_next, actions_next_dist_mu) - q_next = self.policy.target_critic(obs_next, actions_next, IDs) - log_pi_a_next = ms.ops.expand_dims(log_pi_a_next, axis=-1) - q_target = rewards + (1-terminals) * self.args.gamma * (q_next - self.alpha * log_pi_a_next) + # Prepare training data. + sample_Tensor = self.build_training_data(sample, + use_parameter_sharing=self.use_parameter_sharing, + use_actions_mask=False) + batch_size = sample_Tensor['batch_size'] + obs = sample_Tensor['obs'] + actions = sample_Tensor['actions'] + obs_next = sample_Tensor['obs_next'] + rewards = sample_Tensor['rewards'] + terminals = sample_Tensor['terminals'] + agent_mask = sample_Tensor['agent_mask'] + IDs = sample_Tensor['agent_ids'] + if self.use_parameter_sharing: + key = self.model_keys[0] + bs = batch_size * self.n_agents + rewards[key] = rewards[key].reshape(batch_size * self.n_agents) + terminals[key] = terminals[key].reshape(batch_size * self.n_agents) + else: + bs = batch_size - # calculate the loss function - loss_a = self.actor_train(batch_size, obs, IDs, agent_mask) - loss_c = self.critic_train(obs, actions, IDs, agent_mask, q_target) + # feedforward - self.policy.soft_update(self.tau) + _, actions_next, log_pi_next = self.policy(observation=obs_next, agent_ids=IDs) + _, _, next_q = self.policy.Qtarget(next_observation=obs_next, next_actions=actions_next, agent_ids=IDs) + + for key in self.model_keys: + mask_values = agent_mask[key] + # update critic + log_pi_next_eval = log_pi_next[key].reshape(bs) + next_q_i = next_q[key].reshape(bs) + target_value = next_q_i - self.alpha[key] * log_pi_next_eval + backup = rewards[key] + (1 - terminals[key]) * self.gamma * target_value + (loss_c, _, _), grads_critic = self.grad_fn_critic[key](obs, actions, IDs, mask_values, backup, key) + if self.use_grad_clip: + grads_critic = clip_grads(grads_critic, Tensor(-self.grad_clip_norm), Tensor(self.grad_clip_norm)) + self.optimizer[key]['critic'](grads_critic) - learning_rate_actor = self.scheduler['actor'](self.iterations).asnumpy() - learning_rate_critic = self.scheduler['critic'](self.iterations).asnumpy() + # update actor + (loss_a, log_pi_eval_i, policy_q), grads_actor = self.grad_fn_actor[key](obs, IDs, mask_values, key) + if self.use_grad_clip: + grads_actor = clip_grads(grads_actor, Tensor(-self.grad_clip_norm), Tensor(self.grad_clip_norm)) + self.optimizer[key]['actor'](grads_actor) - info = { - "learning_rate_actor": learning_rate_actor, - "learning_rate_critic": learning_rate_critic, - "loss_actor": loss_a.asnumpy(), - "loss_critic": loss_c.asnumpy() - } + # automatic entropy tuning + if self.use_automatic_entropy_tuning: + (alpha_loss, _), grads_alpha = self.grad_fn_alpha[key](log_pi_eval_i, key) + self.alpha_optimizer[key](grads_alpha) + self.alpha[key] = ops.exp(self.log_alpha[key]) + else: + alpha_loss = 0 + learning_rate_actor = self.scheduler[key]['actor'].get_last_lr()[0] + learning_rate_critic = self.scheduler[key]['critic'].get_last_lr()[0] + + info.update({ + f"{key}/learning_rate_actor": learning_rate_actor.asnumpy(), + f"{key}/learning_rate_critic": learning_rate_critic.asnumpy(), + 
f"{key}/loss_actor": loss_a.asnumpy(), + f"{key}/loss_critic": loss_c.asnumpy(), + f"{key}/predictQ": policy_q.mean().asnumpy(), + f"{key}/alpha_loss": alpha_loss.asnumpy(), + f"{key}/alpha": self.alpha[key].asnumpy(), + }) + + self.policy.soft_update(self.tau) return info diff --git a/xuance/mindspore/learners/multi_agent_rl/masac_learner.py b/xuance/mindspore/learners/multi_agent_rl/masac_learner.py index e1a4465b..df0a7642 100644 --- a/xuance/mindspore/learners/multi_agent_rl/masac_learner.py +++ b/xuance/mindspore/learners/multi_agent_rl/masac_learner.py @@ -1,101 +1,171 @@ """ Multi-agent Soft Actor-critic (MASAC) -Implementation: Pytorch -Creator: Kun Jiang (kjiang@seu.edu.cn) +Implementation: MindSpore """ -from xuance.mindspore import ms, Module, Tensor, optim +from mindspore.nn import MSELoss +from xuance.mindspore import ms, Module, Tensor, optim, ops from xuance.mindspore.learners import LearnerMAS +from xuance.mindspore.utils import clip_grads from xuance.common import List from argparse import Namespace +from operator import itemgetter class MASAC_Learner(LearnerMAS): - class ActorNetWithLossCell(Module): - def __init__(self, backbone, n_agents, alpha): - super(MASAC_Learner.ActorNetWithLossCell, self).__init__() - self._backbone = backbone - self.n_agents = n_agents - self.alpha = alpha - - def construct(self, bs, o, ids, agt_mask): - _, actions_dist_mu = self._backbone(o, ids) - actions_eval = self._backbone.actor_net.sample(actions_dist_mu) - log_pi_a = self._backbone.actor_net.log_prob(actions_eval, actions_dist_mu) - log_pi_a = ms.ops.expand_dims(log_pi_a, axis=-1) - loss_a = -(self._backbone.critic_for_train(o, actions_eval, ids) - self.alpha * log_pi_a * agt_mask).sum() / agt_mask.sum() - return loss_a - - class CriticNetWithLossCell(Module): - def __init__(self, backbone): - super(MASAC_Learner.CriticNetWithLossCell, self).__init__() - self._backbone = backbone - - def construct(self, o, acts, ids, agt_mask, tar_q): - q_eval = self._backbone.critic_for_train(o, acts, ids) - td_error = (q_eval - tar_q) * agt_mask - loss_c = (td_error ** 2).sum() / agt_mask.sum() - return loss_c - def __init__(self, config: Namespace, model_keys: List[str], agent_keys: List[str], policy: Module): - self.gamma = gamma - self.tau = config.tau - self.alpha = config.alpha - self.sync_frequency = sync_frequency - self.mse_loss = nn.MSELoss() super(MASAC_Learner, self).__init__(config, model_keys, agent_keys, policy) self.optimizer = { - 'actor': optimizer[0], - 'critic': optimizer[1] - } + key: { + 'actor': optim.Adam(params=self.policy.parameters_actor[key], lr=self.config.learning_rate_actor, + eps=1e-5), + 'critic': optim.Adam(params=self.policy.parameters_critic[key], lr=self.config.learning_rate_critic, + eps=1e-5)} + for key in self.model_keys} self.scheduler = { - 'actor': scheduler[0], - 'critic': scheduler[1] - } - # define mindspore trainers - self.actor_loss_net = self.ActorNetWithLossCell(policy, self.n_agents, self.alpha) - self.actor_train = nn.TrainOneStepCell(self.actor_loss_net, self.optimizer['actor']) - self.actor_train.set_train() - self.critic_loss_net = self.CriticNetWithLossCell(policy) - self.critic_train = nn.TrainOneStepCell(self.critic_loss_net, self.optimizer['critic']) - self.critic_train.set_train() + key: {'actor': optim.lr_scheduler.LinearLR(self.optimizer[key]['actor'], start_factor=1.0, + end_factor=0.5, total_iters=self.config.running_steps), + 'critic': optim.lr_scheduler.LinearLR(self.optimizer[key]['critic'], start_factor=1.0, + end_factor=0.5, 
total_iters=self.config.running_steps)} + for key in self.model_keys} + self.gamma = config.gamma + self.tau = config.tau + self.alpha = {key: config.alpha for key in self.model_keys} + self.mse_loss = MSELoss() + self._ones = ops.Ones() + self.use_automatic_entropy_tuning = config.use_automatic_entropy_tuning + if self.use_automatic_entropy_tuning: + self.target_entropy = {key: -policy.action_space[key].shape[-1] for key in self.model_keys} + self.log_alpha = {key: ms.Parameter(self._ones(1, ms.float32)) for key in self.model_keys} + self.alpha = {key: ops.exp(self.log_alpha[key]) for key in self.model_keys} + self.alpha_optimizer = {key: optim.Adam(params=[self.log_alpha[key]], lr=config.learning_rate_actor) + for key in self.model_keys} + # Get gradient function + self.grad_fn_alpha = {key: ms.value_and_grad(self.forward_fn_alpha, None, + self.alpha_optimizer[key].parameters, has_aux=True) + for key in self.model_keys} + # Get gradient function + self.grad_fn_actor = {key: ms.value_and_grad(self.forward_fn_actor, None, + self.optimizer[key]['actor'].parameters, has_aux=True) + for key in self.model_keys} + self.grad_fn_critic = {key: ms.value_and_grad(self.forward_fn_critic, None, + self.optimizer[key]['critic'].parameters, has_aux=True) + for key in self.model_keys} + + def forward_fn_alpha(self, log_pi_eval_i, key): + alpha_loss = -(self.log_alpha[key] * ops.stop_gradient((log_pi_eval_i + self.target_entropy[key]))).mean() + return alpha_loss, self.log_alpha[key] + + def forward_fn_actor(self, batch_size, obs, obs_joint, ids, mask_values, agent_key): + _, actions_eval, log_pi_eval = self.policy(observation=obs, agent_ids=ids) + if self.use_parameter_sharing: + actions_eval_joint = actions_eval[agent_key].reshape(batch_size, self.n_agents, -1).reshape(batch_size, -1) + else: + actions_eval_detach_others = {k: actions_eval[k] if k == agent_key else ops.stop_gradient(actions_eval[k]) + for k in self.model_keys} + actions_eval_joint = ops.cat(itemgetter(*self.model_keys)(actions_eval_detach_others), + axis=-1).reshape(batch_size, -1) + _, _, policy_q_1, policy_q_2 = self.policy.Qpolicy(joint_observation=obs_joint, + joint_actions=actions_eval_joint, + agent_ids=ids, agent_key=agent_key) + log_pi_eval_i = log_pi_eval[agent_key].reshape(-1) + policy_q = ops.minimum(policy_q_1[agent_key], policy_q_2[agent_key]).reshape(-1) + loss_a = ((self.alpha[agent_key] * log_pi_eval_i - policy_q) * mask_values).sum() / mask_values.sum() + return loss_a, log_pi_eval[agent_key], policy_q + + def forward_fn_critic(self, obs_joint, actions_joint, ids, mask_values, backup, agent_key): + _, _, action_q_1, action_q_2 = self.policy.Qaction(joint_observation=obs_joint, joint_actions=actions_joint, + agent_ids=ids) + action_q_1_i = action_q_1[agent_key].reshape(-1) + action_q_2_i = action_q_2[agent_key].reshape(-1) + td_error_1, td_error_2 = action_q_1_i - ops.stop_gradient(backup), action_q_2_i - ops.stop_gradient(backup) + td_error_1 *= mask_values + td_error_2 *= mask_values + loss_c = ((td_error_1 ** 2).sum() + (td_error_2 ** 2).sum()) / mask_values.sum() + return loss_c, action_q_1_i, action_q_2_i def update(self, sample): self.iterations += 1 - obs = Tensor(sample['obs']) - actions = Tensor(sample['actions']) - obs_next = Tensor(sample['obs_next']) - rewards = Tensor(sample['rewards']) - terminals = Tensor(sample['terminals']).view(-1, self.n_agents, 1) - agent_mask = Tensor(sample['agent_mask']).view(-1, self.n_agents, 1) - batch_size = obs.shape[0] - IDs = 
ops.broadcast_to(self.expand_dims(self.eye(self.n_agents, self.n_agents, ms.float32), 0), - (batch_size, -1, -1)) + info = {} - actions_next_dist_mu = self.policy.target_actor(obs_next, IDs) - actions_next = self.policy.target_actor_net.sample(actions_next_dist_mu) - log_pi_a_next = self.policy.target_actor_net.log_prob(actions_next, actions_next_dist_mu) - q_next = self.policy.target_critic(obs_next, actions_next, IDs) - log_pi_a_next = ms.ops.expand_dims(log_pi_a_next, axis=-1) - q_target = rewards + (1-terminals) * self.args.gamma * (q_next - self.alpha * log_pi_a_next) + # Prepare training data. + sample_Tensor = self.build_training_data(sample, + use_parameter_sharing=self.use_parameter_sharing, + use_actions_mask=False) + batch_size = sample_Tensor['batch_size'] + obs = sample_Tensor['obs'] + actions = sample_Tensor['actions'] + obs_next = sample_Tensor['obs_next'] + rewards = sample_Tensor['rewards'] + terminals = sample_Tensor['terminals'] + agent_mask = sample_Tensor['agent_mask'] + IDs = sample_Tensor['agent_ids'] + if self.use_parameter_sharing: + key = self.model_keys[0] + bs = batch_size * self.n_agents + obs_joint = obs[key].reshape(batch_size, -1) + next_obs_joint = obs_next[key].reshape(batch_size, -1) + actions_joint = actions[key].reshape(batch_size, -1) + rewards[key] = rewards[key].reshape(batch_size * self.n_agents) + terminals[key] = terminals[key].reshape(batch_size * self.n_agents) + else: + bs = batch_size + obs_joint = ops.cat(itemgetter(*self.agent_keys)(obs), axis=-1).reshape(batch_size, -1) + next_obs_joint = ops.cat(itemgetter(*self.agent_keys)(obs_next), axis=-1).reshape(batch_size, -1) + actions_joint = ops.cat(itemgetter(*self.agent_keys)(actions), axis=-1).reshape(batch_size, -1) - # calculate the loss function - loss_a = self.actor_train(batch_size, obs, IDs, agent_mask) - loss_c = self.critic_train(obs, actions, IDs, agent_mask, q_target) + # train the model + _, actions_next, log_pi_next = self.policy(observation=obs_next, agent_ids=IDs) + if self.use_parameter_sharing: + key = self.model_keys[0] + actions_next_joint = actions_next[key].reshape(batch_size, self.n_agents, -1).reshape(batch_size, -1) + else: + actions_next_joint = ops.cat(itemgetter(*self.model_keys)(actions_next), -1).reshape(batch_size, -1) - self.policy.soft_update(self.tau) + _, _, target_q = self.policy.Qtarget(joint_observation=next_obs_joint, joint_actions=actions_next_joint, + agent_ids=IDs) + for key in self.model_keys: + mask_values = agent_mask[key] + # critic update + log_pi_next_eval = log_pi_next[key].reshape(bs) + target_value = target_q[key].reshape(bs) - self.alpha[key] * log_pi_next_eval + backup = rewards[key] + (1 - terminals[key]) * self.gamma * target_value + (loss_c, _, _), grads_critic = self.grad_fn_critic[key](obs_joint, actions_joint, IDs, mask_values, backup, + key) + if self.use_grad_clip: + grads_critic = clip_grads(grads_critic, Tensor(-self.grad_clip_norm), Tensor(self.grad_clip_norm)) + self.optimizer[key]['critic'](grads_critic) - learning_rate_actor = self.scheduler['actor'](self.iterations).asnumpy() - learning_rate_critic = self.scheduler['critic'](self.iterations).asnumpy() + # update actor + (loss_a, log_pi_eval_i, policy_q), grads_actor = self.grad_fn_actor[key](batch_size, obs, obs_joint, IDs, + mask_values, key) + if self.use_grad_clip: + grads_actor = clip_grads(grads_actor, Tensor(-self.grad_clip_norm), Tensor(self.grad_clip_norm)) + self.optimizer[key]['actor'](grads_actor) - info = { - "learning_rate_actor": learning_rate_actor, - 
"loss_actor": loss_a.asnumpy(), - "learning_rate_critic": learning_rate_critic, - "loss_critic": loss_c.asnumpy() - } + # automatic entropy tuning + if self.use_automatic_entropy_tuning: + (alpha_loss, _), grads_alpha = self.grad_fn_alpha[key](log_pi_eval_i, key) + self.alpha_optimizer[key](grads_alpha) + self.alpha[key] = ops.exp(self.log_alpha[key]) + else: + alpha_loss = 0 + learning_rate_actor = self.scheduler[key]['actor'].get_last_lr()[0] + learning_rate_critic = self.scheduler[key]['critic'].get_last_lr()[0] + + info.update({ + f"{key}/learning_rate_actor": learning_rate_actor.asnumpy(), + f"{key}/learning_rate_critic": learning_rate_critic.asnumpy(), + f"{key}/loss_actor": loss_a.asnumpy(), + f"{key}/loss_critic": loss_c.asnumpy(), + f"{key}/predictQ": policy_q.mean().asnumpy(), + f"{key}/alpha_loss": alpha_loss.asnumpy(), + f"{key}/alpha": self.alpha[key].asnumpy(), + }) + + self.policy.soft_update(self.tau) return info diff --git a/xuance/mindspore/learners/multi_agent_rl/matd3_learner.py b/xuance/mindspore/learners/multi_agent_rl/matd3_learner.py index 9b2bfdb0..ffdf3360 100644 --- a/xuance/mindspore/learners/multi_agent_rl/matd3_learner.py +++ b/xuance/mindspore/learners/multi_agent_rl/matd3_learner.py @@ -1,124 +1,150 @@ """ Multi-Agent TD3 - """ -from xuance.mindspore import ms, Module, Tensor, optim +from mindspore.nn import MSELoss +from xuance.mindspore import ms, Module, Tensor, optim, ops from xuance.mindspore.learners import LearnerMAS +from xuance.mindspore.utils import clip_grads from xuance.common import List from argparse import Namespace +from operator import itemgetter class MATD3_Learner(LearnerMAS): - class ActorNetWithLossCell(Module): - def __init__(self, backbone, n_agents): - super(MATD3_Learner.ActorNetWithLossCell, self).__init__() - self._backbone = backbone - self._mean = ms.ops.ReduceMean(keep_dims=True) - self.n_agents = n_agents - - def construct(self, bs, o, ids, agt_mask): - _, actions_eval = self._backbone(o, ids) - actions_n_eval = ms.ops.broadcast_to(actions_eval.view(bs, 1, -1), (-1, self.n_agents, -1)) - _, policy_q = self._backbone.Qpolicy(o, actions_n_eval, ids) - loss_a = -policy_q.mean() - return loss_a - - class CriticNetWithLossCell_A(Module): - def __init__(self, backbone): - super(MATD3_Learner.CriticNetWithLossCell_A, self).__init__() - self._backbone = backbone - self._loss = nn.MSELoss() - - def construct(self, o, acts, ids, agt_mask, tar_q): - _, q_eval = self._backbone.Qaction_A(o, acts, ids) - td_error = (q_eval - tar_q) * agt_mask - loss_c = (td_error ** 2).sum() / agt_mask.sum() - return loss_c - - class CriticNetWithLossCell_B(Module): - def __init__(self, backbone): - super(MATD3_Learner.CriticNetWithLossCell_B, self).__init__() - self._backbone = backbone - self._loss = nn.MSELoss() - - def construct(self, o, acts, ids, agt_mask, tar_q): - _, q_eval = self._backbone.Qaction_B(o, acts, ids) - td_error = (q_eval - tar_q) * agt_mask - loss_c = (td_error ** 2).sum() / agt_mask.sum() - return loss_c - def __init__(self, config: Namespace, model_keys: List[str], agent_keys: List[str], policy: Module): - self.gamma = gamma - self.tau = config.tau - self.delay = delay - self.sync_frequency = sync_frequency - self.mse_loss = nn.MSELoss() super(MATD3_Learner, self).__init__(config, model_keys, agent_keys, policy) self.optimizer = { - 'actor': optimizer[0], - 'critic_A': optimizer[1], - 'critic_B': optimizer[2] - } + key: { + 'actor': optim.Adam(params=self.policy.parameters_actor[key], lr=self.config.learning_rate_actor, + eps=1e-5), + 
'critic': optim.Adam(params=self.policy.parameters_critic[key], lr=self.config.learning_rate_critic, + eps=1e-5)} + for key in self.model_keys} self.scheduler = { - 'actor': scheduler[0], - 'critic_A': scheduler[1], - 'critic_B': scheduler[2] - } - # define mindspore trainers - self.actor_loss_net = self.ActorNetWithLossCell(policy, self.n_agents) - self.actor_train = nn.TrainOneStepCell(self.actor_loss_net, self.optimizer['actor']) - self.actor_train.set_train() - self.critic_loss_net_A = self.CriticNetWithLossCell_A(policy) - self.critic_train_A = nn.TrainOneStepCell(self.critic_loss_net_A, self.optimizer['critic_A']) - self.critic_train_A.set_train() - self.critic_loss_net_B = self.CriticNetWithLossCell_B(policy) - self.critic_train_B = nn.TrainOneStepCell(self.critic_loss_net_B, self.optimizer['critic_B']) - self.critic_train_B.set_train() + key: {'actor': optim.lr_scheduler.LinearLR(self.optimizer[key]['actor'], start_factor=1.0, + end_factor=0.5, total_iters=self.config.running_steps), + 'critic': optim.lr_scheduler.LinearLR(self.optimizer[key]['critic'], start_factor=1.0, + end_factor=0.5, total_iters=self.config.running_steps)} + for key in self.model_keys} + self.gamma = config.gamma + self.tau = config.tau + self.mse_loss = MSELoss() + self.actor_update_delay = config.actor_update_delay + # Get gradient function + self.grad_fn_actor = {key: ms.value_and_grad(self.forward_fn_actor, None, + self.optimizer[key]['actor'].parameters, has_aux=True) + for key in self.model_keys} + self.grad_fn_critic = {key: ms.value_and_grad(self.forward_fn_critic, None, + self.optimizer[key]['critic'].parameters, has_aux=True) + for key in self.model_keys} + self.policy.set_train() + + def forward_fn_actor(self, batch_size, obs, obs_joint, actions, ids, mask_values, agent_key): + _, actions_eval = self.policy(observation=obs, agent_ids=ids) + if self.use_parameter_sharing: + act_eval = actions_eval[agent_key].reshape(batch_size, self.n_agents, -1).reshape(batch_size, -1) + else: + a_joint = {k: actions_eval[k] if k == agent_key else actions[k] for k in self.agent_keys} + act_eval = ops.cat(itemgetter(*self.agent_keys)(a_joint), axis=-1).reshape(batch_size, -1) + _, _, q_policy = self.policy.Qpolicy(joint_observation=obs_joint, joint_actions=act_eval, agent_ids=ids, + agent_key=agent_key) + q_policy_i = q_policy[agent_key].reshape(-1) + loss_a = -(q_policy_i * mask_values).sum() / mask_values.sum() + return loss_a, q_policy_i + + def forward_fn_critic(self, obs_joint, actions_joint, ids, mask_values, q_target, agent_key): + q_eval_A, q_eval_B, _ = self.policy.Qpolicy(joint_observation=obs_joint, joint_actions=actions_joint, + agent_ids=ids) + q_eval_A_i, q_eval_B_i = q_eval_A[agent_key].reshape(-1), q_eval_B[agent_key].reshape(-1) + td_error_A = (q_eval_A_i - ops.stop_gradient(q_target)) * mask_values + td_error_B = (q_eval_B_i - ops.stop_gradient(q_target)) * mask_values + loss_c = ((td_error_A ** 2).sum() + (td_error_B ** 2).sum()) / mask_values.sum() + return loss_c, q_eval_A_i, q_eval_B_i def update(self, sample): self.iterations += 1 - obs = Tensor(sample['obs']) - actions = Tensor(sample['actions']) - obs_next = Tensor(sample['obs_next']) - rewards = Tensor(sample['rewards']) - terminals = Tensor(sample['terminals']).view(-1, self.n_agents, 1) - agent_mask = Tensor(sample['agent_mask']).view(-1, self.n_agents, 1) - batch_size = obs.shape[0] - IDs = ops.broadcast_to(self.expand_dims(self.eye(self.n_agents, self.n_agents, ms.float32), 0), - (batch_size, -1, -1)) - - # train critic - actions_next = 
self.policy.target_actor(obs_next, IDs) - actions_next_n = ms.ops.broadcast_to(actions_next.view(batch_size, 1, -1), (-1, self.n_agents, -1)) - _, target_q = self.policy.Qtarget(obs_next, actions_next_n, IDs) - q_target = rewards + (1 - terminals) * self.args.gamma * target_q - - actions_n = ms.ops.broadcast_to(actions.view(batch_size, 1, -1), (-1, self.n_agents, -1)) - loss_c_A = self.critic_train_A(obs, actions_n, IDs, agent_mask, q_target) - loss_c_B = self.critic_train_B(obs, actions_n, IDs, agent_mask, q_target) - - # actor update - if self.iterations % self.delay == 0: - p_loss = self.actor_train(batch_size, obs, IDs, agent_mask) + info = {} + + # prepare training data + sample_Tensor = self.build_training_data(sample, + use_parameter_sharing=self.use_parameter_sharing, + use_actions_mask=False) + batch_size = sample_Tensor['batch_size'] + obs = sample_Tensor['obs'] + actions = sample_Tensor['actions'] + obs_next = sample_Tensor['obs_next'] + rewards = sample_Tensor['rewards'] + terminals = sample_Tensor['terminals'] + agent_mask = sample_Tensor['agent_mask'] + IDs = sample_Tensor['agent_ids'] + if self.use_parameter_sharing: + key = self.model_keys[0] + bs = batch_size * self.n_agents + obs_joint = obs[key].reshape(batch_size, -1) + next_obs_joint = obs_next[key].reshape(batch_size, -1) + actions_joint = actions[key].reshape(batch_size, -1) + rewards[key] = rewards[key].reshape(batch_size * self.n_agents) + terminals[key] = terminals[key].reshape(batch_size * self.n_agents) + else: + bs = batch_size + obs_joint = ops.cat(itemgetter(*self.agent_keys)(obs), axis=-1).reshape(batch_size, -1) + next_obs_joint = ops.cat(itemgetter(*self.agent_keys)(obs_next), axis=-1).reshape(batch_size, -1) + actions_joint = ops.cat(itemgetter(*self.agent_keys)(actions), axis=-1).reshape(batch_size, -1) + + # get values + _, actions_next = self.policy.Atarget(next_observation=obs_next, agent_ids=IDs) + if self.use_parameter_sharing: + key = self.model_keys[0] + actions_next_joint = actions_next[key].reshape(batch_size, self.n_agents, -1).reshape(batch_size, -1) + else: + actions_next_joint = ops.cat(itemgetter(*self.model_keys)(actions_next), axis=-1).reshape(batch_size, -1) + + q_next = self.policy.Qtarget(joint_observation=next_obs_joint, joint_actions=actions_next_joint, agent_ids=IDs) + + # update critic(s) + for key in self.model_keys: + mask_values = agent_mask[key] + q_next_i = q_next[key].reshape(bs) + q_target = rewards[key] + (1 - terminals[key]) * self.gamma * q_next_i + (loss_c, q_eval_A_i, q_eval_B_i), grads_critic = self.grad_fn_critic[key](obs_joint, actions_joint, IDs, + mask_values, q_target, key) + if self.use_grad_clip: + grads_critic = clip_grads(grads_critic, Tensor(-self.grad_clip_norm), Tensor(self.grad_clip_norm)) + self.optimizer[key]['critic'](grads_critic) + + self.scheduler[key]['critic'].step() + learning_rate_critic = self.scheduler[key]['critic'].get_last_lr()[0] + + info.update({ + f"{key}/learning_rate_critic": learning_rate_critic.asnumpy(), + f"{key}/loss_critic": loss_c.asnumpy(), + f"{key}/predictQ_A": q_eval_A_i.mean().asnumpy(), + f"{key}/predictQ_B": q_eval_B_i.mean().asnumpy() + }) + + # update actor(s) + if self.iterations % self.actor_update_delay == 0: + for key in self.model_keys: + mask_values = agent_mask[key] + # update actor + (loss_a, q_policy_i), grads_actor = self.grad_fn_actor[key](batch_size, obs, obs_joint, actions, + IDs, mask_values, key) + if self.use_grad_clip: + grads_actor = clip_grads(grads_actor, Tensor(-self.grad_clip_norm), 
Tensor(self.grad_clip_norm)) + self.optimizer[key]['actor'](grads_actor) + + self.scheduler[key]['actor'].step() + learning_rate_actor = self.scheduler[key]['actor'].get_last_lr()[0] + + info.update({ + f"{key}/learning_rate_actor": learning_rate_actor.asnumpy(), + f"{key}/loss_actor": loss_a.asnumpy(), + f"{key}/q_policy": q_policy_i.mean().asnumpy(), + }) self.policy.soft_update(self.tau) - learning_rate_actor = self.scheduler['actor'](self.iterations).asnumpy() - learning_rate_critic_A = self.scheduler['critic_A'](self.iterations).asnumpy() - learning_rate_critic_B = self.scheduler['critic_B'](self.iterations).asnumpy() - - info = { - "learning_rate_actor": learning_rate_actor, - "learning_rate_critic_A": learning_rate_critic_A, - "learning_rate_critic_B": learning_rate_critic_B, - "loss_critic_A": loss_c_A.asnumpy(), - "loss_critic_B": loss_c_B.asnumpy() - } - - if self.iterations % self.delay == 0: - info["loss_actor"] = p_loss.asnumpy() - return info diff --git a/xuance/mindspore/policies/__init__.py b/xuance/mindspore/policies/__init__.py index 412746f9..8104073e 100644 --- a/xuance/mindspore/policies/__init__.py +++ b/xuance/mindspore/policies/__init__.py @@ -1,7 +1,7 @@ from .core import BasicQhead from .core import ActorNet from .core import CategoricalActorNet -# from .core import CategoricalActorNet_SAC +from .core import CategoricalActorNet_SAC from .core import GaussianActorNet from .core import CriticNet from .core import GaussianActorNet_SAC @@ -39,8 +39,8 @@ from .categorical_marl import MeanFieldActorCriticPolicy, COMA_Policy from .categorical_marl import MAAC_Policy as Categorical_MAAC_Policy from .categorical_marl import MAAC_Policy_Share as Categorical_MAAC_Policy_Share -# from .gaussian_marl import Basic_ISAC_Policy as Gaussian_ISAC -# from .gaussian_marl import MASAC_Policy as Gaussian_MASAC +from .gaussian_marl import Basic_ISAC_Policy as Gaussian_ISAC +from .gaussian_marl import MASAC_Policy as Gaussian_MASAC from .gaussian_marl import MAAC_Policy as Gaussain_MAAC Mixer = { @@ -86,22 +86,22 @@ "MF_Q_network": MFQnetwork, "Categorical_MFAC_Policy": MeanFieldActorCriticPolicy, "Gaussian_MAAC_Policy": Gaussain_MAAC, - # "Gaussian_ISAC_Policy": Gaussian_ISAC, - # "Gaussian_MASAC_Policy": Gaussian_MASAC, + "Gaussian_ISAC_Policy": Gaussian_ISAC, + "Gaussian_MASAC_Policy": Gaussian_MASAC, "MATD3_Policy": MATD3_Policy } -# __all__ = [ -# "REGISTRY_Policy", "Mixer", -# "ActorNet", "CategoricalActorNet", "CategoricalActorNet_SAC", "GaussianActorNet", "GaussianActorNet_SAC", -# "BasicQhead", "CriticNet", "GaussianActorNet_SAC", -# "VDN_mixer", "QMIX_mixer", "QMIX_FF_mixer", "QTRAN_base", "QTRAN_alt", -# "Categorical_AC_Policy", "Categorical_Actor_Policy", "Categorical_PPG_Policy", "Categorical_SAC_Policy", -# "Gaussian_AC_Policy", "Gaussian_Actor_Policy", "Gaussian_PPG_Policy", "Gaussian_SAC_Policy", -# "BasicQnetwork", "DuelQnetwork", "NoisyQnetwork", "C51Qnetwork", "QRDQN_Network", "DDPGPolicy", "TD3Policy", -# "PDQNPolicy", "MPDQNPolicy", "SPDQNPolicy", "DRQNPolicy", -# "BasicQnetwork_marl", "MFQnetwork", "MixingQnetwork", "Weighted_MixingQnetwork", "Qtran_MixingQnetwork", -# "DCG_policy", "Independent_DDPG_Policy", "MADDPG_Policy", "MATD3_Policy", -# "MeanFieldActorCriticPolicy", "COMA_Policy", "Categorical_MAAC_Policy", "Categorical_MAAC_Policy_Share", -# "Gaussian_ISAC", "Gaussian_MASAC", "Gaussain_MAAC", -# ] +__all__ = [ + "REGISTRY_Policy", "Mixer", + "ActorNet", "CategoricalActorNet", "CategoricalActorNet_SAC", "GaussianActorNet", "GaussianActorNet_SAC", + 
"BasicQhead", "CriticNet", "GaussianActorNet_SAC", + "VDN_mixer", "QMIX_mixer", "QMIX_FF_mixer", "QTRAN_base", "QTRAN_alt", + "Categorical_AC_Policy", "Categorical_Actor_Policy", "Categorical_PPG_Policy", "Categorical_SAC_Policy", + "Gaussian_AC_Policy", "Gaussian_Actor_Policy", "Gaussian_PPG_Policy", "Gaussian_SAC_Policy", + "BasicQnetwork", "DuelQnetwork", "NoisyQnetwork", "C51Qnetwork", "QRDQN_Network", "DDPGPolicy", "TD3Policy", + "PDQNPolicy", "MPDQNPolicy", "SPDQNPolicy", "DRQNPolicy", + "BasicQnetwork_marl", "MFQnetwork", "MixingQnetwork", "Weighted_MixingQnetwork", "Qtran_MixingQnetwork", + "DCG_policy", "Independent_DDPG_Policy", "MADDPG_Policy", "MATD3_Policy", + "MeanFieldActorCriticPolicy", "COMA_Policy", "Categorical_MAAC_Policy", "Categorical_MAAC_Policy_Share", + "Gaussian_ISAC", "Gaussian_MASAC", "Gaussain_MAAC", +] diff --git a/xuance/mindspore/policies/deterministic_marl.py b/xuance/mindspore/policies/deterministic_marl.py index 1177bcfb..adcccb1b 100644 --- a/xuance/mindspore/policies/deterministic_marl.py +++ b/xuance/mindspore/policies/deterministic_marl.py @@ -984,6 +984,10 @@ def __init__(self, self.target_actor[key] = deepcopy(self.actor[key]) self.target_critic_A[key] = deepcopy(self.critic_A[key]) self.target_critic_B[key] = deepcopy(self.critic_B[key]) + # Update parameters name + self.actor[key].update_parameters_name(key + '_actor_') + self.critic_A[key].update_parameters_name(key + '_critic_A_') + self.critic_B[key].update_parameters_name(key + '_critic_B_') @property def parameters_critic(self): diff --git a/xuance/mindspore/policies/gaussian_marl.py b/xuance/mindspore/policies/gaussian_marl.py index 2699fea3..36795bef 100644 --- a/xuance/mindspore/policies/gaussian_marl.py +++ b/xuance/mindspore/policies/gaussian_marl.py @@ -184,6 +184,10 @@ def __init__(self, normalize, initialize, activation, activation_action) self.critic_1[key] = CriticNet(dim_critic_in, critic_hidden_size, normalize, initialize, activation) self.critic_2[key] = CriticNet(dim_critic_in, critic_hidden_size, normalize, initialize, activation) + # Update parameters name + self.actor[key].update_parameters_name(key + '_actor_') + self.critic_1[key].update_parameters_name(key + '_critic_1_') + self.critic_2[key].update_parameters_name(key + '_critic_2_') self.target_critic_1 = deepcopy(self.critic_1) self.target_critic_2 = deepcopy(self.critic_2) @@ -191,17 +195,18 @@ def __init__(self, def parameters_actor(self): parameters_actor = {} for key in self.model_keys: - parameters_actor[key] = list(self.actor_representation[key].parameters()) + list( - self.actor[key].parameters()) + parameters_actor[key] = list(self.actor_representation[key].trainable_params()) + \ + list(self.actor[key].trainable_params()) return parameters_actor @property def parameters_critic(self): parameters_critic = {} for key in self.model_keys: - parameters_critic[key] = list(self.critic_1_representation[key].parameters()) + list( - self.critic_1[key].parameters()) + list(self.critic_2_representation[key].parameters()) + list( - self.critic_2[key].parameters()) + parameters_critic[key] = list(self.critic_1_representation[key].trainable_params()) + \ + list(self.critic_1[key].trainable_params()) + \ + list(self.critic_2_representation[key].trainable_params()) + \ + list(self.critic_2[key].trainable_params()) return parameters_critic def _get_actor_critic_input(self, dim_actor_rep, dim_action, dim_critic_rep, n_agents): diff --git a/xuance/torch/learners/multi_agent_rl/matd3_learner.py 
b/xuance/torch/learners/multi_agent_rl/matd3_learner.py index 4d0b9763..f7192ee3 100644 --- a/xuance/torch/learners/multi_agent_rl/matd3_learner.py +++ b/xuance/torch/learners/multi_agent_rl/matd3_learner.py @@ -127,8 +127,8 @@ def update(self, sample): f"{key}/loss_actor": loss_a.item(), f"{key}/q_policy": q_policy_i.mean().item(), }) + self.policy.soft_update(self.tau) - self.policy.soft_update(self.tau) return info def update_rnn(self, sample): @@ -239,6 +239,6 @@ def update_rnn(self, sample): f"{key}/loss_actor": loss_a.item(), f"{key}/q_policy": q_policy_i.mean().item(), }) + self.policy.soft_update(self.tau) - self.policy.soft_update(self.tau) return info
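
Note (not part of the patch): the rewritten ISAC/MASAC learners above compute a SAC-style TD target in `update` and a masked twin-critic loss in `forward_fn_critic`. The following is a minimal, self-contained NumPy sketch of that computation under stated assumptions — variable names (`q1_next`, `q2_next`, `log_pi_next`, `alpha`, `mask`) are illustrative only, and it assumes `policy.Qtarget` reduces the twin target critics (e.g. by a min), which the patch delegates to the policy implementation rather than showing here.

    import numpy as np

    def sac_backup(rewards, terminals, q1_next, q2_next, log_pi_next, alpha, gamma=0.99):
        # TD target: r + gamma * (1 - done) * (min(Q1', Q2') - alpha * log pi(a'|o'))
        target_value = np.minimum(q1_next, q2_next) - alpha * log_pi_next
        return rewards + (1.0 - terminals) * gamma * target_value

    def masked_twin_td_loss(q1, q2, backup, mask):
        # Masked sum of squared TD errors for both critics, normalized by the
        # number of active agent entries (mirrors the loss in forward_fn_critic).
        td1 = (q1 - backup) * mask
        td2 = (q2 - backup) * mask
        return ((td1 ** 2).sum() + (td2 ** 2).sum()) / mask.sum()

    # Toy usage with a flattened batch of 4 agent entries; the last entry is a
    # padded (masked-out) agent, as agent_mask does in the learners above.
    r = np.array([1.0, 0.5, 0.0, 2.0])
    d = np.array([0.0, 0.0, 1.0, 0.0])
    m = np.array([1.0, 1.0, 1.0, 0.0])
    backup = sac_backup(r, d,
                        q1_next=np.array([3.0, 2.0, 1.0, 0.0]),
                        q2_next=np.array([2.5, 2.2, 0.8, 0.1]),
                        log_pi_next=np.array([-1.0, -0.5, -0.2, -0.3]),
                        alpha=0.2)
    loss = masked_twin_td_loss(np.array([2.0, 1.0, 0.5, 0.0]),
                               np.array([1.8, 1.1, 0.4, 0.2]), backup, m)

In the MindSpore learners the same stop-gradient on the backup is expressed with `ops.stop_gradient`, and the mask normalization is what keeps padded agents from contributing to the critic update.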