Merge branch 'master' of https://github.com/agi-brain/xuance
Ykizi committed Nov 30, 2023
2 parents 0d123be + 381c9d8 commit 0a0947e
Showing 2 changed files with 266 additions and 21 deletions.
144 changes: 133 additions & 11 deletions docs/source/documents/api/learners/marl/isac.rst
@@ -1,20 +1,50 @@
ISAC_Learner
=====================================

The learner of the Independent Soft Actor-Critic (ISAC) algorithm, which trains each agent's stochastic actor and critic with entropy-regularized targets.

.. raw:: html

<br><hr>

**PyTorch:**

.. py:class:: xuance.torch.learners.multi_agent_rl.isac_learner.ISAC_Learner(config, policy, optimizer, scheduler, device, model_dir, gamma, sync_frequency)

    :param config: the configurations that contain the hyper-parameters of the learner.
    :type config: Namespace
    :param policy: the policy that provides the actors, the critics, and their target networks.
    :type policy: nn.Module
    :param optimizer: the optimizers of the actor and the critic, in that order.
    :type optimizer: Sequence[torch.optim.Optimizer]
    :param scheduler: the learning-rate schedulers of the actor and the critic, in that order.
    :type scheduler: Sequence[torch.optim.lr_scheduler._LRScheduler]
    :param device: the device on which the tensors are calculated, e.g. "cpu" or "cuda:0".
    :type device: int, str, or torch.device
    :param model_dir: the directory in which the model parameters are saved.
    :type model_dir: str
    :param gamma: the discount factor.
    :type gamma: float
    :param sync_frequency: the frequency (in training steps) with which the target networks are synchronized.
    :type sync_frequency: int

.. py:function:: xuance.torch.learners.multi_agent_rl.isac_learner.ISAC_Learner.update(sample)

    Update the parameters of the actor and the critic with a batch of experience sampled from the replay buffer.

    :param sample: a batch of experience containing the observations, actions, next observations, rewards, terminal flags, and agent masks.
    :type sample: dict
    :return: the training information, including the learning rates, the actor and critic losses, and the mean of the predicted Q-values.
    :rtype: dict
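
    A minimal sketch of calling ``update`` is shown below. The dictionary keys follow the PyTorch source
    code of this learner; the batch size, number of agents, and observation/action dimensions are purely
    illustrative, and ``learner`` is assumed to be an already constructed ``ISAC_Learner`` instance
    (its construction is omitted here).

    .. code-block:: python

        import numpy as np

        batch_size, n_agents, obs_dim, act_dim = 256, 3, 18, 5

        # one batch sampled from the replay buffer (shapes are illustrative)
        sample = {
            'obs': np.zeros((batch_size, n_agents, obs_dim), dtype=np.float32),
            'actions': np.zeros((batch_size, n_agents, act_dim), dtype=np.float32),
            'obs_next': np.zeros((batch_size, n_agents, obs_dim), dtype=np.float32),
            'rewards': np.zeros((batch_size, n_agents, 1), dtype=np.float32),
            'terminals': np.zeros((batch_size, n_agents, 1), dtype=np.float32),
            'agent_mask': np.ones((batch_size, n_agents, 1), dtype=np.float32),
        }

        # info = learner.update(sample)  # returns learning rates, actor/critic losses, and the mean predicted Q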

.. raw:: html

<br><hr>

**TensorFlow:**


.. raw:: html

<br><hr>
@@ -29,20 +59,112 @@ Source Code
-----------------

.. tabs::

    .. group-tab:: PyTorch

        .. code-block:: python

            """
            Independent Soft Actor-critic (ISAC)
            Implementation: Pytorch
            """
            from xuance.torch.learners import *


            class ISAC_Learner(LearnerMAS):
                def __init__(self,
                             config: Namespace,
                             policy: nn.Module,
                             optimizer: Sequence[torch.optim.Optimizer],
                             scheduler: Sequence[torch.optim.lr_scheduler._LRScheduler] = None,
                             device: Optional[Union[int, str, torch.device]] = None,
                             model_dir: str = "./",
                             gamma: float = 0.99,
                             sync_frequency: int = 100
                             ):
                    self.gamma = gamma
                    self.tau = config.tau
                    self.alpha = config.alpha
                    self.sync_frequency = sync_frequency
                    self.mse_loss = nn.MSELoss()
                    super(ISAC_Learner, self).__init__(config, policy, optimizer, scheduler, device, model_dir)
                    self.optimizer = {
                        'actor': optimizer[0],
                        'critic': optimizer[1]
                    }
                    self.scheduler = {
                        'actor': scheduler[0],
                        'critic': scheduler[1]
                    }

                def update(self, sample):
                    self.iterations += 1
                    # build tensors from the sampled batch
                    obs = torch.Tensor(sample['obs']).to(self.device)
                    actions = torch.Tensor(sample['actions']).to(self.device)
                    obs_next = torch.Tensor(sample['obs_next']).to(self.device)
                    rewards = torch.Tensor(sample['rewards']).to(self.device)
                    terminals = torch.Tensor(sample['terminals']).float().reshape(-1, self.n_agents, 1).to(self.device)
                    agent_mask = torch.Tensor(sample['agent_mask']).float().reshape(-1, self.n_agents, 1).to(self.device)
                    IDs = torch.eye(self.n_agents).unsqueeze(0).expand(self.args.batch_size, -1, -1).to(self.device)

                    # compute the entropy-regularized target with the target actor and target critic
                    q_eval = self.policy.critic(obs, actions, IDs)
                    actions_next_dist = self.policy.target_actor(obs_next, IDs)
                    actions_next = actions_next_dist.rsample()
                    log_pi_a_next = actions_next_dist.log_prob(actions_next)
                    q_next = self.policy.target_critic(obs_next, actions_next, IDs)
                    q_target = rewards + (1-terminals) * self.args.gamma * (q_next - self.alpha * log_pi_a_next.unsqueeze(dim=-1))

                    # calculate the actor loss with reparameterized actions
                    _, actions_dist = self.policy(obs, IDs)
                    actions_eval = actions_dist.rsample()
                    log_pi_a = actions_dist.log_prob(actions_eval)
                    loss_a = -(self.policy.critic(obs, actions_eval, IDs) - self.alpha * log_pi_a.unsqueeze(dim=-1) * agent_mask).sum() / agent_mask.sum()
                    # loss_a = (- self.policy.critic(obs, actions_eval, IDs)) * agent_mask.sum() / agent_mask.sum()
                    self.optimizer['actor'].zero_grad()
                    loss_a.backward()
                    torch.nn.utils.clip_grad_norm_(self.policy.parameters_actor, self.args.grad_clip_norm)
                    self.optimizer['actor'].step()
                    if self.scheduler['actor'] is not None:
                        self.scheduler['actor'].step()

                    # train the critic with the masked TD error
                    td_error = (q_eval - q_target.detach()) * agent_mask
                    loss_c = (td_error ** 2).sum() / agent_mask.sum()
                    self.optimizer['critic'].zero_grad()
                    loss_c.backward()
                    torch.nn.utils.clip_grad_norm_(self.policy.parameters_critic, self.args.grad_clip_norm)
                    self.optimizer['critic'].step()
                    if self.scheduler['critic'] is not None:
                        self.scheduler['critic'].step()

                    # softly update the target networks
                    self.policy.soft_update(self.tau)

                    lr_a = self.optimizer['actor'].state_dict()['param_groups'][0]['lr']
                    lr_c = self.optimizer['critic'].state_dict()['param_groups'][0]['lr']

                    info = {
                        "learning_rate_actor": lr_a,
                        "learning_rate_critic": lr_c,
                        "loss_actor": loss_a.item(),
                        "loss_critic": loss_c.item(),
                        "predictQ": q_eval.mean().item()
                    }

                    return info
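
        For reference, the critic target computed in ``update`` above is the entropy-regularized soft target

        .. math::

            y = r + \gamma \, (1 - d) \left[ Q_{\bar{\phi}}(o', a') - \alpha \log \pi_{\theta'}(a' \mid o') \right],
            \qquad a' \sim \pi_{\theta'}(\cdot \mid o'),

        where :math:`Q_{\bar{\phi}}` and :math:`\pi_{\theta'}` are the target critic and target actor, while the
        actor loss is the negative of :math:`Q_{\phi}(o, \tilde{a}) - \alpha \log \pi_{\theta}(\tilde{a} \mid o)`
        with :math:`\tilde{a}` obtained via the reparameterization trick, masked and averaged over the valid agents.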







    .. group-tab:: TensorFlow

        .. code-block:: python

    .. group-tab:: MindSpore

        .. code-block:: python
143 changes: 133 additions & 10 deletions docs/source/documents/api/learners/marl/maddpg.rst
@@ -1,20 +1,50 @@
MADDPG_Learner
=====================================

The learner of the Multi-Agent Deep Deterministic Policy Gradient (MADDPG) algorithm, which trains the agents' deterministic actors and their critics with target networks.

.. raw:: html

<br><hr>

**PyTorch:**

.. py:class:: xuance.torch.learners.multi_agent_rl.maddpg_learner.MADDPG_Learner(config, policy, optimizer, scheduler, device, model_dir, gamma, sync_frequency)

    :param config: the configurations that contain the hyper-parameters of the learner.
    :type config: Namespace
    :param policy: the policy that provides the actors, the critics, and their target networks.
    :type policy: nn.Module
    :param optimizer: the optimizers of the actor and the critic, in that order.
    :type optimizer: Sequence[torch.optim.Optimizer]
    :param scheduler: the learning-rate schedulers of the actor and the critic, in that order.
    :type scheduler: Sequence[torch.optim.lr_scheduler._LRScheduler]
    :param device: the device on which the tensors are calculated, e.g. "cpu" or "cuda:0".
    :type device: int, str, or torch.device
    :param model_dir: the directory in which the model parameters are saved.
    :type model_dir: str
    :param gamma: the discount factor.
    :type gamma: float
    :param sync_frequency: the frequency (in training steps) with which the target networks are synchronized.
    :type sync_frequency: int

.. py:function:: xuance.torch.learners.multi_agent_rl.maddpg_learner.MADDPG_Learner.update(sample)

    Update the parameters of the actor and the critic with a batch of experience sampled from the replay buffer.

    :param sample: a batch of experience containing the observations, actions, next observations, rewards, terminal flags, and agent masks.
    :type sample: dict
    :return: the training information, including the learning rates, the actor and critic losses, and the mean of the predicted Q-values.
    :rtype: dict
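
    For reference, the critic in the source code below is trained towards the deterministic target

    .. math::

        y = r + \gamma \, (1 - d) \, Q_{\bar{\phi}}\big(o', \mu_{\theta'}(o')\big),

    where :math:`Q_{\bar{\phi}}` and :math:`\mu_{\theta'}` denote the target critic and target actor, while the
    actor is updated by maximizing :math:`Q_{\phi}\big(o, \mu_{\theta}(o)\big)` averaged over the valid agents.
    Unlike ISAC, no entropy term appears in either objective.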

.. raw:: html

<br><hr>

**TensorFlow:**


.. raw:: html

<br><hr>
@@ -30,21 +60,114 @@ Source Code

.. tabs::

    .. group-tab:: PyTorch

        .. code-block:: python

            """
            Multi-Agent Deep Deterministic Policy Gradient
            Paper link:
            https://proceedings.neurips.cc/paper/2017/file/68a9750337a418a86fe06c1991a1d64c-Paper.pdf
            Implementation: Pytorch
            Trick: Parameter sharing for all agents, with agents' one-hot IDs as actor-critic's inputs.
            """
            from xuance.torch.learners import *


            class MADDPG_Learner(LearnerMAS):
                def __init__(self,
                             config: Namespace,
                             policy: nn.Module,
                             optimizer: Sequence[torch.optim.Optimizer],
                             scheduler: Sequence[torch.optim.lr_scheduler._LRScheduler] = None,
                             device: Optional[Union[int, str, torch.device]] = None,
                             model_dir: str = "./",
                             gamma: float = 0.99,
                             sync_frequency: int = 100
                             ):
                    self.gamma = gamma
                    self.tau = config.tau
                    self.sync_frequency = sync_frequency
                    self.mse_loss = nn.MSELoss()
                    super(MADDPG_Learner, self).__init__(config, policy, optimizer, scheduler, device, model_dir)
                    self.optimizer = {
                        'actor': optimizer[0],
                        'critic': optimizer[1]
                    }
                    self.scheduler = {
                        'actor': scheduler[0],
                        'critic': scheduler[1]
                    }

                def update(self, sample):
                    self.iterations += 1
                    # build tensors from the sampled batch
                    obs = torch.Tensor(sample['obs']).to(self.device)
                    actions = torch.Tensor(sample['actions']).to(self.device)
                    obs_next = torch.Tensor(sample['obs_next']).to(self.device)
                    rewards = torch.Tensor(sample['rewards']).to(self.device)
                    terminals = torch.Tensor(sample['terminals']).float().reshape(-1, self.n_agents, 1).to(self.device)
                    agent_mask = torch.Tensor(sample['agent_mask']).float().reshape(-1, self.n_agents, 1).to(self.device)
                    IDs = torch.eye(self.n_agents).unsqueeze(0).expand(self.args.batch_size, -1, -1).to(self.device)

                    # train actor
                    _, actions_eval = self.policy(obs, IDs)
                    loss_a = -(self.policy.critic(obs, actions_eval, IDs) * agent_mask).sum() / agent_mask.sum()
                    self.optimizer['actor'].zero_grad()
                    loss_a.backward()
                    if self.args.use_grad_clip:
                        torch.nn.utils.clip_grad_norm_(self.policy.parameters_actor, self.args.grad_clip_norm)
                    self.optimizer['actor'].step()
                    if self.scheduler['actor'] is not None:
                        self.scheduler['actor'].step()

                    # train critic
                    actions_next = self.policy.target_actor(obs_next, IDs)
                    q_eval = self.policy.critic(obs, actions, IDs)
                    q_next = self.policy.target_critic(obs_next, actions_next, IDs)
                    q_target = rewards + (1 - terminals) * self.args.gamma * q_next
                    td_error = (q_eval - q_target.detach()) * agent_mask
                    loss_c = (td_error ** 2).sum() / agent_mask.sum()
                    self.optimizer['critic'].zero_grad()
                    loss_c.backward()
                    if self.args.use_grad_clip:
                        torch.nn.utils.clip_grad_norm_(self.policy.parameters_critic, self.args.grad_clip_norm)
                    self.optimizer['critic'].step()
                    if self.scheduler['critic'] is not None:
                        self.scheduler['critic'].step()

                    # softly update the target networks
                    self.policy.soft_update(self.tau)

                    lr_a = self.optimizer['actor'].state_dict()['param_groups'][0]['lr']
                    lr_c = self.optimizer['critic'].state_dict()['param_groups'][0]['lr']

                    info = {
                        "learning_rate_actor": lr_a,
                        "learning_rate_critic": lr_c,
                        "loss_actor": loss_a.item(),
                        "loss_critic": loss_c.item(),
                        "predictQ": q_eval.mean().item()
                    }

                    return info
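
        The call ``self.policy.soft_update(self.tau)`` above updates the target networks; in the standard
        Polyak-averaging formulation (the implementation is not shown in this file), the target parameters
        :math:`\theta'` are moved towards the online parameters :math:`\theta` as

        .. math::

            \theta' \leftarrow \tau \, \theta + (1 - \tau) \, \theta'.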











    .. group-tab:: TensorFlow

        .. code-block:: python

    .. group-tab:: MindSpore

        .. code-block:: python
