Thank you for providing the code. I ran into some difficulties while trying to reproduce the results in the sd repository.
Here is the error message I encountered:
[rank0]: Traceback (most recent call last):
[rank0]: File "main_forget.py", line 749, in
[rank0]: trainer.fit(model, data)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 553, in fit
[rank0]: self._run(model)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 918, in _run
[rank0]: self._dispatch()
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 986, in _dispatch
[rank0]: self.accelerator.start_training(self)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 92, in start_training
[rank0]: self.training_type_plugin.start_training(trainer)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 161, in start_training
[rank0]: self._results = trainer.run_stage()
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 996, in run_stage
[rank0]: return self._run_train()
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1058, in _run_train
[rank0]: self.training_type_plugin.reconciliate_processes(traceback.format_exc())
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp.py", line 453, in reconciliate_processes
[rank0]: raise DeadlockDetectedException(f"DeadLock detected from rank: {self.global_rank} \n {trace}")
[rank0]: pytorch_lightning.utilities.exceptions.DeadlockDetectedException: DeadLock detected from rank: 0
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1045, in _run_train
[rank0]: self.fit_loop.run()
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
[rank0]: self.advance(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 200, in advance
[rank0]: epoch_output = self.epoch_loop.run(train_dataloader)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
[rank0]: self.advance(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 130, in advance
[rank0]: batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 101, in run
[rank0]: super().run(batch, batch_idx, dataloader_idx)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 111, in run
[rank0]: self.advance(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 148, in advance
[rank0]: result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 202, in _run_optimization
[rank0]: self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 396, in _optimizer_step
[rank0]: model_ref.optimizer_step(
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/core/lightning.py", line 1618, in optimizer_step
[rank0]: optimizer.step(closure=optimizer_closure)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 209, in step
[rank0]: self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/core/optimizer.py", line 129, in __optimizer_step
[rank0]: trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 296, in optimizer_step
[rank0]: self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 303, in run_optimizer_step
[rank0]: self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 226, in optimizer_step
[rank0]: optimizer.step(closure=lambda_closure, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/optim/lr_scheduler.py", line 75, in wrapper
[rank0]: return wrapped(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/optim/optimizer.py", line 391, in wrapper
[rank0]: out = func(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/optim/optimizer.py", line 76, in _use_grad
[rank0]: ret = func(self, *args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/optim/adamw.py", line 165, in step
[rank0]: loss = closure()
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 236, in _training_step_and_backward_closure
[rank0]: result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 537, in training_step_and_backward
[rank0]: result = self._training_step(split_batch, batch_idx, opt_idx, hiddens)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 307, in _training_step
[rank0]: training_step_output = self.trainer.accelerator.training_step(step_kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 193, in training_step
[rank0]: return self.training_type_plugin.training_step(*step_kwargs.values())
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/ddp.py", line 383, in training_step
[rank0]: return self.model(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1593, in forward
[rank0]: else self._run_ddp_forward(*inputs, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1411, in _run_ddp_forward
[rank0]: return self.module(*inputs, **kwargs) # type: ignore[index]
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/pytorch_lightning/overrides/base.py", line 82, in forward
[rank0]: output = self.module.training_step(*inputs, **kwargs)
[rank0]: File "/data/xunaen/lwj/selective-amnesia/sd/ldm/models/diffusion/ddpm_forget.py", line 345, in training_step
[rank0]: loss_corrupt, loss_dict_corrupt = self.shared_step(batch[0])
[rank0]: File "/data/xunaen/lwj/selective-amnesia/sd/ldm/models/diffusion/ddpm_forget.py", line 911, in shared_step
[rank0]: loss = self(x, c)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank0]: return self._call_impl(*args, **kwargs)
[rank0]: File "/home/xunaen/anaconda3/envs/sa-sd/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
[rank0]: return forward_call(*args, **kwargs)
[rank0]: File "/data/xunaen/lwj/selective-amnesia/sd/ldm/models/diffusion/ddpm_forget.py", line 923, in forward
[rank0]: return self.p_losses(x, c, t, *args, **kwargs)
[rank0]: File "/data/xunaen/lwj/selective-amnesia/sd/ldm/models/diffusion/ddpm_forget.py", line 1074, in p_losses
[rank0]: logvar_t = self.logvar[t].to(self.device)
[rank0]: RuntimeError: indices should be either on cpu or on the same device as the indexed tensor (cpu)
I've tried updating the library versions, but the error persists. Can you provide some help?
Hi, thanks for the interest in our work. The error suggests that the tensors are not on the same device, specifically self.logvar and the index tensor t. Can you print out their devices and ensure they are on the same device? I didn't face such an issue on my end.
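If the devices do turn out to differ, note that recent PyTorch releases refuse to index a CPU tensor with CUDA indices, which would also explain why updating the libraries alone didn't resolve it. Below is a minimal sketch of the mismatch and two possible one-line fixes, under the assumption that self.logvar is kept on the CPU while t is sampled on the GPU, as the traceback suggests (not verified against this repo):

```python
import torch

# Step 1 (diagnosis): inside p_losses, print both devices, e.g.
#   print(self.logvar.device, t.device)
# The traceback implies self.logvar is on the CPU while t is on CUDA.

# Standalone reproduction of the mismatch (assumes a CUDA device is available):
logvar = torch.zeros(1000)                       # CPU tensor, stand-in for self.logvar
t = torch.randint(0, 1000, (4,), device="cuda")  # CUDA indices, stand-in for the timesteps

# logvar[t]  # RuntimeError: indices should be either on cpu or on the same
#            # device as the indexed tensor (cpu)

# Fix A: move the indices to the CPU before indexing, then move the result back.
logvar_t = logvar[t.cpu()].to(t.device)

# Fix B: move the lookup table to the indices' device first, then index.
logvar_t = logvar.to(t.device)[t]
```

Applied to the repo, either variant would replace logvar_t = self.logvar[t].to(self.device) in p_losses in ddpm_forget.py; with fix B the trailing .to(self.device) becomes redundant, since the indexed result is already on the model's device.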