Traceback (most recent call last) ──────────────────────╮
│ /home/lk/moss_finetuning-dev/train.py:107 in │
│ │
│ 104 │ │ num_workers=0 │
│ 105 │ ) │
│ 106 │ │
│ ❱ 107 │ trainer.fit(pl_model, train_dataloaders=train_datasets) │
│ 108 │
│ /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/trainer/traine │
│ r.py:531 in fit │
│ 528 │ │ """ │
│ 529 │ │ model = _maybe_unwrap_optimized(model) │
│ 530 │ │ self.strategy._lightning_module = model │
│ ❱ 531 │ │ call._call_and_handle_interrupt( │
│ 532 │ │ │ self, self._fit_impl, model, train_dataloaders, val_datal │
│ 533 │ │ ) │
│ 534 │
│ /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/trainer/call.p │
│ y:41 in _call_and_handle_interrupt │
│ 38 │ """ │
│ 39 │ try: │
│ 40 │ │ if trainer.strategy.launcher is not None: │
│ ❱ 41 │ │ │ return trainer.strategy.launcher.launch(trainer_fn, *args, │
│ 42 │ │ return trainer_fn(*args, **kwargs) │
│ 43 │ │
│ 44 │ except _TunerExitException: │
│ /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/strategies/lau │
│ nchers/subprocess_script.py:91 in launch │
│ 88 │ │ """ │
│ 89 │ │ if not self.cluster_environment.creates_processes_externally: │
│ 90 │ │ │ self._call_children_scripts() │
│ ❱ 91 │ │ return function(*args, **kwargs) │
│ 92 │ │
│ 93 │ def kill(self, signum: _SIGNUM) -> None: │
│ 94 │ │ for proc in self.procs: │
│ /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/trainer/traine │
│ r.py:570 in _fit_impl │
│ 567 │ │ │ model_provided=True, │
│ 568 │ │ │ model_connected=self.lightning_module is not None, │
│ 569 │ │ ) │
│ ❱ 570 │ │ self._run(model, ckpt_path=ckpt_path) │
│ 571 │ │ │
│ 572 │ │ assert self.state.stopped │
│ 573 │ │ self.training = False │
│ /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/trainer/traine │
│ r.py:951 in _run │
│ 948 │ │ self._logger_connector.reset_metrics() │
│ 949 │ │ │
│ 950 │ │ # strategy will configure model and move it to the device │
│ ❱ 951 │ │ self.strategy.setup(self) │
│ 952 │ │ │
│ 953 │ │ # hook │
│ 954 │ │ if self.state.fn == TrainerFn.FITTING: │
│ /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/strategies/dee │
│ pspeed.py:345 in setup │
│ 342 │ │ self.setup_optimizers(trainer) │
│ 343 │ │ self.setup_precision_plugin() │
│ 344 │ │ _optimizers_to_device(self.optimizers, self.root_device) │
│ ❱ 345 │ │ self.init_deepspeed() │
│ 346 │ │ self.barrier() │
│ 347 │ │
│ 348 │ def _init_deepspeed_distributed(self) -> None: │
│ /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/strategies/dee │
│ pspeed.py:449 in init_deepspeed │
│ 446 │ │ model = _LightningModuleWrapperBase(forward_module=self.model) │
│ 447 │ │ │
│ 448 │ │ if self.lightning_module.trainer and self.lightning_module.tra │
│ ❱ 449 │ │ │ self._initialize_deepspeed_train(model) │
│ 450 │ │ else: │
│ 451 │ │ │ self._initialize_deepspeed_inference(model) │
│ 452 │
│ /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/strategies/dee │
│ pspeed.py:485 in _initialize_deepspeed_train │
│ 482 │ │ │ if lr_scheduler is not None: │
│ 483 │ │ │ │ scheduler = lr_scheduler.scheduler │
│ 484 │ │ │
│ ❱ 485 │ │ model, deepspeed_optimizer = self._setup_model_and_optimizer(m │
│ 486 │ │ self._set_deepspeed_activation_checkpointing() │
│ 487 │ │ │
│ 488 │ │ # although we set these here, deepspeed manages the specific o │
│ /home/lk/.local/lib/python3.8/site-packages/lightning/pytorch/strategies/dee │
│ pspeed.py:414 in _setup_model_and_optimizer │
│ 411 │ │ import deepspeed │
│ 412 │ │ │
│ 413 │ │ model_parameters = filter(lambda p: p.requires_grad, model.par │
│ ❱ 414 │ │ deepspeed_engine, deepspeed_optimizer, _, _ = deepspeed.initia │
│ 415 │ │ │ args=argparse.Namespace(device_rank=self.root_device.index │
│ 416 │ │ │ config=self.config, │
│ 417 │ │ │ model=model, │
│ /home/lk/.local/lib/python3.8/site-packages/deepspeed/__init__.py:165 in │
│ initialize │
│ 162 │ │ │ │ │ │ │ │ │ │ config=config, │
│ 163 │ │ │ │ │ │ │ │ │ │ config_class=config_class) │
│ 164 │ │ else: │
│ ❱ 165 │ │ │ engine = DeepSpeedEngine(args=args, │
│ 166 │ │ │ │ │ │ │ │ │ model=model, │
│ 167 │ │ │ │ │ │ │ │ │ optimizer=optimizer, │
│ 168 │ │ │ │ │ │ │ │ │ model_parameters=model_parameters │
│ /home/lk/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py:308 │
│ in __init__ │
│ 305 │ │ │ model_parameters = list(model_parameters) │
│ 306 │ │ │
│ 307 │ │ if has_optimizer: │
│ ❱ 308 │ │ │ self._configure_optimizer(optimizer, model_parameters) │
│ 309 │ │ │ self._configure_lr_scheduler(lr_scheduler) │
│ 310 │ │ │ self._report_progress(0) │
│ 311 │ │ elif self.zero_optimization(): │
│ /home/lk/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py:1173 │
│ in _configure_optimizer │
│ 1170 │ │ optimizer_wrapper = self._do_optimizer_sanity_check(basic_opt │
│ 1171 │ │ │
│ 1172 │ │ if optimizer_wrapper == ZERO_OPTIMIZATION: │
│ ❱ 1173 │ │ │ self.optimizer = self._configure_zero_optimizer(basic_opt │
│ 1174 │ │ elif optimizer_wrapper == AMP: │
│ 1175 │ │ │ amp_params = self.amp_params() │
│ 1176 │ │ │ log_dist(f"Initializing AMP with these params: {amp_param │
│ /home/lk/.local/lib/python3.8/site-packages/deepspeed/runtime/engine.py:1408 │
│ in _configure_zero_optimizer │
│ 1405 │ │ │ │ if overlap_comm: │
│ 1406 │ │ │ │ │ logger.warning("Pipeline parallelism does not sup │
│ 1407 │ │ │ │ │ overlap_comm = False │
│ ❱ 1408 │ │ │ optimizer = DeepSpeedZeroOptimizer( │
│ 1409 │ │ │ │ optimizer, │
│ 1410 │ │ │ │ self.param_names, │
│ 1411 │ │ │ │ timers=timers, │
│ /home/lk/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_a │
│ nd_2.py:313 in __init__ │
│ 310 │ │ │ │
│ 311 │ │ │ # create flat buffer in CPU and move to GPU │
│ 312 │ │ │ self.bit16_groups_flat.append( │
│ ❱ 313 │ │ │ │ self.flatten_dense_tensors_aligned( │
│ 314 │ │ │ │ │ self.round_robin_bit16_groups[i], │
│ 315 │ │ │ │ │ self.nccl_start_alignment_factor * dist.get_world │
│ 316 │ │ │ │ │ │ get_accelerator().current_device_name())) │
│ /home/lk/.local/lib/python3.8/site-packages/deepspeed/runtime/zero/stage_1_a │
│ nd_2.py:830 in flatten_dense_tensors_aligned │
│ 827 │ │
│ 828 │ # create a flat tensor aligned at the alignment boundary │
│ 829 │ def flatten_dense_tensors_aligned(self, tensor_list, alignment): │
│ ❱ 830 │ │ return self.flatten(align_dense_tensors(tensor_list, alignmen │
│ 831 │ │
│ 832 │ ############### Independent Partition Gradient ################## │
│ 833 │ def reduce_independent_p_g_buckets_and_remove_grads(self, param, │
╰──────────────────────────────────────────────────────────────────────────────╯
RuntimeError: torch.cat(): expected a non-empty list of Tensors
Could the author please take a look and help figure out where the problem is?
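For reference, the bottom frame of the traceback (flatten_dense_tensors_aligned in stage_1_and_2.py) flattens one optimizer parameter group, which ends up calling torch.cat() on that group's tensors. The error therefore usually means a parameter group reached DeepSpeed's ZeRO stage 1/2 optimizer with no trainable tensors in it. Below is a minimal sketch of that condition, not code from this repository; the nn.Linear model is a hypothetical stand-in for pl_model.

```python
# Minimal sketch (assumed cause: an empty trainable-parameter set).
# nn.Linear stands in for the real pl_model used in train.py.
import torch
import torch.nn as nn

model = nn.Linear(4, 4)
for p in model.parameters():          # simulate a fully frozen model
    p.requires_grad = False

# Lightning's DeepSpeed strategy filters on requires_grad (see the
# _setup_model_and_optimizer frame above), so this is what ZeRO receives.
trainable = [p for p in model.parameters() if p.requires_grad]
print(f"trainable parameters: {len(trainable)}")   # 0 here

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
for i, group in enumerate(optimizer.param_groups):
    live = [p for p in group["params"] if p.requires_grad]
    if not live:
        # ZeRO would flatten this group via torch.cat() on an empty
        # list and raise "expected a non-empty list of Tensors".
        print(f"param group {i} has no trainable tensors")
```

Running the same check against the real model and optimizer just before trainer.fit() shows whether any group is empty.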
Just switch to a different optimizer and it should work.
I switched between the lion, adam, and adamw_torch optimizers and they all raise the same error... Is there anything in particular I should pay attention to here?
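If the same traceback appears regardless of which optimizer is selected, the empty list is more likely coming from the set of trainable parameters than from the optimizer implementation itself. One possible guard is sketched below with a hypothetical build_optimizer helper that is not part of this repository.

```python
# Hypothetical helper: construct the optimizer only from parameters that
# actually require gradients, and fail fast with a readable message when
# the set is empty (the condition that makes ZeRO's flattening step fail).
import torch

def build_optimizer(model, lr=1e-4, weight_decay=0.0):
    params = [p for p in model.parameters() if p.requires_grad]
    if not params:
        raise ValueError(
            "No trainable parameters found; make sure the model is not "
            "fully frozen before handing it to Lightning/DeepSpeed."
        )
    return torch.optim.AdamW(params, lr=lr, weight_decay=weight_decay)
```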