Please check that this issue hasn't been reported before.
I searched previous Bug Reports and didn't find any similar reports.
Expected Behavior
Training runs normally.
Current behaviour
Traceback (most recent call last):
File "/usr/local/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/usr/local/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/home/axolotl/src/axolotl/cli/train.py", line 38, in
fire.Fire(do_cli)
File "/usr/local/lib/python3.10/site-packages/fire/core.py", line 135, in Fire
component_trace = _Fire(component, args, parsed_flag_args, context, name)
File "/usr/local/lib/python3.10/site-packages/fire/core.py", line 468, in _Fire
component, remaining_args = _CallAndUpdateTrace(
File "/usr/local/lib/python3.10/site-packages/fire/core.py", line 684, in _CallAndUpdateTrace
component = fn(*varargs, **kwargs)
File "/home/axolotl/src/axolotl/cli/train.py", line 34, in do_cli
train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta)
File "/home/axolotl/src/axolotl/train.py", line 119, in train
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
File "/usr/local/lib/python3.10/site-packages/transformers/trainer.py", line 2164, in train
return inner_training_loop(
File "/usr/local/lib/python3.10/site-packages/transformers/trainer.py", line 2522, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
File "/usr/local/lib/python3.10/site-packages/transformers/trainer.py", line 3656, in training_step
loss = self.compute_loss(model, inputs, num_items_in_batch)
File "/home/axolotl/src/axolotl/core/trainer_builder.py", line 250, in compute_loss
return super().compute_loss(model, inputs, return_outputs=return_outputs)
File "/home/axolotl/src/axolotl/monkeypatch/medusa_utils.py", line 227, in compute_loss
logits = model(
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/deepspeed/utils/nvtx.py", line 15, in wrapped_fn
ret_val = func(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 1855, in forward
loss = self.module(*inputs, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/peft/peft_model.py", line 1091, in forward
return self.base_model(
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/peft/tuners/tuners_utils.py", line 160, in forward
return self.model.forward(*args, **kwargs)
File "/home/axolotl/src/axolotl/monkeypatch/medusa_utils.py", line 184, in forward
medusa_logits.append(self.medusa_head[i](hidden_states))
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/peft/utils/other.py", line 250, in forward
return self.modules_to_save[self.active_adapter](*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/container.py", line 215, in forward
input = module(input)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/nn/modules/linear.py", line 114, in forward
return F.linear(input, self.weight, self.bias)
torch.cuda.OutOfMemoryError: HIP out of memory. Tried to allocate 8.70 GiB. GPU 0 has a total capacty of 63.98 GiB of which 0 bytes is free. Of the allocated memory 52.85 GiB is allocated by PyTorch, and 8.45 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_HIP_ALLOC_CONF
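As an aside, the allocator hint at the end of that message refers to the ROCm/HIP caching allocator. A minimal sketch of how it is usually set before launching, assuming a ROCm build of PyTorch; the 128 MB split size is only an illustrative value, not something tested in this thread:
'''
# Reduce fragmentation in the HIP caching allocator, as suggested by the OOM message.
# The value 128 is only an example; tune it for your setup.
export PYTORCH_HIP_ALLOC_CONF=max_split_size_mb:128
accelerate launch -m axolotl.cli.train examples/medusa/qwen_lora_stage1.yml
'''
This only mitigates fragmentation; it does not create more VRAM, so the offload/quantization advice below is still the main fix.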
How many GPUs are you using? Even with multiple GPUs, you're going to need ZeRO-3 with offload; none of the vanilla ZeRO stages 1-3 is going to work for a 72B-parameter model. Additionally, consider using an 8-bit LoRA instead of half precision. A 72B model needs 144 GB of VRAM just for the model weights at half precision.
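For reference, switching to an 8-bit LoRA in axolotl is mostly a matter of a few YAML keys. A minimal sketch using standard axolotl LoRA options; the rank, alpha, and dropout values are placeholders rather than values from this issue:
'''
# Sketch of an 8-bit LoRA setup in the axolotl config (values are illustrative).
load_in_8bit: true
adapter: lora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
'''
Holding the frozen base weights in 8-bit roughly halves their memory footprint compared to half precision, while only the adapter (and, in this setup, the Medusa heads) remains trainable.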
ZeRO-3 with offload is "out of memory" too. With 4 GPUs (4 × 64 GB of VRAM) and with 8 GPUs it's all the same bug. I have also tested an 8-bit LoRA. It seems like DeepSpeed isn't taking effect, but the DeepSpeed init checks out and the DeepSpeed code path in the transformers Trainer looks normal, so I don't know why.
Right, but you have to offload everything with ZeRO-3, which isn't the default; that's why I'm asking. Also, you're going to need close to 4x GPUs, maybe 8x, if you're not doing a quantized LoRA.
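To make "offload everything" concrete, the relevant portion of a ZeRO-3 DeepSpeed config with CPU offload looks roughly like the excerpt below. This is a minimal sketch using standard DeepSpeed keys, not the exact zero3.json attached later in this issue:
'''
{
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": { "device": "cpu", "pin_memory": true },
    "offload_param": { "device": "cpu", "pin_memory": true }
  },
  "bf16": { "enabled": "auto" },
  "train_micro_batch_size_per_gpu": "auto"
}
'''
Without the two offload blocks, ZeRO-3 only shards states across the GPUs; with 4 × 64 GB, that alone leaves little headroom for a 72B model plus activations.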
Steps to reproduce
zero1.json, zero2.json, zero3.json
accelerate launch -m axolotl.cli.train examples/medusa/qwen_lora_stage1.yml
https://github.com/ctlllll/axolotl.git
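For completeness, the DeepSpeed JSONs listed above are typically wired in through the deepspeed key in the axolotl config. A minimal sketch, assuming the attached zero3.json sits next to the YAML (the path is illustrative):
'''
# In examples/medusa/qwen_lora_stage1.yml (sketch; the path is illustrative)
deepspeed: zero3.json
'''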
I have tested #1129:
'''
load_in_8bit: false
load_in_4bit: false
...
"bf16": true
'''
but it didn't work! Any help?
Config yaml
Possible solution
No response
Which Operating Systems are you using?
Python Version
3.10
axolotl branch-commit
https://github.com/ctlllll/axolotl.git
Acknowledgements