diff --git a/requirements-cuda.txt b/requirements-cuda.txt index 6230797e..d030d340 100644 --- a/requirements-cuda.txt +++ b/requirements-cuda.txt @@ -1,5 +1,8 @@ flash-attn>=2.4.0 bitsandbytes>=0.43.1 +# available as an option for NVIDIA, FSDP still default +deepspeed>=0.14.3 + # required for FSDP updates -accelerate>=0.34.2 +accelerate>=0.34.2,<1.1.0 diff --git a/requirements-hpu.txt b/requirements-hpu.txt index 6c9ca0c2..73db0ef6 100644 --- a/requirements-hpu.txt +++ b/requirements-hpu.txt @@ -1,2 +1,2 @@ # required for optimum-habana's deps -accelerate>=0.33.0 \ No newline at end of file +accelerate>=0.33.0,<1.1.0 \ No newline at end of file diff --git a/requirements-rocm.txt b/requirements-rocm.txt index 9d72f4d5..bb9d572e 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -1,3 +1,3 @@ flash-attn>=2.6.2,<2.7.0 # required for FSDP updates -accelerate>=0.34.2 +accelerate>=0.34.2,<1.1.0 diff --git a/requirements.txt b/requirements.txt index 7c6a00ca..d4b7760c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -21,7 +21,4 @@ instructlab-dolomite>=0.2.0 trl>=0.9.4 peft pydantic>=2.7.0 - -# deepspeed needs to be at the end or it'll break stuff. -deepspeed>=0.14.3 aiofiles>=23.2.1 diff --git a/src/instructlab/training/config.py b/src/instructlab/training/config.py index bcc06fa2..bf43f2eb 100644 --- a/src/instructlab/training/config.py +++ b/src/instructlab/training/config.py @@ -195,7 +195,7 @@ class TrainingArgs(BaseModel): cpu_offload_params=False, sharding_strategy=ShardingStrategies.SHARD_GRAD_OP ) ) - distributed_backend: DistributedBackend = DistributedBackend.DEEPSPEED + distributed_backend: DistributedBackend = DistributedBackend.FSDP disable_flash_attn: Optional[bool] = False diff --git a/src/instructlab/training/setup_accelerator.py b/src/instructlab/training/setup_accelerator.py index 239367f4..c7d079e6 100644 --- a/src/instructlab/training/setup_accelerator.py +++ b/src/instructlab/training/setup_accelerator.py @@ -96,8 +96,13 @@ def get_fsdp_config(args, model: PreTrainedModel): def setup_accelerator(args, model: PreTrainedModel, grad_accum): if args.distributed_training_framework == "deepspeed": - # Third Party - from deepspeed import DeepSpeedEngine + try: + # Third Party + from deepspeed import DeepSpeedEngine + except ImportError as exc: + raise ImportError( + "DeepSpeed selected as distributed framework, but not installed" + ) from exc # patch deepspeed to work with quantized models. if args.lora_quant_bits is not None: diff --git a/src/instructlab/training/utils.py b/src/instructlab/training/utils.py index 6a9d6f84..41ec413f 100644 --- a/src/instructlab/training/utils.py +++ b/src/instructlab/training/utils.py @@ -653,16 +653,21 @@ def prepare_universal_checkpoint_from_latest(output_dir): start = time.time() if torch.distributed.get_rank() == 0: - # Third Party - from deepspeed.checkpoint import DeepSpeedCheckpoint - from deepspeed.checkpoint.ds_to_universal import ( - PARAM_SHAPES, - UNIVERSAL_CHECKPOINT_INFO, - _check_for_required_state, - _extract_zero_shard_files, - _merge_tp_slice_files, - _save_optimizer_state, - ) + try: + # Third Party + from deepspeed.checkpoint import DeepSpeedCheckpoint + from deepspeed.checkpoint.ds_to_universal import ( + PARAM_SHAPES, + UNIVERSAL_CHECKPOINT_INFO, + _check_for_required_state, + _extract_zero_shard_files, + _merge_tp_slice_files, + _save_optimizer_state, + ) + except ImportError as exc: + raise ImportError( + "DeepSpeed-specific checkpoints cannot be saved without DeepSpeed>=0.14.3 installed" + ) from exc # read the latest file to get the step folder latest_file = output_dir / "latest"