Skip to content

Commit

Permalink
Bump pytorch to 2.5.0
Browse files Browse the repository at this point in the history
  • Loading branch information
b-chu committed Oct 18, 2024
1 parent 32caf5b commit 195bdc4
Show file tree
Hide file tree
Showing 13 changed files with 151 additions and 144 deletions.
28 changes: 0 additions & 28 deletions .github/workflows/daily.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,6 @@ jobs:
strategy:
matrix:
include:
- name: cpu-3.11-2.2
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: cpu-3.11-2.3
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
markers: not daily and (remote or not remote) and not gpu and not doctest
Expand All @@ -45,11 +40,6 @@ jobs:
markers: not daily and (remote or not remote) and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
composer_package_name: mosaicml
- name: daily-cpu-3.11-2.2
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
- name: daily-cpu-3.11-2.3
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
markers: daily and (remote or not remote) and not gpu and not doctest
Expand Down Expand Up @@ -107,12 +97,6 @@ jobs:
include:
# Unlike CPU tests, we run daily tests together with GPU tests to minimize launch time
# on MCLOUD and not eat up all GPUs at once
- name: "gpu-3.11-2.2-1-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 1
- name: "gpu-3.11-2.3-1-gpu"
container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand All @@ -125,12 +109,6 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 1
- name: "gpu-3.11-2.2-2-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 2
- name: "gpu-3.11-2.3-2-gpu"
container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand All @@ -143,12 +121,6 @@ jobs:
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 2
- name: "gpu-3.11-2.2-4-gpu"
container: mosaicml/pytorch:2.2.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
pytest_command: "coverage run -m pytest"
composer_package_name: "mosaicml"
gpu_num: 4
- name: "gpu-3.11-2.3-4-gpu"
container: mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04
markers: "(daily or not daily) and (remote or not remote) and gpu and (doctest or not doctest)"
Expand Down
10 changes: 5 additions & 5 deletions .github/workflows/pr-cpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,6 @@ jobs:
strategy:
matrix:
include:
- name: cpu-3.11-2.2
container: mosaicml/pytorch:2.2.1_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
pytest_command: coverage run -m pytest
- name: cpu-3.11-2.3
container: mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
Expand All @@ -28,8 +24,12 @@ jobs:
container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
pytest_command: coverage run -m pytest
- name: cpu-3.11-2.5
container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and not doctest
pytest_command: coverage run -m pytest
- name: cpu-doctest
container: mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04
container: mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04
markers: not daily and not remote and not gpu and doctest
pytest_command: coverage run -m pytest tests/test_docs.py
steps:
Expand Down
14 changes: 7 additions & 7 deletions .github/workflows/pr-gpu.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
name: PR GPU tests
on:
pull_request_target:
pull_request:
workflow_dispatch:
# Cancel old runs when a new commit is pushed to the same branch if not on main
# or dev
Expand All @@ -15,8 +15,8 @@ jobs:
strategy:
matrix:
include:
- name: gpu-3.11-2.4-1
container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
- name: gpu-3.11-2.5-1
container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
markers: not daily and not remote and gpu and (doctest or not doctest)
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
Expand Down Expand Up @@ -44,8 +44,8 @@ jobs:
strategy:
matrix:
include:
- name: gpu-3.11-2.4-2
container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
- name: gpu-3.11-2.5-2
container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
markers: not daily and not remote and gpu and (doctest or not doctest)
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
Expand Down Expand Up @@ -74,8 +74,8 @@ jobs:
strategy:
matrix:
include:
- name: gpu-3.11-2.4-4
container: mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04
- name: gpu-3.11-2.5-4
container: mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04
markers: not daily and not remote and gpu and (doctest or not doctest)
pytest_command: coverage run -m pytest
composer_package_name: mosaicml
Expand Down
2 changes: 1 addition & 1 deletion composer/checkpoint/state_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def get_model_state_dict(
log.debug('Calling model.state_dict() for non-FSDP model...')
model_state_dict = model.state_dict()
if isinstance(model, DistributedDataParallel):
nn.modules.utils.consume_prefix_in_state_dict_if_present(model_state_dict, 'module.')
nn.modules.utils.consume_prefix_in_state_dict_if_present(model_state_dict, 'module.') # type: ignore

if include_keys is not None:
model_state_dict = _extract_keys_from_state_dict(model_state_dict, include_keys)
Expand Down
2 changes: 1 addition & 1 deletion composer/models/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -916,7 +916,7 @@ def write_huggingface_pretrained_from_composer_checkpoint(
peft_config.save_pretrained(str(output_folder))

weights_state_dict = composer_state_dict['state']['model']
torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.')
torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(weights_state_dict, prefix='model.') # type: ignore

# NOTE: This only works for default adapter name, not multiple adapters
if peft_config is not None:
Expand Down
55 changes: 55 additions & 0 deletions composer/trainer/_patch_pytorch.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,12 @@ def patch_pytorch():
# No monkeypatches!
pass

elif version.parse(torch.__version__) < version.parse('2.5.1'):
# Monkey patch for torch < 2.5.1 ie torch == 2.5.0

# No monkeypatches!
pass


if version.parse(torch.__version__) >= version.parse('2.2.1') and version.parse(
torch.__version__,) < version.parse('2.2.3'):
Expand Down Expand Up @@ -1046,3 +1052,52 @@ def unshard_with_sync(self):
raise RuntimeError('CUDA out of memory encountered on a different rank')
padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param)
self._use_unsharded_flat_param(padded_unsharded_flat_param)

if version.parse(torch.__version__) >= version.parse('2.5.0') and version.parse(
torch.__version__,
) < version.parse('2.5.1'):

# Save original FlatParamHandle.unshard to revert back to when dropping automicrobatching hooks
from torch.distributed.fsdp._flat_param import FlatParamHandle
original_unshard = FlatParamHandle.unshard

@no_type_check
def unshard_with_sync(self):
"""Run the unshard logic, but with a sync after a :meth:`_alloc_padded_unsharded_flat_param`.
This prevents deadlocks when some ranks OOM after the alloc call and others do not.
This is a patched method from pytorch, meant to be called when automicrobatching
turns on hooks in its search process for the optimal non-OOMing microbatch size.
This includes all-gathering the flat parameter
and switching to using the unsharded flat parameter. If the handle does
not need unsharding, then this only switches to using the unsharded
flat parameter. For ``NO_SHARD``, this is a no-op.
If FSDP is in :meth:`summon_full_params` and the handle uses parameter
mixed precision, then the parameter is forced to full precision.
"""
if not self.needs_unshard():
# Even when not needing an unshard, we should switch to using
# the unsharded flat parameter
unsharded_flat_param = (
self._get_padded_unsharded_flat_param()
if self.uses_sharded_strategy
else self.flat_param
)
self._use_unsharded_flat_param(unsharded_flat_param)
return
unsharded_flat_param = self._alloc_padded_unsharded_flat_param()

# Check if any other rank hit an OOM
found_cuda_oom_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True)

dist.all_reduce(found_cuda_oom_tensor, reduce_operation='MAX')
found_cuda_oom = found_cuda_oom_tensor.item()
# Signal current rank is still in batch
all_ranks_finished_tensor = torch.tensor([0], dtype=torch.uint8).to(self.device, non_blocking=True)

dist.all_reduce(all_ranks_finished_tensor, reduce_operation='MIN')

if found_cuda_oom == 1:
raise RuntimeError('CUDA out of memory encountered on a different rank')
padded_unsharded_flat_param = self._all_gather_flat_param(unsharded_flat_param)
self._use_unsharded_flat_param(padded_unsharded_flat_param)
9 changes: 6 additions & 3 deletions composer/trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2317,9 +2317,12 @@ def fit(
self.state.max_duration = duration + self.state.timestamp.get(duration.unit)

# Raise error if callig fit with SGD
if type(
self.state.optimizers[0],
) == torch.optim.SGD and version.parse(torch.__version__) >= version.parse('2.4.0'):
if (
type(
self.state.optimizers[0],
) == torch.optim.SGD and version.parse(torch.__version__) >= version.parse('2.4.0') and
version.parse(torch.__version__) >= version.parse('2.4.0')
):
raise ValueError(
'PyTorch 2.4 breaks (distributed) checkpointing with SGD. '
'Please use a different optimizer, e.g. composer.optim.DecoupledSGDW, '
Expand Down
12 changes: 6 additions & 6 deletions docker/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ To install composer, once inside the image, run `pip install mosaicml`.
<!-- BEGIN_PYTORCH_BUILD_MATRIX -->
| Linux Distro | Flavor | PyTorch Version | CUDA Version | Python Version | Docker Tags |
|----------------|----------|-------------------|---------------------|------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.5.0 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:latest`, `mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.5.0 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:latest-aws`, `mosaicml/pytorch:2.5.0_cu124-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.5.0 | cpu | 3.11 | `mosaicml/pytorch:latest_cpu`, `mosaicml/pytorch:2.5.0_cpu-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.4.1 | 12.4.1 (EFA) | 3.11 | `mosaicml/pytorch:2.4.1_cu124-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.4.1 | cpu | 3.11 | `mosaicml/pytorch:2.4.1_cpu-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.3.1 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.3.1_cu121-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.3.1 | cpu | 3.11 | `mosaicml/pytorch:2.3.1_cpu-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (Infiniband) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04` |
| Ubuntu 20.04 | Base | 2.2.2 | 12.1.1 (EFA) | 3.11 | `mosaicml/pytorch:2.2.2_cu121-python3.11-ubuntu20.04-aws` |
| Ubuntu 20.04 | Base | 2.2.2 | cpu | 3.11 | `mosaicml/pytorch:2.2.2_cpu-python3.11-ubuntu20.04` |
<!-- END_PYTORCH_BUILD_MATRIX -->

**Note**: The `mosaicml/pytorch:latest`, `mosaicml/pytorch:latest_cpu`, and `mosaicml/pytorch:latest-aws`
Expand Down
Loading

0 comments on commit 195bdc4

Please sign in to comment.