From 416a1b73f3152444670bb8200cca12f2a374e9dd Mon Sep 17 00:00:00 2001 From: Saaketh Date: Thu, 22 Aug 2024 18:06:23 -0400 Subject: [PATCH 1/4] bumP --- composer/_version.py | 2 +- composer/callbacks/__init__.py | 2 - .../callbacks/eval_output_logging_callback.py | 129 ------------------ composer/core/state.py | 16 --- composer/trainer/trainer.py | 3 +- composer/utils/__init__.py | 3 +- composer/utils/parallelism.py | 39 ------ docker/README.md | 4 +- docker/build_matrix.yaml | 16 +-- docker/generate_build_matrix.py | 2 +- 10 files changed, 14 insertions(+), 202 deletions(-) delete mode 100644 composer/callbacks/eval_output_logging_callback.py diff --git a/composer/_version.py b/composer/_version.py index a38b61a722..72f97c6d90 100644 --- a/composer/_version.py +++ b/composer/_version.py @@ -3,4 +3,4 @@ """The Composer Version.""" -__version__ = '0.24.0.dev0' +__version__ = '0.24.0' diff --git a/composer/callbacks/__init__.py b/composer/callbacks/__init__.py index b876826e3c..16a50a31a9 100644 --- a/composer/callbacks/__init__.py +++ b/composer/callbacks/__init__.py @@ -9,7 +9,6 @@ from composer.callbacks.activation_monitor import ActivationMonitor from composer.callbacks.checkpoint_saver import CheckpointSaver from composer.callbacks.early_stopper import EarlyStopper -from composer.callbacks.eval_output_logging_callback import EvalOutputLogging from composer.callbacks.export_for_inference import ExportForInferenceCallback from composer.callbacks.free_outputs import FreeOutputs from composer.callbacks.generate import Generate @@ -36,7 +35,6 @@ 'CheckpointSaver', 'MLPerfCallback', 'EarlyStopper', - 'EvalOutputLogging', 'ExportForInferenceCallback', 'ThresholdStopper', 'ImageVisualizer', diff --git a/composer/callbacks/eval_output_logging_callback.py b/composer/callbacks/eval_output_logging_callback.py deleted file mode 100644 index 717994413a..0000000000 --- a/composer/callbacks/eval_output_logging_callback.py +++ /dev/null @@ -1,129 +0,0 @@ -# Copyright 2022 
MosaicML Composer authors -# SPDX-License-Identifier: Apache-2.0 - -"""Log model outputs and expected outputs during ICL evaluation.""" - -import warnings -from copy import deepcopy -from typing import Any, Sequence, Union - -import torch - -from composer.core import Callback, State -from composer.loggers import ConsoleLogger, Logger -from composer.utils import VersionedDeprecationWarning, dist - - -class EvalOutputLogging(Callback): - """Logs eval outputs for each sample of each ICL evaluation dataset. - - ICL metrics are required to support caching the model's responses including information on whether model was correct. - Metrics are responsible for returning the results of individual data points in a dictionary of lists. - The callback will log the metric name, the depadded and detokenized input, any data stored in state.metric_outputs, and - any keys from the batch passed into `batch_keys_to_log`. It will do so after every eval batch. - """ - - def __init__(self, log_tokens=False, *args, **kwargs): - warnings.warn( - VersionedDeprecationWarning( - '`InContextLearningMetric` and it\'s subclasses have been deprecated and ' + - 'migrated to MosaicML\'s llm-foundry repo under the llmfoundry.eval.datasets.in_context_learning module: ' - + 'https://github.com/mosaicml/llm-foundry/blob/main/scripts/eval/README.md.' + - 'As EvalOutputLogging only works for ICL metrics, it has been deprecated and ' + - 'will be migrated as well.', - remove_version='0.24.0', - ), - ) - super().__init__(self, *args, **kwargs) - self.log_tokens = log_tokens - self.columns = None - self.name = None - self.rows = [] - - def eval_batch_end(self, state: State, logger: Logger) -> None: - if not isinstance(state.batch, dict): - warnings.warn( - f'''EvalOutputLogging only supports batches that are dictionary. \ - Found batch for type {type(state.batch)}. 
\ - Not logging eval outputs.''', - ) - return - - assert state.outputs is not None - assert state.metric_outputs is not None - logging_dict: dict[str, Union[list[Any], torch.Tensor, Sequence[torch.Tensor]]] = deepcopy(state.metric_outputs) - - # If batch mode is not generate, outputs will be logits - if state.batch['mode'] == 'generate': - # Outputs are already detokenized - logging_dict['outputs'] = state.outputs - - input_ids = state.batch['input_ids'] - logged_input = [] - assert state.dataloader is not None - - # Depad and decode input_ids - for input_list in input_ids.tolist(): - dataset = state.dataloader.dataset # pyright: ignore[reportGeneralTypeIssues] - depadded_input = [tok for tok in input_list if tok != dataset.pad_tok_id] - logged_input.append(dataset.tokenizer.decode(depadded_input)) - logging_dict['input'] = logged_input - - # Log token indices if toggled - if self.log_tokens: - logging_dict['input_tokens'] = input_ids.tolist() - if not state.batch['mode'] == 'generate': - if isinstance(state.outputs, torch.Tensor): # pyright - logging_dict['label_tokens'] = state.outputs.tolist() - - # Add run_name as a column - run_name_list = [state.run_name for _ in range(0, len(logging_dict['input']))] - logging_dict['run_name'] = run_name_list - - # NOTE: This assumes _any_ tensor logged are tokens to be decoded. - # This might not be true if, for example, logits are logged. 
- - # Detokenize data in rows - for key, value in logging_dict.items(): - # All types in list are the same - if isinstance(value[0], torch.Tensor): - logging_dict[key] = [ - state.dataloader.dataset.tokenizer.decode(t) # pyright: ignore[reportGeneralTypeIssues] - for t in value - ] - elif isinstance(value[0], list): - if isinstance(value[0][0], torch.Tensor): - tokenizer = state.dataloader.dataset.tokenizer # pyright: ignore[reportGeneralTypeIssues] - logging_dict[key] = [[tokenizer.decode(choice) for choice in t] for t in value] - - # Convert logging_dict from kv pairs of column name and column values to a list of rows - # Example: - # logging_dict = {"a": ["1a", "2a"], "b": ["1b", "2b"]} - # will become - # columns = {"a", "b"}, rows = [["1a", "1b"], ["2a", "2b"]] - columns = list(logging_dict.keys()) - rows = [list(item) for item in zip(*logging_dict.values())] - - assert state.dataloader_label is not None - if not self.name: - # If only running eval, step will be 0 - # If running training, step will be current training step - step = state.timestamp.batch.value - self.name = f'{state.dataloader_label}_step_{step}' - self.columns = columns - self.rows.extend(rows) - - def eval_end(self, state: State, logger: Logger) -> None: - # eval_batch_end will have set these if there is anything to log - if self.name is None or self.columns is None: - return - - list_of_rows = dist.all_gather_object(self.rows) - rows = [row for rows in list_of_rows for row in rows] - for dest_logger in logger.destinations: - if not isinstance(dest_logger, ConsoleLogger): - dest_logger.log_table(self.columns, rows, name=self.name, step=state.timestamp.batch.value) - - self.rows = [] - self.name = None - self.columns = None diff --git a/composer/core/state.py b/composer/core/state.py index 7c43473ace..ce10438c60 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -47,7 +47,6 @@ ParallelismConfig, ParallelismType, TPConfig, - VersionedDeprecationWarning, batch_get, batch_set, 
dist, @@ -900,21 +899,6 @@ def fsdp_state_dict_type(self): def fsdp_sharded_state_dict_enabled(self): return self.fsdp_config is not None and self.fsdp_enabled and self.fsdp_state_dict_type == 'sharded' - @property - def fsdp_device_mesh(self): - warnings.warn(VersionedDeprecationWarning('fsdp_device_mesh is deprecated. Use device_mesh instead.', '0.24')) - return self.device_mesh - - @property - def load_fsdp_monolith_rank0_only(self): - warnings.warn( - VersionedDeprecationWarning( - 'load_fsdp_monolith_rank0_only is deprecated. Use load_monolith_rank0_only instead.', - '0.24', - ), - ) - return self.load_monolith_rank0_only - @property def load_monolith_rank0_only(self): return ( diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 2a65a40b43..8c00d375c2 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -114,7 +114,6 @@ Transform, VersionedDeprecationWarning, checkpoint, - create_fsdp_config, dist, ensure_tuple, export_with_logger, @@ -1323,7 +1322,7 @@ def __init__( if isinstance(parallelism_config['fsdp'], FSDPConfig): parallelism_config_args['fsdp'] = parallelism_config['fsdp'] else: - parallelism_config_args['fsdp'] = create_fsdp_config(parallelism_config['fsdp']) + parallelism_config_args['fsdp'] = FSDPConfig(**parallelism_config_args['fsdp']) if 'tp' in parallelism_config and parallelism_config['tp'] is not None: if isinstance(parallelism_config['tp'], TPConfig): parallelism_config_args['tp'] = parallelism_config['tp'] diff --git a/composer/utils/__init__.py b/composer/utils/__init__.py index 20fa44e092..283ab446c1 100644 --- a/composer/utils/__init__.py +++ b/composer/utils/__init__.py @@ -76,7 +76,7 @@ UCObjectStore, build_remote_backend, ) -from composer.utils.parallelism import FSDPConfig, ParallelismConfig, TPConfig, create_fsdp_config +from composer.utils.parallelism import FSDPConfig, ParallelismConfig, TPConfig from composer.utils.remote_uploader import RemoteFilesExistingCheckStatus, 
RemoteUploader from composer.utils.retrying import retry from composer.utils.string_enum import StringEnum @@ -153,7 +153,6 @@ 'KNOWN_COMPRESSORS', 'STR_TO_DTYPE', 'ParallelismType', - 'create_fsdp_config', 'FSDPConfig', 'TPConfig', 'ParallelismConfig', diff --git a/composer/utils/parallelism.py b/composer/utils/parallelism.py index 4dc921b63a..6d4e05d773 100644 --- a/composer/utils/parallelism.py +++ b/composer/utils/parallelism.py @@ -3,14 +3,11 @@ """Parallelism configs.""" -import warnings from dataclasses import dataclass from typing import Any, Optional from torch.distributed._tensor.device_mesh import DeviceMesh -from composer.utils.warnings import VersionedDeprecationWarning - @dataclass class FSDPConfig: @@ -45,42 +42,6 @@ class FSDPConfig: verbose: bool = False -def create_fsdp_config(fsdp_config: dict[str, Any]): - """Modify fsdp_config to set default values for missing keys.""" - fsdp_config = {**fsdp_config} # Shallow copy to avoid modifying input - if 'process_group' in fsdp_config: - warnings.warn( - VersionedDeprecationWarning( - 'process_group is deprecated. Please specify `data_parallel_shard_degree` and `data_parallel_replicate_degree` instead.', - remove_version='0.24', - ), - ) - - if 'device_mesh' in fsdp_config: - warnings.warn( - VersionedDeprecationWarning( - 'device_mesh is deprecated. Please specify `data_parallel_shard_degree` and `data_parallel_replicate_degree` instead.', - remove_version='0.24', - ), - ) - if 'data_parallel_shard_degree' in fsdp_config or 'data_parallel_replicate_degree' in fsdp_config: - raise ValueError( - 'Cannot specify both `device_mesh` and `data_parallel_shard_degree` or `data_parallel_replicate_degree`. 
Please remove `device_mesh`.', - ) - device_mesh = fsdp_config.pop('device_mesh') - if len(device_mesh) == 1: - fsdp_config['data_parallel_shard_degree'] = device_mesh[0] - elif len(device_mesh) == 2: - fsdp_config['data_parallel_replicate_degree'] = device_mesh[0] - fsdp_config['data_parallel_shard_degree'] = device_mesh[1] - else: - raise ValueError( - f'device_mesh must be of length 1 or 2 but received length {len(device_mesh)} with device mesh {device_mesh}.', - ) - - return FSDPConfig(**fsdp_config) - - @dataclass class TPConfig: """Configuration for tensor parallelism (TP).""" diff --git a/docker/README.md b/docker/README.md index a8ebfa63e4..7639a70f06 100644 --- a/docker/README.md +++ b/docker/README.md @@ -15,8 +15,8 @@ all dependencies for both NLP and Vision models. They are built on top of the | Composer Version | CUDA Support | Docker Tag | |--------------------|----------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| 0.23.5 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.23.5` | -| 0.23.5 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.23.5_cpu` | +| 0.24.0 | Yes | `mosaicml/composer:latest`, `mosaicml/composer:0.24.0` | +| 0.24.0 | No | `mosaicml/composer:latest_cpu`, `mosaicml/composer:0.24.0_cpu` | **Note**: For a lightweight installation, we recommended using a [MosaicML PyTorch Image](#pytorch-images) and manually diff --git a/docker/build_matrix.yaml b/docker/build_matrix.yaml index 856ae598e1..32b942d265 100644 --- a/docker/build_matrix.yaml +++ b/docker/build_matrix.yaml @@ -194,9 +194,9 @@ TORCHVISION_VERSION: 0.17.2 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: nvidia/cuda:12.4.1-cudnn-devel-ubuntu20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.24.0 CUDA_VERSION: 12.4.1 - IMAGE_NAME: composer-0-23-5 + IMAGE_NAME: composer-0-24-0 MOFED_VERSION: 
latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' @@ -204,17 +204,17 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.4.0 TAGS: - - mosaicml/composer:0.23.5 - - ghcr.io/databricks-mosaic/composer:0.23.5 + - mosaicml/composer:0.24.0 + - ghcr.io/databricks-mosaic/composer:0.24.0 - mosaicml/composer:latest - ghcr.io/databricks-mosaic/composer:latest TARGET: composer_stage TORCHVISION_VERSION: 0.19.0 - AWS_OFI_NCCL_VERSION: '' BASE_IMAGE: ubuntu:20.04 - COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.23.5 + COMPOSER_INSTALL_COMMAND: mosaicml[all]==0.24.0 CUDA_VERSION: '' - IMAGE_NAME: composer-0-23-5-cpu + IMAGE_NAME: composer-0-24-0-cpu MOFED_VERSION: latest-23.10 NVIDIA_REQUIRE_CUDA_OVERRIDE: '' PYTHON_VERSION: '3.11' @@ -222,8 +222,8 @@ PYTORCH_NIGHTLY_VERSION: '' PYTORCH_VERSION: 2.4.0 TAGS: - - mosaicml/composer:0.23.5_cpu - - ghcr.io/databricks-mosaic/composer:0.23.5_cpu + - mosaicml/composer:0.24.0_cpu + - ghcr.io/databricks-mosaic/composer:0.24.0_cpu - mosaicml/composer:latest_cpu - ghcr.io/databricks-mosaic/composer:latest_cpu TARGET: composer_stage diff --git a/docker/generate_build_matrix.py b/docker/generate_build_matrix.py index d2261a4ea3..bb9317dbe6 100644 --- a/docker/generate_build_matrix.py +++ b/docker/generate_build_matrix.py @@ -244,7 +244,7 @@ def _main(): composer_entries = [] # The `GIT_COMMIT` is a placeholder and Jenkins will substitute it with the actual git commit for the `composer_staging` images - composer_versions = ['0.23.5'] # Only build images for the latest composer version + composer_versions = ['0.24.0'] # Only build images for the latest composer version composer_python_versions = [PRODUCTION_PYTHON_VERSION] # just build composer against the latest for product in itertools.product(composer_python_versions, composer_versions, cuda_options): From f2e3496eba75d9e01f105b172076851b91bd6d25 Mon Sep 17 00:00:00 2001 From: Saaketh Date: Thu, 22 Aug 2024 18:11:18 -0400 Subject: [PATCH 2/4] bumP --- composer/trainer/trainer.py 
| 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/composer/trainer/trainer.py b/composer/trainer/trainer.py index 8c00d375c2..68d543e40e 100644 --- a/composer/trainer/trainer.py +++ b/composer/trainer/trainer.py @@ -1322,7 +1322,7 @@ def __init__( if isinstance(parallelism_config['fsdp'], FSDPConfig): parallelism_config_args['fsdp'] = parallelism_config['fsdp'] else: - parallelism_config_args['fsdp'] = FSDPConfig(**parallelism_config_args['fsdp']) + parallelism_config_args['fsdp'] = FSDPConfig(**parallelism_config['fsdp']) if 'tp' in parallelism_config and parallelism_config['tp'] is not None: if isinstance(parallelism_config['tp'], TPConfig): parallelism_config_args['tp'] = parallelism_config['tp'] From fb9f919bb0a7840a53d16e678395aff4c2a15991 Mon Sep 17 00:00:00 2001 From: Saaketh Date: Fri, 23 Aug 2024 11:32:59 -0400 Subject: [PATCH 3/4] ayo --- composer/checkpoint/load.py | 10 ++++++---- composer/core/state.py | 12 +++++++----- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/composer/checkpoint/load.py b/composer/checkpoint/load.py index a78e871ccd..c82c0d64d8 100644 --- a/composer/checkpoint/load.py +++ b/composer/checkpoint/load.py @@ -539,10 +539,12 @@ def torch_set_model_state_dict( textwrap.dedent( 'PyTorch DTensor broke backwards compatibility in older checkpoints ' 'with ShardedTensor, which is now deprecated. To load old checkpoints, ' - 'either downgrade to PyTorch <2.3.0 or explicitly pass process groups ' - 'in the Trainer constructor via ' - "`parallelism_config = {'fsdp': {'process_group': 'mod1'}}`. We can " - 'provide assistance at https://github.com/mosaicml/composer/issues.', + 'either downgrade to PyTorch <2.3.0 or define process groups in the ' + 'Trainer constructor via the `data_parallel_shard_degree` or ' + '`data_parallel_replicate_degree` arguments to `parallelism_config` via ' + 'the "fsdp" key, as ' + '`parallelism_config = {"fsdp": {"data_parallel_shard_degree": 8}}`, ' + 'for example. 
We can provide assistance at https://github.com/mosaicml/composer/issues.', ), ) from e else: diff --git a/composer/core/state.py b/composer/core/state.py index ce10438c60..fe35fa42a0 100644 --- a/composer/core/state.py +++ b/composer/core/state.py @@ -616,7 +616,7 @@ def _validate_parallelism_configs(self): # Load monolith rank0 only if self.load_monolith_rank0_only: if self.tp_config is not None: - raise ValueError('load_fsdp_monolith_rank0_only is not compatible with tensor parallelism (TP).') + raise ValueError('load_monolith_rank0_only is not compatible with tensor parallelism (TP).') assert self.fsdp_config is not None error_message = '' if self.fsdp_config.sync_module_states == False: @@ -1336,10 +1336,12 @@ def load_model_state( textwrap.dedent( 'PyTorch DTensor broke backwards compatibility in older checkpoints ' 'with ShardedTensor, which is now deprecated. To load old checkpoints, ' - 'either downgrade to PyTorch <2.3.0 or explicitly pass process groups ' - 'in the Trainer constructor via ' - "`parallelism_config = {'fsdp': {'process_group': 'mod1'}}`. We can " - 'provide assistance at https://github.com/mosaicml/composer/issues.', + 'either downgrade to PyTorch <2.3.0 or define process groups in the ' + 'Trainer constructor via the `data_parallel_shard_degree` or ' + '`data_parallel_replicate_degree` arguments to `parallelism_config` via ' + 'the "fsdp" key, as ' + '`parallelism_config = {"fsdp": {"data_parallel_shard_degree": 8}}`, ' + 'for example. 
We can provide assistance at https://github.com/mosaicml/composer/issues.', ), ) from e else: From 698f0d4ee9576d60a9aed1f0b8fae795746b09e8 Mon Sep 17 00:00:00 2001 From: Saaketh Date: Fri, 23 Aug 2024 11:45:30 -0400 Subject: [PATCH 4/4] yo --- tests/trainer/test_fsdp.py | 44 -------------------------------------- 1 file changed, 44 deletions(-) diff --git a/tests/trainer/test_fsdp.py b/tests/trainer/test_fsdp.py index 9dab386324..491589c557 100644 --- a/tests/trainer/test_fsdp.py +++ b/tests/trainer/test_fsdp.py @@ -324,32 +324,6 @@ def test_fsdp_automicrobatching_sync_hooks(world_size: int): mock_readd_hooks.assert_called_once() -@pytest.mark.gpu -@world_size(2) -@pytest.mark.filterwarnings('ignore:Instantiating FSDP with custom process groups.*:UserWarning') -@pytest.mark.filterwarnings('ignore:Composer is instantiating custom process groups.*:UserWarning') -@pytest.mark.filterwarnings('ignore:.*process_group and device_mesh are set for FSDP.*.:UserWarning') -def test_fsdp_process_group(world_size: int): - model = SimpleModel() - model.fc1._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues] - model.fc2._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues] - dataset = RandomClassificationDataset(size=10) - dataloader = DataLoader(dataset, sampler=dist.get_sampler(dataset)) - - trainer = Trainer( - model=model, - train_dataloader=dataloader, - parallelism_config={ - 'fsdp': { - 'process_group': 'mod1', # all ranks - }, - }, - max_duration='3ba', - ) - - trainer.fit() - - @pytest.mark.gpu @world_size(2) @pytest.mark.skipif( @@ -577,24 +551,6 @@ def oom_hook(module, grad_input, grad_ouput): assert torch.equal(output_1, output_2) -@pytest.mark.gpu -@world_size(2) -def test_fsdp_device_mesh(world_size: int): - model = SimpleModel() - model.fc1._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues] - model.fc2._fsdp_wrap = True # pyright: ignore[reportGeneralTypeIssues] - - # Expect warning via pytest - with pytest.warns(DeprecationWarning): - 
Trainer( - model=model, - parallelism_config={'fsdp': { - 'device_mesh': [2], - }}, - max_duration='3ba', - ) - - @pytest.mark.gpu @world_size(2) def test_fsdp_shard(world_size: int):