Upgrade to Synapse AI Release 1.12.1 (#106)
* Upgrade to Synapse AI Release 1.12.1

Update images and modify hooks to suit lightning 2.1

Signed-off-by: Jerome <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <[email protected]>
3 people authored Oct 25, 2023
1 parent 73149db commit 5b00a32
Showing 8 changed files with 54 additions and 22 deletions.
8 changes: 4 additions & 4 deletions .azure/hpu-tests.yml
@@ -30,15 +30,15 @@ jobs:
     strategy:
       matrix:
         'w. pytorch-lightning | pypi':
-          image: "1.12.0/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
+          image: "1.12.1/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
           dependency: "pytorch-lightning"
           pkg_source: "pypi"
         'w. lightning | pypi':
-          image: "1.12.0/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
+          image: "1.12.1/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
           dependency: "lightning"
           pkg_source: "pypi"
         'w. lightning | source':
-          image: "1.12.0/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
+          image: "1.12.1/ubuntu22.04/habanalabs/pytorch-installer-2.0.1:latest"
           dependency: "lightning"
           pkg_source: "source"
     pool: "intel-hpus"
@@ -52,7 +52,7 @@ jobs:
       --shm-size=4g \
       -v /usr/bin/docker:/tmp/docker:ro"
     variables:
-      DEEPSPEED_VERSION: "1.12.0"
+      DEEPSPEED_VERSION: "1.12.1"
     workspace:
       clean: all

24 changes: 24 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,29 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 ### Added
 
+-
+
+### Changed
+
+-
+
+### Fixed
+
+-
+
+### Removed
+
+-
+
+### Deprecated
+
+-
+
+
+## [1.2.0] - 2023-10-26
+
+### Added
+
 - Added tests, examples and documentation for HPUPrecisionPlugin with autocast ([#94](https://github.com/Lightning-AI/lightning-Habana/pull/94))
 - Added test to validate checkpoint resuming with HPUDeepSpeedStrategy ([#95](https://github.com/Lightning-AI/lightning-Habana/pull/95))
 - Added support for lightning 2.1 ([#100](https://github.com/Lightning-AI/lightning-Habana/pull/100), [#105](https://github.com/Lightning-AI/lightning-Habana/pull/105))
@@ -17,6 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

 - Changed HPU docker image based on synapse AI release 1.12.0 ([#90](https://github.com/Lightning-AI/lightning-Habana/pull/90))
 - Use standard API's and Remove env variable to get HPU distributed backend ([#91](https://github.com/Lightning-AI/lightning-Habana/pull/91))
+- Changed HPU docker image based on synapse AI release 1.12.1, updated hooks ([#106](https://github.com/Lightning-AI/lightning-Habana/pull/106))
 
 
 ### Fixed
4 changes: 2 additions & 2 deletions README.md
@@ -63,11 +63,11 @@ The `devices>1` parameter with HPUs enables the Habana accelerator for distributed training

 # Support Matrix
 
-| **SynapseAI**         | **1.12.0**                                         |
+| **SynapseAI**         | **1.12.1**                                         |
 | --------------------- | -------------------------------------------------- |
 | PyTorch               | 2.0.1                                              |
 | (PyTorch) Lightning\* | 2.1.x                                              |
-| **Lightning Habana**  | **1.1.0**                                          |
+| **Lightning Habana**  | **1.2.0**                                          |
 | DeepSpeed\*\*         | Forked from v0.9.4 of the official DeepSpeed repo. |
 
 \* covers both packages [`lightning`](https://pypi.org/project/lightning/) and [`pytorch-lightning`](https://pypi.org/project/pytorch-lightning/)
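A quick sanity check against this matrix is to print the installed versions; a minimal sketch (not part of this commit), assuming `lightning` and `lightning-habana` are already installed:

```python
# Hypothetical environment check against the support matrix above.
import lightning
import lightning_habana

print(lightning_habana.__version__)  # expect 1.2.0 per the matrix
print(lightning.__version__)         # expect 2.1.x
```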
2 changes: 1 addition & 1 deletion src/lightning_habana/__about__.py
@@ -1,4 +1,4 @@
-__version__ = "1.1.1.dev"
+__version__ = "1.2.0"
 __author__ = "Lightning-AI et al."
 __author_email__ = "[email protected]"
 __license__ = "Apache-2.0"
4 changes: 4 additions & 0 deletions src/lightning_habana/pytorch/accelerator.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
 from typing import Any, Dict, List, Optional, Union
 
 import torch
@@ -54,6 +55,9 @@ def get_device_stats(self, device: _DEVICE) -> Dict[str, Any]:
         return get_device_stats(device)
 
     def teardown(self) -> None:
+        os.environ.pop("HABANA_PROFILE", None)
+        os.environ.pop("HLS_MODULE_ID", None)
+        os.environ.pop("ID", None)
         pass
 
     @staticmethod
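The three `os.environ.pop` calls added to `teardown` clear Habana-related variables so one run's settings do not leak into a later run in the same process. A standalone sketch of the pattern (illustrative, not repo code):

```python
import os

# pop with a None default is a safe delete: it removes the variable when
# present and is a silent no-op when it is not, so no KeyError is raised.
os.environ["HABANA_PROFILE"] = "1"       # pretend a previous run set this
os.environ.pop("HABANA_PROFILE", None)   # removed
os.environ.pop("HLS_MODULE_ID", None)    # never set: nothing happens
assert "HABANA_PROFILE" not in os.environ
```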
6 changes: 3 additions & 3 deletions src/lightning_habana/pytorch/strategies/deepspeed.py
@@ -82,8 +82,8 @@
 warning_cache = WarningCache()
 
 _HPU_DEEPSPEED_AVAILABLE = (
-    # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/[email protected]
-    RequirementCache("deepspeed==0.9.4+hpu.synapse.v1.12.0")
+    # HPU deep speed is supported only through this pip install git+https://github.com/HabanaAI/[email protected]
+    RequirementCache("deepspeed==0.9.4+hpu.synapse.v1.12.1")
 )
 if TYPE_CHECKING and _HPU_DEEPSPEED_AVAILABLE:
     import deepspeed
@@ -295,7 +295,7 @@ def __init__(
         if not _HPU_DEEPSPEED_AVAILABLE:
             raise MisconfigurationException(
                 "To use the `HPUDeepSpeedStrategy`, you must have hpu DeepSpeed installed."
-                " Install it by running `pip install git+https://github.com/HabanaAI/[email protected]`."
+                " Install it by running `pip install git+https://github.com/HabanaAI/[email protected]`."
             )
 
         super().__init__(
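Both changed strings pin the same fork tag, so the availability guard and the install hint in the error message stay in sync. A minimal sketch of the guard pattern, assuming `RequirementCache` comes from `lightning-utilities` as imported above:

```python
from lightning_utilities.core.imports import RequirementCache

# Truthy only when the installed `deepspeed` distribution reports exactly
# this version, i.e. the Habana fork rather than upstream DeepSpeed.
_HPU_DEEPSPEED_AVAILABLE = RequirementCache("deepspeed==0.9.4+hpu.synapse.v1.12.1")

if not _HPU_DEEPSPEED_AVAILABLE:
    print(
        "HPU DeepSpeed missing; install with:\n"
        "  pip install git+https://github.com/HabanaAI/[email protected]"
    )
```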
14 changes: 8 additions & 6 deletions src/lightning_habana/pytorch/strategies/parallel.py
@@ -27,6 +27,7 @@
     from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
     from lightning.pytorch.plugins.precision import PrecisionPlugin
     from lightning.pytorch.strategies.ddp import DDPStrategy
+    from lightning.pytorch.utilities.types import STEP_OUTPUT
 elif module_available("pytorch_lightning"):
     from lightning_fabric.plugins import CheckpointIO, ClusterEnvironment
     from lightning_fabric.utilities.distributed import group as _group
@@ -36,6 +37,7 @@
     from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO
     from pytorch_lightning.plugins.precision import PrecisionPlugin
     from pytorch_lightning.strategies.ddp import DDPStrategy
+    from pytorch_lightning.utilities.types import STEP_OUTPUT
 else:
     raise ModuleNotFoundError("You are missing `lightning` or `pytorch-lightning` package, please install it.")
 from torch import Tensor
@@ -138,20 +140,20 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
-    def validation_step(self, batch: Any, batch_idx: int) -> Any:
+    def validation_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().validation_step(batch, batch_idx)
+        return super().validation_step(*args, **kwargs)
 
-    def test_step(self, batch: Any, batch_idx: int) -> Any:
+    def test_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().test_step(batch, batch_idx)
+        return super().test_step(*args, **kwargs)
 
-    def predict_step(self, batch: Any, batch_idx: int) -> Any:
+    def predict_step(self, *args: Any, **kwargs: Any) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().predict_step(batch, batch_idx)
+        return super().predict_step(*args, **kwargs)
 
     def reduce(
         self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"
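The hook changes above are the lightning 2.1 adaptation mentioned in the commit message: accepting `*args`/`**kwargs` keeps the overrides valid no matter how the newer Trainer spells the step arguments, while `htcore.mark_step()` still runs first to flush the lazily accumulated HPU graph. A runnable sketch of the forwarding pattern with stand-ins (`_Base` and `_mark_step` are illustrative, not repo code):

```python
from typing import Any


def _mark_step() -> None:
    """Stand-in for htcore.mark_step(); a no-op so the sketch runs anywhere."""


class _Base:
    def validation_step(self, *args: Any, **kwargs: Any) -> Any:
        return {"loss": 0.0}


class _HPUStrategy(_Base):
    def validation_step(self, *args: Any, **kwargs: Any) -> Any:
        _mark_step()  # break lazy graph accumulation before delegating
        return super().validation_step(*args, **kwargs)


strategy = _HPUStrategy()
# Both call styles forward cleanly; a hard-coded (batch, batch_idx)
# signature would break as soon as the caller adds or renames an argument.
strategy.validation_step("batch", 0)
strategy.validation_step(batch="batch", batch_idx=0, dataloader_idx=1)
```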
14 changes: 8 additions & 6 deletions src/lightning_habana/pytorch/strategies/single.py
@@ -24,6 +24,7 @@
     from lightning.pytorch.plugins.io.wrapper import _WrappingCheckpointIO
     from lightning.pytorch.plugins.precision import PrecisionPlugin
     from lightning.pytorch.strategies.single_device import SingleDeviceStrategy
+    from lightning.pytorch.utilities.types import STEP_OUTPUT
 elif module_available("pytorch_lightning"):
     from lightning_fabric.plugins import CheckpointIO
     from lightning_fabric.utilities.types import _DEVICE
@@ -32,6 +33,7 @@
     from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO
     from pytorch_lightning.plugins.precision import PrecisionPlugin
     from pytorch_lightning.strategies.single_device import SingleDeviceStrategy
+    from pytorch_lightning.utilities.types import STEP_OUTPUT
 else:
     raise ModuleNotFoundError("You are missing `lightning` or `pytorch-lightning` package, please install it.")
 
@@ -107,20 +109,20 @@ def optimizer_step(
         htcore.mark_step()
         return optimizer_output
 
-    def validation_step(self, batch: Any, batch_idx: int) -> Any:
+    def validation_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().validation_step(batch, batch_idx)
+        return super().validation_step(*args, **kwargs)
 
-    def test_step(self, batch: Any, batch_idx: int) -> Any:
+    def test_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().test_step(batch, batch_idx)
+        return super().test_step(*args, **kwargs)
 
-    def predict_step(self, batch: Any, batch_idx: int) -> Any:
+    def predict_step(self, *args: Any, **kwargs: Any) -> Any:
         # Break lazy accumulation of graph after every step
         htcore.mark_step()
-        return super().predict_step(batch, batch_idx)
+        return super().predict_step(*args, **kwargs)
 
     @classmethod
     def register_strategies(cls, strategy_registry: Dict) -> None:
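With both strategies updated, selecting them from user code is unchanged. A hedged usage sketch, assuming a Gaudi machine with SynapseAI 1.12.1, lightning 2.1, and lightning-habana 1.2.0, and that the class names below match the package's public exports:

```python
from lightning import Trainer
from lightning_habana import HPUAccelerator, SingleHPUStrategy

# Single HPU; HPUParallelStrategy with devices=8 would target a full node.
trainer = Trainer(
    accelerator=HPUAccelerator(),
    devices=1,
    strategy=SingleHPUStrategy(),
)
```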
