From f8ba548876450023f0f4584f544e080fdc0d3394 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Wed, 23 Aug 2023 15:54:33 -0400
Subject: [PATCH 1/5] Enable TP greedy env var

---
 src/accelerate/accelerator.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 89f8129e63f..333c142cd61 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1193,10 +1193,12 @@ def prepare(self, *args, device_placement=None):
             )
 
         for obj in args:
+            # TODO: Look at enabling native TP training directly with a proper config
             if (
                 isinstance(obj, torch.nn.Module)
                 and self.verify_device_map(obj)
                 and self.distributed_type != DistributedType.NO
+                and not os.environ.get("ACCELERATE_BYPASS_AUTO", "false")
             ):
                 raise ValueError(
                     "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1328,7 +1330,12 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             device_placement = self.device_placement and self.distributed_type != DistributedType.FSDP
         self._models.append(model)
 
-        if self.verify_device_map(model) and self.distributed_type != DistributedType.NO:
+        # TODO: Look at enabling native TP training directly with a proper config
+        if (
+            self.verify_device_map(model)
+            and self.distributed_type != DistributedType.NO
+            and not os.environ.get("ACCELERATE_BYPASS_AUTO", "false")
+        ):
             raise ValueError(
                 "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
                 " Please rerun your script specifying `--num_processes=1` or by launching with `python {{myscript.py}}`."
@@ -1401,8 +1408,14 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
         ):
             if any(p.requires_grad for p in model.parameters()):
                 kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
+                # TODO: Look at enabling native TP training directly with a proper config
+                if not os.environ.get("ACCELERATE_BYPASS_AUTO", "false"):
+                    device_ids, output_device = [self.local_process_index], self.local_process_index
+                else:
+                    device_ids, output_device = None, None
+
                 model = torch.nn.parallel.DistributedDataParallel(
-                    model, device_ids=[self.local_process_index], output_device=self.local_process_index, **kwargs
+                    model, device_ids=device_ids, output_device=output_device, **kwargs
                 )
         elif self.distributed_type == DistributedType.FSDP:
             from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP

From d08a2dae27608b4e74e99cbc0d7f5c39fd47c960 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Wed, 23 Aug 2023 15:56:31 -0400
Subject: [PATCH 2/5] Right env setting

---
 src/accelerate/accelerator.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 333c142cd61..60102db0d23 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1198,7 +1198,7 @@ def prepare(self, *args, device_placement=None):
                 isinstance(obj, torch.nn.Module)
                 and self.verify_device_map(obj)
                 and self.distributed_type != DistributedType.NO
-                and not os.environ.get("ACCELERATE_BYPASS_AUTO", "false")
+                and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "false"
             ):
                 raise ValueError(
                     "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1334,7 +1334,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
         if (
             self.verify_device_map(model)
             and self.distributed_type != DistributedType.NO
-            and not os.environ.get("ACCELERATE_BYPASS_AUTO", "false")
+            and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "false"
         ):
             raise ValueError(
                 "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1409,10 +1409,10 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             if any(p.requires_grad for p in model.parameters()):
                 kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
                 # TODO: Look at enabling native TP training directly with a proper config
-                if not os.environ.get("ACCELERATE_BYPASS_AUTO", "false"):
-                    device_ids, output_device = [self.local_process_index], self.local_process_index
-                else:
+                if os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "true":
                     device_ids, output_device = None, None
+                else:
+                    device_ids, output_device = [self.local_process_index], self.local_process_index
 
                 model = torch.nn.parallel.DistributedDataParallel(
                     model, device_ids=device_ids, output_device=output_device, **kwargs

From d2bd6a410a5aa3aab4fbb4a4f9051fbf6cae2f6d Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Wed, 23 Aug 2023 15:59:37 -0400
Subject: [PATCH 3/5] Use true, not false

---
 src/accelerate/accelerator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 60102db0d23..aa110b6a535 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1198,7 +1198,7 @@ def prepare(self, *args, device_placement=None):
                 isinstance(obj, torch.nn.Module)
                 and self.verify_device_map(obj)
                 and self.distributed_type != DistributedType.NO
-                and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "false"
+                and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true"
             ):
                 raise ValueError(
                     "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1334,7 +1334,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
         if (
             self.verify_device_map(model)
             and self.distributed_type != DistributedType.NO
-            and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "false"
+            and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true"
         ):
             raise ValueError(
                 "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
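
A minimal sketch (not part of the patches) of what PATCH 2/5 and PATCH 3/5 correct: in Python any non-empty string is truthy, so the original check `not os.environ.get("ACCELERATE_BYPASS_AUTO", "false")` is always False and the device_map guard from PATCH 1/5 would never raise. Comparing against the literal strings "false" and then "true" restores the intended behavior; the snippet below only demonstrates the three checks side by side.

import os

# Environment variable unset: os.environ.get falls back to the default string "false".
os.environ.pop("ACCELERATE_BYPASS_AUTO", None)
value = os.environ.get("ACCELERATE_BYPASS_AUTO", "false")

print(not value)         # False -- PATCH 1/5 check: "false" is a truthy string, so the guard never fires
print(value == "false")  # True  -- PATCH 2/5 check: guard fires unless the var is set to something else
print(value != "true")   # True  -- PATCH 3/5 check: guard fires unless the var is exactly "true"

# With the bypass explicitly requested, the final check lets the model through.
os.environ["ACCELERATE_BYPASS_AUTO"] = "true"
print(os.environ["ACCELERATE_BYPASS_AUTO"] != "true")  # False -- no ValueError is raised
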
From 3c560bccf873a024c8c2932ea7c5ae78f9d45b8c Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Wed, 23 Aug 2023 16:00:30 -0400
Subject: [PATCH 4/5] Design nit

---
 src/accelerate/accelerator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index aa110b6a535..0cfe5ee5164 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1409,10 +1409,10 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             if any(p.requires_grad for p in model.parameters()):
                 kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
                 # TODO: Look at enabling native TP training directly with a proper config
-                if os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "true":
-                    device_ids, output_device = None, None
-                else:
+                if os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true":
                     device_ids, output_device = [self.local_process_index], self.local_process_index
+                else:
+                    device_ids, output_device = None, None
 
                 model = torch.nn.parallel.DistributedDataParallel(
                     model, device_ids=device_ids, output_device=output_device, **kwargs

From 9b8cc1ed704de9bc7f672df1cdf59c504c6a06a4 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 24 Aug 2023 09:44:17 -0400
Subject: [PATCH 5/5] ACCELERATE_BYPASS_DEVICE_MAP

---
 src/accelerate/accelerator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 0cfe5ee5164..7fc5bf0173e 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1198,7 +1198,7 @@ def prepare(self, *args, device_placement=None):
                 isinstance(obj, torch.nn.Module)
                 and self.verify_device_map(obj)
                 and self.distributed_type != DistributedType.NO
-                and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true"
+                and os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true"
             ):
                 raise ValueError(
                     "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1334,7 +1334,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
         if (
             self.verify_device_map(model)
             and self.distributed_type != DistributedType.NO
-            and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true"
+            and os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true"
         ):
             raise ValueError(
                 "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1409,7 +1409,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             if any(p.requires_grad for p in model.parameters()):
                 kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
                 # TODO: Look at enabling native TP training directly with a proper config
-                if os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true":
+                if os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true":
                     device_ids, output_device = [self.local_process_index], self.local_process_index
                 else:
                     device_ids, output_device = None, None
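
For context, a hedged usage sketch (not from the patch series itself) of how the variable introduced in PATCH 5/5 might be exercised: with ACCELERATE_BYPASS_DEVICE_MAP set to "true", a model loaded with device_map="auto" passes through prepare()/prepare_model() without the ValueError, and under DDP the wrapper receives device_ids=None and output_device=None as in PATCH 4/5's branch. The checkpoint name and the single prepare() call are hypothetical placeholders.

import os

# Must be the literal string "true"; anything else keeps the device_map guard active.
os.environ["ACCELERATE_BYPASS_DEVICE_MAP"] = "true"

from accelerate import Accelerator
from transformers import AutoModelForCausalLM

accelerator = Accelerator()

# Hypothetical checkpoint: loading with device_map="auto" would normally make
# Accelerator.prepare() raise in any distributed mode.
model = AutoModelForCausalLM.from_pretrained("some/checkpoint", device_map="auto")

# With the bypass set, prepare() no longer raises; if the model is wrapped in DDP,
# it is constructed with device_ids=None and output_device=None.
model = accelerator.prepare(model)
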