From f8ba548876450023f0f4584f544e080fdc0d3394 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Wed, 23 Aug 2023 15:54:33 -0400
Subject: [PATCH 1/5] Enable TP greedy env var

---
 src/accelerate/accelerator.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 89f8129e63f..333c142cd61 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1193,10 +1193,12 @@ def prepare(self, *args, device_placement=None):
             )
 
         for obj in args:
+            # TODO: Look at enabling native TP training directly with a proper config
             if (
                 isinstance(obj, torch.nn.Module)
                 and self.verify_device_map(obj)
                 and self.distributed_type != DistributedType.NO
+                and not os.environ.get("ACCELERATE_BYPASS_AUTO", "false")
             ):
                 raise ValueError(
                     "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1328,7 +1330,12 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             device_placement = self.device_placement and self.distributed_type != DistributedType.FSDP
         self._models.append(model)
 
-        if self.verify_device_map(model) and self.distributed_type != DistributedType.NO:
+        # TODO: Look at enabling native TP training directly with a proper config
+        if (
+            self.verify_device_map(model)
+            and self.distributed_type != DistributedType.NO
+            and not os.environ.get("ACCELERATE_BYPASS_AUTO", "false")
+        ):
             raise ValueError(
                 "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
                 " Please rerun your script specifying `--num_processes=1` or by launching with `python {{myscript.py}}`."
@@ -1401,8 +1408,14 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
         ):
             if any(p.requires_grad for p in model.parameters()):
                 kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
+                # TODO: Look at enabling native TP training directly with a proper config
+                if not os.environ.get("ACCELERATE_BYPASS_AUTO", "false"):
+                    device_ids, output_device = [self.local_process_index], self.local_process_index
+                else:
+                    device_ids, output_device = None, None
+
                 model = torch.nn.parallel.DistributedDataParallel(
-                    model, device_ids=[self.local_process_index], output_device=self.local_process_index, **kwargs
+                    model, device_ids=device_ids, output_device=output_device, **kwargs
                 )
         elif self.distributed_type == DistributedType.FSDP:
             from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP

From d08a2dae27608b4e74e99cbc0d7f5c39fd47c960 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Wed, 23 Aug 2023 15:56:31 -0400
Subject: [PATCH 2/5] Right env setting

---
 src/accelerate/accelerator.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 333c142cd61..60102db0d23 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1198,7 +1198,7 @@ def prepare(self, *args, device_placement=None):
                 isinstance(obj, torch.nn.Module)
                 and self.verify_device_map(obj)
                 and self.distributed_type != DistributedType.NO
-                and not os.environ.get("ACCELERATE_BYPASS_AUTO", "false")
+                and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "false"
             ):
                 raise ValueError(
                     "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1334,7 +1334,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
         if (
             self.verify_device_map(model)
             and self.distributed_type != DistributedType.NO
-            and not os.environ.get("ACCELERATE_BYPASS_AUTO", "false")
+            and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "false"
         ):
             raise ValueError(
                 "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1409,10 +1409,10 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             if any(p.requires_grad for p in model.parameters()):
                 kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
                 # TODO: Look at enabling native TP training directly with a proper config
-                if not os.environ.get("ACCELERATE_BYPASS_AUTO", "false"):
-                    device_ids, output_device = [self.local_process_index], self.local_process_index
-                else:
+                if os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "true":
                     device_ids, output_device = None, None
+                else:
+                    device_ids, output_device = [self.local_process_index], self.local_process_index
 
                 model = torch.nn.parallel.DistributedDataParallel(
                     model, device_ids=device_ids, output_device=output_device, **kwargs

From d2bd6a410a5aa3aab4fbb4a4f9051fbf6cae2f6d Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Wed, 23 Aug 2023 15:59:37 -0400
Subject: [PATCH 3/5] Use true, not false

---
 src/accelerate/accelerator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 60102db0d23..aa110b6a535 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1198,7 +1198,7 @@ def prepare(self, *args, device_placement=None):
                 isinstance(obj, torch.nn.Module)
                 and self.verify_device_map(obj)
                 and self.distributed_type != DistributedType.NO
-                and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "false"
+                and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true"
             ):
                 raise ValueError(
                     "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1334,7 +1334,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
         if (
             self.verify_device_map(model)
             and self.distributed_type != DistributedType.NO
-            and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "false"
+            and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true"
         ):
             raise ValueError(
                 "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
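
A minimal sketch (not part of the patches) of what PATCH 2/5 and PATCH 3/5 correct: in Python any non-empty string is truthy, so the original check `not os.environ.get("ACCELERATE_BYPASS_AUTO", "false")` is always False and the device_map guard from PATCH 1/5 would never raise. Comparing against the literal strings "false" and then "true" restores the intended behavior; the snippet below only demonstrates the three checks side by side.

import os

# Environment variable unset: os.environ.get falls back to the default string "false".
os.environ.pop("ACCELERATE_BYPASS_AUTO", None)
value = os.environ.get("ACCELERATE_BYPASS_AUTO", "false")

print(not value)         # False -- PATCH 1/5 check: "false" is a truthy string, so the guard never fires
print(value == "false")  # True  -- PATCH 2/5 check: guard fires unless the var is set to something else
print(value != "true")   # True  -- PATCH 3/5 check: guard fires unless the var is exactly "true"

# With the bypass explicitly requested, the final check lets the model through.
os.environ["ACCELERATE_BYPASS_AUTO"] = "true"
print(os.environ["ACCELERATE_BYPASS_AUTO"] != "true")  # False -- no ValueError is raised
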
From 3c560bccf873a024c8c2932ea7c5ae78f9d45b8c Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Wed, 23 Aug 2023 16:00:30 -0400
Subject: [PATCH 4/5] Design nit

---
 src/accelerate/accelerator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index aa110b6a535..0cfe5ee5164 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1409,10 +1409,10 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             if any(p.requires_grad for p in model.parameters()):
                 kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
                 # TODO: Look at enabling native TP training directly with a proper config
-                if os.environ.get("ACCELERATE_BYPASS_AUTO", "false") == "true":
-                    device_ids, output_device = None, None
-                else:
+                if os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true":
                     device_ids, output_device = [self.local_process_index], self.local_process_index
+                else:
+                    device_ids, output_device = None, None
 
                 model = torch.nn.parallel.DistributedDataParallel(
                     model, device_ids=device_ids, output_device=output_device, **kwargs

From 9b8cc1ed704de9bc7f672df1cdf59c504c6a06a4 Mon Sep 17 00:00:00 2001
From: Zach Mueller
Date: Thu, 24 Aug 2023 09:44:17 -0400
Subject: [PATCH 5/5] ACCELERATE_BYPASS_DEVICE_MAP

---
 src/accelerate/accelerator.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/accelerate/accelerator.py b/src/accelerate/accelerator.py
index 0cfe5ee5164..7fc5bf0173e 100755
--- a/src/accelerate/accelerator.py
+++ b/src/accelerate/accelerator.py
@@ -1198,7 +1198,7 @@ def prepare(self, *args, device_placement=None):
                 isinstance(obj, torch.nn.Module)
                 and self.verify_device_map(obj)
                 and self.distributed_type != DistributedType.NO
-                and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true"
+                and os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true"
             ):
                 raise ValueError(
                     "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1334,7 +1334,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
         if (
             self.verify_device_map(model)
             and self.distributed_type != DistributedType.NO
-            and os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true"
+            and os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true"
         ):
             raise ValueError(
                 "You can't train a model that has been loaded with `device_map='auto'` in any distributed mode."
@@ -1409,7 +1409,7 @@ def prepare_model(self, model: torch.nn.Module, device_placement: bool = None, e
             if any(p.requires_grad for p in model.parameters()):
                 kwargs = self.ddp_handler.to_kwargs() if self.ddp_handler is not None else {}
                 # TODO: Look at enabling native TP training directly with a proper config
-                if os.environ.get("ACCELERATE_BYPASS_AUTO", "false") != "true":
+                if os.environ.get("ACCELERATE_BYPASS_DEVICE_MAP", "false") != "true":
                     device_ids, output_device = [self.local_process_index], self.local_process_index
                 else:
                     device_ids, output_device = None, None
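
For context, a hedged usage sketch (not from the patch series itself) of how the variable introduced in PATCH 5/5 might be exercised: with ACCELERATE_BYPASS_DEVICE_MAP set to "true", a model loaded with device_map="auto" passes through prepare()/prepare_model() without the ValueError, and under DDP the wrapper receives device_ids=None and output_device=None as in PATCH 4/5's branch. The checkpoint name and the single prepare() call are hypothetical placeholders.

import os

# Must be the literal string "true"; anything else keeps the device_map guard active.
os.environ["ACCELERATE_BYPASS_DEVICE_MAP"] = "true"

from accelerate import Accelerator
from transformers import AutoModelForCausalLM

accelerator = Accelerator()

# Hypothetical checkpoint: loading with device_map="auto" would normally make
# Accelerator.prepare() raise in any distributed mode.
model = AutoModelForCausalLM.from_pretrained("some/checkpoint", device_map="auto")

# With the bypass set, prepare() no longer raises; if the model is wrapped in DDP,
# it is constructed with device_ids=None and output_device=None.
model = accelerator.prepare(model)
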