Add Apple Silicon GPU acceleration to DL examples
NripeshN committed Aug 1, 2023
1 parent b03375b commit 164998c
Showing 6 changed files with 91 additions and 37 deletions.
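
All six scripts receive the same edit: the --no_cuda flag is renamed to --no_gpu (run_glue.py uses --no_GPU), and device selection falls back from CUDA to Apple-silicon MPS and finally to the CPU. A minimal, self-contained sketch of that selection pattern follows; the select_device helper name is illustrative and not part of the commit, and the hasattr guard only assumes torch.backends.mps exists from PyTorch 1.12 onwards.

import argparse

import torch


def select_device(no_gpu: bool = False) -> torch.device:
    """Prefer CUDA, then Apple-silicon MPS, otherwise fall back to the CPU."""
    if torch.cuda.is_available() and not no_gpu:
        return torch.device("cuda")
    # torch.backends.mps is only present in PyTorch >= 1.12, hence the hasattr guard
    if hasattr(torch.backends, "mps") and torch.backends.mps.is_available() and not no_gpu:
        return torch.device("mps")
    return torch.device("cpu")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--no_gpu", action="store_true",
                        help="Whether not to use a GPU when available")
    args = parser.parse_args()
    print(f"Using device: {select_device(args.no_gpu)}")
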
17 changes: 12 additions & 5 deletions PyTorch/LanguageModeling/BERT/extract_features.py
@@ -210,17 +210,24 @@ def main():
                        type=int,
                        default=-1,
                        help = "local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda",
+    parser.add_argument("--no_gpu",
                        action='store_true',
-                        help="Whether not to use CUDA when available")
+                        help="Whether not to use GPU when available")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
+        if torch.cuda.is_available() and not args.no_gpu:
+            device = torch.device("cuda" if torch.cuda.is_available() and not args.no_gpu else "cpu")
+            n_gpu = torch.cuda.device_count()
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not args.no_gpu:
+            device = torch.device('mps' if torch.backends.mps.is_available() and not args.no_gpu else 'cpu')  # noqa
+            n_gpu = 1
    else:
-        device = torch.device("cuda", args.local_rank)
+        if torch.cuda.is_available():
+            device = torch.device('cuda')
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            device = torch.device('mps')  # noqa
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
18 changes: 13 additions & 5 deletions PyTorch/LanguageModeling/BERT/inference.py
@@ -450,9 +450,9 @@ def main():
parser.add_argument("--max_answer_length", default=30, type=int,
help="The maximum length of an answer that can be generated. This is needed because the start "
"and end predictions are not conditioned on one another.")
parser.add_argument("--no_cuda",
parser.add_argument("--no_gpu",
action='store_true',
help="Whether not to use CUDA when available")
help="Whether not to use GPU when available")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
@@ -482,10 +482,18 @@ def main():
        torch.cuda.manual_seed(args.seed)

    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        if torch.cuda.is_available() and not args.no_gpu:
+            device = torch.device('cuda')
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not args.no_gpu:
+            device = torch.device('mps')  # noqa
+        else:
+            device = torch.device('cpu')
    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
+        if torch.cuda.is_available():
+            torch.cuda.set_device(args.local_rank)
+            device = torch.device('cuda', args.local_rank)
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            device = torch.device('mps')  # noqa

    tokenizer = BertTokenizer(args.vocab_file, do_lower_case=args.do_lower_case, max_len=512)  # for bert large

23 changes: 16 additions & 7 deletions PyTorch/LanguageModeling/BERT/run_glue.py
@@ -439,9 +439,9 @@ def main():
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
-    parser.add_argument("--no_cuda",
+    parser.add_argument("--no_GPU",
                        action='store_true',
-                        help="Whether not to use CUDA when available")
+                        help="Whether not to use GPU when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
@@ -495,12 +495,21 @@ def main():
"mrpc": 2,
}

if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
if args.local_rank == -1 or args.no_GPU:
if torch.cuda.is_available() and not args.no_GPU:
device = torch.device('cuda')
n_gpu = torch.cuda.device_count()
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not args.no_GPU: #noqa
device = torch.device('mps')
n_gpu = 1
else:
device = torch.device('cpu')
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
if torch.cuda.is_available():
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(): #noqa
device = torch.device('mps')
n_gpu = 1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
21 changes: 15 additions & 6 deletions PyTorch/LanguageModeling/BERT/run_pretraining_inference.py
@@ -140,10 +140,10 @@ def main():
                        default=-1,
                        type=int,
                        help="Total number of eval steps to perform, otherwise use full dataset")
-    parser.add_argument("--no_cuda",
+    parser.add_argument("--no_gpu",
                        default=False,
                        action='store_true',
-                        help="Whether not to use CUDA when available")
+                        help="Whether not to use GPU when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
@@ -166,12 +166,21 @@ def main():
    if 'LOCAL_RANK' in os.environ:
        args.local_rank = int(os.environ['LOCAL_RANK'])

-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    if args.local_rank == -1 or args.no_gpu:
+        if torch.cuda.is_available() and not args.no_gpu:
+            device = torch.device('cuda')
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not args.no_gpu:
+            device = torch.device('mps')
+        else:
+            device = torch.device('cpu')
+
    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
+        if torch.cuda.is_available():
+            torch.cuda.set_device(args.local_rank)
+            device = torch.device("cuda", args.local_rank)
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            device = torch.device('mps')

    # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl', init_method='env://')

24 changes: 17 additions & 7 deletions PyTorch/LanguageModeling/BERT/run_squad.py
@@ -842,9 +842,9 @@ def main():
parser.add_argument("--verbose_logging", action='store_true',
help="If true, all of the warnings related to data processing will be printed. "
"A number of warnings are expected for a normal SQuAD evaluation.")
parser.add_argument("--no_cuda",
parser.add_argument("--no_gpu",
action='store_true',
help="Whether not to use CUDA when available")
help="Whether not to use GPU when available")
parser.add_argument('--seed',
type=int,
default=42,
@@ -907,12 +907,22 @@ def main():
    if args.use_env and 'LOCAL_RANK' in os.environ:
        args.local_rank = int(os.environ['LOCAL_RANK'])

-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
+    if args.local_rank == -1 or args.no_gpu:
+        if torch.cuda.is_available() and not args.no_gpu:
+            device = torch.device('cuda')
+            n_gpu = torch.cuda.device_count()
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not args.no_gpu:
+            device = torch.device('mps')
+            n_gpu = 1
+        else:
+            device = torch.device('cpu')
+
    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
+        if torch.cuda.is_available():
+            torch.cuda.set_device(args.local_rank)
+            device = torch.device("cuda", args.local_rank)
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            device = torch.device('mps')
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        n_gpu = 1
25 changes: 18 additions & 7 deletions PyTorch/LanguageModeling/BERT/run_swag.py
@@ -303,9 +303,9 @@ def main():
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
-    parser.add_argument("--no_cuda",
+    parser.add_argument("--no_gpu",
                        action='store_true',
-                        help="Whether not to use CUDA when available")
+                        help="Whether not to use GPU when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
@@ -329,12 +329,23 @@ def main():

    args = parser.parse_args()

-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
+    if args.local_rank == -1 or args.no_gpu:
+        if torch.cuda.is_available() and not args.no_gpu:
+            device = torch.device('cuda')
+            n_gpu = torch.cuda.device_count()
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not args.no_gpu:
+            device = torch.device('mps')
+            n_gpu = 1
+        else:
+            device = torch.device('cpu')
+
    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
+        if torch.cuda.is_available():
+            torch.cuda.set_device(args.local_rank)
+            device = torch.device("cuda", args.local_rank)
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            # MPS exposes a single device, so no per-rank device selection is needed here
+            device = torch.device("mps")
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
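
One caveat on the distributed branches above: every local_rank != -1 path still calls torch.distributed.init_process_group(backend='nccl'), and the NCCL backend requires NVIDIA GPUs, so those paths cannot initialize on Apple silicon even once the device is set to MPS. The sketch below shows one possible fallback to the Gloo backend; it is an assumption rather than part of this commit, and it presumes the script is launched with torchrun so the env:// init method can read the rank and world-size environment variables.

import torch
import torch.distributed as dist

# Assumed fallback, not part of the commit: pick a backend the current machine
# can actually run. NCCL needs CUDA devices; Gloo works on CPU-only and MPS hosts.
backend = "nccl" if torch.cuda.is_available() else "gloo"
dist.init_process_group(backend=backend, init_method="env://")
print(f"rank {dist.get_rank()} / world size {dist.get_world_size()} using {backend}")
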
