run_distributed.py
import argparse
import torch
from args import get_training_args
from run_training import train


def run_distributed(local_rank: int, args: argparse.Namespace):
    """Run one worker process in the distributed setting.

    Args:
        local_rank (int): Rank of this process on the local node (one per GPU).
        args (argparse.Namespace): Parsed command-line arguments.
    """
    # Map (node_rank, local_rank) to a unique global rank across all nodes.
    global_rank = args.node_rank * args.num_gpus_node + local_rank
    world_size = args.num_gpus_node * args.num_nodes

    # Bind this process to its own GPU before initializing NCCL.
    torch.cuda.set_device(local_rank)
    if not torch.distributed.is_available():
        raise RuntimeError("torch.distributed is not available in this PyTorch build.")

    # All workers rendezvous at the same TCP address and port.
    torch.distributed.init_process_group(
        "nccl",
        init_method=f"tcp://{args.hostname}:{args.port_id}",
        rank=global_rank,
        world_size=world_size,
    )
    if local_rank == 0:
        print("Starting distributed training...")
    if not torch.distributed.is_initialized():
        raise RuntimeError("Failed to initialize the distributed process group.")

    distributed = True
    train(args, distributed)
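
# Illustrative example of the rank mapping above (the values are hypothetical,
# not taken from the script's defaults): with num_nodes=2 and num_gpus_node=4,
# the worker with node_rank=1 and local_rank=2 gets
# global_rank = 1 * 4 + 2 = 6, and world_size = 4 * 2 = 8 processes join the
# process group in total.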


if __name__ == "__main__":
    # Extend the base training arguments with the distributed-launch options.
    parser = get_training_args()
    parser.add_argument(
        "--num_nodes",
        default=1,
        type=int,
        help="total number of nodes",
    )
    parser.add_argument(
        "--num_gpus_node",
        default=1,
        type=int,
        help="number of GPUs per node",
    )
    parser.add_argument(
        "--node_rank",
        default=0,
        type=int,
        help="rank (index) of this node among all nodes",
    )
    parser.add_argument(
        "--hostname",
        default="example-machine",
        type=str,
        help="hostname for the TCP rendezvous (typically the rank-0 node)",
    )
    parser.add_argument(
        "--port_id",
        default=23400,
        type=int,
        help="port used for distributed training",
    )
    args = parser.parse_args()

    # Spawn one worker process per local GPU; each receives its local rank as
    # the first argument, followed by the parsed args.
    torch.multiprocessing.spawn(
        run_distributed,
        nprocs=args.num_gpus_node,
        args=(args,),
    )
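
# A typical multi-node launch might look like the following (a sketch only: the
# host name "node0" and the GPU counts are placeholders, and get_training_args
# may add further required flags not shown here). Every node passes the same
# --hostname and --port_id so all workers join the same process group, and
# differs only in --node_rank:
#
#   on node 0: python run_distributed.py --num_nodes 2 --num_gpus_node 4 \
#              --node_rank 0 --hostname node0 --port_id 23400
#   on node 1: python run_distributed.py --num_nodes 2 --num_gpus_node 4 \
#              --node_rank 1 --hostname node0 --port_id 23400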