I run the following commands:
machine1: bash gpt3.sh 0
machine2: bash gpt3.sh 1
but get the error:
WARNING:torch.distributed.run:
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
Traceback (most recent call last):
File "/opt/conda/bin/torchrun", line 33, in
sys.exit(load_entry_point('torch==2.0.0+mc2.23.0.23', 'console_scripts', 'torchrun')())
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 346, in wrapper
return f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 241, in launch_agent
result = agent.run()
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 723, in run
result = self._invoke_run(role)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 858, in _invoke_run
self._initialize_workers(self._worker_group)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 692, in _initialize_workers
self._rendezvous(worker_group)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 549, in _rendezvous
workers = self._assign_worker_ranks(store, group_rank, group_world_size, spec)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 624, in _assign_worker_ranks
role_infos = self._share_and_gather(store, group_rank, group_world_size, spec)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 661, in _share_and_gather
role_infos_bytes = store_util.synchronize(
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize
agent_data = get_all(store, rank, key_prefix, world_size)
File "/opt/conda/lib/python3.8/site-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all
data = store.get(f"{prefix}{idx}")
RuntimeError: Socket Timeout
Have you met the same problem? Thanks!
I use two machines to run Megatron GPT. The .sh parameters are as follows:
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
NCCL_DEBUG=WARN
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=192.168.2.111
MASTER_PORT=45678
NUM_NODES=2
NODE_RANK=$1
WORLD_SIZE=16
BASE_DIR="/data/train_nfs/offload_megatron/megatron_0.8/zmt/Megatron-LM"
#/megatron_lm_345m_v0.0/release/mp_rank_00/model_optim_rng.pt
CHECKPOINT_PATH="${BASE_DIR}/release/mp_rank_00/"
#tensorboard_logs
TENSORBOARD_LOGS_PATH="${BASE_DIR}/tensorboard_logs"
#gpt2-vocab.json
VOCAB_FILE="${BASE_DIR}/gpt2-vocab.json" #/gpt2-vocab.json
#gpt2-vocab.json
MERGE_FILE="${BASE_DIR}/gpt2-merges.txt" #/gpt2-vocab.json
#test-corpus.json
DATA_PATH="${BASE_DIR}/oscar-en-10k-meg-llama_text_document" #_text_documen
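For reference, below is a rough sketch of how these variables are typically consumed by the torchrun launch line in the standard Megatron-LM example scripts; this is an assumption about what gpt3.sh does, not its actual contents, and the usual model/training arguments are omitted:

# Hedged sketch (assumed script structure, not the actual gpt3.sh):
DISTRIBUTED_ARGS=(
    --nproc_per_node $GPUS_PER_NODE
    --nnodes $NUM_NODES
    --node_rank $NODE_RANK
    --master_addr $MASTER_ADDR
    --master_port $MASTER_PORT
)
# pretrain_gpt.py is the standard Megatron-LM GPT entry point; the model,
# optimizer, and batch-size arguments that a real run needs are left out here.
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
    --vocab-file $VOCAB_FILE \
    --merge-file $MERGE_FILE \
    --data-path $DATA_PATH \
    --load $CHECKPOINT_PATH \
    --tensorboard-dir $TENSORBOARD_LOGS_PATH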
I then run the same commands as above (machine1: bash gpt3.sh 0, machine2: bash gpt3.sh 1) and hit the same RuntimeError: Socket Timeout.
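Since the timeout is raised while the elastic agents exchange data through the TCP store on MASTER_ADDR:MASTER_PORT, one basic thing worth ruling out is whether machine2 can actually reach that address and port. A minimal connectivity check, assuming the MASTER_ADDR/MASTER_PORT values from the script above (a diagnostic sketch, not part of the original report):

# Run on machine2 while machine1 has already started "bash gpt3.sh 0",
# so that torchrun on the master node is listening on the rendezvous port.
MASTER_ADDR=192.168.2.111
MASTER_PORT=45678

# Is the master host reachable at all?
ping -c 3 $MASTER_ADDR

# Does the rendezvous TCP port accept connections? A timeout or refusal here
# (firewall, wrong interface, master not started yet) would match the
# "Socket Timeout" seen during rendezvous.
timeout 5 bash -c "</dev/tcp/$MASTER_ADDR/$MASTER_PORT" \
    && echo "port ${MASTER_PORT} open" \
    || echo "port ${MASTER_PORT} unreachable"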