Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

check CUDA_DEVICE_MAX_CONNECTIONS and TORCH_NCCL_AVOID_RECORD_STREAMS #379

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/demo_in_readme.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,25 +45,33 @@ jobs:
id: basic_train
run: |
source activate ${evo_env_torch21_flash2}
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}

- name: load_preset_ckpt
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
run: |
source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}

- name: load_new_ckpt
run: |
source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts

- name: torchrun-train
run: |
source activate ${evo_env_torch21_flash2}
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts

Expand Down
29 changes: 28 additions & 1 deletion internlm/core/trainer_builder.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import gc
import logging
import os
import time
from functools import partial
from typing import Dict, List, Optional, Union
Expand All @@ -8,6 +9,7 @@
import torch.distributed as dist
from torch.utils.data import DataLoader

from internlm.accelerator import AcceleratorType, get_accelerator
from internlm.checkpoint.checkpoint_manager import CheckpointManager
from internlm.core.context import global_context as gpc
from internlm.core.context.process_group_initializer import ParallelMode
Expand All @@ -31,7 +33,6 @@
)
from internlm.utils.common import (
BatchSkipper,
check_cuda_env,
enable_pytorch_expandable_segments,
get_current_device,
get_megatron_flops,
Expand All @@ -47,6 +48,32 @@

# global llm logger
logger = logging.getLogger(__file__)
internlm_accelerator = get_accelerator()


def _require_env_flag(name):
    """Raise ``AssertionError`` unless the environment variable ``name`` is exactly "1"."""
    value = os.getenv(name)
    if value is None:
        raise AssertionError(f"Env var {name} has not been set, please set it to 1!")
    if value != "1":
        raise AssertionError(f"Env var {name} is set to {value}, it should be set to 1!")


def check_cuda_env():
    """Validate the CUDA/NCCL environment variables required on the GPU backend.

    Nothing is checked when the accelerator backend is not ``AcceleratorType.GPU``.

    On GPU:
      * ``CUDA_DEVICE_MAX_CONNECTIONS`` must be "1" unless weight parallelism or
        expert weight parallelism is active (``size`` != 1) with
        ``forward_overlap_per`` equal to "layer".
      * ``TORCH_NCCL_AVOID_RECORD_STREAMS`` must be "1".

    Raises:
        AssertionError: if a required variable is unset or not equal to "1".
            Raised explicitly (not via ``assert``) so the validation still runs
            when Python is started with ``-O``.
    """
    if internlm_accelerator.get_accelerator_backend() != AcceleratorType.GPU:
        return

    wp_fwd_per = gpc.config.parallel.weight.get("forward_overlap_per", "layer")
    ewp_fwd_per = gpc.config.parallel.expert_weight.get("forward_overlap_per", "layer")
    wp_size = gpc.config.parallel.weight.get("size", 1)
    ewp_size = gpc.config.parallel.expert_weight.get("size", 1)

    # The single-connection requirement is waived only when one of the two
    # weight-parallel groups is active and overlaps its forward pass per layer.
    open_max_conns = (wp_size == 1 or wp_fwd_per != "layer") and (ewp_size == 1 or ewp_fwd_per != "layer")
    if open_max_conns:
        _require_env_flag("CUDA_DEVICE_MAX_CONNECTIONS")

    # NOTE(review): assumed to apply to every GPU run, independent of the
    # overlap settings above -- confirm against the intended nesting.
    _require_env_flag("TORCH_NCCL_AVOID_RECORD_STREAMS")


class TrainerBuilder(Trainer):
Expand Down
4 changes: 2 additions & 2 deletions internlm/data/tokenized/dummy_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import numpy as np
from torch.utils.data import Dataset

# from internlm.core.context.parallel_context import global_context as gpc
from internlm.core.context.parallel_context import global_context as gpc


class RandomDataset(Dataset):
Expand All @@ -30,7 +30,7 @@ def __init__(self, num_samples=10000, max_len=1024, fixed_seqlen: bool = False)
while len(d) < max_len:
r *= 2
d = list(range(n)) * r
# r = r % gpc.config.model.vocab_size
r = r % gpc.config.model.vocab_size
d = [n, r] + d
d = d[:max_len]
data.append(d)
Expand Down
5 changes: 0 additions & 5 deletions internlm/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,11 +249,6 @@ def enable_pytorch_expandable_segments():
logger.warning("To support the 'expandable_segments' configuration, please upgrade torch to version 2.1.0.")


def check_cuda_env():
    """Log a warning when CUDA_DEVICE_MAX_CONNECTIONS is absent from the environment."""
    if os.getenv("CUDA_DEVICE_MAX_CONNECTIONS") is not None:
        # Variable is present (whatever its value); nothing to report.
        return
    logger.warning("Env var CUDA_DEVICE_MAX_CONNECTIONS has not be set, please note this!")


class DummyProfile:
"""
Dummy Profile.
Expand Down
1 change: 1 addition & 0 deletions tests/test_data/test_batch_sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ def do_warmup(args):
rank, worldsize, init_config, should_sccuess, answer = args
build_environment(rank, worldsize, init_config)
gpc.config.model.num_chunks = 1 if gpc.get_world_size(ParallelMode.PIPELINE) == 1 else 2
gpc.config.model.vocab_size = 92544
engine, scheduler = init_model_and_optim(
8,
gpc.config.model.num_chunks,
Expand Down
Loading