Skip to content

Commit

Permalink
Populate PG metadata from CPU op to GPU kernel (#880)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #880

Propagate the 'Process Group ID' and 'Process Group Ranks' metadata captured on the CPU op to the corresponding GPU kernel.

Reviewed By: aaronenyeshi

Differential Revision: D54092504

fbshipit-source-id: 911bc244410036596d2268e54707b4b7b6dd9921
  • Loading branch information
shengbao-zheng authored and facebook-github-bot committed Mar 6, 2024
1 parent eeafa44 commit 8466a8b
Show file tree
Hide file tree
Showing 2 changed files with 44 additions and 0 deletions.
16 changes: 16 additions & 0 deletions libkineto/src/output_json.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ static constexpr const char* kOutMsgNelems = "Out msg nelems";
static constexpr const char* kGroupSize = "Group size";
static constexpr const char* kInSplit = "In split size";
static constexpr const char* kOutSplit = "Out split size";
static constexpr const char* kProcessGroupId = "Process Group ID";
static constexpr const char* kGroupRanks = "Process Group Ranks";

#ifdef __linux__
static constexpr char kDefaultLogFileFmt[] =
Expand Down Expand Up @@ -349,6 +351,20 @@ void ChromeTraceLogger::handleActivity(
kOutSplit,
outSplitSize));
}
const auto& processGroupId =
collectiveRecord->getMetadataValue(kProcessGroupId);
const auto& groupRanks = collectiveRecord->getMetadataValue(kGroupRanks);
if (!processGroupId.empty() && !groupRanks.empty()) {
if (!arg_values.empty()) {
arg_values.append(",");
}
arg_values.append(fmt::format(
"\"{}\": {}, \"{}\": {}",
kProcessGroupId,
processGroupId,
kGroupRanks,
groupRanks));
}
}

std::string args = "";
Expand Down
28 changes: 28 additions & 0 deletions libkineto/test/CuptiActivityProfilerTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ static constexpr auto kOutMsgNelems = "Out msg nelems";
static constexpr auto kInSplit = "In split size";
static constexpr auto kOutSplit = "Out split size";
static constexpr auto kGroupSize = "Group size";
static constexpr const char* kProcessGroupId = "Process Group ID";
static constexpr const char* kGroupRanks = "Process Group Ranks";
static constexpr int32_t kTruncatLength = 30;

#define CUDA_LAUNCH_KERNEL CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
Expand Down Expand Up @@ -571,6 +573,7 @@ TEST_F(CuptiActivityProfilerTest, GpuNCCLCollectiveTest) {
metadataMap.emplace(kInMsgNelems, "65664");
metadataMap.emplace(kOutMsgNelems, "131328");
metadataMap.emplace(kGroupSize, "2");
metadataMap.emplace(kProcessGroupId, "0");

std::vector<int64_t> inSplitSizes(50, 0);
std::string inSplitSizesStr = "";
Expand Down Expand Up @@ -602,6 +605,23 @@ TEST_F(CuptiActivityProfilerTest, GpuNCCLCollectiveTest) {
metadataMap.emplace(kOutSplit, outSplitSizesStr);
}

std::vector<int64_t> groupRanks(64, 0);
std::string groupRanksStr = "";
if (!groupRanks.empty() && groupRanks.size() <= kTruncatLength) {
metadataMap.emplace(
kGroupRanks, fmt::format("\"[{}]\"", fmt::join(groupRanks, ", ")));
} else if (groupRanks.size() > kTruncatLength) {
metadataMap.emplace(
kGroupRanks,
fmt::format(
"\"[{}, ..., {}]\"",
fmt::join(
groupRanks.begin(),
groupRanks.begin() + kTruncatLength - 1,
", "),
groupRanks.back()));
}

// Set up CPU events
auto cpuOps = std::make_unique<MockCpuActivityBuffer>(
start_time_us, start_time_us + duration_us);
Expand Down Expand Up @@ -642,6 +662,11 @@ TEST_F(CuptiActivityProfilerTest, GpuNCCLCollectiveTest) {
auto expectedInSplitStr =
fmt::format("\"[{}, ...]\"", fmt::join(expectedInSplit, ", "));
EXPECT_EQ(cpu_annotation->getMetadataValue(kInSplit), expectedInSplitStr);
std::vector<int64_t> expectedGroupRanks(kTruncatLength-1, 0);
auto expectedGroupRanksStr =
fmt::format("\"[{}, ..., {}]\"", fmt::join(expectedGroupRanks, ", "), "0");
EXPECT_EQ(cpu_annotation->getMetadataValue(kGroupRanks), expectedGroupRanksStr);


#ifdef __linux__
// Test saved output can be loaded as JSON
Expand Down Expand Up @@ -682,6 +707,9 @@ TEST_F(CuptiActivityProfilerTest, GpuNCCLCollectiveTest) {
EXPECT_EQ(2, countSubstrings(jsonString, expectedInSplitStr));
EXPECT_EQ(2, countSubstrings(jsonString, kOutSplit));
EXPECT_EQ(2, countSubstrings(jsonString, outSplitSizesStr));
EXPECT_EQ(2, countSubstrings(jsonString, kProcessGroupId));
EXPECT_EQ(2, countSubstrings(jsonString, kGroupRanks));
EXPECT_EQ(2, countSubstrings(jsonString, expectedGroupRanksStr));
#endif
}

Expand Down

0 comments on commit 8466a8b

Please sign in to comment.