Skip to content

Commit

Permalink
Support Cuda Graph in XProf
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 642775989
  • Loading branch information
tensorflower-gardener authored and copybara-github committed Jun 25, 2024
1 parent f888ff7 commit beb1769
Show file tree
Hide file tree
Showing 17 changed files with 420 additions and 5 deletions.
2 changes: 2 additions & 0 deletions third_party/tsl/tsl/profiler/utils/xplane_schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,8 @@ const StatTypeMap& GetStatTypeMap() {
{"dcn_chunk", kDcnChunk},
{"dcn_loop_index", kDcnLoopIndex},
{"dropped_traces", kDroppedTraces},
{"cuda_graph_id", kCudaGraphId},
{"cuda_graph_details", kCudaGraphDetails},
});
DCHECK_EQ(stat_type_map->size(), kNumStatTypes);
return *stat_type_map;
Expand Down
4 changes: 3 additions & 1 deletion third_party/tsl/tsl/profiler/utils/xplane_schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,9 @@ enum StatType {
kEdgeTpuModelProfileInfo,
kEdgeTpuMlir,
kDroppedTraces,
kLastStatType = kDroppedTraces,
kCudaGraphId,
kCudaGraphDetails,
kLastStatType = kCudaGraphDetails,
};

enum MegaScaleStatType : uint8_t {
Expand Down
1 change: 1 addition & 0 deletions xla/backends/profiler/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ tsl_gpu_library(
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/status",
"@com_google_absl//absl/types:optional",
"@com_google_absl//absl/types:span",
"@tsl//tsl/platform:env",
"@tsl//tsl/platform:errors",
"@tsl//tsl/platform:logging",
Expand Down
111 changes: 111 additions & 0 deletions xla/backends/profiler/gpu/cuda_test.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ limitations under the License.
#if GOOGLE_CUDA
#include <stdio.h>

#include "third_party/gpus/cuda/include/cuda.h"
#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
#include "third_party/gpus/cuda/include/driver_types.h"
#endif
Expand Down Expand Up @@ -183,6 +184,116 @@ void MemCopyP2PExplicit() {
#endif
}

#if GOOGLE_CUDA

// The CUDA graph test below is based on NVIDIA's CUPTI sample code in the
// extras/CUPTI/samples/cuda_graphs_trace/ directory of the CUDA distribution.
// Element-wise vector addition: C[i] = A[i] + B[i] for i in [0, N).
// One thread per element; the guard handles the partial tail block.
__global__ void VecAdd(const int *A, const int *B, int *C, int N) {
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= N) return;
  C[idx] = A[idx] + B[idx];
}

// Element-wise vector subtraction: C[i] = A[i] - B[i] for i in [0, N).
// One thread per element; the guard handles the partial tail block.
__global__ void VecSub(const int *A, const int *B, int *C, int N) {
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= N) return;
  C[idx] = A[idx] - B[idx];
}

// Builds a small five-node CUDA graph (two H2D copies -> VecAdd -> VecSub ->
// D2H copy), clones it, instantiates the clone, and launches it on the
// default stream so the profiler under test sees CUDA-graph activity.
// Returns true if the CUDA version is new enough (>= 11.7) that all the CUDA
// graph APIs used here are available; returns false otherwise.
bool CudaGraphCreateAndExecute() {
#if CUDA_VERSION >= 11070  // CUDA 11.7
  static constexpr size_t kNumElements = 2048;
  static constexpr size_t num_bytes = kNumElements * sizeof(int);
  static constexpr int kThreadsPerBlock = 256;

  int blocksPerGrid = 0;
  cudaStream_t stream = nullptr;  // Launch on the default stream.
  cudaKernelNodeParams kernelParams;
  cudaMemcpy3DParms memcpyParams = {0};
  cudaGraph_t graph;
  cudaGraph_t cloned_graph;
  cudaGraphExec_t graphExec;
  cudaGraphNode_t nodes[5];

  // Allocate input vectors h_A and h_B in host memory.
  // Contents are irrelevant for tracing purposes, so skip initialization.
  auto *h_A = (int *)malloc(num_bytes);
  auto *h_B = (int *)malloc(num_bytes);
  auto *h_C = (int *)malloc(num_bytes);

  // Allocate vectors in device memory.
  int *d_A, *d_B, *d_C;
  cudaMalloc((void **)&d_A, num_bytes);
  cudaMalloc((void **)&d_B, num_bytes);
  cudaMalloc((void **)&d_C, num_bytes);

  cudaGraphCreate(&graph, 0);

  // Nodes 0 and 1: host-to-device copies of the two inputs (no dependencies).
  memcpyParams.kind = cudaMemcpyHostToDevice;
  memcpyParams.srcPtr.ptr = h_A;
  memcpyParams.dstPtr.ptr = d_A;
  memcpyParams.extent.width = num_bytes;
  memcpyParams.extent.height = 1;
  memcpyParams.extent.depth = 1;
  cudaGraphAddMemcpyNode(&nodes[0], graph, NULL, 0, &memcpyParams);

  memcpyParams.srcPtr.ptr = h_B;
  memcpyParams.dstPtr.ptr = d_B;
  cudaGraphAddMemcpyNode(&nodes[1], graph, NULL, 0, &memcpyParams);

  // Node 2: VecAdd kernel, depending on both input copies.
  int num = kNumElements;
  void *kernelArgs[] = {(void *)&d_A, (void *)&d_B, (void *)&d_C, (void *)&num};
  blocksPerGrid = (kNumElements + kThreadsPerBlock - 1) / kThreadsPerBlock;
  kernelParams.func = (void *)VecAdd;
  kernelParams.gridDim = dim3(blocksPerGrid, 1, 1);
  kernelParams.blockDim = dim3(kThreadsPerBlock, 1, 1);
  kernelParams.sharedMemBytes = 0;
  kernelParams.kernelParams = (void **)kernelArgs;
  kernelParams.extra = NULL;

  cudaGraphAddKernelNode(&nodes[2], graph, &nodes[0], 2, &kernelParams);

  // Node 3: VecSub kernel (same launch config and args), after VecAdd.
  kernelParams.func = (void *)VecSub;
  cudaGraphAddKernelNode(&nodes[3], graph, &nodes[2], 1, &kernelParams);

  // Node 4: device-to-host copy of the result, after VecSub.
  memcpyParams.kind = cudaMemcpyDeviceToHost;
  memcpyParams.srcPtr.ptr = d_C;
  memcpyParams.dstPtr.ptr = h_C;
  memcpyParams.extent.width = num_bytes;
  memcpyParams.extent.height = 1;
  memcpyParams.extent.depth = 1;
  cudaGraphAddMemcpyNode(&nodes[4], graph, &nodes[3], 1, &memcpyParams);

  cudaGraphClone(&cloned_graph, graph);

  cudaGraphInstantiate(&graphExec, cloned_graph, NULL, NULL, 0);

  cudaGraphLaunch(graphExec, stream);

  cudaStreamSynchronize(stream);

  // Release the graph objects; the CUPTI sample this is based on leaked them.
  cudaGraphExecDestroy(graphExec);
  cudaGraphDestroy(cloned_graph);
  cudaGraphDestroy(graph);

  free(h_A);
  free(h_B);
  free(h_C);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  return true;
#else
  return false;
#endif  // CUDA_VERSION >= 11070
}

#else

// Non-CUDA build: fail the test with a build hint and report that the CUDA
// graph APIs are unavailable.
bool CudaGraphCreateAndExecute() {
  GTEST_FAIL() << "Build with --config=cuda";
  return false;
}

#endif

} // namespace test
} // namespace profiler
} // namespace xla
5 changes: 5 additions & 0 deletions xla/backends/profiler/gpu/cuda_test.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ void MemCopyP2PImplicit();
// Copies a few bytes of memory from device 0 to device 1.
void MemCopyP2PExplicit();

// Creates a simple CUDA graph, instantiates it, and executes it.
// Returns true if the CUDA version is new enough (>= 11.7) that all the
// related CUDA graph APIs — in particular activity graph tracing — are
// available.
bool CudaGraphCreateAndExecute();

} // namespace test
} // namespace profiler
} // namespace xla
Expand Down
84 changes: 84 additions & 0 deletions xla/backends/profiler/gpu/cupti_buffer_events.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ limitations under the License.

#include "xla/backends/profiler/gpu/cupti_buffer_events.h"

#include "absl/strings/str_cat.h"
#include "third_party/gpus/cuda/include/cuda.h"
#include "xla/backends/profiler/gpu/cupti_interface.h"
#include "tsl/platform/errors.h"
Expand All @@ -27,6 +28,11 @@ namespace {

using absl::StatusCode;

// Type trait: true iff the CUPTI activity record type `CuptiActivity` carries
// a `graphId` member. Defaults to false; specialized to true below for the
// kernel/memcpy/memcpyP2P/memset record types of CUPTI from CUDA 11.6+.
template <typename CuptiActivity>
struct CuptiActivityHasGraphId {
  static constexpr bool value = false;
};

// CUPTI from CUDA 11.6 adds information about the hardware channel that ops
// run on; this makes its way into the channel_id and channel_type fields in the
// structs we export.
Expand All @@ -39,19 +45,59 @@ using CuptiActivityKernelTy = CUpti_ActivityKernel9;
using CuptiActivityMemcpyTy = CUpti_ActivityMemcpy5;
using CuptiActivityMemcpyP2PTy = CUpti_ActivityMemcpyPtoP4;
using CuptiActivityMemsetTy = CUpti_ActivityMemset4;

template <>
struct CuptiActivityHasGraphId<CuptiActivityKernelTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemcpyTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemcpyP2PTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemsetTy> {
static constexpr bool value = true;
};
#elif CUDA_VERSION >= 11060 // CUDA 11.6
#define TF_CUPTI_HAS_CHANNEL_ID 1
using CuptiActivityKernelTy = CUpti_ActivityKernel7;
using CuptiActivityMemcpyTy = CUpti_ActivityMemcpy5;
using CuptiActivityMemcpyP2PTy = CUpti_ActivityMemcpyPtoP4;
using CuptiActivityMemsetTy = CUpti_ActivityMemset4;

template <>
struct CuptiActivityHasGraphId<CuptiActivityKernelTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemcpyTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemcpyP2PTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemsetTy> {
static constexpr bool value = true;
};
#else
using CuptiActivityKernelTy = CUpti_ActivityKernel4;
using CuptiActivityMemcpyTy = CUpti_ActivityMemcpy;
using CuptiActivityMemcpyP2PTy = CUpti_ActivityMemcpy2;
using CuptiActivityMemsetTy = CUpti_ActivityMemset;
#endif

#if CUDA_VERSION >= 12030 // CUDA 12.3
using CuptiActivityGraphTraceTy = CUpti_ActivityGraphTrace2;
#elif CUDA_VERSION >= 11070
using CuptiActivityGraphTraceTy = CUpti_ActivityGraphTrace;
#endif

// Maps an OverheadKind enum to a const string.
const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
switch (kind) {
Expand Down Expand Up @@ -94,6 +140,14 @@ const char *getActivityUnifiedMemoryKindString(
return "<UNKNOWN>";
}

// Copies the CUPTI activity record's graphId into the event, but only for
// record types that actually have one (per CuptiActivityHasGraphId); for all
// other types this compiles to a no-op, so call sites need no #if guards.
template <typename CuptiActivity>
void SetEventGraphId(CuptiTracerEvent &event,
                     const CuptiActivity *cupti_activity) {
  if constexpr (CuptiActivityHasGraphId<CuptiActivity>::value) {
    event.graph_id = cupti_activity->graphId;
  }
}

template <bool cupti_has_channel_id, typename CuptiActivityKernel>
void AddKernelActivityEvent(CuptiEventCollectorDelegate &collector,
const CuptiActivityKernel *kernel) {
Expand All @@ -111,6 +165,7 @@ void AddKernelActivityEvent(CuptiEventCollectorDelegate &collector,
collector.annotation_map.LookUp(event.device_id, event.correlation_id);
event.annotation = info.annotation;
event.nvtx_range = info.nvtx_range;
SetEventGraphId(event, kernel);
event.kernel_info.registers_per_thread = kernel->registersPerThread;
event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
Expand All @@ -127,6 +182,26 @@ void AddKernelActivityEvent(CuptiEventCollectorDelegate &collector,
collector.receive(std::move(event));
}

// Converts one CUPTI GRAPH_TRACE activity record into a CuptiTracerEvent of
// type CudaGraph and hands it to the collector.
void AddGraphTraceActivityEvent(CuptiEventCollectorDelegate &collector,
                                CuptiActivityGraphTraceTy *graph_trace) {
  const uint32_t graph_id = graph_trace->graphId;
  CuptiTracerEvent event{};
  event.type = CuptiTracerEventType::CudaGraph;
  event.source = CuptiTracerEventSource::Activity;
  event.name = absl::StrCat("CudaGraphExec:", graph_id);
  event.graph_id = graph_id;
  event.start_time_ns = graph_trace->start;
  event.end_time_ns = graph_trace->end;
  event.device_id = graph_trace->deviceId;
  event.context_id = graph_trace->contextId;
  event.stream_id = graph_trace->streamId;
  event.correlation_id = graph_trace->correlationId;
  // Attach any annotation / NVTX range recorded for this correlation id.
  AnnotationMap::AnnotationInfo info =
      collector.annotation_map.LookUp(event.device_id, event.correlation_id);
  event.annotation = info.annotation;
  event.nvtx_range = info.nvtx_range;
  collector.receive(std::move(event));
}

void AddMemcpyActivityEvent(CuptiEventCollectorDelegate &collector,
const CuptiActivityMemcpyTy *memcpy) {
CuptiTracerEvent event{};
Expand Down Expand Up @@ -163,6 +238,7 @@ void AddMemcpyActivityEvent(CuptiEventCollectorDelegate &collector,
AnnotationMap::AnnotationInfo info =
collector.annotation_map.LookUp(event.device_id, event.correlation_id);
event.annotation = info.annotation;
SetEventGraphId(event, memcpy);
event.memcpy_info.copy_kind = memcpy->copyKind;
event.memcpy_info.num_bytes = memcpy->bytes;
event.memcpy_info.destination = memcpy->deviceId;
Expand Down Expand Up @@ -192,6 +268,7 @@ void AddMemcpyP2PActivityEvent(CuptiEventCollectorDelegate &collector,
AnnotationMap::AnnotationInfo info =
collector.annotation_map.LookUp(event.device_id, event.correlation_id);
event.annotation = info.annotation;
SetEventGraphId(event, memcpy);
event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
event.memcpy_info.num_bytes = memcpy->bytes;
event.memcpy_info.destination = memcpy->dstDeviceId;
Expand Down Expand Up @@ -320,6 +397,7 @@ void AddMemsetActivityEvent(CuptiEventCollectorDelegate &collector,
event.correlation_id = memset->correlationId;
event.context_id = memset->contextId;
event.stream_id = memset->streamId;
SetEventGraphId(event, memset);
event.memset_info.num_bytes = memset->bytes;
event.memset_info.mem_kind = mem_kind;
event.memset_info.async = (memset->flags & CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC);
Expand Down Expand Up @@ -418,6 +496,12 @@ static absl::Status ConvertActivityBuffer(
collector,
reinterpret_cast<CUpti_ActivitySynchronization *>(record));
break;
#if CUDA_VERSION >= 11070
case CUPTI_ACTIVITY_KIND_GRAPH_TRACE:
AddGraphTraceActivityEvent(
collector, reinterpret_cast<CuptiActivityGraphTraceTy *>(record));
break;
#endif
default:
VLOG(3) << "Activity type " << record->kind << " is not supported.";
break;
Expand Down
12 changes: 12 additions & 0 deletions xla/backends/profiler/gpu/cupti_buffer_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,14 @@ struct GenericDetails {
uint32_t cbid;
};

// Details for CUDA-graph related events (`type` == CudaGraph).
struct CudaGraphDetails {
  uint32_t cbid;  // 0 for activity events, otherwise the cbid of the callback
  uint32_t orig_graph_id;  // The original graph from which new graph is
                           // instantiated. Note: graph_id itself is kept in
                           // the general CuptiTracerEvent fields, because
                           // when tracing in node mode many activity events
                           // will contain a graph id.
};

inline std::string ToXStat(const KernelDetails& kernel_info,
double occupancy_pct) {
return absl::StrCat(
Expand Down Expand Up @@ -165,6 +173,7 @@ enum class CuptiTracerEventType {
MemoryResidency = 12,
HostRegister = 13,
HostUnregister = 14,
CudaGraph = 15,
Generic = 100,
};

Expand Down Expand Up @@ -203,6 +212,7 @@ struct CuptiTracerEvent {
uint32_t thread_id = kInvalidThreadId;
int64_t context_id = kInvalidContextId;
int64_t stream_id = kInvalidStreamId;
uint32_t graph_id = 0;
union {
// For Memcpy API and activities. `type` must be Memcpy*.
MemcpyDetails memcpy_info;
Expand All @@ -222,6 +232,8 @@ struct CuptiTracerEvent {
MemoryResidencyDetails memory_residency_info;
// Used for `source` DriverCallback, `type` must be Generic.
GenericDetails generic_info;
// Used for `source` DriverCallback, `type` must be CudaGraph.
CudaGraphDetails cuda_graph_info;
};
};

Expand Down
Loading

0 comments on commit beb1769

Please sign in to comment.