Skip to content

Commit

Permalink
Support Cuda Graph in XProf
Browse files Browse the repository at this point in the history
PiperOrigin-RevId: 642775989
  • Loading branch information
tensorflower-gardener authored and copybara-github committed Jun 25, 2024
1 parent f888ff7 commit beb1769
Show file tree
Hide file tree
Showing 17 changed files with 420 additions and 5 deletions.
2 changes: 2 additions & 0 deletions third_party/tsl/tsl/profiler/utils/xplane_schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,8 @@ const StatTypeMap& GetStatTypeMap() {
{"dcn_chunk", kDcnChunk},
{"dcn_loop_index", kDcnLoopIndex},
{"dropped_traces", kDroppedTraces},
{"cuda_graph_id", kCudaGraphId},
{"cuda_graph_details", kCudaGraphDetails},
});
DCHECK_EQ(stat_type_map->size(), kNumStatTypes);
return *stat_type_map;
Expand Down
4 changes: 3 additions & 1 deletion third_party/tsl/tsl/profiler/utils/xplane_schema.h
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,9 @@ enum StatType {
kEdgeTpuModelProfileInfo,
kEdgeTpuMlir,
kDroppedTraces,
kLastStatType = kDroppedTraces,
kCudaGraphId,
kCudaGraphDetails,
kLastStatType = kCudaGraphDetails,
};

enum MegaScaleStatType : uint8_t {
Expand Down
1 change: 1 addition & 0 deletions xla/backends/profiler/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ tsl_gpu_library(
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/status",
"@com_google_absl//absl/types:optional",
"@com_google_absl//absl/types:span",
"@tsl//tsl/platform:env",
"@tsl//tsl/platform:errors",
"@tsl//tsl/platform:logging",
Expand Down
111 changes: 111 additions & 0 deletions xla/backends/profiler/gpu/cuda_test.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ limitations under the License.
#if GOOGLE_CUDA
#include <stdio.h>

#include "third_party/gpus/cuda/include/cuda.h"
#include "third_party/gpus/cuda/include/cuda_runtime_api.h"
#include "third_party/gpus/cuda/include/driver_types.h"
#endif
Expand Down Expand Up @@ -183,6 +184,116 @@ void MemCopyP2PExplicit() {
#endif
}

#if GOOGLE_CUDA

// The CUDA graph test below is based on NVIDIA's CUPTI sample code in the
// extras/CUPTI/samples/cuda_graphs_trace/ directory of the CUDA distribution.
// Element-wise vector addition: C[i] = A[i] + B[i] for i in [0, N).
// One thread per element; the guard handles the partial tail block.
__global__ void VecAdd(const int *A, const int *B, int *C, int N) {
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= N) return;
  C[idx] = A[idx] + B[idx];
}

// Element-wise vector subtraction: C[i] = A[i] - B[i] for i in [0, N).
// One thread per element; the guard handles the partial tail block.
__global__ void VecSub(const int *A, const int *B, int *C, int N) {
  const int idx = blockDim.x * blockIdx.x + threadIdx.x;
  if (idx >= N) return;
  C[idx] = A[idx] - B[idx];
}

// Builds a small five-node CUDA graph (two H2D copies -> VecAdd -> VecSub ->
// D2H copy), clones it, instantiates the clone, and launches it on the
// default stream so the profiler under test sees CUDA-graph activity.
// Returns true if the CUDA version is new enough (>= 11.7) that all the CUDA
// graph APIs used here are available; returns false otherwise.
bool CudaGraphCreateAndExecute() {
#if CUDA_VERSION >= 11070  // CUDA 11.7
  static constexpr size_t kNumElements = 2048;
  static constexpr size_t num_bytes = kNumElements * sizeof(int);
  static constexpr int kThreadsPerBlock = 256;

  int blocksPerGrid = 0;
  cudaStream_t stream = nullptr;  // Launch on the default stream.
  cudaKernelNodeParams kernelParams;
  cudaMemcpy3DParms memcpyParams = {0};
  cudaGraph_t graph;
  cudaGraph_t cloned_graph;
  cudaGraphExec_t graphExec;
  cudaGraphNode_t nodes[5];

  // Allocate input vectors h_A and h_B in host memory.
  // Contents are irrelevant for tracing purposes, so skip initialization.
  auto *h_A = (int *)malloc(num_bytes);
  auto *h_B = (int *)malloc(num_bytes);
  auto *h_C = (int *)malloc(num_bytes);

  // Allocate vectors in device memory.
  int *d_A, *d_B, *d_C;
  cudaMalloc((void **)&d_A, num_bytes);
  cudaMalloc((void **)&d_B, num_bytes);
  cudaMalloc((void **)&d_C, num_bytes);

  cudaGraphCreate(&graph, 0);

  // Nodes 0 and 1: host-to-device copies of the two inputs (no dependencies).
  memcpyParams.kind = cudaMemcpyHostToDevice;
  memcpyParams.srcPtr.ptr = h_A;
  memcpyParams.dstPtr.ptr = d_A;
  memcpyParams.extent.width = num_bytes;
  memcpyParams.extent.height = 1;
  memcpyParams.extent.depth = 1;
  cudaGraphAddMemcpyNode(&nodes[0], graph, NULL, 0, &memcpyParams);

  memcpyParams.srcPtr.ptr = h_B;
  memcpyParams.dstPtr.ptr = d_B;
  cudaGraphAddMemcpyNode(&nodes[1], graph, NULL, 0, &memcpyParams);

  // Node 2: VecAdd kernel, depending on both input copies.
  int num = kNumElements;
  void *kernelArgs[] = {(void *)&d_A, (void *)&d_B, (void *)&d_C, (void *)&num};
  blocksPerGrid = (kNumElements + kThreadsPerBlock - 1) / kThreadsPerBlock;
  kernelParams.func = (void *)VecAdd;
  kernelParams.gridDim = dim3(blocksPerGrid, 1, 1);
  kernelParams.blockDim = dim3(kThreadsPerBlock, 1, 1);
  kernelParams.sharedMemBytes = 0;
  kernelParams.kernelParams = (void **)kernelArgs;
  kernelParams.extra = NULL;

  cudaGraphAddKernelNode(&nodes[2], graph, &nodes[0], 2, &kernelParams);

  // Node 3: VecSub kernel (same launch config and args), after VecAdd.
  kernelParams.func = (void *)VecSub;
  cudaGraphAddKernelNode(&nodes[3], graph, &nodes[2], 1, &kernelParams);

  // Node 4: device-to-host copy of the result, after VecSub.
  memcpyParams.kind = cudaMemcpyDeviceToHost;
  memcpyParams.srcPtr.ptr = d_C;
  memcpyParams.dstPtr.ptr = h_C;
  memcpyParams.extent.width = num_bytes;
  memcpyParams.extent.height = 1;
  memcpyParams.extent.depth = 1;
  cudaGraphAddMemcpyNode(&nodes[4], graph, &nodes[3], 1, &memcpyParams);

  cudaGraphClone(&cloned_graph, graph);

  cudaGraphInstantiate(&graphExec, cloned_graph, NULL, NULL, 0);

  cudaGraphLaunch(graphExec, stream);

  cudaStreamSynchronize(stream);

  // Release the graph objects; the CUPTI sample this is based on leaked them.
  cudaGraphExecDestroy(graphExec);
  cudaGraphDestroy(cloned_graph);
  cudaGraphDestroy(graph);

  free(h_A);
  free(h_B);
  free(h_C);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
  return true;
#else
  return false;
#endif  // CUDA_VERSION >= 11070
}

#else

// Non-CUDA build: fail the test with a build hint and report that the CUDA
// graph APIs are unavailable.
bool CudaGraphCreateAndExecute() {
  GTEST_FAIL() << "Build with --config=cuda";
  return false;
}

#endif

} // namespace test
} // namespace profiler
} // namespace xla
5 changes: 5 additions & 0 deletions xla/backends/profiler/gpu/cuda_test.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ void MemCopyP2PImplicit();
// Copies a few bytes of memory from device 0 to device 1.
void MemCopyP2PExplicit();

// Creates a simple CUDA graph, instantiates it, and executes it.
// Returns true if the CUDA version is new enough (>= 11.7) that all the
// related CUDA graph APIs — in particular activity graph tracing — are
// available.
bool CudaGraphCreateAndExecute();

} // namespace test
} // namespace profiler
} // namespace xla
Expand Down
84 changes: 84 additions & 0 deletions xla/backends/profiler/gpu/cupti_buffer_events.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ limitations under the License.

#include "xla/backends/profiler/gpu/cupti_buffer_events.h"

#include "absl/strings/str_cat.h"
#include "third_party/gpus/cuda/include/cuda.h"
#include "xla/backends/profiler/gpu/cupti_interface.h"
#include "tsl/platform/errors.h"
Expand All @@ -27,6 +28,11 @@ namespace {

using absl::StatusCode;

// Type trait: true iff the CUPTI activity record type `CuptiActivity` carries
// a `graphId` member. Defaults to false; specialized to true below for the
// kernel/memcpy/memcpyP2P/memset record types of CUPTI from CUDA 11.6+.
template <typename CuptiActivity>
struct CuptiActivityHasGraphId {
  static constexpr bool value = false;
};

// CUPTI from CUDA 11.6 adds information about the hardware channel that ops
// run on; this makes its way into the channel_id and channel_type fields in the
// structs we export.
Expand All @@ -39,19 +45,59 @@ using CuptiActivityKernelTy = CUpti_ActivityKernel9;
using CuptiActivityMemcpyTy = CUpti_ActivityMemcpy5;
using CuptiActivityMemcpyP2PTy = CUpti_ActivityMemcpyPtoP4;
using CuptiActivityMemsetTy = CUpti_ActivityMemset4;

template <>
struct CuptiActivityHasGraphId<CuptiActivityKernelTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemcpyTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemcpyP2PTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemsetTy> {
static constexpr bool value = true;
};
#elif CUDA_VERSION >= 11060 // CUDA 11.6
#define TF_CUPTI_HAS_CHANNEL_ID 1
using CuptiActivityKernelTy = CUpti_ActivityKernel7;
using CuptiActivityMemcpyTy = CUpti_ActivityMemcpy5;
using CuptiActivityMemcpyP2PTy = CUpti_ActivityMemcpyPtoP4;
using CuptiActivityMemsetTy = CUpti_ActivityMemset4;

template <>
struct CuptiActivityHasGraphId<CuptiActivityKernelTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemcpyTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemcpyP2PTy> {
static constexpr bool value = true;
};
template <>
struct CuptiActivityHasGraphId<CuptiActivityMemsetTy> {
static constexpr bool value = true;
};
#else
using CuptiActivityKernelTy = CUpti_ActivityKernel4;
using CuptiActivityMemcpyTy = CUpti_ActivityMemcpy;
using CuptiActivityMemcpyP2PTy = CUpti_ActivityMemcpy2;
using CuptiActivityMemsetTy = CUpti_ActivityMemset;
#endif

#if CUDA_VERSION >= 12030 // CUDA 12.3
using CuptiActivityGraphTraceTy = CUpti_ActivityGraphTrace2;
#elif CUDA_VERSION >= 11070
using CuptiActivityGraphTraceTy = CUpti_ActivityGraphTrace;
#endif

// Maps an OverheadKind enum to a const string.
const char *getActivityOverheadKindString(CUpti_ActivityOverheadKind kind) {
switch (kind) {
Expand Down Expand Up @@ -94,6 +140,14 @@ const char *getActivityUnifiedMemoryKindString(
return "<UNKNOWN>";
}

// Copies the CUPTI activity record's graphId into the event, but only for
// record types that actually have one (per CuptiActivityHasGraphId); for all
// other types this compiles to a no-op, so call sites need no #if guards.
template <typename CuptiActivity>
void SetEventGraphId(CuptiTracerEvent &event,
                     const CuptiActivity *cupti_activity) {
  if constexpr (CuptiActivityHasGraphId<CuptiActivity>::value) {
    event.graph_id = cupti_activity->graphId;
  }
}

template <bool cupti_has_channel_id, typename CuptiActivityKernel>
void AddKernelActivityEvent(CuptiEventCollectorDelegate &collector,
const CuptiActivityKernel *kernel) {
Expand All @@ -111,6 +165,7 @@ void AddKernelActivityEvent(CuptiEventCollectorDelegate &collector,
collector.annotation_map.LookUp(event.device_id, event.correlation_id);
event.annotation = info.annotation;
event.nvtx_range = info.nvtx_range;
SetEventGraphId(event, kernel);
event.kernel_info.registers_per_thread = kernel->registersPerThread;
event.kernel_info.static_shared_memory_usage = kernel->staticSharedMemory;
event.kernel_info.dynamic_shared_memory_usage = kernel->dynamicSharedMemory;
Expand All @@ -127,6 +182,26 @@ void AddKernelActivityEvent(CuptiEventCollectorDelegate &collector,
collector.receive(std::move(event));
}

// Converts one CUPTI GRAPH_TRACE activity record into a CuptiTracerEvent of
// type CudaGraph and hands it to the collector.
void AddGraphTraceActivityEvent(CuptiEventCollectorDelegate &collector,
                                CuptiActivityGraphTraceTy *graph_trace) {
  const uint32_t graph_id = graph_trace->graphId;
  CuptiTracerEvent event{};
  event.type = CuptiTracerEventType::CudaGraph;
  event.source = CuptiTracerEventSource::Activity;
  event.name = absl::StrCat("CudaGraphExec:", graph_id);
  event.graph_id = graph_id;
  event.start_time_ns = graph_trace->start;
  event.end_time_ns = graph_trace->end;
  event.device_id = graph_trace->deviceId;
  event.context_id = graph_trace->contextId;
  event.stream_id = graph_trace->streamId;
  event.correlation_id = graph_trace->correlationId;
  // Attach any annotation / NVTX range recorded for this correlation id.
  AnnotationMap::AnnotationInfo info =
      collector.annotation_map.LookUp(event.device_id, event.correlation_id);
  event.annotation = info.annotation;
  event.nvtx_range = info.nvtx_range;
  collector.receive(std::move(event));
}

void AddMemcpyActivityEvent(CuptiEventCollectorDelegate &collector,
const CuptiActivityMemcpyTy *memcpy) {
CuptiTracerEvent event{};
Expand Down Expand Up @@ -163,6 +238,7 @@ void AddMemcpyActivityEvent(CuptiEventCollectorDelegate &collector,
AnnotationMap::AnnotationInfo info =
collector.annotation_map.LookUp(event.device_id, event.correlation_id);
event.annotation = info.annotation;
SetEventGraphId(event, memcpy);
event.memcpy_info.copy_kind = memcpy->copyKind;
event.memcpy_info.num_bytes = memcpy->bytes;
event.memcpy_info.destination = memcpy->deviceId;
Expand Down Expand Up @@ -192,6 +268,7 @@ void AddMemcpyP2PActivityEvent(CuptiEventCollectorDelegate &collector,
AnnotationMap::AnnotationInfo info =
collector.annotation_map.LookUp(event.device_id, event.correlation_id);
event.annotation = info.annotation;
SetEventGraphId(event, memcpy);
event.memcpy_info.copy_kind = CUPTI_ACTIVITY_MEMCPY_KIND_PTOP;
event.memcpy_info.num_bytes = memcpy->bytes;
event.memcpy_info.destination = memcpy->dstDeviceId;
Expand Down Expand Up @@ -320,6 +397,7 @@ void AddMemsetActivityEvent(CuptiEventCollectorDelegate &collector,
event.correlation_id = memset->correlationId;
event.context_id = memset->contextId;
event.stream_id = memset->streamId;
SetEventGraphId(event, memset);
event.memset_info.num_bytes = memset->bytes;
event.memset_info.mem_kind = mem_kind;
event.memset_info.async = (memset->flags & CUPTI_ACTIVITY_FLAG_MEMSET_ASYNC);
Expand Down Expand Up @@ -418,6 +496,12 @@ static absl::Status ConvertActivityBuffer(
collector,
reinterpret_cast<CUpti_ActivitySynchronization *>(record));
break;
#if CUDA_VERSION >= 11070
case CUPTI_ACTIVITY_KIND_GRAPH_TRACE:
AddGraphTraceActivityEvent(
collector, reinterpret_cast<CuptiActivityGraphTraceTy *>(record));
break;
#endif
default:
VLOG(3) << "Activity type " << record->kind << " is not supported.";
break;
Expand Down
12 changes: 12 additions & 0 deletions xla/backends/profiler/gpu/cupti_buffer_events.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,14 @@ struct GenericDetails {
uint32_t cbid;
};

// Details for CUDA-graph related events (`type` == CudaGraph).
struct CudaGraphDetails {
  uint32_t cbid;  // 0 for activity events, otherwise the cbid of the callback
  uint32_t orig_graph_id;  // The original graph from which new graph is
                           // instantiated. Note: graph_id itself is kept in
                           // the general CuptiTracerEvent fields, because
                           // when tracing in node mode many activity events
                           // will contain a graph id.
};

inline std::string ToXStat(const KernelDetails& kernel_info,
double occupancy_pct) {
return absl::StrCat(
Expand Down Expand Up @@ -165,6 +173,7 @@ enum class CuptiTracerEventType {
MemoryResidency = 12,
HostRegister = 13,
HostUnregister = 14,
CudaGraph = 15,
Generic = 100,
};

Expand Down Expand Up @@ -203,6 +212,7 @@ struct CuptiTracerEvent {
uint32_t thread_id = kInvalidThreadId;
int64_t context_id = kInvalidContextId;
int64_t stream_id = kInvalidStreamId;
uint32_t graph_id = 0;
union {
// For Memcpy API and activities. `type` must be Memcpy*.
MemcpyDetails memcpy_info;
Expand All @@ -222,6 +232,8 @@ struct CuptiTracerEvent {
MemoryResidencyDetails memory_residency_info;
// Used for `source` DriverCallback, `type` must be Generic.
GenericDetails generic_info;
// Used for `source` DriverCallback, `type` must be CudaGraph.
CudaGraphDetails cuda_graph_info;
};
};

Expand Down
Loading

0 comments on commit beb1769

Please sign in to comment.