PR #16913: [PJRT:GPU] Enable creating topology without a GPU device
Imported from GitHub PR #16913

Currently, PJRT_TopologyDescription_Create always creates the topology from the local client, which requires a local GPU device.

This patch allows the caller to specify the topology shape and device configuration explicitly in the PJRT_TopologyDescription_Create call, without querying a local client. This enables deviceless compilation.
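
For illustration, here is a minimal sketch of the new call pattern, mirroring the tests added in this change. The `api` pointer and the `target_config_textproto` string are assumptions for the example, not part of the patch; the topology string follows the shape `<num-slices>x<num-hosts-per-slice>x<num-devices-per-host>`.

```cpp
// Sketch only: create a GPU topology description without a local GPU device.
// Assumes `api` is a PJRT_Api* obtained from the GPU plugin and
// `target_config_textproto` holds a GpuTargetConfigProto in text format.
absl::flat_hash_map<std::string, xla::PjRtValueType> options = {
    // 2 slices, 2 hosts per slice, 8 devices per host.
    {"topology", static_cast<std::string>("2x2x8")},
    {"target_config", static_cast<std::string>(target_config_textproto)}};
std::vector<PJRT_NamedValue> c_options =
    pjrt::ConvertToPjRtNamedValueList(options).value();

PJRT_TopologyDescription_Create_Args args;
args.struct_size = PJRT_TopologyDescription_Create_Args_STRUCT_SIZE;
args.extension_start = nullptr;
args.topology = nullptr;
args.num_options = c_options.size();
args.create_options = c_options.data();
// With both options present, no local client or GPU device is queried.
PJRT_Error* error = api->PJRT_TopologyDescription_Create(&args);
```
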
Copybara import of the project:

--
bc85038 by Jaroslav Sevcik <[email protected]>:

[PJRT:GPU] Enable creating topology without a GPU device

--
e4eb44e by Jaroslav Sevcik <[email protected]>:

Enable overlaying topology on local device

--
ff25a57 by Jaroslav Sevcik <[email protected]>:

Address reviewer comments

--
8bfb7a2 by Jaroslav Sevcik <[email protected]>:

Cleanup

Merging this change closes #16913

COPYBARA_INTEGRATE_REVIEW=#16913 from jaro-sevcik:deviceless-topology-creation 8bfb7a2
PiperOrigin-RevId: 679047171
jaro-sevcik authored and Google-ML-Automation committed Sep 26, 2024
1 parent 69d0f6d commit c3e0914
Showing 2 changed files with 208 additions and 19 deletions.
121 changes: 102 additions & 19 deletions xla/pjrt/c/pjrt_c_api_gpu_internal.cc
@@ -167,33 +167,81 @@ PJRT_Error* PJRT_ExecuteContext_Create(PJRT_ExecuteContext_Create_Args* args) {
   return nullptr;
 }
 
-PJRT_Error* PJRT_GpuDeviceTopology_Create(
-    PJRT_TopologyDescription_Create_Args* args) {
-  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
-      "PJRT_TopologyDescription_Create_Args",
-      PJRT_TopologyDescription_Create_Args_STRUCT_SIZE, args->struct_size));
+namespace {
 
-  PJRT_ASSIGN_OR_RETURN(xla::LocalClient * xla_client,
-                        xla::GetGpuXlaClient(/*platform_name=*/std::nullopt,
-                                             /*allowed_devices=*/std::nullopt));
+struct TargetConfigAndDevices {
+  stream_executor::GpuTargetConfigProto target_config_proto;
+  std::vector<int> device_ids;
+};
+
+// Parses the 'target_config' entry in 'options'. The option is
+// parsed as GpuTargetConfigProto. If there is no 'target_config' in
+// 'options', the function falls back to creating a local client,
+// returning the local client's target config.
+absl::StatusOr<TargetConfigAndDevices> GetTargetConfigFromOptions(
+    const absl::flat_hash_map<std::string, xla::PjRtValueType>& options) {
+  if (auto target_config_it = options.find("target_config");
+      target_config_it != options.end()) {
+    std::string target_config_proto_string =
+        std::get<std::string>(target_config_it->second);
+    stream_executor::GpuTargetConfigProto target_config_proto;
+    if (!tsl::protobuf::TextFormat::ParseFromString(target_config_proto_string,
+                                                    &target_config_proto)) {
+      return absl::FailedPreconditionError(
+          "Failed to parse GpuTargetConfigProto "
+          "from the 'target_config' parameter.");
+    }
+    return {{target_config_proto, {}}};
+  }
+  TF_ASSIGN_OR_RETURN(xla::LocalClient * xla_client,
+                      xla::GetGpuXlaClient(/*platform_name=*/std::nullopt,
+                                           /*allowed_devices=*/std::nullopt));
   stream_executor::StreamExecutor* executor =
       xla_client->backend().default_stream_executor();
-  const stream_executor::DeviceDescription& description =
-      executor->GetDeviceDescription();
   std::vector<int> device_ids;
   device_ids.reserve(xla_client->backend().stream_executors().size());
   for (stream_executor::StreamExecutor* executor :
        xla_client->backend().stream_executors()) {
     device_ids.push_back(executor->device_ordinal());
   }
   auto gpu_target_config = xla::Compiler::TargetConfig(executor);
-  // TODO(b/341334898): Create a single-host GPU topology. Will be updated for
-  // multi-host support in the future.
-  auto gpu_topology = std::make_shared<const xla::GpuTopology>(
-      device_ids, description.name(),
-      /*num_slices=*/1,
-      /*num_hosts_per_slice=*/1,
-      /*num_devices_per_host=*/device_ids.size());
+  return {{gpu_target_config.ToProto(), device_ids}};
+}
+
+struct TopologySizes {
+  int num_slices = 0;
+  int num_hosts_per_slice = 0;
+  int num_devices_per_host = 0;
+
+  int GetDeviceCount() {
+    return num_slices * num_hosts_per_slice * num_devices_per_host;
+  }
+
+  static absl::StatusOr<TopologySizes> FromString(
+      std::string_view topology_string) {
+    TopologySizes sizes;
+    std::vector<std::string> topology_components =
+        absl::StrSplit(topology_string, 'x');
+    if (topology_components.size() != 3 ||
+        !absl::SimpleAtoi(topology_components[0], &sizes.num_slices) ||
+        !absl::SimpleAtoi(topology_components[1], &sizes.num_hosts_per_slice) ||
+        !absl::SimpleAtoi(topology_components[2],
+                          &sizes.num_devices_per_host)) {
+      return absl::InternalError(
+          "topology must be of shape "
+          "\"<num-slices>x<num-hosts-per-slice>x<num-devices-per-host>\"");
+    }
+    return sizes;
+  }
+};
+
+}  // namespace
+
+PJRT_Error* PJRT_GpuDeviceTopology_Create(
+    PJRT_TopologyDescription_Create_Args* args) {
+  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
+      "PJRT_TopologyDescription_Create_Args",
+      PJRT_TopologyDescription_Create_Args_STRUCT_SIZE, args->struct_size));
 
   // Determine the platform ID and name based on the platform.
   xla::PjRtPlatformId platform_id =
@@ -203,12 +251,47 @@ PJRT_Error* PJRT_GpuDeviceTopology_Create(
       (std::string(PJRT_GPU_PLUGIN_PLATFORM_NAME) == "ROCM") ? xla::RocmName()
                                                              : xla::CudaName();
 
+  absl::flat_hash_map<std::string, xla::PjRtValueType> create_options =
+      pjrt::ConvertFromPjRtNamedValueList(args->create_options,
+                                          args->num_options);
+
+  PJRT_ASSIGN_OR_RETURN(TargetConfigAndDevices target_config_and_devices,
+                        GetTargetConfigFromOptions(create_options));
+
+  std::vector<int>& device_ids = target_config_and_devices.device_ids;
+  stream_executor::GpuTargetConfigProto& target_config_proto =
+      target_config_and_devices.target_config_proto;
+  TopologySizes sizes{1, 1, static_cast<int>(device_ids.size())};
+
+  if (auto topology_it = create_options.find("topology");
+      topology_it != create_options.end()) {
+    std::string topology_string = std::get<std::string>(topology_it->second);
+    PJRT_ASSIGN_OR_RETURN(sizes, TopologySizes::FromString(topology_string));
+  }
+
+  if (sizes.GetDeviceCount() == 0) {
+    // If the user did not specify the topology and we did not
+    // get any devices from the client, then error out because
+    // we do not know how many devices the topology should have.
+    return new PJRT_Error{absl::FailedPreconditionError(
+        "Cannot create topology without an explicit topology shape or without "
+        "a client")};
+  }
+
+  if (sizes.GetDeviceCount() != device_ids.size()) {
+    device_ids.resize(sizes.GetDeviceCount());
+    absl::c_iota(device_ids, 0);
+  }
+
+  auto gpu_topology = std::make_shared<const xla::GpuTopology>(
+      device_ids, target_config_proto.device_description_str(),
+      sizes.num_slices, sizes.num_hosts_per_slice, sizes.num_devices_per_host);
+
   auto pjrt_topology =
       std::make_unique<xla::StreamExecutorGpuTopologyDescription>(
           platform_id, platform_name, std::move(gpu_topology),
           absl::flat_hash_map<std::string, xla::PjRtDeviceAttribute>{
-              {"target_config",
-               gpu_target_config.ToProto().SerializeAsString()}});
+              {"target_config", target_config_proto.SerializeAsString()}});
   args->topology = CreateWrapperDeviceTopology(std::move(pjrt_topology));
   return nullptr;
 }
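
To make the new options concrete: a topology string such as "16x2x4" parses into num_slices = 16, num_hosts_per_slice = 2, and num_devices_per_host = 4, so GetDeviceCount() returns 16 * 2 * 4 = 128. With no local client contributing device IDs, device_ids is resized to 128 and filled with consecutive IDs starting at 0. The snippet below is a standalone illustration of that arithmetic, not code from this patch:

```cpp
// Standalone sketch of the "topology" option semantics; the shape values
// ("16x2x4") are illustrative assumptions.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // "16x2x4" => <num-slices> x <num-hosts-per-slice> x <num-devices-per-host>.
  const int num_slices = 16;
  const int num_hosts_per_slice = 2;
  const int num_devices_per_host = 4;
  const int device_count =
      num_slices * num_hosts_per_slice * num_devices_per_host;

  // Mirrors the resize + iota step: IDs 0..127 when the client has no devices.
  std::vector<int> device_ids(device_count);
  std::iota(device_ids.begin(), device_ids.end(), 0);

  // 128 devices spread over 16 * 2 = 32 host processes.
  std::printf("devices=%d processes=%d first=%d last=%d\n", device_count,
              num_slices * num_hosts_per_slice, device_ids.front(),
              device_ids.back());
  return 0;
}
```
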
106 changes: 106 additions & 0 deletions xla/pjrt/c/pjrt_c_api_gpu_test.cc
@@ -427,6 +427,8 @@ TEST(PJRTGpuDeviceTopologyTest, CreateGpuTopology) {
   args.struct_size = PJRT_TopologyDescription_Create_Args_STRUCT_SIZE;
   args.extension_start = nullptr;
   args.topology = nullptr;
+  args.num_options = 0;
+  args.create_options = nullptr;
 
   PJRT_Error* error = pjrt_api->PJRT_TopologyDescription_Create(&args);
   EXPECT_EQ(error, nullptr) << error->status.message();
@@ -452,6 +454,110 @@ TEST(PJRTGpuDeviceTopologyTest, CreateGpuTopology) {
   EXPECT_EQ(destroy_error, nullptr) << destroy_error->status.message();
 }
 
+constexpr char const* kTargetConfigString = R"(gpu_device_info {
+  threads_per_block_limit: 1024
+  threads_per_warp: 32
+  shared_memory_per_block: 49152
+  shared_memory_per_core: 98304
+  threads_per_core_limit: 2048
+  core_count: 80
+  fpus_per_core: 64
+  block_dim_limit_x: 2147483647
+  block_dim_limit_y: 65535
+  block_dim_limit_z: 65535
+  memory_bandwidth: 898048000000
+  l2_cache_size: 6291456
+  clock_rate_ghz: 1.53
+  device_memory_size: 34072559616
+  shared_memory_per_block_optin: 98304
+  cuda_compute_capability {
+    major: 7
+  }
+  registers_per_core_limit: 65536
+  registers_per_block_limit: 65536
+}
+platform_name: "CUDA"
+dnn_version_info {
+  major: 9
+  minor: 3
+}
+device_description_str: "Tesla V100-SXM2-32GB"
+)";
+
+TEST(PJRTGpuDeviceTopologyTest, CreateExplicitGpuTopologyAndTargetConfig) {
+  auto pjrt_api = gpu_plugin::GetGpuPjrtApi();
+
+  absl::flat_hash_map<std::string, xla::PjRtValueType> options = {
+      {"topology", static_cast<std::string>("16 x 2 x 4")},
+      {"target_config", static_cast<std::string>(kTargetConfigString)}};
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<PJRT_NamedValue> c_options,
+                          ::pjrt::ConvertToPjRtNamedValueList(options));
+
+  PJRT_TopologyDescription_Create_Args args;
+  args.struct_size = PJRT_TopologyDescription_Create_Args_STRUCT_SIZE;
+  args.extension_start = nullptr;
+  args.topology = nullptr;
+  args.num_options = c_options.size();
+  args.create_options = c_options.data();
+
+  PJRT_Error* error = pjrt_api->PJRT_TopologyDescription_Create(&args);
+  EXPECT_EQ(error, nullptr) << error->status.message();
+
+  auto pjrt_topology =
+      reinterpret_cast<const PJRT_TopologyDescription*>(args.topology);
+  ASSERT_NE(pjrt_topology, nullptr);
+
+  EXPECT_EQ(pjrt_topology->topology->platform_id(), xla::CudaId());
+  EXPECT_EQ(pjrt_topology->topology->platform_name(), xla::CudaName());
+
+  EXPECT_EQ(pjrt_topology->topology->ProcessCount().value(), 16 * 2);
+  EXPECT_EQ(pjrt_topology->topology->DeviceDescriptions().size(), 16 * 2 * 4);
+  EXPECT_EQ(pjrt_topology->topology->DeviceDescriptions()[0]->device_kind(),
+            "Tesla V100-SXM2-32GB");
+
+  PJRT_TopologyDescription_Destroy_Args destroy_args;
+  destroy_args.struct_size = PJRT_TopologyDescription_Destroy_Args_STRUCT_SIZE;
+  destroy_args.extension_start = nullptr;
+  destroy_args.topology = const_cast<PJRT_TopologyDescription*>(pjrt_topology);
+  PJRT_Error* destroy_error =
+      pjrt_api->PJRT_TopologyDescription_Destroy(&destroy_args);
+  EXPECT_EQ(destroy_error, nullptr) << destroy_error->status.message();
+}
+
+TEST(PJRTGpuDeviceTopologyTest, CreateExplicitGpuTopology) {
+  auto pjrt_api = gpu_plugin::GetGpuPjrtApi();
+
+  absl::flat_hash_map<std::string, xla::PjRtValueType> options = {
+      {"topology", static_cast<std::string>("16 x 2 x 4")}};
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<PJRT_NamedValue> c_options,
+                          ::pjrt::ConvertToPjRtNamedValueList(options));
+
+  PJRT_TopologyDescription_Create_Args args;
+  args.struct_size = PJRT_TopologyDescription_Create_Args_STRUCT_SIZE;
+  args.extension_start = nullptr;
+  args.topology = nullptr;
+  args.num_options = c_options.size();
+  args.create_options = c_options.data();
+
+  PJRT_Error* error = pjrt_api->PJRT_TopologyDescription_Create(&args);
+  EXPECT_EQ(error, nullptr) << error->status.message();
+
+  auto pjrt_topology =
+      reinterpret_cast<const PJRT_TopologyDescription*>(args.topology);
+  ASSERT_NE(pjrt_topology, nullptr);
+
+  EXPECT_EQ(pjrt_topology->topology->ProcessCount().value(), 16 * 2);
+  EXPECT_EQ(pjrt_topology->topology->DeviceDescriptions().size(), 16 * 2 * 4);
+
+  PJRT_TopologyDescription_Destroy_Args destroy_args;
+  destroy_args.struct_size = PJRT_TopologyDescription_Destroy_Args_STRUCT_SIZE;
+  destroy_args.extension_start = nullptr;
+  destroy_args.topology = const_cast<PJRT_TopologyDescription*>(pjrt_topology);
+  PJRT_Error* destroy_error =
+      pjrt_api->PJRT_TopologyDescription_Destroy(&destroy_args);
+  EXPECT_EQ(destroy_error, nullptr) << destroy_error->status.message();
+}
+
 void TestCustomCallV2() {}
 
 TEST(PjrtCApiGpuExtensionTest, CustomCallUntyped) {
