PR #16913: [PJRT:GPU] Enable creating topology without a GPU device
Imported from GitHub PR #16913

Currently, PJRT_TopologyDescription_Create always creates the topology from the local client, which requires a local GPU device.

This patch allows the caller to specify the topology shape and device configuration explicitly in the PJRT_TopologyDescription_Create call, without querying a local client. This enables deviceless compilation.
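
For illustration, here is a minimal sketch of the new call pattern, mirroring the tests added in this change. The `api` pointer and the `target_config_textproto` string are assumptions for the example, not part of the patch; the topology string follows the shape `<num-slices>x<num-hosts-per-slice>x<num-devices-per-host>`.

```cpp
// Sketch only: create a GPU topology description without a local GPU device.
// Assumes `api` is a PJRT_Api* obtained from the GPU plugin and
// `target_config_textproto` holds a GpuTargetConfigProto in text format.
absl::flat_hash_map<std::string, xla::PjRtValueType> options = {
    // 2 slices, 2 hosts per slice, 8 devices per host.
    {"topology", static_cast<std::string>("2x2x8")},
    {"target_config", static_cast<std::string>(target_config_textproto)}};
std::vector<PJRT_NamedValue> c_options =
    pjrt::ConvertToPjRtNamedValueList(options).value();

PJRT_TopologyDescription_Create_Args args;
args.struct_size = PJRT_TopologyDescription_Create_Args_STRUCT_SIZE;
args.extension_start = nullptr;
args.topology = nullptr;
args.num_options = c_options.size();
args.create_options = c_options.data();
// With both options present, no local client or GPU device is queried.
PJRT_Error* error = api->PJRT_TopologyDescription_Create(&args);
```
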
Copybara import of the project:

--
bc85038 by Jaroslav Sevcik <[email protected]>:

[PJRT:GPU] Enable creating topology without a GPU device

--
e4eb44e by Jaroslav Sevcik <[email protected]>:

Enable overlaying topology on local device

--
ff25a57 by Jaroslav Sevcik <[email protected]>:

Address reviewer comments

--
8bfb7a2 by Jaroslav Sevcik <[email protected]>:

Cleanup

Merging this change closes #16913

COPYBARA_INTEGRATE_REVIEW=#16913 from jaro-sevcik:deviceless-topology-creation 8bfb7a2
PiperOrigin-RevId: 679047171
jaro-sevcik authored and Google-ML-Automation committed Sep 26, 2024
1 parent 69d0f6d commit c3e0914
Showing 2 changed files with 208 additions and 19 deletions.
121 changes: 102 additions & 19 deletions xla/pjrt/c/pjrt_c_api_gpu_internal.cc
@@ -167,33 +167,81 @@ PJRT_Error* PJRT_ExecuteContext_Create(PJRT_ExecuteContext_Create_Args* args) {
   return nullptr;
 }
 
-PJRT_Error* PJRT_GpuDeviceTopology_Create(
-    PJRT_TopologyDescription_Create_Args* args) {
-  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
-      "PJRT_TopologyDescription_Create_Args",
-      PJRT_TopologyDescription_Create_Args_STRUCT_SIZE, args->struct_size));
+namespace {
 
-  PJRT_ASSIGN_OR_RETURN(xla::LocalClient * xla_client,
-                        xla::GetGpuXlaClient(/*platform_name=*/std::nullopt,
-                                             /*allowed_devices=*/std::nullopt));
+struct TargetConfigAndDevices {
+  stream_executor::GpuTargetConfigProto target_config_proto;
+  std::vector<int> device_ids;
+};
+
+// Parses the 'target_config' entry in 'options'. The option is
+// parsed as GpuTargetConfigProto. If there is no 'target_config' in
+// 'options', the function falls back to creating a local client,
+// returning the local client's target config.
+absl::StatusOr<TargetConfigAndDevices> GetTargetConfigFromOptions(
+    const absl::flat_hash_map<std::string, xla::PjRtValueType>& options) {
+  if (auto target_config_it = options.find("target_config");
+      target_config_it != options.end()) {
+    std::string target_config_proto_string =
+        std::get<std::string>(target_config_it->second);
+    stream_executor::GpuTargetConfigProto target_config_proto;
+    if (!tsl::protobuf::TextFormat::ParseFromString(target_config_proto_string,
+                                                    &target_config_proto)) {
+      return absl::FailedPreconditionError(
+          "Failed to parse GpuTargetConfigProto "
+          "from the 'target_config' parameter.");
+    }
+    return {{target_config_proto, {}}};
+  }
+  TF_ASSIGN_OR_RETURN(xla::LocalClient * xla_client,
+                      xla::GetGpuXlaClient(/*platform_name=*/std::nullopt,
+                                           /*allowed_devices=*/std::nullopt));
   stream_executor::StreamExecutor* executor =
       xla_client->backend().default_stream_executor();
-  const stream_executor::DeviceDescription& description =
-      executor->GetDeviceDescription();
   std::vector<int> device_ids;
   device_ids.reserve(xla_client->backend().stream_executors().size());
   for (stream_executor::StreamExecutor* executor :
        xla_client->backend().stream_executors()) {
     device_ids.push_back(executor->device_ordinal());
   }
   auto gpu_target_config = xla::Compiler::TargetConfig(executor);
-  // TODO(b/341334898): Create a single-host GPU topology. Will be updated for
-  // multi-host support in the future.
-  auto gpu_topology = std::make_shared<const xla::GpuTopology>(
-      device_ids, description.name(),
-      /*num_slices=*/1,
-      /*num_hosts_per_slice=*/1,
-      /*num_devices_per_host=*/device_ids.size());
+  return {{gpu_target_config.ToProto(), device_ids}};
+}
+
+struct TopologySizes {
+  int num_slices = 0;
+  int num_hosts_per_slice = 0;
+  int num_devices_per_host = 0;
+
+  int GetDeviceCount() {
+    return num_slices * num_hosts_per_slice * num_devices_per_host;
+  }
+
+  static absl::StatusOr<TopologySizes> FromString(
+      std::string_view topology_string) {
+    TopologySizes sizes;
+    std::vector<std::string> topology_components =
+        absl::StrSplit(topology_string, 'x');
+    if (topology_components.size() != 3 ||
+        !absl::SimpleAtoi(topology_components[0], &sizes.num_slices) ||
+        !absl::SimpleAtoi(topology_components[1], &sizes.num_hosts_per_slice) ||
+        !absl::SimpleAtoi(topology_components[2],
+                          &sizes.num_devices_per_host)) {
+      return absl::InternalError(
+          "topology must be of shape "
+          "\"<num-slices>x<num-hosts-per-slice>x<num-devices-per-host>\"");
+    }
+    return sizes;
+  }
+};
+
+}  // namespace
+
+PJRT_Error* PJRT_GpuDeviceTopology_Create(
+    PJRT_TopologyDescription_Create_Args* args) {
+  PJRT_RETURN_IF_ERROR(ActualStructSizeIsGreaterOrEqual(
+      "PJRT_TopologyDescription_Create_Args",
+      PJRT_TopologyDescription_Create_Args_STRUCT_SIZE, args->struct_size));
 
   // Determine the platform ID and name based on the platform.
   xla::PjRtPlatformId platform_id =
@@ -203,12 +251,47 @@ PJRT_Error* PJRT_GpuDeviceTopology_Create(
       (std::string(PJRT_GPU_PLUGIN_PLATFORM_NAME) == "ROCM") ? xla::RocmName()
                                                              : xla::CudaName();
 
+  absl::flat_hash_map<std::string, xla::PjRtValueType> create_options =
+      pjrt::ConvertFromPjRtNamedValueList(args->create_options,
+                                          args->num_options);
+
+  PJRT_ASSIGN_OR_RETURN(TargetConfigAndDevices target_config_and_devices,
+                        GetTargetConfigFromOptions(create_options));
+
+  std::vector<int>& device_ids = target_config_and_devices.device_ids;
+  stream_executor::GpuTargetConfigProto& target_config_proto =
+      target_config_and_devices.target_config_proto;
+  TopologySizes sizes{1, 1, static_cast<int>(device_ids.size())};
+
+  if (auto topology_it = create_options.find("topology");
+      topology_it != create_options.end()) {
+    std::string topology_string = std::get<std::string>(topology_it->second);
+    PJRT_ASSIGN_OR_RETURN(sizes, TopologySizes::FromString(topology_string));
+  }
+
+  if (sizes.GetDeviceCount() == 0) {
+    // If the user did not specify the topology and we did not
+    // get any devices from the client, then error out because
+    // we do not know how many devices the topology should have.
+    return new PJRT_Error{absl::FailedPreconditionError(
+        "Cannot create topology without an explicit topology shape or without "
+        "a client")};
+  }
+
+  if (sizes.GetDeviceCount() != device_ids.size()) {
+    device_ids.resize(sizes.GetDeviceCount());
+    absl::c_iota(device_ids, 0);
+  }
+
+  auto gpu_topology = std::make_shared<const xla::GpuTopology>(
+      device_ids, target_config_proto.device_description_str(),
+      sizes.num_slices, sizes.num_hosts_per_slice, sizes.num_devices_per_host);
+
   auto pjrt_topology =
       std::make_unique<xla::StreamExecutorGpuTopologyDescription>(
           platform_id, platform_name, std::move(gpu_topology),
           absl::flat_hash_map<std::string, xla::PjRtDeviceAttribute>{
-              {"target_config",
-               gpu_target_config.ToProto().SerializeAsString()}});
+              {"target_config", target_config_proto.SerializeAsString()}});
   args->topology = CreateWrapperDeviceTopology(std::move(pjrt_topology));
   return nullptr;
 }
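
To make the new options concrete: a topology string such as "16x2x4" parses into num_slices = 16, num_hosts_per_slice = 2, and num_devices_per_host = 4, so GetDeviceCount() returns 16 * 2 * 4 = 128. With no local client contributing device IDs, device_ids is resized to 128 and filled with consecutive IDs starting at 0. The snippet below is a standalone illustration of that arithmetic, not code from this patch:

```cpp
// Standalone sketch of the "topology" option semantics; the shape values
// ("16x2x4") are illustrative assumptions.
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  // "16x2x4" => <num-slices> x <num-hosts-per-slice> x <num-devices-per-host>.
  const int num_slices = 16;
  const int num_hosts_per_slice = 2;
  const int num_devices_per_host = 4;
  const int device_count =
      num_slices * num_hosts_per_slice * num_devices_per_host;

  // Mirrors the resize + iota step: IDs 0..127 when the client has no devices.
  std::vector<int> device_ids(device_count);
  std::iota(device_ids.begin(), device_ids.end(), 0);

  // 128 devices spread over 16 * 2 = 32 host processes.
  std::printf("devices=%d processes=%d first=%d last=%d\n", device_count,
              num_slices * num_hosts_per_slice, device_ids.front(),
              device_ids.back());
  return 0;
}
```
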
106 changes: 106 additions & 0 deletions xla/pjrt/c/pjrt_c_api_gpu_test.cc
@@ -427,6 +427,8 @@ TEST(PJRTGpuDeviceTopologyTest, CreateGpuTopology) {
   args.struct_size = PJRT_TopologyDescription_Create_Args_STRUCT_SIZE;
   args.extension_start = nullptr;
   args.topology = nullptr;
+  args.num_options = 0;
+  args.create_options = nullptr;
 
   PJRT_Error* error = pjrt_api->PJRT_TopologyDescription_Create(&args);
   EXPECT_EQ(error, nullptr) << error->status.message();
@@ -452,6 +454,110 @@ TEST(PJRTGpuDeviceTopologyTest, CreateGpuTopology) {
   EXPECT_EQ(destroy_error, nullptr) << destroy_error->status.message();
 }
 
+constexpr char const* kTargetConfigString = R"(gpu_device_info {
+  threads_per_block_limit: 1024
+  threads_per_warp: 32
+  shared_memory_per_block: 49152
+  shared_memory_per_core: 98304
+  threads_per_core_limit: 2048
+  core_count: 80
+  fpus_per_core: 64
+  block_dim_limit_x: 2147483647
+  block_dim_limit_y: 65535
+  block_dim_limit_z: 65535
+  memory_bandwidth: 898048000000
+  l2_cache_size: 6291456
+  clock_rate_ghz: 1.53
+  device_memory_size: 34072559616
+  shared_memory_per_block_optin: 98304
+  cuda_compute_capability {
+    major: 7
+  }
+  registers_per_core_limit: 65536
+  registers_per_block_limit: 65536
+}
+platform_name: "CUDA"
+dnn_version_info {
+  major: 9
+  minor: 3
+}
+device_description_str: "Tesla V100-SXM2-32GB"
+)";
+
+TEST(PJRTGpuDeviceTopologyTest, CreateExplicitGpuTopologyAndTargetConfig) {
+  auto pjrt_api = gpu_plugin::GetGpuPjrtApi();
+
+  absl::flat_hash_map<std::string, xla::PjRtValueType> options = {
+      {"topology", static_cast<std::string>("16 x 2 x 4")},
+      {"target_config", static_cast<std::string>(kTargetConfigString)}};
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<PJRT_NamedValue> c_options,
+                          ::pjrt::ConvertToPjRtNamedValueList(options));
+
+  PJRT_TopologyDescription_Create_Args args;
+  args.struct_size = PJRT_TopologyDescription_Create_Args_STRUCT_SIZE;
+  args.extension_start = nullptr;
+  args.topology = nullptr;
+  args.num_options = c_options.size();
+  args.create_options = c_options.data();
+
+  PJRT_Error* error = pjrt_api->PJRT_TopologyDescription_Create(&args);
+  EXPECT_EQ(error, nullptr) << error->status.message();
+
+  auto pjrt_topology =
+      reinterpret_cast<const PJRT_TopologyDescription*>(args.topology);
+  ASSERT_NE(pjrt_topology, nullptr);
+
+  EXPECT_EQ(pjrt_topology->topology->platform_id(), xla::CudaId());
+  EXPECT_EQ(pjrt_topology->topology->platform_name(), xla::CudaName());
+
+  EXPECT_EQ(pjrt_topology->topology->ProcessCount().value(), 16 * 2);
+  EXPECT_EQ(pjrt_topology->topology->DeviceDescriptions().size(), 16 * 2 * 4);
+  EXPECT_EQ(pjrt_topology->topology->DeviceDescriptions()[0]->device_kind(),
+            "Tesla V100-SXM2-32GB");
+
+  PJRT_TopologyDescription_Destroy_Args destroy_args;
+  destroy_args.struct_size = PJRT_TopologyDescription_Destroy_Args_STRUCT_SIZE;
+  destroy_args.extension_start = nullptr;
+  destroy_args.topology = const_cast<PJRT_TopologyDescription*>(pjrt_topology);
+  PJRT_Error* destroy_error =
+      pjrt_api->PJRT_TopologyDescription_Destroy(&destroy_args);
+  EXPECT_EQ(destroy_error, nullptr) << destroy_error->status.message();
+}
+
+TEST(PJRTGpuDeviceTopologyTest, CreateExplicitGpuTopology) {
+  auto pjrt_api = gpu_plugin::GetGpuPjrtApi();
+
+  absl::flat_hash_map<std::string, xla::PjRtValueType> options = {
+      {"topology", static_cast<std::string>("16 x 2 x 4")}};
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<PJRT_NamedValue> c_options,
+                          ::pjrt::ConvertToPjRtNamedValueList(options));
+
+  PJRT_TopologyDescription_Create_Args args;
+  args.struct_size = PJRT_TopologyDescription_Create_Args_STRUCT_SIZE;
+  args.extension_start = nullptr;
+  args.topology = nullptr;
+  args.num_options = c_options.size();
+  args.create_options = c_options.data();
+
+  PJRT_Error* error = pjrt_api->PJRT_TopologyDescription_Create(&args);
+  EXPECT_EQ(error, nullptr) << error->status.message();
+
+  auto pjrt_topology =
+      reinterpret_cast<const PJRT_TopologyDescription*>(args.topology);
+  ASSERT_NE(pjrt_topology, nullptr);
+
+  EXPECT_EQ(pjrt_topology->topology->ProcessCount().value(), 16 * 2);
+  EXPECT_EQ(pjrt_topology->topology->DeviceDescriptions().size(), 16 * 2 * 4);
+
+  PJRT_TopologyDescription_Destroy_Args destroy_args;
+  destroy_args.struct_size = PJRT_TopologyDescription_Destroy_Args_STRUCT_SIZE;
+  destroy_args.extension_start = nullptr;
+  destroy_args.topology = const_cast<PJRT_TopologyDescription*>(pjrt_topology);
+  PJRT_Error* destroy_error =
+      pjrt_api->PJRT_TopologyDescription_Destroy(&destroy_args);
+  EXPECT_EQ(destroy_error, nullptr) << destroy_error->status.message();
+}
+
 void TestCustomCallV2() {}
 
 TEST(PjrtCApiGpuExtensionTest, CustomCallUntyped) {
