Move down_cast from the tensorflow to the tsl namespace
No functionality changes are intended

PiperOrigin-RevId: 568886635
majnemer authored and Google-ML-Automation committed Nov 15, 2024
1 parent 5b847d8 commit 432b67e
Showing 38 changed files with 166 additions and 176 deletions.
6 changes: 4 additions & 2 deletions third_party/tsl/tsl/platform/default/casts.h
@@ -13,14 +13,16 @@ See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// IWYU pragma: private, include "tsl/platform/casts.h"

#ifndef TENSORFLOW_TSL_PLATFORM_DEFAULT_CASTS_H_
#define TENSORFLOW_TSL_PLATFORM_DEFAULT_CASTS_H_

#include <assert.h> // for use with down_cast<>

#include <type_traits>

namespace tensorflow {
namespace tsl {

// An "upcast", i.e. a conversion from a pointer to an object to a pointer to a
// base subobject, always succeeds if the base is unambiguous and accessible,
@@ -87,6 +89,6 @@ inline To down_cast(From& f) {
return static_cast<To>(f);
}

} // namespace tensorflow
} // namespace tsl

#endif // TENSORFLOW_TSL_PLATFORM_DEFAULT_CASTS_H_
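
For orientation, a minimal usage sketch with the new spelling (Base, Derived, and UseDownCast are illustrative names, not part of this commit; the pointer overload of down_cast is assumed to keep its existing debug-only sanity check):

    #include "tsl/platform/casts.h"  // IWYU-preferred include for down_cast

    struct Base { virtual ~Base() = default; };
    struct Derived : Base { int value = 42; };

    int UseDownCast() {
      Derived d;
      Base* base_ptr = &d;  // static type Base*, dynamic type Derived
      Base& base_ref = d;
      // Call sites change only the namespace qualifier:
      //   tensorflow::down_cast<T>(...)  ->  tsl::down_cast<T>(...)
      Derived* dp = tsl::down_cast<Derived*>(base_ptr);  // pointer overload
      Derived& dr = tsl::down_cast<Derived&>(base_ref);  // reference overload shown above
      return dp->value + dr.value;
    }

The call-site edits in the files below follow exactly this pattern; the cast semantics are unchanged.
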
1 change: 1 addition & 0 deletions xla/pjrt/BUILD
@@ -911,6 +911,7 @@ xla_cc_test(
"//xla/hlo/parser:hlo_parser",
"//xla/pjrt/cpu:cpu_client",
"@com_google_googletest//:gtest_main",
"@tsl//tsl/platform:casts",
"@tsl//tsl/platform:env",
"@tsl//tsl/platform:test",
],
2 changes: 1 addition & 1 deletion xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
@@ -1676,7 +1676,7 @@ PJRT_Error* PJRT_Buffer_GetMemoryLayout(
std::unique_ptr<xla::PjRtLayout> pjrt_layout =
args->buffer->buffer->layout();
xla::PjRtXlaLayout* pjrt_xla_layout =
tensorflow::down_cast<xla::PjRtXlaLayout*>(pjrt_layout.get());
tsl::down_cast<xla::PjRtXlaLayout*>(pjrt_layout.get());
CHECK(pjrt_xla_layout != nullptr) << "Got unexpected layout type";
const xla::Layout& xla_layout = pjrt_xla_layout->xla_layout();

40 changes: 19 additions & 21 deletions xla/pjrt/cpu/cpu_client.cc
@@ -145,9 +145,9 @@ absl::StatusOr<std::unique_ptr<TfrtCpuBuffer>> AllocateDestinationBufferAndAvs(
// buffer.
absl::InlinedVector<tsl::AsyncValueRef<CpuEvent>, 4> definition_events;
AbstractTfrtCpuBuffer::AllocateAvsAndEvents(shape, avs, &definition_events);
return AllocateDestinationBuffer(
shape, std::move(definition_events),
tensorflow::down_cast<TfrtCpuDevice*>(device), client);
return AllocateDestinationBuffer(shape, std::move(definition_events),
tsl::down_cast<TfrtCpuDevice*>(device),
client);
}

const char kCpuPlatformName[] = "cpu";
@@ -469,7 +469,7 @@ TfrtCpuClient::TfrtCpuClient(
// do not promise that memory space ids and device ids are the same.
const int id = device->id();
auto memory_space = std::make_unique<UnpinnedHostMemorySpace>(id, device);
tensorflow::down_cast<TfrtCpuDevice*>(device)->AttachMemorySpace(
tsl::down_cast<TfrtCpuDevice*>(device)->AttachMemorySpace(
memory_space.get());
memory_spaces_.push_back(memory_space.get());
owned_memory_spaces_.push_back(std::move(memory_space));
@@ -642,7 +642,7 @@ TfrtCpuClient::DeserializeExecutable(absl::string_view serialized,
&num_replicas, &num_partitions, &device_assignment));

auto cpu_executable_ptr =
tensorflow::down_cast<cpu::CpuExecutable*>(executable.get());
tsl::down_cast<cpu::CpuExecutable*>(executable.get());

// `buffer_table[result_slice.index()]` points to result buffer:
// If output is a tuple, it points to the buffer index table.
@@ -845,7 +845,7 @@ absl::StatusOr<std::unique_ptr<PjRtLoadedExecutable>> TfrtCpuClient::Compile(
eigen_intraop_device()->getPool()->NumThreads(),
customize_hlo_module_config_));
auto cpu_executable_ptr =
tensorflow::down_cast<cpu::CpuExecutable*>(cpu_executable.get());
tsl::down_cast<cpu::CpuExecutable*>(cpu_executable.get());

// `buffer_table[result_slice.index()]` points to result buffer:
// If output is a tuple, it points to the buffer index table.
@@ -921,8 +921,7 @@ TfrtCpuClient::CreateViewOfDeviceBuffer(
std::move(on_delete_callback));
return std::unique_ptr<PjRtBuffer>(std::make_unique<TfrtCpuBuffer>(
shape, std::move(tracked_device_buffer), this,
tensorflow::down_cast<TfrtCpuDevice*>(device),
*device->default_memory_space()));
tsl::down_cast<TfrtCpuDevice*>(device), *device->default_memory_space()));
}

absl::StatusOr<std::unique_ptr<PjRtBuffer>> TfrtCpuClient::CreateErrorBuffer(
@@ -938,7 +937,7 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> TfrtCpuClient::CreateErrorBuffer(
absl::InlinedVector<tsl::AsyncValueRef<CpuEvent>, 4>{
tsl::AsyncValueRef<CpuEvent>(
tsl::MakeErrorAsyncValueRef(std::move(error)))}),
this, tensorflow::down_cast<TfrtCpuDevice*>(device),
this, tsl::down_cast<TfrtCpuDevice*>(device),
*device->default_memory_space());
}

@@ -953,15 +952,15 @@ TfrtCpuClient::CreateUninitializedBuffer(const Shape& shape,
tsl::profiler::TraceMe traceme("TfrtCpuClient::CreateUninitializedBuffer");
VLOG(1) << "TfrtCpuClient::CreateUninitializedBuffer: shape: "
<< shape.DebugString() << " device: " << device->DebugString();
return AllocateDestinationBuffer(
shape, /*definition_events=*/{},
tensorflow::down_cast<TfrtCpuDevice*>(device), this);
return AllocateDestinationBuffer(shape, /*definition_events=*/{},
tsl::down_cast<TfrtCpuDevice*>(device),
this);
}

absl::StatusOr<std::unique_ptr<PjRtClient::AsyncHostToDeviceTransferManager>>
TfrtCpuClient::CreateBuffersForAsyncHostToDevice(absl::Span<const Shape> shapes,
PjRtDevice* device) {
auto* tfrt_device = tensorflow::down_cast<TfrtCpuDevice*>(device);
auto* tfrt_device = tsl::down_cast<TfrtCpuDevice*>(device);
return TfrtCpuAsyncHostToDeviceTransferManager::Create(shapes, tfrt_device,
this);
}
@@ -997,8 +996,7 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> TfrtCpuClient::BufferFromHostBuffer(

return std::unique_ptr<PjRtBuffer>(std::make_unique<TfrtCpuBuffer>(
shape, std::move(tracked_device_buffer), this,
tensorflow::down_cast<TfrtCpuDevice*>(device),
*device->default_memory_space()));
tsl::down_cast<TfrtCpuDevice*>(device), *device->default_memory_space()));
}

absl::StatusOr<std::unique_ptr<PjRtBuffer>> TfrtCpuClient::BufferFromHostBuffer(
@@ -1044,7 +1042,7 @@ TfrtCpuClient::BufferFromHostLiteral(const LiteralSlice& literal,
TF_ASSIGN_OR_RETURN(
std::unique_ptr<TfrtCpuBuffer> output_buffer,
AllocateDestinationBufferAndAvs(
shape, &avs, tensorflow::down_cast<TfrtCpuDevice*>(device), this));
shape, &avs, tsl::down_cast<TfrtCpuDevice*>(device), this));

output_buffer->CopyFromLiteral(literal, shape, &avs, async_work_runner());

@@ -1119,7 +1117,7 @@ absl::StatusOr<std::unique_ptr<PjRtBuffer>> TfrtCpuBuffer::CopyToDevice(

return std::unique_ptr<PjRtBuffer>(std::make_unique<TfrtCpuBuffer>(
on_device_shape_, std::move(tracked_device_buffer), client(),
tensorflow::down_cast<TfrtCpuDevice*>(dst_device),
tsl::down_cast<TfrtCpuDevice*>(dst_device),
*dst_device->default_memory_space()));
}

@@ -1382,7 +1380,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtCpuExecutable::ExecuteHelper(
PjRtGlobalDeviceId global_device_id(device_id);
TF_ASSIGN_OR_RETURN(PjRtDevice * pjrt_device,
client_->LookupDevice(global_device_id));
device = tensorflow::down_cast<TfrtCpuDevice*>(pjrt_device);
device = tsl::down_cast<TfrtCpuDevice*>(pjrt_device);
device_assignment = device_assignment_;
} else {
CHECK(device_assignment_ == nullptr);
@@ -1435,7 +1433,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtCpuExecutable::ExecuteHelper(
donation_clashes.reserve(argument_handles.size());
for (int i = 0; i < argument_handles.size(); ++i) {
PjRtBuffer* handle = argument_handles[i];
auto* tfrt_buffer = tensorflow::down_cast<TfrtCpuBuffer*>(handle);
auto* tfrt_buffer = tsl::down_cast<TfrtCpuBuffer*>(handle);
if (tfrt_buffer->device() != device) {
return InvalidArgument(
"Buffer passed to Execute() as argument %d to replica %d is on "
@@ -1522,7 +1520,7 @@ absl::StatusOr<PjRtLoadedExecutable::Result> TfrtCpuExecutable::ExecuteHelper(
}

auto* cpu_executable =
tensorflow::down_cast<cpu::CpuExecutable*>(cpu_executable_.get());
tsl::down_cast<cpu::CpuExecutable*>(cpu_executable_.get());
// `buffer_alloc` and `buffer_alloc_and_copy` are used to do real memory
// allocation and copy work.
BufferAlloc buffer_alloc;
@@ -2090,7 +2088,7 @@ TfrtCpuExecutable::ExecutePortable(
/*replica=*/0,
/*partition=*/0, RunId(), options,
/*last_collective_launch_event=*/tsl::AsyncValueRef<CpuEvent>(),
fill_future, tensorflow::down_cast<TfrtCpuDevice*>(device)));
fill_future, tsl::down_cast<TfrtCpuDevice*>(device)));
returned_future = std::move(result.future);
return std::move(result.buffers);
}
24 changes: 11 additions & 13 deletions xla/pjrt/gpu/se_gpu_pjrt_client.cc
@@ -174,8 +174,7 @@ class AsyncHostToDeviceTransferManager
// buffers, because the invariants of this class ensure that the buffer
// definition event will not fire until after all of this class' uses of
// the TrackedDeviceBuffer have completed.
auto* se_buffer =
tensorflow::down_cast<PjRtStreamExecutorBuffer*>(buffer.get());
auto* se_buffer = tsl::down_cast<PjRtStreamExecutorBuffer*>(buffer.get());
DCHECK(se_buffer);
auto hold = se_buffer->GetBufferWithUsageHold();
buffer_ptrs.push_back(hold.buffer());
@@ -243,7 +242,7 @@ class AsyncHostToDeviceTransferManager
"AsyncHostToDeviceTransferManager::TransferLiteralToBuffer");
auto* stream = device_->local_device_state()->host_to_device_stream();
auto* se_client =
tensorflow::down_cast<PjRtStreamExecutorClient*>(device_->client());
tsl::down_cast<PjRtStreamExecutorClient*>(device_->client());
DCHECK(se_client);

TransferManager* transfer_manager =
@@ -330,8 +329,7 @@ class AsyncHostToDeviceTransferManager
bool is_last_transfer, absl::AnyInvocable<void() &&> on_done) override {
auto* stream = device_->local_device_state()->host_to_device_stream();

auto* client =
tensorflow::down_cast<PjRtStreamExecutorClient*>(device_->client());
auto* client = tsl::down_cast<PjRtStreamExecutorClient*>(device_->client());
bool should_stage_host_to_device_transfers =
client->should_stage_host_to_device_transfers();
std::shared_ptr<void> staging_buffer;
@@ -516,12 +514,12 @@ StreamExecutorGpuClient::StreamExecutorGpuClient(
const int id = device->id();
auto memory_space =
std::make_unique<StreamExecutorGpuHbmMemorySpace>(id, device);
tensorflow::down_cast<PjRtStreamExecutorDevice*>(device)->AttachMemorySpace(
tsl::down_cast<PjRtStreamExecutorDevice*>(device)->AttachMemorySpace(
memory_space.get());
owned_memory_spaces_.push_back(std::move(memory_space));
auto pinned =
std::make_unique<PinnedHostMemorySpace>(basePinnedId + id, device);
tensorflow::down_cast<PjRtStreamExecutorDevice*>(device)->AttachMemorySpace(
tsl::down_cast<PjRtStreamExecutorDevice*>(device)->AttachMemorySpace(
pinned.get());
owned_memory_spaces_.push_back(std::move(pinned));
}
@@ -558,7 +556,7 @@ StreamExecutorGpuClient::CreateBuffersForAsyncHostToDevice(
std::optional<absl::Span<const std::optional<Layout>>> device_layouts,
PjRtDevice* device) {
auto* stream_executor_device =
tensorflow::down_cast<PjRtStreamExecutorDevice*>(device);
tsl::down_cast<PjRtStreamExecutorDevice*>(device);
return xla::AsyncHostToDeviceTransferManager::Create(
shape_specs, std::move(device_layouts), stream_executor_device, this,
/*memory_space=*/nullptr);
@@ -586,7 +584,7 @@ StreamExecutorGpuClient::CreateBuffersForAsyncHostToDevice(
CHECK_EQ(memory_space->devices().size(), 1);
PjRtDevice* device = memory_space->devices()[0];
auto* stream_executor_device =
tensorflow::down_cast<PjRtStreamExecutorDevice*>(device);
tsl::down_cast<PjRtStreamExecutorDevice*>(device);
return xla::AsyncHostToDeviceTransferManager::Create(
shape_specs, std::move(device_layouts), stream_executor_device, this,
memory_space);
@@ -624,7 +622,7 @@ StreamExecutorGpuClient::GetDefaultDeviceAssignment(int num_replicas,
PjRtFuture<> StreamExecutorGpuClient::CopyRawSubBufferToHost(
PjRtBuffer* pjrt_buffer, PjRtFuture<void*> dst, int64_t offset,
int64_t transfer_size) {
auto* buffer = tensorflow::down_cast<PjRtStreamExecutorBuffer*>(pjrt_buffer);
auto* buffer = tsl::down_cast<PjRtStreamExecutorBuffer*>(pjrt_buffer);
DCHECK(buffer);
PjRtStreamExecutorDevice* device = buffer->device();
LocalDeviceState* local_device = device->local_device_state();
@@ -785,7 +783,7 @@ StreamExecutorGpuClient::Compile(const XlaComputation& computation,
#if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM)
for (const PjRtDevice* device : addressable_devices()) {
LocalDeviceState* local_device_state =
tensorflow::down_cast<const PjRtStreamExecutorDevice*>(device)
tsl::down_cast<const PjRtStreamExecutorDevice*>(device)
->local_device_state();
int64_t free_memory, total_memory;
if (local_device_state != nullptr) {
@@ -813,7 +811,7 @@ StreamExecutorGpuClient::LoadSerialized(absl::string_view serialized,
absl::StatusOr<std::unique_ptr<PjRtLoadedExecutable>>
StreamExecutorGpuClient::Load(std::unique_ptr<PjRtExecutable> executable) {
auto se_executable = absl::WrapUnique(
tensorflow::down_cast<StreamExecutorExecutable*>(executable.release()));
tsl::down_cast<StreamExecutorExecutable*>(executable.release()));

CompileOptions compile_options = se_executable->compile_options();
CompileOptions input_options = compile_options;
@@ -1214,7 +1212,7 @@ absl::StatusOr<tsl::AllocatorStats> StreamExecutorGpuDevice::GetAllocatorStats()
}

auto* allocator_adapter = dynamic_cast<se::MultiDeviceAdapter*>(
tensorflow::down_cast<PjRtStreamExecutorClient*>(client())->allocator());
tsl::down_cast<PjRtStreamExecutorClient*>(client())->allocator());
if (!allocator_adapter) {
return Unimplemented(
"GetAllocatorStats() is only implemented with MultiDeviceAdapter "
2 changes: 1 addition & 1 deletion xla/pjrt/gpu/se_gpu_pjrt_client.h
@@ -245,7 +245,7 @@ class StreamExecutorGpuClient : public xla::PjRtStreamExecutorClient {
std::unique_ptr<PjRtExecutable> executable,
const LoadOptions& load_options) override {
return absl::WrapUnique<PjRtLoadedExecutable>(
tensorflow::down_cast<PjRtLoadedExecutable*>(executable.release()));
tsl::down_cast<PjRtLoadedExecutable*>(executable.release()));
}

// TODO(b/296466237): Unify `Load` method after (de)serialization and tests on
2 changes: 1 addition & 1 deletion xla/pjrt/gpu/se_gpu_pjrt_client_test.cc
@@ -1072,7 +1072,7 @@ TEST(StreamExecutorGpuClientTest, MockNcclClientWithGpuTopologyTest) {
TF_ASSERT_OK_AND_ASSIGN(const xla::PjRtTopologyDescription* topology,
client->GetTopologyDescription());
const StreamExecutorGpuTopologyDescription& gpu_topology =
tensorflow::down_cast<const xla::StreamExecutorGpuTopologyDescription&>(
tsl::down_cast<const xla::StreamExecutorGpuTopologyDescription&>(
*topology);

EXPECT_EQ(gpu_topology.gpu_topology().num_slices(), 2);
6 changes: 2 additions & 4 deletions xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
@@ -61,11 +61,9 @@ bool IsGpuClient(const PjRtClient& client) {
bool IsSameTopology(const PjRtTopologyDescription& topology1,
const PjRtTopologyDescription& topology2) {
const StreamExecutorGpuTopologyDescription& gpu_topology1 =
tensorflow::down_cast<const StreamExecutorGpuTopologyDescription&>(
topology1);
tsl::down_cast<const StreamExecutorGpuTopologyDescription&>(topology1);
const StreamExecutorGpuTopologyDescription& gpu_topology2 =
tensorflow::down_cast<const StreamExecutorGpuTopologyDescription&>(
topology2);
tsl::down_cast<const StreamExecutorGpuTopologyDescription&>(topology2);
return gpu_topology1 == gpu_topology2;
}

8 changes: 4 additions & 4 deletions xla/pjrt/gpu/se_gpu_pjrt_compiler_aot_test.cc
@@ -90,7 +90,7 @@ TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileMlirAndLoad) {
TF_ASSERT_OK_AND_ASSIGN(auto client,
GetStreamExecutorGpuClient(GpuClientOptions()));
auto se_client = absl::WrapUnique(
tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
tsl::down_cast<StreamExecutorGpuClient*>(client.release()));
Compiler::TargetConfig gpu_target_config = xla::Compiler::TargetConfig(
se_client->client()->backend().default_stream_executor());
StreamExecutorGpuCompiler compiler;
@@ -119,7 +119,7 @@ TEST(StreamExecutorGpuCompilerTest, SuccessAotCompileXlaAndLoad) {
TF_ASSERT_OK_AND_ASSIGN(auto client,
GetStreamExecutorGpuClient(GpuClientOptions()));
auto se_client = absl::WrapUnique(
tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
tsl::down_cast<StreamExecutorGpuClient*>(client.release()));
#if GOOGLE_CUDA
auto gpu_compiler = gpu::NVPTXCompiler();
#elif TENSORFLOW_USE_ROCM
@@ -152,7 +152,7 @@ TEST(StreamExecutorGpuCompilerTest, SuccessLoadFromSerializedExecutable) {
TF_ASSERT_OK_AND_ASSIGN(auto client,
GetStreamExecutorGpuClient(GpuClientOptions()));
auto se_client = absl::WrapUnique(
tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
tsl::down_cast<StreamExecutorGpuClient*>(client.release()));
StreamExecutorGpuCompiler compiler;
xla::CompileOptions opts;
opts.target_config = Compiler::TargetConfig(
@@ -189,7 +189,7 @@ TEST(StreamExecutorGpuCompilerTest, SuccessSerializeDeserialize) {
TF_ASSERT_OK_AND_ASSIGN(auto client,
GetStreamExecutorGpuClient(GpuClientOptions()));
auto se_client = absl::WrapUnique(
tensorflow::down_cast<StreamExecutorGpuClient*>(client.release()));
tsl::down_cast<StreamExecutorGpuClient*>(client.release()));
StreamExecutorGpuCompiler compiler;
xla::CompileOptions opts;
opts.target_config = Compiler::TargetConfig(
(Remaining changed files not shown in this view.)
