diff --git a/xla/backends/interpreter/BUILD b/xla/backends/interpreter/BUILD
index a98f4a1cb01b0..4feb06b578a1b 100644
--- a/xla/backends/interpreter/BUILD
+++ b/xla/backends/interpreter/BUILD
@@ -92,7 +92,6 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:dynamic_dimension_inference",
         "//xla/service:executable",
-        "//xla/service:hlo_execution_profile",
         "//xla/service:maybe_owning_device_memory",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
diff --git a/xla/backends/interpreter/executable_base.cc b/xla/backends/interpreter/executable_base.cc
index 793ded37a9de0..d8eb22ca6da80 100644
--- a/xla/backends/interpreter/executable_base.cc
+++ b/xla/backends/interpreter/executable_base.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "xla/layout_util.h"
 #include "xla/literal.h"
 #include "xla/service/executable.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/maybe_owning_device_memory.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
@@ -57,8 +56,7 @@ InterpreterExecutableBase::InterpreterExecutableBase(
 
 absl::StatusOr<ExecutionOutput> InterpreterExecutableBase::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    std::vector<ExecutionInput> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
+    std::vector<ExecutionInput> arguments) {
   se::Stream* stream = run_options->stream();
   se::StreamExecutor* executor = stream->parent();
   const se::Platform* platform = executor->GetPlatform();
diff --git a/xla/backends/interpreter/executable_base.h b/xla/backends/interpreter/executable_base.h
index ab79a4f62c82e..da1a5c534aa0e 100644
--- a/xla/backends/interpreter/executable_base.h
+++ b/xla/backends/interpreter/executable_base.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "xla/literal.h"
 #include "xla/service/dynamic_dimension_inference.h"
 #include "xla/service/executable.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/shape.h"
 #include "xla/stream_executor/device_memory_allocator.h"
@@ -44,8 +43,7 @@ class InterpreterExecutableBase : public Executable {
 
   absl::StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      std::vector<ExecutionInput> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
+      std::vector<ExecutionInput> arguments) override;
 
  protected:
   virtual absl::StatusOr<Literal> Evaluate(
diff --git a/xla/service/cpu/BUILD b/xla/service/cpu/BUILD
index 8b08ca1927da8..9fd73467bd09b 100644
--- a/xla/service/cpu/BUILD
+++ b/xla/service/cpu/BUILD
@@ -609,7 +609,6 @@ cc_library(
         "//xla/service:custom_call_status",
         "//xla/service:custom_call_status_internal",
         "//xla/service:executable",
-        "//xla/service:hlo_execution_profile",
         "//xla/service:hlo_profile_printer_data_cc",
         "//xla/service:hlo_value",
         "//xla/service:maybe_owning_device_memory",
diff --git a/xla/service/cpu/cpu_executable.cc b/xla/service/cpu/cpu_executable.cc
index 34652a016b28e..ddc96e141dfa2 100644
--- a/xla/service/cpu/cpu_executable.cc
+++ b/xla/service/cpu/cpu_executable.cc
@@ -58,7 +58,6 @@ limitations under the License.
 #include "xla/service/custom_call_status.h"
 #include "xla/service/custom_call_status_internal.h"
 #include "xla/service/executable.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/hlo_value.h"
 #include "xla/service/maybe_owning_device_memory.h"
 #include "xla/service/service_executable_run_options.h"
@@ -290,17 +289,11 @@ CpuExecutable::CreateBufferTable(se::DeviceMemoryAllocator* memory_allocator,
 
 absl::Status CpuExecutable::ExecuteComputeFunction(
     const ExecutableRunOptions* run_options,
-    absl::Span<MaybeOwningDeviceMemory const> buffers,
-    HloExecutionProfile* hlo_execution_profile) {
+    absl::Span<MaybeOwningDeviceMemory const> buffers) {
   uint64_t start_micros = tsl::Env::Default()->NowMicros();
 
-  size_t profile_counters_size =
-      hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
-                            : 0;
-  int64_t* profile_counters =
-      hlo_execution_profile
-          ? hlo_execution_profile->mutable_profile_counters()->data()
-          : nullptr;
+  size_t profile_counters_size = 0;
+  int64_t* profile_counters = nullptr;
 
   // Call the computation function following the calling convention. See the
   // definition of 'ComputeFunctionType' for the details of the calling
@@ -329,12 +322,6 @@ absl::Status CpuExecutable::ExecuteComputeFunction(
       const double nanoseconds = (end_micros - start_micros) * 1000.0;
       run_options->execution_profile()->set_compute_time_ns(
           std::max(nanoseconds, 1.0));
-      // If hlo profiling was disabled then the cycle count is left empty.
-      if (hlo_execution_profile) {
-        run_options->execution_profile()->set_compute_cycle_count(
-            hlo_execution_profile->total_cycles_executed(
-                *module().entry_computation()));
-      }
     }
   };
 
@@ -356,17 +343,11 @@ absl::Status CpuExecutable::ExecuteComputeFunction(
 
 absl::Status CpuExecutable::ExecuteThunks(
     const ExecutableRunOptions* run_options,
-    absl::Span<MaybeOwningDeviceMemory const> buffers,
-    HloExecutionProfile* hlo_execution_profile) {
+    absl::Span<MaybeOwningDeviceMemory const> buffers) {
   uint64_t start_ns = tsl::Env::Default()->NowNanos();
 
-  size_t profile_counters_size =
-      hlo_execution_profile ? hlo_execution_profile->profile_counters().size()
-                            : 0;
-  int64_t* profile_counters =
-      hlo_execution_profile
-          ? hlo_execution_profile->mutable_profile_counters()->data()
-          : nullptr;
+  size_t profile_counters_size = 0;
+  int64_t* profile_counters = nullptr;
 
   BufferAllocations allocations(buffers);
 
@@ -412,12 +393,6 @@ absl::Status CpuExecutable::ExecuteThunks(
     uint64_t end_ns = tsl::Env::Default()->NowNanos();
     run_options->execution_profile()->set_compute_time_ns(
         std::max<int64_t>(end_ns - start_ns, 1));
-    // If hlo profiling was disabled then the cycle count is left empty.
-    if (hlo_execution_profile) {
-      run_options->execution_profile()->set_compute_cycle_count(
-          hlo_execution_profile->total_cycles_executed(
-              *module().entry_computation()));
-    }
   }
 
   return ABSL_PREDICT_FALSE(executed_event.IsError())
@@ -527,8 +502,7 @@ absl::StatusOr<ExecutionOutput> CpuExecutable::CreateResultShapedBuffer(
 
 absl::StatusOr<ExecutionOutput> CpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    std::vector<ExecutionInput> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
+    std::vector<ExecutionInput> arguments) {
   if (GetRootValueSet().IsAmbiguous()) {
     return Unimplemented("Points-to set of root instruction is ambiguous");
   }
@@ -576,15 +550,14 @@ absl::StatusOr<ExecutionOutput> CpuExecutable::ExecuteAsyncOnStream(
     CpuExecutable* executable;
     ServiceExecutableRunOptions run_options;
     std::shared_ptr<std::vector<MaybeOwningDeviceMemory>> task_buffers;
-    HloExecutionProfile* hlo_execution_profile;
 
     absl::Status operator()() {
       if (executable->has_compute_function()) {
-        return executable->ExecuteComputeFunction(
-            &run_options.run_options(), *task_buffers, hlo_execution_profile);
+        return executable->ExecuteComputeFunction(&run_options.run_options(),
+                                                  *task_buffers);
       } else if (executable->has_thunks()) {
         return executable->ExecuteThunks(&run_options.run_options(),
-                                         *task_buffers, hlo_execution_profile);
+                                         *task_buffers);
       } else {
         return Internal("No compute function or thunks found.");
       }
@@ -593,8 +566,7 @@ absl::StatusOr<ExecutionOutput> CpuExecutable::ExecuteAsyncOnStream(
   host_stream->EnqueueTaskWithStatus(
       AsyncRunTask{this, *run_options,
                    std::make_shared<std::vector<MaybeOwningDeviceMemory>>(
-                       std::move(buffers)),
-                   hlo_execution_profile});
+                       std::move(buffers))});
 
   MarkToBeReleasedArguments(absl::MakeSpan(arguments), result);
   return std::move(result);
diff --git a/xla/service/cpu/cpu_executable.h b/xla/service/cpu/cpu_executable.h
index 102f15aa94908..592b1af45395b 100644
--- a/xla/service/cpu/cpu_executable.h
+++ b/xla/service/cpu/cpu_executable.h
@@ -39,7 +39,6 @@ limitations under the License.
 #include "xla/service/custom_call_status.h"
 #include "xla/service/custom_call_status_internal.h"
 #include "xla/service/executable.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/hlo_profile_printer_data.pb.h"
 #include "xla/service/hlo_value.h"
 #include "xla/service/maybe_owning_device_memory.h"
@@ -89,21 +88,18 @@ class CpuExecutable : public Executable {
 
   absl::StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      std::vector<ExecutionInput> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
+      std::vector<ExecutionInput> arguments) override;
 
   // Calls the generated function performing the computation with the given
   // arguments using the supplied buffers.
   absl::Status ExecuteComputeFunction(
       const ExecutableRunOptions* run_options,
-      absl::Span<MaybeOwningDeviceMemory const> buffers,
-      HloExecutionProfile* hlo_execution_profile);
+      absl::Span<MaybeOwningDeviceMemory const> buffers);
 
   // Calls emitted thunk sequence with the given arguments using the supplied
   // buffers.
   absl::Status ExecuteThunks(const ExecutableRunOptions* run_options,
-                             absl::Span<MaybeOwningDeviceMemory const> buffers,
-                             HloExecutionProfile* hlo_execution_profile);
+                             absl::Span<MaybeOwningDeviceMemory const> buffers);
 
   absl::Span<const std::string> obj_files() const { return obj_files_; }
 
diff --git a/xla/service/executable.cc b/xla/service/executable.cc
index 0c522d4f37b83..36231756281d2 100644
--- a/xla/service/executable.cc
+++ b/xla/service/executable.cc
@@ -16,13 +16,16 @@ limitations under the License.
 #include "xla/service/executable.h"
 
 #include <memory>
+#include <utility>
 
 #include "absl/status/status.h"
 #include "absl/strings/str_format.h"
+#include "absl/types/span.h"
 #include "xla/debug_options_flags.h"
 #include "xla/service/dump.h"
 #include "xla/service/hlo_graph_dumper.h"
 #include "xla/service/maybe_owning_device_memory.h"
+#include "xla/service/shaped_buffer.h"
 #include "xla/status_macros.h"
 #include "xla/stream_executor/device_description.h"
 #include "tsl/platform/env.h"
@@ -59,10 +62,9 @@ void ExecutionInput::SetUnownedBuffer(const ShapeIndex& index,
 
 absl::StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
-    absl::Span<const ShapedBuffer* const> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
+    absl::Span<const ShapedBuffer* const> arguments) {
   absl::StatusOr<ScopedShapedBuffer> result =
-      ExecuteAsyncOnStream(run_options, arguments, hlo_execution_profile);
+      ExecuteAsyncOnStream(run_options, arguments);
   absl::Status blocking_status = run_options->stream()->BlockHostUntilDone();
   TF_RETURN_IF_ERROR(result.status());
   TF_RETURN_IF_ERROR(blocking_status);
@@ -81,25 +83,22 @@ static ExecutionInput MakeMaybeOwningDeviceMemoryTree(
 
 absl::StatusOr<ScopedShapedBuffer> Executable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    absl::Span<const ShapedBuffer* const> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
+    absl::Span<const ShapedBuffer* const> arguments) {
   std::vector<ExecutionInput> args;
   args.reserve(arguments.size());
   for (const ShapedBuffer* arg : arguments) {
     args.emplace_back(MakeMaybeOwningDeviceMemoryTree(*arg));
   }
   TF_ASSIGN_OR_RETURN(ExecutionOutput out,
-                      ExecuteAsyncOnStream(run_options, std::move(args),
-                                           hlo_execution_profile));
+                      ExecuteAsyncOnStream(run_options, std::move(args)));
   return out.ConsumeResult();
 }
 
 absl::StatusOr<ExecutionOutput> Executable::ExecuteOnStream(
     const ServiceExecutableRunOptions* run_options,
-    std::vector<ExecutionInput> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
-  absl::StatusOr<ExecutionOutput> result = ExecuteAsyncOnStream(
-      run_options, std::move(arguments), hlo_execution_profile);
+    std::vector<ExecutionInput> arguments) {
+  absl::StatusOr<ExecutionOutput> result =
+      ExecuteAsyncOnStream(run_options, std::move(arguments));
   absl::Status blocking_status = run_options->stream()->BlockHostUntilDone();
   TF_RETURN_IF_ERROR(result.status());
   TF_RETURN_IF_ERROR(blocking_status);
@@ -116,8 +115,7 @@ absl::StatusOr<std::vector<ScopedShapedBuffer>> Executable::ExecuteOnStreams(
 
   if (run_options.size() == 1) {
     TF_ASSIGN_OR_RETURN(auto rv,
-                        ExecuteOnStream(&run_options[0], arguments[0],
-                                        /*hlo_execution_profile=*/nullptr));
+                        ExecuteOnStream(&run_options[0], arguments[0]));
     return_values.push_back(std::move(rv));
     return std::move(return_values);
   }
@@ -126,9 +124,8 @@ absl::StatusOr<std::vector<ScopedShapedBuffer>> Executable::ExecuteOnStreams(
     // We cannot BlockHostUntilDone() on the already-launched executions in case
     // of error, since if the executions communicate, the initially launched
     // executions may never complete if not all executions are running.
-    TF_ASSIGN_OR_RETURN(
-        auto rv, ExecuteAsyncOnStream(&run_options[i], arguments[i],
-                                      /*hlo_execution_profile=*/nullptr));
+    TF_ASSIGN_OR_RETURN(auto rv,
+                        ExecuteAsyncOnStream(&run_options[i], arguments[i]));
     return_values.push_back(std::move(rv));
   }
   for (const auto& options : run_options) {
@@ -218,7 +215,7 @@ absl::StatusOr<ScopedShapedBuffer> Executable::ExecuteAsyncOnStreamWrapper(
     absl::Span<const ShapedBuffer* const> arguments) {
   auto state = ExecuteWrapperBeforeExecution(*this, run_options);
   absl::StatusOr<ScopedShapedBuffer> return_value =
-      ExecuteAsyncOnStream(run_options, arguments, nullptr);
+      ExecuteAsyncOnStream(run_options, arguments);
   TF_RETURN_IF_ERROR(ExecuteWrapperAfterExecution(
       this, state, return_value.status(), run_options->stream()));
   return return_value;
@@ -229,7 +226,7 @@ absl::StatusOr<ExecutionOutput> Executable::ExecuteAsyncOnStreamWrapper(
     std::vector<ExecutionInput> arguments) {
   auto state = ExecuteWrapperBeforeExecution(*this, run_options);
   absl::StatusOr<ExecutionOutput> return_value =
-      ExecuteAsyncOnStream(run_options, std::move(arguments), nullptr);
+      ExecuteAsyncOnStream(run_options, std::move(arguments));
   TF_RETURN_IF_ERROR(ExecuteWrapperAfterExecution(
       this, state, return_value.status(), run_options->stream()));
   return return_value;
diff --git a/xla/service/executable.h b/xla/service/executable.h
index e9c4abe32e1b8..12ee096986a05 100644
--- a/xla/service/executable.h
+++ b/xla/service/executable.h
@@ -260,14 +260,10 @@ class Executable {
   // Enqueues the compilation result on the provided stream, passing the given
   // arguments. This call is blocking and returns after the execution is done.
   //
-  // If the hlo_execution_profile is provided as non-nullptr, profiling will be
-  // enabled.
-  //
   // Returns a shaped buffer containing the result of the computation.
   absl::StatusOr<ScopedShapedBuffer> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
-      absl::Span<const ShapedBuffer* const> arguments,
-      HloExecutionProfile* hlo_execution_profile);
+      absl::Span<const ShapedBuffer* const> arguments);
 
   // Starts the given program executing on the given stream/executor.
   //
@@ -283,26 +279,19 @@ class Executable {
   // operations are enqueued for launch on the stream. Note that some
   // implementations may in fact block or may block in some circumstances (e.g.,
   // when profiling); i.e., asynchronous is a "may" not a "must".
-  //
-  // If the hlo_execution_profile is provided as non-nullptr, profiling will be
-  // enabled. Note that profiling is tricky to use correctly, as the profiling
-  // objects (when they exist) must out-live the task.
   virtual absl::StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      absl::Span<const ShapedBuffer* const> arguments,
-      HloExecutionProfile* hlo_execution_profile);
+      absl::Span<const ShapedBuffer* const> arguments);
 
   // Same as ExecuteAsyncOnStream(), but blocks waiting for the computation to
   // complete.
   absl::StatusOr<ExecutionOutput> ExecuteOnStream(
       const ServiceExecutableRunOptions* run_options,
-      std::vector<ExecutionInput> arguments,
-      HloExecutionProfile* hlo_execution_profile);
+      std::vector<ExecutionInput> arguments);
 
   virtual absl::StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      std::vector<ExecutionInput> arguments,
-      HloExecutionProfile* hlo_execution_profile) = 0;
+      std::vector<ExecutionInput> arguments) = 0;
 
   // Same as ExecuteOnStream(), but runs this executable on multiple
   // streams. arguments[i] contains the arguments to the execution on
diff --git a/xla/service/executable_test.cc b/xla/service/executable_test.cc
index 388b7be1bd44a..8c21dbe360351 100644
--- a/xla/service/executable_test.cc
+++ b/xla/service/executable_test.cc
@@ -43,8 +43,7 @@ class TestExecutable : public Executable {
 
   absl::StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      std::vector<ExecutionInput> arguments,
-      HloExecutionProfile* hlo_execution_profile) override {
+      std::vector<ExecutionInput> arguments) override {
     return absl::UnimplementedError("Not needed for this test.");
   }
 };
diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD
index 705106de5fe22..422524336e794 100644
--- a/xla/service/gpu/BUILD
+++ b/xla/service/gpu/BUILD
@@ -576,7 +576,6 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:buffer_assignment",
         "//xla/service:executable",
-        "//xla/service:hlo_execution_profile",
         "//xla/service:hlo_value",
         "//xla/service:maybe_owning_device_memory",
         "//xla/service:rendezvous",
diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc
index 1de6f9106a01a..f71b46da140b9 100644
--- a/xla/service/gpu/gpu_executable.cc
+++ b/xla/service/gpu/gpu_executable.cc
@@ -55,7 +55,6 @@ limitations under the License.
 #include "xla/service/gpu/runtime/sequential_thunk.h"
 #include "xla/service/gpu/runtime/thunk.h"
 #include "xla/service/gpu/stream_executor_util.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/hlo_value.h"
 #include "xla/service/maybe_owning_device_memory.h"
 #include "xla/service/rendezvous.h"
@@ -782,15 +781,13 @@ absl::StatusOr<BufferAllocations> GpuExecutable::GenerateBufferAllocations(
 
 absl::StatusOr<ExecutionOutput> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    std::vector<ExecutionInput> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
+    std::vector<ExecutionInput> arguments) {
   return ExecuteAsyncOnStreamImpl(run_options, absl::MakeSpan(arguments));
 }
 
 absl::StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    absl::Span<const ShapedBuffer* const> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
+    absl::Span<const ShapedBuffer* const> arguments) {
   TF_ASSIGN_OR_RETURN(ExecutionOutput out,
                       ExecuteAsyncOnStreamImpl(run_options, arguments));
   return out.ConsumeResult();
diff --git a/xla/service/gpu/gpu_executable.h b/xla/service/gpu/gpu_executable.h
index 8103043560efb..fd2d51d18e674 100644
--- a/xla/service/gpu/gpu_executable.h
+++ b/xla/service/gpu/gpu_executable.h
@@ -40,7 +40,6 @@ limitations under the License.
 #include "xla/service/gpu/runtime/annotation.h"
 #include "xla/service/gpu/runtime/sequential_thunk.h"
 #include "xla/service/gpu/runtime/thunk.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/shape.h"
@@ -131,13 +130,11 @@ class GpuExecutable : public Executable {
   // doesn't match the compute capability passed to this object's constructor.
   absl::StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      std::vector<ExecutionInput> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
+      std::vector<ExecutionInput> arguments) override;
 
   absl::StatusOr<ScopedShapedBuffer> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      absl::Span<const ShapedBuffer* const> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
+      absl::Span<const ShapedBuffer* const> arguments) override;
 
   using VariantArguments = std::variant<absl::Span<const ShapedBuffer* const>,
                                         absl::Span<ExecutionInput>>;
diff --git a/xla/service/hlo_runner.cc b/xla/service/hlo_runner.cc
index 16a3e4a0ac601..6d91c1cdda3be 100644
--- a/xla/service/hlo_runner.cc
+++ b/xla/service/hlo_runner.cc
@@ -546,8 +546,7 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
             for (int64_t i = 0; i < options.num_replicas; ++i) {
               pool.Schedule([&, i] {
                 auto result = executable->ExecuteOnStream(
-                    &service_run_options[i], argument_buffer_slices[i],
-                    nullptr);
+                    &service_run_options[i], argument_buffer_slices[i]);
                 absl::MutexLock lock(&mutex);
                 thread_results[i] = std::move(result);
               });
@@ -605,7 +604,7 @@ absl::StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
             }
             pool.Schedule([&, i] {
               auto result = executable_provider(i)->ExecuteOnStream(
-                  &service_run_options[i], argument_buffer_slices[i], nullptr);
+                  &service_run_options[i], argument_buffer_slices[i]);
               absl::MutexLock lock(&mutex);
               thread_results[i] = std::move(result);
             });
diff --git a/xla/service/hlo_runner_pjrt.cc b/xla/service/hlo_runner_pjrt.cc
index 6406e6269854d..3ad883a28f5f6 100644
--- a/xla/service/hlo_runner_pjrt.cc
+++ b/xla/service/hlo_runner_pjrt.cc
@@ -122,8 +122,7 @@ class PjRtWrappedExecutable : public Executable {
 
   absl::StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      std::vector<ExecutionInput> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
+      std::vector<ExecutionInput> arguments) override;
 
   PjRtLoadedExecutable* GetPjRtLoadedExecutable() const {
     return pjrt_loaded_executable_.get();
@@ -135,8 +134,7 @@ class PjRtWrappedExecutable : public Executable {
 
 absl::StatusOr<ExecutionOutput> PjRtWrappedExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    std::vector<ExecutionInput> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
+    std::vector<ExecutionInput> arguments) {
   return Unimplemented(
       "PjRtWrappedExecutable: Unimplemented ExecuteAsyncOnStream");
 }
diff --git a/xla/service/service.cc b/xla/service/service.cc
index 34f3f4e82314d..2f22259a9d4f9 100644
--- a/xla/service/service.cc
+++ b/xla/service/service.cc
@@ -404,8 +404,7 @@ Service::ExecuteParallelAndRegisterResult(
       // Asynchronously launch the computation.
       TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
                           executables[i]->ExecuteAsyncOnStream(
-                              &run_options, arguments[i][replica],
-                              /*hlo_execution_profile=*/nullptr));
+                              &run_options, arguments[i][replica]));
 
       result_buffers.push_back(std::move(result));
     }
diff --git a/xla/stream_executor/tpu/BUILD b/xla/stream_executor/tpu/BUILD
index 2e4c565c7ed15..dd5959ec56ac0 100644
--- a/xla/stream_executor/tpu/BUILD
+++ b/xla/stream_executor/tpu/BUILD
@@ -599,7 +599,6 @@ cc_library(
         "//xla/hlo/ir:hlo",
         "//xla/service:compiler",
         "//xla/service:executable",
-        "//xla/service:hlo_execution_profile",
         "//xla/service:maybe_owning_device_memory",
         "//xla/service:shaped_buffer",
         "//xla/service:transfer_manager",
@@ -636,7 +635,6 @@ cc_library(
         "//xla:xla_data_proto_cc",
         "//xla/hlo/ir:hlo",
         "//xla/service:executable",
-        "//xla/service:hlo_execution_profile",
         "//xla/service:shaped_buffer",
         "//xla/stream_executor:device_memory",
         "//xla/stream_executor:stream",
diff --git a/xla/stream_executor/tpu/tpu_executable.cc b/xla/stream_executor/tpu/tpu_executable.cc
index cf70be04961c1..e91f1d279138a 100644
--- a/xla/stream_executor/tpu/tpu_executable.cc
+++ b/xla/stream_executor/tpu/tpu_executable.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/executable.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
 #include "xla/stream_executor/stream.h"
@@ -92,8 +91,7 @@ TpuExecutable::~TpuExecutable() {
 
 absl::StatusOr<ExecutionOutput> TpuExecutable::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    std::vector<ExecutionInput> arguments,
-    HloExecutionProfile* hlo_execution_profile) {
+    std::vector<ExecutionInput> arguments) {
   SE_ExecutableRunOptions se_run_options = ApiConverter::ToC(*run_options);
   SE_ExecutionInput** se_args = new SE_ExecutionInput*[arguments.size()];
   for (int i = 0; i < arguments.size(); ++i) {
@@ -126,7 +124,7 @@ absl::StatusOr<ExecutionOutput> TpuExecutable::ExecuteAsyncOnStream(
   SE_ExecutionOutput se_execution_output;
   StatusHelper status;
   ExecutorApiFn()->TpuExecutable_ExecuteAsyncOnStreamFn(
-      se_executable_, &se_run_options, se_args, arguments.size(), nullptr,
+      se_executable_, &se_run_options, se_args, arguments.size(),
       &se_execution_output, status.c_status);
 
   if (se_run_options.device_assignment.bytes != nullptr) {
diff --git a/xla/stream_executor/tpu/tpu_executable.h b/xla/stream_executor/tpu/tpu_executable.h
index c5b639e9bf7c2..756876bbb50e4 100644
--- a/xla/stream_executor/tpu/tpu_executable.h
+++ b/xla/stream_executor/tpu/tpu_executable.h
@@ -29,7 +29,6 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/executable.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/stream_executor/device_memory.h"
 #include "xla/stream_executor/tpu/c_api_decl.h"
@@ -49,8 +48,7 @@ class TpuExecutable : public xla::TpuExecutableInterface {
 
   absl::StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      std::vector<ExecutionInput> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
+      std::vector<ExecutionInput> arguments) override;
 
   absl::string_view fingerprint() const override;
 
diff --git a/xla/stream_executor/tpu/tpu_executable_interface.cc b/xla/stream_executor/tpu/tpu_executable_interface.cc
index 3912814786d80..5fef42035f58d 100644
--- a/xla/stream_executor/tpu/tpu_executable_interface.cc
+++ b/xla/stream_executor/tpu/tpu_executable_interface.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "xla/layout_util.h"
 #include "xla/service/compiler.h"
 #include "xla/service/executable.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/maybe_owning_device_memory.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/service/shaped_buffer.h"
@@ -209,8 +208,7 @@ TpuExecutableInterface::AllocateOutputMemoryWithInputReuse(
 
 absl::StatusOr<ExecutionOutput> TpuExecutableInterface::ExecuteAsyncOnStream(
     const ServiceExecutableRunOptions* run_options,
-    std::vector<ExecutionInput> arguments,
-    HloExecutionProfile* /*hlo_execution_profile*/) {
+    std::vector<ExecutionInput> arguments) {
   std::vector<se::DeviceMemoryBase> memory_bases;
   memory_bases.reserve(arguments.size());
   for (auto& argument : arguments) {
diff --git a/xla/stream_executor/tpu/tpu_executable_interface.h b/xla/stream_executor/tpu/tpu_executable_interface.h
index ce9555a1fedf3..5ed03e63a88d2 100644
--- a/xla/stream_executor/tpu/tpu_executable_interface.h
+++ b/xla/stream_executor/tpu/tpu_executable_interface.h
@@ -28,7 +28,6 @@ limitations under the License.
 #include "xla/hlo/ir/hlo_input_output_alias_config.h"
 #include "xla/hlo/ir/hlo_module.h"
 #include "xla/service/executable.h"
-#include "xla/service/hlo_execution_profile.h"
 #include "xla/service/service_executable_run_options.h"
 #include "xla/shape.h"
 #include "xla/stream_executor/device_memory.h"
@@ -46,8 +45,7 @@ class TpuExecutableInterface : public Executable {
 
   absl::StatusOr<ExecutionOutput> ExecuteAsyncOnStream(
       const ServiceExecutableRunOptions* run_options,
-      std::vector<ExecutionInput> arguments,
-      HloExecutionProfile* hlo_execution_profile) override;
+      std::vector<ExecutionInput> arguments) override;
 
   // Same as AllocateOutputMemory, except that input buffers can be reused
   // as output buffers. See UserBufferAlias class comment for more details on
diff --git a/xla/stream_executor/tpu/tpu_executor_c_api.h b/xla/stream_executor/tpu/tpu_executor_c_api.h
index a415204e85f59..3b13c28cf109f 100644
--- a/xla/stream_executor/tpu/tpu_executor_c_api.h
+++ b/xla/stream_executor/tpu/tpu_executor_c_api.h
@@ -289,7 +289,6 @@ TFTPU_CAPI_EXPORT void TpuCompiler_DefaultDeviceShapeRepresentation(
 TFTPU_CAPI_EXPORT void TpuExecutable_ExecuteAsyncOnStream(
     SE_Executable* executable, SE_ExecutableRunOptions* se_options,
     SE_ExecutionInput** se_arguments, int se_arguments_size,
-    SE_HloExecutionProfile* hlo_execution_profile,
     SE_ExecutionOutput* se_output, TF_Status* status);
 
 // This frees the XLA_ShapeIndex* array allocated when se_output is returned by
diff --git a/xla/tests/BUILD b/xla/tests/BUILD
index 01fdbfe957b0f..8eeeeb73a2764 100644
--- a/xla/tests/BUILD
+++ b/xla/tests/BUILD
@@ -592,37 +592,6 @@ xla_test(
     ],
 )
 
-xla_test(
-    name = "xla_hlo_profile_test",
-    srcs = ["xla_hlo_profile_test.cc"],
-    backends = [
-        # Hlo profiles are only supported on CPU/GPU.
-        "cpu",
-        "gpu",
-    ],
-    tags = ["not_run:arm"],
-    deps = [
-        ":client_library_test_base",
-        ":test_macros_header",
-        ":test_utils",
-        "//xla:array2d",
-        "//xla:shape_util",
-        "//xla:util",
-        "//xla/client:local_client",
-        "//xla/hlo/builder:xla_builder",
-        "//xla/hlo/builder:xla_computation",
-        "//xla/service:platform_util",
-        "//xla/service:stream_pool",
-        "//xla/tsl/lib/core:status_test_util",
-        "@com_google_absl//absl/algorithm:container",
-        "@com_google_absl//absl/container:flat_hash_map",
-        "@com_google_absl//absl/status",
-        "@com_google_absl//absl/strings",
-        "@tsl//tsl/platform:regexp",
-        "@tsl//tsl/platform:test",
-    ],
-)
-
 xla_test(
     name = "axpy_simple_test",
     srcs = ["axpy_simple_test.cc"],
diff --git a/xla/tests/buffer_donation_test.cc b/xla/tests/buffer_donation_test.cc
index 35d9c64884689..666ebb6dd411c 100644
--- a/xla/tests/buffer_donation_test.cc
+++ b/xla/tests/buffer_donation_test.cc
@@ -126,8 +126,7 @@ class BufferDonationTest : public HloTestBase {
     }
 
     absl::StatusOr<ExecutionOutput> output_status =
-        executable->ExecuteAsyncOnStream(&service_run_options, std::move(args),
-                                         /*hlo_execution_profile=*/nullptr);
+        executable->ExecuteAsyncOnStream(&service_run_options, std::move(args));
     if (!expected_failure.empty()) {
       ASSERT_FALSE(output_status.ok());
       ASSERT_TRUE(
diff --git a/xla/tests/xla_hlo_profile_test.cc b/xla/tests/xla_hlo_profile_test.cc
deleted file mode 100644
index 8013f2d8be904..0000000000000
--- a/xla/tests/xla_hlo_profile_test.cc
+++ /dev/null
@@ -1,411 +0,0 @@
-/* Copyright 2017 The OpenXLA Authors.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <cstdint>
-#include <memory>
-#include <vector>
-
-#include "absl/algorithm/container.h"
-#include "absl/container/flat_hash_map.h"
-#include "absl/status/status.h"
-#include "absl/strings/match.h"
-#include "absl/strings/str_cat.h"
-#include "absl/strings/str_split.h"
-#include "xla/array2d.h"
-#include "xla/client/local_client.h"
-#include "xla/hlo/builder/xla_builder.h"
-#include "xla/hlo/builder/xla_computation.h"
-#include "xla/map_util.h"
-#include "xla/service/platform_util.h"
-#include "xla/service/stream_pool.h"
-#include "xla/shape_util.h"
-#include "xla/tests/client_library_test_base.h"
-#include "xla/tests/test_macros.h"
-#include "xla/tests/test_utils.h"
-#include "xla/tsl/lib/core/status_test_util.h"
-#include "tsl/platform/regexp.h"
-#include "tsl/platform/test.h"
-
-namespace xla {
-namespace {
-
-class HloProfileTest : public ClientLibraryTestBase {};
-
-struct ParsedProfileOutputLine {
-  int64_t cycles;
-  std::string cycles_percentage;
-  double usec;
-  std::string flops;
-  std::string trops;
-  std::string bytes_per_sec;
-  std::string bytes_per_cycle;
-  std::string opcode;
-};
-
-::testing::AssertionResult HasFlops(
-    const ParsedProfileOutputLine& parsed_line) {
-  if (RE2::FullMatch(parsed_line.flops, "[0-9.TGMk]+FLOP/s")) {
-    return ::testing::AssertionSuccess()
-           << "'flops' field present in  " << parsed_line.opcode << ": '"
-           << parsed_line.flops << "'";
-  }
-
-  return ::testing::AssertionFailure()
-         << "'flops' field absent in  " << parsed_line.opcode << ": '"
-         << parsed_line.flops << "'";
-}
-
-::testing::AssertionResult HasTrops(
-    const ParsedProfileOutputLine& parsed_line) {
-  if (RE2::FullMatch(parsed_line.trops, "[0-9.TGMk]+TROP/s")) {
-    return ::testing::AssertionSuccess()
-           << "'trops' field present in  " << parsed_line.opcode << ": '"
-           << parsed_line.trops << "'";
-  }
-
-  return ::testing::AssertionFailure()
-         << "'trops' field absent in  " << parsed_line.opcode << ": '"
-         << parsed_line.trops << "'";
-}
-
-absl::Status ParseOneProfileOutputLine(
-    const std::string& line, bool expect_hlo,
-    absl::flat_hash_map<std::string, ParsedProfileOutputLine>* parsed_results,
-    absl::Span<const absl::string_view> opcodes_to_ignore = {}) {
-  std::string separator = "[^:]*:: +";
-  std::string match_percentage = R"(\d+\.\d*% +\d+Σ)";
-  std::string match_cycles =
-      R"((\d+) cycles +\( *()" + match_percentage + R"()\))";
-  std::string match_usecs = "([0-9.]+) usec";
-  std::string match_flops = "([^ ]*)";
-  std::string match_trops = "([^ ]*)";
-  std::string match_bytes_per_sec = "([0-9.TGMKi]*)(?:B/s)?";
-  std::string match_bytes_per_cycle = "([0-9.TGMKi]*)(?:B/cycle)?";
-
-  // The underlined part is what we're trying to match with match_opcode:
-  //
-  //   %dot33 = f32[256,256]{1,0} dot(...)
-  //                              ^^^
-
-  std::string match_opcode = expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*"
-                                        : "(\\[total\\])( \\[entry\\])?";
-  std::string regexp_pattern = absl::StrCat(
-      " +", match_cycles, separator, match_usecs, separator, match_flops,
-      separator, match_trops, separator, match_bytes_per_sec, separator,
-      match_bytes_per_cycle, separator, match_opcode);
-
-  ParsedProfileOutputLine parsed_line;
-  bool matched = RE2::FullMatch(
-      line, regexp_pattern, &parsed_line.cycles, &parsed_line.cycles_percentage,
-      &parsed_line.usec, &parsed_line.flops, &parsed_line.trops,
-      &parsed_line.bytes_per_sec, &parsed_line.bytes_per_cycle,
-      &parsed_line.opcode);
-  if (!matched) {
-    return tsl::errors::InvalidArgument("Input did not match regexp.  Input: ",
-                                        line, ", Regexp: ", regexp_pattern);
-  }
-
-  if (!absl::c_linear_search(opcodes_to_ignore, parsed_line.opcode)) {
-    InsertOrDie(parsed_results, parsed_line.opcode, parsed_line);
-  }
-
-  return absl::OkStatus();
-}
-
-bool IsExtraMetricProfileOutputLine(const std::string& line) {
-  return RE2::FullMatch(line, "Extra metric \\S+: \\d+");
-}
-
-// Returns void so that we can ASSERT.
-void ExecuteAndFetchProfile(std::string* profile_output, LocalClient* client,
-                            const XlaComputation& computation,
-                            const Shape& lhs_arg_shape,
-                            const Shape& rhs_arg_shape) {
-  LocalService* service = ClientLibrary::GetXlaService(client->platform());
-  Backend* backend = service->mutable_backend();
-  se::StreamExecutor* executor = backend->default_stream_executor();
-  se::DeviceMemoryAllocator* allocator = backend->memory_allocator();
-  auto* transfer_manager = backend->transfer_manager();
-  TF_ASSERT_OK_AND_ASSIGN(
-      StreamPool::Ptr stream_ptr,
-      backend->BorrowStream(backend->default_device_ordinal()));
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      ScopedShapedBuffer lhs_arg,
-      transfer_manager->AllocateScopedShapedBuffer(
-          lhs_arg_shape, allocator, backend->default_device_ordinal()));
-  TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
-      stream_ptr.get(), Literal::CreateFromShape(lhs_arg_shape), lhs_arg));
-
-  TF_ASSERT_OK_AND_ASSIGN(
-      ScopedShapedBuffer rhs_arg,
-      transfer_manager->AllocateScopedShapedBuffer(
-          rhs_arg_shape, allocator, backend->default_device_ordinal()));
-  TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
-      stream_ptr.get(), Literal::CreateFromShape(rhs_arg_shape), rhs_arg));
-
-  ExecutableBuildOptions build_options;
-  build_options.mutable_debug_options()->set_xla_hlo_profile(true);
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto local_executables,
-      client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape},
-                      build_options));
-
-  Executable* executable = local_executables[0]->executable();
-  HloExecutionProfile hlo_execution_profile(
-      &executable->hlo_profile_printer_data(),
-      &executable->hlo_profile_index_map());
-
-  ExecutableRunOptions exec_run_options;
-  exec_run_options.set_stream(stream_ptr.get());
-  exec_run_options.set_allocator(backend->memory_allocator());
-  exec_run_options.set_intra_op_thread_pool(
-      backend->eigen_intra_op_thread_pool_device());
-  ServiceExecutableRunOptions run_options(exec_run_options,
-                                          /*borrow_stream=*/nullptr);
-  std::vector<const ShapedBuffer*> args = {&lhs_arg, &rhs_arg};
-  TF_ASSERT_OK_AND_ASSIGN(
-      auto execution_result,
-      executable->ExecuteOnStream(&run_options, args, &hlo_execution_profile));
-  TF_ASSERT_OK(stream_ptr->BlockHostUntilDone());
-  (void)execution_result;
-
-  *profile_output = hlo_execution_profile.ToString(
-      executor->GetDeviceDescription().clock_rate_ghz());
-
-  XLA_VLOG_LINES(4, *profile_output);
-}
-
-XLA_TEST_F(HloProfileTest,
-           DISABLED_ON_CPU(DISABLED_ON_GPU(ProfileSingleComputation))) {
-  const int64_t m = 32, k = 32, n = 32;
-  Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k});
-  Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k});
-
-  TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform,
-                          PlatformUtil::GetDefaultPlatform());
-  TF_ASSERT_OK_AND_ASSIGN(LocalClient * client,
-                          ClientLibrary::GetOrCreateLocalClient(platform));
-
-  XlaBuilder builder(TestName());
-  Tanh(Add(
-      Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"),
-      Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs")));
-
-  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
-
-  std::string profile_output;
-  ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape,
-                         rhs_shape);
-  VLOG(4) << "Profile Output:\n" << profile_output;
-  std::vector<std::string> profile_output_lines =
-      absl::StrSplit(profile_output, '\n');
-
-  absl::flat_hash_map<std::string, ParsedProfileOutputLine>
-      parsed_profile_lines;
-
-  int line_no = 0;
-
-  // Skip extra metrics.
-  while (IsExtraMetricProfileOutputLine(profile_output_lines[line_no])) {
-    line_no++;
-  }
-
-  line_no++;  // Skip 'Execution profile for ....'
-
-  ASSERT_LT(line_no, profile_output_lines.size());
-  TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
-                                         /*expect_hlo=*/false,
-                                         &parsed_profile_lines));
-
-  ASSERT_LT(line_no, profile_output_lines.size());
-  TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
-                                         /*expect_hlo=*/true,
-                                         &parsed_profile_lines));
-
-  ASSERT_LT(line_no, profile_output_lines.size());
-  TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++],
-                                         /*expect_hlo=*/true,
-                                         &parsed_profile_lines));
-
-  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_profile,
-                          MaybeFind(parsed_profile_lines, "[total]"));
-  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile,
-                          MaybeFind(parsed_profile_lines, "add"));
-  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine tanh_profile,
-                          MaybeFind(parsed_profile_lines, "tanh"));
-
-  EXPECT_GT(total_profile.cycles, 0);
-  EXPECT_EQ(total_profile.cycles_percentage, "100.% 100Σ");
-
-  EXPECT_TRUE(HasFlops(total_profile));
-  EXPECT_TRUE(HasTrops(total_profile));
-
-  EXPECT_GT(total_profile.cycles, dot_profile.cycles);
-  EXPECT_NE(dot_profile.cycles_percentage, "0.00%");
-  EXPECT_NE(dot_profile.cycles_percentage, "100.00%");
-
-  EXPECT_TRUE(HasFlops(dot_profile));
-  EXPECT_FALSE(HasTrops(dot_profile));
-
-  EXPECT_GT(total_profile.cycles, tanh_profile.cycles);
-  EXPECT_NE(tanh_profile.cycles_percentage, "0.00%");
-  EXPECT_NE(tanh_profile.cycles_percentage, "100.00%");
-
-  EXPECT_FALSE(HasFlops(tanh_profile));
-  EXPECT_TRUE(HasTrops(tanh_profile));
-}
-
-XLA_TEST_F(HloProfileTest,
-           DISABLED_ON_CPU(DISABLED_ON_GPU(ProfileWhileComputation))) {
-  const int64_t size = 32;
-  Shape matrix_shape = ShapeUtil::MakeShape(F32, {size, size});
-  Shape while_result_shape =
-      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {}), matrix_shape});
-
-  TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform,
-                          PlatformUtil::GetDefaultPlatform());
-  TF_ASSERT_OK_AND_ASSIGN(LocalClient * client,
-                          ClientLibrary::GetOrCreateLocalClient(platform));
-
-  XlaComputation condition;
-  {
-    XlaBuilder builder("condition");
-    auto state = Parameter(&builder, 0, while_result_shape, "state");
-    auto iteration = GetTupleElement(state, 0);
-    Gt(ConstantR0<int32_t>(&builder, 5), iteration);
-    TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build());
-  }
-
-  XlaComputation body;
-  {
-    XlaBuilder builder("body");
-    auto state = Parameter(&builder, 0, while_result_shape, "state");
-    auto matrix = GetTupleElement(state, 1);
-    auto next_iteration =
-        Add(GetTupleElement(state, 0), ConstantR0<int32_t>(&builder, 1));
-    Tuple(&builder, {next_iteration, Mul(matrix, matrix)});
-    TF_ASSERT_OK_AND_ASSIGN(body, builder.Build());
-  }
-
-  XlaBuilder builder(TestName());
-  auto initial_while_state =
-      Tuple(&builder, {ConstantR0<int32_t>(&builder, 0),
-                       Parameter(&builder, 0, matrix_shape, "initial_value")});
-  auto while_result = While(condition, body, initial_while_state);
-  Add(GetTupleElement(while_result, 1),
-      Parameter(&builder, 1, matrix_shape, "other_value"));
-
-  TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build());
-
-  std::string profile_output;
-  ExecuteAndFetchProfile(&profile_output, client, computation, matrix_shape,
-                         matrix_shape);
-  SCOPED_TRACE(profile_output);
-
-  std::vector<std::string> profile_output_lines =
-      absl::StrSplit(profile_output, '\n');
-
-  auto while_body_profile_start =
-      absl::c_find_if(profile_output_lines, [](absl::string_view s) {
-        return absl::StartsWith(s, "Execution profile for body");
-      });
-
-  ASSERT_NE(while_body_profile_start, profile_output_lines.cend());
-
-  auto while_body_profile_end =
-      std::find_if(while_body_profile_start, profile_output_lines.end(),
-                   [](absl::string_view s) {
-                     return absl::StartsWith(s, "********** microseconds ");
-                   });
-
-  // We emit a blank line before the "microseconds report" line.
-  while_body_profile_end--;
-
-  ASSERT_NE(while_body_profile_end, profile_output_lines.end());
-
-  absl::flat_hash_map<std::string, ParsedProfileOutputLine>
-      parsed_profile_lines;
-
-  for (auto while_body_profile_i = while_body_profile_start + 1;
-       while_body_profile_i != while_body_profile_end; while_body_profile_i++) {
-    // There are multiple "get-tuple-element" instructions in the while body so
-    // we ignore them -- we don't want parsed_profile_lines to be a multi-map.
-    TF_ASSERT_OK(ParseOneProfileOutputLine(
-        *while_body_profile_i,
-        /*expect_hlo=*/while_body_profile_i != (while_body_profile_start + 1),
-        &parsed_profile_lines, {"get-tuple-element"}));
-  }
-
-  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_while_body_profile,
-                          MaybeFind(parsed_profile_lines, "[total]"));
-  TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine multiply_profile,
-                          MaybeFind(parsed_profile_lines, "multiply"));
-
-  EXPECT_GT(total_while_body_profile.cycles, 0);
-  EXPECT_EQ(total_while_body_profile.opcode, "[total]");
-  EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.% 100Σ");
-
-  EXPECT_GT(total_while_body_profile.cycles, multiply_profile.cycles);
-  EXPECT_NE(multiply_profile.cycles_percentage, "0.00%");
-  EXPECT_NE(multiply_profile.cycles_percentage, "100.00%");
-}
-}  // namespace
-}  // namespace xla
-
-static std::pair<int, char**> AddXlaHloProfileFlag(int argc, char** argv) {
-  char** new_argv = new char*[argc + 2];
-  for (int i = 0; i < argc; i++) {
-    new_argv[i] = argv[i];
-  }
-
-  // We do it this way (as opposed to piping in a modified DebugOptions
-  // instance) for better end-to-end integration testing.
-  new_argv[argc] = strdup("--xla_hlo_profile");
-
-  // Fusion can change the Hlo instructions that show up in the final Hlo
-  // executable, so block it here. Also block the WhileLoopInvariantCodeMotion
-  // pass, otherwise a while loop is transformed and we could not match the
-  // original name in the ProfileWhileComputation test.
-  new_argv[argc + 1] = strdup(
-      "--xla_disable_hlo_passes=fusion,fusion_merger,multi_output_fusion,"
-      "while-loop-invariant-code-motion");
-  return {argc + 2, new_argv};
-}
-
-GTEST_API_ int main(int argc, char** argv) {
-  std::vector<tsl::Flag> flag_list;
-  xla::AppendDebugOptionsFlags(&flag_list);
-  std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv);
-  std::unique_ptr<char*[]> argv_ptr(argv);
-  char* to_be_freed[] = {argv[argc - 1], argv[argc - 2]};
-
-  auto usage = tsl::Flags::Usage(argv[0], flag_list);
-  const bool parseResult = tsl::Flags::Parse(&argc, argv, flag_list);
-  for (auto p : to_be_freed) {
-    free(p);
-  }
-  if (!parseResult) {
-    LOG(ERROR) << "\n" << usage;
-    return 2;
-  }
-
-  testing::InitGoogleTest(&argc, argv);
-  if (argc > 1) {
-    LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage;
-    return 2;
-  }
-  return RUN_ALL_TESTS();
-}