diff --git a/xla/backends/interpreter/BUILD b/xla/backends/interpreter/BUILD index a98f4a1cb01b0..4feb06b578a1b 100644 --- a/xla/backends/interpreter/BUILD +++ b/xla/backends/interpreter/BUILD @@ -92,7 +92,6 @@ cc_library( "//xla/hlo/ir:hlo", "//xla/service:dynamic_dimension_inference", "//xla/service:executable", - "//xla/service:hlo_execution_profile", "//xla/service:maybe_owning_device_memory", "//xla/service:shaped_buffer", "//xla/service:transfer_manager", diff --git a/xla/backends/interpreter/executable_base.cc b/xla/backends/interpreter/executable_base.cc index 793ded37a9de0..d8eb22ca6da80 100644 --- a/xla/backends/interpreter/executable_base.cc +++ b/xla/backends/interpreter/executable_base.cc @@ -27,7 +27,6 @@ limitations under the License. #include "xla/layout_util.h" #include "xla/literal.h" #include "xla/service/executable.h" -#include "xla/service/hlo_execution_profile.h" #include "xla/service/maybe_owning_device_memory.h" #include "xla/service/service_executable_run_options.h" #include "xla/service/shaped_buffer.h" @@ -57,8 +56,7 @@ InterpreterExecutableBase::InterpreterExecutableBase( absl::StatusOr InterpreterExecutableBase::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) { + std::vector arguments) { se::Stream* stream = run_options->stream(); se::StreamExecutor* executor = stream->parent(); const se::Platform* platform = executor->GetPlatform(); diff --git a/xla/backends/interpreter/executable_base.h b/xla/backends/interpreter/executable_base.h index ab79a4f62c82e..da1a5c534aa0e 100644 --- a/xla/backends/interpreter/executable_base.h +++ b/xla/backends/interpreter/executable_base.h @@ -27,7 +27,6 @@ limitations under the License. #include "xla/literal.h" #include "xla/service/dynamic_dimension_inference.h" #include "xla/service/executable.h" -#include "xla/service/hlo_execution_profile.h" #include "xla/service/service_executable_run_options.h" #include "xla/shape.h" #include "xla/stream_executor/device_memory_allocator.h" @@ -44,8 +43,7 @@ class InterpreterExecutableBase : public Executable { absl::StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) override; + std::vector arguments) override; protected: virtual absl::StatusOr Evaluate( diff --git a/xla/service/cpu/BUILD b/xla/service/cpu/BUILD index 8b08ca1927da8..9fd73467bd09b 100644 --- a/xla/service/cpu/BUILD +++ b/xla/service/cpu/BUILD @@ -609,7 +609,6 @@ cc_library( "//xla/service:custom_call_status", "//xla/service:custom_call_status_internal", "//xla/service:executable", - "//xla/service:hlo_execution_profile", "//xla/service:hlo_profile_printer_data_cc", "//xla/service:hlo_value", "//xla/service:maybe_owning_device_memory", diff --git a/xla/service/cpu/cpu_executable.cc b/xla/service/cpu/cpu_executable.cc index 34652a016b28e..ddc96e141dfa2 100644 --- a/xla/service/cpu/cpu_executable.cc +++ b/xla/service/cpu/cpu_executable.cc @@ -58,7 +58,6 @@ limitations under the License. #include "xla/service/custom_call_status.h" #include "xla/service/custom_call_status_internal.h" #include "xla/service/executable.h" -#include "xla/service/hlo_execution_profile.h" #include "xla/service/hlo_value.h" #include "xla/service/maybe_owning_device_memory.h" #include "xla/service/service_executable_run_options.h" @@ -290,17 +289,11 @@ CpuExecutable::CreateBufferTable(se::DeviceMemoryAllocator* memory_allocator, absl::Status CpuExecutable::ExecuteComputeFunction( const ExecutableRunOptions* run_options, - absl::Span buffers, - HloExecutionProfile* hlo_execution_profile) { + absl::Span buffers) { uint64_t start_micros = tsl::Env::Default()->NowMicros(); - size_t profile_counters_size = - hlo_execution_profile ? hlo_execution_profile->profile_counters().size() - : 0; - int64_t* profile_counters = - hlo_execution_profile - ? hlo_execution_profile->mutable_profile_counters()->data() - : nullptr; + size_t profile_counters_size = 0; + int64_t* profile_counters = nullptr; // Call the computation function following the calling convention. See the // definition of 'ComputeFunctionType' for the details of the calling @@ -329,12 +322,6 @@ absl::Status CpuExecutable::ExecuteComputeFunction( const double nanoseconds = (end_micros - start_micros) * 1000.0; run_options->execution_profile()->set_compute_time_ns( std::max(nanoseconds, 1.0)); - // If hlo profiling was disabled then the cycle count is left empty. - if (hlo_execution_profile) { - run_options->execution_profile()->set_compute_cycle_count( - hlo_execution_profile->total_cycles_executed( - *module().entry_computation())); - } } }; @@ -356,17 +343,11 @@ absl::Status CpuExecutable::ExecuteComputeFunction( absl::Status CpuExecutable::ExecuteThunks( const ExecutableRunOptions* run_options, - absl::Span buffers, - HloExecutionProfile* hlo_execution_profile) { + absl::Span buffers) { uint64_t start_ns = tsl::Env::Default()->NowNanos(); - size_t profile_counters_size = - hlo_execution_profile ? hlo_execution_profile->profile_counters().size() - : 0; - int64_t* profile_counters = - hlo_execution_profile - ? hlo_execution_profile->mutable_profile_counters()->data() - : nullptr; + size_t profile_counters_size = 0; + int64_t* profile_counters = nullptr; BufferAllocations allocations(buffers); @@ -412,12 +393,6 @@ absl::Status CpuExecutable::ExecuteThunks( uint64_t end_ns = tsl::Env::Default()->NowNanos(); run_options->execution_profile()->set_compute_time_ns( std::max(end_ns - start_ns, 1)); - // If hlo profiling was disabled then the cycle count is left empty. - if (hlo_execution_profile) { - run_options->execution_profile()->set_compute_cycle_count( - hlo_execution_profile->total_cycles_executed( - *module().entry_computation())); - } } return ABSL_PREDICT_FALSE(executed_event.IsError()) @@ -527,8 +502,7 @@ absl::StatusOr CpuExecutable::CreateResultShapedBuffer( absl::StatusOr CpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) { + std::vector arguments) { if (GetRootValueSet().IsAmbiguous()) { return Unimplemented("Points-to set of root instruction is ambiguous"); } @@ -576,15 +550,14 @@ absl::StatusOr CpuExecutable::ExecuteAsyncOnStream( CpuExecutable* executable; ServiceExecutableRunOptions run_options; std::shared_ptr> task_buffers; - HloExecutionProfile* hlo_execution_profile; absl::Status operator()() { if (executable->has_compute_function()) { - return executable->ExecuteComputeFunction( - &run_options.run_options(), *task_buffers, hlo_execution_profile); + return executable->ExecuteComputeFunction(&run_options.run_options(), + *task_buffers); } else if (executable->has_thunks()) { return executable->ExecuteThunks(&run_options.run_options(), - *task_buffers, hlo_execution_profile); + *task_buffers); } else { return Internal("No compute function or thunks found."); } @@ -593,8 +566,7 @@ absl::StatusOr CpuExecutable::ExecuteAsyncOnStream( host_stream->EnqueueTaskWithStatus( AsyncRunTask{this, *run_options, std::make_shared>( - std::move(buffers)), - hlo_execution_profile}); + std::move(buffers))}); MarkToBeReleasedArguments(absl::MakeSpan(arguments), result); return std::move(result); diff --git a/xla/service/cpu/cpu_executable.h b/xla/service/cpu/cpu_executable.h index 102f15aa94908..592b1af45395b 100644 --- a/xla/service/cpu/cpu_executable.h +++ b/xla/service/cpu/cpu_executable.h @@ -39,7 +39,6 @@ limitations under the License. #include "xla/service/custom_call_status.h" #include "xla/service/custom_call_status_internal.h" #include "xla/service/executable.h" -#include "xla/service/hlo_execution_profile.h" #include "xla/service/hlo_profile_printer_data.pb.h" #include "xla/service/hlo_value.h" #include "xla/service/maybe_owning_device_memory.h" @@ -89,21 +88,18 @@ class CpuExecutable : public Executable { absl::StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) override; + std::vector arguments) override; // Calls the generated function performing the computation with the given // arguments using the supplied buffers. absl::Status ExecuteComputeFunction( const ExecutableRunOptions* run_options, - absl::Span buffers, - HloExecutionProfile* hlo_execution_profile); + absl::Span buffers); // Calls emitted thunk sequence with the given arguments using the supplied // buffers. absl::Status ExecuteThunks(const ExecutableRunOptions* run_options, - absl::Span buffers, - HloExecutionProfile* hlo_execution_profile); + absl::Span buffers); absl::Span obj_files() const { return obj_files_; } diff --git a/xla/service/executable.cc b/xla/service/executable.cc index 0c522d4f37b83..36231756281d2 100644 --- a/xla/service/executable.cc +++ b/xla/service/executable.cc @@ -16,13 +16,16 @@ limitations under the License. #include "xla/service/executable.h" #include +#include #include "absl/status/status.h" #include "absl/strings/str_format.h" +#include "absl/types/span.h" #include "xla/debug_options_flags.h" #include "xla/service/dump.h" #include "xla/service/hlo_graph_dumper.h" #include "xla/service/maybe_owning_device_memory.h" +#include "xla/service/shaped_buffer.h" #include "xla/status_macros.h" #include "xla/stream_executor/device_description.h" #include "tsl/platform/env.h" @@ -59,10 +62,9 @@ void ExecutionInput::SetUnownedBuffer(const ShapeIndex& index, absl::StatusOr Executable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - absl::Span arguments, - HloExecutionProfile* hlo_execution_profile) { + absl::Span arguments) { absl::StatusOr result = - ExecuteAsyncOnStream(run_options, arguments, hlo_execution_profile); + ExecuteAsyncOnStream(run_options, arguments); absl::Status blocking_status = run_options->stream()->BlockHostUntilDone(); TF_RETURN_IF_ERROR(result.status()); TF_RETURN_IF_ERROR(blocking_status); @@ -81,25 +83,22 @@ static ExecutionInput MakeMaybeOwningDeviceMemoryTree( absl::StatusOr Executable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - absl::Span arguments, - HloExecutionProfile* hlo_execution_profile) { + absl::Span arguments) { std::vector args; args.reserve(arguments.size()); for (const ShapedBuffer* arg : arguments) { args.emplace_back(MakeMaybeOwningDeviceMemoryTree(*arg)); } TF_ASSIGN_OR_RETURN(ExecutionOutput out, - ExecuteAsyncOnStream(run_options, std::move(args), - hlo_execution_profile)); + ExecuteAsyncOnStream(run_options, std::move(args))); return out.ConsumeResult(); } absl::StatusOr Executable::ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) { - absl::StatusOr result = ExecuteAsyncOnStream( - run_options, std::move(arguments), hlo_execution_profile); + std::vector arguments) { + absl::StatusOr result = + ExecuteAsyncOnStream(run_options, std::move(arguments)); absl::Status blocking_status = run_options->stream()->BlockHostUntilDone(); TF_RETURN_IF_ERROR(result.status()); TF_RETURN_IF_ERROR(blocking_status); @@ -116,8 +115,7 @@ absl::StatusOr> Executable::ExecuteOnStreams( if (run_options.size() == 1) { TF_ASSIGN_OR_RETURN(auto rv, - ExecuteOnStream(&run_options[0], arguments[0], - /*hlo_execution_profile=*/nullptr)); + ExecuteOnStream(&run_options[0], arguments[0])); return_values.push_back(std::move(rv)); return std::move(return_values); } @@ -126,9 +124,8 @@ absl::StatusOr> Executable::ExecuteOnStreams( // We cannot BlockHostUntilDone() on the already-launched executions in case // of error, since if the executions communicate, the initially launched // executions may never complete if not all executions are running. - TF_ASSIGN_OR_RETURN( - auto rv, ExecuteAsyncOnStream(&run_options[i], arguments[i], - /*hlo_execution_profile=*/nullptr)); + TF_ASSIGN_OR_RETURN(auto rv, + ExecuteAsyncOnStream(&run_options[i], arguments[i])); return_values.push_back(std::move(rv)); } for (const auto& options : run_options) { @@ -218,7 +215,7 @@ absl::StatusOr Executable::ExecuteAsyncOnStreamWrapper( absl::Span arguments) { auto state = ExecuteWrapperBeforeExecution(*this, run_options); absl::StatusOr return_value = - ExecuteAsyncOnStream(run_options, arguments, nullptr); + ExecuteAsyncOnStream(run_options, arguments); TF_RETURN_IF_ERROR(ExecuteWrapperAfterExecution( this, state, return_value.status(), run_options->stream())); return return_value; @@ -229,7 +226,7 @@ absl::StatusOr Executable::ExecuteAsyncOnStreamWrapper( std::vector arguments) { auto state = ExecuteWrapperBeforeExecution(*this, run_options); absl::StatusOr return_value = - ExecuteAsyncOnStream(run_options, std::move(arguments), nullptr); + ExecuteAsyncOnStream(run_options, std::move(arguments)); TF_RETURN_IF_ERROR(ExecuteWrapperAfterExecution( this, state, return_value.status(), run_options->stream())); return return_value; diff --git a/xla/service/executable.h b/xla/service/executable.h index e9c4abe32e1b8..12ee096986a05 100644 --- a/xla/service/executable.h +++ b/xla/service/executable.h @@ -260,14 +260,10 @@ class Executable { // Enqueues the compilation result on the provided stream, passing the given // arguments. This call is blocking and returns after the execution is done. // - // If the hlo_execution_profile is provided as non-nullptr, profiling will be - // enabled. - // // Returns a shaped buffer containing the result of the computation. absl::StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - absl::Span arguments, - HloExecutionProfile* hlo_execution_profile); + absl::Span arguments); // Starts the given program executing on the given stream/executor. // @@ -283,26 +279,19 @@ class Executable { // operations are enqueued for launch on the stream. Note that some // implementations may in fact block or may block in some circumstances (e.g., // when profiling); i.e., asynchronous is a "may" not a "must". - // - // If the hlo_execution_profile is provided as non-nullptr, profiling will be - // enabled. Note that profiling is tricky to use correctly, as the profiling - // objects (when they exist) must out-live the task. virtual absl::StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - absl::Span arguments, - HloExecutionProfile* hlo_execution_profile); + absl::Span arguments); // Same as ExecuteAsyncOnStream(), but blocks waiting for the computation to // complete. absl::StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile); + std::vector arguments); virtual absl::StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) = 0; + std::vector arguments) = 0; // Same as ExecuteOnStream(), but runs this executable on multiple // streams. arguments[i] contains the arguments to the execution on diff --git a/xla/service/executable_test.cc b/xla/service/executable_test.cc index 388b7be1bd44a..8c21dbe360351 100644 --- a/xla/service/executable_test.cc +++ b/xla/service/executable_test.cc @@ -43,8 +43,7 @@ class TestExecutable : public Executable { absl::StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) override { + std::vector arguments) override { return absl::UnimplementedError("Not needed for this test."); } }; diff --git a/xla/service/gpu/BUILD b/xla/service/gpu/BUILD index 705106de5fe22..422524336e794 100644 --- a/xla/service/gpu/BUILD +++ b/xla/service/gpu/BUILD @@ -576,7 +576,6 @@ cc_library( "//xla/hlo/ir:hlo", "//xla/service:buffer_assignment", "//xla/service:executable", - "//xla/service:hlo_execution_profile", "//xla/service:hlo_value", "//xla/service:maybe_owning_device_memory", "//xla/service:rendezvous", diff --git a/xla/service/gpu/gpu_executable.cc b/xla/service/gpu/gpu_executable.cc index 1de6f9106a01a..f71b46da140b9 100644 --- a/xla/service/gpu/gpu_executable.cc +++ b/xla/service/gpu/gpu_executable.cc @@ -55,7 +55,6 @@ limitations under the License. #include "xla/service/gpu/runtime/sequential_thunk.h" #include "xla/service/gpu/runtime/thunk.h" #include "xla/service/gpu/stream_executor_util.h" -#include "xla/service/hlo_execution_profile.h" #include "xla/service/hlo_value.h" #include "xla/service/maybe_owning_device_memory.h" #include "xla/service/rendezvous.h" @@ -782,15 +781,13 @@ absl::StatusOr GpuExecutable::GenerateBufferAllocations( absl::StatusOr GpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) { + std::vector arguments) { return ExecuteAsyncOnStreamImpl(run_options, absl::MakeSpan(arguments)); } absl::StatusOr GpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - absl::Span arguments, - HloExecutionProfile* hlo_execution_profile) { + absl::Span arguments) { TF_ASSIGN_OR_RETURN(ExecutionOutput out, ExecuteAsyncOnStreamImpl(run_options, arguments)); return out.ConsumeResult(); diff --git a/xla/service/gpu/gpu_executable.h b/xla/service/gpu/gpu_executable.h index 8103043560efb..fd2d51d18e674 100644 --- a/xla/service/gpu/gpu_executable.h +++ b/xla/service/gpu/gpu_executable.h @@ -40,7 +40,6 @@ limitations under the License. #include "xla/service/gpu/runtime/annotation.h" #include "xla/service/gpu/runtime/sequential_thunk.h" #include "xla/service/gpu/runtime/thunk.h" -#include "xla/service/hlo_execution_profile.h" #include "xla/service/service_executable_run_options.h" #include "xla/service/shaped_buffer.h" #include "xla/shape.h" @@ -131,13 +130,11 @@ class GpuExecutable : public Executable { // doesn't match the compute capability passed to this object's constructor. absl::StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) override; + std::vector arguments) override; absl::StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - absl::Span arguments, - HloExecutionProfile* hlo_execution_profile) override; + absl::Span arguments) override; using VariantArguments = std::variant, absl::Span>; diff --git a/xla/service/hlo_runner.cc b/xla/service/hlo_runner.cc index 16a3e4a0ac601..6d91c1cdda3be 100644 --- a/xla/service/hlo_runner.cc +++ b/xla/service/hlo_runner.cc @@ -546,8 +546,7 @@ absl::StatusOr> HloRunner::ExecuteReplicated( for (int64_t i = 0; i < options.num_replicas; ++i) { pool.Schedule([&, i] { auto result = executable->ExecuteOnStream( - &service_run_options[i], argument_buffer_slices[i], - nullptr); + &service_run_options[i], argument_buffer_slices[i]); absl::MutexLock lock(&mutex); thread_results[i] = std::move(result); }); @@ -605,7 +604,7 @@ absl::StatusOr> HloRunner::ExecuteReplicated( } pool.Schedule([&, i] { auto result = executable_provider(i)->ExecuteOnStream( - &service_run_options[i], argument_buffer_slices[i], nullptr); + &service_run_options[i], argument_buffer_slices[i]); absl::MutexLock lock(&mutex); thread_results[i] = std::move(result); }); diff --git a/xla/service/hlo_runner_pjrt.cc b/xla/service/hlo_runner_pjrt.cc index 6406e6269854d..3ad883a28f5f6 100644 --- a/xla/service/hlo_runner_pjrt.cc +++ b/xla/service/hlo_runner_pjrt.cc @@ -122,8 +122,7 @@ class PjRtWrappedExecutable : public Executable { absl::StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) override; + std::vector arguments) override; PjRtLoadedExecutable* GetPjRtLoadedExecutable() const { return pjrt_loaded_executable_.get(); @@ -135,8 +134,7 @@ class PjRtWrappedExecutable : public Executable { absl::StatusOr PjRtWrappedExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) { + std::vector arguments) { return Unimplemented( "PjRtWrappedExecutable: Unimplemented ExecuteAsyncOnStream"); } diff --git a/xla/service/service.cc b/xla/service/service.cc index 34f3f4e82314d..2f22259a9d4f9 100644 --- a/xla/service/service.cc +++ b/xla/service/service.cc @@ -404,8 +404,7 @@ Service::ExecuteParallelAndRegisterResult( // Asynchronously launch the computation. TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, executables[i]->ExecuteAsyncOnStream( - &run_options, arguments[i][replica], - /*hlo_execution_profile=*/nullptr)); + &run_options, arguments[i][replica])); result_buffers.push_back(std::move(result)); } diff --git a/xla/stream_executor/tpu/BUILD b/xla/stream_executor/tpu/BUILD index 2e4c565c7ed15..dd5959ec56ac0 100644 --- a/xla/stream_executor/tpu/BUILD +++ b/xla/stream_executor/tpu/BUILD @@ -599,7 +599,6 @@ cc_library( "//xla/hlo/ir:hlo", "//xla/service:compiler", "//xla/service:executable", - "//xla/service:hlo_execution_profile", "//xla/service:maybe_owning_device_memory", "//xla/service:shaped_buffer", "//xla/service:transfer_manager", @@ -636,7 +635,6 @@ cc_library( "//xla:xla_data_proto_cc", "//xla/hlo/ir:hlo", "//xla/service:executable", - "//xla/service:hlo_execution_profile", "//xla/service:shaped_buffer", "//xla/stream_executor:device_memory", "//xla/stream_executor:stream", diff --git a/xla/stream_executor/tpu/tpu_executable.cc b/xla/stream_executor/tpu/tpu_executable.cc index cf70be04961c1..e91f1d279138a 100644 --- a/xla/stream_executor/tpu/tpu_executable.cc +++ b/xla/stream_executor/tpu/tpu_executable.cc @@ -27,7 +27,6 @@ limitations under the License. #include "absl/strings/string_view.h" #include "xla/hlo/ir/hlo_module.h" #include "xla/service/executable.h" -#include "xla/service/hlo_execution_profile.h" #include "xla/service/service_executable_run_options.h" #include "xla/service/shaped_buffer.h" #include "xla/stream_executor/stream.h" @@ -92,8 +91,7 @@ TpuExecutable::~TpuExecutable() { absl::StatusOr TpuExecutable::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) { + std::vector arguments) { SE_ExecutableRunOptions se_run_options = ApiConverter::ToC(*run_options); SE_ExecutionInput** se_args = new SE_ExecutionInput*[arguments.size()]; for (int i = 0; i < arguments.size(); ++i) { @@ -126,7 +124,7 @@ absl::StatusOr TpuExecutable::ExecuteAsyncOnStream( SE_ExecutionOutput se_execution_output; StatusHelper status; ExecutorApiFn()->TpuExecutable_ExecuteAsyncOnStreamFn( - se_executable_, &se_run_options, se_args, arguments.size(), nullptr, + se_executable_, &se_run_options, se_args, arguments.size(), &se_execution_output, status.c_status); if (se_run_options.device_assignment.bytes != nullptr) { diff --git a/xla/stream_executor/tpu/tpu_executable.h b/xla/stream_executor/tpu/tpu_executable.h index c5b639e9bf7c2..756876bbb50e4 100644 --- a/xla/stream_executor/tpu/tpu_executable.h +++ b/xla/stream_executor/tpu/tpu_executable.h @@ -29,7 +29,6 @@ limitations under the License. #include "absl/types/span.h" #include "xla/hlo/ir/hlo_module.h" #include "xla/service/executable.h" -#include "xla/service/hlo_execution_profile.h" #include "xla/service/service_executable_run_options.h" #include "xla/stream_executor/device_memory.h" #include "xla/stream_executor/tpu/c_api_decl.h" @@ -49,8 +48,7 @@ class TpuExecutable : public xla::TpuExecutableInterface { absl::StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) override; + std::vector arguments) override; absl::string_view fingerprint() const override; diff --git a/xla/stream_executor/tpu/tpu_executable_interface.cc b/xla/stream_executor/tpu/tpu_executable_interface.cc index 3912814786d80..5fef42035f58d 100644 --- a/xla/stream_executor/tpu/tpu_executable_interface.cc +++ b/xla/stream_executor/tpu/tpu_executable_interface.cc @@ -30,7 +30,6 @@ limitations under the License. #include "xla/layout_util.h" #include "xla/service/compiler.h" #include "xla/service/executable.h" -#include "xla/service/hlo_execution_profile.h" #include "xla/service/maybe_owning_device_memory.h" #include "xla/service/service_executable_run_options.h" #include "xla/service/shaped_buffer.h" @@ -209,8 +208,7 @@ TpuExecutableInterface::AllocateOutputMemoryWithInputReuse( absl::StatusOr TpuExecutableInterface::ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* /*hlo_execution_profile*/) { + std::vector arguments) { std::vector memory_bases; memory_bases.reserve(arguments.size()); for (auto& argument : arguments) { diff --git a/xla/stream_executor/tpu/tpu_executable_interface.h b/xla/stream_executor/tpu/tpu_executable_interface.h index ce9555a1fedf3..5ed03e63a88d2 100644 --- a/xla/stream_executor/tpu/tpu_executable_interface.h +++ b/xla/stream_executor/tpu/tpu_executable_interface.h @@ -28,7 +28,6 @@ limitations under the License. #include "xla/hlo/ir/hlo_input_output_alias_config.h" #include "xla/hlo/ir/hlo_module.h" #include "xla/service/executable.h" -#include "xla/service/hlo_execution_profile.h" #include "xla/service/service_executable_run_options.h" #include "xla/shape.h" #include "xla/stream_executor/device_memory.h" @@ -46,8 +45,7 @@ class TpuExecutableInterface : public Executable { absl::StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, - std::vector arguments, - HloExecutionProfile* hlo_execution_profile) override; + std::vector arguments) override; // Same as AllocateOutputMemory, except that input buffers can be reused // as output buffers. See UserBufferAlias class comment for more details on diff --git a/xla/stream_executor/tpu/tpu_executor_c_api.h b/xla/stream_executor/tpu/tpu_executor_c_api.h index a415204e85f59..3b13c28cf109f 100644 --- a/xla/stream_executor/tpu/tpu_executor_c_api.h +++ b/xla/stream_executor/tpu/tpu_executor_c_api.h @@ -289,7 +289,6 @@ TFTPU_CAPI_EXPORT void TpuCompiler_DefaultDeviceShapeRepresentation( TFTPU_CAPI_EXPORT void TpuExecutable_ExecuteAsyncOnStream( SE_Executable* executable, SE_ExecutableRunOptions* se_options, SE_ExecutionInput** se_arguments, int se_arguments_size, - SE_HloExecutionProfile* hlo_execution_profile, SE_ExecutionOutput* se_output, TF_Status* status); // This frees the XLA_ShapeIndex* array allocated when se_output is returned by diff --git a/xla/tests/BUILD b/xla/tests/BUILD index 01fdbfe957b0f..8eeeeb73a2764 100644 --- a/xla/tests/BUILD +++ b/xla/tests/BUILD @@ -592,37 +592,6 @@ xla_test( ], ) -xla_test( - name = "xla_hlo_profile_test", - srcs = ["xla_hlo_profile_test.cc"], - backends = [ - # Hlo profiles are only supported on CPU/GPU. - "cpu", - "gpu", - ], - tags = ["not_run:arm"], - deps = [ - ":client_library_test_base", - ":test_macros_header", - ":test_utils", - "//xla:array2d", - "//xla:shape_util", - "//xla:util", - "//xla/client:local_client", - "//xla/hlo/builder:xla_builder", - "//xla/hlo/builder:xla_computation", - "//xla/service:platform_util", - "//xla/service:stream_pool", - "//xla/tsl/lib/core:status_test_util", - "@com_google_absl//absl/algorithm:container", - "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/status", - "@com_google_absl//absl/strings", - "@tsl//tsl/platform:regexp", - "@tsl//tsl/platform:test", - ], -) - xla_test( name = "axpy_simple_test", srcs = ["axpy_simple_test.cc"], diff --git a/xla/tests/buffer_donation_test.cc b/xla/tests/buffer_donation_test.cc index 35d9c64884689..666ebb6dd411c 100644 --- a/xla/tests/buffer_donation_test.cc +++ b/xla/tests/buffer_donation_test.cc @@ -126,8 +126,7 @@ class BufferDonationTest : public HloTestBase { } absl::StatusOr output_status = - executable->ExecuteAsyncOnStream(&service_run_options, std::move(args), - /*hlo_execution_profile=*/nullptr); + executable->ExecuteAsyncOnStream(&service_run_options, std::move(args)); if (!expected_failure.empty()) { ASSERT_FALSE(output_status.ok()); ASSERT_TRUE( diff --git a/xla/tests/xla_hlo_profile_test.cc b/xla/tests/xla_hlo_profile_test.cc deleted file mode 100644 index 8013f2d8be904..0000000000000 --- a/xla/tests/xla_hlo_profile_test.cc +++ /dev/null @@ -1,411 +0,0 @@ -/* Copyright 2017 The OpenXLA Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include -#include -#include - -#include "absl/algorithm/container.h" -#include "absl/container/flat_hash_map.h" -#include "absl/status/status.h" -#include "absl/strings/match.h" -#include "absl/strings/str_cat.h" -#include "absl/strings/str_split.h" -#include "xla/array2d.h" -#include "xla/client/local_client.h" -#include "xla/hlo/builder/xla_builder.h" -#include "xla/hlo/builder/xla_computation.h" -#include "xla/map_util.h" -#include "xla/service/platform_util.h" -#include "xla/service/stream_pool.h" -#include "xla/shape_util.h" -#include "xla/tests/client_library_test_base.h" -#include "xla/tests/test_macros.h" -#include "xla/tests/test_utils.h" -#include "xla/tsl/lib/core/status_test_util.h" -#include "tsl/platform/regexp.h" -#include "tsl/platform/test.h" - -namespace xla { -namespace { - -class HloProfileTest : public ClientLibraryTestBase {}; - -struct ParsedProfileOutputLine { - int64_t cycles; - std::string cycles_percentage; - double usec; - std::string flops; - std::string trops; - std::string bytes_per_sec; - std::string bytes_per_cycle; - std::string opcode; -}; - -::testing::AssertionResult HasFlops( - const ParsedProfileOutputLine& parsed_line) { - if (RE2::FullMatch(parsed_line.flops, "[0-9.TGMk]+FLOP/s")) { - return ::testing::AssertionSuccess() - << "'flops' field present in " << parsed_line.opcode << ": '" - << parsed_line.flops << "'"; - } - - return ::testing::AssertionFailure() - << "'flops' field absent in " << parsed_line.opcode << ": '" - << parsed_line.flops << "'"; -} - -::testing::AssertionResult HasTrops( - const ParsedProfileOutputLine& parsed_line) { - if (RE2::FullMatch(parsed_line.trops, "[0-9.TGMk]+TROP/s")) { - return ::testing::AssertionSuccess() - << "'trops' field present in " << parsed_line.opcode << ": '" - << parsed_line.trops << "'"; - } - - return ::testing::AssertionFailure() - << "'trops' field absent in " << parsed_line.opcode << ": '" - << parsed_line.trops << "'"; -} - -absl::Status ParseOneProfileOutputLine( - const std::string& line, bool expect_hlo, - absl::flat_hash_map* parsed_results, - absl::Span opcodes_to_ignore = {}) { - std::string separator = "[^:]*:: +"; - std::string match_percentage = R"(\d+\.\d*% +\d+Σ)"; - std::string match_cycles = - R"((\d+) cycles +\( *()" + match_percentage + R"()\))"; - std::string match_usecs = "([0-9.]+) usec"; - std::string match_flops = "([^ ]*)"; - std::string match_trops = "([^ ]*)"; - std::string match_bytes_per_sec = "([0-9.TGMKi]*)(?:B/s)?"; - std::string match_bytes_per_cycle = "([0-9.TGMKi]*)(?:B/cycle)?"; - - // The underlined part is what we're trying to match with match_opcode: - // - // %dot33 = f32[256,256]{1,0} dot(...) - // ^^^ - - std::string match_opcode = expect_hlo ? "%[^=]+= [^ ]+ ([^(]+)\\(.*" - : "(\\[total\\])( \\[entry\\])?"; - std::string regexp_pattern = absl::StrCat( - " +", match_cycles, separator, match_usecs, separator, match_flops, - separator, match_trops, separator, match_bytes_per_sec, separator, - match_bytes_per_cycle, separator, match_opcode); - - ParsedProfileOutputLine parsed_line; - bool matched = RE2::FullMatch( - line, regexp_pattern, &parsed_line.cycles, &parsed_line.cycles_percentage, - &parsed_line.usec, &parsed_line.flops, &parsed_line.trops, - &parsed_line.bytes_per_sec, &parsed_line.bytes_per_cycle, - &parsed_line.opcode); - if (!matched) { - return tsl::errors::InvalidArgument("Input did not match regexp. Input: ", - line, ", Regexp: ", regexp_pattern); - } - - if (!absl::c_linear_search(opcodes_to_ignore, parsed_line.opcode)) { - InsertOrDie(parsed_results, parsed_line.opcode, parsed_line); - } - - return absl::OkStatus(); -} - -bool IsExtraMetricProfileOutputLine(const std::string& line) { - return RE2::FullMatch(line, "Extra metric \\S+: \\d+"); -} - -// Returns void so that we can ASSERT. -void ExecuteAndFetchProfile(std::string* profile_output, LocalClient* client, - const XlaComputation& computation, - const Shape& lhs_arg_shape, - const Shape& rhs_arg_shape) { - LocalService* service = ClientLibrary::GetXlaService(client->platform()); - Backend* backend = service->mutable_backend(); - se::StreamExecutor* executor = backend->default_stream_executor(); - se::DeviceMemoryAllocator* allocator = backend->memory_allocator(); - auto* transfer_manager = backend->transfer_manager(); - TF_ASSERT_OK_AND_ASSIGN( - StreamPool::Ptr stream_ptr, - backend->BorrowStream(backend->default_device_ordinal())); - - TF_ASSERT_OK_AND_ASSIGN( - ScopedShapedBuffer lhs_arg, - transfer_manager->AllocateScopedShapedBuffer( - lhs_arg_shape, allocator, backend->default_device_ordinal())); - TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( - stream_ptr.get(), Literal::CreateFromShape(lhs_arg_shape), lhs_arg)); - - TF_ASSERT_OK_AND_ASSIGN( - ScopedShapedBuffer rhs_arg, - transfer_manager->AllocateScopedShapedBuffer( - rhs_arg_shape, allocator, backend->default_device_ordinal())); - TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice( - stream_ptr.get(), Literal::CreateFromShape(rhs_arg_shape), rhs_arg)); - - ExecutableBuildOptions build_options; - build_options.mutable_debug_options()->set_xla_hlo_profile(true); - TF_ASSERT_OK_AND_ASSIGN( - auto local_executables, - client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape}, - build_options)); - - Executable* executable = local_executables[0]->executable(); - HloExecutionProfile hlo_execution_profile( - &executable->hlo_profile_printer_data(), - &executable->hlo_profile_index_map()); - - ExecutableRunOptions exec_run_options; - exec_run_options.set_stream(stream_ptr.get()); - exec_run_options.set_allocator(backend->memory_allocator()); - exec_run_options.set_intra_op_thread_pool( - backend->eigen_intra_op_thread_pool_device()); - ServiceExecutableRunOptions run_options(exec_run_options, - /*borrow_stream=*/nullptr); - std::vector args = {&lhs_arg, &rhs_arg}; - TF_ASSERT_OK_AND_ASSIGN( - auto execution_result, - executable->ExecuteOnStream(&run_options, args, &hlo_execution_profile)); - TF_ASSERT_OK(stream_ptr->BlockHostUntilDone()); - (void)execution_result; - - *profile_output = hlo_execution_profile.ToString( - executor->GetDeviceDescription().clock_rate_ghz()); - - XLA_VLOG_LINES(4, *profile_output); -} - -XLA_TEST_F(HloProfileTest, - DISABLED_ON_CPU(DISABLED_ON_GPU(ProfileSingleComputation))) { - const int64_t m = 32, k = 32, n = 32; - Shape lhs_shape = ShapeUtil::MakeShape(F32, {m, k}); - Shape rhs_shape = ShapeUtil::MakeShape(F32, {m, k}); - - TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform, - PlatformUtil::GetDefaultPlatform()); - TF_ASSERT_OK_AND_ASSIGN(LocalClient * client, - ClientLibrary::GetOrCreateLocalClient(platform)); - - XlaBuilder builder(TestName()); - Tanh(Add( - Parameter(&builder, 0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"), - Parameter(&builder, 1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs"))); - - TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build()); - - std::string profile_output; - ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape, - rhs_shape); - VLOG(4) << "Profile Output:\n" << profile_output; - std::vector profile_output_lines = - absl::StrSplit(profile_output, '\n'); - - absl::flat_hash_map - parsed_profile_lines; - - int line_no = 0; - - // Skip extra metrics. - while (IsExtraMetricProfileOutputLine(profile_output_lines[line_no])) { - line_no++; - } - - line_no++; // Skip 'Execution profile for ....' - - ASSERT_LT(line_no, profile_output_lines.size()); - TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++], - /*expect_hlo=*/false, - &parsed_profile_lines)); - - ASSERT_LT(line_no, profile_output_lines.size()); - TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++], - /*expect_hlo=*/true, - &parsed_profile_lines)); - - ASSERT_LT(line_no, profile_output_lines.size()); - TF_ASSERT_OK(ParseOneProfileOutputLine(profile_output_lines[line_no++], - /*expect_hlo=*/true, - &parsed_profile_lines)); - - TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_profile, - MaybeFind(parsed_profile_lines, "[total]")); - TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine dot_profile, - MaybeFind(parsed_profile_lines, "add")); - TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine tanh_profile, - MaybeFind(parsed_profile_lines, "tanh")); - - EXPECT_GT(total_profile.cycles, 0); - EXPECT_EQ(total_profile.cycles_percentage, "100.% 100Σ"); - - EXPECT_TRUE(HasFlops(total_profile)); - EXPECT_TRUE(HasTrops(total_profile)); - - EXPECT_GT(total_profile.cycles, dot_profile.cycles); - EXPECT_NE(dot_profile.cycles_percentage, "0.00%"); - EXPECT_NE(dot_profile.cycles_percentage, "100.00%"); - - EXPECT_TRUE(HasFlops(dot_profile)); - EXPECT_FALSE(HasTrops(dot_profile)); - - EXPECT_GT(total_profile.cycles, tanh_profile.cycles); - EXPECT_NE(tanh_profile.cycles_percentage, "0.00%"); - EXPECT_NE(tanh_profile.cycles_percentage, "100.00%"); - - EXPECT_FALSE(HasFlops(tanh_profile)); - EXPECT_TRUE(HasTrops(tanh_profile)); -} - -XLA_TEST_F(HloProfileTest, - DISABLED_ON_CPU(DISABLED_ON_GPU(ProfileWhileComputation))) { - const int64_t size = 32; - Shape matrix_shape = ShapeUtil::MakeShape(F32, {size, size}); - Shape while_result_shape = - ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {}), matrix_shape}); - - TF_ASSERT_OK_AND_ASSIGN(se::Platform * platform, - PlatformUtil::GetDefaultPlatform()); - TF_ASSERT_OK_AND_ASSIGN(LocalClient * client, - ClientLibrary::GetOrCreateLocalClient(platform)); - - XlaComputation condition; - { - XlaBuilder builder("condition"); - auto state = Parameter(&builder, 0, while_result_shape, "state"); - auto iteration = GetTupleElement(state, 0); - Gt(ConstantR0(&builder, 5), iteration); - TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build()); - } - - XlaComputation body; - { - XlaBuilder builder("body"); - auto state = Parameter(&builder, 0, while_result_shape, "state"); - auto matrix = GetTupleElement(state, 1); - auto next_iteration = - Add(GetTupleElement(state, 0), ConstantR0(&builder, 1)); - Tuple(&builder, {next_iteration, Mul(matrix, matrix)}); - TF_ASSERT_OK_AND_ASSIGN(body, builder.Build()); - } - - XlaBuilder builder(TestName()); - auto initial_while_state = - Tuple(&builder, {ConstantR0(&builder, 0), - Parameter(&builder, 0, matrix_shape, "initial_value")}); - auto while_result = While(condition, body, initial_while_state); - Add(GetTupleElement(while_result, 1), - Parameter(&builder, 1, matrix_shape, "other_value")); - - TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build()); - - std::string profile_output; - ExecuteAndFetchProfile(&profile_output, client, computation, matrix_shape, - matrix_shape); - SCOPED_TRACE(profile_output); - - std::vector profile_output_lines = - absl::StrSplit(profile_output, '\n'); - - auto while_body_profile_start = - absl::c_find_if(profile_output_lines, [](absl::string_view s) { - return absl::StartsWith(s, "Execution profile for body"); - }); - - ASSERT_NE(while_body_profile_start, profile_output_lines.cend()); - - auto while_body_profile_end = - std::find_if(while_body_profile_start, profile_output_lines.end(), - [](absl::string_view s) { - return absl::StartsWith(s, "********** microseconds "); - }); - - // We emit a blank line before the "microseconds report" line. - while_body_profile_end--; - - ASSERT_NE(while_body_profile_end, profile_output_lines.end()); - - absl::flat_hash_map - parsed_profile_lines; - - for (auto while_body_profile_i = while_body_profile_start + 1; - while_body_profile_i != while_body_profile_end; while_body_profile_i++) { - // There are multiple "get-tuple-element" instructions in the while body so - // we ignore them -- we don't want parsed_profile_lines to be a multi-map. - TF_ASSERT_OK(ParseOneProfileOutputLine( - *while_body_profile_i, - /*expect_hlo=*/while_body_profile_i != (while_body_profile_start + 1), - &parsed_profile_lines, {"get-tuple-element"})); - } - - TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine total_while_body_profile, - MaybeFind(parsed_profile_lines, "[total]")); - TF_ASSERT_OK_AND_ASSIGN(ParsedProfileOutputLine multiply_profile, - MaybeFind(parsed_profile_lines, "multiply")); - - EXPECT_GT(total_while_body_profile.cycles, 0); - EXPECT_EQ(total_while_body_profile.opcode, "[total]"); - EXPECT_EQ(total_while_body_profile.cycles_percentage, "100.% 100Σ"); - - EXPECT_GT(total_while_body_profile.cycles, multiply_profile.cycles); - EXPECT_NE(multiply_profile.cycles_percentage, "0.00%"); - EXPECT_NE(multiply_profile.cycles_percentage, "100.00%"); -} -} // namespace -} // namespace xla - -static std::pair AddXlaHloProfileFlag(int argc, char** argv) { - char** new_argv = new char*[argc + 2]; - for (int i = 0; i < argc; i++) { - new_argv[i] = argv[i]; - } - - // We do it this way (as opposed to piping in a modified DebugOptions - // instance) for better end-to-end integration testing. - new_argv[argc] = strdup("--xla_hlo_profile"); - - // Fusion can change the Hlo instructions that show up in the final Hlo - // executable, so block it here. Also block the WhileLoopInvariantCodeMotion - // pass, otherwise a while loop is transformed and we could not match the - // original name in the ProfileWhileComputation test. - new_argv[argc + 1] = strdup( - "--xla_disable_hlo_passes=fusion,fusion_merger,multi_output_fusion," - "while-loop-invariant-code-motion"); - return {argc + 2, new_argv}; -} - -GTEST_API_ int main(int argc, char** argv) { - std::vector flag_list; - xla::AppendDebugOptionsFlags(&flag_list); - std::tie(argc, argv) = AddXlaHloProfileFlag(argc, argv); - std::unique_ptr argv_ptr(argv); - char* to_be_freed[] = {argv[argc - 1], argv[argc - 2]}; - - auto usage = tsl::Flags::Usage(argv[0], flag_list); - const bool parseResult = tsl::Flags::Parse(&argc, argv, flag_list); - for (auto p : to_be_freed) { - free(p); - } - if (!parseResult) { - LOG(ERROR) << "\n" << usage; - return 2; - } - - testing::InitGoogleTest(&argc, argv); - if (argc > 1) { - LOG(ERROR) << "Unknown argument " << argv[1] << "\n" << usage; - return 2; - } - return RUN_ALL_TESTS(); -}