diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 43f6f90..50e0834 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +add_subdirectory(argmax) add_subdirectory(compute) add_subdirectory(convolution) add_subdirectory(matmul) diff --git a/benchmarks/argmax/CMakeLists.txt b/benchmarks/argmax/CMakeLists.txt new file mode 100644 index 0000000..e11e9eb --- /dev/null +++ b/benchmarks/argmax/CMakeLists.txt @@ -0,0 +1,45 @@ +# Copyright 2024 Advanced Micro Devices Inc. +# Copyright 2020-2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +uvkc_glsl_shader_instance( + NAME + one_workgroup_argmax_loop_shader + SRC + "one_workgroup_argmax_loop.glsl" + GLSLC_ARGS + "--target-env=vulkan1.1" +) + +uvkc_glsl_shader_instance( + NAME + one_workgroup_argmax_subgroup_shader + SRC + "one_workgroup_argmax_subgroup.glsl" + GLSLC_ARGS + "--target-env=vulkan1.1" +) + +uvkc_cc_binary( + NAME + one_workgrop_argmax + SRCS + "one_workgroup_argmax_main.cc" + DEPS + ::one_workgroup_argmax_loop_shader + ::one_workgroup_argmax_subgroup_shader + benchmark::benchmark + uvkc::benchmark::core + uvkc::benchmark::main +) diff --git a/benchmarks/argmax/README.md b/benchmarks/argmax/README.md new file mode 100644 index 0000000..1be20c5 --- /dev/null +++ b/benchmarks/argmax/README.md @@ -0,0 +1,12 @@ +# Argmax Benchmarks + +This directory contains microbenchmarks for evaluating different strategy to +implement argmax. + +### `one_workgroup_argmax` + +Performs argmax using just one workgroup. The workgroup just contains one +subgroup. This approach does not use any synchronization mechanisms. + +A subgroup uses either a single thread to loop over all elements or subgroup +reduction operations involving all invocations. \ No newline at end of file diff --git a/benchmarks/argmax/one_workgroup_argmax_loop.glsl b/benchmarks/argmax/one_workgroup_argmax_loop.glsl new file mode 100644 index 0000000..9b75d2e --- /dev/null +++ b/benchmarks/argmax/one_workgroup_argmax_loop.glsl @@ -0,0 +1,30 @@ +# version 450 core +#extension GL_EXT_control_flow_attributes : enable + +layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; + +layout(set=0, binding=0) buffer InputBuffer { float data[]; } Input; +layout(set=0, binding=1) buffer OutputBuffer { uint data; } Output; + +layout(constant_id = 0) const uint totalCount = 1; // Total number of scalars + +// Each workgroup contains just one subgroup. + +void main() { + uint laneID = gl_LocalInvocationID.x; + + if (laneID != 0) return; + + uint wgResult = 0; + float wgMax = Input.data[0]; + + for (uint i = 1; i < totalCount; ++i) { + float elem = Input.data[i]; + if (elem > wgMax) { + wgResult = i; + wgMax = elem; + } + } + + Output.data = wgResult; +} diff --git a/benchmarks/argmax/one_workgroup_argmax_main.cc b/benchmarks/argmax/one_workgroup_argmax_main.cc new file mode 100644 index 0000000..35cc378 --- /dev/null +++ b/benchmarks/argmax/one_workgroup_argmax_main.cc @@ -0,0 +1,282 @@ +// Copyright 2024 Advanced Micro Devices Inc. +// Copyright 2020-2024 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" +#include "benchmark/benchmark.h" +#include "uvkc/benchmark/data_type_util.h" +#include "uvkc/benchmark/main.h" +#include "uvkc/benchmark/status_util.h" +#include "uvkc/benchmark/vulkan_buffer_util.h" +#include "uvkc/benchmark/vulkan_context.h" +#include "uvkc/vulkan/device.h" +#include "uvkc/vulkan/pipeline.h" + +using ::uvkc::benchmark::LatencyMeasureMode; +using ::uvkc::vulkan::Pipeline; + +static const char kBenchmarkName[] = "one_workgroup_argmax"; + +static const uint32_t kLoopShader[] = { +#include "one_workgroup_argmax_loop_shader_spirv_instance.inc" +}; + +static const uint32_t kSubgroupShader[] = { +#include "one_workgroup_argmax_subgroup_shader_spirv_instance.inc" +}; + +struct ShaderCode { + const char *name; // Test case name + const uint32_t *code; // SPIR-V code + size_t code_num_bytes; // Number of bytes for SPIR-V code + int workgroup_size; // Number of invocations per workgroup +}; + +ShaderCode kShaders[] = { + {"loop", kLoopShader, sizeof(kLoopShader), 32}, + {"subgroup", kSubgroupShader, sizeof(kSubgroupShader), 32}, +}; + +static void Argmax(::benchmark::State &state, ::uvkc::vulkan::Device *device, + const ::uvkc::benchmark::LatencyMeasure *latency_measure, + const uint32_t *code, size_t code_num_words, + size_t total_elements, int workgroup_size) { + //===-------------------------------------------------------------------===/ + // Create shader module, pipeline, and descriptor sets + //===-------------------------------------------------------------------===/ + + BM_CHECK_OK_AND_ASSIGN(auto shader_module, + device->CreateShaderModule(code, code_num_words)); + BM_CHECK_OK_AND_ASSIGN(auto descriptor_pool, + device->CreateDescriptorPool(*shader_module)); + BM_CHECK_OK_AND_ASSIGN(auto layout_set_map, + descriptor_pool->AllocateDescriptorSets( + shader_module->descriptor_set_layouts())); + + Pipeline::SpecConstant spec_constants[] = { + {/*id=*/0, Pipeline::SpecConstant::Type::u32, + static_cast(total_elements)}, + {/*id=*/1, Pipeline::SpecConstant::Type::u32, workgroup_size}, + }; + BM_CHECK_OK_AND_ASSIGN( + auto pipeline, device->CreatePipeline(*shader_module, "main", + absl::MakeSpan(spec_constants, 2))); + + //===-------------------------------------------------------------------===/ + // Create buffers + //===-------------------------------------------------------------------===/ + + const size_t src_buffer_size = total_elements * sizeof(float); + const size_t dst_buffer_size = sizeof(int); + + BM_CHECK_OK_AND_ASSIGN( + auto src_buffer, + device->CreateBuffer( + VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, src_buffer_size)); + BM_CHECK_OK_AND_ASSIGN( + auto dst_buffer, + device->CreateBuffer( + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, dst_buffer_size)); + + // Create a buffer for zeroing the destination buffer. + BM_CHECK_OK_AND_ASSIGN( + auto data_buffer, + device->CreateBuffer( + VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, + VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, dst_buffer_size)); + + //===-------------------------------------------------------------------===/ + // Set source buffer data + //===-------------------------------------------------------------------===/ + + auto generate_float_data = [](size_t i) { + return i == 550 ? float(i) : 1.0f; + }; + + BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer( + device, src_buffer.get(), src_buffer_size, + [&](void *ptr, size_t num_bytes) { + float *src_float_buffer = reinterpret_cast(ptr); + for (size_t i = 0; i < num_bytes / sizeof(float); i++) { + src_float_buffer[i] = generate_float_data(i); + } + })); + + //===-------------------------------------------------------------------===/ + // Dispatch + //===-------------------------------------------------------------------===/ + + std::vector<::uvkc::vulkan::Device::BoundBuffer> bound_buffers = { + {src_buffer.get(), /*set=*/0, /*binding=*/0}, + {dst_buffer.get(), /*set=*/0, /*binding=*/1}, + }; + BM_CHECK_OK(device->AttachBufferToDescriptor( + *shader_module, layout_set_map, + {bound_buffers.data(), bound_buffers.size()})); + + BM_CHECK_EQ(shader_module->descriptor_set_layouts().size(), 1) + << "unexpected number of descriptor sets"; + auto descriptor_set_layout = shader_module->descriptor_set_layouts().front(); + + std::vector<::uvkc::vulkan::CommandBuffer::BoundDescriptorSet> + bound_descriptor_sets(1); + bound_descriptor_sets[0].index = 0; + bound_descriptor_sets[0].set = layout_set_map.at(descriptor_set_layout); + BM_CHECK_OK_AND_ASSIGN(auto dispatch_cmdbuf, device->AllocateCommandBuffer()); + + BM_CHECK_OK(dispatch_cmdbuf->Begin()); + dispatch_cmdbuf->BindPipelineAndDescriptorSets( + *pipeline, {bound_descriptor_sets.data(), bound_descriptor_sets.size()}); + dispatch_cmdbuf->Dispatch(1, 1, 1); + BM_CHECK_OK(dispatch_cmdbuf->End()); + BM_CHECK_OK(device->QueueSubmitAndWait(*dispatch_cmdbuf)); + + //===-------------------------------------------------------------------===/ + // Verify destination buffer data + //===-------------------------------------------------------------------===/ + + BM_CHECK_OK(::uvkc::benchmark::GetDeviceBufferViaStagingBuffer( + device, dst_buffer.get(), dst_buffer_size, + [&](void *ptr, size_t num_bytes) { + int *dst_int_buffer = reinterpret_cast(ptr); + float max = -10000.f; + int idx = -1; + for (size_t i = 0; i < total_elements; i++) { + float data = generate_float_data(i); + if (data > max) { + idx = i; + max = data; + } + }; + BM_CHECK_EQ(dst_int_buffer[0], idx) + << "destination buffer element #0 has incorrect value: " + "expected to be " + << idx << " but found " << dst_int_buffer[0]; + })); + + //===-------------------------------------------------------------------===/ + // Benchmarking + //===-------------------------------------------------------------------===/ + + std::unique_ptr<::uvkc::vulkan::TimestampQueryPool> query_pool; + bool use_timestamp = + latency_measure->mode == LatencyMeasureMode::kGpuTimestamp; + if (use_timestamp) { + BM_CHECK_OK_AND_ASSIGN(query_pool, device->CreateTimestampQueryPool(2)); + } + + BM_CHECK_OK_AND_ASSIGN(auto cmdbuf, device->AllocateCommandBuffer()); + for (auto _ : state) { + BM_CHECK_OK(cmdbuf->Begin()); + if (use_timestamp) cmdbuf->ResetQueryPool(*query_pool); + + cmdbuf->BindPipelineAndDescriptorSets( + *pipeline, + {bound_descriptor_sets.data(), bound_descriptor_sets.size()}); + + if (use_timestamp) { + cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0); + } + + cmdbuf->Dispatch(1, 1, 1); + + if (use_timestamp) { + cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT, + 1); + } + + BM_CHECK_OK(cmdbuf->End()); + + auto start_time = std::chrono::high_resolution_clock::now(); + BM_CHECK_OK(device->QueueSubmitAndWait(*cmdbuf)); + auto end_time = std::chrono::high_resolution_clock::now(); + auto elapsed_seconds = + std::chrono::duration_cast>(end_time - + start_time); + + switch (latency_measure->mode) { + case LatencyMeasureMode::kSystemDispatch: { + state.SetIterationTime(elapsed_seconds.count() - + latency_measure->overhead_seconds); + } break; + case LatencyMeasureMode::kSystemSubmit: { + state.SetIterationTime(elapsed_seconds.count()); + } break; + case LatencyMeasureMode::kGpuTimestamp: { + BM_CHECK_OK_AND_ASSIGN( + double timestamp_seconds, + query_pool->CalculateElapsedSecondsBetween(0, 1)); + state.SetIterationTime(timestamp_seconds); + } break; + } + + BM_CHECK_OK(cmdbuf->Reset()); + } + + state.SetBytesProcessed(state.iterations() * src_buffer_size); + state.counters["FLOps"] = + ::benchmark::Counter(total_elements, + ::benchmark::Counter::kIsIterationInvariant | + ::benchmark::Counter::kIsRate, + ::benchmark::Counter::kIs1000); + + // Reset the command pool to release all command buffers in the benchmarking + // loop to avoid draining GPU resources. + BM_CHECK_OK(device->ResetCommandPool()); +} + +namespace uvkc { +namespace benchmark { + +absl::StatusOr> CreateVulkanContext() { + return CreateDefaultVulkanContext(kBenchmarkName); +} + +bool RegisterVulkanOverheadBenchmark( + const vulkan::Driver::PhysicalDeviceInfo &physical_device, + vulkan::Device *device, double *overhead_seconds) { + return false; +} + +void RegisterVulkanBenchmarks( + const vulkan::Driver::PhysicalDeviceInfo &physical_device, + vulkan::Device *device, const LatencyMeasure *latency_measure) { + const char *gpu_name = physical_device.v10_properties.deviceName; + + for (const auto &shader : kShaders) { + for (size_t total_elements : {1 << 10, 1 << 12, 1 << 14, 1 << 16}) { + std::string test_name = absl::StrCat( + gpu_name, "/#elements=", total_elements, + "/workgroup_size=", shader.workgroup_size, "/", shader.name); + ::benchmark::RegisterBenchmark(test_name.c_str(), Argmax, device, + latency_measure, shader.code, + shader.code_num_bytes / sizeof(uint32_t), + total_elements, shader.workgroup_size) + ->UseManualTime() + ->Unit(::benchmark::kMicrosecond); + } + } +} + +} // namespace benchmark +} // namespace uvkc diff --git a/benchmarks/argmax/one_workgroup_argmax_subgroup.glsl b/benchmarks/argmax/one_workgroup_argmax_subgroup.glsl new file mode 100644 index 0000000..b751821 --- /dev/null +++ b/benchmarks/argmax/one_workgroup_argmax_subgroup.glsl @@ -0,0 +1,44 @@ +# version 450 core +#extension GL_EXT_control_flow_attributes : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable +#extension GL_KHR_shader_subgroup_ballot : enable + +layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; + +layout(set=0, binding=0) buffer InputBuffer { float data[]; } Input; +layout(set=0, binding=1) buffer OutputBuffer { uint data; } Output; + +layout(constant_id = 0) const uint totalCount = 1; // Total number of scalars + +// Each workgroup contains just one subgroup. + +void main() { + uint laneID = gl_LocalInvocationID.x; + uint laneCount = gl_WorkGroupSize.x; + + uint laneResult = 0; + float laneMax = Input.data[laneID]; + + uint numBatches = totalCount / laneCount; + + for (uint i = 1; i < numBatches; ++i) { + uint idx = laneCount * i + laneID; + float elem = Input.data[idx]; + if (elem > laneMax) { + laneResult = idx; + laneMax = elem; + } + } + + // Find the max of workgroup (containing only one subgroup). + float wgMax = subgroupMax(laneMax); + + // Find the smallest thread ID with the max element. + bool bit = laneMax == wgMax; + uvec4 mask = subgroupBallot(bit); + uint smallestID = subgroupBallotFindLSB(mask); + + // The thread is responsible for outputing result. + if (laneID == smallestID) + Output.data = laneResult; +}