Skip to content

Commit

Permalink
Add a one workgroup argmax benchmark (#49)
Browse files Browse the repository at this point in the history
This PR is based on #47. I opened a new one because the old one got stale.
  • Loading branch information
angelz913 authored Aug 2, 2024
1 parent c9fe9ca commit 32f2f57
Show file tree
Hide file tree
Showing 6 changed files with 414 additions and 0 deletions.
1 change: 1 addition & 0 deletions benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

add_subdirectory(argmax)
add_subdirectory(compute)
add_subdirectory(convolution)
add_subdirectory(matmul)
Expand Down
45 changes: 45 additions & 0 deletions benchmarks/argmax/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Copyright 2024 Advanced Micro Devices Inc.
# Copyright 2020-2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

uvkc_glsl_shader_instance(
NAME
one_workgroup_argmax_loop_shader
SRC
"one_workgroup_argmax_loop.glsl"
GLSLC_ARGS
"--target-env=vulkan1.1"
)

uvkc_glsl_shader_instance(
NAME
one_workgroup_argmax_subgroup_shader
SRC
"one_workgroup_argmax_subgroup.glsl"
GLSLC_ARGS
"--target-env=vulkan1.1"
)

uvkc_cc_binary(
NAME
one_workgrop_argmax
SRCS
"one_workgroup_argmax_main.cc"
DEPS
::one_workgroup_argmax_loop_shader
::one_workgroup_argmax_subgroup_shader
benchmark::benchmark
uvkc::benchmark::core
uvkc::benchmark::main
)
12 changes: 12 additions & 0 deletions benchmarks/argmax/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Argmax Benchmarks

This directory contains microbenchmarks for evaluating different strategy to
implement argmax.

### `one_workgroup_argmax`

Performs argmax using just one workgroup. The workgroup just contains one
subgroup. This approach does not use any synchronization mechanisms.

A subgroup uses either a single thread to loop over all elements or subgroup
reduction operations involving all invocations.
30 changes: 30 additions & 0 deletions benchmarks/argmax/one_workgroup_argmax_loop.glsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# version 450 core
#extension GL_EXT_control_flow_attributes : enable

layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;

layout(set=0, binding=0) buffer InputBuffer { float data[]; } Input;
layout(set=0, binding=1) buffer OutputBuffer { uint data; } Output;

layout(constant_id = 0) const uint totalCount = 1; // Total number of scalars

// Each workgroup contains just one subgroup.

void main() {
uint laneID = gl_LocalInvocationID.x;

if (laneID != 0) return;

uint wgResult = 0;
float wgMax = Input.data[0];

for (uint i = 1; i < totalCount; ++i) {
float elem = Input.data[i];
if (elem > wgMax) {
wgResult = i;
wgMax = elem;
}
}

Output.data = wgResult;
}
282 changes: 282 additions & 0 deletions benchmarks/argmax/one_workgroup_argmax_main.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
// Copyright 2024 Advanced Micro Devices Inc.
// Copyright 2020-2024 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <chrono>
#include <memory>
#include <numeric>

#include "absl/strings/str_cat.h"
#include "absl/types/span.h"
#include "benchmark/benchmark.h"
#include "uvkc/benchmark/data_type_util.h"
#include "uvkc/benchmark/main.h"
#include "uvkc/benchmark/status_util.h"
#include "uvkc/benchmark/vulkan_buffer_util.h"
#include "uvkc/benchmark/vulkan_context.h"
#include "uvkc/vulkan/device.h"
#include "uvkc/vulkan/pipeline.h"

using ::uvkc::benchmark::LatencyMeasureMode;
using ::uvkc::vulkan::Pipeline;

static const char kBenchmarkName[] = "one_workgroup_argmax";

static const uint32_t kLoopShader[] = {
#include "one_workgroup_argmax_loop_shader_spirv_instance.inc"
};

static const uint32_t kSubgroupShader[] = {
#include "one_workgroup_argmax_subgroup_shader_spirv_instance.inc"
};

struct ShaderCode {
const char *name; // Test case name
const uint32_t *code; // SPIR-V code
size_t code_num_bytes; // Number of bytes for SPIR-V code
int workgroup_size; // Number of invocations per workgroup
};

ShaderCode kShaders[] = {
{"loop", kLoopShader, sizeof(kLoopShader), 32},
{"subgroup", kSubgroupShader, sizeof(kSubgroupShader), 32},
};

static void Argmax(::benchmark::State &state, ::uvkc::vulkan::Device *device,
const ::uvkc::benchmark::LatencyMeasure *latency_measure,
const uint32_t *code, size_t code_num_words,
size_t total_elements, int workgroup_size) {
//===-------------------------------------------------------------------===/
// Create shader module, pipeline, and descriptor sets
//===-------------------------------------------------------------------===/

BM_CHECK_OK_AND_ASSIGN(auto shader_module,
device->CreateShaderModule(code, code_num_words));
BM_CHECK_OK_AND_ASSIGN(auto descriptor_pool,
device->CreateDescriptorPool(*shader_module));
BM_CHECK_OK_AND_ASSIGN(auto layout_set_map,
descriptor_pool->AllocateDescriptorSets(
shader_module->descriptor_set_layouts()));

Pipeline::SpecConstant spec_constants[] = {
{/*id=*/0, Pipeline::SpecConstant::Type::u32,
static_cast<int32_t>(total_elements)},
{/*id=*/1, Pipeline::SpecConstant::Type::u32, workgroup_size},
};
BM_CHECK_OK_AND_ASSIGN(
auto pipeline, device->CreatePipeline(*shader_module, "main",
absl::MakeSpan(spec_constants, 2)));

//===-------------------------------------------------------------------===/
// Create buffers
//===-------------------------------------------------------------------===/

const size_t src_buffer_size = total_elements * sizeof(float);
const size_t dst_buffer_size = sizeof(int);

BM_CHECK_OK_AND_ASSIGN(
auto src_buffer,
device->CreateBuffer(
VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, src_buffer_size));
BM_CHECK_OK_AND_ASSIGN(
auto dst_buffer,
device->CreateBuffer(
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, dst_buffer_size));

// Create a buffer for zeroing the destination buffer.
BM_CHECK_OK_AND_ASSIGN(
auto data_buffer,
device->CreateBuffer(
VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, dst_buffer_size));

//===-------------------------------------------------------------------===/
// Set source buffer data
//===-------------------------------------------------------------------===/

auto generate_float_data = [](size_t i) {
return i == 550 ? float(i) : 1.0f;
};

BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer(
device, src_buffer.get(), src_buffer_size,
[&](void *ptr, size_t num_bytes) {
float *src_float_buffer = reinterpret_cast<float *>(ptr);
for (size_t i = 0; i < num_bytes / sizeof(float); i++) {
src_float_buffer[i] = generate_float_data(i);
}
}));

//===-------------------------------------------------------------------===/
// Dispatch
//===-------------------------------------------------------------------===/

std::vector<::uvkc::vulkan::Device::BoundBuffer> bound_buffers = {
{src_buffer.get(), /*set=*/0, /*binding=*/0},
{dst_buffer.get(), /*set=*/0, /*binding=*/1},
};
BM_CHECK_OK(device->AttachBufferToDescriptor(
*shader_module, layout_set_map,
{bound_buffers.data(), bound_buffers.size()}));

BM_CHECK_EQ(shader_module->descriptor_set_layouts().size(), 1)
<< "unexpected number of descriptor sets";
auto descriptor_set_layout = shader_module->descriptor_set_layouts().front();

std::vector<::uvkc::vulkan::CommandBuffer::BoundDescriptorSet>
bound_descriptor_sets(1);
bound_descriptor_sets[0].index = 0;
bound_descriptor_sets[0].set = layout_set_map.at(descriptor_set_layout);
BM_CHECK_OK_AND_ASSIGN(auto dispatch_cmdbuf, device->AllocateCommandBuffer());

BM_CHECK_OK(dispatch_cmdbuf->Begin());
dispatch_cmdbuf->BindPipelineAndDescriptorSets(
*pipeline, {bound_descriptor_sets.data(), bound_descriptor_sets.size()});
dispatch_cmdbuf->Dispatch(1, 1, 1);
BM_CHECK_OK(dispatch_cmdbuf->End());
BM_CHECK_OK(device->QueueSubmitAndWait(*dispatch_cmdbuf));

//===-------------------------------------------------------------------===/
// Verify destination buffer data
//===-------------------------------------------------------------------===/

BM_CHECK_OK(::uvkc::benchmark::GetDeviceBufferViaStagingBuffer(
device, dst_buffer.get(), dst_buffer_size,
[&](void *ptr, size_t num_bytes) {
int *dst_int_buffer = reinterpret_cast<int *>(ptr);
float max = -10000.f;
int idx = -1;
for (size_t i = 0; i < total_elements; i++) {
float data = generate_float_data(i);
if (data > max) {
idx = i;
max = data;
}
};
BM_CHECK_EQ(dst_int_buffer[0], idx)
<< "destination buffer element #0 has incorrect value: "
"expected to be "
<< idx << " but found " << dst_int_buffer[0];
}));

//===-------------------------------------------------------------------===/
// Benchmarking
//===-------------------------------------------------------------------===/

std::unique_ptr<::uvkc::vulkan::TimestampQueryPool> query_pool;
bool use_timestamp =
latency_measure->mode == LatencyMeasureMode::kGpuTimestamp;
if (use_timestamp) {
BM_CHECK_OK_AND_ASSIGN(query_pool, device->CreateTimestampQueryPool(2));
}

BM_CHECK_OK_AND_ASSIGN(auto cmdbuf, device->AllocateCommandBuffer());
for (auto _ : state) {
BM_CHECK_OK(cmdbuf->Begin());
if (use_timestamp) cmdbuf->ResetQueryPool(*query_pool);

cmdbuf->BindPipelineAndDescriptorSets(
*pipeline,
{bound_descriptor_sets.data(), bound_descriptor_sets.size()});

if (use_timestamp) {
cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0);
}

cmdbuf->Dispatch(1, 1, 1);

if (use_timestamp) {
cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
1);
}

BM_CHECK_OK(cmdbuf->End());

auto start_time = std::chrono::high_resolution_clock::now();
BM_CHECK_OK(device->QueueSubmitAndWait(*cmdbuf));
auto end_time = std::chrono::high_resolution_clock::now();
auto elapsed_seconds =
std::chrono::duration_cast<std::chrono::duration<double>>(end_time -
start_time);

switch (latency_measure->mode) {
case LatencyMeasureMode::kSystemDispatch: {
state.SetIterationTime(elapsed_seconds.count() -
latency_measure->overhead_seconds);
} break;
case LatencyMeasureMode::kSystemSubmit: {
state.SetIterationTime(elapsed_seconds.count());
} break;
case LatencyMeasureMode::kGpuTimestamp: {
BM_CHECK_OK_AND_ASSIGN(
double timestamp_seconds,
query_pool->CalculateElapsedSecondsBetween(0, 1));
state.SetIterationTime(timestamp_seconds);
} break;
}

BM_CHECK_OK(cmdbuf->Reset());
}

state.SetBytesProcessed(state.iterations() * src_buffer_size);
state.counters["FLOps"] =
::benchmark::Counter(total_elements,
::benchmark::Counter::kIsIterationInvariant |
::benchmark::Counter::kIsRate,
::benchmark::Counter::kIs1000);

// Reset the command pool to release all command buffers in the benchmarking
// loop to avoid draining GPU resources.
BM_CHECK_OK(device->ResetCommandPool());
}

namespace uvkc {
namespace benchmark {

absl::StatusOr<std::unique_ptr<VulkanContext>> CreateVulkanContext() {
return CreateDefaultVulkanContext(kBenchmarkName);
}

bool RegisterVulkanOverheadBenchmark(
const vulkan::Driver::PhysicalDeviceInfo &physical_device,
vulkan::Device *device, double *overhead_seconds) {
return false;
}

void RegisterVulkanBenchmarks(
const vulkan::Driver::PhysicalDeviceInfo &physical_device,
vulkan::Device *device, const LatencyMeasure *latency_measure) {
const char *gpu_name = physical_device.v10_properties.deviceName;

for (const auto &shader : kShaders) {
for (size_t total_elements : {1 << 10, 1 << 12, 1 << 14, 1 << 16}) {
std::string test_name = absl::StrCat(
gpu_name, "/#elements=", total_elements,
"/workgroup_size=", shader.workgroup_size, "/", shader.name);
::benchmark::RegisterBenchmark(test_name.c_str(), Argmax, device,
latency_measure, shader.code,
shader.code_num_bytes / sizeof(uint32_t),
total_elements, shader.workgroup_size)
->UseManualTime()
->Unit(::benchmark::kMicrosecond);
}
}
}

} // namespace benchmark
} // namespace uvkc
Loading

0 comments on commit 32f2f57

Please sign in to comment.