diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 43f6f90..50e0834 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+add_subdirectory(argmax)
 add_subdirectory(compute)
 add_subdirectory(convolution)
 add_subdirectory(matmul)
diff --git a/benchmarks/argmax/CMakeLists.txt b/benchmarks/argmax/CMakeLists.txt
new file mode 100644
index 0000000..e11e9eb
--- /dev/null
+++ b/benchmarks/argmax/CMakeLists.txt
@@ -0,0 +1,45 @@
+# Copyright 2024 Advanced Micro Devices Inc.
+# Copyright 2020-2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+uvkc_glsl_shader_instance(
+  NAME
+    one_workgroup_argmax_loop_shader  # Single-invocation loop variant.
+  SRC
+    "one_workgroup_argmax_loop.glsl"
+  GLSLC_ARGS
+    "--target-env=vulkan1.1"
+)
+
+uvkc_glsl_shader_instance(
+  NAME
+    one_workgroup_argmax_subgroup_shader  # Subgroup-reduction variant.
+  SRC
+    "one_workgroup_argmax_subgroup.glsl"
+  GLSLC_ARGS
+    "--target-env=vulkan1.1"  # Subgroup ops need a Vulkan 1.1 target.
+)
+
+uvkc_cc_binary(
+  NAME
+    one_workgroup_argmax  # Same spelling as kBenchmarkName and the README.
+  SRCS
+    "one_workgroup_argmax_main.cc"
+  DEPS
+    ::one_workgroup_argmax_loop_shader
+    ::one_workgroup_argmax_subgroup_shader
+    benchmark::benchmark
+    uvkc::benchmark::core
+    uvkc::benchmark::main
+)
diff --git a/benchmarks/argmax/README.md b/benchmarks/argmax/README.md
new file mode 100644
index 0000000..1be20c5
--- /dev/null
+++ b/benchmarks/argmax/README.md
@@ -0,0 +1,12 @@
+# Argmax Benchmarks
+
+This directory contains microbenchmarks for evaluating different strategies
+for implementing argmax.
+
+### `one_workgroup_argmax`
+
+Performs argmax using just one workgroup. The workgroup just contains one
+subgroup. This approach does not use any synchronization mechanisms.
+
+The subgroup either has a single invocation loop over all elements, or has all
+invocations cooperate through subgroup reduction operations.
\ No newline at end of file
diff --git a/benchmarks/argmax/one_workgroup_argmax_loop.glsl b/benchmarks/argmax/one_workgroup_argmax_loop.glsl
new file mode 100644
index 0000000..9b75d2e
--- /dev/null
+++ b/benchmarks/argmax/one_workgroup_argmax_loop.glsl
@@ -0,0 +1,30 @@
+# version 450 core
+#extension GL_EXT_control_flow_attributes : enable
+
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+
+layout(set=0, binding=0) buffer InputBuffer { float data[]; } Input;
+layout(set=0, binding=1) buffer OutputBuffer { uint data; } Output;
+
+layout(constant_id = 0) const uint totalCount = 1; // Total number of scalars
+
+// Each workgroup contains just one subgroup.
+
+void main() {
+  // Only invocation 0 does any work; every other lane falls straight through.
+  if (gl_LocalInvocationID.x == 0) {
+    // Running best candidate, seeded with element 0.
+    uint bestIndex = 0;
+    float bestValue = Input.data[0];
+
+    // Strict '>' keeps the lowest index when several elements tie.
+    for (uint pos = 1; pos < totalCount; pos++) {
+      float candidate = Input.data[pos];
+      if (candidate > bestValue) {
+        bestValue = candidate;
+        bestIndex = pos;
+      }
+    }
+    Output.data = bestIndex;
+  }
+}
diff --git a/benchmarks/argmax/one_workgroup_argmax_main.cc b/benchmarks/argmax/one_workgroup_argmax_main.cc
new file mode 100644
index 0000000..35cc378
--- /dev/null
+++ b/benchmarks/argmax/one_workgroup_argmax_main.cc
@@ -0,0 +1,282 @@
+// Copyright 2024 Advanced Micro Devices Inc.
+// Copyright 2020-2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//      https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <chrono>
+#include <memory>
+#include <numeric>
+
+#include "absl/strings/str_cat.h"
+#include "absl/types/span.h"
+#include "benchmark/benchmark.h"
+#include "uvkc/benchmark/data_type_util.h"
+#include "uvkc/benchmark/main.h"
+#include "uvkc/benchmark/status_util.h"
+#include "uvkc/benchmark/vulkan_buffer_util.h"
+#include "uvkc/benchmark/vulkan_context.h"
+#include "uvkc/vulkan/device.h"
+#include "uvkc/vulkan/pipeline.h"
+
+using ::uvkc::benchmark::LatencyMeasureMode;
+using ::uvkc::vulkan::Pipeline;
+
+static const char kBenchmarkName[] = "one_workgroup_argmax";
+
+static const uint32_t kLoopShader[] = {
+#include "one_workgroup_argmax_loop_shader_spirv_instance.inc"
+};
+
+static const uint32_t kSubgroupShader[] = {
+#include "one_workgroup_argmax_subgroup_shader_spirv_instance.inc"
+};
+
+struct ShaderCode {
+  const char *name;       // Variant label, last part of the benchmark name
+  const uint32_t *code;   // Pointer to the embedded SPIR-V words
+  size_t code_num_bytes;  // Size of |code| in bytes (not in words)
+  int workgroup_size;     // Invocations per workgroup, forwarded to Argmax()
+};
+
+static const ShaderCode kShaders[] = {  // 32 == local_size_x in both shaders
+    {"loop", kLoopShader, sizeof(kLoopShader), 32},
+    {"subgroup", kSubgroupShader, sizeof(kSubgroupShader), 32},
+};
+
+// Benchmarks one single-workgroup argmax shader over |total_elements| floats.
+//
+// Builds a pipeline from |code| (|code_num_words| SPIR-V words), fills the
+// input with 1.0 everywhere except index 550 (which holds 550.0), validates
+// the index written by one dispatch, then times repeated dispatches using the
+// mode selected by |latency_measure|. |workgroup_size| is passed as constant 1.
+static void Argmax(::benchmark::State &state, ::uvkc::vulkan::Device *device,
+                   const ::uvkc::benchmark::LatencyMeasure *latency_measure,
+                   const uint32_t *code, size_t code_num_words,
+                   size_t total_elements, int workgroup_size) {
+  //===-------------------------------------------------------------------===/
+  // Create shader module, pipeline, and descriptor sets
+  //===-------------------------------------------------------------------===/
+
+  BM_CHECK_OK_AND_ASSIGN(auto shader_module,
+                         device->CreateShaderModule(code, code_num_words));
+  BM_CHECK_OK_AND_ASSIGN(auto descriptor_pool,
+                         device->CreateDescriptorPool(*shader_module));
+  BM_CHECK_OK_AND_ASSIGN(auto layout_set_map,
+                         descriptor_pool->AllocateDescriptorSets(
+                             shader_module->descriptor_set_layouts()));
+
+  // Constant 0 feeds the shaders' totalCount. Constant 1 (workgroup size) is
+  // not declared by either shader in this directory, which hard-code 32.
+  Pipeline::SpecConstant spec_constants[] = {
+      {/*id=*/0, Pipeline::SpecConstant::Type::u32,
+       static_cast<int32_t>(total_elements)},
+      {/*id=*/1, Pipeline::SpecConstant::Type::u32, workgroup_size},
+  };
+  BM_CHECK_OK_AND_ASSIGN(
+      auto pipeline, device->CreatePipeline(*shader_module, "main",
+                                            absl::MakeSpan(spec_constants, 2)));
+
+  //===-------------------------------------------------------------------===/
+  // Create buffers
+  //===-------------------------------------------------------------------===/
+
+  const size_t src_buffer_size = total_elements * sizeof(float);
+  const size_t dst_buffer_size = sizeof(int);
+
+  BM_CHECK_OK_AND_ASSIGN(
+      auto src_buffer,
+      device->CreateBuffer(
+          VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, src_buffer_size));
+  BM_CHECK_OK_AND_ASSIGN(
+      auto dst_buffer,
+      device->CreateBuffer(
+          VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT |
+              VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
+          VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, dst_buffer_size));
+
+  //===-------------------------------------------------------------------===/
+  // Set source buffer data
+  //===-------------------------------------------------------------------===/
+
+  auto generate_float_data = [](size_t i) {
+    return i == 550 ? float(i) : 1.0f;
+  };
+
+  BM_CHECK_OK(::uvkc::benchmark::SetDeviceBufferViaStagingBuffer(
+      device, src_buffer.get(), src_buffer_size,
+      [&](void *ptr, size_t num_bytes) {
+        float *src_float_buffer = reinterpret_cast<float *>(ptr);
+        for (size_t i = 0; i < num_bytes / sizeof(float); i++) {
+          src_float_buffer[i] = generate_float_data(i);
+        }
+      }));
+
+  //===-------------------------------------------------------------------===/
+  // Dispatch
+  //===-------------------------------------------------------------------===/
+
+  std::vector<::uvkc::vulkan::Device::BoundBuffer> bound_buffers = {
+      {src_buffer.get(), /*set=*/0, /*binding=*/0},
+      {dst_buffer.get(), /*set=*/0, /*binding=*/1},
+  };
+  BM_CHECK_OK(device->AttachBufferToDescriptor(
+      *shader_module, layout_set_map,
+      {bound_buffers.data(), bound_buffers.size()}));
+
+  BM_CHECK_EQ(shader_module->descriptor_set_layouts().size(), 1)
+      << "unexpected number of descriptor sets";
+  auto descriptor_set_layout = shader_module->descriptor_set_layouts().front();
+
+  std::vector<::uvkc::vulkan::CommandBuffer::BoundDescriptorSet>
+      bound_descriptor_sets(1);
+  bound_descriptor_sets[0].index = 0;
+  bound_descriptor_sets[0].set = layout_set_map.at(descriptor_set_layout);
+  BM_CHECK_OK_AND_ASSIGN(auto dispatch_cmdbuf, device->AllocateCommandBuffer());
+
+  BM_CHECK_OK(dispatch_cmdbuf->Begin());
+  dispatch_cmdbuf->BindPipelineAndDescriptorSets(
+      *pipeline, {bound_descriptor_sets.data(), bound_descriptor_sets.size()});
+  dispatch_cmdbuf->Dispatch(1, 1, 1);
+  BM_CHECK_OK(dispatch_cmdbuf->End());
+  BM_CHECK_OK(device->QueueSubmitAndWait(*dispatch_cmdbuf));
+
+  //===-------------------------------------------------------------------===/
+  // Verify destination buffer data
+  //===-------------------------------------------------------------------===/
+
+  BM_CHECK_OK(::uvkc::benchmark::GetDeviceBufferViaStagingBuffer(
+      device, dst_buffer.get(), dst_buffer_size,
+      [&](void *ptr, size_t num_bytes) {
+        int *dst_int_buffer = reinterpret_cast<int *>(ptr);
+        float max = -10000.f;
+        int idx = -1;
+        for (size_t i = 0; i < total_elements; i++) {
+          float data = generate_float_data(i);
+          if (data > max) {
+            idx = static_cast<int>(i);
+            max = data;
+          }
+        }
+        BM_CHECK_EQ(dst_int_buffer[0], idx)
+            << "destination buffer element #0 has incorrect value: "
+               "expected to be "
+            << idx << " but found " << dst_int_buffer[0];
+      }));
+
+  //===-------------------------------------------------------------------===/
+  // Benchmarking
+  //===-------------------------------------------------------------------===/
+
+  std::unique_ptr<::uvkc::vulkan::TimestampQueryPool> query_pool;
+  bool use_timestamp =
+      latency_measure->mode == LatencyMeasureMode::kGpuTimestamp;
+  if (use_timestamp) {
+    BM_CHECK_OK_AND_ASSIGN(query_pool, device->CreateTimestampQueryPool(2));
+  }
+
+  BM_CHECK_OK_AND_ASSIGN(auto cmdbuf, device->AllocateCommandBuffer());
+  for (auto _ : state) {
+    BM_CHECK_OK(cmdbuf->Begin());
+    if (use_timestamp) cmdbuf->ResetQueryPool(*query_pool);
+
+    cmdbuf->BindPipelineAndDescriptorSets(
+        *pipeline,
+        {bound_descriptor_sets.data(), bound_descriptor_sets.size()});
+
+    if (use_timestamp) {
+      cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT, 0);
+    }
+
+    cmdbuf->Dispatch(1, 1, 1);
+
+    if (use_timestamp) {
+      cmdbuf->WriteTimestamp(*query_pool, VK_PIPELINE_STAGE_BOTTOM_OF_PIPE_BIT,
+                             1);
+    }
+
+    BM_CHECK_OK(cmdbuf->End());
+
+    auto start_time = std::chrono::high_resolution_clock::now();
+    BM_CHECK_OK(device->QueueSubmitAndWait(*cmdbuf));
+    auto end_time = std::chrono::high_resolution_clock::now();
+    auto elapsed_seconds =
+        std::chrono::duration_cast<std::chrono::duration<double>>(end_time -
+                                                                  start_time);
+
+    switch (latency_measure->mode) {
+      case LatencyMeasureMode::kSystemDispatch: {
+        state.SetIterationTime(elapsed_seconds.count() -
+                               latency_measure->overhead_seconds);
+      } break;
+      case LatencyMeasureMode::kSystemSubmit: {
+        state.SetIterationTime(elapsed_seconds.count());
+      } break;
+      case LatencyMeasureMode::kGpuTimestamp: {
+        BM_CHECK_OK_AND_ASSIGN(
+            double timestamp_seconds,
+            query_pool->CalculateElapsedSecondsBetween(0, 1));
+        state.SetIterationTime(timestamp_seconds);
+      } break;
+    }
+
+    BM_CHECK_OK(cmdbuf->Reset());
+  }
+
+  state.SetBytesProcessed(state.iterations() * src_buffer_size);
+  state.counters["FLOps"] =
+      ::benchmark::Counter(total_elements,
+                           ::benchmark::Counter::kIsIterationInvariant |
+                               ::benchmark::Counter::kIsRate,
+                           ::benchmark::Counter::kIs1000);
+
+  // Reset the command pool to release all command buffers in the benchmarking
+  // loop to avoid draining GPU resources.
+  BM_CHECK_OK(device->ResetCommandPool());
+}
+
+namespace uvkc {
+namespace benchmark {
+// Builds the default Vulkan context, passing this benchmark's name along.
+absl::StatusOr<std::unique_ptr<VulkanContext>> CreateVulkanContext() {
+  return CreateDefaultVulkanContext(kBenchmarkName);
+}
+
+// Always returns false; presumably "no overhead benchmark" -- TODO confirm.
+bool RegisterVulkanOverheadBenchmark(
+    const vulkan::Driver::PhysicalDeviceInfo &physical_device,
+    vulkan::Device *device, double *overhead_seconds) {
+  return false;
+}
+
+// Registers one manual-time Argmax run per shader variant and element count.
+void RegisterVulkanBenchmarks(
+    const vulkan::Driver::PhysicalDeviceInfo &physical_device,
+    vulkan::Device *device, const LatencyMeasure *latency_measure) {
+  const char *device_name = physical_device.v10_properties.deviceName;
+  for (const auto &variant : kShaders) {
+    const size_t num_words = variant.code_num_bytes / sizeof(uint32_t);
+    for (size_t count : {1 << 10, 1 << 12, 1 << 14, 1 << 16}) {
+      const std::string label = absl::StrCat(
+          device_name, "/#elements=", count,
+          "/workgroup_size=", variant.workgroup_size, "/", variant.name);
+      auto *bench = ::benchmark::RegisterBenchmark(
+          label.c_str(), Argmax, device, latency_measure, variant.code,
+          num_words, count, variant.workgroup_size);
+      bench->UseManualTime()->Unit(::benchmark::kMicrosecond);
+    }
+  }
+}
+
+}  // namespace benchmark
+}  // namespace uvkc
diff --git a/benchmarks/argmax/one_workgroup_argmax_subgroup.glsl b/benchmarks/argmax/one_workgroup_argmax_subgroup.glsl
new file mode 100644
index 0000000..b751821
--- /dev/null
+++ b/benchmarks/argmax/one_workgroup_argmax_subgroup.glsl
@@ -0,0 +1,44 @@
+# version 450 core
+#extension GL_EXT_control_flow_attributes : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#extension GL_KHR_shader_subgroup_ballot : enable
+
+layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+
+layout(set=0, binding=0) buffer InputBuffer { float data[]; } Input;
+layout(set=0, binding=1) buffer OutputBuffer { uint data; } Output;
+
+layout(constant_id = 0) const uint totalCount = 1; // Total number of scalars
+
+// Each workgroup contains just one subgroup.
+
+void main() {
+  uint laneID = gl_LocalInvocationID.x;
+  uint laneCount = gl_WorkGroupSize.x;
+  // Seed with this lane's first element; its index is laneID, not 0.
+  uint laneResult = laneID;
+  float laneMax = Input.data[laneID];
+  // Assumes totalCount is a non-zero multiple of laneCount; a tail is skipped.
+  uint numBatches = totalCount / laneCount;
+
+  for (uint i = 1; i < numBatches; ++i) {
+    uint idx = laneCount * i + laneID;
+    float elem = Input.data[idx];
+    if (elem > laneMax) {
+      laneResult = idx;
+      laneMax = elem;
+    }
+  }
+
+  // Find the max of workgroup (containing only one subgroup).
+  float wgMax = subgroupMax(laneMax);
+
+  // Find the smallest thread ID with the max element.
+  bool bit = laneMax == wgMax;
+  uvec4 mask = subgroupBallot(bit);
+  uint smallestID = subgroupBallotFindLSB(mask);
+
+  // The thread is responsible for outputting the result.
+  if (laneID == smallestID)
+    Output.data = laneResult;
+}