diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000000..ec48d0d2bb
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,14 @@
+
+version: 2
+updates:
+  # Version updates for the GitHub Actions referenced by this repository's workflows.
+  - package-ecosystem: 'github-actions'
+    directory: '/'
+    schedule:
+      interval: 'monthly'
+    groups:
+      github-actions:  # single group whose pattern matches every action
+        patterns:
+          - '*'
+    reviewers:
+      - 'codeplaysoftware/security-managers'
diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml
new file mode 100644
index 0000000000..84d38c1e57
--- /dev/null
+++ b/.github/workflows/coverity.yml
@@ -0,0 +1,53 @@
+name: Coverity Scan
+
+# Scan only code that has landed on sycl-develop. Pull requests are not a trigger:
+# GitHub does not pass the Coverity secrets to workflows run for PRs from forks.
+on:
+  push:
+    branches:
+    - sycl-develop
+
+permissions: read-all
+
+jobs:
+  coverity:
+    runs-on: ubuntu-latest
+    container: nvidia/cuda:12.4.1-devel-ubuntu22.04
+    steps:
+      - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6
+      - name: Configure image  # packages used to fetch DPC++ and to configure the build
+        run: |
+          apt update && apt install -y cmake curl git jq libstdc++-12-dev \
+            ninja-build python3 wget
+      - name: Download DPCPP  # the DPCPP_VERSION release if set, otherwise the newest nightly tag
+        shell: bash
+        run: |
+          cd /usr/local
+          echo "Will use DPCPP ${DPCPP_VERSION:-latest}."
+          if [[ "${DPCPP_VERSION}" != "" ]]; then
+            echo "Downloading DPCPP from https://github.com/intel/llvm/releases/download/$DPCPP_VERSION/sycl_linux.tar.gz"
+            wget -q https://github.com/intel/llvm/releases/download/$DPCPP_VERSION/sycl_linux.tar.gz
+          else
+            latest=$(curl -sS https://api.github.com/repos/intel/llvm/releases | jq -r '[.[].tag_name|select(match("nightly-[0-9]{4}-[0-9]{2}-[0-9]{2}"))][0]')
+            echo "Downloading DPCPP from https://github.com/intel/llvm/releases/download/${latest}/sycl_linux.tar.gz"
+            wget -q https://github.com/intel/llvm/releases/download/${latest}/sycl_linux.tar.gz
+          fi
+          tar -xf sycl_linux.tar.gz
+      - name: Configure CMake  # in-source configure; the scan step below builds the same directory
+        run: |
+          export PATH=/usr/local/bin/:$PATH
+          export C_INCLUDE_PATH=/usr/local/include/:$C_INCLUDE_PATH
+          export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH
+          export CC=clang
+          export CXX=clang++
+          cmake -G Ninja -S . -B .  \
+            -DCMAKE_CUDA_HOST_COMPILER=clang++  \
+            -DCUTLASS_ENABLE_SYCL=ON  \
+            -DDPCPP_SYCL_TARGET=nvptx64-nvidia-cuda  \
+            -DDPCPP_SYCL_ARCH=sm_80
+      - uses: vapier/coverity-scan-action@2068473c7bdf8c2fb984a6a40ae76ee7facd7a85 # v1.8.0
+        with:
+          email: ${{ secrets.COVERITY_SCAN_EMAIL }}
+          token: ${{ secrets.COVERITY_SCAN_TOKEN }}
+          command: cmake --build .
+          working-directory: ''
diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml
new file mode 100644
index 0000000000..e742840395
--- /dev/null
+++ b/.github/workflows/scorecard.yml
@@ -0,0 +1,53 @@
+# OpenSSF Scorecard analysis of this repository, run as a GitHub Action.
+
+name: 'Scorecard supply-chain security'
+on:
+  # Re-run when branch protection changes (Branch-Protection check, default branch only):
+  # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection
+  branch_protection_rule:
+  push:
+    branches: ['sycl-develop']
+  schedule:
+    - cron: "18 16 * * 3"  # Wednesdays at 16:18 UTC
+
+# Everything is read-only unless a job asks for more.
+permissions: read-all
+
+jobs:
+  analysis:
+    name: 'Scorecard analysis'
+    runs-on: 'ubuntu-latest'
+    permissions:
+      # Lets the job publish results and obtain a badge (publish_results below).
+      id-token: write
+      # Lets the job upload results to the code-scanning dashboard.
+      security-events: write
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1
+        with:
+          persist-credentials: false
+
+      - name: Run analysis
+        uses: ossf/scorecard-action@dc50aa9510b46c811795eb24b2f1ba02a914e534 # v2.3.3
+        with:
+          publish_results: true
+          results_format: sarif
+          results_file: results.sarif
+
+      # Optional: keep the SARIF report as a run artifact, visible from the repository
+      # Actions tab. Comment this step out to stop uploading it.
+      - name: Upload artifact
+        uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3
+        with:
+          name: 'SARIF file'
+          path: results.sarif
+          retention-days: 5
+
+      # Optional: send the report to GitHub's code scanning dashboard.
+      # Comment this step out to stop uploading it there.
+      - name: Upload to code-scanning
+        uses: github/codeql-action/upload-sarif@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9
+        with:
+          sarif_file: results.sarif
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 91408ddaa9..46b95fbaa6 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -159,10 +159,7 @@ set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable CUT
 set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable CUTLASS Profiler")
 set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Performance")
 option(CUTLASS_ENABLE_DEBUG_PRINTS "Whether or not to enable debug prints in CUTLASS kernels" OFF)
-
-if (CUTLASS_ENABLE_SYCL)
-  set(CUTLASS_ENABLE_BENCHMARKS ON CACHE BOOL "Enable CUTLASS Benchmarks")
-endif()
+set(CUTLASS_ENABLE_BENCHMARKS ON CACHE BOOL "Enable CUTLASS Benchmarks")
 
 set(CUTLASS_ENABLE_TESTS ${CUTLASS_ENABLE_TESTS_INIT} CACHE BOOL "Enable CUTLASS Tests")
 set(CUTLASS_ENABLE_GTEST_UNIT_TESTS ${CUTLASS_ENABLE_TESTS} CACHE BOOL "Enable CUTLASS GTest-based Unit Tests")
diff --git a/README-sycl.md b/README-sycl.md
index 3b412fb954..3d59a7bdc8 100644
--- a/README-sycl.md
+++ b/README-sycl.md
@@ -1,5 +1,7 @@
 # SYCL support for CUTLASS
 
+[![OpenSSF Scorecard](https://api.scorecard.dev/projects/github.com/codeplaysoftware/cutlass-fork/badge)](https://scorecard.dev/viewer/?uri=github.com/codeplaysoftware/cutlass-fork)
+
 This repository contains a development version of the CUTLASS repository
 with experimental SYCL support enabled. The aim is to
 support other SYCL-enabled devices with the minimal source code modifications by using the same CUTLASS features.
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 9e127e3e34..e4ec52b63f 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -43,28 +43,32 @@ function(cutlass_benchmark_add_executable NAME)
 
   add_dependencies(cutlass_benchmarks ${NAME})
 
+  if (NOT CUTLASS_ENABLE_SYCL)
+    SET(ADD_CUDA ON)
+  endif()
+
   target_link_libraries(
     ${NAME}
     PRIVATE
     CUTLASS
     cutlass_tools_util_includes
-    )
-
-  target_include_directories(
-    ${NAME}
-    PRIVATE
-    ${CUTLASS_BENCHMARKS_COMMON_SOURCE_DIR}
-    )
+    $<$<BOOL:${CUTLASS_ENABLE_CUBLAS}>:nvidia::cublas>
+    $<$<BOOL:${ADD_CUDA}>:cuda>
+  )
 
-  add_sycl_to_target(TARGET ${NAME})
+  if (CUTLASS_ENABLE_SYCL)
+    add_sycl_to_target(TARGET ${NAME})
+  endif()
 
   install(
     TARGETS ${NAME}
     RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
-    )
+  )
 endfunction()
 
-foreach(BENCH
-)
-  add_subdirectory(${BENCH})
-endforeach()
+if(SYCL_INTEL_TARGET)
+  add_subdirectory(pvc)
+endif()
+if(SYCL_NVIDIA_TARGET OR NOT CUTLASS_ENABLE_SYCL)
+  add_subdirectory(ampere)
+endif()
diff --git a/benchmarks/ampere/CMakeLists.txt b/benchmarks/ampere/CMakeLists.txt
new file mode 100644
index 0000000000..a77901594b
--- /dev/null
+++ b/benchmarks/ampere/CMakeLists.txt
@@ -0,0 +1,38 @@
+# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Ampere GEMM benchmarks: one executable for fp16 inputs and one for bf16 inputs.
+cutlass_benchmark_add_executable(
+  bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32
+  bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu
+)
+
+cutlass_benchmark_add_executable(
+  bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32
+  bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu
+)
diff --git a/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu
new file mode 100644
index 0000000000..8dad127417
--- /dev/null
+++ b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu
@@ -0,0 +1,153 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "../common/benchmark_runner.hpp"
+#include "gemm_configuration.hpp"
+
+int main(int argc, const char** argv)
+{
+  // Benchmark driver: bf16 x bf16 inputs, fp32 accumulation and output, SM80 tensor-op MMA.
+  // Step 1 - parse options. A help request prints usage and returns 0; options.error returns -1.
+  //
+
+  Options options;
+
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+  //
+  // Step 2 - describe the GEMM kernel as a chain of type aliases and hand it to BenchmarkRunner.
+  //
+
+  // KernelHardwareInfo carries the device ID and the multiprocessor (SM) count queried for it.
+  // The struct is passed to runner.run() below.
+  cutlass::KernelHardwareInfo hw_info;
+
+  // device_id is not assigned here, so the struct's default is used. Assign hw_info.device_id
+  // before this query to benchmark a different GPU on a multi-GPU machine.
+  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+  // Element types: bf16 inputs A and B, fp32 accumulator, fp32 epilogue math, fp32 output.
+  // All four matrices (A, B, C, D) use the column-major layout tag.
+  using ElementAccumulator = float;                   // <- data type of accumulator
+  using ElementComputeEpilogue = float;  // <- data type of epilogue operations
+  using ElementInputA = bfloat16_t;                        // <- data type of elements in input matrix A
+  using ElementInputB = bfloat16_t;                        // <- data type of elements in input matrix B
+  using ElementOutput = float;                        // <- data type of elements in output matrix D
+
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using LayoutD = cutlass::layout::ColumnMajor;
+
+  using TileShape = Shape<_128, _128, _32>;
+
+  using TiledMma = TiledMMA<
+          MMA_Atom<SM80_16x8x16_F32BF16BF16F32_TN>,
+          Layout<Shape<_2,_2,_1>>, // 2x2x1 thread group
+          Tile<_32,_32,_16>>;                           // 32x32x16 MMA for LDSM, 1x2x1 value group
+
+  static constexpr int kAlignmentA = 8;
+  using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA<
+          ElementInputA, LayoutA, kAlignmentA, 32>;
+  using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K
+  using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom;
+  using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy;
+
+  static constexpr int kAlignmentB = 8;
+  using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB<
+          ElementInputB, LayoutB, kAlignmentB, 32>;
+  using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K
+  using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom;
+  using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy;
+
+  using Stages = Int<3>;
+
+  // Epilogue: linear combination whose alpha/beta use ElementComputeEpilogue
+  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
+          ElementOutput,                                     // <- data type of output matrix
+          128 / cutlass::sizeof_bits<ElementOutput>::value,  // <- the number of elements per vectorized
+          // memory access, i.e. how many outputs fit in
+          // 128 bits: 4 for the 32-bit float used here.
+          // This becomes the vector width of math instructions in the epilogue too
+          ElementAccumulator,                                // <- data type of accumulator
+          ElementComputeEpilogue>;  // <- data type for alpha/beta in linear combination function
+
+  using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync<Stages{}>;
+
+  // Stride types derived from the layout tags; D reuses the C-stride mapping.
+  using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
+  using StrideB = cutlass::detail::TagToStrideB_t<LayoutB>;
+  using StrideC = cutlass::detail::TagToStrideC_t<LayoutC>;
+  using StrideD = cutlass::detail::TagToStrideC_t<LayoutD>;
+
+  using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
+          StrideC,
+          StrideD,
+          EpilogueOp,
+          cutlass::gemm::EpilogueDefault>;
+
+  // Mainloop collective, configured by the MainloopSm80CpAsync dispatch policy above
+  using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
+          DispatchPolicy,
+          TileShape,
+          ElementInputA,
+          StrideA,
+          ElementInputB,
+          StrideB,
+          TiledMma,
+          GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity,  // A
+          GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity   // B
+  >;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+          Shape<int, int, int, int>,
+          CollectiveMainloop,
+          CollectiveEpilogue
+  >;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  BenchmarkRunner<Gemm> runner;
+
+  runner.run(options, hw_info);
+
+  return 0;
+}
diff --git a/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu b/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu
new file mode 100644
index 0000000000..69bc482f12
--- /dev/null
+++ b/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu
@@ -0,0 +1,153 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "../common/benchmark_runner.hpp"
+#include "gemm_configuration.hpp"
+
+int main(int argc, const char** argv)
+{
+  // Benchmark driver: fp16 x fp16 inputs, fp32 accumulation and output, SM80 tensor-op MMA.
+  // Step 1 - parse options. A help request prints usage and returns 0; options.error returns -1.
+  //
+
+  Options options;
+
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+  //
+  // Step 2 - describe the GEMM kernel as a chain of type aliases and hand it to BenchmarkRunner.
+  //
+
+  // KernelHardwareInfo carries the device ID and the multiprocessor (SM) count queried for it.
+  // The struct is passed to runner.run() below.
+  cutlass::KernelHardwareInfo hw_info;
+
+  // device_id is not assigned here, so the struct's default is used. Assign hw_info.device_id
+  // before this query to benchmark a different GPU on a multi-GPU machine.
+  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+  // Element types: fp16 inputs A and B, fp32 accumulator, fp32 epilogue math, fp32 output.
+  // All four matrices (A, B, C, D) use the column-major layout tag.
+  using ElementAccumulator = float;                   // <- data type of accumulator
+  using ElementComputeEpilogue = float;  // <- data type of epilogue operations
+  using ElementInputA = half_t;                        // <- data type of elements in input matrix A
+  using ElementInputB = half_t;                        // <- data type of elements in input matrix B
+  using ElementOutput = float;                        // <- data type of elements in output matrix D
+
+  using LayoutA = cutlass::layout::ColumnMajor;
+  using LayoutB = cutlass::layout::ColumnMajor;
+  using LayoutC = cutlass::layout::ColumnMajor;
+  using LayoutD = cutlass::layout::ColumnMajor;
+
+  using TileShape = Shape<_128, _128, _32>;
+
+  using TiledMma = TiledMMA<
+          MMA_Atom<SM80_16x8x16_F32F16F16F32_TN>,
+          Layout<Shape<_2,_2,_1>>, // 2x2x1 thread group
+          Tile<_32,_32,_16>>;                           // 32x32x16 MMA for LDSM, 1x2x1 value group
+
+  static constexpr int kAlignmentA = 8;
+  using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA<
+          ElementInputA, LayoutA, kAlignmentA, 32>;
+  using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K
+  using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom;
+  using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy;
+
+  static constexpr int kAlignmentB = 8;
+  using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB<
+          ElementInputB, LayoutB, kAlignmentB, 32>;
+  using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K
+  using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom;
+  using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy;
+
+  using Stages = Int<3>;
+
+  // Epilogue: linear combination whose alpha/beta use ElementComputeEpilogue
+  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
+          ElementOutput,                                     // <- data type of output matrix
+          128 / cutlass::sizeof_bits<ElementOutput>::value,  // <- the number of elements per vectorized
+          // memory access, i.e. how many outputs fit in
+          // 128 bits: 4 for the 32-bit float used here.
+          // This becomes the vector width of math instructions in the epilogue too
+          ElementAccumulator,                                // <- data type of accumulator
+          ElementComputeEpilogue>;  // <- data type for alpha/beta in linear combination function
+
+  using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync<Stages{}>;
+
+  // Stride types derived from the layout tags; D reuses the C-stride mapping.
+  using StrideA = cutlass::detail::TagToStrideA_t<LayoutA>;
+  using StrideB = cutlass::detail::TagToStrideB_t<LayoutB>;
+  using StrideC = cutlass::detail::TagToStrideC_t<LayoutC>;
+  using StrideD = cutlass::detail::TagToStrideC_t<LayoutD>;
+
+  using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
+          StrideC,
+          StrideD,
+          EpilogueOp,
+          cutlass::gemm::EpilogueDefault>;
+
+  // Mainloop collective, configured by the MainloopSm80CpAsync dispatch policy above
+  using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
+          DispatchPolicy,
+          TileShape,
+          ElementInputA,
+          StrideA,
+          ElementInputB,
+          StrideB,
+          TiledMma,
+          GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity,  // A
+          GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity   // B
+  >;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+          Shape<int, int, int, int>,
+          CollectiveMainloop,
+          CollectiveEpilogue
+  >;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  BenchmarkRunner<Gemm> runner;
+
+  runner.run(options, hw_info);
+
+  return 0;
+}
diff --git a/benchmarks/ampere/gemm_configuration.hpp b/benchmarks/ampere/gemm_configuration.hpp
new file mode 100644
index 0000000000..484786567f
--- /dev/null
+++ b/benchmarks/ampere/gemm_configuration.hpp
@@ -0,0 +1,197 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "cutlass/half.h"
+#include "cutlass/layout/layout.h"
+
+#include "cute/swizzle.hpp"
+#include "cute/layout.hpp"
+#include "cute/arch/copy_sm75.hpp"
+#include "cute/arch/copy_sm80.hpp"
+#include "cute/atom/copy_atom.hpp"
+
+using namespace cute;
+
+template <typename Element, typename Layout, int Alignment, int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandA;
+// Both primary templates are declared only; the usable combinations are the specializations below.
+template <typename Element, typename Layout, int Alignment, int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandB;
+
+/////////////////////////////////////////////////////////////////////////
+
+// half_t operands. Only the (layout, alignment, SizeK) combinations specialized here are defined.
+
+/// Operand A - Row-major (K-Major), alignment 8, SizeK = 64
+template <>
+struct DefaultGemm_TensorOpSm80_OperandA<cutlass::half_t, cutlass::layout::RowMajor, 8, 64>
+{
+    // Shared memory: 8x64 layout atom with row stride 64, composed with Swizzle<3,3,3>
+    using SmemLayoutAtom = decltype(
+    composition(Swizzle<3,3,3>{},
+                Layout<Shape < _8,_64>,
+                        Stride<_64, _1>>{}));
+    using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, half_t>;
+
+    // Global memory: tiled copy of 128-bit cp.async atoms; 16x8 thread layout, 1x8 values each
+    using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, half_t>{},
+                    Layout<Shape <_16,_8>,
+                            Stride< _8,_1>>{},
+                    Layout<Shape < _1,_8>>{}));
+};
+
+/// Operand A - Column-major (M-major), alignment 8, any SizeK (SizeK is not used by the body)
+template <int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::ColumnMajor, 8, SizeK>
+{
+    // Shared memory: 64x8 layout atom with column stride 64, composed with Swizzle<3,3,3>
+    using SmemLayoutAtom = decltype(
+    composition(Swizzle<3,3,3>{},
+                Layout<Shape <_64, _8>,
+                        Stride< _1,_64>>{}));
+    using SmemCopyAtom = Copy_Atom<SM75_U16x8_LDSM_T, half_t>;
+
+    // Global memory: tiled copy of 128-bit cp.async atoms; 16x8 thread layout, 8x1 values each
+    using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, half_t>{},
+                    Layout<Shape <_16, _8>,
+                            Stride< _1,_16>>{},
+                    Layout<Shape < _8, _1>>{}));
+};
+
+/// Operand A - Row-major (K-Major), alignment 8, SizeK = 32
+template <>
+struct DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, 8, 32>
+{
+    // Shared memory: 8x32 layout atom with row stride 32, composed with Swizzle<2,3,3>
+    using SmemLayoutAtom = decltype(
+    composition(Swizzle<2,3,3>{},
+                Layout<Shape < _8,_32>,
+                        Stride<_32, _1>>{}));
+    using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, half_t>;
+
+    // Global memory: tiled copy of 128-bit cp.async atoms; 32x4 thread layout, 1x8 values each
+    using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, half_t>{},
+                    Layout<Shape <_32,_4>,
+                            Stride< _4,_1>>{},
+                    Layout<Shape < _1,_8>>{}));
+};
+
+// Because the F32F16 TiledMMA is A-B symmetric, operand B reuses the operand-A definitions above.
+
+// Operand B - Column-Major (K-major): inherits operand A row-major; that base must be specialized above
+template <int Alignment, int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandB<half_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
+        : DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::RowMajor, Alignment, SizeK>
+{};
+
+// Operand B - Row-Major (N-major): inherits operand A column-major, which is specialized for alignment 8 only
+template <int Alignment, int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandB<half_t, cutlass::layout::RowMajor, Alignment, SizeK>
+        : DefaultGemm_TensorOpSm80_OperandA<half_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
+{};
+
+/////////////////////////////////////////////////////////////////////////
+
+// bfloat16_t operands. Only the (layout, alignment, SizeK) combinations specialized here are defined.
+
+/// Operand A - Row-major (K-Major), alignment 8, SizeK = 64
+template <>
+struct DefaultGemm_TensorOpSm80_OperandA<cutlass::bfloat16_t, cutlass::layout::RowMajor, 8, 64>
+{
+    // Shared memory: 8x64 layout atom with row stride 64, composed with Swizzle<3,3,3>
+    using SmemLayoutAtom = decltype(
+    composition(Swizzle<3,3,3>{},
+                Layout<Shape < _8,_64>,
+                        Stride<_64, _1>>{}));
+    using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, bfloat16_t>;
+
+    // Global memory: tiled copy of 128-bit cp.async atoms; 16x8 thread layout, 1x8 values each
+    using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, bfloat16_t>{},
+                    Layout<Shape <_16,_8>,
+                            Stride< _8,_1>>{},
+                    Layout<Shape < _1,_8>>{}));
+};
+
+/// Operand A - Column-major (M-major), alignment 8, any SizeK (SizeK is not used by the body)
+template <int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::ColumnMajor, 8, SizeK>
+{
+    // Shared memory: 64x8 layout atom with column stride 64, composed with Swizzle<3,3,3>
+    using SmemLayoutAtom = decltype(
+    composition(Swizzle<3,3,3>{},
+                Layout<Shape <_64, _8>,
+                        Stride< _1,_64>>{}));
+    using SmemCopyAtom = Copy_Atom<SM75_U16x8_LDSM_T, bfloat16_t>;
+
+    // Global memory: tiled copy of 128-bit cp.async atoms; 16x8 thread layout, 8x1 values each
+    using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, bfloat16_t>{},
+                    Layout<Shape <_16, _8>,
+                            Stride< _1,_16>>{},
+                    Layout<Shape < _8, _1>>{}));
+};
+
+/// Operand A - Row-major (K-Major), alignment 8, SizeK = 32
+template <>
+struct DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::RowMajor, 8, 32>
+{
+    // Shared memory: 8x32 layout atom with row stride 32, composed with Swizzle<2,3,3>
+    using SmemLayoutAtom = decltype(
+    composition(Swizzle<2,3,3>{},
+                Layout<Shape < _8,_32>,
+                        Stride<_32, _1>>{}));
+    using SmemCopyAtom = Copy_Atom<SM75_U32x4_LDSM_N, bfloat16_t>;
+
+    // Global memory: tiled copy of 128-bit cp.async atoms; 32x4 thread layout, 1x8 values each
+    using GmemTiledCopy = decltype(
+    make_tiled_copy(Copy_Atom<SM80_CP_ASYNC_CACHEGLOBAL<cute::uint128_t>, bfloat16_t>{},
+                    Layout<Shape <_32,_4>,
+                            Stride< _4,_1>>{},
+                    Layout<Shape < _1,_8>>{}));
+};
+
+// Because the F32BF16 TiledMMA is A-B symmetric, operand B reuses the operand-A definitions above.
+
+// Operand B - Column-Major (K-major): inherits operand A row-major; that base must be specialized above
+template <int Alignment, int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandB<bfloat16_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
+        : DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::RowMajor, Alignment, SizeK>
+{};
+
+// Operand B - Row-Major (N-major): inherits operand A column-major, which is specialized for alignment 8 only
+template <int Alignment, int SizeK>
+struct DefaultGemm_TensorOpSm80_OperandB<bfloat16_t, cutlass::layout::RowMajor, Alignment, SizeK>
+        : DefaultGemm_TensorOpSm80_OperandA<bfloat16_t, cutlass::layout::ColumnMajor, Alignment, SizeK>
+{};
diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp
new file mode 100644
index 0000000000..5eb2ade3eb
--- /dev/null
+++ b/benchmarks/common/benchmark_runner.hpp
@@ -0,0 +1,388 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "cutlass/gemm/device/gemm.h"
+#include "cutlass/epilogue/collective/default_epilogue.hpp"
+#include "cutlass/gemm/device/gemm_universal.h"
+#include "cutlass/gemm/device/gemm_universal_adapter.h"
+#include "cutlass/gemm/collective/collective_mma.hpp"
+#include "cutlass/util/GPU_Clock.hpp"
+
+#include "cutlass/util/host_tensor.h"
+#include "cutlass/util/reference/host/tensor_compare.h"
+#include "cutlass/util/reference/host/tensor_copy.h"
+#include "cutlass/util/reference/host/tensor_fill.h"
+#include "cute/tensor.hpp"
+
+#include "cutlass/util/command_line.h"
+#include "cutlass/util/device_memory.h"
+#include "cutlass/util/packed_stride.hpp"
+#include "cutlass/util/reference/device/gemm_complex.h"
+#include "cutlass/util/reference/device/tensor_compare.h"
+#include "cutlass/util/print_error.hpp"
+
+// Overwrites every element of M with a rand()-derived value in [-1, 1], converted to T.
+template <typename T>
+static void fill_matrix(std::vector<T> &M) {
+  auto next_value = [] { return static_cast<T>(2 * (rand() / double(RAND_MAX)) - 1); };
+  std::generate(M.begin(), M.end(), next_value);
+}
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Command line options shared by the GEMM benchmarks: problem extents, epilogue scalars, iterations
+struct Options {
+
+    bool help;   // set by parse() when --help is passed
+    bool error;  // initialized to false; parse() never changes it
+
+    int m, n, k, l, iterations;  // GEMM extents M, N, K, batch count L, number of timed iterations
+    float alpha, beta;           // epilogue scalars
+
+    Options():
+            help(false),
+            error(false),
+            m(4096), n(4096), k(4096), l(1), iterations(100),
+            alpha(1.f), beta(0.f)
+    { }
+
+    // Parses the command line. With --help only `help` is set and parsing stops; otherwise
+    // every option is read, falling back to the default passed to get_cmd_line_argument.
+    void parse(int argc, char const **args) {
+      cutlass::CommandLine cmd(argc, args);
+      if (cmd.check_cmd_line_flag("help")) {
+        help = true;
+        return;
+      }
+
+      cmd.get_cmd_line_argument("m", m, 4096);
+      cmd.get_cmd_line_argument("n", n, 4096);
+      cmd.get_cmd_line_argument("k", k, 4096);
+      cmd.get_cmd_line_argument("l", l, 1);
+      cmd.get_cmd_line_argument("alpha", alpha, 1.f);
+      cmd.get_cmd_line_argument("beta", beta, 0.f);
+      cmd.get_cmd_line_argument("iterations", iterations, 100);
+    }
+
+    /// Prints the usage statement to `out` and returns it.
+    std::ostream & print_usage(std::ostream &out) const {
+
+      out << "PVC GEMM Benchmark\n\n"
+          << "Options:\n\n"
+          << "  --help                      If specified, displays this usage statement\n\n"
+          << "  --m=<int>                   Sets the M extent of the GEMM\n"
+          << "  --n=<int>                   Sets the N extent of the GEMM\n"
+          << "  --k=<int>                   Sets the K extent of the GEMM\n"
+          << "  --l=<int>                   Sets the L extent (batch count) of the GEMM\n"
+          << "  --alpha=<f32>               Epilogue scalar alpha\n"
+          << "  --beta=<f32>                Epilogue scalar beta\n\n"
+          << "  --iterations=<int>          Iterations\n\n";
+
+      return out;
+    }
+};
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+/// Allocates and fills the operands of one GEMM, runs it, verifies D against a device reference
+/// and reports timing. initialize() and run() are virtual so derived runners can override them.
+template <class Gemm>
+struct BenchmarkRunner {
+    using StrideA = typename Gemm::GemmKernel::StrideA;
+    using StrideB = typename Gemm::GemmKernel::StrideB;
+    using StrideC = typename Gemm::GemmKernel::StrideC;
+    using StrideD = typename Gemm::GemmKernel::StrideD;
+
+    using LayoutA = typename Gemm::LayoutA;
+    using LayoutB = typename Gemm::LayoutB;
+    using LayoutC = typename Gemm::LayoutC;
+    using LayoutD = typename Gemm::LayoutD;
+
+    using ElementA = typename Gemm::ElementA;
+    using ElementB = typename Gemm::ElementB;
+    using ElementAcc = typename Gemm::ElementAccumulator;
+
+    using CollectiveEpilogue = typename Gemm::CollectiveEpilogue;
+    using ElementC = typename Gemm::ElementC;
+    using ElementOutput = typename CollectiveEpilogue::ElementOutput;
+    using ElementCompute = typename CollectiveEpilogue::ElementCompute;
+    using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator;
+
+    using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape;
+
+    // Data members
+    /// Packed strides of A, B, C and D, set by initialize()
+    StrideA stride_A;
+    StrideB stride_B;
+    StrideC stride_C;
+    StrideD stride_D;
+
+    /// Device buffers; block_ref_D receives the reference result computed in verify()
+    cutlass::DeviceAllocation<ElementA> block_A;
+    cutlass::DeviceAllocation<ElementB> block_B;
+    cutlass::DeviceAllocation<ElementC> block_C;
+    cutlass::DeviceAllocation<ElementOutput> block_D;
+    cutlass::DeviceAllocation<ElementOutput> block_ref_D;
+
+    // Methods
+    /// Returns true if `status` is kSuccess; otherwise prints `what` and the status string to stderr.
+    static bool succeeded(cutlass::Status status, const char* what) {
+      if (status == cutlass::Status::kSuccess) { return true; }
+      std::cerr << what << " failed: " << cutlassGetStatusString(status) << std::endl;
+      return false;
+    }
+
+    /// Computes the reference GEMM on the device into block_ref_D and compares it with block_D.
+    bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) {
+      auto [M, N, K, L] = problem_size;
+
+      cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K}));
+      cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N}));
+      cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N}));
+      cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N}));
+
+      cutlass::reference::device::GemmComplex(
+              {M, N, K},
+              alpha,
+              ref_A,
+              cutlass::ComplexTransform::kNone,
+              ref_B,
+              cutlass::ComplexTransform::kNone,
+              beta,
+              ref_C,
+              ref_D,
+              ElementAccumulator(0),
+              L,     // batch_count
+              M * K, // batch_stride_A
+              K * N, // batch_stride_B
+              M * N, // batch_stride_C
+              M * N  // batch_stride_D
+      );
+
+#if defined(CUTLASS_ENABLE_SYCL)
+      syclcompat::wait();
+#else
+      cudaDeviceSynchronize();
+#endif
+
+      // Check if output from CUTLASS kernel and reference kernel are relatively equal or not
+      // need to set a larger error margin for comparison to succeed
+      auto epsilon = static_cast<ElementOutput>(0.1f);
+      auto nonzero_floor = static_cast<ElementOutput>(0.1f);
+
+      return cutlass::reference::device::BlockCompareRelativelyEqual(
+              block_ref_D.get(), block_D.get(), block_D.size(),
+              epsilon, nonzero_floor);
+    }
+
+    /// Initialize operands to be used in the GEMM and reference GEMM
+    virtual void initialize(const ProblemShapeType& problem_size) {
+      auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+      auto [M, N, K, L] = problem_shape_MNKL;
+
+      stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L));
+      stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L));
+      stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L));
+      stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L));
+
+      block_A.reset(M * K * L);
+      block_B.reset(K * N * L);
+      block_C.reset(M * N * L);
+      block_D.reset(M * N * L);
+      block_ref_D.reset(M * N * L);
+
+      // TODO: Enable initialization on device directly once RNG is
+      // available through SYCL.
+      std::vector<ElementA> a(K * M * L);
+      std::vector<ElementB> b(K * N * L);
+      std::vector<ElementC> c(M * N * L);
+      std::vector<ElementOutput> d(M * N * L, ElementOutput{-1});  // element type matches block_D; -1 / -2 are distinct fills
+      std::vector<ElementOutput> ref_d(M * N * L, ElementOutput{-2});
+
+      fill_matrix(a);
+      fill_matrix(b);
+      fill_matrix(c);
+
+      block_A.copy_from_host(a.data(), a.size());
+      block_B.copy_from_host(b.data(), b.size());
+      block_C.copy_from_host(c.data(), c.size());
+      block_D.copy_from_host(d.data(), d.size());
+      block_ref_D.copy_from_host(ref_d.data(), ref_d.size());
+    }
+
+    /// Initializes the operands, runs the GEMM once, verifies it, then times options.iterations runs if it passed.
+    virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) {
+      ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
+
+      initialize(problem_size);
+
+      typename Gemm::GemmKernel::Arguments arguments{
+              cutlass::gemm::GemmUniversalMode::kGemm,
+              problem_size,
+              {block_A.get(), stride_A, block_B.get(), stride_B},
+              {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D},
+              hw_info
+      };
+
+      Gemm gemm_op;
+      size_t workspace_size = Gemm::get_workspace_size(arguments);
+      cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+      // Check, set up and launch the GEMM in order; stop at the first step that does not succeed
+      if (!succeeded(gemm_op.can_implement(arguments), "can_implement") ||
+          !succeeded(gemm_op.initialize(arguments, workspace.get()), "initialize") ||
+          !succeeded(gemm_op.run(), "run")) {
+        return;
+      }
+
+#if defined(CUTLASS_ENABLE_SYCL)
+      syclcompat::wait();
+#else
+      cudaDeviceSynchronize();
+#endif
+
+      // Verify that the result is correct
+      bool passed = verify(problem_size, options.alpha, options.beta);
+      std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
+
+      if (passed && options.iterations > 0) {
+        GPU_Clock timer;
+        timer.start();
+        for (int i = 0; i < options.iterations; ++i) {
+          gemm_op.run();
+        }
+        float cute_time = timer.seconds() / options.iterations;
+        double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
+        std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
+        printf("Cutlass GEMM Performance:     [%4.3f]TFlop/s  (%6.4f)ms\n", tflops / cute_time, cute_time * 1000);
+      }
+    }
+};
+
+/// BenchmarkRunner variant that feeds the kernel a re-packed copy of B produced by vnni_matrix(),
+/// while verification still uses the original B held by the base class.
+template <class Gemm>
+struct PvcBenchmarkRunner : BenchmarkRunner<Gemm> {
+    using Base = BenchmarkRunner<Gemm>;
+    using ElementB = typename Base::ElementB;
+    using ProblemShapeType = typename Base::ProblemShapeType;
+
+    /// Device copy of B after the vnni_matrix() reordering; passed to the kernel in run()
+    cutlass::DeviceAllocation<ElementB> block_B_vnni;
+
+    /// Copies src into dst, interleaving each group of `factor` consecutive rows: within a
+    /// group, the `factor` values that share a column become adjacent in dst. Only full groups
+    /// are written (numRows / factor per batch), so numRows should be a multiple of factor.
+    template <typename T>
+    void vnni_matrix(T* dst, const T* src, int batch, int numRows, int numCols, int factor)
+    {
+      for (int b = 0; b < batch; b++) {
+        for (int r = 0; r < numRows / factor; r++) {
+          for (int c = 0; c < numCols; c++) {
+            for (int k = 0; k < factor; k++) {
+              dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] =
+                      src[((b * (numRows / factor) + r) * factor + k) * numCols + c];
+            }
+          }
+        }
+      }
+    }
+
+    /// Runs the base initialization, then builds block_B_vnni from block_B with factor 2.
+    void initialize(const ProblemShapeType& problem_size) override {
+      Base::initialize(problem_size);
+      auto problem_shape_MNKL = cute::append<4>(problem_size, 1);
+      auto [M, N, K, L] = problem_shape_MNKL;
+
+      block_B_vnni.reset(Base::block_B.size());
+
+      std::vector<ElementB> b(K * N * L);
+      std::vector<ElementB> b_vnni(b.size());
+
+      Base::block_B.copy_to_host(b.data());
+      vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2);
+
+      block_B_vnni.copy_from_host(b_vnni.data());
+    }
+
+    /// Initializes, runs the GEMM once with block_B_vnni as operand B, verifies, then times it if it passed.
+    void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override {
+      ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l};
+
+      initialize(problem_size);
+
+      typename Gemm::GemmKernel::Arguments arguments{
+              cutlass::gemm::GemmUniversalMode::kGemm,
+              problem_size,
+              {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B},
+              {{options.alpha, options.beta}, Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D},
+              hw_info
+      };
+
+      Gemm gemm_op;
+      size_t workspace_size = Gemm::get_workspace_size(arguments);
+      cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
+
+      // Check, set up and launch the GEMM in order; stop at the first step that does not succeed
+      cutlass::Status status = gemm_op.can_implement(arguments);
+      if (status == cutlass::Status::kSuccess) { status = gemm_op.initialize(arguments, workspace.get()); }
+      if (status == cutlass::Status::kSuccess) { status = gemm_op.run(); }
+      if (status != cutlass::Status::kSuccess) {
+        std::cerr << "GEMM setup or launch failed: " << cutlassGetStatusString(status) << std::endl;
+        return;
+      }
+
+#if defined(CUTLASS_ENABLE_SYCL)
+      syclcompat::wait();
+#else
+      cudaDeviceSynchronize();
+#endif
+
+      // Verify that the result is correct
+      bool passed = Base::verify(problem_size, options.alpha, options.beta);
+      std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
+
+      if (passed && options.iterations > 0) {
+        GPU_Clock timer;
+        timer.start();
+        for (int i = 0; i < options.iterations; ++i) {
+          gemm_op.run();
+        }
+        float cute_time = timer.seconds() / options.iterations;
+        double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12;
+        std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl;
+        printf("Cutlass GEMM Performance:     [%4.3f]TFlop/s  (%6.4f)ms\n", tflops / cute_time, cute_time*1000);
+      }
+    }
+};
+
diff --git a/benchmarks/pvc/CMakeLists.txt b/benchmarks/pvc/CMakeLists.txt
new file mode 100644
index 0000000000..b97e679fce
--- /dev/null
+++ b/benchmarks/pvc/CMakeLists.txt
@@ -0,0 +1,33 @@
+# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+# SPDX-License-Identifier: BSD-3-Clause
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# 1. Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# 3. Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+# Passes the benchmark target name and its source file to cutlass_benchmark_add_executable (helper defined outside this file)
+cutlass_benchmark_add_executable(
+  bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32
+  bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp
+)
diff --git a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp
new file mode 100644
index 0000000000..67b76929db
--- /dev/null
+++ b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp
@@ -0,0 +1,136 @@
+/***************************************************************************************************
+ * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved.
+ * SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+#include "../common/benchmark_runner.hpp"
+
+using namespace cute;
+
+///////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Benchmarks a bfloat16 x bfloat16 -> float GEMM assembled from the XE copy and MMA atoms below.
+// Returns 0 after printing usage or running the benchmark, and -1 when options.error is set.
+int main(int argc, const char** argv)
+{
+  //
+  // Parse options
+  //
+
+  Options options;
+
+  options.parse(argc, argv);
+
+  if (options.help) {
+    options.print_usage(std::cout) << std::endl;
+    return 0;
+  }
+
+  if (options.error) {
+    std::cerr << "Aborting execution." << std::endl;
+    return -1;
+  }
+
+  //
+  // Run benchmark
+  //
+
+  // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This
+  // information is used by the underlying kernel.
+  cutlass::KernelHardwareInfo hw_info;
+
+  // Set hw_info.device_id before this query if you are running on a machine with multiple GPUs
+  // and wish to use a GPU other than the default one.
+  hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id);
+
+  // The code section below describes datatype for input, output matrices and computation between
+  // elements in input matrices.
+  using ElementAccumulator = float;      // <- data type of accumulator
+  using ElementComputeEpilogue = float;  // <- data type of epilogue operations
+  using ElementInputA = bfloat16_t;      // <- data type of elements in input matrix A
+  using ElementInputB = bfloat16_t;      // <- data type of elements in input matrix B
+  using ElementOutput = float;           // <- data type of elements in output matrix D
+
+  using LayoutA = cutlass::layout::RowMajor;
+  using LayoutB = cutlass::layout::RowMajor;
+  using LayoutC = cutlass::layout::RowMajor;
+  using LayoutD = cutlass::layout::RowMajor;
+
+  using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N;
+  using GmemTiledCopyB = XE_2D_U16x16x16x2x1_LD_N;
+
+  using TileShape = Shape<_32, _64, _32>;
+
+  using TiledMma = TiledMMA<MMA_Atom<XE_8x16x16_BF16BF16F32F32_NN>,
+          Layout<Shape<_8,_16,_1>>>;
+
+  using DispatchPolicy = cutlass::gemm::MainloopIntelPVCUnpredicated;
+
+  using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
+          ElementOutput,                                     // <- data type of output matrix
+          128 / cutlass::sizeof_bits<ElementOutput>::value,  // <- the number of elements per vectorized
+          // memory access. For a byte, it's 16
+          // elements. This becomes the vector width of
+          // math instructions in the epilogue too
+          ElementAccumulator,                                // <- data type of accumulator
+          ElementComputeEpilogue>;  // <- data type for alpha/beta in linear combination function
+
+  using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue<
+          cutlass::gemm::TagToStrideC_t<LayoutC>,
+          cutlass::gemm::TagToStrideC_t<LayoutD>,
+          EpilogueOp,
+          cutlass::gemm::EpilogueDefault>;
+
+  // Mainloop
+  using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma<
+          DispatchPolicy,
+          TileShape,
+          ElementInputA,
+          cutlass::gemm::TagToStrideA_t<LayoutA>,
+          ElementInputB,
+          cutlass::gemm::TagToStrideB_t<LayoutB>,
+          TiledMma,
+          GmemTiledCopyA, void, void, cute::identity,  // A
+          GmemTiledCopyB, void, void, cute::identity   // B
+  >;
+
+  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
+  Shape<int, int, int, int>,
+  CollectiveMainloop,
+  CollectiveEpilogue
+  >;
+
+  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
+
+  PvcBenchmarkRunner<Gemm> runner;
+
+  runner.run(options, hw_info);
+
+  return 0;
+}
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index ef0449f902..b736ce35e8 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -27,6 +27,6 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
-if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc")
+if(SYCL_INTEL_TARGET)
   add_subdirectory(pvc)
 endif()
diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
index c7ccfd832a..343075681a 100644
--- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
+++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp
@@ -235,7 +235,7 @@ struct ExampleRunner {
     block_D.reset(M * N * L);
     block_ref_D.reset(M * N * L);
 
-    // TODO: Enable initialization on device directly once RNG is 
+    // TODO: Enable initialization on device directly once RNG is
     // available through SYCL.
     std::vector<ElementA> a(K * M * L);
     std::vector<ElementB> b(K * N * L);
diff --git a/include/cute/arch/copy_sm75.hpp b/include/cute/arch/copy_sm75.hpp
index 30d86b7ba7..b0bce99f9a 100644
--- a/include/cute/arch/copy_sm75.hpp
+++ b/include/cute/arch/copy_sm75.hpp
@@ -48,6 +48,10 @@
   #define CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 ((__CUDACC_VER_MAJOR__  == 10 && __CUDACC_VER_MINOR__ >= 2) || __CUDACC_VER_MAJOR__ >= 11)
 #endif
 
+#if defined(SYCL_NVIDIA_TARGET)
+  #define CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 1
+#endif
+
 #if ! defined(CUTE_ARCH_LDSM_SM75_SUPPORTED)
   #define CUTE_ARCH_LDSM_SM75_SUPPORTED (CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 || CUTE_ARCH_CLANG_SUPPORTS_LDSM_SM75)
 #endif
diff --git a/include/cutlass/vector_types.h b/include/cutlass/vector_types.h
index e34730db3b..3086fcea80 100644
--- a/include/cutlass/vector_types.h
+++ b/include/cutlass/vector_types.h
@@ -31,13 +31,17 @@
 #pragma once
 
 #if defined(CUTLASS_ENABLE_SYCL)
-#include <sycl/sycl.hpp>
+#include "cutlass/detail/helper_macros.hpp"
 
 // Add these definitions in the cutlass namespace, so they do not clash with the ones in cuda
 namespace cutlass {
     // We use this struct instead of sycl::int4 because the sycl version requires x() to access x,
     // while the struct does not need the (). This prevents us from having to modify the Cutlass
     // implementation in all the places where these vector types are used.
+    using int2 = struct alignas(8) {  // plain 8-byte-aligned struct so members are read as .x / .y
+        int x, y;
+    };
+
     using int4 = struct alignas(16) {
         int x, y, z, w;
     };
@@ -85,6 +89,16 @@ namespace cutlass {
     using double4 = struct alignas(16) {
         long long int x, y, z, w;
     };
+
+    CUTLASS_HOST_DEVICE
+    int2 make_int2(int x, int y) {  // builds a cutlass::int2 from its two components
+      return int2{x,y};
+    }
+
+    CUTLASS_HOST_DEVICE
+    int4 make_int4(int x, int y, int z, int w) {  // builds a cutlass::int4 from its four components
+      return int4 {x,y,z,w};
+    }
 }
 #else
 #include <vector_types.h>