diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 0000000000..ec48d0d2bb --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,14 @@ + +version: 2 +updates: +# Enable version updates for Github Actions + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "monthly" + groups: + github-actions: + patterns: + - "*" + reviewers: + - "codeplaysoftware/security-managers" diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml new file mode 100644 index 0000000000..84d38c1e57 --- /dev/null +++ b/.github/workflows/coverity.yml @@ -0,0 +1,53 @@ +name: Coverity Scan + +# We only want to test official release code, not every pull request. +on: + push: + branches: + - sycl-develop + pull_request: + +permissions: read-all + +jobs: + coverity: + runs-on: ubuntu-latest + container: nvidia/cuda:12.4.1-devel-ubuntu22.04 + steps: + - uses: actions/checkout@a5ac7e51b41094c92402da3b24376905380afc29 # v4.1.6 + - name: Configure image + run: > + apt update && apt install -y cmake curl git jq libstdc++-12-dev \ + ninja-build python3 wget + - name: Download DPCPP + shell: bash + run: | + cd /usr/local + echo "Will use DPCPP ${DPCPP_VERSION:-latest}." + if [[ "${DPCPP_VERSION}" != "" ]]; then + echo "Downloading DPCPP from https://github.com/intel/llvm/releases/download/$DPCPP_VERSION/sycl_linux.tar.gz" + wget -q https://github.com/intel/llvm/releases/download/$DPCPP_VERSION/sycl_linux.tar.gz + else + latest=$(curl -sS https://api.github.com/repos/intel/llvm/releases | jq -r '[.[].tag_name|select(match("nightly-[0-9]{4}-[0-9]{2}-[0-9]{2}"))][0]') + echo "Downloading DPCPP from https://github.com/intel/llvm/releases/download/${latest}/sycl_linux.tar.gz" + wget -q https://github.com/intel/llvm/releases/download/${latest}/sycl_linux.tar.gz + fi + tar -xf sycl_linux.tar.gz + - name: Configure CMake + run: | + export PATH=/usr/local/bin/:$PATH + export C_INCLUDE_PATH=/usr/local/include/:$C_INCLUDE_PATH + export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH + export CC=clang + export CXX=clang++ + cmake -G Ninja \ + -DCMAKE_CUDA_HOST_COMPILER=clang++ \ + -DCUTLASS_ENABLE_SYCL=ON \ + -DDPCPP_SYCL_TARGET=nvptx64-nvidia-cuda \ + -DDPCPP_SYCL_ARCH=sm_80 + - uses: vapier/coverity-scan-action@2068473c7bdf8c2fb984a6a40ae76ee7facd7a85 # v1.8.0 + with: + email: ${{ secrets.COVERITY_SCAN_EMAIL }} + token: ${{ secrets.COVERITY_SCAN_TOKEN }} + command: cmake --build . + working-directory: '' diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml new file mode 100644 index 0000000000..e742840395 --- /dev/null +++ b/.github/workflows/scorecard.yml @@ -0,0 +1,53 @@ +# Scorecards' GitHub action + +name: Scorecard supply-chain security +on: + # For Branch-Protection check. Only the default branch is supported. See + # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection + branch_protection_rule: + schedule: + - cron: '18 16 * * 3' + push: + branches: [ "sycl-develop" ] + +# Declare default permissions as read only. +permissions: read-all + +jobs: + analysis: + name: Scorecard analysis + runs-on: ubuntu-latest + permissions: + # Needed to upload the results to code-scanning dashboard. + security-events: write + # Needed to publish results and get a badge (see publish_results below). + id-token: write + + steps: + - name: "Checkout code" + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + persist-credentials: false + + - name: "Run analysis" + uses: ossf/scorecard-action@dc50aa9510b46c811795eb24b2f1ba02a914e534 # v2.3.3 + with: + results_file: results.sarif + results_format: sarif + publish_results: true + + # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF + # format to the repository Actions tab. + - name: "Upload artifact" + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 + with: + name: SARIF file + path: results.sarif + retention-days: 5 + + # Upload the results to GitHub's code scanning dashboard (optional). + # Commenting out will disable upload of results to your repo's Code Scanning dashboard + - name: "Upload to code-scanning" + uses: github/codeql-action/upload-sarif@1b1aada464948af03b950897e5eb522f92603cc2 # v3.24.9 + with: + sarif_file: results.sarif diff --git a/CMakeLists.txt b/CMakeLists.txt index 91408ddaa9..46b95fbaa6 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -159,10 +159,7 @@ set(CUTLASS_ENABLE_LIBRARY ${CUTLASS_ENABLE_LIBRARY_INIT} CACHE BOOL "Enable CUT set(CUTLASS_ENABLE_PROFILER ${CUTLASS_ENABLE_LIBRARY} CACHE BOOL "Enable CUTLASS Profiler") set(CUTLASS_ENABLE_PERFORMANCE ${CUTLASS_ENABLE_PROFILER} CACHE BOOL "Enable CUTLASS Performance") option(CUTLASS_ENABLE_DEBUG_PRINTS "Whether or not to enable debug prints in CUTLASS kernels" OFF) - -if (CUTLASS_ENABLE_SYCL) - set(CUTLASS_ENABLE_BENCHMARKS ON CACHE BOOL "Enable CUTLASS Benchmarks") -endif() +set(CUTLASS_ENABLE_BENCHMARKS ON CACHE BOOL "Enable CUTLASS Benchmarks") set(CUTLASS_ENABLE_TESTS ${CUTLASS_ENABLE_TESTS_INIT} CACHE BOOL "Enable CUTLASS Tests") set(CUTLASS_ENABLE_GTEST_UNIT_TESTS ${CUTLASS_ENABLE_TESTS} CACHE BOOL "Enable CUTLASS GTest-based Unit Tests") diff --git a/README-sycl.md b/README-sycl.md index 3b412fb954..3d59a7bdc8 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -1,5 +1,7 @@ # SYCL support for CUTLASS +[![OpenSSF Scorecard](https://api.scorecard.dev/projects/github.com/codeplaysoftware/cutlass-fork/badge)](https://scorecard.dev/viewer/?uri=github.com/codeplaysoftware/cutlass-fork) + This repository contains a development version of the CUTLASS repository with experimental SYCL support enabled. The aim is to support other SYCL-enabled devices with the minimal source code modifications by using the same CUTLASS features. diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 9e127e3e34..e4ec52b63f 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -43,28 +43,32 @@ function(cutlass_benchmark_add_executable NAME) add_dependencies(cutlass_benchmarks ${NAME}) + if (NOT CUTLASS_ENABLE_SYCL) + SET(ADD_CUDA ON) + endif() + target_link_libraries( ${NAME} PRIVATE CUTLASS cutlass_tools_util_includes - ) - - target_include_directories( - ${NAME} - PRIVATE - ${CUTLASS_BENCHMARKS_COMMON_SOURCE_DIR} - ) + $<$:nvidia::cublas> + $<$:cuda> + ) - add_sycl_to_target(TARGET ${NAME}) + if (CUTLASS_ENABLE_SYCL) + add_sycl_to_target(TARGET ${NAME}) + endif() install( TARGETS ${NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ) + ) endfunction() -foreach(BENCH -) - add_subdirectory(${BENCH}) -endforeach() +if(SYCL_INTEL_TARGET) + add_subdirectory(pvc) +endif() +if(SYCL_NVIDIA_TARGET OR NOT CUTLASS_ENABLE_SYCL) + add_subdirectory(ampere) +endif() diff --git a/benchmarks/ampere/CMakeLists.txt b/benchmarks/ampere/CMakeLists.txt new file mode 100644 index 0000000000..a77901594b --- /dev/null +++ b/benchmarks/ampere/CMakeLists.txt @@ -0,0 +1,38 @@ +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_benchmark_add_executable( + bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32 + bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu +) + +cutlass_benchmark_add_executable( + bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32 + bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu +) diff --git a/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu new file mode 100644 index 0000000000..8dad127417 --- /dev/null +++ b/benchmarks/ampere/bench_ampere_gemm_bf16_bf16_fp32_tensor_op_fp32.cu @@ -0,0 +1,153 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "../common/benchmark_runner.hpp" +#include "gemm_configuration.hpp" + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run benchmark + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + +// The code section below describes datatype for input, output matrices and computation between +// elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + using TileShape = Shape<_128, _128, _32>; + + using TiledMma = TiledMMA< + MMA_Atom, + Layout>, // 2x2x1 thread group + Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group + + static constexpr int kAlignmentA = 8; + using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA< + ElementInputA, LayoutA, kAlignmentA, 32>; + using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K + using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom; + using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy; + + static constexpr int kAlignmentB = 8; + using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB< + ElementInputB, LayoutB, kAlignmentB, 32>; + using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K + using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom; + using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy; + + using Stages = Int<3>; + + // This code section describes the epilogue part of the kernel + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; + + // Define strides (mixed) + using StrideA = cutlass::detail::TagToStrideA_t; + using StrideB = cutlass::detail::TagToStrideB_t; + using StrideC = cutlass::detail::TagToStrideC_t; + using StrideD = cutlass::detail::TagToStrideC_t; + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + StrideC, + StrideD, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + + // Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementInputA, + StrideA, + ElementInputB, + StrideB, + TiledMma, + GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A + GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + BenchmarkRunner runner; + + runner.run(options, hw_info); + + return 0; +} diff --git a/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu b/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu new file mode 100644 index 0000000000..69bc482f12 --- /dev/null +++ b/benchmarks/ampere/bench_ampere_gemm_fp16_fp16_fp32_tensor_op_fp32.cu @@ -0,0 +1,153 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "../common/benchmark_runner.hpp" +#include "gemm_configuration.hpp" + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run Benchmark + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + +// The code section below describes datatype for input, output matrices and computation between +// elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = half_t; // <- data type of elements in input matrix A + using ElementInputB = half_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::ColumnMajor; + using LayoutB = cutlass::layout::ColumnMajor; + using LayoutC = cutlass::layout::ColumnMajor; + using LayoutD = cutlass::layout::ColumnMajor; + + using TileShape = Shape<_128, _128, _32>; + + using TiledMma = TiledMMA< + MMA_Atom, + Layout>, // 2x2x1 thread group + Tile<_32,_32,_16>>; // 32x32x8 MMA for LDSM, 1x2x1 value group + + static constexpr int kAlignmentA = 8; + using DefaultOperandA = DefaultGemm_TensorOpSm80_OperandA< + ElementInputA, LayoutA, kAlignmentA, 32>; + using SmemLayoutAtomA = typename DefaultOperandA::SmemLayoutAtom; // M, K + using SmemCopyAtomA = typename DefaultOperandA::SmemCopyAtom; + using GmemTiledCopyA = typename DefaultOperandA::GmemTiledCopy; + + static constexpr int kAlignmentB = 8; + using DefaultOperandB = DefaultGemm_TensorOpSm80_OperandB< + ElementInputB, LayoutB, kAlignmentB, 32>; + using SmemLayoutAtomB = typename DefaultOperandB::SmemLayoutAtom; // N, K + using SmemCopyAtomB = typename DefaultOperandB::SmemCopyAtom; + using GmemTiledCopyB = typename DefaultOperandB::GmemTiledCopy; + + using Stages = Int<3>; + + // This code section describes the epilogue part of the kernel + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + using DispatchPolicy = cutlass::gemm::MainloopSm80CpAsync; + + // Define strides (mixed) + using StrideA = cutlass::detail::TagToStrideA_t; + using StrideB = cutlass::detail::TagToStrideB_t; + using StrideC = cutlass::detail::TagToStrideC_t; + using StrideD = cutlass::detail::TagToStrideC_t; + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + StrideC, + StrideD, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + + // Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementInputA, + StrideA, + ElementInputB, + StrideB, + TiledMma, + GmemTiledCopyA, SmemLayoutAtomA, SmemCopyAtomA, cute::identity, // A + GmemTiledCopyB, SmemLayoutAtomB, SmemCopyAtomB, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + BenchmarkRunner runner; + + runner.run(options, hw_info); + + return 0; +} diff --git a/benchmarks/ampere/gemm_configuration.hpp b/benchmarks/ampere/gemm_configuration.hpp new file mode 100644 index 0000000000..484786567f --- /dev/null +++ b/benchmarks/ampere/gemm_configuration.hpp @@ -0,0 +1,197 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "cutlass/half.h" +#include "cutlass/layout/layout.h" + +#include "cute/swizzle.hpp" +#include "cute/layout.hpp" +#include "cute/arch/copy_sm75.hpp" +#include "cute/arch/copy_sm80.hpp" +#include "cute/atom/copy_atom.hpp" + +using namespace cute; + +template +struct DefaultGemm_TensorOpSm80_OperandA; + +template +struct DefaultGemm_TensorOpSm80_OperandB; + +///////////////////////////////////////////////////////////////////////// + +// half + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride<_64, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, half_t>{}, + Layout, + Stride< _8,_1>>{}, + Layout>{})); +}; + +/// Operand A - Column-major (M-major) +template +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride< _1,_64>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, half_t>{}, + Layout, + Stride< _1,_16>>{}, + Layout>{})); +}; + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<2,3,3>{}, + Layout, + Stride<_32, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, half_t>{}, + Layout, + Stride< _4,_1>>{}, + Layout>{})); +}; + +// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands + +// Operand B - Column-Major (K-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; + +// Operand B - Row-Major (N-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; + +///////////////////////////////////////////////////////////////////////// + +// Bfloat + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride<_64, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, bfloat16_t>{}, + Layout, + Stride< _8,_1>>{}, + Layout>{})); +}; + +/// Operand A - Column-major (M-major) +template +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<3,3,3>{}, + Layout, + Stride< _1,_64>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, bfloat16_t>{}, + Layout, + Stride< _1,_16>>{}, + Layout>{})); +}; + +/// Operand A - Row-major (K-Major) +template <> +struct DefaultGemm_TensorOpSm80_OperandA +{ + // Smem + using SmemLayoutAtom = decltype( + composition(Swizzle<2,3,3>{}, + Layout, + Stride<_32, _1>>{})); + using SmemCopyAtom = Copy_Atom; + + // Gmem + using GmemTiledCopy = decltype( + make_tiled_copy(Copy_Atom, bfloat16_t>{}, + Layout, + Stride< _4,_1>>{}, + Layout>{})); +}; + +// Because the F32F16 TiledMMA is A-B symmetric, we can reuse the DefaultOperands + +// Operand B - Column-Major (K-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; + +// Operand B - Row-Major (N-major) +template +struct DefaultGemm_TensorOpSm80_OperandB + : DefaultGemm_TensorOpSm80_OperandA +{}; diff --git a/benchmarks/common/benchmark_runner.hpp b/benchmarks/common/benchmark_runner.hpp new file mode 100644 index 0000000000..5eb2ade3eb --- /dev/null +++ b/benchmarks/common/benchmark_runner.hpp @@ -0,0 +1,388 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "cutlass/gemm/device/gemm.h" +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/gemm/device/gemm_universal.h" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/collective/collective_mma.hpp" +#include "cutlass/util/GPU_Clock.hpp" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cute/tensor.hpp" + +#include "cutlass/util/command_line.h" +#include "cutlass/util/device_memory.h" +#include "cutlass/util/packed_stride.hpp" +#include "cutlass/util/reference/device/gemm_complex.h" +#include "cutlass/util/reference/device/tensor_compare.h" +#include "cutlass/util/print_error.hpp" + +template +static void fill_matrix(std::vector &M) +{ + std::generate(std::begin(M), std::end(M), [&] + { return static_cast( 2 * (rand() / double(RAND_MAX)) - 1); }); +} + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +// Command line options parsing +struct Options { + + bool help; + bool error; + + int m, n, k, l, iterations; + float alpha, beta; + + Options(): + help(false), + error(false), + m(4096), n(4096), k(4096), l(1), iterations(100), + alpha(1.f), beta(0.f) + { } + + // Parses the command line + void parse(int argc, char const **args) { + cutlass::CommandLine cmd(argc, args); + + if (cmd.check_cmd_line_flag("help")) { + help = true; + return; + } + + cmd.get_cmd_line_argument("m", m, 4096); + cmd.get_cmd_line_argument("n", n, 4096); + cmd.get_cmd_line_argument("k", k, 4096); + cmd.get_cmd_line_argument("l", l, 1); + cmd.get_cmd_line_argument("alpha", alpha, 1.f); + cmd.get_cmd_line_argument("beta", beta, 0.f); + cmd.get_cmd_line_argument("iterations", iterations, 100); + } + + /// Prints the usage statement. + std::ostream & print_usage(std::ostream &out) const { + + out << "PVC GEMM Benchmark\n\n" + << "Options:\n\n" + << " --help If specified, displays this usage statement\n\n" + << " --m= Sets the M extent of the GEMM\n" + << " --n= Sets the N extent of the GEMM\n" + << " --k= Sets the K extent of the GEMM\n" + << " --l= Sets the L extent (batch count) of the GEMM\n" + << " --alpha= Epilogue scalar alpha\n" + << " --beta= Epilogue scalar beta\n\n" + << " --iterations= Iterations\n\n"; + + return out; + } +}; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct BenchmarkRunner { + + using StrideA = typename Gemm::GemmKernel::StrideA; + using StrideB = typename Gemm::GemmKernel::StrideB; + using StrideC = typename Gemm::GemmKernel::StrideC; + using StrideD = typename Gemm::GemmKernel::StrideD; + + using LayoutA = typename Gemm::LayoutA; + using LayoutB = typename Gemm::LayoutB; + using LayoutC = typename Gemm::LayoutC; + using LayoutD = typename Gemm::LayoutD; + + using ElementA = typename Gemm::ElementA; + using ElementB = typename Gemm::ElementB; + using ElementAcc = typename Gemm::ElementAccumulator; + + using CollectiveEpilogue = typename Gemm::CollectiveEpilogue; + using ElementC = typename Gemm::ElementC; + using ElementOutput = typename CollectiveEpilogue::ElementOutput; + using ElementCompute = typename CollectiveEpilogue::ElementCompute; + using ElementAccumulator = typename CollectiveEpilogue::ElementAccumulator; + + using ProblemShapeType = typename Gemm::GemmKernel::ProblemShape; + + // + // Data members + // + + /// Initialization + StrideA stride_A; + StrideB stride_B; + StrideC stride_C; + StrideD stride_D; + + cutlass::DeviceAllocation block_A; + cutlass::DeviceAllocation block_B; + cutlass::DeviceAllocation block_C; + cutlass::DeviceAllocation block_D; + cutlass::DeviceAllocation block_ref_D; + + // + // Methods + // + + bool verify(const ProblemShapeType& problem_size, ElementCompute alpha, ElementCompute beta) { + auto [M, N, K, L] = problem_size; + + cutlass::TensorRef ref_A(block_A.get(), LayoutA::packed({M, K})); + cutlass::TensorRef ref_B(block_B.get(), LayoutB::packed({K, N})); + cutlass::TensorRef ref_C(block_C.get(), LayoutC::packed({M, N})); + cutlass::TensorRef ref_D(block_ref_D.get(), LayoutD::packed({M, N})); + + cutlass::reference::device::GemmComplex( + {M, N, K}, + alpha, + ref_A, + cutlass::ComplexTransform::kNone, + ref_B, + cutlass::ComplexTransform::kNone, + beta, + ref_C, + ref_D, + ElementAccumulator(0), + L, // batch_count + M * K, // batch_stride_A + K * N, // batch_stride_B + M * N, // batch_stride_C + M * N // batch_stride_D + ); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Check if output from CUTLASS kernel and reference kernel are relatively equal or not + // need to set a larger error margin for comparison to succeed + auto epsilon = static_cast(0.1f); + auto nonzero_floor = static_cast(0.1f); + + bool passed = cutlass::reference::device::BlockCompareRelativelyEqual( + block_ref_D.get(), block_D.get(), block_D.size(), + epsilon, nonzero_floor); + + return passed; + } + + /// Initialize operands to be used in the GEMM and reference GEMM + virtual void initialize(const ProblemShapeType& problem_size) { + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(M, K, L)); + stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(N, K, L)); + stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(M, N, L)); + stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(M, N, L)); + + block_A.reset(M * K * L); + block_B.reset(K * N * L); + block_C.reset(M * N * L); + block_D.reset(M * N * L); + block_ref_D.reset(M * N * L); + + // TODO: Enable initialization on device directly once RNG is + // available through SYCL. + std::vector a(K * M * L); + std::vector b(K * N * L); + std::vector c(M * N * L); + std::vector d(M * N * L, ElementC{-1}); + std::vector ref_d(M * N * L, ElementC{-2}); + + fill_matrix(a); + fill_matrix(b); + fill_matrix(c); + + block_A.copy_from_host(a.data(), a.size()); + block_B.copy_from_host(b.data(), b.size()); + block_C.copy_from_host(c.data(), c.size()); + block_D.copy_from_host(d.data(), d.size()); + block_ref_D.copy_from_host(ref_d.data(), d.size()); + } + + virtual void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {block_A.get(), stride_A, block_B.get(), stride_B}, + {{options.alpha, options.beta}, block_C.get(), stride_C, block_D.get(), stride_D}, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Verify that the result is correct + bool passed = verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l + << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time * 1000); + } + } +}; + +template +struct PvcBenchmarkRunner : BenchmarkRunner { + using Base = BenchmarkRunner; + + using ElementB = typename Base::ElementB; + + using ProblemShapeType = typename Base::ProblemShapeType; + + cutlass::DeviceAllocation block_B_vnni; + + template + void vnni_matrix( + T* dst, const T* src, + int batch, int numRows, int numCols, int factor) + { + for (int b = 0; b < batch; b++) { + for (int r = 0; r < numRows / factor; r++) { + for (int c = 0; c < numCols; c++) { + for (int k = 0; k < factor; k++) { + dst[((b * (numRows / factor) + r) * numCols + c) * factor + k] = + src[((b * (numRows / factor) + r) * factor + k) * numCols + c]; + } + } + } + } + } + + void initialize(const ProblemShapeType& problem_size) override { + Base::initialize(problem_size); + + auto problem_shape_MNKL = cute::append<4>(problem_size, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + block_B_vnni.reset(Base::block_B.size()); + + std::vector b(K * N * L); + std::vector b_vnni(b.size()); + + Base::block_B.copy_to_host(b.data()); + vnni_matrix(b_vnni.data(), b.data(), L, K, N, 2); + + block_B_vnni.copy_from_host(b_vnni.data()); + } + + void run(const Options& options, const cutlass::KernelHardwareInfo& hw_info) override { + ProblemShapeType problem_size = ProblemShapeType{options.m, options.n, options.k, options.l}; + + initialize(problem_size); + + typename Gemm::GemmKernel::Arguments arguments{ + cutlass::gemm::GemmUniversalMode::kGemm, + problem_size, + {Base::block_A.get(), Base::stride_A, block_B_vnni.get(), Base::stride_B}, + { + {options.alpha, options.beta}, + Base::block_C.get(), Base::stride_C, Base::block_D.get(), Base::stride_D + }, + hw_info + }; + + Gemm gemm_op; + + size_t workspace_size = Gemm::get_workspace_size(arguments); + cutlass::device_memory::allocation workspace(workspace_size); + + gemm_op.can_implement(arguments); + + gemm_op.initialize(arguments, workspace.get()); + + // Run the GEMM + gemm_op.run(); + +#if defined(CUTLASS_ENABLE_SYCL) + syclcompat::wait(); +#else + cudaDeviceSynchronize(); +#endif + + // Verify that the result is correct + bool passed = Base::verify(problem_size, options.alpha, options.beta); + std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl; + + if (passed && options.iterations > 0) { + GPU_Clock timer; + timer.start(); + for (int i = 0; i < options.iterations; ++i) { + gemm_op.run(); + } + + float cute_time = timer.seconds() / options.iterations; + double tflops = (2.0 * options.m * options.n * options.k * options.l) * 1e-12; + std::cout << "Problem Size: " << options.m << 'x' << options.n << 'x' << options.k << 'x' << options.l << std::endl; + printf("Cutlass GEMM Performance: [%4.3f]TFlop/s (%6.4f)ms\n", tflops / cute_time, cute_time*1000); + } + } +}; + diff --git a/benchmarks/pvc/CMakeLists.txt b/benchmarks/pvc/CMakeLists.txt new file mode 100644 index 0000000000..b97e679fce --- /dev/null +++ b/benchmarks/pvc/CMakeLists.txt @@ -0,0 +1,33 @@ +# Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +cutlass_benchmark_add_executable( + bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32 + bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp +) diff --git a/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp new file mode 100644 index 0000000000..67b76929db --- /dev/null +++ b/benchmarks/pvc/bench_pvc_gemm_bf16_bf16_fp32_dpas_fp32.cpp @@ -0,0 +1,136 @@ +/*************************************************************************************************** + * Copyright (c) 2024 - 2024 Codeplay Software Ltd. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#include "../common/benchmark_runner.hpp" + +using namespace cute; + +/////////////////////////////////////////////////////////////////////////////////////////////////// + +int main(int argc, const char** argv) +{ + // + // Parse options + // + + Options options; + + options.parse(argc, argv); + + if (options.help) { + options.print_usage(std::cout) << std::endl; + return 0; + } + + if (options.error) { + std::cerr << "Aborting execution." << std::endl; + return -1; + } + + // + // Run benchmark + // + + // The KernelHardwareInfo struct holds the number of EUs on the GPU with a given device ID. This + // information is used by the underlying kernel. + cutlass::KernelHardwareInfo hw_info; + + // Change device_id to another value if you are running on a machine with multiple GPUs and wish + // to use a GPU other than that with device ID 0. + hw_info.sm_count = cutlass::KernelHardwareInfo::query_device_multiprocessor_count(hw_info.device_id); + + bool passed; + + // The code section below describes datatype for input, output matrices and computation between + // elements in input matrices. + using ElementAccumulator = float; // <- data type of accumulator + using ElementComputeEpilogue = float; // <- data type of epilogue operations + using ElementInputA = bfloat16_t; // <- data type of elements in input matrix A + using ElementInputB = bfloat16_t; // <- data type of elements in input matrix B + using ElementOutput = float; // <- data type of elements in output matrix D + + using LayoutA = cutlass::layout::RowMajor; + using LayoutB = cutlass::layout::RowMajor; + using LayoutC = cutlass::layout::RowMajor; + using LayoutD = cutlass::layout::RowMajor; + + using GmemTiledCopyA = XE_2D_U16x8x16x4x2_LD_N; + using GmemTiledCopyB = XE_2D_U16x16x16x2x1_LD_N; + + using TileShape = Shape<_32, _64, _32>; + + using TiledMma = TiledMMA, + Layout>>; + + using DispatchPolicy = cutlass::gemm::MainloopIntelPVCUnpredicated; + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, // <- data type of output matrix + 128 / cutlass::sizeof_bits::value, // <- the number of elements per vectorized + // memory access. For a byte, it's 16 + // elements. This becomes the vector width of + // math instructions in the epilogue too + ElementAccumulator, // <- data type of accumulator + ElementComputeEpilogue>; // <- data type for alpha/beta in linear combination function + + using CollectiveEpilogue = cutlass::epilogue::collective::DefaultEpilogue< + cutlass::gemm::TagToStrideC_t, + cutlass::gemm::TagToStrideC_t, + EpilogueOp, + cutlass::gemm::EpilogueDefault>; + +// Mainloop + using CollectiveMainloop = cutlass::gemm::collective::CollectiveMma< + DispatchPolicy, + TileShape, + ElementInputA, + cutlass::gemm::TagToStrideA_t, + ElementInputB, + cutlass::gemm::TagToStrideB_t, + TiledMma, + GmemTiledCopyA, void, void, cute::identity, // A + GmemTiledCopyB, void, void, cute::identity // B + >; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal< + Shape, + CollectiveMainloop, + CollectiveEpilogue + >; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + PvcBenchmarkRunner runner; + + runner.run(options, hw_info); + + return 0; +} diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt index ef0449f902..b736ce35e8 100644 --- a/examples/sycl/CMakeLists.txt +++ b/examples/sycl/CMakeLists.txt @@ -27,6 +27,6 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -if("${DPCPP_SYCL_TARGET}" STREQUAL "intel_gpu_pvc") +if(SYCL_INTEL_TARGET) add_subdirectory(pvc) endif() diff --git a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp index c7ccfd832a..343075681a 100644 --- a/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp +++ b/examples/sycl/pvc/pvc_bfloat_dpas_gemm_cute.cpp @@ -235,7 +235,7 @@ struct ExampleRunner { block_D.reset(M * N * L); block_ref_D.reset(M * N * L); - // TODO: Enable initialization on device directly once RNG is + // TODO: Enable initialization on device directly once RNG is // available through SYCL. std::vector a(K * M * L); std::vector b(K * N * L); diff --git a/include/cute/arch/copy_sm75.hpp b/include/cute/arch/copy_sm75.hpp index 30d86b7ba7..b0bce99f9a 100644 --- a/include/cute/arch/copy_sm75.hpp +++ b/include/cute/arch/copy_sm75.hpp @@ -48,6 +48,10 @@ #define CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 ((__CUDACC_VER_MAJOR__ == 10 && __CUDACC_VER_MINOR__ >= 2) || __CUDACC_VER_MAJOR__ >= 11) #endif +#if defined(SYCL_NVIDIA_TARGET) + #define CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 1 +#endif + #if ! defined(CUTE_ARCH_LDSM_SM75_SUPPORTED) #define CUTE_ARCH_LDSM_SM75_SUPPORTED (CUTE_ARCH_NVCC_SUPPORTS_LDSM_SM75 || CUTE_ARCH_CLANG_SUPPORTS_LDSM_SM75) #endif diff --git a/include/cutlass/vector_types.h b/include/cutlass/vector_types.h index e34730db3b..3086fcea80 100644 --- a/include/cutlass/vector_types.h +++ b/include/cutlass/vector_types.h @@ -31,13 +31,17 @@ #pragma once #if defined(CUTLASS_ENABLE_SYCL) -#include +#include "cutlass/detail/helper_macros.hpp" // Add these definitions in the cutlass namespace, so they do not clash with the ones in cuda namespace cutlass { // We use this struct instead of sycl::int4 because the sycl version requires x() to access x, // while the struct does not need the (). This prevents us from having to modify the Cutlass // implementation in all the places where these vector types are used. + using int2 = struct alignas(8) { + int x, y; + }; + using int4 = struct alignas(16) { int x, y, z, w; }; @@ -85,6 +89,16 @@ namespace cutlass { using double4 = struct alignas(16) { long long int x, y, z, w; }; + + CUTLASS_HOST_DEVICE + int2 make_int2(int x, int y) { + return int2{x,y}; + } + + CUTLASS_HOST_DEVICE + int4 make_int4(int x, int y, int z, int w) { + return int4 {x,y,z,w}; + } } #else #include