diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 355aba91f..0c83ec158 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -95,6 +95,13 @@ declare_mlir_python_extension(GcPythonSources.CpuInfoExtension
     CPUInfo.cpp
 )
 
+declare_mlir_python_extension(GcPythonSources.ToolsExtension
+  MODULE_NAME _tools
+  ADD_TO_PARENT GcPythonSources
+  SOURCES
+    Tools.cpp
+)
+
 ################################################################################
 # Common CAPI
 ################################################################################
diff --git a/python/Tools.cpp b/python/Tools.cpp
new file mode 100644
index 000000000..4953779b3
--- /dev/null
+++ b/python/Tools.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (C) 2024 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "gc/Analysis/MatmulConfigAnalysis.h"
+#include "mlir/Bindings/Python/PybindAdaptors.h"
+
+PYBIND11_MODULE(_tools, m) {
+
+  m.def(
+      "validate_matmul_config",
+      [](const std::vector<uint32_t> &cfg_list, std::vector<uint32_t> &shape,
+         bool allow_indivisible_innerblock, bool is_vnni_mm2d) {
+        if (cfg_list.size() != 9) {
+          throw std::invalid_argument("cfg_list must have exactly 9 elements");
+        }
+        mlir::gc::MatmulConfig cfg{cfg_list[0], cfg_list[1], cfg_list[2],
+                                   cfg_list[3], cfg_list[4], cfg_list[5],
+                                   cfg_list[6], cfg_list[7], cfg_list[8]};
+        return mlir::gc::validateConfig(
+            cfg, shape, allow_indivisible_innerblock, is_vnni_mm2d);
+      },
+      py::arg("cfg_list"), py::arg("shape"),
+      py::arg("allow_indivisible_innerblock"), py::arg("is_vnni_mm2d"),
+      "Validate the matmul configuration");
+}
\ No newline at end of file
diff --git a/python/gc_mlir/tools/__init__.py b/python/gc_mlir/tools/__init__.py
index 172887970..ee2856a1a 100644
--- a/python/gc_mlir/tools/__init__.py
+++ b/python/gc_mlir/tools/__init__.py
@@ -5,3 +5,5 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
 # ===-----------------------------------------------------------------------===#
+
+from .._mlir_libs._tools import validate_matmul_config
\ No newline at end of file
diff --git a/scripts/correctness.sh b/scripts/correctness.sh
index 30998d481..d01bd0015 100755
--- a/scripts/correctness.sh
+++ b/scripts/correctness.sh
@@ -6,113 +6,113 @@ FAIL=0
 set -e
 
 # bf16
-python3 -m benchgc --verbose 0 --driver linalg --case matmul --md 0:32x128xbf16 --md 1:128x64xbf16 --md 2:32x64xbf16 --cast cast_signed || FAIL=1
+python3 -m benchgc --driver linalg --case matmul --md 0:32x128xbf16 --md 1:128x64xbf16 --md 2:32x64xbf16 --cast cast_signed || FAIL=1
 
 # f32
 # reduce
-python3 -m benchgc --verbose 0 --driver linalg --case reduce.add --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case reduce.mul --md 0:128x8xf32 --md 1:128xf32 --dimensions=1 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case reduce.max --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case reduce.min --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case reduce.l1 --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case reduce.l2_square --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1
+python3 -m benchgc --driver linalg --case reduce.add --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1
+python3 -m benchgc --driver linalg --case reduce.mul --md 0:128x8xf32 --md 1:128xf32 --dimensions=1 || FAIL=1
+python3 -m benchgc --driver linalg --case reduce.max --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1
+python3 -m benchgc --driver linalg --case reduce.min --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1
+python3 -m benchgc --driver linalg --case reduce.l1 --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1
+python3 -m benchgc --driver linalg --case reduce.l2_square --md 0:128x64x8xf32 --md 1:128xf32 --dimensions=1 --dimensions=2 || FAIL=1
 
 # misc
-python3 -m benchgc --verbose 0 --driver linalg --case fill --md 0:f32 --md 1:32x4096xf32 --cmp 1:P:0:0 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case copy --md 0:1024x1024xf32 --md 1:1024x1024xbf16 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case broadcast --md 0:1024xf32 --md 1:2x32x1024xf32 --dimensions=0 --dimensions=1 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case transpose --md 0:32x64x128xf32 --md 1:64x128x32xf32 --permutation=1 --permutation=2 --permutation=0 || FAIL=1
+python3 -m benchgc --driver linalg --case fill --md 0:f32 --md 1:32x4096xf32 --cmp 1:P:0:0 || FAIL=1
+python3 -m benchgc --driver linalg --case copy --md 0:1024x1024xf32 --md 1:1024x1024xbf16 || FAIL=1
+python3 -m benchgc --driver linalg --case broadcast --md 0:1024xf32 --md 1:2x32x1024xf32 --dimensions=0 --dimensions=1 || FAIL=1
+python3 -m benchgc --driver linalg --case transpose --md 0:32x64x128xf32 --md 1:64x128x32xf32 --permutation=1 --permutation=2 --permutation=0 || FAIL=1
 
 # matmul
-python3 -m benchgc --verbose 0 --driver linalg --case batch_matmul --md 0:16x512x64xf32 --md 1:16x64x32xf32 --md 2:16x512x32xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case batch_matmul_transpose_a --md 0:16x512x64xf32 --md 1:16x512x32xf32 --md 2:16x64x32xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case batch_matmul_transpose_b --md 0:16x512x64xf32 --md 1:16x128x64xf32 --md 2:16x512x128xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case batch_matvec --md 0:16x512x64xf32 --md 1:16x64xf32 --md 2:16x512xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case batch_mmt4d --md 0:4x4x8x4x2xf32 --md 1:4x8x8x4x2xf32 --md 2:4x4x8x4x4xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case batch_reduce_matmul --md 0:16x512x64xf32 --md 1:16x64x32xf32 --md 2:512x32xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case batch_vecmat --md 0:16x64xf32 --md 1:16x64x512xf32 --md 2:16x512xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case dot --md 0:4096xf32 --md 1:4096xf32 --md 2:0xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case matmul --md 0:1024x512xf32 --md 1:512x512xf32 --md 2:1024x512xf32 --cast cast_signed || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case matmul_transpose_a --md 0:1024x512xf32 --md 1:1024x512xf32 --md 2:512x512xf32 --cast cast_signed || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case matmul_transpose_b --md 0:1024x512xf32 --md 1:1024x512xf32 --md 2:1024x1024xf32 --cast cast_signed || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case matvec --md 0:512x64xf32 --md 1:64xf32 --md 2:512xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case mmt4d --md 0:4x8x4x2xf32 --md 1:8x8x4x2xf32 --md 2:4x8x4x4xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case vecmat --md 0:512xf32 --md 1:512x64xf32 --md 2:64xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case batch_matmul --md 0:16x512x64xf32 --md 1:16x64x32xf32 --md 2:16x512x32xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case batch_matmul_transpose_a --md 0:16x512x64xf32 --md 1:16x512x32xf32 --md 2:16x64x32xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case batch_matmul_transpose_b --md 0:16x512x64xf32 --md 1:16x128x64xf32 --md 2:16x512x128xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case batch_matvec --md 0:16x512x64xf32 --md 1:16x64xf32 --md 2:16x512xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case batch_mmt4d --md 0:4x4x8x4x2xf32 --md 1:4x8x8x4x2xf32 --md 2:4x4x8x4x4xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case batch_reduce_matmul --md 0:16x512x64xf32 --md 1:16x64x32xf32 --md 2:512x32xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case batch_vecmat --md 0:16x64xf32 --md 1:16x64x512xf32 --md 2:16x512xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case dot --md 0:4096xf32 --md 1:4096xf32 --md 2:0xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case matmul --md 0:1024x512xf32 --md 1:512x512xf32 --md 2:1024x512xf32 --cast cast_signed || FAIL=1
+python3 -m benchgc --driver linalg --case matmul_transpose_a --md 0:1024x512xf32 --md 1:1024x512xf32 --md 2:512x512xf32 --cast cast_signed || FAIL=1
+python3 -m benchgc --driver linalg --case matmul_transpose_b --md 0:1024x512xf32 --md 1:1024x512xf32 --md 2:1024x1024xf32 --cast cast_signed || FAIL=1
+python3 -m benchgc --driver linalg --case matvec --md 0:512x64xf32 --md 1:64xf32 --md 2:512xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case mmt4d --md 0:4x8x4x2xf32 --md 1:8x8x4x2xf32 --md 2:4x8x4x4xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case vecmat --md 0:512xf32 --md 1:512x64xf32 --md 2:64xf32 || FAIL=1
 
 # binary
-python3 -m benchgc --verbose 0 --driver linalg --case add --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case sub --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case mul --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case div --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case max --md 0:1024x1024xf32 --md 1:1024x1024xf32 --md 2:1024x1024xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case min --md 0:1024x1024xf32 --md 1:1024x1024xf32 --md 2:1024x1024xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case add --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case sub --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case mul --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case div --md 0:1x32x4096xf32 --md 1:1x32x4096xf32 --md 2:1x32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case max --md 0:1024x1024xf32 --md 1:1024x1024xf32 --md 2:1024x1024xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case min --md 0:1024x1024xf32 --md 1:1024x1024xf32 --md 2:1024x1024xf32 || FAIL=1
 
 # element wise
-python3 -m benchgc --verbose 0 --driver linalg --case abs --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case ceil --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case erf --md 0:1024x512xf32 --md 1:1024x512xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case floor --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case log --md 0:4096x32xf32 --md 1:4096x32xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case negf --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case exp --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case round --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
-# python3 -m benchgc --verbose 0 --driver linalg --case rsqrt --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case sqrt --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case square --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case tanh --md 0:128x128xf32 --md 1:128x128xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case abs --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case ceil --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case erf --md 0:1024x512xf32 --md 1:1024x512xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case floor --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case log --md 0:4096x32xf32 --md 1:4096x32xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case negf --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case exp --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case round --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
+# python3 -m benchgc --driver linalg --case rsqrt --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case sqrt --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case square --md 0:32x4096xf32 --md 1:32x4096xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case tanh --md 0:128x128xf32 --md 1:128x128xf32 || FAIL=1
 
 # conv
-python3 -m benchgc --verbose 0 --driver linalg --case conv_1d_ncw_fcw --md 0:4x4x32xf32 --md 1:8x4x4xf32 --md 2:4x8x13xf32 --strides 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_1d_nwc_wcf --md 0:4x32x4xf32 --md 1:4x4x8xf32 --md 2:4x13x8xf32 --strides 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_1d --md 0:32xf32 --md 1:4xf32 --md 2:29xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_2d_nchw_fchw --md 0:4x4x32x32xf32 --md 1:8x4x4x4xf32 --md 2:4x8x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_2d_ngchw_fgchw --md 0:4x2x2x32x32xf32 --md 1:4x2x2x4x4xf32 --md 2:4x2x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_2d_ngchw_gfchw --md 0:4x2x2x32x32xf32 --md 1:2x4x2x4x4xf32 --md 2:4x2x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_2d_nhwc_fhwc --md 0:4x32x32x4xf32 --md 1:8x4x4x4xf32 --md 2:4x13x13x8xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_2d_nhwc_hwcf --md 0:4x32x32x4xf32 --md 1:4x4x4x8xf32 --md 2:4x13x13x8xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_2d --md 0:32x32xf32 --md 1:4x4xf32 --md 2:29x29xf32 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_3d_ncdhw_fcdhw --md 0:4x4x32x32x32xf32 --md 1:8x4x4x4x4xf32 --md 2:4x8x13x13x13xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_3d_ndhwc_dhwcf --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4x8xf32 --md 2:4x13x13x13x8xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case conv_3d --md 0:32x32x32xf32 --md 1:4x4x4xf32 --md 2:29x29x29xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_1d_ncw_fcw --md 0:4x4x32xf32 --md 1:8x4x4xf32 --md 2:4x8x13xf32 --strides 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_1d_nwc_wcf --md 0:4x32x4xf32 --md 1:4x4x8xf32 --md 2:4x13x8xf32 --strides 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_1d --md 0:32xf32 --md 1:4xf32 --md 2:29xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_2d_nchw_fchw --md 0:4x4x32x32xf32 --md 1:8x4x4x4xf32 --md 2:4x8x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_2d_ngchw_fgchw --md 0:4x2x2x32x32xf32 --md 1:4x2x2x4x4xf32 --md 2:4x2x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_2d_ngchw_gfchw --md 0:4x2x2x32x32xf32 --md 1:2x4x2x4x4xf32 --md 2:4x2x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_2d_nhwc_fhwc --md 0:4x32x32x4xf32 --md 1:8x4x4x4xf32 --md 2:4x13x13x8xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_2d_nhwc_hwcf --md 0:4x32x32x4xf32 --md 1:4x4x4x8xf32 --md 2:4x13x13x8xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_2d --md 0:32x32xf32 --md 1:4x4xf32 --md 2:29x29xf32 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_3d_ncdhw_fcdhw --md 0:4x4x32x32x32xf32 --md 1:8x4x4x4x4xf32 --md 2:4x8x13x13x13xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_3d_ndhwc_dhwcf --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4x8xf32 --md 2:4x13x13x13x8xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case conv_3d --md 0:32x32x32xf32 --md 1:4x4x4xf32 --md 2:29x29x29xf32 || FAIL=1
 
 # depthwise conv
-python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_1d_ncw_cw --md 0:4x4x32xf32 --md 1:4x4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_1d_nwc_wc --md 0:4x32x4xf32 --md 1:4x4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_1d_nwc_wcm --md 0:4x32x4xf32 --md 1:4x4x3xf32 --md 2:4x13x4x3xf32 --strides 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_2d_nchw_chw --md 0:4x4x32x32xf32 --md 1:4x4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_2d_nhwc_hwc --md 0:4x32x32x4xf32 --md 1:4x4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_2d_nhwc_hwcm --md 0:4x32x32x4xf32 --md 1:4x4x4x3xf32 --md 2:4x13x13x4x3xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_3d_ncdhw_cdhw --md 0:4x4x32x32x32xf32 --md 1:4x4x4x4xf32 --md 2:4x4x13x13x13xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_3d_ndhwc_dhwc --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case depthwise_conv_3d_ndhwc_dhwcm --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4x3xf32 --md 2:4x13x13x13x4x3xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case depthwise_conv_1d_ncw_cw --md 0:4x4x32xf32 --md 1:4x4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case depthwise_conv_1d_nwc_wc --md 0:4x32x4xf32 --md 1:4x4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case depthwise_conv_1d_nwc_wcm --md 0:4x32x4xf32 --md 1:4x4x3xf32 --md 2:4x13x4x3xf32 --strides 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case depthwise_conv_2d_nchw_chw --md 0:4x4x32x32xf32 --md 1:4x4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case depthwise_conv_2d_nhwc_hwc --md 0:4x32x32x4xf32 --md 1:4x4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case depthwise_conv_2d_nhwc_hwcm --md 0:4x32x32x4xf32 --md 1:4x4x4x3xf32 --md 2:4x13x13x4x3xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case depthwise_conv_3d_ncdhw_cdhw --md 0:4x4x32x32x32xf32 --md 1:4x4x4x4xf32 --md 2:4x4x13x13x13xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case depthwise_conv_3d_ndhwc_dhwc --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case depthwise_conv_3d_ndhwc_dhwcm --md 0:4x32x32x32x4xf32 --md 1:4x4x4x4x3xf32 --md 2:4x13x13x13x4x3xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
 
 # pool
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_nchw_max --md 0:4x4x32x32xf32 --md 1:4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_nchw_sum --md 0:4x4x32x32xf32 --md 1:4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_ncw_max --md 0:4x4x32xf32 --md 1:4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_ncw_sum --md 0:4x4x32xf32 --md 1:4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_ndhwc_max --md 0:4x32x32x32x4xf32 --md 1:4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_ndhwc_sum --md 0:4x32x32x32x4xf32 --md 1:4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_nhwc_max --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_nhwc_sum --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_nhwc_min --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_nwc_max --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_nwc_sum --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1
-python3 -m benchgc --verbose 0 --driver linalg --case pooling_nwc_min --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_nchw_max --md 0:4x4x32x32xf32 --md 1:4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_nchw_sum --md 0:4x4x32x32xf32 --md 1:4x4xf32 --md 2:4x4x13x13xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_ncw_max --md 0:4x4x32xf32 --md 1:4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_ncw_sum --md 0:4x4x32xf32 --md 1:4xf32 --md 2:4x4x13xf32 --strides 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_ndhwc_max --md 0:4x32x32x32x4xf32 --md 1:4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_ndhwc_sum --md 0:4x32x32x32x4xf32 --md 1:4x4x4xf32 --md 2:4x13x13x13x4xf32 --strides 2 --strides 2 --strides 2 --dilations 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_nhwc_max --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_nhwc_sum --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_nhwc_min --md 0:4x32x32x4xf32 --md 1:4x4xf32 --md 2:4x13x13x4xf32 --strides 2 --strides 2 --dilations 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_nwc_max --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_nwc_sum --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1
+python3 -m benchgc --driver linalg --case pooling_nwc_min --md 0:4x32x4xf32 --md 1:4xf32 --md 2:4x13x4xf32 --strides 2 --dilations 2 || FAIL=1
 
 # generic
-python3 -m benchgc --verbose 0 --driver mlir --case ${CASE_DIR}/generic.mlir || FAIL=1
+python3 -m benchgc --driver mlir --case ${CASE_DIR}/generic.mlir || FAIL=1
 
 # softmax
-# python3 -m benchgc --verbose 0 --driver linalg --case softmax --md 0:32x4096xf32 --md 1:32x4096xf32 --dimension 1 || FAIL=1
+# python3 -m benchgc --driver linalg --case softmax --md 0:32x4096xf32 --md 1:32x4096xf32 --dimension 1 || FAIL=1
 
 # mlir
-# python3 -m benchgc --verbose 0 --driver mlir --case ${CASE_DIR}/llama2.mlir || FAIL=1
+# python3 -m benchgc --driver mlir --case ${CASE_DIR}/llama2.mlir || FAIL=1
 
 #mlp
-python3 -m benchgc --verbose 1 --driver pattern --case mlp --batch_size=32 --hidden_size_list=32x16x64 --has_bias=1x1 --act_type=noop --dtype=f32
+python3 -m benchgc --driver pattern --case mlp --batch_size=32 --hidden_size_list=32x16x64 --has_bias=1x1 --act_type=noop --dtype=f32
 
 set +e
 exit $FAIL
\ No newline at end of file
diff --git a/test/benchgc/CMakeLists.txt b/test/benchgc/CMakeLists.txt
index ff00d27b7..356cd449f 100644
--- a/test/benchgc/CMakeLists.txt
+++ b/test/benchgc/CMakeLists.txt
@@ -41,3 +41,4 @@ add_subdirectory("src/benchgc/tensor")
 add_subdirectory("src/benchgc/arith")
 add_subdirectory("src/benchgc/pattern")
 add_subdirectory("src/benchgc/math")
+add_subdirectory("src/benchgc/tuner")
diff --git a/test/benchgc/README.md b/test/benchgc/README.md
index 239105c82..51e626739 100644
--- a/test/benchgc/README.md
+++ b/test/benchgc/README.md
@@ -44,6 +44,7 @@ python -m benchgc [OPTIONS] --mode [MODE] --driver [DRIVER] --case [CASE]
 ### --mode [str]
 * C : correctness testing (by default)
 * P : performance testing
+* T : performance tuning, see tuner [`README.md`](src/benchgc/tuner/README.md)
 
 ### --driver [str]
 * linalg: test the single op in linalg dialect
@@ -138,12 +139,15 @@ module {
 ### --bench_kind [str]
 * py : use the MLIR Python API to invoke the kernel and use Python to calculate the time cost
 * wrapper : modify MLIR by wrapping the kernel into a new method and calling the `nanoTime()` method before and after calling the kernel. Finally, calculate the difference as the time cost
+* default: `py`
 
 ### --warm_up [int]
 * warm-up times of the execution
+* default: 100
 
 ### --repeat [int]
 * repeat times of the execution
+* default: 100
 
 ## Pattern Options
 Each pattern has its own unique options.
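For illustration, the bench options documented above compose into a performance-mode invocation like the following (an assumed example that reuses a matmul case from `scripts/correctness.sh`, with the documented default values spelled out explicitly):

```
python3 -m benchgc --mode P --driver linalg --case matmul --md 0:1024x512xf32 --md 1:512x512xf32 --md 2:1024x512xf32 --cast cast_signed --bench_kind py --warm_up 100 --repeat 100
```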
diff --git a/test/benchgc/src/benchgc/__main__.py b/test/benchgc/src/benchgc/__main__.py
index 379078a9d..eaeeaf58b 100644
--- a/test/benchgc/src/benchgc/__main__.py
+++ b/test/benchgc/src/benchgc/__main__.py
@@ -31,9 +31,15 @@
     set_default_fill,
 )
 from benchgc.arg.arg import Arg
-from benchgc.bench import mlir_wrapper_bench, py_timeit_bench
+from benchgc.bench import (
+    batch_mlir_wrapper_bench,
+    batch_py_timeit_bench,
+    mlir_wrapper_bench,
+    py_timeit_bench,
+)
 from benchgc.mlir.arg import get_mlir_args
 from benchgc.pattern import get_pattern_clz
+from benchgc.tuner.tuner import GATuner, GridTuner, Tuner, TuningSpace
 from gc_mlir import ir
 from gc_mlir.graph_compiler import GraphCompiler
@@ -44,7 +50,7 @@ def add_common_options(parser: argparse.ArgumentParser):
         "--mode",
         required=False,
         help="specify the test mode, C for correctness testing, P for performance testing",
-        choices=["C", "P"],
+        choices=["C", "P", "T"],
         default="C",
         type=str,
     )
@@ -198,13 +204,20 @@ def add_common_options(parser: argparse.ArgumentParser):
 
 def add_bench_options(parser: argparse.ArgumentParser):
     """add options for bench mode"""
-    if parser.parse_known_args()[0].mode == "P":
+    if parser.parse_known_args()[0].mode in ("P", "T"):
         parser.add_argument(
             "--bench_kind", type=str, choices=["py", "wrapper"], default="py"
         )
-        parser.add_argument("--warm_up", type=int, default=100)
-        parser.add_argument("--repeat", type=int, default=100)
-
+        parser.add_argument(
+            "--warm_up",
+            type=int,
+            default=100 if parser.parse_known_args()[0].mode == "P" else 2,
+        )
+        parser.add_argument(
+            "--repeat",
+            type=int,
+            default=100 if parser.parse_known_args()[0].mode == "P" else 4,
+        )
 
 
 def add_pattern_options(parser: argparse.ArgumentParser):
@@ -213,6 +226,45 @@ def add_pattern_options(parser: argparse.ArgumentParser):
         pattern_name = parser.parse_known_args()[0].case
         get_pattern_clz(pattern_name).add_args(parser)
 
+
+def add_tuner_options(parser: argparse.ArgumentParser):
+    """add options for the mode T"""
+    if parser.parse_known_args()[0].mode == "T":
+        parser.add_argument(
+            "--search_alg", type=str, choices=["grid", "ga"], default="grid"
+        )
+        parser.add_argument(
+            "--tuning_batch", type=int, default=Tuner.DEFAULT_BATCH_SIZE
+        )
+        parser.add_argument("--early_stop", type=int, default=Tuner.DEFAULT_EARLY_STOP)
+        parser.add_argument(
+            "--max_tuning_iters", type=int, default=Tuner.DEFAULT_MAX_ITERS
+        )
+        parser.add_argument("--timeout", type=int, default=Tuner.DEFAULT_TIMEOUT)
+        parser.add_argument(
+            "--space_percent", type=float, default=TuningSpace.DEFAULT_SPACE_PERCENT
+        )
+        parser.add_argument(
+            "--tuner_verbose",
+            action="store_true",
+            help="print the tuner log",
+        )
+        parser.add_argument("--checkpoint_path", type=str, default="")
+
+        if parser.parse_known_args()[0].search_alg == "ga":
+            parser.add_argument(
+                "--ga_random_seed", type=int, default=GATuner.DEFAULT_RANDOM_SEED
+            )
+            parser.add_argument(
+                "--ga_elite_num", type=int, default=GATuner.DEFAULT_ELITE_NUM
+            )
+            parser.add_argument(
+                "--ga_mutation_prob", type=float, default=GATuner.DEFAULT_MUTATION_PROB
+            )
+            parser.add_argument(
+                "--ga_expected_tune_num",
+                type=int,
+                default=GATuner.DEFAULT_EXPECTED_TUNE_NUM,
+            )
+
 
 def get_module_and_args(flags: argparse.Namespace):
     args: List[Arg] = []
@@ -391,11 +443,71 @@ def performance_testing(flags: argparse.Namespace, module: ir.Module, args: List
     print(json_res)
 
 
+def performance_tuning(flags: argparse.Namespace, module: ir.Module, args: List[Arg]):
+    gc_args: List[torch.Tensor | int] = []
+    gc_tensors: Dict[str, torch.Tensor] = {}
+    for i in range(len(args)):
+        tensor = fill_tensor(flags, args[i], i)
+        gc_tensors["%arg" + str(i)] = tensor
+        if args[i].scalar:
+            gc_args.append(tensor.data_ptr())
+        else:
+            gc_args.append(tensor)
+
+    mlir_args = get_mlir_args(gc_args)
+    with module.context as ctx, ir.Location.unknown():
+        if flags.ir_printing:
+            ctx.enable_multithreading(False)
+        batch_bench = (
+            batch_py_timeit_bench
+            if flags.bench_kind == "py"
+            else batch_mlir_wrapper_bench
+        )
+
+        def tuner_batch_bench(ir_modules):
+            return batch_bench(
+                ir_modules,
+                flags.entry,
+                "any(gc-cpu-pipeline)",
+                mlir_args,
+                flags.ir_printing,
+                flags.repeat,
+                flags.warm_up,
+            )
+
+        assert flags.space_percent > 0 and flags.space_percent <= 1.0
+        space = TuningSpace(module, flags.space_percent)
+        if flags.search_alg == "grid":
+            tuner = GridTuner(
+                tuner_batch_bench,
+                space,
+                flags.tuning_batch,
+                flags.early_stop,
+                flags.checkpoint_path,
+                flags.tuner_verbose,
+            )
+        else:
+            tuner = GATuner(
+                tuner_batch_bench,
+                space,
+                flags.tuning_batch,
+                flags.early_stop,
+                flags.checkpoint_path,
+                flags.tuner_verbose,
+                flags.ga_elite_num,
+                flags.ga_mutation_prob,
+                random_seed=flags.ga_random_seed,
+                expected_tune_num=flags.ga_expected_tune_num,
+            )
+        tuner.run(flags.max_tuning_iters, flags.timeout)
+
+
 if __name__ == "__main__":
     arg_parser = argparse.ArgumentParser(prog="benchmark tool for graph compiler")
     add_common_options(arg_parser)
     add_bench_options(arg_parser)
     add_pattern_options(arg_parser)
+    add_tuner_options(arg_parser)
     flags = arg_parser.parse_args()
     benchgc.util.set_seed(flags.seed)
     ir_module, module_args = get_module_and_args(flags)
@@ -403,5 +515,7 @@ def performance_testing(flags: argparse.Namespace, module: ir.Module, args: List
         correctness_testing(flags, ir_module, module_args)
     elif flags.mode == "P":
         performance_testing(flags, ir_module, module_args)
+    elif flags.mode == "T":
+        performance_tuning(flags, ir_module, module_args)
     else:
         pass
diff --git a/test/benchgc/src/benchgc/bench.py b/test/benchgc/src/benchgc/bench.py
index 0c8763191..1dd41a142 100644
--- a/test/benchgc/src/benchgc/bench.py
+++ b/test/benchgc/src/benchgc/bench.py
@@ -117,21 +117,22 @@ def batch_py_timeit_bench(
     ir_modules: List[ir.Module],
     entry_name: str,
     pipeline: str,
-    mlir_args: list,
+    mlir_args: List[Any],
     ir_printing=False,
     repeat_time=5,
     warm_up=2,
 ) -> List[Tuple[float, float]]:
     """benchmark a batch of mlir with python timeit."""
     compiler = GraphCompiler(pipeline)
+    engines = []
     funcs = []
     compile_costs = []
     for m in ir_modules:
         compile_begin = timeit.default_timer()
         engine = compiler.compile_and_jit(m, ir_printing=ir_printing)
+        engines.append(engine)
         compile_cost = (timeit.default_timer() - compile_begin) * 1000
         compile_costs.append(compile_cost)
-        funcs.append(engine.lookup(entry_name))
 
     # Copied from execution_engine.py so that the cost of cast does not affect perf result.
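     # NOTE: the `engines` list also keeps every ExecutionEngine alive; the entry
     # functions looked up from an engine below are only safe to call while the
     # engine object itself still exists, hence the lookup-per-use change above.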
     packed_args = (ctypes.c_void_p * len(mlir_args))()
@@ -141,11 +142,11 @@ def batch_py_timeit_bench(
     def run_bench(func, arg):
         func(arg)
 
-    for func in funcs:
+    for func in [engine.lookup(entry_name) for engine in engines]:
         timeit.timeit(lambda: run_bench(func, packed_args), number=warm_up)
 
     execute_costs = []
-    for func in funcs:
+    for func in [engine.lookup(entry_name) for engine in engines]:
         total_time = timeit.timeit(
             lambda: run_bench(func, packed_args), number=repeat_time
         )
@@ -158,7 +159,7 @@ def batch_mlir_wrapper_bench(
     ir_modules: ir.Module,
     entry_name: str,
     pipeline: str,
-    mlir_args: list,
+    mlir_args: List[Any],
     ir_printing=False,
     repeat_time=5,
     warm_up=2,
diff --git a/test/benchgc/src/benchgc/tuner/CMakeLists.txt b/test/benchgc/src/benchgc/tuner/CMakeLists.txt
new file mode 100644
index 000000000..506e36153
--- /dev/null
+++ b/test/benchgc/src/benchgc/tuner/CMakeLists.txt
@@ -0,0 +1,22 @@
+################################################################################
+# Copyright (C) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+################################################################################
+
+
+file(GLOB PYTHON_SCRIPTS "*.py")
+foreach(PY_SCRIPT ${PYTHON_SCRIPTS})
+  configure_file(${PY_SCRIPT} ${CMAKE_BINARY_DIR}/test/benchgc/src/benchgc/tuner/ COPYONLY)
+endforeach()
diff --git a/test/benchgc/src/benchgc/tuner/README.md b/test/benchgc/src/benchgc/tuner/README.md
new file mode 100644
index 000000000..7baf5dd9b
--- /dev/null
+++ b/test/benchgc/src/benchgc/tuner/README.md
@@ -0,0 +1,175 @@
+# Tuner - auto tuning tools
+## Description
+The tuner is a tool used to select the best-performing configuration for a graph with tunable operations. Tunable operations are operations, such as matmul and conv, whose kernel performance depends on certain configurations; the tuner generates different configuration combinations for a graph and records their performance.
+
+## Prerequisite
+`mode T` for benchgc
+
+## Options
+Since benchmarking is also required within the tuner, the tuner supports the benchmarking options as well.
+Unlike bench mode, tuner mode generates a batch of modules at a time, and the default values for warm-up and repeat have been adjusted accordingly.
+* --bench_kind [py, wrapper]
+* --warm_up [int], default: 2
+* --repeat [int], default: 4
+
+### --tuning_batch [int]
+* The batch size of configs, default: `50`
+* The tuner first generates a batch of configurations and then performs performance testing on these configs.
+
+### --early_stop [int]
+* If the tuner does not find a better performance after testing the number of configurations specified by the `early_stop` value, it will terminate its execution.
+* default: `-1`, which means early stopping is disabled.
+
+### --max_tuning_iters [int]
+* The maximum number of configurations the tuner needs to attempt.
+* default: `sys.maxsize`
+
+### --timeout [int]
+* The maximum runtime limit for the tuner, unit: second
+* default: `-1`, which means there is no limit.
+
+### --space_percent [float]
+* We call the set of all possible configurations for a graph the tuning space. The value of `space_percent` is the percentage of the tuning space that the tuner will explore.
+* value range `(0, 1]`, default: 1.0, which means 100 percent of the tuning space
+
+### --checkpoint_path [str]
+* When the checkpoint file exists, the tuner will first load its contents to restore the previous state on startup, and it will update the checkpoint file after executing each batch.
+
+### --search_alg [str]
+* The tuner provides two algorithms to search for new configurations.
+* grid: grid search, which is an exhaustive search
+* ga: genetic algorithm
+* default: `grid`
+
+### Options when `--search_alg ga`
+* --ga_random_seed [int]: random seed in the genetic algorithm, default: 0
+* --ga_elite_num [int]: default: 9
+* --ga_mutation_prob [float]: default: 0.1
+* --ga_expected_tune_num [int]: default: 0. The genetic-algorithm tuner needs a data structure to decide whether a new config duplicates a previous one. By default, when this option is not set, a hash set is used for this purpose; if the user sets this value, a bloom filter is used instead.
+
+## OP config
+If you need to adjust the candidates in the config of tunable operations, manually modify `op_config.py`. For example, you can reduce the tuning space by shrinking the candidate lists.
+
+## Skip the tuner for the specified OP
+
+If you need to skip the tuner for certain operations, you can add the following attribute to them in MLIR.
+Then you can proceed with tuning by using the `--driver=mlir` option
+```
+linalg.matmul {skipTuner = true} ins(..) outs(...) ...
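+// ops without the `skipTuner` attribute are still tuned as usual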
+``` + +## Example +* General cmd +``` +OMP_NUM_THREADS=1 python3 -m benchgc --mode T --driver linalg --case matmul --md 0:128x128xf32 --md 1:128x128xf32 --md 2:128x128xf32 --bench_kind wrapper --wram_up 2 --repeat 2 --search_alg grid --tunning_batch 100 --early_stop 1000 --max_tuning_iters 1000000 --timeout 1000000 --space_percent 0.8 --checkpoint_path {path_to_checkpoint_file} +``` + +* single matmul +``` +OMP_NUM_THREADS=1 python3 -m benchgc --mode T --driver linalg --case matmul --md 0:128x128xf32 --md 1:128x128xf32 --md 2:128x128xf32 + +[ 50 / 512 ] skipped: 79 best: 0.025305896997451782 ms +[ 100 / 512 ] skipped: 105 best: 0.025296583771705627 ms +[ 150 / 512 ] skipped: 115 best: 0.025296583771705627 ms +[ 200 / 512 ] skipped: 135 best: 0.025292858481407166 ms +[ 250 / 512 ] skipped: 147 best: 0.025292858481407166 ms +[ 300 / 512 ] skipped: 165 best: 0.025292858481407166 ms +[ 343 / 512 ] skipped: 169 best: 0.025292858481407166 ms +Tuner returns empty batch, early stop now +Tuning ends in 26.26677966117859 s +Best cost: 0.025292858481407166 ms +Best config: [{ + "MatMulConfig": { + "MThreads": 1, + "KThreads": 1, + "NThreads": 1, + "MBlock": 128, + "KBlock": 64, + "NBlock": 16, + "innerMostMBlock": 32, + "innerMostKBlock": 16, + "innerMostNBlock": 16 + } +}] +mlir: + module attributes {dlti.target_system_spec = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : ui32>, #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : ui64>, #dlti.dl_entry<"L3_cache_size_in_bytes", 110100480 : ui64>, #dlti.dl_entry<"num_threads", 1 : i32>, #dlti.dl_entry<"max_vector_width", 512 : i64>>>} { + func.func @entry(%arg0: tensor<128x128xf32>, %arg1: tensor<128x128xf32>) -> tensor<128x128xf32> attributes {llvm.emit_c_interface} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<128x128xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<128x128xf32>) -> tensor<128x128xf32> + %2 = linalg.matmul {KBlock = 64 : i32, KThreads = 1 : i32, MBlock = 128 : i32, MThreads = 1 : i32, NBlock = 16 : i32, NThreads = 1 : i32, cast = #linalg.type_fn, innerMostKBlock = 16 : i32, innerMostMBlock = 32 : i32, innerMostNBlock = 16 : i32} ins(%arg0, %arg1 : tensor<128x128xf32>, tensor<128x128xf32>) outs(%1 : tensor<128x128xf32>) -> tensor<128x128xf32> + return %2 : tensor<128x128xf32> + } +} +``` + +* mlp + +``` +OMP_NUM_THREADS=1 python -m benchgc --mode T --driver pattern --case mlp --batch_size=32 --hidden_size_list=16x32x64 --has_bias=1x1 --act_type=relu --warm_up 2 --repeat 2 +[ 50 / 1536 ] skipped: 352 best: 0.0069122761487960815 ms +[ 100 / 1536 ] skipped: 415 best: 0.006860122084617615 ms +[ 150 / 1536 ] skipped: 662 best: 0.006856396794319153 ms +[ 200 / 1536 ] skipped: 821 best: 0.006856396794319153 ms +[ 250 / 1536 ] skipped: 972 best: 0.006856396794319153 ms +[ 300 / 1536 ] skipped: 1029 best: 0.006856396794319153 ms +[ 350 / 1536 ] skipped: 1080 best: 0.006834045052528381 ms +[ 400 / 1536 ] skipped: 1131 best: 0.006834045052528381 ms +[ 405 / 1536 ] skipped: 1131 best: 0.006834045052528381 ms +Tuner returns empty batch, early stop now +Tuning ends in 80.10290145874023 s +Best cost: 0.006632879376411438 ms +Best config: [{ + "MatMulConfig": { + "MThreads": 1, + "KThreads": 1, + "NThreads": 1, + "MBlock": 32, + "KBlock": 16, + "NBlock": 32, + "innerMostMBlock": 32, + "innerMostKBlock": 16, + "innerMostNBlock": 16 + } +}, { + "MatMulConfig": { + "MThreads": 1, + "KThreads": 1, + "NThreads": 1, + "MBlock": 32, + "KBlock": 32, + "NBlock": 
16, + "innerMostMBlock": 16, + "innerMostKBlock": 32, + "innerMostNBlock": 16 + } +}] +mlir: + module attributes {dlti.target_system_spec = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : ui32>, #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : ui64>, #dlti.dl_entry<"L3_cache_size_in_bytes", 110100480 : ui64>, #dlti.dl_entry<"num_threads", 1 : i32>, #dlti.dl_entry<"max_vector_width", 512 : i64>>>} { + func.func @entry(%arg0: tensor<32x16xf32>, %arg1: tensor<16x32xf32>, %arg2: tensor<32x64xf32>, %arg3: tensor<32xf32>, %arg4: tensor<64xf32>) -> tensor<32x64xf32> attributes {llvm.emit_c_interface} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<32x32xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<32x32xf32>) -> tensor<32x32xf32> + %2 = linalg.matmul {KBlock = 16 : i32, KThreads = 1 : i32, MBlock = 32 : i32, MThreads = 1 : i32, NBlock = 32 : i32, NThreads = 1 : i32, cast = #linalg.type_fn, innerMostKBlock = 16 : i32, innerMostMBlock = 32 : i32, innerMostNBlock = 16 : i32} ins(%arg0, %arg1 : tensor<32x16xf32>, tensor<16x32xf32>) outs(%1 : tensor<32x32xf32>) -> tensor<32x32xf32> + %3 = tensor.empty() : tensor<32x32xf32> + %broadcasted = linalg.broadcast ins(%arg3 : tensor<32xf32>) outs(%3 : tensor<32x32xf32>) dimensions = [0] + %4 = tensor.empty() : tensor<32x32xf32> + %5 = linalg.add ins(%2, %broadcasted : tensor<32x32xf32>, tensor<32x32xf32>) outs(%4 : tensor<32x32xf32>) -> tensor<32x32xf32> + %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf32> + %6 = tensor.empty() : tensor<32x32xf32> + %7 = linalg.max ins(%5, %cst_0 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%6 : tensor<32x32xf32>) -> tensor<32x32xf32> + %8 = tensor.empty() : tensor<32x64xf32> + %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<32x64xf32>) -> tensor<32x64xf32> + %10 = linalg.matmul {KBlock = 32 : i32, KThreads = 1 : i32, MBlock = 32 : i32, MThreads = 1 : i32, NBlock = 16 : i32, NThreads = 1 : i32, cast = #linalg.type_fn, innerMostKBlock = 32 : i32, innerMostMBlock = 16 : i32, innerMostNBlock = 16 : i32} ins(%7, %arg2 : tensor<32x32xf32>, tensor<32x64xf32>) outs(%9 : tensor<32x64xf32>) -> tensor<32x64xf32> + %11 = tensor.empty() : tensor<32x64xf32> + %broadcasted_1 = linalg.broadcast ins(%arg4 : tensor<64xf32>) outs(%11 : tensor<32x64xf32>) dimensions = [0] + %12 = tensor.empty() : tensor<32x64xf32> + %13 = linalg.add ins(%10, %broadcasted_1 : tensor<32x64xf32>, tensor<32x64xf32>) outs(%12 : tensor<32x64xf32>) -> tensor<32x64xf32> + %cst_2 = arith.constant dense<0.000000e+00> : tensor<32x64xf32> + %14 = tensor.empty() : tensor<32x64xf32> + %15 = linalg.max ins(%13, %cst_2 : tensor<32x64xf32>, tensor<32x64xf32>) outs(%14 : tensor<32x64xf32>) -> tensor<32x64xf32> + return %15 : tensor<32x64xf32> + } +} +``` + diff --git a/test/benchgc/src/benchgc/tuner/__init__.py b/test/benchgc/src/benchgc/tuner/__init__.py new file mode 100644 index 000000000..4d3e897ce --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/__init__.py @@ -0,0 +1,15 @@ +################################################################################ +# Copyright 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ diff --git a/test/benchgc/src/benchgc/tuner/config_filter.py b/test/benchgc/src/benchgc/tuner/config_filter.py new file mode 100644 index 000000000..1dff74e27 --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/config_filter.py @@ -0,0 +1,98 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +################################################################################ + +import math +from abc import ABC, abstractmethod +from typing import List + + +class ConfigFilter(ABC): + """ + A class used to help filter out unseen configs. + """ + + @abstractmethod + def already_met(self, v: List[int]) -> bool: + """Check if the config has been met before.""" + + @abstractmethod + def add(self, v: List[int]): + """Add the config to the filter.""" + + @abstractmethod + def save(self): + """Save the satus of the filter.""" + + @abstractmethod + def load(self, data): + """Load the status of the filter.""" + + +class BloomFilter(ConfigFilter): + """Bloom Filter""" + + def __init__(self, num_samples: int, err_rate: float): + self.num_bits = int(-(num_samples * math.log(err_rate)) / (math.log(2) ** 2)) + self.num_hashes = int((self.num_bits / num_samples) * math.log(2)) + self.bit_array = [0] * self.num_bits + + def already_met(self, v): + for i in range(int(self.num_hashes)): + try: + import mmh3 + except ImportError: + raise ImportError("Please install mmh3 package") + hash_v = mmh3.hash(v, i) % self.num_bits + if self.bit_array[hash_v] == 0: + return False + return True + + def add(self, v): + for i in range(int(self.num_hashes)): + try: + import mmh3 + except ImportError: + raise ImportError("Please install mmh3 package") + hash_v = mmh3.hash(v, i) % self.num_bits + self.bit_array[hash_v] = 1 + + def save(self): + return self.bit_array + + def load(self, data): + self.bit_array = data + + +class HashSetFilter(ConfigFilter): + """Fliter based on HashSet""" + + def __init__(self): + self.data = set() + + def add(self, v): + self.data.add(tuple(v)) + + def already_met(self, v: List[int]) -> bool: + return tuple(v) in self.data + + def save(self): + return self.data + + def load(self, data): + self.data.clear() + for item in data: + self.add(item) diff --git a/test/benchgc/src/benchgc/tuner/op_config.py b/test/benchgc/src/benchgc/tuner/op_config.py new file mode 100644 index 000000000..49b925b9c --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/op_config.py @@ 
-0,0 +1,212 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +################################################################################ + +import json +import math +import os + +from gc_mlir.extras import types as T +from gc_mlir.ir import IntegerAttr, OpView +from gc_mlir.tools import validate_matmul_config + + +class Config: + def __init__(self): + self.field_candidates = {} + self.field_constraints = {} + self.init_candidates() + self.init_constraints() + + def init_candidates(self): + pass + + def init_constraints(self): + pass + + def attach_to_ir(self, op: OpView): + pass + + def verify(self) -> bool: + pass + + +def find_factors(num): + factors = set() + for i in range(1, int(math.sqrt(num)) + 1): + if num % i == 0: + factors.add(i) + factors.add(num // i) + return sorted(factors) + + +class MatMulConfig(Config): + def __init__( + self, + op: OpView, + MThreads: int = 1, + KThreads: int = 1, + NThreads: int = 1, + MBlock: int = 1, + KBlock: int = 1, + NBlock: int = 1, + innerMostMBlock: int = 1, + innerMostKBlock: int = 1, + innerMostNBlock: int = 1, + ): + # you can set the default value and candidates by info from matmul_op + self.m = op.inputs[0].type.shape[0] + self.k = op.inputs[0].type.shape[1] + self.n = op.inputs[1].type.shape[1] + self.input_a_dtype = str(op.inputs[0].type.element_type) + self.num_threads = int(os.environ.get("OMP_NUM_THREADS", 1)) + self.m_threads = MThreads + self.k_threads = KThreads + self.n_threads = NThreads + self.m_block = MBlock + self.k_block = KBlock + self.n_block = NBlock + self.innermost_m_block = innerMostMBlock + self.innermost_k_block = innerMostKBlock + self.innermost_n_block = innerMostNBlock + super().__init__() + + def init_candidates(self): + default_blocks = [16, 32, 64, 128, 256, 512] + default_innermost_blocks = [16, 32] + self.field_candidates["m_threads"] = find_factors(self.num_threads) + self.field_candidates["k_threads"] = find_factors(self.num_threads) + self.field_candidates["n_threads"] = find_factors(self.num_threads) + self.field_candidates["m_block"] = [ + block for block in default_blocks if self.m >= block + ] + self.field_candidates["k_block"] = [ + block for block in default_blocks if self.k >= block + ] + self.field_candidates["n_block"] = [ + block for block in default_blocks if self.n >= block + ] + self.field_candidates["innermost_m_block"] = [ + block for block in default_innermost_blocks if self.m >= block + ] + self.field_candidates["innermost_k_block"] = [ + block for block in default_innermost_blocks if self.k >= block + ] + self.field_candidates["innermost_n_block"] = [ + block for block in default_innermost_blocks if self.n >= block + ] + + def init_constraints(self): + # example: using lambda to add constraints, adding constraints by the order of the fields + self.field_constraints["m_threads"] = None + self.field_constraints["k_threads"] = ( 
+ lambda MatMulConfig, k_threads: self.num_threads + % (MatMulConfig.m_threads * k_threads) + == 0 + ) + self.field_constraints["n_threads"] = ( + lambda MatMulConfig, n_threads: self.num_threads + % (MatMulConfig.m_threads * MatMulConfig.k_threads * n_threads) + == 0 + ) + self.field_constraints["m_block"] = None + self.field_constraints["k_block"] = None + self.field_constraints["n_block"] = None + self.field_constraints["innermost_m_block"] = ( + lambda MatMulConfig, innermost_m_block: MatMulConfig.m_block + % innermost_m_block + == 0 + ) + self.field_constraints["innermost_k_block"] = ( + lambda MatMulConfig, innermost_k_block: MatMulConfig.k_block + % innermost_k_block + == 0 + ) + self.field_constraints["innermost_n_block"] = ( + lambda MatMulConfig, innermost_n_block: MatMulConfig.n_block + % innermost_n_block + == 0 + ) + + def verify(self): + allow_indivisible_innerblock = False + is_vnni_mm2d = True if self.input_a_dtype == "bf16" else False + return validate_matmul_config( + [ + self.m_threads, + self.n_threads, + self.k_threads, + self.m_block, + self.n_block, + self.k_block, + self.innermost_m_block, + self.innermost_n_block, + self.innermost_k_block, + ], + [self.m, self.n, self.k], + allow_indivisible_innerblock, + is_vnni_mm2d, + ) + + def attach_to_ir(self, op: OpView): + attr_to_field = { + "MThreads": self.m_threads, + "KThreads": self.k_threads, + "NThreads": self.n_threads, + "MBlock": self.m_block, + "KBlock": self.k_block, + "NBlock": self.n_block, + "innermostMBlock": self.innermost_m_block, + "innermostKBlock": self.innermost_k_block, + "innermostNBlock": self.innermost_n_block, + } + for name, value in attr_to_field.items(): + op.attributes[name] = IntegerAttr.get(T.i32(), value) + + def __repr__(self) -> str: + return str( + [ + self.m_threads, + self.n_threads, + self.k_threads, + self.m_block, + self.n_block, + self.k_block, + self.innermost_m_block, + self.innermost_n_block, + self.innermost_k_block, + ] + ) + + def __str__(self) -> str: + obj_dict = { + "MatMulConfig": { + "MThreads": self.m_threads, + "NThreads": self.n_threads, + "KThreads": self.k_threads, + "MBlock": self.m_block, + "NBlock": self.n_block, + "KBlock": self.k_block, + "innerMostMBlock": self.innermost_m_block, + "innerMostNBlock": self.innermost_n_block, + "innerMostKBlock": self.innermost_k_block, + + } + } + return json.dumps(obj_dict, indent=4) + + +OP_TO_CONFIG = {"linalg.matmul": MatMulConfig, "onednn_graph.matmul": MatMulConfig} diff --git a/test/benchgc/src/benchgc/tuner/tuner.py b/test/benchgc/src/benchgc/tuner/tuner.py new file mode 100644 index 000000000..7e7f6a23f --- /dev/null +++ b/test/benchgc/src/benchgc/tuner/tuner.py @@ -0,0 +1,647 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. 
diff --git a/test/benchgc/src/benchgc/tuner/tuner.py b/test/benchgc/src/benchgc/tuner/tuner.py
new file mode 100644
index 000000000..7e7f6a23f
--- /dev/null
+++ b/test/benchgc/src/benchgc/tuner/tuner.py
@@ -0,0 +1,647 @@
+################################################################################
+# Copyright (C) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+################################################################################
+
+import json
+import os
+import random
+import sys
+import time
+from abc import ABC, abstractmethod
+from copy import deepcopy
+from typing import List
+
+from benchgc.tuner.config_filter import BloomFilter, HashSetFilter
+from benchgc.tuner.op_config import *
+from benchgc.tuner.utils import attach_configs_to_ir, gen_configs_from_ir
+from gc_mlir import ir
+
+
+class TuningSpace:
+    """
+    Bridge between the tuner and the configs in the MLIR module.
+    """
+
+    DEFAULT_SPACE_PERCENT = 1.0
+
+    def __init__(
+        self, ir_module: ir.Module, space_percent: float = DEFAULT_SPACE_PERCENT
+    ):
+        self.initial_ir = ir_module
+        self.graph_config = gen_configs_from_ir(ir_module)
+        self.space_size = 1
+        self.flatten_candidates = []
+        self.flatten_field_name = []
+        self.flatten_constraints = []
+        self.ind_candidate_to_config = {}
+        candidate_ind = 0
+        for config_ind, config in enumerate(self.graph_config):
+            for field_name, candidates in config.field_candidates.items():
+                self.space_size = self.space_size * len(candidates)
+                self.flatten_candidates.append(candidates)
+                self.flatten_field_name.append(field_name)
+                self.flatten_constraints.append(config.field_constraints[field_name])
+                self.ind_candidate_to_config[candidate_ind] = config_ind
+                candidate_ind += 1
+        self.space_size = int(self.space_size * space_percent)
+
+    def make_config_from_indexes(self, indexes: List[int]):
+        """
+        Build a graph config from a list of candidate indexes.
+        """
+        graph_config = deepcopy(self.graph_config)
+        for cid, candidate in enumerate(self.flatten_candidates):
+            val = candidate[indexes[cid]]
+            config = graph_config[self.ind_candidate_to_config[cid]]
+            field_name = self.flatten_field_name[cid]
+            setattr(config, field_name, val)
+        return graph_config
+
+    def get_cur_config(self, candidate_ind: int):
+        """
+        Get the config that owns the incoming candidate index.
+        """
+        return self.graph_config[self.ind_candidate_to_config[candidate_ind]]
+
+    def verify_config(self, candidate_idx, val) -> bool:
+        """
+        Verify a candidate value against the config's constraints.
+        """
+        config = self.get_cur_config(candidate_idx)
+        field_name = self.flatten_field_name[candidate_idx]
+        constraint = self.flatten_constraints[candidate_idx]
+        val = self.flatten_candidates[candidate_idx][val]
+        setattr(config, field_name, val)
+        if constraint and (not constraint(config, val)):
+            return False
+        # run the full config verification once all of its fields are set
+        if (candidate_idx + 1) == len(self.flatten_candidates) or (
+            self.ind_candidate_to_config[candidate_idx + 1]
+            != self.ind_candidate_to_config[candidate_idx]
+        ):
+            return config.verify()
+        return True
+
+    def filter_next_candidates(self, candidate_idx, val) -> List[int]:
+        """
+        Filter the candidates of the next field, given the value chosen for
+        the current field.
+        """
+        field_name = self.flatten_field_name[candidate_idx]
+        config = self.get_cur_config(candidate_idx)
+        setattr(
+            config,
+            field_name,
+            self.flatten_candidates[candidate_idx][val],
+        )
+        if (candidate_idx + 1) >= len(self.flatten_candidates):
+            return []
+        constraint = self.flatten_constraints[candidate_idx + 1]
+        if constraint:
+            next_candidates = self.flatten_candidates[candidate_idx + 1]
+            return [
+                index
+                for index, value in enumerate(next_candidates)
+                if constraint(config, value)
+            ]
+        else:
+            return list(range(len(self.flatten_candidates[candidate_idx + 1])))
+
+
+class Tuner(ABC):
+    """
+    Base class that generates batches of candidate configs and keeps the one
+    with the best measured performance.
+    """
+
+    DEFAULT_BATCH_SIZE = 50
+    DEFAULT_EARLY_STOP = -1
+    DEFAULT_TIMEOUT = -1
+    DEFAULT_MAX_ITERS = sys.maxsize
+
+    def __init__(
+        self,
+        batch_executor,
+        tuning_space: TuningSpace,
+        batch_size=DEFAULT_BATCH_SIZE,
+        early_stop=DEFAULT_EARLY_STOP,
+        checkpoint="",
+        tuner_verbose=False,
+    ):
+        self.batch_executor = batch_executor
+        self.batch_size = batch_size
+        self.early_stop = early_stop
+        self.best_cost = sys.float_info.max
+        self.best = []
+        self.iter = 0
+        self.last_update_iter = 0
+        self.skipped_num = 0
+        self.tuning_space = tuning_space
+        self.checkpoint = checkpoint
+        if self.checkpoint:
+            os.makedirs(os.path.dirname(self.checkpoint), exist_ok=True)
+        self.tuner_verbose = tuner_verbose
+        assert len(tuning_space.graph_config), "There are no tunable ops"
+
+    def tuner_update(self, config_indices_batch: List[List[int]], costs: List[float]):
+        """
+        Update the tuner state after a batch of configs has been executed.
+        """
+        if min(costs) < self.best_cost:
+            self.best_cost = min(costs)
+            self.best = config_indices_batch[costs.index(min(costs))]
+        if self.checkpoint:
+            self.save_status()
+
+    @abstractmethod
+    def get_next_config_indices_batch(self) -> List[List[int]]:
+        """
+        Get the next batch of config indices.
+        """
+        pass
+
+    @abstractmethod
+    def load_status(self):
+        """
+        Load the tuner status from the checkpoint.
+        """
+        pass
+
+    @abstractmethod
+    def save_status(self):
+        """
+        Save the tuner status to the checkpoint.
+        """
+        pass
+
+    def tuner_finish(self, tuning_time):
+        """
+        Report the result when tuning finishes.
+        """
+        print("Tuning ends in", tuning_time, "s")
+        best_config = self.tuning_space.make_config_from_indexes(self.best)
+        print("Best cost:", self.best_cost, "ms")
+        print("Best config:", [str(single_cfg) for single_cfg in best_config])
+        attach_configs_to_ir(self.tuning_space.initial_ir, best_config)
+        print(
+            "mlir:\n",
+            self.tuning_space.initial_ir,
+        )
+
+    def run(self, max_iter: int = DEFAULT_MAX_ITERS, timeout: int = DEFAULT_TIMEOUT):
+        """
+        Run the tuning loop.
+        """
+        if self.early_stop > 0 and self.iter - self.last_update_iter > self.early_stop:
+            # handles resuming from a checkpoint that had already early-stopped
+            print("Early stop now")
+            return
+        start_time = time.time()
+        space_size = self.tuning_space.space_size
+        while self.iter < max_iter and self.iter < space_size:
+            config_indices_batch = self.get_next_config_indices_batch()
+            if not config_indices_batch:
+                print("Tuner returns empty batch, early stop now")
+                break
+            if len(config_indices_batch) > min(
+                max_iter - self.iter, space_size - self.iter
+            ):
+                config_indices_batch = config_indices_batch[
+                    : min(max_iter - self.iter, space_size - self.iter)
+                ]
+
+            old_iter = self.iter
+            self.iter += len(config_indices_batch)
+            perf_result = []
+            ir_modules = []
+            for config_indexes in config_indices_batch:
+                real_config = self.tuning_space.make_config_from_indexes(
+                    config_indexes
+                )
+                # TODO: ir.Module does not support deepcopy, so re-parse the
+                # module text instead
+                new_ir = ir.Module.parse(
+                    str(self.tuning_space.initial_ir),
+                    self.tuning_space.initial_ir.context,
+                )
+                attach_configs_to_ir(new_ir, real_config)
+                ir_modules.append(new_ir)
+            if self.tuner_verbose:
+                print("start to execute the batch of configs ...")
+            res = self.batch_executor(ir_modules)
+            perf_result = [item[1] for item in res]
+            # print the perf result of each config
+            if self.tuner_verbose:
+                for i, config_indexes in enumerate(config_indices_batch):
+                    real_config = self.tuning_space.make_config_from_indexes(
+                        config_indexes
+                    )
+                    perf_to_cfg = {"cost": perf_result[i], "cfg": repr(real_config)}
+                    print(json.dumps(perf_to_cfg))
+
+            old_best = self.best_cost
+            self.tuner_update(config_indices_batch, perf_result)
+            print(
+                "[",
+                self.iter,
+                "/",
+                min(max_iter, space_size),
+                "] skipped:",
+                self.skipped_num,
+                "best:",
+                self.best_cost,
+                "ms",
+            )
+            if self.best_cost != old_best:
+                self.last_update_iter = old_iter
+            else:
+                if (
+                    self.early_stop > 0
+                    and old_iter - self.last_update_iter > self.early_stop
+                ):
+                    print("Early stop now")
+                    break
+            if timeout >= 0 and time.time() - start_time > timeout:
+                print("Tuning timeout...")
+                break
+        self.tuner_finish(time.time() - start_time)
+
+
+class GridTuner(Tuner):
+    """
+    Tuner based on grid search.
+    """
+
+    def __init__(
+        self,
+        batch_executor,
+        tuning_space: TuningSpace,
+        batch_size=Tuner.DEFAULT_BATCH_SIZE,
+        early_stop=Tuner.DEFAULT_EARLY_STOP,
+        checkpoint="",
+        tuner_verbose=False,
+    ):
+        super().__init__(
+            batch_executor,
+            tuning_space,
+            batch_size,
+            early_stop,
+            checkpoint,
+            tuner_verbose,
+        )
+        self.current_idx = 0
+        # cumulative_size[j] is the product of the candidate counts of all
+        # fields after j, used to decode a flat index into per-field indexes
+        self.cumulative_size = [1] * len(self.tuning_space.flatten_candidates)
+        for i in range(len(self.cumulative_size) - 2, -1, -1):
+            self.cumulative_size[i] = self.cumulative_size[i + 1] * len(
+                self.tuning_space.flatten_candidates[i + 1]
+            )
+        if self.checkpoint:
+            self.load_status()
+
+    def get_next_config_indices_batch(self) -> list:
+        config_indices_batch = []
+        while len(config_indices_batch) < self.batch_size:
+            if self.current_idx >= self.tuning_space.space_size:
+                break
+            config_ids = [-1] * len(self.tuning_space.flatten_candidates)
+            remain = self.current_idx
+            valid_config_idx = True
+            for j in range(len(config_ids)):
+                config_ids[j] = remain // self.cumulative_size[j]
+                valid_config_idx = self.tuning_space.verify_config(j, config_ids[j])
+                if not valid_config_idx:
+                    break
+                remain = remain % self.cumulative_size[j]
+            self.current_idx = self.current_idx + 1
+            if valid_config_idx:
+                config_indices_batch.append(config_ids)
+                if self.tuner_verbose:
+                    print(
+                        "find valid config",
+                        self.tuning_space.make_config_from_indexes(config_ids),
+                    )
+            else:
+                self.skipped_num += 1
+                if self.tuner_verbose:
+                    print("bad config, skip...")
+        return config_indices_batch
+
+    def save_status(self):
+        save_dict = {
+            "iter": self.iter,
+            "last_update_iter": self.last_update_iter,
+            "best": self.best,
+            "best_cost": self.best_cost,
+            "current_idx": self.current_idx,
+            "skipped_num": self.skipped_num,
+        }
+        with open(self.checkpoint, "w") as file:
+            json.dump(save_dict, file, indent=4)
+
+    def load_status(self):
+        print("continue tuning from checkpoint...")
+        with open(self.checkpoint, "r") as file:
+            try:
+                data = json.load(file)
+                assert set(
+                    [
+                        "iter",
+                        "last_update_iter",
+                        "best",
+                        "best_cost",
+                        "current_idx",
+                        "skipped_num",
+                    ]
+                ) == set(data.keys())
+                self.iter = data["iter"]
+                self.last_update_iter = data["last_update_iter"]
+                self.best = data["best"]
+                self.best_cost = data["best_cost"]
+                self.current_idx = data["current_idx"]
+                self.skipped_num = data["skipped_num"]
+            except Exception as e:
+                print("failed to load the checkpoint:", e)
+
+
+class GATuner(Tuner):
+    """Tuner based on a genetic algorithm."""
+
+    DEFAULT_ELITE_NUM = 9
+    DEFAULT_MUTATION_PROB = 0.1
+    DEFAULT_RANDOM_SEED = 0
+    DEFAULT_EXPECTED_TUNE_NUM = 0
+
+    def __init__(
+        self,
+        batch_executor,
+        tuning_space,
+        pop_size=Tuner.DEFAULT_BATCH_SIZE,
+        early_stop=Tuner.DEFAULT_EARLY_STOP,
+        checkpoint="",
+        tuner_verbose=False,
+        elite_num: int = DEFAULT_ELITE_NUM,
+        mutation_prob: float = DEFAULT_MUTATION_PROB,
+        random_seed: int = DEFAULT_RANDOM_SEED,
+        expected_tune_num: int = DEFAULT_EXPECTED_TUNE_NUM,
+    ):
+        super().__init__(
+            batch_executor,
+            tuning_space,
+            pop_size,
+            early_stop,
+            checkpoint,
+            tuner_verbose,
+        )
+        self.elite_num = min(elite_num, pop_size)
+        self.mutation_prob = mutation_prob
+        self.pop_size = pop_size
+        self.cur_mutation_prob = mutation_prob
+        self.prev_results = []
+        self.elites = []
+        random.seed(random_seed)
+        if expected_tune_num == 0:
+            self.filter = HashSetFilter()
+        else:
+            self.filter = BloomFilter(expected_tune_num, err_rate=0.01)
+
+        # use a comprehension so that each entry is a distinct list
+        self.candidate_indices = [
+            [] for _ in range(len(self.tuning_space.flatten_candidates))
+        ]
+        self.candidate_indices[0] = list(
+            range(len(self.tuning_space.flatten_candidates[0]))
+        )
+        if self.checkpoint:
+            self.load_status()
+
+    def save_status(self):
+        save_dict = {
+            "iter": self.iter,
+            "last_update_iter": self.last_update_iter,
+            "best": self.best,
+            "best_cost": self.best_cost,
+            "skipped_num": self.skipped_num,
+            "cur_mutation_prob": self.cur_mutation_prob,
+            "prev_results": self.prev_results,
+            "elites": self.elites,
+            "tuned": list(self.filter.save()),
+        }
+        with open(self.checkpoint, "w") as file:
+            json.dump(save_dict, file, indent=4)
+
+    def load_status(self):
+        print("continue tuning from checkpoint...")
+        with open(self.checkpoint, "r") as file:
+            try:
+                data = json.load(file)
+                assert set(
+                    [
+                        "iter",
+                        "last_update_iter",
+                        "best",
+                        "best_cost",
+                        "skipped_num",
+                        "cur_mutation_prob",
+                        "prev_results",
+                        "elites",
+                        "tuned",
+                    ]
+                ) == set(data.keys())
+                self.iter = data["iter"]
+                self.last_update_iter = data["last_update_iter"]
+                self.best = data["best"]
+                self.best_cost = data["best_cost"]
+                self.skipped_num = data["skipped_num"]
+                self.cur_mutation_prob = data["cur_mutation_prob"]
+                self.prev_results = data["prev_results"]
+                self.elites = data["elites"]
+                self.filter.load(data["tuned"])
+            except Exception as e:
+                print("failed to load the checkpoint:", e)
+
+    def set_field(self, gene, idx, val):
+        gene[idx] = val
+        self.update_candidate_indices(idx, val)
+
+    def update_candidate_indices(self, idx, val):
+        next_candidates = self.tuning_space.filter_next_candidates(idx, val)
+        if idx + 1 < len(self.candidate_indices):
+            self.candidate_indices[idx + 1] = next_candidates
+
+    @staticmethod
+    def update_mutation_prob(prob, lower_bound, move_up):
+        if move_up:
+            prob = min(prob * 1.01, 0.5)
+        else:
+            prob = max(prob * 0.98, lower_bound)
+        return prob
+
+    @staticmethod
+    def random_choice(prob_range) -> int:
+        random_val = random.randint(0, sys.maxsize) / sys.maxsize
+        for i in range(len(prob_range)):
+            if random_val <= prob_range[i]:
+                return i
+        return -1
+
+    def push_to_tune(self, to_tune, gene) -> bool:
+        if self.filter.already_met(gene):
+            self.cur_mutation_prob = GATuner.update_mutation_prob(
+                self.cur_mutation_prob, self.mutation_prob, True
+            )
+            return False
+        if gene in to_tune:
+            self.cur_mutation_prob = GATuner.update_mutation_prob(
+                self.cur_mutation_prob, self.mutation_prob, True
+            )
+            return False
+
+        graph_cfg = self.tuning_space.make_config_from_indexes(gene)
+        for cfg in graph_cfg:
+            if not cfg.verify():
+                return False
+
+        to_tune.append(gene)
+        self.cur_mutation_prob = GATuner.update_mutation_prob(
+            self.cur_mutation_prob, self.mutation_prob, False
+        )
+        return True
+
+    def get_next_config_indices_batch(self) -> list:
+        prob_range = [0.0] * len(self.prev_results)
+        total_score = 0
+        for i, prev_result in enumerate(self.prev_results):
+            total_score += prev_result[1]
+            prob_range[i] = total_score
+        prob_range = [x / total_score for x in prob_range]
+        to_tune = []
+        for _ in range(self.pop_size):
+            self.get_next_config(prob_range, to_tune)
+
+        if self.tuner_verbose:
+            print("to_tune list:")
+            for to_tune_config in to_tune:
+                print(self.tuning_space.make_config_from_indexes(to_tune_config))
+
+        if len(to_tune) < self.pop_size:
+            print(
+                f"GA cannot generate enough unvisited genes in this batch (batch_size={self.pop_size})"
+            )
+        return to_tune
+
+    def get_next_config(self, prob_range, to_tune):
+        max_tries = 20
+        try_cnt = 0
+        while try_cnt < max_tries:
+            try_cnt += 1
+            if not self.elites:
+                gene = [-1] * len(self.tuning_space.flatten_candidates)
+                need_redo = True
+                redo_cnt = 0
+                while redo_cnt < 50 and need_redo:
+                    need_redo = False
+                    for j in range(len(gene)):
+                        # try to randomly pick one candidate
+                        data, success = GATuner.random_item_from(
+                            self.candidate_indices[j]
+                        )
+                        if not success:
+                            need_redo = True
+                            break
+                        else:
+                            self.set_field(gene, j, data)
+                    redo_cnt += 1
+                if need_redo:
+                    print("Cannot create a valid random gene")
+                    # retry rather than pushing an incomplete gene
+                    continue
+                if self.push_to_tune(to_tune, gene):
+                    return
+            else:
+                assert len(self.prev_results) > 0
+                if len(prob_range) == 1:
+                    return
+                gene_size = len(self.tuning_space.flatten_candidates)
+                first_gene = GATuner.random_choice(prob_range)
+                second_gene = GATuner.random_choice(prob_range)
+                while second_gene == first_gene:
+                    second_gene = GATuner.random_choice(prob_range)
+
+                joint_point = random.randint(0, gene_size)
+
+                new_gene = [-1] * gene_size
+                need_redo = False
+                for j in range(gene_size):
+                    candidates = self.candidate_indices[j]
+                    if not candidates:
+                        need_redo = True
+                        continue
+                    if (
+                        random.randint(0, sys.maxsize) / sys.maxsize
+                    ) < self.cur_mutation_prob:
+                        self.set_field(
+                            new_gene, j, GATuner.random_item_from(candidates)[0]
+                        )
+                    else:
+                        # inherit from the parents
+                        left_gene = self.prev_results[first_gene][0][j]
+                        right_gene = self.prev_results[second_gene][0][j]
+                        if j < joint_point:
+                            preferred_gene = left_gene
+                            unpreferred_gene = right_gene
+                        else:
+                            preferred_gene = right_gene
+                            unpreferred_gene = left_gene
+
+                        if preferred_gene in candidates:
+                            self.set_field(new_gene, j, preferred_gene)
+                        elif unpreferred_gene in candidates:
+                            self.set_field(new_gene, j, unpreferred_gene)
+                        else:
+                            self.set_field(
+                                new_gene, j, GATuner.random_item_from(candidates)[0]
+                            )
+                if need_redo:
+                    print("need_redo")
+                    continue
+
+                if self.push_to_tune(to_tune, new_gene):
+                    return
+
+    def tuner_update(
+        self, config_indices_batch: List[List[int]], perf_result: List[float]
+    ):
+        self.prev_results.clear()
+        for i, config_indices in enumerate(config_indices_batch):
+            self.filter.add(config_indices)
+            self.prev_results.append((config_indices, 1 / perf_result[i]))
+
+        for elite in self.elites:
+            self.prev_results.append(elite)
+        self.elites = sorted(self.prev_results, key=lambda x: x[1], reverse=True)[
+            : self.elite_num
+        ]
+        super().tuner_update(config_indices_batch, perf_result)
+
+    @staticmethod
+    def random_item_from(v: List[int]):
+        if not v:
+            return 0, False
+        return v[random.randint(0, len(v) - 1)], True
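(Reviewer note) A sketch of how a tuner is expected to be driven end to end; fake_batch_executor and the module text are placeholders standing in for the real benchgc compile-and-benchmark flow:

    from benchgc.tuner.tuner import GATuner, TuningSpace
    from gc_mlir import ir

    def fake_batch_executor(ir_modules):
        # the tuner reads item[1] of each result as the cost in ms
        return [(m, 1.0) for m in ir_modules]

    with ir.Context() as ctx:
        module = ir.Module.parse("...", ctx)  # must contain a tunable matmul
        space = TuningSpace(module)
        tuner = GATuner(fake_batch_executor, space, pop_size=20)
        tuner.run(max_iter=100, timeout=3600)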
diff --git a/test/benchgc/src/benchgc/tuner/utils.py b/test/benchgc/src/benchgc/tuner/utils.py
new file mode 100644
index 000000000..28e1ce0e1
--- /dev/null
+++ b/test/benchgc/src/benchgc/tuner/utils.py
@@ -0,0 +1,61 @@
+################################################################################
+# Copyright (C) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+# SPDX-License-Identifier: Apache-2.0
+################################################################################
+
+
+from typing import List
+
+from benchgc.tuner.op_config import OP_TO_CONFIG, Config
+from gc_mlir import ir
+
+
+def get_all_tunable_ops(op: ir.Operation):
+    """Recursively collect the tunable ops nested under the given op."""
+    tunable_ops = []
+    for region in op.regions:
+        for block in region:
+            for child_op in block:
+                if (
+                    "skipTuner" in child_op.attributes
+                    and child_op.attributes["skipTuner"]
+                ):
+                    continue
+                if child_op.name in OP_TO_CONFIG:
+                    tunable_ops.append(child_op)
+                tunable_ops = tunable_ops + get_all_tunable_ops(child_op)
+    return tunable_ops
+
+
+def gen_configs_from_ir(ir_module: ir.Module):
+    """Generate configs from the IR module."""
+    tunable_ops = get_all_tunable_ops(ir_module.operation)
+    configs = []
+    for op in tunable_ops:
+        if op.name in OP_TO_CONFIG:
+            configs.append(OP_TO_CONFIG[op.name](op))
+    return configs
+
+
+def attach_configs_to_ir(ir_module: ir.Module, configs: List[Config]):
+    """Attach the configs to the tunable ops in the IR module."""
+    tunable_ops = get_all_tunable_ops(ir_module.operation)
+    assert len(tunable_ops) == len(
+        configs
+    ), "tunable ops and configs should have the same length"
+    for i, op in enumerate(tunable_ops):
+        if op.name in OP_TO_CONFIG:
+            with ir_module.context:
+                configs[i].attach_to_ir(op)
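(Reviewer note) gen_configs_from_ir and attach_configs_to_ir are intended to round-trip through the same op traversal, so the i-th config always lands back on the i-th tunable op. A short sketch under that assumption (module text elided):

    from benchgc.tuner.utils import attach_configs_to_ir, gen_configs_from_ir
    from gc_mlir import ir

    with ir.Context() as ctx:
        module = ir.Module.parse("...", ctx)
        configs = gen_configs_from_ir(module)  # one Config per tunable op
        for cfg in configs:
            print(cfg)                         # JSON dump of the field values
        attach_configs_to_ir(module, configs)  # writes fields back as i32 attrs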