Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BenchGC] add tuner tools for benchgc #358

Open
wants to merge 17 commits into
base: main
Choose a base branch
from
7 changes: 7 additions & 0 deletions python/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,13 @@ declare_mlir_python_extension(GcPythonSources.CpuInfoExtension
CPUInfo.cpp
)

# Native extension module `_tools` (built from Tools.cpp); it is imported by
# the gc_mlir.tools Python package to expose tuning helpers such as
# validate_matmul_config.
declare_mlir_python_extension(GcPythonSources.ToolsExtension
MODULE_NAME _tools
ADD_TO_PARENT GcPythonSources
SOURCES
Tools.cpp
)

################################################################################
# Common CAPI
################################################################################
Expand Down
40 changes: 40 additions & 0 deletions python/Tools.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
*/

#include "gc/Analysis/MatmulConfigAnalysis.h"
#include "mlir/Bindings/Python/PybindAdaptors.h"

// Python bindings for the BenchGC tuning helpers.
PYBIND11_MODULE(_tools, m) {
  // Validate a candidate matmul configuration against the given shape.
  auto validate = [](const std::vector<uint32_t> &cfgList,
                     std::vector<uint32_t> &shape,
                     bool allowIndivisibleInnerblock, bool isVnniMm2d) {
    // MatmulConfig is aggregate-initialized from exactly nine scalars.
    if (cfgList.size() != 9) {
      throw std::invalid_argument("cfg_list must have exactly 9 elements");
    }
    mlir::gc::MatmulConfig config{cfgList[0], cfgList[1], cfgList[2],
                                  cfgList[3], cfgList[4], cfgList[5],
                                  cfgList[6], cfgList[7], cfgList[8]};
    return mlir::gc::validateConfig(config, shape, allowIndivisibleInnerblock,
                                    isVnniMm2d);
  };

  m.def("validate_matmul_config", validate, py::arg("cfg_list"),
        py::arg("shape"), py::arg("allow_indivisible_innerblock"),
        py::arg("is_vnni_mm2d"), "Validate the matmul configuration");
}
2 changes: 2 additions & 0 deletions python/gc_mlir/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===-----------------------------------------------------------------------===#

from .._mlir_libs._tools import validate_matmul_config
160 changes: 80 additions & 80 deletions scripts/correctness.sh

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions test/benchgc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,4 @@ add_subdirectory("src/benchgc/tensor")
add_subdirectory("src/benchgc/arith")
add_subdirectory("src/benchgc/pattern")
add_subdirectory("src/benchgc/math")
add_subdirectory("src/benchgc/tuner")
4 changes: 4 additions & 0 deletions test/benchgc/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ python -m benchgc [OPTIONS] --mode [MODE] --driver [DRIVER] --case [CASE]
### --mode [str]
* C : correctness testing (by default)
* P : performance testing
* T : performance tuning, see tuner [`README.md`](src/benchgc/tuner/README.md)

### --driver [str]
* linalg: test the single op in linalg dialect
Expand Down Expand Up @@ -138,12 +139,15 @@ module {
### --bench_kind [str]
* py : use the MLIR Python API to invoke the kernel and use Python to calculate the time cost
* wrapper : modify MLIR by wrapping the kernel into a new method and calling the `nanoTime()` method before and after calling the kernel. Finally, calculate the difference as the time cost
* default: `py`

### --warm_up [int]
* number of warm-up executions before timing starts
* default: 100

### --repeat [int]
* number of timed executions; the cost is averaged over these runs
* default: 100

## Pattern Options
Each pattern has its own unique options.
Expand Down
126 changes: 120 additions & 6 deletions test/benchgc/src/benchgc/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,15 @@
set_default_fill,
)
from benchgc.arg.arg import Arg
from benchgc.bench import mlir_wrapper_bench, py_timeit_bench
from benchgc.bench import (
batch_mlir_wrapper_bench,
batch_py_timeit_bench,
mlir_wrapper_bench,
py_timeit_bench,
)
from benchgc.mlir.arg import get_mlir_args
from benchgc.pattern import get_pattern_clz
from benchgc.tuner.tuner import GATuner, GridTuner, Tuner, TuningSpace
from gc_mlir import ir
from gc_mlir.graph_compiler import GraphCompiler

Expand All @@ -44,7 +50,7 @@ def add_common_options(parser: argparse.ArgumentParser):
"--mode",
required=False,
help="specify the test mode, C for correctness testing, P for performance testing",
choices=["C", "P"],
choices=["C", "P", "T"],
default="C",
type=str,
)
Expand Down Expand Up @@ -198,13 +204,20 @@ def add_common_options(parser: argparse.ArgumentParser):

def add_bench_options(parser: argparse.ArgumentParser):
"""add options for bench mode"""
if parser.parse_known_args()[0].mode == "P":
if parser.parse_known_args()[0].mode in ("P", "T"):
parser.add_argument(
"--bench_kind", type=str, choices=["py", "wrapper"], default="py"
)
parser.add_argument("--warm_up", type=int, default=100)
parser.add_argument("--repeat", type=int, default=100)

parser.add_argument(
"--warm_up",
type=int,
default=100 if parser.parse_known_args()[0].mode == "P" else 2,
)
parser.add_argument(
"--repeat",
type=int,
default=100 if parser.parse_known_args()[0].mode == "P" else 4,
)


def add_pattern_options(parser: argparse.ArgumentParser):
Expand All @@ -213,6 +226,45 @@ def add_pattern_options(parser: argparse.ArgumentParser):
pattern_name = parser.parse_known_args()[0].case
get_pattern_clz(pattern_name).add_args(parser)

def add_tuner_options(parser: argparse.ArgumentParser):
    """Register the command-line options for tuning mode ("T").

    The tuner flags are only added when --mode is "T" so the other modes
    keep a clean --help output; the GA-specific flags are in turn only
    added when --search_alg resolves to "ga".

    Args:
        parser: the shared argument parser being built up incrementally.
    """
    # Peek at the flags known so far; parse_known_args tolerates the
    # tuner options that have not been registered yet.
    if parser.parse_known_args()[0].mode == "T":
        parser.add_argument(
            "--search_alg", type=str, choices=["grid", "ga"], default="grid"
        )
        parser.add_argument(
            "--tuning_batch", type=int, default=Tuner.DEFAULT_BATCH_SIZE
        )
        parser.add_argument("--early_stop", type=int, default=Tuner.DEFAULT_EARLY_STOP)
        parser.add_argument(
            "--max_tuning_iters", type=int, default=Tuner.DEFAULT_MAX_ITERS
        )
        parser.add_argument("--timeout", type=int, default=Tuner.DEFAULT_TIMEOUT)
        parser.add_argument(
            "--space_percent", type=float, default=TuningSpace.DEFAULT_SPACE_PERCENT
        )
        parser.add_argument(
            "--tuner_verbose",
            action="store_true",
            help="print the tuner log",
        )
        parser.add_argument("--checkpoint_path", type=str, default="")

        # Re-parse: --search_alg only became visible just above. This stays
        # inside the mode check so non-tuning modes never touch search_alg.
        if parser.parse_known_args()[0].search_alg == "ga":
            parser.add_argument(
                "--ga_random_seed", type=int, default=GATuner.DEFAULT_RANDOM_SEED
            )
            parser.add_argument(
                "--ga_elite_num", type=int, default=GATuner.DEFAULT_ELITE_NUM
            )
            parser.add_argument(
                "--ga_mutation_prob", type=float, default=GATuner.DEFAULT_MUTATION_PROB
            )
            parser.add_argument(
                "--ga_expected_tune_num",
                type=int,
                default=GATuner.DEFAULT_EXPECTED_TUNE_NUM,
            )

def get_module_and_args(flags: argparse.Namespace):
args: List[Arg] = []
Expand Down Expand Up @@ -391,17 +443,79 @@ def performance_testing(flags: argparse.Namespace, module: ir.Module, args: List
print(json_res)


def performance_tuning(flags: argparse.Namespace, module: ir.Module, args: List[Arg]):
    """Tune the kernel configuration (mode "T").

    Fills the input tensors, wraps the selected batch-bench backend into a
    callback, builds the tuning space and runs the chosen tuner (grid
    search or genetic algorithm).

    Args:
        flags: parsed command-line flags (must include the tuner options).
        module: the MLIR module containing the kernel to tune.
        args: argument descriptors used to materialize the kernel inputs.

    Raises:
        ValueError: if --space_percent is outside (0, 1].
    """
    gc_args: List[torch.Tensor | int] = []
    gc_tensors: Dict[str, torch.Tensor] = {}
    for i, arg in enumerate(args):
        tensor = fill_tensor(flags, arg, i)
        gc_tensors["%arg" + str(i)] = tensor
        # Scalars are passed to the runtime by address, tensors by value.
        if arg.scalar:
            gc_args.append(tensor.data_ptr())
        else:
            gc_args.append(tensor)

    mlir_args = get_mlir_args(gc_args)
    with module.context as ctx, ir.Location.unknown():
        if flags.ir_printing:
            # IR printing interleaves badly with multithreaded pass runs.
            ctx.enable_multithreading(False)
        batch_bench = (
            batch_py_timeit_bench
            if flags.bench_kind == "py"
            else batch_mlir_wrapper_bench
        )

        def tuner_batch_bench(ir_modules):
            """Benchmark a batch of candidate modules with the fixed args."""
            return batch_bench(
                ir_modules,
                flags.entry,
                "any(gc-cpu-pipeline)",
                mlir_args,
                flags.ir_printing,
                flags.repeat,
                flags.warm_up,
            )

        # Validate explicitly rather than with `assert`, which is stripped
        # when Python runs with -O.
        if not 0 < flags.space_percent <= 1.0:
            raise ValueError("--space_percent must be in (0, 1]")
        space = TuningSpace(module, flags.space_percent)
        if flags.search_alg == "grid":
            tuner = GridTuner(
                tuner_batch_bench,
                space,
                flags.tuning_batch,
                flags.early_stop,
                flags.checkpoint_path,
                flags.tuner_verbose,
            )
        else:
            tuner = GATuner(
                tuner_batch_bench,
                space,
                flags.tuning_batch,
                flags.early_stop,
                flags.checkpoint_path,
                flags.tuner_verbose,
                flags.ga_elite_num,
                flags.ga_mutation_prob,
                random_seed=flags.ga_random_seed,
                expected_tune_num=flags.ga_expected_tune_num,
            )
        tuner.run(flags.max_tuning_iters, flags.timeout)


if __name__ == "__main__":
arg_parser = argparse.ArgumentParser(prog="benchmark tool for graph compiler")
add_common_options(arg_parser)
add_bench_options(arg_parser)
add_pattern_options(arg_parser)
add_tuner_options(arg_parser)
flags = arg_parser.parse_args()
benchgc.util.set_seed(flags.seed)
ir_module, module_args = get_module_and_args(flags)
if flags.mode == "C":
correctness_testing(flags, ir_module, module_args)
elif flags.mode == "P":
performance_testing(flags, ir_module, module_args)
elif flags.mode == "T":
performance_tuning(flags, ir_module, module_args)
else:
pass
11 changes: 6 additions & 5 deletions test/benchgc/src/benchgc/bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,21 +117,22 @@ def batch_py_timeit_bench(
ir_modules: List[ir.Module],
entry_name: str,
pipeline: str,
mlir_args: list,
mlir_args: List[Any],
ir_printing=False,
repeat_time=5,
warm_up=2,
) -> List[Tuple[float, float]]:
"""benchmark a batch of mlir with python timeit."""
compiler = GraphCompiler(pipeline)
engines = []
funcs = []
compile_costs = []
for m in ir_modules:
compile_begin = timeit.default_timer()
engine = compiler.compile_and_jit(m, ir_printing=ir_printing)
engines.append(engine)
compile_cost = (timeit.default_timer() - compile_begin) * 1000
compile_costs.append(compile_cost)
funcs.append(engine.lookup(entry_name))

# Copied from execution_engine.py so that the cost of cast does not affect perf result.
packed_args = (ctypes.c_void_p * len(mlir_args))()
Expand All @@ -141,11 +142,11 @@ def batch_py_timeit_bench(
def run_bench(func, arg):
func(arg)

for func in funcs:
for func in [engine.lookup(entry_name) for engine in engines]:
timeit.timeit(lambda: run_bench(func, packed_args), number=warm_up)

execute_costs = []
for func in funcs:
for func in [engine.lookup(entry_name) for engine in engines]:
total_time = timeit.timeit(
lambda: run_bench(func, packed_args), number=repeat_time
)
Expand All @@ -158,7 +159,7 @@ def batch_mlir_wrapper_bench(
ir_modules: ir.Module,
entry_name: str,
pipeline: str,
mlir_args: list,
mlir_args: List[Any],
ir_printing=False,
repeat_time=5,
warm_up=2,
Expand Down
22 changes: 22 additions & 0 deletions test/benchgc/src/benchgc/tuner/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
################################################################################
# Copyright (C) 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions
# and limitations under the License.
# SPDX-License-Identifier: Apache-2.0
################################################################################


file(GLOB PYTHON_SCRIPTS "*.py")
foreach(PY_SCRIPT ${PYTHON_SCRIPTS})
configure_file(${PY_SCRIPT} ${CMAKE_BINARY_DIR}/test/benchgc/src/benchgc/tuner/ COPYONLY)
endforeach()
Loading
Loading