diff --git a/taichi/codegen/cpu/codegen_cpu.cpp b/taichi/codegen/cpu/codegen_cpu.cpp index 7eae6f054a72b..245596cf57ce7 100644 --- a/taichi/codegen/cpu/codegen_cpu.cpp +++ b/taichi/codegen/cpu/codegen_cpu.cpp @@ -302,39 +302,17 @@ FunctionType CodeGenCPU::codegen() { kernel->lower(/*to_executable=*/false); } - auto block = dynamic_cast<Block *>(kernel->ir.get()); - auto &worker = get_llvm_program(kernel->program)->compilation_workers; - TI_ASSERT(block); - - auto &offloads = block->statements; - std::vector<LLVMCompiledData> data(offloads.size()); - using TaskFunc = int32 (*)(void *); - std::vector<TaskFunc> task_funcs(offloads.size()); - for (int i = 0; i < offloads.size(); i++) { - auto compile_func = [&, i] { - auto offload = - irpass::analysis::clone(offloads[i].get(), offloads[i]->get_kernel()); - irpass::re_id(offload.get()); - auto new_data = this->modulegen(nullptr, offload->as<OffloadedStmt>()); - data[i].tasks = std::move(new_data.tasks); - data[i].module = std::move(new_data.module); - }; - if (kernel->is_evaluator) { - compile_func(); - } else { - worker.enqueue(compile_func); - } - } - if (!kernel->is_evaluator) { - worker.flush(); - } + CodeGenLLVMCPU gen(kernel, ir); + auto compiled_res = gen.run_compilation(); + CPUModuleToFunctionConverter converter{gen.tlctx, + llvm_prog->get_runtime_executor()}; + std::vector<LLVMCompiledData> data_list; + data_list.push_back(std::move(compiled_res)); if (!kernel->is_evaluator) { - cache_module(kernel_key, data); + cache_module(kernel_key, data_list); } - CPUModuleToFunctionConverter converter( - tlctx, get_llvm_program(prog)->get_runtime_executor()); - return converter.convert(kernel, std::move(data)); + return converter.convert(this->kernel, std::move(data_list)); } TLANG_NAMESPACE_END diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp index 08508a8e7de34..b9542e32a39cc 100644 --- a/taichi/program/compile_config.cpp +++ b/taichi/program/compile_config.cpp @@ -58,7 +58,7 @@ CompileConfig::CompileConfig() { print_kernel_llvm_ir = false; 
print_kernel_nvptx = false; print_kernel_llvm_ir_optimized = false; - num_compile_threads = 2; + num_compile_threads = 0; // CUDA backend options: device_memory_GB = 1; // by default, preallocate 1 GB GPU memory diff --git a/tests/python/test_offline_cache.py b/tests/python/test_offline_cache.py index 6250534addcb4..ccbd8818011b9 100644 --- a/tests/python/test_offline_cache.py +++ b/tests/python/test_offline_cache.py @@ -5,8 +5,6 @@ from os import listdir, remove, rmdir, stat from os.path import join from tempfile import mkdtemp -from time import sleep -from typing import List import pytest @@ -38,11 +36,12 @@ def get_cache_files_size(path): return result -def get_expected_num_cache_files(num_offloads: List[int] = None) -> int: - if not num_offloads: +def get_expected_num_cache_files(num_kernels: int) -> int: + if num_kernels == 0: return 0 + NUM_CACHE_FILES_PER_KERNEL = 1 # metadata.{json, tcb} - return 2 + sum(num_offloads) + return 2 + NUM_CACHE_FILES_PER_KERNEL * num_kernels def tmp_offline_cache_file_path(): @@ -101,11 +100,10 @@ def python_kernel3(a, mat): simple_kernels_to_test = [ - (kernel0, (), python_kernel0, 1), - (kernel1, (100, 200, 10.2), python_kernel1, 1), - (kernel2, (1024, ), python_kernel2, 3), - (kernel3, (10, ti.Matrix([[1, 2], [256, 1024]], - ti.i32)), python_kernel3, 1), + (kernel0, (), python_kernel0), + (kernel1, (100, 200, 10.2), python_kernel1), + (kernel2, (1024, ), python_kernel2), + (kernel3, (10, ti.Matrix([[1, 2], [256, 1024]], ti.i32)), python_kernel3), ] @@ -129,8 +127,7 @@ def wrapped(*args, **kwargs): @_test_offline_cache_dec -def _test_offline_cache_for_a_kernel(curr_arch, kernel, args, result, - num_offloads): +def _test_offline_cache_for_a_kernel(curr_arch, kernel, args, result): count_of_cache_file = len(listdir(tmp_offline_cache_file_path())) ti.init(arch=curr_arch, @@ -138,20 +135,20 @@ def _test_offline_cache_for_a_kernel(curr_arch, kernel, args, result, **current_thread_ext_options()) res1 = kernel(*args) assert 
len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files() + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) - assert len(listdir(tmp_offline_cache_file_path( - ))) - count_of_cache_file == get_expected_num_cache_files([num_offloads]) + assert len(listdir(tmp_offline_cache_file_path()) + ) - count_of_cache_file == get_expected_num_cache_files(1) res2 = kernel(*args) assert res1 == test_utils.approx(result) and res1 == test_utils.approx( res2) ti.reset() - assert len(listdir(tmp_offline_cache_file_path( - ))) - count_of_cache_file == get_expected_num_cache_files([num_offloads]) + assert len(listdir(tmp_offline_cache_file_path()) + ) - count_of_cache_file == get_expected_num_cache_files(1) @_test_offline_cache_dec @@ -163,13 +160,13 @@ def _test_closing_offline_cache_for_a_kernel(curr_arch, kernel, args, result): offline_cache_file_path=tmp_offline_cache_file_path()) res1 = kernel(*args) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files() + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, offline_cache_file_path=tmp_offline_cache_file_path()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files() + ) - count_of_cache_file == get_expected_num_cache_files(0) res2 = kernel(*args) assert res1 == test_utils.approx(result) and res1 == test_utils.approx( @@ -177,12 +174,12 @@ def _test_closing_offline_cache_for_a_kernel(curr_arch, kernel, args, result): ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files() + ) - count_of_cache_file == get_expected_num_cache_files(0) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) def test_closing_offline_cache(curr_arch): - for kernel, args, get_res, 
num_offloads in simple_kernels_to_test: + for kernel, args, get_res in simple_kernels_to_test: _test_closing_offline_cache_for_a_kernel(curr_arch=curr_arch, kernel=kernel, args=args, @@ -191,13 +188,11 @@ def test_closing_offline_cache(curr_arch): @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) def test_offline_cache_per_kernel(curr_arch): - for kernel, args, get_res, num_offloads in simple_kernels_to_test: - _test_offline_cache_for_a_kernel( - curr_arch=curr_arch, - kernel=kernel, - args=args, - result=get_res(*args), - num_offloads=num_offloads if curr_arch is ti.cpu else 1) + for kernel, args, get_res in simple_kernels_to_test: + _test_offline_cache_for_a_kernel(curr_arch=curr_arch, + kernel=kernel, + args=args, + result=get_res(*args)) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -229,18 +224,18 @@ def compute_y(): **current_thread_ext_options()) helper() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files() + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files([8]) + ) - count_of_cache_file == get_expected_num_cache_files(8) helper() ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files([8]) + ) - count_of_cache_file == get_expected_num_cache_files(8) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -264,7 +259,7 @@ def np_kernel(a, b): np_mat3 = mat3.to_numpy() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files() + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) @@ -274,7 +269,7 @@ def np_kernel(a, b): enable_fallback=False, 
**current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files([1]) + ) - count_of_cache_file == get_expected_num_cache_files(1) assert (kernel(mat1, mat1).to_numpy() == np_kernel(np_mat1, np_mat1)).all() assert (kernel(mat1, mat2).to_numpy() == np_kernel(np_mat1, np_mat2)).all() @@ -283,7 +278,7 @@ def np_kernel(a, b): ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files([1]) + ) - count_of_cache_file == get_expected_num_cache_files(1) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -306,7 +301,7 @@ def helper(): assert y[None] == test_utils.approx(7.28) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files() + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) @@ -316,12 +311,12 @@ def helper(): enable_fallback=False, **current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files([4]) + ) - count_of_cache_file == get_expected_num_cache_files(4) helper() ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files([4]) + ) - count_of_cache_file == get_expected_num_cache_files(4) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -330,7 +325,7 @@ def test_calling_many_kernels(curr_arch): count_of_cache_file = len(listdir(tmp_offline_cache_file_path())) def helper(): - for kernel, args, get_res, num_offloads in simple_kernels_to_test: + for kernel, args, get_res in simple_kernels_to_test: assert (kernel(*args) == test_utils.approx(get_res(*args))) ti.init(arch=curr_arch, @@ -338,23 +333,19 @@ def helper(): **current_thread_ext_options()) helper() assert len(listdir(tmp_offline_cache_file_path()) - ) - 
count_of_cache_file == get_expected_num_cache_files() + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files([ - kern[3] if curr_arch is ti.cpu else 1 - for kern in simple_kernels_to_test - ]) + ) - count_of_cache_file == get_expected_num_cache_files( + len(simple_kernels_to_test)) helper() ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files([ - kern[3] if curr_arch is ti.cpu else 1 - for kern in simple_kernels_to_test - ]) + ) - count_of_cache_file == get_expected_num_cache_files( + len(simple_kernels_to_test)) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -371,7 +362,7 @@ def helper(): c += i assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files() + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, default_fp=ti.f32, @@ -383,14 +374,12 @@ def helper(): default_fp=ti.f64, **current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files( - [2] if curr_arch is ti.cpu else [1]) + ) - count_of_cache_file == get_expected_num_cache_files(1) helper() ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files( - [2, 2] if curr_arch is ti.cpu else [1, 1]) + ) - count_of_cache_file == get_expected_num_cache_files(2) ti.init(arch=curr_arch, enable_fallback=False, default_fp=ti.f32, @@ -400,8 +389,7 @@ def helper(): ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files( - [2, 2] if curr_arch is ti.cpu else [1, 1]) + ) - count_of_cache_file == get_expected_num_cache_files(2) 
@pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -420,9 +408,8 @@ def only_init(max_size): def run_simple_kernels(max_size): only_init(max_size) - for kernel, args, get_res, num_offloads in simple_kernels_to_test: + for kernel, args, get_res in simple_kernels_to_test: assert kernel(*args) == test_utils.approx(get_res(*args)) - sleep(1) # make sure the kernels are not used in the same second kernel_count = len(simple_kernels_to_test) rem_factor = 1 if policy in [ @@ -431,39 +418,23 @@ def run_simple_kernels(max_size): count_of_cache_file = len(listdir(tmp_offline_cache_file_path())) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files() + ) - count_of_cache_file == get_expected_num_cache_files(0) run_simple_kernels(1024**3) # 1GB ti.reset() # Dumping cache data size_of_cache_files = get_cache_files_size(tmp_offline_cache_file_path()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files([ - kern[3] if curr_arch is ti.cpu else 1 - for kern in simple_kernels_to_test - ]) + ) - count_of_cache_file == get_expected_num_cache_files( + len(simple_kernels_to_test)) only_init(size_of_cache_files * 2) ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == get_expected_num_cache_files([ - kern[3] if curr_arch is ti.cpu else 1 - for kern in simple_kernels_to_test - ]) + ) - count_of_cache_file == get_expected_num_cache_files( + len(simple_kernels_to_test)) only_init(size_of_cache_files) ti.reset() - rem = 0 - if policy in ['never', 'version']: - rem = sum([ - kern[3] if curr_arch is ti.cpu else 1 - for kern in simple_kernels_to_test - ]) - else: - for i in range( - min(kernel_count - int(factor * kernel_count), kernel_count)): - rem += simple_kernels_to_test[kernel_count - i - - 1][3] if curr_arch is ti.cpu else 1 - if rem > 0: - rem += 2 - assert len(listdir( - tmp_offline_cache_file_path())) - 
count_of_cache_file == rem + assert len(listdir(tmp_offline_cache_file_path()) + ) - count_of_cache_file == get_expected_num_cache_files( + int(len(simple_kernels_to_test) * rem_factor))