diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index a8d2d2060..e02cd0e2f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -268,6 +268,8 @@ LogicalResult AIETargetBackend::serializeExecutable( SmallVector xclbinIndices(ordinalCount); SmallVector asmInstrIndices(ordinalCount); + SmallVector> xclbinPaths; + for (size_t i = 0; i < entryPointNames.size(); i++) { uint64_t ordinal = entryPointOrdinals.at(entryPointNames[i]); @@ -300,18 +302,34 @@ LogicalResult AIETargetBackend::serializeExecutable( llvm::sys::path::append(npuInstPath, entryPointNamesFb[ordinal] + ".npu.txt"); - SmallVector cmdArgs{aie2xclbin, - inputMlirPath, - "--peano", - options.peanoInstallDir, - "--xclbin-name", - xclbinPath, - "--npu-insts-name", - npuInstPath, - "--xclbin-kernel-name", - entryPointNamesFb[ordinal], - "--tmpdir", - entryPointWorkDir}; + // Convert ordinal to hexadecimal string for xclbin kern id + std::stringstream ss; + ss << "0x" << std::hex << ordinal + 10; + std::string ordinalHex = ss.str(); + + SmallVector cmdArgs; + SmallVector cmdArgsBase{aie2xclbin, + inputMlirPath, + "--peano", + options.peanoInstallDir, + "--xclbin-name", + xclbinPath, + "--npu-insts-name", + npuInstPath, + "--xclbin-kernel-name", + entryPointNamesFb[ordinal], + "--tmpdir", + entryPointWorkDir, + "--xclbin-kernel-id", + ordinalHex}; + cmdArgs = cmdArgsBase; + bool AttemptingMerge = false; + if (i > 0) { + cmdArgs.push_back("--input-xclbin-name"); + cmdArgs.push_back(xclbinPaths.back()); + AttemptingMerge = true; + } + xclbinPaths.push_back(xclbinPath); auto addOpt = [&](StringRef arg, bool value) { if (value) cmdArgs.push_back(arg); @@ -350,11 +368,24 @@ LogicalResult AIETargetBackend::serializeExecutable( { SmallVector cmdEnvRefs{cmdEnv.begin(), cmdEnv.end()}; int result = llvm::sys::ExecuteAndWait(cmdArgs[0], cmdArgs, cmdEnvRefs); - if (result != 0) + if (result != 0 && AttemptingMerge) { + // we failed to create xclbin but maybe we failed becuase we were trying + // to merge the kerenel in exisiting kernel, try again to see if perhaps + // we have success if we dont try to merge. + AttemptingMerge = false; + result = + llvm::sys::ExecuteAndWait(cmdArgsBase[0], cmdArgsBase, cmdEnvRefs); + xclbinPaths.push_back(xclbinPath); + } + if (result != 0) { return moduleOp.emitOpError( "Failed to produce an XCLBin with external tool."); + } + // delete the previous xclbin if we were able to merge as the new one now + // will have all the kernels from the previous one. + if (AttemptingMerge) xclbinPaths.erase(xclbinPaths.end() - 2); + xclbinIndices[ordinal] = xclbinPaths.size() - 1; } - std::ifstream instrFile(static_cast(npuInstPath)); std::string line; while (std::getline(instrFile, line)) { @@ -369,7 +400,7 @@ LogicalResult AIETargetBackend::serializeExecutable( asmInstrIndices[ordinal] = asmInstrRefs.size(); asmInstrRefs.push_back( iree_amd_aie_hal_xrt_AsmInstDef_create(builder, npuInstrsVec)); - + /* xclbinIn = openInputFile(xclbinPath, &errorMessage); if (!xclbinIn) { moduleOp.emitOpError() << "Failed to open xclbin file: " << errorMessage; @@ -378,7 +409,21 @@ LogicalResult AIETargetBackend::serializeExecutable( xclbinIndices[ordinal] = xclbinRefs.size(); xclbinRefs.push_back( iree_amd_aie_hal_xrt_XclbinDef_create(builder, xclbinStringRef)); + */ + } + // write out the final xclbins to flatbuffer + for (auto xclbinPath : xclbinPaths) { + llvm::outs() << "writing xclbin from path: " << xclbinPath << "\n"; + std::string errorMessage; + xclbinIn = openInputFile(xclbinPath, &errorMessage); + if (!xclbinIn) { + moduleOp.emitOpError() << "Failed to open xclbin file: " << errorMessage; + } + auto xclbinStringRef = builder.createString(xclbinIn->getBuffer()); + xclbinRefs.push_back( + iree_amd_aie_hal_xrt_XclbinDef_create(builder, xclbinStringRef)); } + // Serialize the executable to flatbuffer format auto entryPointsRef = builder.createStringVec(entryPointNamesFb); diff --git a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc index 8ee4e919b..f7d75be6a 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc @@ -128,6 +128,9 @@ iree_status_t iree_hal_xrt_native_executable_create( iree_amd_aie_hal_xrt_XclbinDef_vec_t xclbins_vec = iree_amd_aie_hal_xrt_ExecutableDef_xclbins_get(executable_def); + iree_host_size_t number_xclbin = + iree_amd_aie_hal_xrt_XclbinDef_vec_len(xclbins_vec); + iree_amd_aie_hal_xrt_AsmInstDef_vec_t asm_instrs_vec = iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_get(executable_def); @@ -163,17 +166,15 @@ iree_status_t iree_hal_xrt_native_executable_create( &executable->resource); executable->host_allocator = host_allocator; executable->entry_point_count = entry_point_count; - for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; - entry_ordinal++) { - const char* entry_name = - flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); - uint32_t xclbin_index = - flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal); + // collect all the hardware contexts first as muliple entry points can map to + // the same context and this way we dont need to keep reloading them. + std::vector contexts; + for (iree_host_size_t xclbin_index = 0; xclbin_index < number_xclbin; + xclbin_index++) { iree_amd_aie_hal_xrt_XclbinDef_table_t xclbin_def = iree_amd_aie_hal_xrt_XclbinDef_vec_at(xclbins_vec, xclbin_index); flatbuffers_string_t xclbin_fb = iree_amd_aie_hal_xrt_XclbinDef_xclbin_get(xclbin_def); - // XRT API needs this vector and cant actually read a void*. std::vector xclbinVector( xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb)); @@ -186,6 +187,14 @@ iree_status_t iree_hal_xrt_native_executable_create( } device.register_xclbin(xclbin); xrt::hw_context context(device, xclbin.get_uuid()); + contexts.push_back(context); + } + for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; + entry_ordinal++) { + const char* entry_name = + flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); + uint32_t xclbin_index = + flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal); uint32_t asm_instr_index = flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); iree_amd_aie_hal_xrt_AsmInstDef_table_t asminst_def = @@ -196,7 +205,8 @@ iree_status_t iree_hal_xrt_native_executable_create( std::unique_ptr kernel; std::unique_ptr instr; try { - kernel = std::make_unique(context, entry_name); + kernel = + std::make_unique(contexts[xclbin_index], entry_name); // XCL_BO_FLAGS_CACHEABLE is used to indicate that this is an instruction // buffer that resides in instr_memory. This buffer is always passed as // the first argument to the kernel and we can use the