diff --git a/build_tools/python_deploy/build_linux_packages.sh b/build_tools/python_deploy/build_linux_packages.sh index 54f45c28c82a..1a63b7048282 100755 --- a/build_tools/python_deploy/build_linux_packages.sh +++ b/build_tools/python_deploy/build_linux_packages.sh @@ -148,10 +148,12 @@ function build_iree_runtime() { export IREE_RUNTIME_BUILD_TRACY=ON # We install the needed build deps below for the tools. export IREE_RUNTIME_BUILD_TRACY_TOOLS=ON + export IREE_EXTERNAL_HAL_DRIVERS="rocm" build_wheel runtime/ } function build_iree_compiler() { + export IREE_TARGET_BACKEND_ROCM=ON build_wheel compiler/ } diff --git a/build_tools/python_deploy/build_windows_packages.ps1 b/build_tools/python_deploy/build_windows_packages.ps1 index 43906f8800f7..a8fbd3a9aadf 100644 --- a/build_tools/python_deploy/build_windows_packages.ps1 +++ b/build_tools/python_deploy/build_windows_packages.ps1 @@ -67,11 +67,13 @@ function run() { function build_iree_runtime() { param($python_version) $env:IREE_HAL_DRIVER_VULKAN = "ON" + $env:IREE_EXTERNAL_HAL_DRIVERS = "rocm" & py -${python_version} -m pip wheel -v -w $output_dir $repo_root/runtime/ } function build_iree_compiler() { param($python_version) + $env:IREE_TARGET_BACKEND_ROCM= "ON" py -${python_version} -m pip wheel -v -w $output_dir $repo_root/compiler/ } diff --git a/experimental/rocm/CMakeLists.txt b/experimental/rocm/CMakeLists.txt index c2b313ddd0e3..8b3c3b9eb2e2 100644 --- a/experimental/rocm/CMakeLists.txt +++ b/experimental/rocm/CMakeLists.txt @@ -25,7 +25,7 @@ set(IREE_ROCM_BC_DIR "${IREE_ROCM_BC_DIR_DEFAULT}" CACHE STRING iree_add_all_subdirs() if(NOT ROCM_HEADERS_API_ROOT) - set(ROCM_HEADERS_API_ROOT "/opt/rocm/include") + set(ROCM_HEADERS_API_ROOT ${CMAKE_CURRENT_LIST_DIR}) endif() if(EXISTS ${ROCM_HEADERS_API_ROOT}) @@ -68,7 +68,7 @@ iree_cc_library( INCLUDES "${CMAKE_CURRENT_LIST_DIR}/../.." "${PROJECT_BINARY_DIR}" - "${ROCM_HEADERS_API_ROOT}" + "${ROCM_HEADERS_API_ROOT}/.." DEPS ::dynamic_symbols iree::base diff --git a/experimental/rocm/hip_headers.h b/experimental/rocm/hip_headers.h new file mode 100644 index 000000000000..28bcd5667482 --- /dev/null +++ b/experimental/rocm/hip_headers.h @@ -0,0 +1,1489 @@ +/* + * Copyright 2011-2021 Blender Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License + */ + +#ifndef __HIPEW_H__ +#define __HIPEW_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#define HIP_IPC_HANDLE_SIZE 64 +#define hipHostMallocDefault 0x00 +#define hipHostMallocPortable 0x01 +#define hipHostMallocMapped 0x02 +#define hipHostMallocWriteCombined 0x04 +#define hipHostMallocNumaUser 0x20000000 +#define hipHostMallocCoherent 0x40000000 +#define hipHostMallocNonCoherent 0x80000000 +#define hipHostRegisterPortable 0x01 +#define hipHostRegisterMapped 0x02 +#define hipHostRegisterIoMemory 0x04 +#define hipCooperativeLaunchMultiDeviceNoPreSync 0x01 +#define hipCooperativeLaunchMultiDeviceNoPostSync 0x02 +#define hipArrayLayered 0x01 +#define hipArraySurfaceLoadStore 0x02 +#define hipArrayCubemap 0x04 +#define hipArrayTextureGather 0x08 +#define HIP_TRSA_OVERRIDE_FORMAT 0x01 +#define HIP_TRSF_READ_AS_INTEGER 0x01 +#define HIP_TRSF_NORMALIZED_COORDINATES 0x02 +#define HIP_LAUNCH_PARAM_BUFFER_POINTER ((void*)0x01) +#define HIP_LAUNCH_PARAM_BUFFER_SIZE ((void*)0x02) +#define HIP_LAUNCH_PARAM_END ((void*)0x03) + +/* Functions which changed 3.1 -> 3.2 for 64 bit stuff, + * the cuda library has both the old ones for compatibility and new + * ones with _v2 postfix, + */ +#define hipModuleGetGlobal hipModuleGetGlobal +#define hipMemGetInfo hipMemGetInfo +#define hipMemAllocPitch hipMemAllocPitch +#define hipMemGetAddressRange hipMemGetAddressRange +#define hipMemcpy hipMemcpy +#define hipMemcpyHtoD hipMemcpyHtoD +#define hipMemcpyDtoH hipMemcpyDtoH +#define hipMemcpyDtoD hipMemcpyDtoD +#define hipMemcpyHtoA hipMemcpyHtoA +#define hipMemcpyAtoH hipMemcpyAtoH +/* +* #define hipMemcpyAsync hipMemcpyAsync +* #define hipMemcpyHtoDAsync hipMemcpyHtoDAsync +* #define hipMemcpyDtoHAsync hipMemcpyDtoHAsync +* #define hipMemcpyDtoDAsync hipMemcpyDtoDAsync +*/ +#define hipMemsetD8 hipMemsetD8 +#define hipMemsetD16 hipMemsetD16 +#define hipMemsetD32 hipMemsetD32 +#define hipMemsetAsync hipMemsetAsync +#define hipMemsetD8Async hipMemsetD8Async +#define hipMemsetD16Async hipMemsetD16Async +#define hipMemsetD32Async hipMemsetD32Async +#define hipArrayCreate hipArrayCreate +#define hipArray3DCreate hipArray3DCreate +#define hipPointerGetAttributes hipPointerGetAttributes +#define hipTexRefSetAddress hipTexRefSetAddress +#define hipTexRefGetAddress hipTexRefGetAddress +#define hipStreamDestroy hipStreamDestroy +#define hipEventDestroy hipEventDestroy +#define hipTexRefSetAddress2D hipTexRefSetAddress2D + +/* Types. */ +#ifdef _MSC_VER +typedef unsigned __int32 hipuint32_t; +typedef unsigned __int64 hipuint64_t; +#else +#include +typedef uint32_t hipuint32_t; +typedef uint64_t hipuint64_t; +#endif + +#if defined(__x86_64) || defined(AMD64) || defined(_M_AMD64) || defined (__aarch64__) +/* + * Changed to void* for Windows / MSVC + * typedef unsigned long long hipDeviceptr_t; + */ +typedef void * hipDeviceptr_t; +#else +typedef unsigned int hipDeviceptr_t; +#endif + + +#ifdef _WIN32 +# define HIPAPI __stdcall +# define HIP_CB __stdcall +#else +# define HIPAPI +# define HIP_CB +#endif + +typedef int hipDevice_t; +typedef struct ihipCtx_t* hipCtx_t; +typedef struct ihipModule_t* hipModule_t; +typedef struct ihipModuleSymbol_t* hipFunction_t; +typedef struct hipArray* hArray; +typedef struct hipMipmappedArray_st* hipMipmappedArray_t; +typedef struct ihipEvent_t* hipEvent_t; +typedef struct ihipStream_t* hipStream_t; +typedef unsigned long long hipTextureObject_t; +typedef void* hipExternalMemory_t; + +typedef struct HIPuuid_st { + char bytes[16]; +} HIPuuid; + +typedef enum hipMemcpyKind { + hipMemcpyHostToHost = 0, + hipMemcpyHostToDevice = 1, + hipMemcpyDeviceToHost = 2, + hipMemcpyDeviceToDevice = 3, + hipMemcpyDefault = 4 +} hipMemcpyKind; + +typedef enum hipChannelFormatKind { + hipChannelFormatKindSigned = 0, + hipChannelFormatKindUnsigned = 1, + hipChannelFormatKindFloat = 2, + hipChannelFormatKindNone = 3, +}hipChannelFormatKind; + +typedef struct hipChannelFormatDesc { + int x; + int y; + int z; + int w; + enum hipChannelFormatKind f; +}hipChannelFormatDesc; + +typedef enum hipTextureFilterMode { + hipFilterModePoint = 0, + hipFilterModeLinear = 1, +} hipTextureFilterMode; + +typedef enum hipArray_Format { + HIP_AD_FORMAT_UNSIGNED_INT8 = 0x01, + HIP_AD_FORMAT_SIGNED_INT8 = 0x08, + HIP_AD_FORMAT_UNSIGNED_INT16 = 0x02, + HIP_AD_FORMAT_SIGNED_INT16 = 0x09, + HIP_AD_FORMAT_UNSIGNED_INT32 = 0x03, + HIP_AD_FORMAT_SIGNED_INT32 = 0x0a, + HIP_AD_FORMAT_HALF = 0x10, + HIP_AD_FORMAT_FLOAT = 0x20, +} hipArray_Format; + +typedef enum hipTextureAddressMode { + hipAddressModeWrap = 0, + hipAddressModeClamp = 1, + hipAddressModeMirror = 2, + hipAddressModeBorder = 3, +} hipTextureAddressMode; + +/** + * hip texture reference + */ +typedef struct textureReference { + int normalized; + //enum hipTextureReadMode readMode;// used only for driver API's + enum hipTextureFilterMode filterMode; + enum hipTextureAddressMode addressMode[3]; // Texture address mode for up to 3 dimensions + struct hipChannelFormatDesc channelDesc; + int sRGB; // Perform sRGB->linear conversion during texture read + unsigned int maxAnisotropy; // Limit to the anisotropy ratio + enum hipTextureFilterMode mipmapFilterMode; + float mipmapLevelBias; + float minMipmapLevelClamp; + float maxMipmapLevelClamp; + + hipTextureObject_t textureObject; + int numChannels; + enum hipArray_Format format; +}textureReference; + +typedef textureReference* hipTexRef; + +typedef enum hipMemoryType { + hipMemoryTypeHost = 0x00, + hipMemoryTypeDevice = 0x01, + hipMemoryTypeArray = 0x02, + hipMemoryTypeUnified = 0x03, +} hipMemoryType; + +/** + * Pointer attributes + */ +typedef struct hipPointerAttribute_t { + enum hipMemoryType memoryType; + int device; + void* devicePointer; + void* hostPointer; + int isManaged; + unsigned allocationFlags; /* flags specified when memory was allocated*/ + /* peers? */ +} hipPointerAttribute_t; + +typedef struct ihipIpcEventHandle_t { + char reserved[HIP_IPC_HANDLE_SIZE]; +} ihipIpcEventHandle_t; + +typedef struct hipIpcMemHandle_st { + char reserved[HIP_IPC_HANDLE_SIZE]; +} hipIpcMemHandle_t; + +typedef enum HIPipcMem_flags_enum { + hipIpcMemLazyEnablePeerAccess = 0x1, +} HIPipcMem_flags; + +typedef enum HIPmemAttach_flags_enum { + hipMemAttachGlobal = 0x1, + hipMemAttachHost = 0x2, + HIP_MEM_ATTACH_SINGLE = 0x4, +} HIPmemAttach_flags; + +typedef enum HIPctx_flags_enum { + hipDeviceScheduleAuto = 0x00, + hipDeviceScheduleSpin = 0x01, + hipDeviceScheduleYield = 0x02, + hipDeviceScheduleBlockingSync = 0x04, + hipDeviceScheduleMask = 0x07, + hipDeviceMapHost = 0x08, + hipDeviceLmemResizeToMax = 0x10, +} HIPctx_flags; + +typedef enum HIPstream_flags_enum { + hipStreamDefault = 0x0, + hipStreamNonBlocking = 0x1, +} HIPstream_flags; + +typedef enum HIPevent_flags_enum { + hipEventDefault = 0x0, + hipEventBlockingSync = 0x1, + hipEventDisableTiming = 0x2, + hipEventInterprocess = 0x4, +} HIPevent_flags; + +typedef enum HIPstreamWaitValue_flags_enum { + HIP_STREAM_WAIT_VALUE_GEQ = 0x0, + HIP_STREAM_WAIT_VALUE_EQ = 0x1, + HIP_STREAM_WAIT_VALUE_AND = 0x2, + HIP_STREAM_WAIT_VALUE_NOR = 0x3, + HIP_STREAM_WAIT_VALUE_FLUSH = (1 << 30), +} HIPstreamWaitValue_flags; + +typedef enum HIPstreamWriteValue_flags_enum { + HIP_STREAM_WRITE_VALUE_DEFAULT = 0x0, + HIP_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER = 0x1, +} HIPstreamWriteValue_flags; + +typedef enum HIPstreamBatchMemOpType_enum { + HIP_STREAM_MEM_OP_WAIT_VALUE_32 = 1, + HIP_STREAM_MEM_OP_WRITE_VALUE_32 = 2, + HIP_STREAM_MEM_OP_WAIT_VALUE_64 = 4, + HIP_STREAM_MEM_OP_WRITE_VALUE_64 = 5, + HIP_STREAM_MEM_OP_FLUSH_REMOTE_WRITES = 3, +} HIPstreamBatchMemOpType; + + +typedef union HIPstreamBatchMemOpParams_union { + HIPstreamBatchMemOpType operation; + struct HIPstreamMemOpWaitValueParams_st { + HIPstreamBatchMemOpType operation; + hipDeviceptr_t address; + union { + hipuint32_t value; + hipuint64_t value64; + }; + unsigned int flags; + hipDeviceptr_t alias; + } waitValue; + struct HIPstreamMemOpWriteValueParams_st { + HIPstreamBatchMemOpType operation; + hipDeviceptr_t address; + union { + hipuint32_t value; + hipuint64_t value64; + }; + unsigned int flags; + hipDeviceptr_t alias; + } writeValue; + struct HIPstreamMemOpFlushRemoteWritesParams_st { + HIPstreamBatchMemOpType operation; + unsigned int flags; + } flushRemoteWrites; + hipuint64_t pad[6]; +} HIPstreamBatchMemOpParams; + +typedef enum HIPoccupancy_flags_enum { + hipOccupancyDefault = 0x0, + HIP_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1, +} HIPoccupancy_flags; + +typedef enum hipDeviceAttribute_t { + hipDeviceAttributeCudaCompatibleBegin = 0, + hipDeviceAttributeEccEnabled = hipDeviceAttributeCudaCompatibleBegin, ///< Whether ECC support is enabled. + hipDeviceAttributeAccessPolicyMaxWindowSize, ///< Cuda only. The maximum size of the window policy in bytes. + hipDeviceAttributeAsyncEngineCount, ///< Cuda only. Asynchronous engines number. + hipDeviceAttributeCanMapHostMemory, ///< Whether host memory can be mapped into device address space + hipDeviceAttributeCanUseHostPointerForRegisteredMem,///< Cuda only. Device can access host registered memory + ///< at the same virtual address as the CPU + hipDeviceAttributeClockRate, ///< Peak clock frequency in kilohertz. + hipDeviceAttributeComputeMode, ///< Compute mode that device is currently in. + hipDeviceAttributeComputePreemptionSupported, ///< Cuda only. Device supports Compute Preemption. + hipDeviceAttributeConcurrentKernels, ///< Device can possibly execute multiple kernels concurrently. + hipDeviceAttributeConcurrentManagedAccess, ///< Device can coherently access managed memory concurrently with the CPU + hipDeviceAttributeCooperativeLaunch, ///< Support cooperative launch + hipDeviceAttributeCooperativeMultiDeviceLaunch, ///< Support cooperative launch on multiple devices + hipDeviceAttributeDeviceOverlap, ///< Cuda only. Device can concurrently copy memory and execute a kernel. + ///< Deprecated. Use instead asyncEngineCount. + hipDeviceAttributeDirectManagedMemAccessFromHost, ///< Host can directly access managed memory on + ///< the device without migration + hipDeviceAttributeGlobalL1CacheSupported, ///< Cuda only. Device supports caching globals in L1 + hipDeviceAttributeHostNativeAtomicSupported, ///< Cuda only. Link between the device and the host supports native atomic operations + hipDeviceAttributeIntegrated, ///< Device is integrated GPU + hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. + hipDeviceAttributeKernelExecTimeout, ///< Run time limit for kernels executed on the device + hipDeviceAttributeL2CacheSize, ///< Size of L2 cache in bytes. 0 if the device doesn't have L2 cache. + hipDeviceAttributeLocalL1CacheSupported, ///< caching locals in L1 is supported + hipDeviceAttributeLuid, ///< Cuda only. 8-byte locally unique identifier in 8 bytes. Undefined on TCC and non-Windows platforms + hipDeviceAttributeLuidDeviceNodeMask, ///< Cuda only. Luid device node mask. Undefined on TCC and non-Windows platforms + hipDeviceAttributeComputeCapabilityMajor, ///< Major compute capability version number. + hipDeviceAttributeManagedMemory, ///< Device supports allocating managed memory on this system + hipDeviceAttributeMaxBlocksPerMultiProcessor, ///< Cuda only. Max block size per multiprocessor + hipDeviceAttributeMaxBlockDimX, ///< Max block size in width. + hipDeviceAttributeMaxBlockDimY, ///< Max block size in height. + hipDeviceAttributeMaxBlockDimZ, ///< Max block size in depth. + hipDeviceAttributeMaxGridDimX, ///< Max grid size in width. + hipDeviceAttributeMaxGridDimY, ///< Max grid size in height. + hipDeviceAttributeMaxGridDimZ, ///< Max grid size in depth. + hipDeviceAttributeMaxSurface1D, ///< Maximum size of 1D surface. + hipDeviceAttributeMaxSurface1DLayered, ///< Cuda only. Maximum dimensions of 1D layered surface. + hipDeviceAttributeMaxSurface2D, ///< Maximum dimension (width, height) of 2D surface. + hipDeviceAttributeMaxSurface2DLayered, ///< Cuda only. Maximum dimensions of 2D layered surface. + hipDeviceAttributeMaxSurface3D, ///< Maximum dimension (width, height, depth) of 3D surface. + hipDeviceAttributeMaxSurfaceCubemap, ///< Cuda only. Maximum dimensions of Cubemap surface. + hipDeviceAttributeMaxSurfaceCubemapLayered, ///< Cuda only. Maximum dimension of Cubemap layered surface. + hipDeviceAttributeMaxTexture1DWidth, ///< Maximum size of 1D texture. + hipDeviceAttributeMaxTexture1DLayered, ///< Cuda only. Maximum dimensions of 1D layered texture. + hipDeviceAttributeMaxTexture1DLinear, ///< Maximum number of elements allocatable in a 1D linear texture. + ///< Use cudaDeviceGetTexture1DLinearMaxWidth() instead on Cuda. + hipDeviceAttributeMaxTexture1DMipmap, ///< Cuda only. Maximum size of 1D mipmapped texture. + hipDeviceAttributeMaxTexture2DWidth, ///< Maximum dimension width of 2D texture. + hipDeviceAttributeMaxTexture2DHeight, ///< Maximum dimension hight of 2D texture. + hipDeviceAttributeMaxTexture2DGather, ///< Cuda only. Maximum dimensions of 2D texture if gather operations performed. + hipDeviceAttributeMaxTexture2DLayered, ///< Cuda only. Maximum dimensions of 2D layered texture. + hipDeviceAttributeMaxTexture2DLinear, ///< Cuda only. Maximum dimensions (width, height, pitch) of 2D textures bound to pitched memory. + hipDeviceAttributeMaxTexture2DMipmap, ///< Cuda only. Maximum dimensions of 2D mipmapped texture. + hipDeviceAttributeMaxTexture3DWidth, ///< Maximum dimension width of 3D texture. + hipDeviceAttributeMaxTexture3DHeight, ///< Maximum dimension height of 3D texture. + hipDeviceAttributeMaxTexture3DDepth, ///< Maximum dimension depth of 3D texture. + hipDeviceAttributeMaxTexture3DAlt, ///< Cuda only. Maximum dimensions of alternate 3D texture. + hipDeviceAttributeMaxTextureCubemap, ///< Cuda only. Maximum dimensions of Cubemap texture + hipDeviceAttributeMaxTextureCubemapLayered, ///< Cuda only. Maximum dimensions of Cubemap layered texture. + hipDeviceAttributeMaxThreadsDim, ///< Maximum dimension of a block + hipDeviceAttributeMaxThreadsPerBlock, ///< Maximum number of threads per block. + hipDeviceAttributeMaxThreadsPerMultiProcessor, ///< Maximum resident threads per multiprocessor. + hipDeviceAttributeMaxPitch, ///< Maximum pitch in bytes allowed by memory copies + hipDeviceAttributeMemoryBusWidth, ///< Global memory bus width in bits. + hipDeviceAttributeMemoryClockRate, ///< Peak memory clock frequency in kilohertz. + hipDeviceAttributeComputeCapabilityMinor, ///< Minor compute capability version number. + hipDeviceAttributeMultiGpuBoardGroupID, ///< Cuda only. Unique ID of device group on the same multi-GPU board + hipDeviceAttributeMultiprocessorCount, ///< Number of multiprocessors on the device. + hipDeviceAttributeName, ///< Device name. + hipDeviceAttributePageableMemoryAccess, ///< Device supports coherently accessing pageable memory + ///< without calling hipHostRegister on it + hipDeviceAttributePageableMemoryAccessUsesHostPageTables, ///< Device accesses pageable memory via the host's page tables + hipDeviceAttributePciBusId, ///< PCI Bus ID. + hipDeviceAttributePciDeviceId, ///< PCI Device ID. + hipDeviceAttributePciDomainID, ///< PCI Domain ID. + hipDeviceAttributePersistingL2CacheMaxSize, ///< Cuda11 only. Maximum l2 persisting lines capacity in bytes + hipDeviceAttributeMaxRegistersPerBlock, ///< 32-bit registers available to a thread block. This number is shared + ///< by all thread blocks simultaneously resident on a multiprocessor. + hipDeviceAttributeMaxRegistersPerMultiprocessor, ///< 32-bit registers available per block. + hipDeviceAttributeReservedSharedMemPerBlock, ///< Cuda11 only. Shared memory reserved by CUDA driver per block. + hipDeviceAttributeMaxSharedMemoryPerBlock, ///< Maximum shared memory available per block in bytes. + hipDeviceAttributeSharedMemPerBlockOptin, ///< Cuda only. Maximum shared memory per block usable by special opt in. + hipDeviceAttributeSharedMemPerMultiprocessor, ///< Cuda only. Shared memory available per multiprocessor. + hipDeviceAttributeSingleToDoublePrecisionPerfRatio, ///< Cuda only. Performance ratio of single precision to double precision. + hipDeviceAttributeStreamPrioritiesSupported, ///< Cuda only. Whether to support stream priorities. + hipDeviceAttributeSurfaceAlignment, ///< Cuda only. Alignment requirement for surfaces + hipDeviceAttributeTccDriver, ///< Cuda only. Whether device is a Tesla device using TCC driver + hipDeviceAttributeTextureAlignment, ///< Alignment requirement for textures + hipDeviceAttributeTexturePitchAlignment, ///< Pitch alignment requirement for 2D texture references bound to pitched memory; + hipDeviceAttributeTotalConstantMemory, ///< Constant memory size in bytes. + hipDeviceAttributeTotalGlobalMem, ///< Global memory available on devicice. + hipDeviceAttributeUnifiedAddressing, ///< Cuda only. An unified address space shared with the host. + hipDeviceAttributeUuid, ///< Cuda only. Unique ID in 16 byte. + hipDeviceAttributeWarpSize, ///< Warp size in threads. + hipDeviceAttributeCudaCompatibleEnd = 9999, + hipDeviceAttributeAmdSpecificBegin = 10000, + hipDeviceAttributeClockInstructionRate = hipDeviceAttributeAmdSpecificBegin, ///< Frequency in khz of the timer used by the device-side "clock*" + hipDeviceAttributeArch, ///< Device architecture + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor, ///< Maximum Shared Memory PerMultiprocessor. + hipDeviceAttributeGcnArch, ///< Device gcn architecture + hipDeviceAttributeGcnArchName, ///< Device gcnArch name in 256 bytes + hipDeviceAttributeHdpMemFlushCntl, ///< Address of the HDP_MEM_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeHdpRegFlushCntl, ///< Address of the HDP_REG_COHERENCY_FLUSH_CNTL register + hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc, ///< Supports cooperative launch on multiple + ///< devices with unmatched functions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim, ///< Supports cooperative launch on multiple + ///< devices with unmatched grid dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim, ///< Supports cooperative launch on multiple + ///< devices with unmatched block dimensions + hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem, ///< Supports cooperative launch on multiple + ///< devices with unmatched shared memories + hipDeviceAttributeIsLargeBar, ///< Whether it is LargeBar + hipDeviceAttributeAsicRevision, ///< Revision of the GPU in this device + hipDeviceAttributeCanUseStreamWaitValue, ///< '1' if Device supports hipStreamWaitValue32() and + ///< hipStreamWaitValue64() , '0' otherwise. + hipDeviceAttributeAmdSpecificEnd = 19999, + hipDeviceAttributeVendorSpecificBegin = 20000, + // Extended attributes for vendors +} hipDeviceAttribute_t; + +typedef struct HIPdevprop_st { + int maxThreadsPerBlock; + int maxThreadsDim[3]; + int maxGridSize[3]; + int sharedMemPerBlock; + int totalConstantMemory; + int SIMDWidth; + int memPitch; + int regsPerBlock; + int clockRate; + int textureAlign; +} HIPdevprop; + +typedef struct { + // 32-bit Atomics + unsigned hasGlobalInt32Atomics : 1; ///< 32-bit integer atomics for global memory. + unsigned hasGlobalFloatAtomicExch : 1; ///< 32-bit float atomic exch for global memory. + unsigned hasSharedInt32Atomics : 1; ///< 32-bit integer atomics for shared memory. + unsigned hasSharedFloatAtomicExch : 1; ///< 32-bit float atomic exch for shared memory. + unsigned hasFloatAtomicAdd : 1; ///< 32-bit float atomic add in global and shared memory. + + // 64-bit Atomics + unsigned hasGlobalInt64Atomics : 1; ///< 64-bit integer atomics for global memory. + unsigned hasSharedInt64Atomics : 1; ///< 64-bit integer atomics for shared memory. + + // Doubles + unsigned hasDoubles : 1; ///< Double-precision floating point. + + // Warp cross-lane operations + unsigned hasWarpVote : 1; ///< Warp vote instructions (__any, __all). + unsigned hasWarpBallot : 1; ///< Warp ballot instructions (__ballot). + unsigned hasWarpShuffle : 1; ///< Warp shuffle operations. (__shfl_*). + unsigned hasFunnelShift : 1; ///< Funnel two words into one with shift&mask caps. + + // Sync + unsigned hasThreadFenceSystem : 1; ///< __threadfence_system. + unsigned hasSyncThreadsExt : 1; ///< __syncthreads_count, syncthreads_and, syncthreads_or. + + // Misc + unsigned hasSurfaceFuncs : 1; ///< Surface functions. + unsigned has3dGrid : 1; ///< Grid and group dims are 3D (rather than 2D). + unsigned hasDynamicParallelism : 1; ///< Dynamic parallelism. +} hipDeviceArch_t; + +typedef struct hipDeviceProp_t { + char name[256]; ///< Device name. + size_t totalGlobalMem; ///< Size of global memory region (in bytes). + size_t sharedMemPerBlock; ///< Size of shared memory region (in bytes). + int regsPerBlock; ///< Registers per block. + int warpSize; ///< Warp size. + int maxThreadsPerBlock; ///< Max work items per work group or workgroup max size. + int maxThreadsDim[3]; ///< Max number of threads in each dimension (XYZ) of a block. + int maxGridSize[3]; ///< Max grid dimensions (XYZ). + int clockRate; ///< Max clock frequency of the multiProcessors in khz. + int memoryClockRate; ///< Max global memory clock frequency in khz. + int memoryBusWidth; ///< Global memory bus width in bits. + size_t totalConstMem; ///< Size of shared memory region (in bytes). + int major; ///< Major compute capability. On HCC, this is an approximation and features may + ///< differ from CUDA CC. See the arch feature flags for portable ways to query + ///< feature caps. + int minor; ///< Minor compute capability. On HCC, this is an approximation and features may + ///< differ from CUDA CC. See the arch feature flags for portable ways to query + ///< feature caps. + int multiProcessorCount; ///< Number of multi-processors (compute units). + int l2CacheSize; ///< L2 cache size. + int maxThreadsPerMultiProcessor; ///< Maximum resident threads per multi-processor. + int computeMode; ///< Compute mode. + int clockInstructionRate; ///< Frequency in khz of the timer used by the device-side "clock*" + ///< instructions. New for HIP. + hipDeviceArch_t arch; ///< Architectural feature flags. New for HIP. + int concurrentKernels; ///< Device can possibly execute multiple kernels concurrently. + int pciDomainID; ///< PCI Domain ID + int pciBusID; ///< PCI Bus ID. + int pciDeviceID; ///< PCI Device ID. + size_t maxSharedMemoryPerMultiProcessor; ///< Maximum Shared Memory Per Multiprocessor. + int isMultiGpuBoard; ///< 1 if device is on a multi-GPU board, 0 if not. + int canMapHostMemory; ///< Check whether HIP can map host memory + int gcnArch; ///< DEPRECATED: use gcnArchName instead + char gcnArchName[256]; ///< AMD GCN Arch Name. + int integrated; ///< APU vs dGPU + int cooperativeLaunch; ///< HIP device supports cooperative launch + int cooperativeMultiDeviceLaunch; ///< HIP device supports cooperative launch on multiple devices + int maxTexture1DLinear; ///< Maximum size for 1D textures bound to linear memory + int maxTexture1D; ///< Maximum number of elements in 1D images + int maxTexture2D[2]; ///< Maximum dimensions (width, height) of 2D images, in image elements + int maxTexture3D[3]; ///< Maximum dimensions (width, height, depth) of 3D images, in image elements + unsigned int* hdpMemFlushCntl; ///< Addres of HDP_MEM_COHERENCY_FLUSH_CNTL register + unsigned int* hdpRegFlushCntl; ///< Addres of HDP_REG_COHERENCY_FLUSH_CNTL register + size_t memPitch; ///