matmul_nbits: Use GPU_WARP_SIZE_HOST for host-side code

On ROCm devices, host-side code must use GPU_WARP_SIZE_HOST, rather than kWarpSize, to query the warp size of the underlying GPU device.
ROCm · Sep 10, 2024 · d7e1c61 · d7e1c61
1 parent eaa26ae
commit d7e1c61
Showing 1 changed file with 1 addition and 1 deletion.
diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu
@@ -289,7 +289,7 @@ bool TryMatMul4Bits(
     return false;
   }
   dim3 blocks((n + kColsPerThreadBlock - 1) / kColsPerThreadBlock, m);
-  dim3 threads(kWarpSize, kColsPerThreadBlock);
+  dim3 threads(GPU_WARP_SIZE_HOST, kColsPerThreadBlock);
   int blocks_per_K = (k + block_size - 1) / block_size;
   int shared_mem_size = sizeof(T) * blocks_per_K * kColsPerThreadBlock +
                         (zero_points != nullptr ? (blocks_per_K + 1) / 2 * kColsPerThreadBlock * 2 : 0);