diff --git a/bin/hipify-perl b/bin/hipify-perl index e9c09657..275bfb53 100755 --- a/bin/hipify-perl +++ b/bin/hipify-perl @@ -1562,6 +1562,7 @@ sub rocSubstitutions { subst("cublasCgemvBatched", "rocblas_cgemv_batched", "library"); subst("cublasCgemvBatched_64", "rocblas_cgemv_batched_64", "library"); subst("cublasCgemvStridedBatched", "rocblas_cgemv_strided_batched", "library"); + subst("cublasCgemvStridedBatched_64", "rocblas_cgemv_strided_batched_64", "library"); subst("cublasCgemv_64", "rocblas_cgemv_64", "library"); subst("cublasCgemv_v2", "rocblas_cgemv", "library"); subst("cublasCgemv_v2_64", "rocblas_cgemv_64", "library"); @@ -1675,6 +1676,8 @@ sub rocSubstitutions { subst("cublasDgemv", "rocblas_dgemv", "library"); subst("cublasDgemvBatched", "rocblas_dgemv_batched", "library"); subst("cublasDgemvBatched_64", "rocblas_dgemv_batched_64", "library"); + subst("cublasDgemvStridedBatched", "rocblas_dgemv_strided_batched", "library"); + subst("cublasDgemvStridedBatched_64", "rocblas_dgemv_strided_batched_64", "library"); subst("cublasDgemv_64", "rocblas_dgemv_64", "library"); subst("cublasDgemv_v2", "rocblas_dgemv", "library"); subst("cublasDgemv_v2_64", "rocblas_dgemv_64", "library"); @@ -1771,9 +1774,11 @@ sub rocSubstitutions { subst("cublasHSHgemvBatched", "rocblas_hshgemv_batched", "library"); subst("cublasHSHgemvBatched_64", "rocblas_hshgemv_batched_64", "library"); subst("cublasHSHgemvStridedBatched", "rocblas_hshgemv_strided_batched", "library"); + subst("cublasHSHgemvStridedBatched_64", "rocblas_hshgemv_strided_batched_64", "library"); subst("cublasHSSgemvBatched", "rocblas_hssgemv_batched", "library"); subst("cublasHSSgemvBatched_64", "rocblas_hssgemv_batched_64", "library"); subst("cublasHSSgemvStridedBatched", "rocblas_hssgemv_strided_batched", "library"); + subst("cublasHSSgemvStridedBatched_64", "rocblas_hssgemv_strided_batched_64", "library"); subst("cublasHgemm", "rocblas_hgemm", "library"); subst("cublasHgemmBatched", "rocblas_hgemm_batched", "library"); subst("cublasHgemmStridedBatched", "rocblas_hgemm_strided_batched", "library"); @@ -1863,6 +1868,8 @@ sub rocSubstitutions { subst("cublasSgemv", "rocblas_sgemv", "library"); subst("cublasSgemvBatched", "rocblas_sgemv_batched", "library"); subst("cublasSgemvBatched_64", "rocblas_sgemv_batched_64", "library"); + subst("cublasSgemvStridedBatched", "rocblas_sgemv_strided_batched", "library"); + subst("cublasSgemvStridedBatched_64", "rocblas_sgemv_strided_batched_64", "library"); subst("cublasSgemv_64", "rocblas_sgemv_64", "library"); subst("cublasSgemv_v2", "rocblas_sgemv", "library"); subst("cublasSgemv_v2_64", "rocblas_sgemv_64", "library"); @@ -1933,9 +1940,11 @@ sub rocSubstitutions { subst("cublasTSSgemvBatched", "rocblas_tssgemv_batched", "library"); subst("cublasTSSgemvBatched_64", "rocblas_tssgemv_batched_64", "library"); subst("cublasTSSgemvStridedBatched", "rocblas_tssgemv_strided_batched", "library"); + subst("cublasTSSgemvStridedBatched_64", "rocblas_tssgemv_strided_batched_64", "library"); subst("cublasTSTgemvBatched", "rocblas_tstgemv_batched", "library"); subst("cublasTSTgemvBatched_64", "rocblas_tstgemv_batched_64", "library"); subst("cublasTSTgemvStridedBatched", "rocblas_tstgemv_strided_batched", "library"); + subst("cublasTSTgemvStridedBatched_64", "rocblas_tstgemv_strided_batched_64", "library"); subst("cublasZaxpy", "rocblas_zaxpy", "library"); subst("cublasZaxpy_64", "rocblas_zaxpy_64", "library"); subst("cublasZaxpy_v2", "rocblas_zaxpy", "library"); @@ -1974,6 +1983,7 @@ sub rocSubstitutions { subst("cublasZgemvBatched", "rocblas_zgemv_batched", "library"); subst("cublasZgemvBatched_64", "rocblas_zgemv_batched_64", "library"); subst("cublasZgemvStridedBatched", "rocblas_zgemv_strided_batched", "library"); + subst("cublasZgemvStridedBatched_64", "rocblas_zgemv_strided_batched_64", "library"); subst("cublasZgemv_64", "rocblas_zgemv_64", "library"); subst("cublasZgemv_v2", "rocblas_zgemv", "library"); subst("cublasZgemv_v2_64", "rocblas_zgemv_64", "library"); @@ -12505,7 +12515,6 @@ sub warnRocOnlyUnsupportedFunctions { "cublasZgerc_v2_64", "cublasZgerc_64", "cublasZgeqrfBatched", - "cublasZgemvStridedBatched_64", "cublasZgemm_v2_64", "cublasZgemm_64", "cublasZgemmStridedBatched_64", @@ -12517,8 +12526,6 @@ sub warnRocOnlyUnsupportedFunctions { "cublasZdgmm_64", "cublasXerbla", "cublasUint8gemmBias", - "cublasTSTgemvStridedBatched_64", - "cublasTSSgemvStridedBatched_64", "cublasSwapEx_64", "cublasSwapEx", "cublasStrttp", @@ -12569,8 +12576,6 @@ sub warnRocOnlyUnsupportedFunctions { "cublasSger_v2_64", "cublasSger_64", "cublasSgeqrfBatched", - "cublasSgemvStridedBatched_64", - "cublasSgemvStridedBatched", "cublasSgemm_v2_64", "cublasSgemm_64", "cublasSgemmStridedBatched_64", @@ -12666,8 +12671,6 @@ sub warnRocOnlyUnsupportedFunctions { "cublasHgemm_64", "cublasHgemmStridedBatched_64", "cublasHgemmBatched_64", - "cublasHSSgemvStridedBatched_64", - "cublasHSHgemvStridedBatched_64", "cublasGetVersion_v2", "cublasGetVersion", "cublasGetVector_64", @@ -12733,8 +12736,6 @@ sub warnRocOnlyUnsupportedFunctions { "cublasDger_v2_64", "cublasDger_64", "cublasDgeqrfBatched", - "cublasDgemvStridedBatched_64", - "cublasDgemvStridedBatched", "cublasDgemm_v2_64", "cublasDgemm_64", "cublasDgemmStridedBatched_64", @@ -12816,7 +12817,6 @@ sub warnRocOnlyUnsupportedFunctions { "cublasCgerc_v2_64", "cublasCgerc_64", "cublasCgeqrfBatched", - "cublasCgemvStridedBatched_64", "cublasCgemm_v2_64", "cublasCgemm_64", "cublasCgemmStridedBatched_64", diff --git a/docs/tables/CUBLAS_API_supported_by_HIP_and_ROC.md b/docs/tables/CUBLAS_API_supported_by_HIP_and_ROC.md index 141afe35..7e84ab13 100644 --- a/docs/tables/CUBLAS_API_supported_by_HIP_and_ROC.md +++ b/docs/tables/CUBLAS_API_supported_by_HIP_and_ROC.md @@ -1034,7 +1034,7 @@ |`cublasCgemvBatched`|11.6| | | |`hipblasCgemvBatched_v2`|6.0.0| | | | |`rocblas_cgemv_batched`|3.5.0| | | | | |`cublasCgemvBatched_64`|12.0| | | |`hipblasCgemvBatched_v2_64`|6.2.0| | | | |`rocblas_cgemv_batched_64`|6.2.0| | | | | |`cublasCgemvStridedBatched`|11.6| | | |`hipblasCgemvStridedBatched_v2`|6.0.0| | | | |`rocblas_cgemv_strided_batched`|3.5.0| | | | | -|`cublasCgemvStridedBatched_64`|12.0| | | |`hipblasCgemvStridedBatched_v2_64`|6.2.0| | | | | | | | | | | +|`cublasCgemvStridedBatched_64`|12.0| | | |`hipblasCgemvStridedBatched_v2_64`|6.2.0| | | | |`rocblas_cgemv_strided_batched_64`|6.2.0| | | | | |`cublasChemm`| | | | |`hipblasChemm_v2`|6.0.0| | | | |`rocblas_chemm`|3.5.0| | | | | |`cublasChemm_64`|12.0| | | | | | | | | | | | | | | | |`cublasChemm_v2`| | | | |`hipblasChemm_v2`|6.0.0| | | | |`rocblas_chemm`|3.5.0| | | | | @@ -1083,8 +1083,8 @@ |`cublasDgemm_v2_64`|12.0| | | | | | | | | | | | | | | | |`cublasDgemvBatched`|11.6| | | |`hipblasDgemvBatched`|3.0.0| | | | |`rocblas_dgemv_batched`|3.5.0| | | | | |`cublasDgemvBatched_64`|12.0| | | |`hipblasDgemvBatched_64`|6.2.0| | | | |`rocblas_dgemv_batched_64`|6.2.0| | | | | -|`cublasDgemvStridedBatched`|11.6| | | |`hipblasDgemvStridedBatched`|3.0.0| | | | | | | | | | | -|`cublasDgemvStridedBatched_64`|12.0| | | |`hipblasDgemvStridedBatched_64`|6.2.0| | | | | | | | | | | +|`cublasDgemvStridedBatched`|11.6| | | |`hipblasDgemvStridedBatched`|3.0.0| | | | |`rocblas_dgemv_strided_batched`|3.5.0| | | | | +|`cublasDgemvStridedBatched_64`|12.0| | | |`hipblasDgemvStridedBatched_64`|6.2.0| | | | |`rocblas_dgemv_strided_batched_64`|6.2.0| | | | | |`cublasDsymm`| | | | |`hipblasDsymm`|3.6.0| | | | |`rocblas_dsymm`|3.5.0| | | | | |`cublasDsymm_64`|12.0| | | | | | | | | | | | | | | | |`cublasDsymm_v2`| | | | |`hipblasDsymm`|3.6.0| | | | |`rocblas_dsymm`|3.5.0| | | | | @@ -1112,11 +1112,11 @@ |`cublasHSHgemvBatched`|11.6| | | | | | | | | |`rocblas_hshgemv_batched`|6.0.0| | | | | |`cublasHSHgemvBatched_64`|12.0| | | | | | | | | |`rocblas_hshgemv_batched_64`|6.2.0| | | | | |`cublasHSHgemvStridedBatched`|11.6| | | | | | | | | |`rocblas_hshgemv_strided_batched`|6.0.0| | | | | -|`cublasHSHgemvStridedBatched_64`|12.0| | | | | | | | | | | | | | | | +|`cublasHSHgemvStridedBatched_64`|12.0| | | | | | | | | |`rocblas_hshgemv_strided_batched_64`|6.2.0| | | | | |`cublasHSSgemvBatched`|11.6| | | | | | | | | |`rocblas_hssgemv_batched`|6.0.0| | | | | |`cublasHSSgemvBatched_64`|12.0| | | | | | | | | |`rocblas_hssgemv_batched_64`|6.2.0| | | | | |`cublasHSSgemvStridedBatched`|11.6| | | | | | | | | |`rocblas_hssgemv_strided_batched`|6.0.0| | | | | -|`cublasHSSgemvStridedBatched_64`|12.0| | | | | | | | | | | | | | | | +|`cublasHSSgemvStridedBatched_64`|12.0| | | | | | | | | |`rocblas_hssgemv_strided_batched_64`|6.2.0| | | | | |`cublasHgemm`|7.5| | | |`hipblasHgemm`|1.8.2| | | | |`rocblas_hgemm`|1.5.0| | | | | |`cublasHgemmBatched`|9.0| | | |`hipblasHgemmBatched`|3.0.0| | | | |`rocblas_hgemm_batched`|3.5.0| | | | | |`cublasHgemmBatched_64`|12.0| | | | | | | | | | | | | | | | @@ -1135,8 +1135,8 @@ |`cublasSgemm_v2_64`|12.0| | | | | | | | | | | | | | | | |`cublasSgemvBatched`|11.6| | | |`hipblasSgemvBatched`|1.6.0| | | | |`rocblas_sgemv_batched`|3.5.0| | | | | |`cublasSgemvBatched_64`|12.0| | | |`hipblasSgemvBatched_64`|6.2.0| | | | |`rocblas_sgemv_batched_64`|6.2.0| | | | | -|`cublasSgemvStridedBatched`|11.6| | | |`hipblasSgemvStridedBatched`|3.0.0| | | | | | | | | | | -|`cublasSgemvStridedBatched_64`|12.0| | | |`hipblasSgemvStridedBatched_64`|6.2.0| | | | | | | | | | | +|`cublasSgemvStridedBatched`|11.6| | | |`hipblasSgemvStridedBatched`|3.0.0| | | | |`rocblas_sgemv_strided_batched`|3.5.0| | | | | +|`cublasSgemvStridedBatched_64`|12.0| | | |`hipblasSgemvStridedBatched_64`|6.2.0| | | | |`rocblas_sgemv_strided_batched_64`|6.2.0| | | | | |`cublasSsymm`| | | | |`hipblasSsymm`|3.6.0| | | | |`rocblas_ssymm`|3.5.0| | | | | |`cublasSsymm_64`|12.0| | | | | | | | | | | | | | | | |`cublasSsymm_v2`| | | | |`hipblasSsymm`|3.6.0| | | | |`rocblas_ssymm`|3.5.0| | | | | @@ -1162,11 +1162,11 @@ |`cublasTSSgemvBatched`|11.6| | | | | | | | | |`rocblas_tssgemv_batched`|6.0.0| | | | | |`cublasTSSgemvBatched_64`|12.0| | | | | | | | | |`rocblas_tssgemv_batched_64`|6.2.0| | | | | |`cublasTSSgemvStridedBatched`|11.6| | | | | | | | | |`rocblas_tssgemv_strided_batched`|6.0.0| | | | | -|`cublasTSSgemvStridedBatched_64`|12.0| | | | | | | | | | | | | | | | +|`cublasTSSgemvStridedBatched_64`|12.0| | | | | | | | | |`rocblas_tssgemv_strided_batched_64`|6.2.0| | | | | |`cublasTSTgemvBatched`|11.6| | | | | | | | | |`rocblas_tstgemv_batched`|6.0.0| | | | | |`cublasTSTgemvBatched_64`|12.0| | | | | | | | | |`rocblas_tstgemv_batched_64`|6.2.0| | | | | |`cublasTSTgemvStridedBatched`|11.6| | | | | | | | | |`rocblas_tstgemv_strided_batched`|6.0.0| | | | | -|`cublasTSTgemvStridedBatched_64`|12.0| | | | | | | | | | | | | | | | +|`cublasTSTgemvStridedBatched_64`|12.0| | | | | | | | | |`rocblas_tstgemv_strided_batched_64`|6.2.0| | | | | |`cublasZgemm`| | | | |`hipblasZgemm_v2`|6.0.0| | | | |`rocblas_zgemm`|1.5.0| | | | | |`cublasZgemm3m`|8.0| | | | | | | | | | | | | | | | |`cublasZgemm3m_64`|12.0| | | | | | | | | | | | | | | | @@ -1180,7 +1180,7 @@ |`cublasZgemvBatched`|11.6| | | |`hipblasZgemvBatched_v2`|6.0.0| | | | |`rocblas_zgemv_batched`|3.5.0| | | | | |`cublasZgemvBatched_64`|12.0| | | |`hipblasZgemvBatched_v2_64`|6.2.0| | | | |`rocblas_zgemv_batched_64`|6.2.0| | | | | |`cublasZgemvStridedBatched`|11.6| | | |`hipblasZgemvStridedBatched_v2`|6.0.0| | | | |`rocblas_zgemv_strided_batched`|3.5.0| | | | | -|`cublasZgemvStridedBatched_64`|12.0| | | |`hipblasZgemvStridedBatched_v2_64`|6.2.0| | | | | | | | | | | +|`cublasZgemvStridedBatched_64`|12.0| | | |`hipblasZgemvStridedBatched_v2_64`|6.2.0| | | | |`rocblas_zgemv_strided_batched_64`|6.2.0| | | | | |`cublasZhemm`| | | | |`hipblasZhemm_v2`|6.0.0| | | | |`rocblas_zhemm`|3.5.0| | | | | |`cublasZhemm_64`|12.0| | | | | | | | | | | | | | | | |`cublasZhemm_v2`| | | | |`hipblasZhemm_v2`|6.0.0| | | | |`rocblas_zhemm`|3.5.0| | | | | diff --git a/docs/tables/CUBLAS_API_supported_by_ROC.md b/docs/tables/CUBLAS_API_supported_by_ROC.md index 5c0fa35d..39a847eb 100644 --- a/docs/tables/CUBLAS_API_supported_by_ROC.md +++ b/docs/tables/CUBLAS_API_supported_by_ROC.md @@ -1034,7 +1034,7 @@ |`cublasCgemvBatched`|11.6| | | |`rocblas_cgemv_batched`|3.5.0| | | | | |`cublasCgemvBatched_64`|12.0| | | |`rocblas_cgemv_batched_64`|6.2.0| | | | | |`cublasCgemvStridedBatched`|11.6| | | |`rocblas_cgemv_strided_batched`|3.5.0| | | | | -|`cublasCgemvStridedBatched_64`|12.0| | | | | | | | | | +|`cublasCgemvStridedBatched_64`|12.0| | | |`rocblas_cgemv_strided_batched_64`|6.2.0| | | | | |`cublasChemm`| | | | |`rocblas_chemm`|3.5.0| | | | | |`cublasChemm_64`|12.0| | | | | | | | | | |`cublasChemm_v2`| | | | |`rocblas_chemm`|3.5.0| | | | | @@ -1083,8 +1083,8 @@ |`cublasDgemm_v2_64`|12.0| | | | | | | | | | |`cublasDgemvBatched`|11.6| | | |`rocblas_dgemv_batched`|3.5.0| | | | | |`cublasDgemvBatched_64`|12.0| | | |`rocblas_dgemv_batched_64`|6.2.0| | | | | -|`cublasDgemvStridedBatched`|11.6| | | | | | | | | | -|`cublasDgemvStridedBatched_64`|12.0| | | | | | | | | | +|`cublasDgemvStridedBatched`|11.6| | | |`rocblas_dgemv_strided_batched`|3.5.0| | | | | +|`cublasDgemvStridedBatched_64`|12.0| | | |`rocblas_dgemv_strided_batched_64`|6.2.0| | | | | |`cublasDsymm`| | | | |`rocblas_dsymm`|3.5.0| | | | | |`cublasDsymm_64`|12.0| | | | | | | | | | |`cublasDsymm_v2`| | | | |`rocblas_dsymm`|3.5.0| | | | | @@ -1112,11 +1112,11 @@ |`cublasHSHgemvBatched`|11.6| | | |`rocblas_hshgemv_batched`|6.0.0| | | | | |`cublasHSHgemvBatched_64`|12.0| | | |`rocblas_hshgemv_batched_64`|6.2.0| | | | | |`cublasHSHgemvStridedBatched`|11.6| | | |`rocblas_hshgemv_strided_batched`|6.0.0| | | | | -|`cublasHSHgemvStridedBatched_64`|12.0| | | | | | | | | | +|`cublasHSHgemvStridedBatched_64`|12.0| | | |`rocblas_hshgemv_strided_batched_64`|6.2.0| | | | | |`cublasHSSgemvBatched`|11.6| | | |`rocblas_hssgemv_batched`|6.0.0| | | | | |`cublasHSSgemvBatched_64`|12.0| | | |`rocblas_hssgemv_batched_64`|6.2.0| | | | | |`cublasHSSgemvStridedBatched`|11.6| | | |`rocblas_hssgemv_strided_batched`|6.0.0| | | | | -|`cublasHSSgemvStridedBatched_64`|12.0| | | | | | | | | | +|`cublasHSSgemvStridedBatched_64`|12.0| | | |`rocblas_hssgemv_strided_batched_64`|6.2.0| | | | | |`cublasHgemm`|7.5| | | |`rocblas_hgemm`|1.5.0| | | | | |`cublasHgemmBatched`|9.0| | | |`rocblas_hgemm_batched`|3.5.0| | | | | |`cublasHgemmBatched_64`|12.0| | | | | | | | | | @@ -1135,8 +1135,8 @@ |`cublasSgemm_v2_64`|12.0| | | | | | | | | | |`cublasSgemvBatched`|11.6| | | |`rocblas_sgemv_batched`|3.5.0| | | | | |`cublasSgemvBatched_64`|12.0| | | |`rocblas_sgemv_batched_64`|6.2.0| | | | | -|`cublasSgemvStridedBatched`|11.6| | | | | | | | | | -|`cublasSgemvStridedBatched_64`|12.0| | | | | | | | | | +|`cublasSgemvStridedBatched`|11.6| | | |`rocblas_sgemv_strided_batched`|3.5.0| | | | | +|`cublasSgemvStridedBatched_64`|12.0| | | |`rocblas_sgemv_strided_batched_64`|6.2.0| | | | | |`cublasSsymm`| | | | |`rocblas_ssymm`|3.5.0| | | | | |`cublasSsymm_64`|12.0| | | | | | | | | | |`cublasSsymm_v2`| | | | |`rocblas_ssymm`|3.5.0| | | | | @@ -1162,11 +1162,11 @@ |`cublasTSSgemvBatched`|11.6| | | |`rocblas_tssgemv_batched`|6.0.0| | | | | |`cublasTSSgemvBatched_64`|12.0| | | |`rocblas_tssgemv_batched_64`|6.2.0| | | | | |`cublasTSSgemvStridedBatched`|11.6| | | |`rocblas_tssgemv_strided_batched`|6.0.0| | | | | -|`cublasTSSgemvStridedBatched_64`|12.0| | | | | | | | | | +|`cublasTSSgemvStridedBatched_64`|12.0| | | |`rocblas_tssgemv_strided_batched_64`|6.2.0| | | | | |`cublasTSTgemvBatched`|11.6| | | |`rocblas_tstgemv_batched`|6.0.0| | | | | |`cublasTSTgemvBatched_64`|12.0| | | |`rocblas_tstgemv_batched_64`|6.2.0| | | | | |`cublasTSTgemvStridedBatched`|11.6| | | |`rocblas_tstgemv_strided_batched`|6.0.0| | | | | -|`cublasTSTgemvStridedBatched_64`|12.0| | | | | | | | | | +|`cublasTSTgemvStridedBatched_64`|12.0| | | |`rocblas_tstgemv_strided_batched_64`|6.2.0| | | | | |`cublasZgemm`| | | | |`rocblas_zgemm`|1.5.0| | | | | |`cublasZgemm3m`|8.0| | | | | | | | | | |`cublasZgemm3m_64`|12.0| | | | | | | | | | @@ -1180,7 +1180,7 @@ |`cublasZgemvBatched`|11.6| | | |`rocblas_zgemv_batched`|3.5.0| | | | | |`cublasZgemvBatched_64`|12.0| | | |`rocblas_zgemv_batched_64`|6.2.0| | | | | |`cublasZgemvStridedBatched`|11.6| | | |`rocblas_zgemv_strided_batched`|3.5.0| | | | | -|`cublasZgemvStridedBatched_64`|12.0| | | | | | | | | | +|`cublasZgemvStridedBatched_64`|12.0| | | |`rocblas_zgemv_strided_batched_64`|6.2.0| | | | | |`cublasZhemm`| | | | |`rocblas_zhemm`|3.5.0| | | | | |`cublasZhemm_64`|12.0| | | | | | | | | | |`cublasZhemm_v2`| | | | |`rocblas_zhemm`|3.5.0| | | | | diff --git a/src/CUDA2HIP_BLAS_API_functions.cpp b/src/CUDA2HIP_BLAS_API_functions.cpp index 84ebe0bd..0be724b7 100644 --- a/src/CUDA2HIP_BLAS_API_functions.cpp +++ b/src/CUDA2HIP_BLAS_API_functions.cpp @@ -458,22 +458,22 @@ const std::map CUDA_BLAS_FUNCTION_MAP { {"cublasTSTgemvBatched_64", {"hipblasTSTgemvBatched_64", "rocblas_tstgemv_batched_64", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, {"cublasTSSgemvBatched", {"hipblasTSSgemvBatched", "rocblas_tssgemv_batched", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, {"cublasTSSgemvBatched_64", {"hipblasTSSgemvBatched_64", "rocblas_tssgemv_batched_64", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, - {"cublasSgemvStridedBatched", {"hipblasSgemvStridedBatched", "", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, ROC_UNSUPPORTED}}, - {"cublasSgemvStridedBatched_64", {"hipblasSgemvStridedBatched_64", "", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, ROC_UNSUPPORTED}}, - {"cublasDgemvStridedBatched", {"hipblasDgemvStridedBatched", "", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, ROC_UNSUPPORTED}}, - {"cublasDgemvStridedBatched_64", {"hipblasDgemvStridedBatched_64", "", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, ROC_UNSUPPORTED}}, + {"cublasSgemvStridedBatched", {"hipblasSgemvStridedBatched", "rocblas_sgemv_strided_batched", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3}}, + {"cublasSgemvStridedBatched_64", {"hipblasSgemvStridedBatched_64", "rocblas_sgemv_strided_batched_64", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3}}, + {"cublasDgemvStridedBatched", {"hipblasDgemvStridedBatched", "rocblas_dgemv_strided_batched", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3}}, + {"cublasDgemvStridedBatched_64", {"hipblasDgemvStridedBatched_64", "rocblas_dgemv_strided_batched_64", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3}}, {"cublasCgemvStridedBatched", {"hipblasCgemvStridedBatched_v2", "rocblas_cgemv_strided_batched", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3}}, - {"cublasCgemvStridedBatched_64", {"hipblasCgemvStridedBatched_v2_64", "", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, ROC_UNSUPPORTED}}, + {"cublasCgemvStridedBatched_64", {"hipblasCgemvStridedBatched_v2_64", "rocblas_cgemv_strided_batched_64", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3}}, {"cublasZgemvStridedBatched", {"hipblasZgemvStridedBatched_v2", "rocblas_zgemv_strided_batched", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3}}, - {"cublasZgemvStridedBatched_64", {"hipblasZgemvStridedBatched_v2_64", "", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, ROC_UNSUPPORTED}}, + {"cublasZgemvStridedBatched_64", {"hipblasZgemvStridedBatched_v2_64", "rocblas_zgemv_strided_batched_64", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3}}, {"cublasHSHgemvStridedBatched", {"hipblasHSHgemvStridedBatched", "rocblas_hshgemv_strided_batched", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, - {"cublasHSHgemvStridedBatched_64", {"hipblasHSHgemvStridedBatched_64", "", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, UNSUPPORTED}}, + {"cublasHSHgemvStridedBatched_64", {"hipblasHSHgemvStridedBatched_64", "rocblas_hshgemv_strided_batched_64", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, {"cublasHSSgemvStridedBatched", {"hipblasHSSgemvStridedBatched", "rocblas_hssgemv_strided_batched", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, - {"cublasHSSgemvStridedBatched_64", {"hipblasHSSgemvStridedBatched_64", "", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, UNSUPPORTED}}, + {"cublasHSSgemvStridedBatched_64", {"hipblasHSSgemvStridedBatched_64", "rocblas_hssgemv_strided_batched_64", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, {"cublasTSTgemvStridedBatched", {"hipblasTSTgemvStridedBatched", "rocblas_tstgemv_strided_batched", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, - {"cublasTSTgemvStridedBatched_64", {"hipblasTSTgemvStridedBatched_64", "", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, UNSUPPORTED}}, + {"cublasTSTgemvStridedBatched_64", {"hipblasTSTgemvStridedBatched_64", "rocblas_tstgemv_strided_batched_64", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, {"cublasTSSgemvStridedBatched", {"hipblasTSSgemvStridedBatched", "rocblas_tssgemv_strided_batched", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, - {"cublasTSSgemvStridedBatched_64", {"hipblasTSSgemvStridedBatched_64", "", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, UNSUPPORTED}}, + {"cublasTSSgemvStridedBatched_64", {"hipblasTSSgemvStridedBatched_64", "rocblas_tssgemv_strided_batched_64", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_UNSUPPORTED}}, // SYRK {"cublasSsyrk", {"hipblasSsyrk", "rocblas_ssyrk", CONV_LIB_FUNC, API_BLAS, SEC::BLAS_LEVEL_3, HIP_SUPPORTED_V2_ONLY}}, @@ -2329,6 +2329,16 @@ const std::map HIP_BLAS_FUNCTION_VER_MAP { {"rocblas_hssgemv_batched_64", {HIP_6020, HIP_0, HIP_0 }}, {"rocblas_tstgemv_batched_64", {HIP_6020, HIP_0, HIP_0 }}, {"rocblas_tssgemv_batched_64", {HIP_6020, HIP_0, HIP_0 }}, + {"rocblas_sgemv_strided_batched", {HIP_3050, HIP_0, HIP_0 }}, + {"rocblas_dgemv_strided_batched", {HIP_3050, HIP_0, HIP_0 }}, + {"rocblas_sgemv_strided_batched_64", {HIP_6020, HIP_0, HIP_0 }}, + {"rocblas_dgemv_strided_batched_64", {HIP_6020, HIP_0, HIP_0 }}, + {"rocblas_cgemv_strided_batched_64", {HIP_6020, HIP_0, HIP_0 }}, + {"rocblas_zgemv_strided_batched_64", {HIP_6020, HIP_0, HIP_0 }}, + {"rocblas_hshgemv_strided_batched_64", {HIP_6020, HIP_0, HIP_0 }}, + {"rocblas_hssgemv_strided_batched_64", {HIP_6020, HIP_0, HIP_0 }}, + {"rocblas_tstgemv_strided_batched_64", {HIP_6020, HIP_0, HIP_0 }}, + {"rocblas_tssgemv_strided_batched_64", {HIP_6020, HIP_0, HIP_0 }}, }; const std::map HIP_BLAS_FUNCTION_CHANGED_VER_MAP { diff --git a/tests/unit_tests/synthetic/libraries/cublas2hipblas_v2.cu b/tests/unit_tests/synthetic/libraries/cublas2hipblas_v2.cu index 9e7f7e54..ff3cdbff 100644 --- a/tests/unit_tests/synthetic/libraries/cublas2hipblas_v2.cu +++ b/tests/unit_tests/synthetic/libraries/cublas2hipblas_v2.cu @@ -1651,6 +1651,10 @@ int main() { __half* hc = 0; // CHECK: __half* hC = 0; __half* hC = 0; + // CHECK: __half* hx = 0; + __half* hx = 0; + // CHECK: __half* hy = 0; + __half* hy = 0; // CHECK: __half** hAarray = 0; __half** hAarray = 0; @@ -1848,16 +1852,24 @@ int main() { cublasDataType_t R_16BF = CUDA_R_16BF; cublasDataType_t C_16BF = CUDA_C_16BF; + // CHECK: hip_bfloat16* bf16A = nullptr; + __nv_bfloat16* bf16A = nullptr; // CHECK: hip_bfloat16** bf16Aarray = 0; __nv_bfloat16** bf16Aarray = 0; // CHECK: const hip_bfloat16** const bf16Aarray_const = const_cast(bf16Aarray); const __nv_bfloat16** const bf16Aarray_const = const_cast(bf16Aarray); + // CHECK: hip_bfloat16* bf16X = nullptr; + __nv_bfloat16* bf16X = nullptr; // CHECK: hip_bfloat16** bf16xarray = 0; __nv_bfloat16** bf16xarray = 0; - // CHECK: const hip_bfloat16** const bf16xarray_const = const_cast(bf16xarray_const); - const __nv_bfloat16** const bf16xarray_const = const_cast(bf16xarray_const); + // CHECK: const hip_bfloat16** const bf16xarray_const = const_cast(bf16xarray); + const __nv_bfloat16** const bf16xarray_const = const_cast(bf16xarray); + // CHECK: hip_bfloat16* bf16Y = nullptr; + __nv_bfloat16* bf16Y = nullptr; // CHECK: hip_bfloat16** bf16yarray = 0; __nv_bfloat16** bf16yarray = 0; + // CHECK: const hip_bfloat16** const bf16yarray_const = const_cast(bf16yarray); + const __nv_bfloat16** const bf16yarray_const = const_cast(bf16yarray); // CHECK: hipblasComputeType_t blasComputeType; cublasComputeType_t blasComputeType; @@ -2315,23 +2327,23 @@ int main() { // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, long long int strideA, const float* x, int64_t incx, long long int stridex, const float* beta, float* y, int64_t incy, long long int stridey, int64_t batchCount); // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSgemvStridedBatched_64(hipblasHandle_t handle, hipblasOperation_t transA, int64_t m, int64_t n, const float* alpha, const float* AP, int64_t lda, hipblasStride strideA, const float* x, int64_t incx, hipblasStride stridex, const float* beta, float* y, int64_t incy, hipblasStride stridey, int64_t batchCount); - // CHECK: blasStatus = hipblasSgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, &fA, lda_64, strideA, &fx, incx_64, strideX, &fb, &fy, incy_64, strideY, batchCount); - blasStatus = cublasSgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, &fA, lda_64, strideA, &fx, incx_64, strideX, &fb, &fy, incy_64, strideY, batchCount); + // CHECK: blasStatus = hipblasSgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, &fA, lda_64, strideA, &fx, incx_64, strideX, &fb, &fy, incy_64, strideY, batchCount_64); + blasStatus = cublasSgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, &fA, lda_64, strideA, &fx, incx_64, strideX, &fb, &fy, incy_64, strideY, batchCount_64); // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, long long int strideA, const double* x, int64_t incx, long long int stridex, const double* beta, double* y, int64_t incy, long long int stridey, int64_t batchCount); // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasDgemvStridedBatched_64(hipblasHandle_t handle, hipblasOperation_t transA, int64_t m, int64_t n, const double* alpha, const double* AP, int64_t lda, hipblasStride strideA, const double* x, int64_t incx, hipblasStride stridex, const double* beta, double* y, int64_t incy, hipblasStride stridey, int64_t batchCount); - // CHECK: blasStatus = hipblasDgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &da, &dA, lda_64, strideA, &dx, incx_64, strideX, &db, &dy, incy_64, strideY, batchCount); - blasStatus = cublasDgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &da, &dA, lda_64, strideA, &dx, incx_64, strideX, &db, &dy, incy_64, strideY, batchCount); + // CHECK: blasStatus = hipblasDgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &da, &dA, lda_64, strideA, &dx, incx_64, strideX, &db, &dy, incy_64, strideY, batchCount_64); + blasStatus = cublasDgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &da, &dA, lda_64, strideA, &dx, incx_64, strideX, &db, &dy, incy_64, strideY, batchCount_64); // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* x, int64_t incx, long long int stridex, const cuComplex* beta, cuComplex* y, int64_t incy, long long int stridey, int64_t batchCount); // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasCgemvStridedBatched_v2_64(hipblasHandle_t handle, hipblasOperation_t transA, int64_t m, int64_t n,const hipComplex* alpha, const hipComplex* AP, int64_t lda,hipblasStride strideA, const hipComplex* x, int64_t incx, hipblasStride stridex, const hipComplex* beta, hipComplex* y, int64_t incy, hipblasStride stridey, int64_t batchCount); - // CHECK: blasStatus = hipblasCgemvStridedBatched_v2_64(blasHandle, blasOperation, m_64, n_64, &complexa, &complexA, lda_64, strideA, &complexx, incx_64, strideX, &complexb, &complexy, incy_64, strideY, batchCount); - blasStatus = cublasCgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &complexa, &complexA, lda_64, strideA, &complexx, incx_64, strideX, &complexb, &complexy, incy_64, strideY, batchCount); + // CHECK: blasStatus = hipblasCgemvStridedBatched_v2_64(blasHandle, blasOperation, m_64, n_64, &complexa, &complexA, lda_64, strideA, &complexx, incx_64, strideX, &complexb, &complexy, incy_64, strideY, batchCount_64); + blasStatus = cublasCgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &complexa, &complexA, lda_64, strideA, &complexx, incx_64, strideX, &complexb, &complexy, incy_64, strideY, batchCount_64); // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* x, int64_t incx, long long int stridex, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy, long long int stridey, int64_t batchCount); // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasZgemvStridedBatched_v2_64(hipblasHandle_t handle, hipblasOperation_t transA, int64_t m, int64_t n, const hipDoubleComplex* alpha, const hipDoubleComplex* AP, int64_t lda, hipblasStride strideA, const hipDoubleComplex* x, int64_t incx, hipblasStride stridex, const hipDoubleComplex* beta, hipDoubleComplex* y, int64_t incy, hipblasStride stridey, int64_t batchCount); - // CHECK: blasStatus = hipblasZgemvStridedBatched_v2_64(blasHandle, blasOperation, m_64, n_64, &dcomplexa, &dcomplexA, lda_64, strideA, &dcomplexx, incx_64, strideX, &dcomplexb, &dcomplexy, incy_64, strideY, batchCount); - blasStatus = cublasZgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &dcomplexa, &dcomplexA, lda_64, strideA, &dcomplexx, incx_64, strideX, &dcomplexb, &dcomplexy, incy_64, strideY, batchCount); + // CHECK: blasStatus = hipblasZgemvStridedBatched_v2_64(blasHandle, blasOperation, m_64, n_64, &dcomplexa, &dcomplexA, lda_64, strideA, &dcomplexx, incx_64, strideX, &dcomplexb, &dcomplexy, incy_64, strideY, batchCount_64); + blasStatus = cublasZgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &dcomplexa, &dcomplexA, lda_64, strideA, &dcomplexx, incx_64, strideX, &dcomplexb, &dcomplexy, incy_64, strideY, batchCount_64); // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSger_v2_64(cublasHandle_t handle, int64_t m, int64_t n, const float* alpha, const float* x, int64_t incx, const float* y, int64_t incy, float* A, int64_t lda); // HIP: HIPBLAS_EXPORT hipblasStatus_t hipblasSger_64(hipblasHandle_t handle, int64_t m, int64_t n, const float* alpha, const float* x, int64_t incx, const float* y, int64_t incy, float* AP, int64_t lda); diff --git a/tests/unit_tests/synthetic/libraries/cublas2rocblas_v2.cu b/tests/unit_tests/synthetic/libraries/cublas2rocblas_v2.cu index 1ffb03fb..90d9e515 100644 --- a/tests/unit_tests/synthetic/libraries/cublas2rocblas_v2.cu +++ b/tests/unit_tests/synthetic/libraries/cublas2rocblas_v2.cu @@ -1731,6 +1731,8 @@ int main() { long long int strideA = 0; long long int strideB = 0; long long int strideC = 0; + long long int strideX = 0; + long long int strideY = 0; #if CUDA_VERSION >= 7050 // CHECK: rocblas_half* ha = 0; @@ -1745,6 +1747,10 @@ int main() { __half* hc = 0; // CHECK: rocblas_half* hC = 0; __half* hC = 0; + // CHECK: rocblas_half* hx = 0; + __half* hx = 0; + // CHECK: rocblas_half* hy = 0; + __half* hy = 0; // CHECK: rocblas_half** hAarray = 0; __half** hAarray = 0; @@ -1958,17 +1964,24 @@ int main() { cublasDataType_t R_16BF = CUDA_R_16BF; cublasDataType_t C_16BF = CUDA_C_16BF; - // CHECK: rocblas_bfloat16** bfAarray = 0; - __nv_bfloat16** bfAarray = 0; - // CHECK: const rocblas_bfloat16** const bfAarray_const = const_cast(bfAarray); - const __nv_bfloat16** const bfAarray_const = const_cast(bfAarray); - // CHECK: rocblas_bfloat16** bfXarray = 0; - __nv_bfloat16** bfXarray = 0; - // CHECK: const rocblas_bfloat16** const bfXarray_const = const_cast(bfXarray); - const __nv_bfloat16** const bfXarray_const = const_cast(bfXarray); - __nv_bfloat16** bfYarray = 0; - // CHECK: const rocblas_bfloat16** const bfYarray_const = const_cast(bfYarray); - const __nv_bfloat16** const bfYarray_const = const_cast(bfYarray); + // CHECK: rocblas_bfloat16* bf16A = nullptr; + __nv_bfloat16* bf16A = nullptr; + // CHECK: rocblas_bfloat16** bf16Aarray = 0; + __nv_bfloat16** bf16Aarray = 0; + // CHECK: const rocblas_bfloat16** const bf16Aarray_const = const_cast(bf16Aarray); + const __nv_bfloat16** const bf16Aarray_const = const_cast(bf16Aarray); + // CHECK: rocblas_bfloat16* bf16X = nullptr; + __nv_bfloat16* bf16X = nullptr; + // CHECK: rocblas_bfloat16** bf16xarray = 0; + __nv_bfloat16** bf16xarray = 0; + // CHECK: const rocblas_bfloat16** const bf16xarray_const = const_cast(bf16xarray); + const __nv_bfloat16** const bf16xarray_const = const_cast(bf16xarray); + // CHECK: rocblas_bfloat16* bf16Y = nullptr; + __nv_bfloat16* bf16Y = nullptr; + // CHECK: rocblas_bfloat16** bf16yarray = 0; + __nv_bfloat16** bf16yarray = 0; + // CHECK: const rocblas_bfloat16** const bf16yarray_const = const_cast(bf16yarray); + const __nv_bfloat16** const bf16yarray_const = const_cast(bf16yarray); #endif #if CUDA_VERSION >= 11040 && CUBLAS_VERSION >= 11600 @@ -1988,6 +2001,36 @@ int main() { // ROC: ROCBLAS_EXPORT rocblas_status rocblas_dgemv_batched(rocblas_handle handle, rocblas_operation trans, rocblas_int m, rocblas_int n, const double* alpha, const double* const A[], rocblas_int lda, const double* const x[], rocblas_int incx, const double* beta, double* const y[], rocblas_int incy, rocblas_int batch_count); // CHECK: blasStatus = rocblas_dgemv_batched(blasHandle, blasOperation, m, n, &da, dAarray_const, lda, dXarray_const, incx, &db, dYarray, incy, batchCount); blasStatus = cublasDgemvBatched(blasHandle, blasOperation, m, n, &da, dAarray_const, lda, dXarray_const, incx, &db, dYarray, incy, batchCount); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuComplex* alpha, const cuComplex* const Aarray[], int lda, const cuComplex* const xarray[], int incx, const cuComplex* beta, cuComplex* const yarray[], int incy, int batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_cgemv_batched(rocblas_handle handle, rocblas_operation trans, rocblas_int m, rocblas_int n, const rocblas_float_complex* alpha, const rocblas_float_complex* const A[], rocblas_int lda, const rocblas_float_complex* const x[], rocblas_int incx, const rocblas_float_complex* beta, rocblas_float_complex* const y[], rocblas_int incy, rocblas_int batch_count); + // CHECK: blasStatus = rocblas_cgemv_batched(blasHandle, blasOperation, m, n, &complexa, complexAarray_const, lda, complexXarray_const, incx, &complexb, complexYarray, incy, batchCount); + blasStatus = cublasCgemvBatched(blasHandle, blasOperation, m, n, &complexa, complexAarray_const, lda, complexXarray_const, incx, &complexb, complexYarray, incy, batchCount); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* const Aarray[], int lda, const cuDoubleComplex* const xarray[], int incx, const cuDoubleComplex* beta, cuDoubleComplex* const yarray[], int incy, int batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_zgemv_batched(rocblas_handle handle, rocblas_operation trans, rocblas_int m, rocblas_int n, const rocblas_double_complex* alpha, const rocblas_double_complex* const A[], rocblas_int lda, const rocblas_double_complex* const x[], rocblas_int incx, const rocblas_double_complex* beta, rocblas_double_complex* const y[], rocblas_int incy, rocblas_int batch_count); + // CHECK: blasStatus = rocblas_zgemv_batched(blasHandle, blasOperation, m, n, &dcomplexa, dcomplexAarray_const, lda, dcomplexXarray_const, incx, &dcomplexb, dcomplexYarray, incy, batchCount); + blasStatus = cublasZgemvBatched(blasHandle, blasOperation, m, n, &dcomplexa, dcomplexAarray_const, lda, dcomplexXarray_const, incx, &dcomplexb, dcomplexYarray, incy, batchCount); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const float* alpha, const float* A, int lda, long long int strideA, const float* x, int incx, long long int stridex, const float* beta, float* y, int incy, long long int stridey, int batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_sgemv_strided_batched(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n, const float* alpha, const float* A, rocblas_int lda, rocblas_stride strideA, const float* x, rocblas_int incx, rocblas_stride stridex, const float* beta, float* y, rocblas_int incy, rocblas_stride stridey, rocblas_int batch_count); + // CHECK: blasStatus = rocblas_sgemv_strided_batched(blasHandle, blasOperation, m, n, &fa, &fA, lda, strideA, &fx, incx, strideX, &fb, &fy, incy, strideY, batchCount); + blasStatus = cublasSgemvStridedBatched(blasHandle, blasOperation, m, n, &fa, &fA, lda, strideA, &fx, incx, strideX, &fb, &fy, incy, strideY, batchCount); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const double* alpha, const double* A, int lda, long long int strideA, const double* x, int incx, long long int stridex, const double* beta, double* y, int incy, long long int stridey, int batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_dgemv_strided_batched(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n, const double* alpha, const double* A, rocblas_int lda, rocblas_stride strideA, const double* x, rocblas_int incx, rocblas_stride stridex, const double* beta, double* y, rocblas_int incy, rocblas_stride stridey, rocblas_int batch_count); + // CHECK: blasStatus = rocblas_dgemv_strided_batched(blasHandle, blasOperation, m, n, &da, &dA, lda, strideA, &dx, incx, strideX, &db, &dy, incy, strideY, batchCount); + blasStatus = cublasDgemvStridedBatched(blasHandle, blasOperation, m, n, &da, &dA, lda, strideA, &dx, incx, strideX, &db, &dy, incy, strideY, batchCount); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuComplex* alpha, const cuComplex* A, int lda, long long int strideA, const cuComplex* x, int incx, long long int stridex, const cuComplex* beta, cuComplex* y, int incy, long long int stridey, int batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_cgemv_strided_batched(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n, const rocblas_float_complex* alpha, const rocblas_float_complex* A, rocblas_int lda, rocblas_stride strideA, const rocblas_float_complex* x, rocblas_int incx, rocblas_stride stridex, const rocblas_float_complex* beta, rocblas_float_complex* y, rocblas_int incy, rocblas_stride stridey, rocblas_int batch_count); + // CHECK: blasStatus = rocblas_cgemv_strided_batched(blasHandle, blasOperation, m, n, &complexa, &complexA, lda, strideA, &complexx, incx, strideX, &complexb, &complexy, incy, strideY, batchCount); + blasStatus = cublasCgemvStridedBatched(blasHandle, blasOperation, m, n, &complexa, &complexA, lda, strideA, &complexx, incx, strideX, &complexb, &complexy, incy, strideY, batchCount); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvStridedBatched(cublasHandle_t handle, cublasOperation_t trans, int m, int n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int lda, long long int strideA, const cuDoubleComplex* x, int incx, long long int stridex, const cuDoubleComplex* beta, cuDoubleComplex* y, int incy, long long int stridey, int batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_zgemv_strided_batched(rocblas_handle handle, rocblas_operation transA, rocblas_int m, rocblas_int n, const rocblas_double_complex* alpha, const rocblas_double_complex* A, rocblas_int lda, rocblas_stride strideA, const rocblas_double_complex* x, rocblas_int incx, rocblas_stride stridex, const rocblas_double_complex* beta, rocblas_double_complex* y, rocblas_int incy, rocblas_stride stridey, rocblas_int batch_count); + // CHECK: blasStatus = rocblas_zgemv_strided_batched(blasHandle, blasOperation, m, n, &dcomplexa, &dcomplexA, lda, strideA, &dcomplexx, incx, strideX, &dcomplexb, &dcomplexy, incy, strideY, batchCount); + blasStatus = cublasZgemvStridedBatched(blasHandle, blasOperation, m, n, &dcomplexa, &dcomplexA, lda, strideA, &dcomplexx, incx, strideX, &dcomplexb, &dcomplexy, incy, strideY, batchCount); #endif #if CUDA_VERSION >= 12000 @@ -2440,13 +2483,53 @@ int main() { // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __nv_bfloat16* const Aarray[], int64_t lda, const __nv_bfloat16* const xarray[], int64_t incx, const float* beta, __nv_bfloat16* const yarray[], int64_t incy, int64_t batchCount); // ROC: ROCBLAS_EXPORT rocblas_status rocblas_tstgemv_batched_64(rocblas_handle handle, rocblas_operation trans, int64_t m, int64_t n, const float* alpha, const rocblas_bfloat16* const A[], int64_t lda, const rocblas_bfloat16* const x[], int64_t incx, const float* beta, rocblas_bfloat16* const y[], int64_t incy, int64_t batch_count); - // CHECK: blasStatus = rocblas_tstgemv_batched_64(blasHandle, blasOperation, m_64, n_64, &fa, bfAarray_const, lda_64, bfXarray_const, incx_64, &fb, bfYarray, incy_64, batchCount_64); - blasStatus = cublasTSTgemvBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, bfAarray_const, lda_64, bfXarray_const, incx_64, &fb, bfYarray, incy_64, batchCount_64); + // CHECK: blasStatus = rocblas_tstgemv_batched_64(blasHandle, blasOperation, m_64, n_64, &fa, bf16Aarray_const, lda_64, bf16xarray_const, incx_64, &fb, bf16yarray, incy_64, batchCount_64); + blasStatus = cublasTSTgemvBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, bf16Aarray_const, lda_64, bf16xarray_const, incx_64, &fb, bf16yarray, incy_64, batchCount_64); // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __nv_bfloat16* const Aarray[], int64_t lda, const __nv_bfloat16* const xarray[], int64_t incx, const float* beta, float* const yarray[], int64_t incy, int64_t batchCount); // ROC: ROCBLAS_EXPORT rocblas_status rocblas_tssgemv_batched_64(rocblas_handle handle, rocblas_operation trans, int64_t m, int64_t n, const float* alpha, const rocblas_bfloat16* const A[], int64_t lda, const rocblas_bfloat16* const x[], int64_t incx, const float* beta, float* const y[], int64_t incy, int64_t batch_count); - // CHECK: blasStatus = rocblas_tssgemv_batched_64(blasHandle, blasOperation, m_64, n_64, &fa, bfAarray_const, lda_64, bfXarray_const, incx_64, &fb, fYarray, incy_64, batchCount_64); - blasStatus = cublasTSSgemvBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, bfAarray_const, lda_64, bfXarray_const, incx_64, &fb, fYarray, incy_64, batchCount_64); + // CHECK: blasStatus = rocblas_tssgemv_batched_64(blasHandle, blasOperation, m_64, n_64, &fa, bf16Aarray_const, lda_64, bf16xarray_const, incx_64, &fb, fYarray, incy_64, batchCount_64); + blasStatus = cublasTSSgemvBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, bf16Aarray_const, lda_64, bf16xarray_const, incx_64, &fb, fYarray, incy_64, batchCount_64); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, long long int strideA, const float* x, int64_t incx, long long int stridex, const float* beta, float* y, int64_t incy, long long int stridey, int64_t batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_sgemv_strided_batched_64(rocblas_handle handle, rocblas_operation transA, int64_t m, int64_t n, const float* alpha, const float* A, int64_t lda, rocblas_stride strideA, const float* x, int64_t incx, rocblas_stride stridex, const float* beta, float* y, int64_t incy, rocblas_stride stridey, int64_t batch_count); + // CHECK: blasStatus = rocblas_sgemv_strided_batched_64(blasHandle, blasOperation, m_64, n_64, &fa, &fA, lda_64, strideA, &fx, incx_64, strideX, &fb, &fy, incy_64, strideY, batchCount_64); + blasStatus = cublasSgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, &fA, lda_64, strideA, &fx, incx_64, strideX, &fb, &fy, incy_64, strideY, batchCount_64); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, long long int strideA, const double* x, int64_t incx, long long int stridex, const double* beta, double* y, int64_t incy, long long int stridey, int64_t batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_dgemv_strided_batched_64(rocblas_handle handle, rocblas_operation transA, int64_t m, int64_t n, const double* alpha, const double* A, int64_t lda, rocblas_stride strideA, const double* x, int64_t incx, rocblas_stride stridex, const double* beta, double* y, int64_t incy, rocblas_stride stridey, int64_t batch_count); + // CHECK: blasStatus = rocblas_dgemv_strided_batched_64(blasHandle, blasOperation, m_64, n_64, &da, &dA, lda_64, strideA, &dx, incx_64, strideX, &db, &dy, incy_64, strideY, batchCount_64); + blasStatus = cublasDgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &da, &dA, lda_64, strideA, &dx, incx_64, strideX, &db, &dy, incy_64, strideY, batchCount_64); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuComplex* alpha, const cuComplex* A, int64_t lda, long long int strideA, const cuComplex* x, int64_t incx, long long int stridex, const cuComplex* beta, cuComplex* y, int64_t incy, long long int stridey, int64_t batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_cgemv_strided_batched_64(rocblas_handle handle, rocblas_operation transA, int64_t m, int64_t n, const rocblas_float_complex* alpha, const rocblas_float_complex* A, int64_t lda, rocblas_stride strideA, const rocblas_float_complex* x, int64_t incx, rocblas_stride stridex, const rocblas_float_complex* beta, rocblas_float_complex* y, int64_t incy, rocblas_stride stridey, int64_t batch_count); + // CHECK: blasStatus = rocblas_cgemv_strided_batched_64(blasHandle, blasOperation, m_64, n_64, &complexa, &complexA, lda_64, strideA, &complexx, incx_64, strideX, &complexb, &complexy, incy_64, strideY, batchCount_64); + blasStatus = cublasCgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &complexa, &complexA, lda_64, strideA, &complexx, incx_64, strideX, &complexb, &complexy, incy_64, strideY, batchCount_64); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const cuDoubleComplex* alpha, const cuDoubleComplex* A, int64_t lda, long long int strideA, const cuDoubleComplex* x, int64_t incx, long long int stridex, const cuDoubleComplex* beta, cuDoubleComplex* y, int64_t incy, long long int stridey, int64_t batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_zgemv_strided_batched_64(rocblas_handle handle, rocblas_operation transA, int64_t m, int64_t n, const rocblas_double_complex* alpha, const rocblas_double_complex* A, int64_t lda, rocblas_stride strideA, const rocblas_double_complex* x, int64_t incx, rocblas_stride stridex, const rocblas_double_complex* beta, rocblas_double_complex* y, int64_t incy, rocblas_stride stridey, int64_t batch_count); + // CHECK: blasStatus = rocblas_zgemv_strided_batched_64(blasHandle, blasOperation, m_64, n_64, &dcomplexa, &dcomplexA, lda_64, strideA, &dcomplexx, incx_64, strideX, &dcomplexb, &dcomplexy, incy_64, strideY, batchCount_64); + blasStatus = cublasZgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &dcomplexa, &dcomplexA, lda_64, strideA, &dcomplexx, incx_64, strideX, &dcomplexb, &dcomplexy, incy_64, strideY, batchCount_64); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __half* A, int64_t lda, long long int strideA, const __half* x, int64_t incx, long long int stridex, const float* beta, __half* y, int64_t incy, long long int stridey, int64_t batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_hshgemv_strided_batched_64(rocblas_handle handle, rocblas_operation transA, int64_t m, int64_t n, const float* alpha, const rocblas_half* A, int64_t lda, rocblas_stride strideA, const rocblas_half* x, int64_t incx, rocblas_stride stridex, const float* beta, rocblas_half* y, int64_t incy, rocblas_stride stridey, int64_t batch_count); + // CHECK: blasStatus = rocblas_hshgemv_strided_batched_64(blasHandle, blasOperation, m_64, n_64, &fa, ha, lda_64, strideA, hx, incx_64, strideX, &fb, hy, incy_64, strideY, batchCount_64); + blasStatus = cublasHSHgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, ha, lda_64, strideA, hx, incx_64, strideX, &fb, hy, incy_64, strideY, batchCount_64); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __half* A, int64_t lda, long long int strideA, const __half* x, int64_t incx, long long int stridex, const float* beta, float* y, int64_t incy, long long int stridey, int64_t batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_hssgemv_strided_batched_64(rocblas_handle handle, rocblas_operation transA, int64_t m, int64_t n, const float* alpha, const rocblas_half* A, int64_t lda, rocblas_stride strideA, const rocblas_half* x, int64_t incx, rocblas_stride stridex, const float* beta, float* y, int64_t incy, rocblas_stride stridey, int64_t batch_count); + // CHECK: blasStatus = rocblas_hssgemv_strided_batched_64(blasHandle, blasOperation, m_64, n_64, &fa, ha, lda_64, strideA, hx, incx_64, strideX, &fb, &fy, incy_64, strideY, batchCount_64); + blasStatus = cublasHSSgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, ha, lda_64, strideA, hx, incx_64, strideX, &fb, &fy, incy_64, strideY, batchCount_64); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __nv_bfloat16* A, int64_t lda, long long int strideA, const __nv_bfloat16* x, int64_t incx, long long int stridex, const float* beta, __nv_bfloat16* y, int64_t incy, long long int stridey, int64_t batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_tstgemv_strided_batched_64(rocblas_handle handle, rocblas_operation transA, int64_t m, int64_t n, const float* alpha, const rocblas_bfloat16* A, int64_t lda, rocblas_stride strideA, const rocblas_bfloat16* x, int64_t incx, rocblas_stride stridex, const float* beta, rocblas_bfloat16* y, int64_t incy, rocblas_stride stridey, int64_t batch_count); + // CHECK: blasStatus = rocblas_tstgemv_strided_batched_64(blasHandle, blasOperation, m_64, n_64, &fa, bf16A, lda_64, strideA, bf16X, incx_64, strideX, &fb, bf16Y, incy_64, strideY, batchCount_64); + blasStatus = cublasTSTgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, bf16A, lda_64, strideA, bf16X, incx_64, strideX, &fb, bf16Y, incy_64, strideY, batchCount_64); + + // CUDA: CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvStridedBatched_64(cublasHandle_t handle, cublasOperation_t trans, int64_t m, int64_t n, const float* alpha, const __nv_bfloat16* A, int64_t lda, long long int strideA, const __nv_bfloat16* x, int64_t incx, long long int stridex, const float* beta, float* y, int64_t incy, long long int stridey, int64_t batchCount); + // ROC: ROCBLAS_EXPORT rocblas_status rocblas_tssgemv_strided_batched_64(rocblas_handle handle, rocblas_operation transA, int64_t m, int64_t n, const float* alpha, const rocblas_bfloat16* A, int64_t lda, rocblas_stride strideA, const rocblas_bfloat16* x, int64_t incx, rocblas_stride stridex, const float* beta, float* y, int64_t incy, rocblas_stride stridey, int64_t batch_count); + // CHECK: blasStatus = rocblas_tssgemv_strided_batched_64(blasHandle, blasOperation, m_64, n_64, &fa, bf16A, lda_64, strideA, bf16X, incx_64, strideX, &fb, &fy, incy_64, strideY, batchCount_64); + blasStatus = cublasTSSgemvStridedBatched_64(blasHandle, blasOperation, m_64, n_64, &fa, bf16A, lda_64, strideA, bf16X, incx_64, strideX, &fb, &fy, incy_64, strideY, batchCount_64); #endif return 0;