diff --git a/VERSION b/VERSION index 3993d6f24f5..6321062b636 100644 --- a/VERSION +++ b/VERSION @@ -1,6 +1,6 @@ MAJOR = 2 MINOR = 1 -PATCH = 0-rc18 +PATCH = 0-rc19 # A specific DATE (YYYY-MM-DD) fixes an official release, otherwise # it is considered Development version. DATE = diff --git a/src/acc/acc.h b/src/acc/acc.h index 57c9531b8d1..f60a5a17df7 100644 --- a/src/acc/acc.h +++ b/src/acc/acc.h @@ -11,6 +11,12 @@ #include +#define DBCSR_STRINGIFY_AUX(SYMBOL) #SYMBOL +#define DBCSR_STRINGIFY(SYMBOL) DBCSR_STRINGIFY_AUX(SYMBOL) +#define DBCSR_CONCATENATE2(A, B) A##B +#define DBCSR_CONCATENATE(A, B) DBCSR_CONCATENATE2(A, B) + + #if defined(__cplusplus) extern "C" { #endif diff --git a/src/acc/acc_bench_smm.c b/src/acc/acc_bench_smm.c index e3e38f06b98..1802010d7da 100644 --- a/src/acc/acc_bench_smm.c +++ b/src/acc/acc_bench_smm.c @@ -20,6 +20,9 @@ #if !defined(ELEM_TYPE) # define ELEM_TYPE double #endif +#if !defined(EPSILON) +# define EPSILON 1E-3 +#endif #if !defined(MAX_KERNEL_DIM) # define MAX_KERNEL_DIM 80 #endif @@ -67,44 +70,66 @@ int main(int argc, char* argv[]) const int mn = m * n, mk = m * k, kn = k * n; #endif #if defined(WARMUP) && (0 < WARMUP) && !defined(_DEBUG) - const int warmup = WARMUP; + const int warmup = MAX(WARMUP, 2) / 2 * 2; #else const int warmup = 0; #endif - int *stack_hst = NULL, *stack_dev = NULL; + int *stack_hst = NULL, *stack_dev = NULL, *trans_hst = NULL, *trans_dev = NULL; ELEM_TYPE *amat_hst = NULL, *bmat_hst = NULL, *cmat_hst = NULL; ELEM_TYPE *amat_dev = NULL, *bmat_dev = NULL, *cmat_dev = NULL; - int result = EXIT_SUCCESS, r, i; + int result = EXIT_SUCCESS, ndevices = 0, r, i; void *stream = NULL; #if defined(USE_LIBXSMM) libxsmm_timer_tickint start; - double duration; + double duration, transpose; #endif assert(m <= (mn / n) && 0 == (mn % n) && k <= (mk / k) && 0 == (mk % k) && n <= (kn / n) && 0 == (kn % n)); - printf("%s%s%i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n, k); + printf("%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", + nrepeat, stack_size, m, n, k, nc, na, nb); CHECK(acc_init(), &result); + CHECK(acc_get_ndevices(&ndevices), &result); + if (0 < ndevices) { +#if defined(_DEBUG) + fprintf(stderr, "number of devices found: %i\n", ndevices); +#endif + } + else { +#if defined(_DEBUG) + fprintf(stderr, "Error: no device found!\n"); +#endif + CHECK(acc_finalize(), NULL); + return result; + } + printf("element type: %s\n", DBCSR_STRINGIFY(ELEM_TYPE)); CHECK(acc_stream_create(&stream, "stream", -1/*default priority*/), &result); - CHECK(acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * stack_size, stream), &result); - CHECK(acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * stack_size, stream), &result); - CHECK(acc_host_mem_allocate((void**)&cmat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result); + CHECK(acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * na, stream), &result); + CHECK(acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * nb, stream), &result); + CHECK(acc_host_mem_allocate((void**)&cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result); CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * 3 * stack_size, stream), &result); + CHECK(acc_host_mem_allocate((void**)&trans_hst, sizeof(int) * nb, stream), &result); CHECK(acc_stream_sync(stream), &result); /* ensure host-data is allocated */ - for (i = 0; i < stack_size; ++i) { /* initialize matrices */ + /* initialize matrices */ + for (i = 0; i < na; ++i) { init(i/*seed*/ + 42, &amat_hst[i*mk], m, k); + } + for (i = 0; i < nb; ++i) { init(i/*seed*/ + 24, &bmat_hst[i*kn], k, n); + trans_hst[i] = i * kn; } init_stack(stack_hst, stack_size, mn, mk, kn, nc, na, nb); - CHECK(acc_dev_mem_allocate((void**)&amat_dev, sizeof(ELEM_TYPE) * mk * stack_size), &result); - CHECK(acc_dev_mem_allocate((void**)&bmat_dev, sizeof(ELEM_TYPE) * kn * stack_size), &result); - CHECK(acc_dev_mem_allocate((void**)&cmat_dev, sizeof(ELEM_TYPE) * mn * stack_size), &result); + CHECK(acc_dev_mem_allocate((void**)&amat_dev, sizeof(ELEM_TYPE) * mk * na), &result); + CHECK(acc_dev_mem_allocate((void**)&bmat_dev, sizeof(ELEM_TYPE) * kn * nb), &result); + CHECK(acc_dev_mem_allocate((void**)&cmat_dev, sizeof(ELEM_TYPE) * mn * nc), &result); CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * 3 * stack_size), &result); - CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result); + CHECK(acc_dev_mem_allocate((void**)&trans_dev, sizeof(int) * nb), &result); + CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result); + CHECK(acc_memcpy_h2d(trans_hst, trans_dev, sizeof(int) * nb, stream), &result); #if defined(USE_LIBXSMM) CHECK(acc_stream_sync(stream), &result); start = libxsmm_timer_tick(); #endif - CHECK(acc_memcpy_h2d(amat_hst, amat_dev, sizeof(ELEM_TYPE) * mk * stack_size, stream), &result); - CHECK(acc_memcpy_h2d(bmat_hst, bmat_dev, sizeof(ELEM_TYPE) * kn * stack_size, stream), &result); + CHECK(acc_memcpy_h2d(amat_hst, amat_dev, sizeof(ELEM_TYPE) * mk * na, stream), &result); + CHECK(acc_memcpy_h2d(bmat_hst, bmat_dev, sizeof(ELEM_TYPE) * kn * nb, stream), &result); CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * 3 * stack_size, stream), &result); #if defined(USE_LIBXSMM) CHECK(acc_stream_sync(stream), &result); @@ -113,17 +138,36 @@ int main(int argc, char* argv[]) (sizeof(ELEM_TYPE) * (mk + kn) + sizeof(int) * 3) * stack_size / (duration * (1ULL << 30))); #endif - /* warmup execution and prebuild JIT kernels */ + /* warmup execution and prebuild transpose-kernel */ + for (r = 0; r < warmup / 2; ++r) { + CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev, + DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result); + CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev, + DBCSR_TYPE(ELEM_TYPE), n, k, MAX_KERNEL_DIM, stream), &result); + } +#if defined(USE_LIBXSMM) + CHECK(acc_stream_sync(stream), &result); + start = libxsmm_timer_tick(); +#endif + /* to perform NN-SMMs on the device, all B-matrices are transposed upfront (SMM-kernel is limited to NT) */ + CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev, + DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result); +#if defined(USE_LIBXSMM) + CHECK(acc_stream_sync(stream), &result); + transpose = libxsmm_timer_duration(start, libxsmm_timer_tick()); +#endif + /* warmup execution and prebuild SMM-kernel */ for (r = 0; r < warmup; ++r) { CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3/*nparams*/, DBCSR_TYPE(ELEM_TYPE), amat_dev, bmat_dev, cmat_dev, m, n, k, MAX_KERNEL_DIM, 1/*homogeneous*/, stream, stream), &result); } + CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result); #if defined(USE_LIBXSMM) CHECK(acc_stream_sync(stream), &result); start = libxsmm_timer_tick(); #endif for (r = 0; r < nrepeat; ++r) { - /* GPU-kernel is limited to C += Ai * Bi^T (i.e., NT, for NN, all Bi must be transposed upfront) */ + /* GPU-kernel is limited to C += Ai * Bi^T, i.e., NT (for NN, all Bi must be transposed upfront) */ CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3/*nparams*/, DBCSR_TYPE(ELEM_TYPE), amat_dev, bmat_dev, cmat_dev, m, n, k, MAX_KERNEL_DIM, 1/*homogeneous*/, stream, stream), &result); } @@ -131,37 +175,81 @@ int main(int argc, char* argv[]) CHECK(acc_stream_sync(stream), &result); duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); if (EXIT_SUCCESS == result) { - const char transa = 'N', transb = 'T'; + ELEM_TYPE *const gold_hst = (ELEM_TYPE*)libxsmm_malloc(sizeof(ELEM_TYPE) * mn * nc); + const char transa = 'N', transb = 'N'; const ELEM_TYPE alpha = 1, beta = 1; + printf("transpose: %.1f ms %.1f GFLOPS/s\n", 1000.0 * (duration + transpose) / nrepeat, + ((size_t)2 * m * n * k) * stack_size / ((duration + transpose) * (1ULL << 30) / nrepeat)); printf("device: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat, ((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat)); - memset(cmat_hst, 0, sizeof(ELEM_TYPE) * mn * stack_size); + memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc); + for (r = 0; r < warmup; ++r) { + libxsmm_gemm_batch_omp(LIBXSMM_GEMM_PRECISION(ELEM_TYPE), LIBXSMM_GEMM_PRECISION(ELEM_TYPE), + &transa, &transb, m, n, k, &alpha, amat_hst, &m/*lda*/, bmat_hst, &k/*ldb*/, + &beta, gold_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3, + stack_hst + 0, stack_hst + 1, stack_hst + 2, stack_size); + } + memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc); start = libxsmm_timer_tick(); + /* CPU-kernel operates on data that is not initialized in NUMA-aware fashion */ for (r = 0; r < nrepeat; ++r) { - /* CPU-kernel performs C += Ai * Bi^T to match result of GPU-kernel (NT may perform below NN) */ libxsmm_gemm_batch_omp(LIBXSMM_GEMM_PRECISION(ELEM_TYPE), LIBXSMM_GEMM_PRECISION(ELEM_TYPE), &transa, &transb, m, n, k, &alpha, amat_hst, &m/*lda*/, bmat_hst, &k/*ldb*/, - &beta, cmat_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3, + &beta, gold_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3, stack_hst + 0, stack_hst + 1, stack_hst + 2, stack_size); } duration = libxsmm_timer_duration(start, libxsmm_timer_tick()); printf("host: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat, ((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat)); - /* transfer result from device back to host for validation */ - CHECK(acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result); + /* transfer result from device to host for validation */ + CHECK(acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result); CHECK(acc_stream_sync(stream), &result); - /* TODO: validation code TBD */ + if (EXIT_SUCCESS == result) { + double abserror = 0, relerror = 0; + for (i = 0; i < nc; ++i) { + const ELEM_TYPE *const gold = gold_hst + mn * i; + const ELEM_TYPE *const test = cmat_hst + mn * i; + double diff = 0, a = 0, b = 0; + for (r = 0; r < (m * n); ++r) { + const double ar = (double)gold[r]; + const double br = (double)test[r]; + const double d = fabs(ar - br); + if (d > diff) { + diff = d; + a = ar; + b = br; + } + } + if (0 < diff) { +# if defined(_DEBUG) + print(stderr, "gold = ", gold, m, n); + print(stderr, "test = ", test, m, n); + fprintf(stderr, "diff = %g (%g != %g)\n", diff, a, b); +# endif + if (abserror < diff) { + relerror = fabs(0 != a ? (diff / a) : (diff / b)); + abserror = diff; + } + } + } + printf("max.error: rel=%g\n", relerror); + if (EPSILON < relerror) result = EXIT_FAILURE; + } + libxsmm_free(gold_hst); } #endif CHECK(acc_host_mem_deallocate(stack_hst, stream), NULL); + CHECK(acc_host_mem_deallocate(trans_hst, stream), NULL); CHECK(acc_host_mem_deallocate(amat_hst, stream), NULL); CHECK(acc_host_mem_deallocate(bmat_hst, stream), NULL); CHECK(acc_host_mem_deallocate(cmat_hst, stream), NULL); CHECK(acc_dev_mem_deallocate(stack_dev), NULL); + CHECK(acc_dev_mem_deallocate(trans_dev), NULL); CHECK(acc_dev_mem_deallocate(amat_dev), NULL); CHECK(acc_dev_mem_deallocate(bmat_dev), NULL); CHECK(acc_dev_mem_deallocate(cmat_dev), NULL); CHECK(acc_stream_destroy(stream), NULL); + CHECK(acc_finalize(), NULL); if (EXIT_SUCCESS != result) { fprintf(stderr, "FAILED\n"); } diff --git a/src/acc/acc_bench_trans.c b/src/acc/acc_bench_trans.c index 4b50a65e4a5..000150436a3 100644 --- a/src/acc/acc_bench_trans.c +++ b/src/acc/acc_bench_trans.c @@ -75,7 +75,7 @@ int main(int argc, char* argv[]) #endif int *stack_hst = NULL, *stack_dev = NULL; ELEM_TYPE *mat_hst = NULL, *mat_dev = NULL; - int result = EXIT_SUCCESS, r, i, mm = m, nn = n; + int result = EXIT_SUCCESS, ndevices = 0, r, i, mm = m, nn = n; void *stream = NULL; #if defined(USE_LIBXSMM) libxsmm_timer_tickint start; @@ -84,6 +84,20 @@ int main(int argc, char* argv[]) assert(m <= (mn / n) && 0 == (mn % n)); printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n); CHECK(acc_init(), &result); + CHECK(acc_get_ndevices(&ndevices), &result); + if (0 < ndevices) { +#if defined(_DEBUG) + fprintf(stderr, "number of devices found: %i\n", ndevices); +#endif + } + else { +#if defined(_DEBUG) + fprintf(stderr, "Error: no device found!\n"); +#endif + CHECK(acc_finalize(), NULL); + return result; + } + printf("element type: %s\n", DBCSR_STRINGIFY(ELEM_TYPE)); #if defined(PRIORITY) CHECK(acc_stream_priority_range(&priomin, &priomax), &result); CHECK(acc_stream_create(&stream, "stream", (priomin + priomax) / 2), &result); @@ -154,24 +168,23 @@ int main(int argc, char* argv[]) printf("host: %.1f ms %.1f GB/s\n", 1000.0 * duration / nodd, (sizeof(ELEM_TYPE) * mn + sizeof(int)) * stack_size / (duration * (1ULL << 30) / nodd)); - /* transfer result from device back to host for validation */ + /* transfer result from device to host for validation */ CHECK(acc_memcpy_d2h(mat_dev, mat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result); CHECK(acc_stream_sync(stream), &result); if (EXIT_SUCCESS == result) { unsigned int nerrors = 0; - int j; for (i = 0; i < stack_size; ++i) { ELEM_TYPE gold[MAX_KERNEL_DIM*MAX_KERNEL_DIM]; const ELEM_TYPE *const test = mat_hst + mn * i; init(i/*seed*/, gold, m, n); libxsmm_itrans(gold, sizeof(ELEM_TYPE), m, n, m, n); - for (j = 0; j < (m * n); ++j) { - if (gold[j] != test[j]) { + for (r = 0; r < (m * n); ++r) { + if (gold[r] != test[r]) { ++nerrors; # if defined(_DEBUG) print(stderr, "gold = ", gold, n, m); - print(stderr, "this = ", test, n, m); + print(stderr, "test = ", test, n, m); init(i/*seed*/, gold, m, n); print(stderr, "orig = ", gold, m, n); fprintf(stderr, "\n"); @@ -190,6 +203,7 @@ int main(int argc, char* argv[]) CHECK(acc_dev_mem_deallocate(stack_dev), NULL); CHECK(acc_dev_mem_deallocate(mat_dev), NULL); CHECK(acc_stream_destroy(stream), NULL); + CHECK(acc_finalize(), NULL); if (EXIT_SUCCESS != result) { fprintf(stderr, "FAILED\n"); } diff --git a/src/acc/acc_libsmm.h b/src/acc/acc_libsmm.h index e8fb3843c71..13c0cc565ec 100644 --- a/src/acc/acc_libsmm.h +++ b/src/acc/acc_libsmm.h @@ -11,7 +11,6 @@ #include "acc.h" -#define DBCSR_CONCATENATE(A, B) A##B #define DBCSR_TYPE(T) DBCSR_CONCATENATE(DBCSR_TYPE_, T) #define DBCSR_TYPE_double dbcsr_type_real_8 #define DBCSR_TYPE_float dbcsr_type_real_4 @@ -29,6 +28,7 @@ typedef enum libsmm_acc_data_t { } libsmm_acc_data_t; int libsmm_acc_init(void); +int libsmm_acc_finalize(void); acc_bool_t libsmm_acc_is_thread_safe(void); int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, diff --git a/src/acc/cuda/acc_init.cpp b/src/acc/cuda/acc_init.cpp index 405f3282496..d3a114aa8c3 100644 --- a/src/acc/cuda/acc_init.cpp +++ b/src/acc/cuda/acc_init.cpp @@ -45,5 +45,5 @@ extern "C" int acc_finalize(){ ACC_API_CALL(GetDevice, (&myDevice)); ACC_DRV_CALL(DeviceGet, (&acc_device, myDevice)); ACC_DRV_CALL(DevicePrimaryCtxRelease, (acc_device)); - return 0; + return libsmm_acc_finalize(); } diff --git a/src/acc/libsmm_acc/PACKAGE b/src/acc/libsmm_acc/PACKAGE index 9d4dcf3960c..3fd95a6d4db 100644 --- a/src/acc/libsmm_acc/PACKAGE +++ b/src/acc/libsmm_acc/PACKAGE @@ -1,5 +1,5 @@ { -"description": "Generic GPU-accelerated library for small matrix multiplications", +"description": "CUDA/HIP-accelerated library for small matrix multiplications", "archive": "libdbcsr", "requires": ["..", "../cuda", "../hip"] } diff --git a/src/acc/libsmm_acc/libcusmm/.gitignore b/src/acc/libsmm_acc/libcusmm/.gitignore new file mode 100644 index 00000000000..7ed9e3bd2f8 --- /dev/null +++ b/src/acc/libsmm_acc/libcusmm/.gitignore @@ -0,0 +1 @@ +cusmm_kernels.h diff --git a/src/acc/libsmm_acc/libcusmm/PACKAGE b/src/acc/libsmm_acc/libcusmm/PACKAGE index 9ebb80de9b4..b9818a727c5 100644 --- a/src/acc/libsmm_acc/libcusmm/PACKAGE +++ b/src/acc/libsmm_acc/libcusmm/PACKAGE @@ -1,5 +1,5 @@ { "description": "Cuda accelerated Small Matrix Multiplications", "archive": "libdbcsr", -"requires": ["kernels", "../include", "../../include"] +"requires": ["kernels", "..", "../../include"] } diff --git a/src/acc/libsmm_acc/libsmm_acc_init.cpp b/src/acc/libsmm_acc/libsmm_acc_init.cpp index 906e1e8db4a..881f3b47b3e 100644 --- a/src/acc/libsmm_acc/libsmm_acc_init.cpp +++ b/src/acc/libsmm_acc/libsmm_acc_init.cpp @@ -54,7 +54,7 @@ int libsmm_acc_gpu_blas_init(){ //=========================================================================== -int libsmm_acc_init() { +extern "C" int libsmm_acc_init() { #if !defined(NO_DBCSR_TIMESET) std::string routineN = "libsmm_acc_init"; int handle; @@ -71,7 +71,7 @@ int libsmm_acc_init() { //=========================================================================== -int libsmm_acc_finalize() { +extern "C" int libsmm_acc_finalize() { #if !defined(NO_DBCSR_TIMESET) std::string routineN = "libsmm_acc_finalize"; int handle; diff --git a/src/acc/libsmm_acc/libsmm_acc_init.h b/src/acc/libsmm_acc/libsmm_acc_init.h index 5806db2fd48..17f4f21a7b3 100644 --- a/src/acc/libsmm_acc/libsmm_acc_init.h +++ b/src/acc/libsmm_acc/libsmm_acc_init.h @@ -24,6 +24,7 @@ void timestop(int handle); #endif extern "C" int libsmm_acc_init (void); +extern "C" int libsmm_acc_finalize (void); int libsmm_acc_gpu_blas_init(); diff --git a/src/mm/dbcsr_mm.F b/src/mm/dbcsr_mm.F index bba840f9a60..98f80588b74 100644 --- a/src/mm/dbcsr_mm.F +++ b/src/mm/dbcsr_mm.F @@ -394,11 +394,12 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, & output_unit INTEGER(KIND=int_8) :: my_flop LOGICAL :: ab_dense, keep_product_data, keep_sparsity, product_reindex, release_tdist, & - transpose_left, transpose_right, use_dense_mult, use_mempools + transpose_left, transpose_right, use_dense_mult, use_mempools, thread_dist_force REAL(KIND=dp) :: cs TYPE(array_i1d_obj) :: dense_col_sizes, dense_k_sizes, dense_row_sizes, k_vmap, m_map, & n_map, old_product_col_blk_offsets, old_product_col_blk_sizes, & - old_product_row_blk_offsets, old_product_row_blk_sizes + old_product_row_blk_offsets, old_product_row_blk_sizes, & + matrix_c_thread_dist TYPE(dbcsr_2d_array_type), POINTER :: m2s_left, m2s_right TYPE(dbcsr_distribution_obj) :: dense_product_distribution, & old_product_distribution @@ -702,11 +703,20 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, & ! ! The thread distribution must reflect the current (possibly ! dense) distribution + thread_dist_force = .FALSE. IF (.NOT. dbcsr_distribution_has_threads(matrix_c%dist)) THEN release_tdist = .TRUE. CALL dbcsr_distribution_make_threads(matrix_c%dist) ELSE release_tdist = .FALSE. + ! Make sure matrix_c thread dist == matrix_left thread dist + ! This is currently a workaround + IF (dbcsr_distribution_has_threads(matrix_left%dist)) THEN + matrix_c_thread_dist = matrix_c%dist%d%thread_dist + matrix_c%dist%d%thread_dist = matrix_left%dist%d%thread_dist + CALL array_hold(matrix_left%dist%d%thread_dist) + thread_dist_force = .TRUE. + ENDIF ENDIF ! ! Compute number of images (rows and columns) @@ -896,11 +906,15 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, & CALL mp_isync(comm, request_sync_mult) ENDIF ! - IF (transpose_left) CALL dbcsr_release(matrix_left) - IF (transpose_right) CALL dbcsr_release(matrix_right) IF (release_tdist) THEN CALL dbcsr_distribution_no_threads(product_matrix%dist) + ELSEIF (thread_dist_force) THEN + ! Restore matrix_c thread-dist + matrix_c%dist%d%thread_dist = matrix_c_thread_dist + CALL array_release(matrix_left%dist%d%thread_dist) ENDIF + IF (transpose_left) CALL dbcsr_release(matrix_left) + IF (transpose_right) CALL dbcsr_release(matrix_right) ! CALL dbcsr_release_locals(product_matrix) ! The index of the product matrix is reset to the CP2K form if it diff --git a/tests/dbcsr_test_scale_by_vector.F b/tests/dbcsr_test_scale_by_vector.F index 7df29deaf6d..248a0b87428 100644 --- a/tests/dbcsr_test_scale_by_vector.F +++ b/tests/dbcsr_test_scale_by_vector.F @@ -154,7 +154,7 @@ FUNCTION dbcsr_test_scale_by_vectors(test_name, mp_group, mp_env, npdims, io_uni ! ! Prepare test parameters - success = test_scale_by_vector(mp_env, npdims, matrix, vector_data, do_exact_comparison) + success = test_scale_by_vector(mp_env, npdims, matrix, vector_data, do_exact_comparison) .AND. success IF (io_unit > 0) THEN IF (success) THEN