diff --git a/VERSION b/VERSION
index 3993d6f24f5..6321062b636 100644
--- a/VERSION
+++ b/VERSION
@@ -1,6 +1,6 @@
 MAJOR = 2
 MINOR = 1
-PATCH = 0-rc18
+PATCH = 0-rc19
 # A specific DATE (YYYY-MM-DD) fixes an official release, otherwise
 # it is considered Development version.
 DATE  =
diff --git a/src/acc/acc.h b/src/acc/acc.h
index 57c9531b8d1..f60a5a17df7 100644
--- a/src/acc/acc.h
+++ b/src/acc/acc.h
@@ -11,6 +11,12 @@
 
 #include <stddef.h>
 
+#define DBCSR_STRINGIFY_AUX(SYMBOL) #SYMBOL
+#define DBCSR_STRINGIFY(SYMBOL) DBCSR_STRINGIFY_AUX(SYMBOL)
+#define DBCSR_CONCATENATE2(A, B) A##B
+#define DBCSR_CONCATENATE(A, B) DBCSR_CONCATENATE2(A, B)
+
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
diff --git a/src/acc/acc_bench_smm.c b/src/acc/acc_bench_smm.c
index e3e38f06b98..1802010d7da 100644
--- a/src/acc/acc_bench_smm.c
+++ b/src/acc/acc_bench_smm.c
@@ -20,6 +20,9 @@
 #if !defined(ELEM_TYPE)
 # define ELEM_TYPE double
 #endif
+#if !defined(EPSILON)
+# define EPSILON 1E-3
+#endif
 #if !defined(MAX_KERNEL_DIM)
 # define MAX_KERNEL_DIM 80
 #endif
@@ -67,44 +70,66 @@ int main(int argc, char* argv[])
   const int mn = m * n, mk = m * k, kn = k * n;
 #endif
 #if defined(WARMUP) && (0 < WARMUP) && !defined(_DEBUG)
-  const int warmup = WARMUP;
+  const int warmup = MAX(WARMUP, 2) / 2 * 2;
 #else
   const int warmup = 0;
 #endif
-  int *stack_hst = NULL, *stack_dev = NULL;
+  int *stack_hst = NULL, *stack_dev = NULL, *trans_hst = NULL, *trans_dev = NULL;
   ELEM_TYPE *amat_hst = NULL, *bmat_hst = NULL, *cmat_hst = NULL;
   ELEM_TYPE *amat_dev = NULL, *bmat_dev = NULL, *cmat_dev = NULL;
-  int result = EXIT_SUCCESS, r, i;
+  int result = EXIT_SUCCESS, ndevices = 0, r, i;
   void *stream = NULL;
 #if defined(USE_LIBXSMM)
   libxsmm_timer_tickint start;
-  double duration;
+  double duration, transpose;
 #endif
   assert(m <= (mn / n) && 0 == (mn % n) && k <= (mk / k) && 0 == (mk % k) && n <= (kn / n) && 0 == (kn % n));
-  printf("%s%s%i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n, k);
+  printf("%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "",
+    nrepeat, stack_size, m, n, k, nc, na, nb);
   CHECK(acc_init(), &result);
+  CHECK(acc_get_ndevices(&ndevices), &result);
+  if (0 < ndevices) {
+#if defined(_DEBUG)
+    fprintf(stderr, "number of devices found: %i\n", ndevices);
+#endif
+  }
+  else {
+#if defined(_DEBUG)
+    fprintf(stderr, "Error: no device found!\n");
+#endif
+    CHECK(acc_finalize(), NULL);
+    return result;
+  }
+  printf("element type: %s\n", DBCSR_STRINGIFY(ELEM_TYPE));
   CHECK(acc_stream_create(&stream, "stream", -1/*default priority*/), &result);
-  CHECK(acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * stack_size, stream), &result);
-  CHECK(acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * stack_size, stream), &result);
-  CHECK(acc_host_mem_allocate((void**)&cmat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
+  CHECK(acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * na, stream), &result);
+  CHECK(acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * nb, stream), &result);
+  CHECK(acc_host_mem_allocate((void**)&cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
   CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * 3 * stack_size, stream), &result);
+  CHECK(acc_host_mem_allocate((void**)&trans_hst, sizeof(int) * nb, stream), &result);
   CHECK(acc_stream_sync(stream), &result); /* ensure host-data is allocated */
-  for (i = 0; i < stack_size; ++i) { /* initialize matrices */
+  /* initialize matrices */
+  for (i = 0; i < na; ++i) {
     init(i/*seed*/ + 42, &amat_hst[i*mk], m, k);
+  }
+  for (i = 0; i < nb; ++i) {
     init(i/*seed*/ + 24, &bmat_hst[i*kn], k, n);
+    trans_hst[i] = i * kn;
   }
   init_stack(stack_hst, stack_size, mn, mk, kn, nc, na, nb);
-  CHECK(acc_dev_mem_allocate((void**)&amat_dev, sizeof(ELEM_TYPE) * mk * stack_size), &result);
-  CHECK(acc_dev_mem_allocate((void**)&bmat_dev, sizeof(ELEM_TYPE) * kn * stack_size), &result);
-  CHECK(acc_dev_mem_allocate((void**)&cmat_dev, sizeof(ELEM_TYPE) * mn * stack_size), &result);
+  CHECK(acc_dev_mem_allocate((void**)&amat_dev, sizeof(ELEM_TYPE) * mk * na), &result);
+  CHECK(acc_dev_mem_allocate((void**)&bmat_dev, sizeof(ELEM_TYPE) * kn * nb), &result);
+  CHECK(acc_dev_mem_allocate((void**)&cmat_dev, sizeof(ELEM_TYPE) * mn * nc), &result);
   CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * 3 * stack_size), &result);
-  CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
+  CHECK(acc_dev_mem_allocate((void**)&trans_dev, sizeof(int) * nb), &result);
+  CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
+  CHECK(acc_memcpy_h2d(trans_hst, trans_dev, sizeof(int) * nb, stream), &result);
 #if defined(USE_LIBXSMM)
   CHECK(acc_stream_sync(stream), &result);
   start = libxsmm_timer_tick();
 #endif
-  CHECK(acc_memcpy_h2d(amat_hst, amat_dev, sizeof(ELEM_TYPE) * mk * stack_size, stream), &result);
-  CHECK(acc_memcpy_h2d(bmat_hst, bmat_dev, sizeof(ELEM_TYPE) * kn * stack_size, stream), &result);
+  CHECK(acc_memcpy_h2d(amat_hst, amat_dev, sizeof(ELEM_TYPE) * mk * na, stream), &result);
+  CHECK(acc_memcpy_h2d(bmat_hst, bmat_dev, sizeof(ELEM_TYPE) * kn * nb, stream), &result);
   CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * 3 * stack_size, stream), &result);
 #if defined(USE_LIBXSMM)
   CHECK(acc_stream_sync(stream), &result);
@@ -113,17 +138,36 @@ int main(int argc, char* argv[])
     (sizeof(ELEM_TYPE) * (mk + kn) + sizeof(int) * 3)
       * stack_size / (duration * (1ULL << 30)));
 #endif
-  /* warmup execution and prebuild JIT kernels */
+  /* warmup execution and prebuild transpose-kernel */
+  for (r = 0; r < warmup / 2; ++r) {
+    CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
+      DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result);
+    CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
+      DBCSR_TYPE(ELEM_TYPE), n, k, MAX_KERNEL_DIM, stream), &result);
+  }
+#if defined(USE_LIBXSMM)
+  CHECK(acc_stream_sync(stream), &result);
+  start = libxsmm_timer_tick();
+#endif
+  /* to perform NN-SMMs on the device, all B-matrices are transposed upfront (SMM-kernel is limited to NT) */
+  CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
+    DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result);
+#if defined(USE_LIBXSMM)
+  CHECK(acc_stream_sync(stream), &result);
+  transpose = libxsmm_timer_duration(start, libxsmm_timer_tick());
+#endif
+  /* warmup execution and prebuild SMM-kernel */
   for (r = 0; r < warmup; ++r) {
     CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3/*nparams*/, DBCSR_TYPE(ELEM_TYPE),
       amat_dev, bmat_dev, cmat_dev, m, n, k, MAX_KERNEL_DIM, 1/*homogeneous*/, stream, stream), &result);
   }
+  CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
 #if defined(USE_LIBXSMM)
   CHECK(acc_stream_sync(stream), &result);
   start = libxsmm_timer_tick();
 #endif
   for (r = 0; r < nrepeat; ++r) {
-    /* GPU-kernel is limited to C += Ai * Bi^T (i.e., NT, for NN, all Bi must be transposed upfront) */
+    /* GPU-kernel is limited to C += Ai * Bi^T, i.e., NT (for NN, all Bi must be transposed upfront) */
     CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3/*nparams*/, DBCSR_TYPE(ELEM_TYPE),
       amat_dev, bmat_dev, cmat_dev, m, n, k, MAX_KERNEL_DIM, 1/*homogeneous*/, stream, stream), &result);
   }
@@ -131,37 +175,81 @@ int main(int argc, char* argv[])
   CHECK(acc_stream_sync(stream), &result);
   duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
   if (EXIT_SUCCESS == result) {
-    const char transa = 'N', transb = 'T';
+    ELEM_TYPE *const gold_hst = (ELEM_TYPE*)libxsmm_malloc(sizeof(ELEM_TYPE) * mn * nc);
+    const char transa = 'N', transb = 'N';
     const ELEM_TYPE alpha = 1, beta = 1;
+    printf("transpose: %.1f ms %.1f GFLOPS/s\n", 1000.0 * (duration + transpose) / nrepeat,
+      ((size_t)2 * m * n * k) * stack_size / ((duration + transpose) * (1ULL << 30) / nrepeat));
     printf("device: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat,
       ((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat));
-    memset(cmat_hst, 0, sizeof(ELEM_TYPE) * mn * stack_size);
+    memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc);
+    for (r = 0; r < warmup; ++r) {
+      libxsmm_gemm_batch_omp(LIBXSMM_GEMM_PRECISION(ELEM_TYPE), LIBXSMM_GEMM_PRECISION(ELEM_TYPE),
+        &transa, &transb, m, n, k, &alpha, amat_hst, &m/*lda*/, bmat_hst, &k/*ldb*/,
+        &beta, gold_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
+        stack_hst + 0, stack_hst + 1, stack_hst + 2, stack_size);
+    }
+    memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc);
     start = libxsmm_timer_tick();
+    /* CPU-kernel operates on data that was not initialized in a NUMA-aware fashion */
     for (r = 0; r < nrepeat; ++r) {
-      /* CPU-kernel performs C += Ai * Bi^T to match result of GPU-kernel (NT may perform below NN) */
       libxsmm_gemm_batch_omp(LIBXSMM_GEMM_PRECISION(ELEM_TYPE), LIBXSMM_GEMM_PRECISION(ELEM_TYPE),
         &transa, &transb, m, n, k, &alpha, amat_hst, &m/*lda*/, bmat_hst, &k/*ldb*/,
-        &beta, cmat_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
+        &beta, gold_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
         stack_hst + 0, stack_hst + 1, stack_hst + 2, stack_size);
     }
     duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
     printf("host: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat,
       ((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat));
-    /* transfer result from device back to host for validation */
-    CHECK(acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
+    /* transfer result from device to host for validation */
+    CHECK(acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
     CHECK(acc_stream_sync(stream), &result);
-    /* TODO: validation code TBD */
+    if (EXIT_SUCCESS == result) {
+      double abserror = 0, relerror = 0;
+      for (i = 0; i < nc; ++i) {
+        const ELEM_TYPE *const gold = gold_hst + mn * i;
+        const ELEM_TYPE *const test = cmat_hst + mn * i;
+        double diff = 0, a = 0, b = 0;
+        for (r = 0; r < (m * n); ++r) {
+          const double ar = (double)gold[r];
+          const double br = (double)test[r];
+          const double d = fabs(ar - br);
+          if (d > diff) {
+            diff = d;
+            a = ar;
+            b = br;
+          }
+        }
+        if (0 < diff) {
+# if defined(_DEBUG)
+          print(stderr, "gold = ", gold, m, n);
+          print(stderr, "test = ", test, m, n);
+          fprintf(stderr, "diff = %g (%g != %g)\n", diff, a, b);
+# endif
+          if (abserror < diff) {
+            relerror = fabs(0 != a ? (diff / a) : (diff / b));
+            abserror = diff;
+          }
+        }
+      }
+      printf("max.error: rel=%g\n", relerror);
+      if (EPSILON < relerror) result = EXIT_FAILURE;
+    }
+    libxsmm_free(gold_hst);
   }
 #endif
   CHECK(acc_host_mem_deallocate(stack_hst, stream), NULL);
+  CHECK(acc_host_mem_deallocate(trans_hst, stream), NULL);
   CHECK(acc_host_mem_deallocate(amat_hst, stream), NULL);
   CHECK(acc_host_mem_deallocate(bmat_hst, stream), NULL);
   CHECK(acc_host_mem_deallocate(cmat_hst, stream), NULL);
   CHECK(acc_dev_mem_deallocate(stack_dev), NULL);
+  CHECK(acc_dev_mem_deallocate(trans_dev), NULL);
   CHECK(acc_dev_mem_deallocate(amat_dev), NULL);
   CHECK(acc_dev_mem_deallocate(bmat_dev), NULL);
   CHECK(acc_dev_mem_deallocate(cmat_dev), NULL);
   CHECK(acc_stream_destroy(stream), NULL);
+  CHECK(acc_finalize(), NULL);
   if (EXIT_SUCCESS != result) {
     fprintf(stderr, "FAILED\n");
   }
diff --git a/src/acc/acc_bench_trans.c b/src/acc/acc_bench_trans.c
index 4b50a65e4a5..000150436a3 100644
--- a/src/acc/acc_bench_trans.c
+++ b/src/acc/acc_bench_trans.c
@@ -75,7 +75,7 @@ int main(int argc, char* argv[])
 #endif
   int *stack_hst = NULL, *stack_dev = NULL;
   ELEM_TYPE *mat_hst = NULL, *mat_dev = NULL;
-  int result = EXIT_SUCCESS, r, i, mm = m, nn = n;
+  int result = EXIT_SUCCESS, ndevices = 0, r, i, mm = m, nn = n;
   void *stream = NULL;
 #if defined(USE_LIBXSMM)
   libxsmm_timer_tickint start;
@@ -84,6 +84,20 @@ int main(int argc, char* argv[])
   assert(m <= (mn / n) && 0 == (mn % n));
   printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
   CHECK(acc_init(), &result);
+  CHECK(acc_get_ndevices(&ndevices), &result);
+  if (0 < ndevices) {
+#if defined(_DEBUG)
+    fprintf(stderr, "number of devices found: %i\n", ndevices);
+#endif
+  }
+  else {
+#if defined(_DEBUG)
+    fprintf(stderr, "Error: no device found!\n");
+#endif
+    CHECK(acc_finalize(), NULL);
+    return result;
+  }
+  printf("element type: %s\n", DBCSR_STRINGIFY(ELEM_TYPE));
 #if defined(PRIORITY)
   CHECK(acc_stream_priority_range(&priomin, &priomax), &result);
   CHECK(acc_stream_create(&stream, "stream", (priomin + priomax) / 2), &result);
@@ -154,24 +168,23 @@ int main(int argc, char* argv[])
     printf("host: %.1f ms %.1f GB/s\n", 1000.0 * duration / nodd,
       (sizeof(ELEM_TYPE) * mn + sizeof(int))
         * stack_size / (duration * (1ULL << 30) / nodd));
-    /* transfer result from device back to host for validation */
+    /* transfer result from device to host for validation */
     CHECK(acc_memcpy_d2h(mat_dev, mat_hst,
       sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
     CHECK(acc_stream_sync(stream), &result);
     if (EXIT_SUCCESS == result) {
       unsigned int nerrors = 0;
-      int j;
       for (i = 0; i < stack_size; ++i) {
         ELEM_TYPE gold[MAX_KERNEL_DIM*MAX_KERNEL_DIM];
         const ELEM_TYPE *const test = mat_hst + mn * i;
         init(i/*seed*/, gold, m, n);
         libxsmm_itrans(gold, sizeof(ELEM_TYPE), m, n, m, n);
-        for (j = 0; j < (m * n); ++j) {
-          if (gold[j] != test[j]) {
+        for (r = 0; r < (m * n); ++r) {
+          if (gold[r] != test[r]) {
             ++nerrors;
 # if defined(_DEBUG)
             print(stderr, "gold = ", gold, n, m);
-            print(stderr, "this = ", test, n, m);
+            print(stderr, "test = ", test, n, m);
             init(i/*seed*/, gold, m, n);
             print(stderr, "orig = ", gold, m, n);
             fprintf(stderr, "\n");
@@ -190,6 +203,7 @@ int main(int argc, char* argv[])
   CHECK(acc_dev_mem_deallocate(stack_dev), NULL);
   CHECK(acc_dev_mem_deallocate(mat_dev), NULL);
   CHECK(acc_stream_destroy(stream), NULL);
+  CHECK(acc_finalize(), NULL);
   if (EXIT_SUCCESS != result) {
     fprintf(stderr, "FAILED\n");
   }
diff --git a/src/acc/acc_libsmm.h b/src/acc/acc_libsmm.h
index e8fb3843c71..13c0cc565ec 100644
--- a/src/acc/acc_libsmm.h
+++ b/src/acc/acc_libsmm.h
@@ -11,7 +11,6 @@
 
 #include "acc.h"
 
-#define DBCSR_CONCATENATE(A, B) A##B
 #define DBCSR_TYPE(T) DBCSR_CONCATENATE(DBCSR_TYPE_, T)
 #define DBCSR_TYPE_double dbcsr_type_real_8
 #define DBCSR_TYPE_float dbcsr_type_real_4
@@ -29,6 +28,7 @@ typedef enum libsmm_acc_data_t {
 } libsmm_acc_data_t;
 
 int libsmm_acc_init(void);
+int libsmm_acc_finalize(void);
 acc_bool_t libsmm_acc_is_thread_safe(void);
 
 int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size,
diff --git a/src/acc/cuda/acc_init.cpp b/src/acc/cuda/acc_init.cpp
index 405f3282496..d3a114aa8c3 100644
--- a/src/acc/cuda/acc_init.cpp
+++ b/src/acc/cuda/acc_init.cpp
@@ -45,5 +45,5 @@ extern "C" int acc_finalize(){
   ACC_API_CALL(GetDevice, (&myDevice));
   ACC_DRV_CALL(DeviceGet, (&acc_device, myDevice));
   ACC_DRV_CALL(DevicePrimaryCtxRelease, (acc_device));
-  return 0;
+  return libsmm_acc_finalize();
 }
diff --git a/src/acc/libsmm_acc/PACKAGE b/src/acc/libsmm_acc/PACKAGE
index 9d4dcf3960c..3fd95a6d4db 100644
--- a/src/acc/libsmm_acc/PACKAGE
+++ b/src/acc/libsmm_acc/PACKAGE
@@ -1,5 +1,5 @@
 {
-"description": "Generic GPU-accelerated library for small matrix multiplications",
+"description": "CUDA/HIP-accelerated library for small matrix multiplications",
 "archive": "libdbcsr",
 "requires": ["..", "../cuda", "../hip"]
 }
diff --git a/src/acc/libsmm_acc/libcusmm/.gitignore b/src/acc/libsmm_acc/libcusmm/.gitignore
new file mode 100644
index 00000000000..7ed9e3bd2f8
--- /dev/null
+++ b/src/acc/libsmm_acc/libcusmm/.gitignore
@@ -0,0 +1 @@
+cusmm_kernels.h
diff --git a/src/acc/libsmm_acc/libcusmm/PACKAGE b/src/acc/libsmm_acc/libcusmm/PACKAGE
index 9ebb80de9b4..b9818a727c5 100644
--- a/src/acc/libsmm_acc/libcusmm/PACKAGE
+++ b/src/acc/libsmm_acc/libcusmm/PACKAGE
@@ -1,5 +1,5 @@
 {
 "description": "Cuda accelerated Small Matrix Multiplications",
 "archive": "libdbcsr",
-"requires": ["kernels", "../include", "../../include"]
+"requires": ["kernels", "..", "../../include"]
 }
diff --git a/src/acc/libsmm_acc/libsmm_acc_init.cpp b/src/acc/libsmm_acc/libsmm_acc_init.cpp
index 906e1e8db4a..881f3b47b3e 100644
--- a/src/acc/libsmm_acc/libsmm_acc_init.cpp
+++ b/src/acc/libsmm_acc/libsmm_acc_init.cpp
@@ -54,7 +54,7 @@ int libsmm_acc_gpu_blas_init(){
 
 
 //===========================================================================
-int libsmm_acc_init() {
+extern "C" int libsmm_acc_init() {
 #if !defined(NO_DBCSR_TIMESET)
     std::string routineN = "libsmm_acc_init";
     int handle;
@@ -71,7 +71,7 @@ int libsmm_acc_init() {
 
 
 //===========================================================================
-int libsmm_acc_finalize() {
+extern "C" int libsmm_acc_finalize() {
 #if !defined(NO_DBCSR_TIMESET)
     std::string routineN = "libsmm_acc_finalize";
     int handle;
diff --git a/src/acc/libsmm_acc/libsmm_acc_init.h b/src/acc/libsmm_acc/libsmm_acc_init.h
index 5806db2fd48..17f4f21a7b3 100644
--- a/src/acc/libsmm_acc/libsmm_acc_init.h
+++ b/src/acc/libsmm_acc/libsmm_acc_init.h
@@ -24,6 +24,7 @@ void timestop(int handle);
 #endif
 
 extern "C" int libsmm_acc_init (void);
+extern "C" int libsmm_acc_finalize (void);
 
 int libsmm_acc_gpu_blas_init();
 
diff --git a/src/mm/dbcsr_mm.F b/src/mm/dbcsr_mm.F
index bba840f9a60..98f80588b74 100644
--- a/src/mm/dbcsr_mm.F
+++ b/src/mm/dbcsr_mm.F
@@ -394,11 +394,12 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, &
                  output_unit
       INTEGER(KIND=int_8)                                :: my_flop
       LOGICAL :: ab_dense, keep_product_data, keep_sparsity, product_reindex, release_tdist, &
-                 transpose_left, transpose_right, use_dense_mult, use_mempools
+                 transpose_left, transpose_right, use_dense_mult, use_mempools, thread_dist_force
       REAL(KIND=dp)                                      :: cs
       TYPE(array_i1d_obj) :: dense_col_sizes, dense_k_sizes, dense_row_sizes, k_vmap, m_map, &
                              n_map, old_product_col_blk_offsets, old_product_col_blk_sizes, &
-                             old_product_row_blk_offsets, old_product_row_blk_sizes
+                             old_product_row_blk_offsets, old_product_row_blk_sizes, &
+                             matrix_c_thread_dist
       TYPE(dbcsr_2d_array_type), POINTER                 :: m2s_left, m2s_right
       TYPE(dbcsr_distribution_obj)                       :: dense_product_distribution, &
                                                             old_product_distribution
@@ -702,11 +703,20 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, &
       !
       ! The thread distribution must reflect the current (possibly
       ! dense) distribution
+      thread_dist_force = .FALSE.
       IF (.NOT. dbcsr_distribution_has_threads(matrix_c%dist)) THEN
          release_tdist = .TRUE.
          CALL dbcsr_distribution_make_threads(matrix_c%dist)
       ELSE
          release_tdist = .FALSE.
+         ! Workaround: temporarily let matrix_c share the thread distribution of
+         ! matrix_left; the saved original is restored further below
+         IF (dbcsr_distribution_has_threads(matrix_left%dist)) THEN
+            matrix_c_thread_dist = matrix_c%dist%d%thread_dist
+            matrix_c%dist%d%thread_dist = matrix_left%dist%d%thread_dist
+            CALL array_hold(matrix_left%dist%d%thread_dist)
+            thread_dist_force = .TRUE.
+         ENDIF
       ENDIF
       !
       ! Compute number of images (rows and columns)
@@ -896,11 +906,15 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, &
          CALL mp_isync(comm, request_sync_mult)
       ENDIF
       !
-      IF (transpose_left) CALL dbcsr_release(matrix_left)
-      IF (transpose_right) CALL dbcsr_release(matrix_right)
       IF (release_tdist) THEN
          CALL dbcsr_distribution_no_threads(product_matrix%dist)
+      ELSEIF (thread_dist_force) THEN
+         ! Restore matrix_c thread-dist
+         matrix_c%dist%d%thread_dist = matrix_c_thread_dist
+         CALL array_release(matrix_left%dist%d%thread_dist)
       ENDIF
+      IF (transpose_left) CALL dbcsr_release(matrix_left)
+      IF (transpose_right) CALL dbcsr_release(matrix_right)
       !
       CALL dbcsr_release_locals(product_matrix)
       ! The index of the product matrix is reset to the CP2K form if it
diff --git a/tests/dbcsr_test_scale_by_vector.F b/tests/dbcsr_test_scale_by_vector.F
index 7df29deaf6d..248a0b87428 100644
--- a/tests/dbcsr_test_scale_by_vector.F
+++ b/tests/dbcsr_test_scale_by_vector.F
@@ -154,7 +154,7 @@ FUNCTION dbcsr_test_scale_by_vectors(test_name, mp_group, mp_env, npdims, io_uni
 
             !
             ! Prepare test parameters
-            success = test_scale_by_vector(mp_env, npdims, matrix, vector_data, do_exact_comparison)
+            success = test_scale_by_vector(mp_env, npdims, matrix, vector_data, do_exact_comparison) .AND. success
 
             IF (io_unit > 0) THEN
                IF (success) THEN