Skip to content

Commit

Permalink
Merge branch 'release-2.1.0-rc19'
Browse files Browse the repository at this point in the history
  • Loading branch information
alazzaro committed Dec 3, 2020
2 parents 9243ed1 + 2cf7c7e commit d622ca0
Show file tree
Hide file tree
Showing 13 changed files with 166 additions and 42 deletions.
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
MAJOR = 2
MINOR = 1
PATCH = 0-rc18
PATCH = 0-rc19
# A specific DATE (YYYY-MM-DD) marks an official release; otherwise
# the version is considered a development version.
DATE =
6 changes: 6 additions & 0 deletions src/acc/acc.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,12 @@

#include <stddef.h>

/*
 * Turn SYMBOL into a string literal. The extra level of indirection lets the
 * preprocessor expand SYMBOL first when it is itself a macro, so the string
 * contains the macro's replacement text rather than the macro's name; the
 * _AUX helper stringifies its argument verbatim.
 */
#define DBCSR_STRINGIFY_AUX(SYMBOL) #SYMBOL
#define DBCSR_STRINGIFY(SYMBOL) DBCSR_STRINGIFY_AUX(SYMBOL)
/*
 * Paste A and B into a single token. As above, going through the helper
 * (DBCSR_CONCATENATE2) lets macro arguments be expanded before '##' is
 * applied; the helper itself pastes its arguments verbatim.
 */
#define DBCSR_CONCATENATE2(A, B) A##B
#define DBCSR_CONCATENATE(A, B) DBCSR_CONCATENATE2(A, B)


#if defined(__cplusplus)
extern "C" {
#endif
Expand Down
136 changes: 112 additions & 24 deletions src/acc/acc_bench_smm.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
#if !defined(ELEM_TYPE)
# define ELEM_TYPE double
#endif
#if !defined(EPSILON)
# define EPSILON 1E-3
#endif
#if !defined(MAX_KERNEL_DIM)
# define MAX_KERNEL_DIM 80
#endif
Expand Down Expand Up @@ -67,44 +70,66 @@ int main(int argc, char* argv[])
const int mn = m * n, mk = m * k, kn = k * n;
#endif
#if defined(WARMUP) && (0 < WARMUP) && !defined(_DEBUG)
const int warmup = WARMUP;
const int warmup = MAX(WARMUP, 2) / 2 * 2;
#else
const int warmup = 0;
#endif
int *stack_hst = NULL, *stack_dev = NULL;
int *stack_hst = NULL, *stack_dev = NULL, *trans_hst = NULL, *trans_dev = NULL;
ELEM_TYPE *amat_hst = NULL, *bmat_hst = NULL, *cmat_hst = NULL;
ELEM_TYPE *amat_dev = NULL, *bmat_dev = NULL, *cmat_dev = NULL;
int result = EXIT_SUCCESS, r, i;
int result = EXIT_SUCCESS, ndevices = 0, r, i;
void *stream = NULL;
#if defined(USE_LIBXSMM)
libxsmm_timer_tickint start;
double duration;
double duration, transpose;
#endif
assert(m <= (mn / n) && 0 == (mn % n) && k <= (mk / k) && 0 == (mk % k) && n <= (kn / n) && 0 == (kn % n));
printf("%s%s%i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n, k);
printf("%s%s%i %i %i %i %i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "",
nrepeat, stack_size, m, n, k, nc, na, nb);
CHECK(acc_init(), &result);
CHECK(acc_get_ndevices(&ndevices), &result);
if (0 < ndevices) {
#if defined(_DEBUG)
fprintf(stderr, "number of devices found: %i\n", ndevices);
#endif
}
else {
#if defined(_DEBUG)
fprintf(stderr, "Error: no device found!\n");
#endif
CHECK(acc_finalize(), NULL);
return result;
}
printf("element type: %s\n", DBCSR_STRINGIFY(ELEM_TYPE));
CHECK(acc_stream_create(&stream, "stream", -1/*default priority*/), &result);
CHECK(acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * stack_size, stream), &result);
CHECK(acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * stack_size, stream), &result);
CHECK(acc_host_mem_allocate((void**)&cmat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
CHECK(acc_host_mem_allocate((void**)&amat_hst, sizeof(ELEM_TYPE) * mk * na, stream), &result);
CHECK(acc_host_mem_allocate((void**)&bmat_hst, sizeof(ELEM_TYPE) * kn * nb, stream), &result);
CHECK(acc_host_mem_allocate((void**)&cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
CHECK(acc_host_mem_allocate((void**)&stack_hst, sizeof(int) * 3 * stack_size, stream), &result);
CHECK(acc_host_mem_allocate((void**)&trans_hst, sizeof(int) * nb, stream), &result);
CHECK(acc_stream_sync(stream), &result); /* ensure host-data is allocated */
for (i = 0; i < stack_size; ++i) { /* initialize matrices */
/* initialize matrices */
for (i = 0; i < na; ++i) {
init(i/*seed*/ + 42, &amat_hst[i*mk], m, k);
}
for (i = 0; i < nb; ++i) {
init(i/*seed*/ + 24, &bmat_hst[i*kn], k, n);
trans_hst[i] = i * kn;
}
init_stack(stack_hst, stack_size, mn, mk, kn, nc, na, nb);
CHECK(acc_dev_mem_allocate((void**)&amat_dev, sizeof(ELEM_TYPE) * mk * stack_size), &result);
CHECK(acc_dev_mem_allocate((void**)&bmat_dev, sizeof(ELEM_TYPE) * kn * stack_size), &result);
CHECK(acc_dev_mem_allocate((void**)&cmat_dev, sizeof(ELEM_TYPE) * mn * stack_size), &result);
CHECK(acc_dev_mem_allocate((void**)&amat_dev, sizeof(ELEM_TYPE) * mk * na), &result);
CHECK(acc_dev_mem_allocate((void**)&bmat_dev, sizeof(ELEM_TYPE) * kn * nb), &result);
CHECK(acc_dev_mem_allocate((void**)&cmat_dev, sizeof(ELEM_TYPE) * mn * nc), &result);
CHECK(acc_dev_mem_allocate((void**)&stack_dev, sizeof(int) * 3 * stack_size), &result);
CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
CHECK(acc_dev_mem_allocate((void**)&trans_dev, sizeof(int) * nb), &result);
CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
CHECK(acc_memcpy_h2d(trans_hst, trans_dev, sizeof(int) * nb, stream), &result);
#if defined(USE_LIBXSMM)
CHECK(acc_stream_sync(stream), &result);
start = libxsmm_timer_tick();
#endif
CHECK(acc_memcpy_h2d(amat_hst, amat_dev, sizeof(ELEM_TYPE) * mk * stack_size, stream), &result);
CHECK(acc_memcpy_h2d(bmat_hst, bmat_dev, sizeof(ELEM_TYPE) * kn * stack_size, stream), &result);
CHECK(acc_memcpy_h2d(amat_hst, amat_dev, sizeof(ELEM_TYPE) * mk * na, stream), &result);
CHECK(acc_memcpy_h2d(bmat_hst, bmat_dev, sizeof(ELEM_TYPE) * kn * nb, stream), &result);
CHECK(acc_memcpy_h2d(stack_hst, stack_dev, sizeof(int) * 3 * stack_size, stream), &result);
#if defined(USE_LIBXSMM)
CHECK(acc_stream_sync(stream), &result);
Expand All @@ -113,55 +138,118 @@ int main(int argc, char* argv[])
(sizeof(ELEM_TYPE) * (mk + kn) + sizeof(int) * 3)
* stack_size / (duration * (1ULL << 30)));
#endif
/* warmup execution and prebuild JIT kernels */
/* warmup execution and prebuild transpose-kernel */
for (r = 0; r < warmup / 2; ++r) {
CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result);
CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
DBCSR_TYPE(ELEM_TYPE), n, k, MAX_KERNEL_DIM, stream), &result);
}
#if defined(USE_LIBXSMM)
CHECK(acc_stream_sync(stream), &result);
start = libxsmm_timer_tick();
#endif
/* to perform NN-SMMs on the device, all B-matrices are transposed upfront (SMM-kernel is limited to NT) */
CHECK(libsmm_acc_transpose(trans_dev, 0/*offset*/, nb, bmat_dev,
DBCSR_TYPE(ELEM_TYPE), k, n, MAX_KERNEL_DIM, stream), &result);
#if defined(USE_LIBXSMM)
CHECK(acc_stream_sync(stream), &result);
transpose = libxsmm_timer_duration(start, libxsmm_timer_tick());
#endif
/* warmup execution and prebuild SMM-kernel */
for (r = 0; r < warmup; ++r) {
CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3/*nparams*/, DBCSR_TYPE(ELEM_TYPE),
amat_dev, bmat_dev, cmat_dev, m, n, k, MAX_KERNEL_DIM, 1/*homogeneous*/, stream, stream), &result);
}
CHECK(acc_memset_zero(cmat_dev, 0/*offset*/, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
#if defined(USE_LIBXSMM)
CHECK(acc_stream_sync(stream), &result);
start = libxsmm_timer_tick();
#endif
for (r = 0; r < nrepeat; ++r) {
/* GPU-kernel is limited to C += Ai * Bi^T (i.e., NT, for NN, all Bi must be transposed upfront) */
/* GPU-kernel is limited to C += Ai * Bi^T, i.e., NT (for NN, all Bi must be transposed upfront) */
CHECK(libsmm_acc_process(stack_hst, stack_dev, stack_size, 3/*nparams*/, DBCSR_TYPE(ELEM_TYPE),
amat_dev, bmat_dev, cmat_dev, m, n, k, MAX_KERNEL_DIM, 1/*homogeneous*/, stream, stream), &result);
}
#if defined(USE_LIBXSMM)
CHECK(acc_stream_sync(stream), &result);
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
if (EXIT_SUCCESS == result) {
const char transa = 'N', transb = 'T';
ELEM_TYPE *const gold_hst = (ELEM_TYPE*)libxsmm_malloc(sizeof(ELEM_TYPE) * mn * nc);
const char transa = 'N', transb = 'N';
const ELEM_TYPE alpha = 1, beta = 1;
printf("transpose: %.1f ms %.1f GFLOPS/s\n", 1000.0 * (duration + transpose) / nrepeat,
((size_t)2 * m * n * k) * stack_size / ((duration + transpose) * (1ULL << 30) / nrepeat));
printf("device: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat,
((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat));
memset(cmat_hst, 0, sizeof(ELEM_TYPE) * mn * stack_size);
memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc);
for (r = 0; r < warmup; ++r) {
libxsmm_gemm_batch_omp(LIBXSMM_GEMM_PRECISION(ELEM_TYPE), LIBXSMM_GEMM_PRECISION(ELEM_TYPE),
&transa, &transb, m, n, k, &alpha, amat_hst, &m/*lda*/, bmat_hst, &k/*ldb*/,
&beta, gold_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
stack_hst + 0, stack_hst + 1, stack_hst + 2, stack_size);
}
memset(gold_hst, 0, sizeof(ELEM_TYPE) * mn * nc);
start = libxsmm_timer_tick();
/* CPU-kernel operates on data that is not initialized in NUMA-aware fashion */
for (r = 0; r < nrepeat; ++r) {
/* CPU-kernel performs C += Ai * Bi^T to match result of GPU-kernel (NT may perform below NN) */
libxsmm_gemm_batch_omp(LIBXSMM_GEMM_PRECISION(ELEM_TYPE), LIBXSMM_GEMM_PRECISION(ELEM_TYPE),
&transa, &transb, m, n, k, &alpha, amat_hst, &m/*lda*/, bmat_hst, &k/*ldb*/,
&beta, cmat_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
&beta, gold_hst, &m/*ldc*/, 1/*index_base*/, sizeof(int) * 3,
stack_hst + 0, stack_hst + 1, stack_hst + 2, stack_size);
}
duration = libxsmm_timer_duration(start, libxsmm_timer_tick());
printf("host: %.1f ms %.1f GFLOPS/s\n", 1000.0 * duration / nrepeat,
((size_t)2 * m * n * k) * stack_size / (duration * (1ULL << 30) / nrepeat));
/* transfer result from device back to host for validation */
CHECK(acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
/* transfer result from device to host for validation */
CHECK(acc_memcpy_d2h(cmat_dev, cmat_hst, sizeof(ELEM_TYPE) * mn * nc, stream), &result);
CHECK(acc_stream_sync(stream), &result);
/* TODO: validation code TBD */
if (EXIT_SUCCESS == result) {
double abserror = 0, relerror = 0;
for (i = 0; i < nc; ++i) {
const ELEM_TYPE *const gold = gold_hst + mn * i;
const ELEM_TYPE *const test = cmat_hst + mn * i;
double diff = 0, a = 0, b = 0;
for (r = 0; r < (m * n); ++r) {
const double ar = (double)gold[r];
const double br = (double)test[r];
const double d = fabs(ar - br);
if (d > diff) {
diff = d;
a = ar;
b = br;
}
}
if (0 < diff) {
# if defined(_DEBUG)
print(stderr, "gold = ", gold, m, n);
print(stderr, "test = ", test, m, n);
fprintf(stderr, "diff = %g (%g != %g)\n", diff, a, b);
# endif
if (abserror < diff) {
relerror = fabs(0 != a ? (diff / a) : (diff / b));
abserror = diff;
}
}
}
printf("max.error: rel=%g\n", relerror);
if (EPSILON < relerror) result = EXIT_FAILURE;
}
libxsmm_free(gold_hst);
}
#endif
CHECK(acc_host_mem_deallocate(stack_hst, stream), NULL);
CHECK(acc_host_mem_deallocate(trans_hst, stream), NULL);
CHECK(acc_host_mem_deallocate(amat_hst, stream), NULL);
CHECK(acc_host_mem_deallocate(bmat_hst, stream), NULL);
CHECK(acc_host_mem_deallocate(cmat_hst, stream), NULL);
CHECK(acc_dev_mem_deallocate(stack_dev), NULL);
CHECK(acc_dev_mem_deallocate(trans_dev), NULL);
CHECK(acc_dev_mem_deallocate(amat_dev), NULL);
CHECK(acc_dev_mem_deallocate(bmat_dev), NULL);
CHECK(acc_dev_mem_deallocate(cmat_dev), NULL);
CHECK(acc_stream_destroy(stream), NULL);
CHECK(acc_finalize(), NULL);
if (EXIT_SUCCESS != result) {
fprintf(stderr, "FAILED\n");
}
Expand Down
26 changes: 20 additions & 6 deletions src/acc/acc_bench_trans.c
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ int main(int argc, char* argv[])
#endif
int *stack_hst = NULL, *stack_dev = NULL;
ELEM_TYPE *mat_hst = NULL, *mat_dev = NULL;
int result = EXIT_SUCCESS, r, i, mm = m, nn = n;
int result = EXIT_SUCCESS, ndevices = 0, r, i, mm = m, nn = n;
void *stream = NULL;
#if defined(USE_LIBXSMM)
libxsmm_timer_tickint start;
Expand All @@ -84,6 +84,20 @@ int main(int argc, char* argv[])
assert(m <= (mn / n) && 0 == (mn % n));
printf("%s%s%i %i %i %i\n", 0 < argc ? argv[0] : "", 0 < argc ? " " : "", nrepeat, stack_size, m, n);
CHECK(acc_init(), &result);
CHECK(acc_get_ndevices(&ndevices), &result);
if (0 < ndevices) {
#if defined(_DEBUG)
fprintf(stderr, "number of devices found: %i\n", ndevices);
#endif
}
else {
#if defined(_DEBUG)
fprintf(stderr, "Error: no device found!\n");
#endif
CHECK(acc_finalize(), NULL);
return result;
}
printf("element type: %s\n", DBCSR_STRINGIFY(ELEM_TYPE));
#if defined(PRIORITY)
CHECK(acc_stream_priority_range(&priomin, &priomax), &result);
CHECK(acc_stream_create(&stream, "stream", (priomin + priomax) / 2), &result);
Expand Down Expand Up @@ -154,24 +168,23 @@ int main(int argc, char* argv[])
printf("host: %.1f ms %.1f GB/s\n", 1000.0 * duration / nodd,
(sizeof(ELEM_TYPE) * mn + sizeof(int))
* stack_size / (duration * (1ULL << 30) / nodd));
/* transfer result from device back to host for validation */
/* transfer result from device to host for validation */
CHECK(acc_memcpy_d2h(mat_dev, mat_hst,
sizeof(ELEM_TYPE) * mn * stack_size, stream), &result);
CHECK(acc_stream_sync(stream), &result);
if (EXIT_SUCCESS == result) {
unsigned int nerrors = 0;
int j;
for (i = 0; i < stack_size; ++i) {
ELEM_TYPE gold[MAX_KERNEL_DIM*MAX_KERNEL_DIM];
const ELEM_TYPE *const test = mat_hst + mn * i;
init(i/*seed*/, gold, m, n);
libxsmm_itrans(gold, sizeof(ELEM_TYPE), m, n, m, n);
for (j = 0; j < (m * n); ++j) {
if (gold[j] != test[j]) {
for (r = 0; r < (m * n); ++r) {
if (gold[r] != test[r]) {
++nerrors;
# if defined(_DEBUG)
print(stderr, "gold = ", gold, n, m);
print(stderr, "this = ", test, n, m);
print(stderr, "test = ", test, n, m);
init(i/*seed*/, gold, m, n);
print(stderr, "orig = ", gold, m, n);
fprintf(stderr, "\n");
Expand All @@ -190,6 +203,7 @@ int main(int argc, char* argv[])
CHECK(acc_dev_mem_deallocate(stack_dev), NULL);
CHECK(acc_dev_mem_deallocate(mat_dev), NULL);
CHECK(acc_stream_destroy(stream), NULL);
CHECK(acc_finalize(), NULL);
if (EXIT_SUCCESS != result) {
fprintf(stderr, "FAILED\n");
}
Expand Down
2 changes: 1 addition & 1 deletion src/acc/acc_libsmm.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@

#include "acc.h"

#define DBCSR_CONCATENATE(A, B) A##B
#define DBCSR_TYPE(T) DBCSR_CONCATENATE(DBCSR_TYPE_, T)
#define DBCSR_TYPE_double dbcsr_type_real_8
#define DBCSR_TYPE_float dbcsr_type_real_4
Expand All @@ -29,6 +28,7 @@ typedef enum libsmm_acc_data_t {
} libsmm_acc_data_t;

int libsmm_acc_init(void);
int libsmm_acc_finalize(void);
acc_bool_t libsmm_acc_is_thread_safe(void);

int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size,
Expand Down
2 changes: 1 addition & 1 deletion src/acc/cuda/acc_init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,5 @@ extern "C" int acc_finalize(){
ACC_API_CALL(GetDevice, (&myDevice));
ACC_DRV_CALL(DeviceGet, (&acc_device, myDevice));
ACC_DRV_CALL(DevicePrimaryCtxRelease, (acc_device));
return 0;
return libsmm_acc_finalize();
}
2 changes: 1 addition & 1 deletion src/acc/libsmm_acc/PACKAGE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"description": "Generic GPU-accelerated library for small matrix multiplications",
"description": "CUDA/HIP-accelerated library for small matrix multiplications",
"archive": "libdbcsr",
"requires": ["..", "../cuda", "../hip"]
}
1 change: 1 addition & 0 deletions src/acc/libsmm_acc/libcusmm/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
cusmm_kernels.h
2 changes: 1 addition & 1 deletion src/acc/libsmm_acc/libcusmm/PACKAGE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"description": "Cuda accelerated Small Matrix Multiplications",
"archive": "libdbcsr",
"requires": ["kernels", "../include", "../../include"]
"requires": ["kernels", "..", "../../include"]
}
4 changes: 2 additions & 2 deletions src/acc/libsmm_acc/libsmm_acc_init.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ int libsmm_acc_gpu_blas_init(){


//===========================================================================
int libsmm_acc_init() {
extern "C" int libsmm_acc_init() {
#if !defined(NO_DBCSR_TIMESET)
std::string routineN = "libsmm_acc_init";
int handle;
Expand All @@ -71,7 +71,7 @@ int libsmm_acc_init() {


//===========================================================================
int libsmm_acc_finalize() {
extern "C" int libsmm_acc_finalize() {
#if !defined(NO_DBCSR_TIMESET)
std::string routineN = "libsmm_acc_finalize";
int handle;
Expand Down
1 change: 1 addition & 0 deletions src/acc/libsmm_acc/libsmm_acc_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ void timestop(int handle);
#endif

extern "C" int libsmm_acc_init (void);
extern "C" int libsmm_acc_finalize (void);

int libsmm_acc_gpu_blas_init();

Expand Down
Loading

0 comments on commit d622ca0

Please sign in to comment.