Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ocl: updated tuned parameters #842

Merged
merged 2 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions src/acc/acc_bench_smm.c
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,7 @@ int main(int argc, char* argv[]) {
#if defined(USE_LIBXSMM)
libxsmm_timer_tickint start;
int print_offset = 0;
char print_buffer[1024];
char print_buffer[1024] = "";
# if defined(__OPENCL)
const char* const env_smm_repeat = getenv("SMM_NREPEAT");
const int smm_nrepeat = (NULL == env_smm_repeat ? 1 : MAX(atoi(env_smm_repeat), 1));
Expand Down Expand Up @@ -497,7 +497,7 @@ int main(int argc, char* argv[]) {
if (maxdiff < epsilon && NULL != file) maxdiff = epsilon;
if (0 < epsilon) {
if (LIBXSMM_NOTNAN(diff.v_tst)) {
PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, fabs(diff.v_ref - diff.v_tst));
PRINTF(" (|%g-%g|=%g)\n", diff.v_ref, diff.v_tst, diff.linf_abs);
}
else {
PRINTF(" (%g)\n", diff.v_tst);
Expand All @@ -508,6 +508,7 @@ int main(int argc, char* argv[]) {
}
if (0 < check && check < epsilon) result = EXIT_FAILURE;
}
else fprintf(stderr, "ERROR: failed to validate!\n");
}
# endif
}
Expand Down
7 changes: 5 additions & 2 deletions src/acc/opencl/acc_opencl.c
Original file line number Diff line number Diff line change
Expand Up @@ -843,6 +843,9 @@ int c_dbcsr_acc_opencl_device_name(
cl_device_id device, char name[], size_t name_maxlen, char platform[], size_t platform_maxlen, int cleanup) {
int result_name = 0, result_platform = 0;
assert(NULL != name || NULL != platform);
if (NULL == device && 0 < c_dbcsr_acc_opencl_config.ndevices) {
device = c_dbcsr_acc_opencl_config.devices[0]; /* NULL-device refers to device 0 */
}
if (NULL != name && 0 != name_maxlen) {
result_name = clGetDeviceInfo(device, CL_DEVICE_NAME, name_maxlen, name, NULL);
if (0 != cleanup && EXIT_SUCCESS == result_name) {
Expand Down Expand Up @@ -1157,7 +1160,7 @@ int c_dbcsr_acc_opencl_set_active_device(ACC_OPENCL_LOCKTYPE* lock, int device_i

int c_dbcsr_acc_set_active_device(int device_id) {
int result = EXIT_SUCCESS;
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) && 0
int routine_handle;
static const char* const routine_name_ptr = LIBXSMM_FUNCNAME;
static const int routine_name_len = (int)sizeof(LIBXSMM_FUNCNAME) - 1;
Expand All @@ -1177,7 +1180,7 @@ int c_dbcsr_acc_set_active_device(int device_id) {
# if !defined(NDEBUG)
else result = EXIT_FAILURE;
# endif
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE)
# if defined(__DBCSR_ACC) && defined(ACC_OPENCL_PROFILE) && 0
c_dbcsr_timestop(&routine_handle);
# endif
ACC_OPENCL_RETURN(result);
Expand Down
122 changes: 48 additions & 74 deletions src/acc/opencl/smm/opencl_libsmm.c
Original file line number Diff line number Diff line change
Expand Up @@ -28,15 +28,20 @@
libxsmm_gemm_descriptor_dinit(BLOB, PREC, M, N, K, LDA, LDB, LDC, 1.0, 1.0, FLAGS, PREFETCH)
# endif

# if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) && defined(OPENCL_LIBSMM_VALIDATE) && \
(1 < OPENCL_LIBSMM_VALIDATE || 0 > OPENCL_LIBSMM_VALIDATE)
# define OPENCL_LIBSMM_VALIDATE_TRANS
# endif
# if !defined(OPENCL_LIBSMM_VALIDATE_SMM) && defined(OPENCL_LIBSMM_VALIDATE)
# define OPENCL_LIBSMM_VALIDATE_SMM
# endif
# if !defined(OPENCL_LIBSMM_VALIDATE_EXIT) && defined(OPENCL_LIBSMM_VALIDATE) && 1
# define OPENCL_LIBSMM_VALIDATE_EXIT
# if defined(OPENCL_LIBSMM_VALIDATE)
# if !defined(OPENCL_LIBSMM_VALIDATE_TRANS) && (1 < OPENCL_LIBSMM_VALIDATE || 0 > OPENCL_LIBSMM_VALIDATE)
# define OPENCL_LIBSMM_VALIDATE_TRANS
# endif
# if !defined(OPENCL_LIBSMM_VALIDATE_SMM)
# define OPENCL_LIBSMM_VALIDATE_SMM
# endif
# if !defined(OPENCL_LIBSMM_VALIDATE_EXIT) && 1
# define OPENCL_LIBSMM_VALIDATE_EXIT
# endif
# if !defined(OPENCL_LIBSMM_VALIDATE_SCRATCH)
# define OPENCL_LIBSMM_VALIDATE_SCRATCH(SIZE, ALIGN) /*libxsmm_aligned_scratch(SIZE, ALIGN)*/ malloc(SIZE)
# define OPENCL_LIBSMM_VALIDATE_FREE(PTR) /*libxsmm_free(PTR)*/ free(PTR)
# endif
# endif
# if !defined(OPENCL_LIBSMM_KERNELNAME_TRANS)
# define OPENCL_LIBSMM_KERNELNAME_TRANS "trans"
Expand Down Expand Up @@ -111,31 +116,6 @@ int opencl_libsmm_use_cmem(cl_device_id device) {
}


# if defined(OPENCL_LIBSMM_VALIDATE) && (0 != OPENCL_LIBSMM_VALIDATE)
/**
 * Print an m-by-n matrix to the given stream, one matrix row per output line.
 *
 * Elements are read as mat[i * n + j] (row index i, column index j) and are
 * printed with two fractional digits followed by a blank.
 *
 * ostream: destination stream.
 * label:   optional prefix printed in front of the first row (NULL is treated
 *          like an empty label); subsequent rows are indented by the width of
 *          the label such that all rows line up.
 * type:    element type; dbcsr_type_real_8 reads doubles, dbcsr_type_real_4
 *          reads floats, any other value prints "? " per element.
 * mat:     pointer to the first element.
 * m, n:    number of rows and columns.
 */
void opencl_libsmm_print_matrix(FILE* ostream, const char* label, libsmm_acc_data_t type, const void* mat, int m, int n) {
  int i, j;
  const char* const s = (NULL != label ? label : "");
  const int len = (int)strlen(s);
  for (i = 0; i < m; ++i) {
    if (0 < i) {
      /* Pad an empty string: yields exactly len blanks (none for an empty label).
       * Padding " " would emit one blank even for len=0 and misalign the rows. */
      fprintf(ostream, "%*s", len, "");
    }
    else {
      fprintf(ostream, "%s", s);
    }
    for (j = 0; j < n; ++j) {
      switch (type) {
        case dbcsr_type_real_8: fprintf(ostream, "%.2f ", ((const double*)mat)[i * n + j]); break;
        case dbcsr_type_real_4: fprintf(ostream, "%.2f ", ((const float*)mat)[i * n + j]); break;
        default: fprintf(ostream, "? "); break; /* unknown element type */
      }
    }
    fprintf(ostream, "\n");
  }
}
# endif


int opencl_libsmm_write_trans_params(FILE* stream, int only_key, const opencl_libsmm_transkey_t* key,
const opencl_libsmm_trans_t* config, const char* delim, const char* begin, const char* close) {
int result = 0;
Expand Down Expand Up @@ -786,7 +766,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
const size_t scratch_size = (sizeof(int) * offset_stack_size) /*stack*/
+ data_size /*imat*/ + data_size /*omat*/ + (mn * typesize) /*gold*/
+ 3 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/;
scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
scratch = OPENCL_LIBSMM_VALIDATE_SCRATCH(scratch_size, LIBXSMM_ALIGNMENT);
if (NULL != scratch) {
stack = (int*)scratch;
imat = (char*)LIBXSMM_UP2((uintptr_t)stack + sizeof(int) * offset_stack_size, LIBXSMM_ALIGNMENT);
Expand Down Expand Up @@ -855,20 +835,15 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
}
# if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
ACC_OPENCL_CHECK(c_dbcsr_acc_memcpy_d2h(dev_data, omat, data_size, stream), "transfer validation test", result);
# endif
# if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
ACC_OPENCL_CHECK(c_dbcsr_acc_stream_sync(stream), "sync stream", result);
# endif
# if defined(OPENCL_LIBSMM_VALIDATE_TRANS)
if (EXIT_SUCCESS == result) {
int i, j;
LIBXSMM_STDIO_ACQUIRE();
char print_buffer[2048] = "";
int print_offset = 0, i, j;
if (0 != c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr,
"libsmm_acc_transpose("
"offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)",
offset, stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n,
max_kernel_dim, stream);
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
"libsmm_acc_transpose(offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size,
dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n, max_kernel_dim,
stream);
}
for (i = offset; i < offset_stack_size; ++i) {
const size_t index = stack[i];
Expand All @@ -879,20 +854,12 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
libxsmm_itrans(gold, typesize, m, n, m, n);
if (0 != memcmp(gold, test, mn * typesize)) {
if (0 == c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr,
"libsmm_acc_transpose("
"offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)",
offset, stack_size, dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m,
n, max_kernel_dim, stream);
}
fprintf(stderr, " => ERROR\n");
if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr, "stackposition = %i (index=%llu)\n", i, (unsigned long long)index);
opencl_libsmm_print_matrix(stderr, "orig = ", datatype, orig, m, n);
opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold, n, m);
opencl_libsmm_print_matrix(stderr, "test = ", datatype, test, n, m);
fprintf(stderr, "\n");
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
"libsmm_acc_transpose(offset=%i, size=%i, type=%s, m=%i, n=%i, max=%i, stream=%p)", offset, stack_size,
dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m, n, max_kernel_dim,
stream);
}
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR\n");
# if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
exit(EXIT_FAILURE);
# else
Expand All @@ -903,7 +870,7 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
for (j = offset; j < i; ++j) {
const size_t duplicate = stack[j];
if (index == duplicate) {
fprintf(stderr, " => ERROR\n");
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR\n");
# if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
exit(EXIT_FAILURE);
# else
Expand All @@ -915,8 +882,10 @@ int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int stack_size, v
}
}
if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) {
fprintf(stderr, " => OK\n");
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => OK\n");
}
LIBXSMM_STDIO_ACQUIRE();
fputs(print_buffer, stderr);
LIBXSMM_STDIO_RELEASE();
}
libxsmm_free(scratch);
Expand Down Expand Up @@ -1342,7 +1311,7 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
&blob, precision, m_max, n_max, k_max, m_max, k_max, m_max, LIBXSMM_GEMM_FLAG_NONE, LIBXSMM_PREFETCH_NONE);
const size_t scratch_size = psize + asize + bsize + csize + csize + k_max * n_max * typesize +
5 * (LIBXSMM_ALIGNMENT - 1) /*alignments*/;
scratch = libxsmm_aligned_scratch(scratch_size, LIBXSMM_ALIGNMENT);
scratch = OPENCL_LIBSMM_VALIDATE_SCRATCH(scratch_size, LIBXSMM_ALIGNMENT);
if (NULL != desc && NULL != scratch) {
pinp = (int*)scratch;
ainp = (char*)LIBXSMM_UP2((uintptr_t)pinp + psize, LIBXSMM_ALIGNMENT);
Expand Down Expand Up @@ -1429,10 +1398,12 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
const char* const env_tol = getenv("OPENCL_LIBSMM_SMM_TOLERANCE");
const double tolerance = ((NULL == env_tol || '\0' == *env_tol) ? 1E-3 : atof(env_tol));
const int* const params = pinp + (4 <= nparams ? (nparams - 4) : 0);
char print_buffer[2048] = "";
int print_offset = 0;
size_t i;
LIBXSMM_STDIO_ACQUIRE();
if (0 != c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
"libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max,
max_kernel_dim, stream);
}
Expand All @@ -1458,20 +1429,21 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
# endif
if (tolerance < epsilon) {
if (0 == c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr, "libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
"libsmm_acc_process(size=%i, type=%s, m=%i, n=%i, k=%i, max=%i, stream=%p)", stack_size,
dbcsr_type_real_8 == datatype ? "f64" : (dbcsr_type_real_4 == datatype ? "f32" : "unknown"), m_max, n_max, k_max,
max_kernel_dim, stream);
}
# if LIBXSMM_VERSION4(1, 17, 0, 0) < LIBXSMM_VERSION_NUMBER
fprintf(stderr, " => ERROR diff=%g (%g != %g)\n", diff.linf_abs, diff.v_ref, diff.v_tst);
# else
fprintf(stderr, " => ERROR diff=%g\n", diff.linf_abs);
if (LIBXSMM_NOTNAN(diff.v_tst)) {
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset,
" => ERROR diff=%g (|%g-%g|=%g)\n", epsilon, diff.v_ref, diff.v_tst, diff.linf_abs);
}
else
# endif
if (3 <= c_dbcsr_acc_opencl_config.verbosity || 0 > c_dbcsr_acc_opencl_config.verbosity) {
fprintf(stderr, "stackposition = %llu (index=%llu)\n", (unsigned long long)i, (unsigned long long)ic);
opencl_libsmm_print_matrix(stderr, "gold = ", datatype, gold + ic, m_max, n_max);
opencl_libsmm_print_matrix(stderr, "test = ", datatype, test + ic, m_max, n_max);
fprintf(stderr, "\n");
{
print_offset += LIBXSMM_SNPRINTF(
print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => ERROR diff=%g\n", epsilon);
}
# if defined(OPENCL_LIBSMM_VALIDATE_EXIT)
exit(EXIT_FAILURE);
Expand All @@ -1482,8 +1454,10 @@ int libsmm_acc_process(const int* host_param_stack, const int* dev_param_stack,
}
}
if (0 != c_dbcsr_acc_opencl_config.verbosity && EXIT_SUCCESS == result) {
fprintf(stderr, " => OK\n");
print_offset += LIBXSMM_SNPRINTF(print_buffer + print_offset, sizeof(print_buffer) - print_offset, " => OK\n");
}
LIBXSMM_STDIO_ACQUIRE();
fputs(print_buffer, stderr);
LIBXSMM_STDIO_RELEASE();
}
libxsmm_free(scratch);
Expand Down
Loading