diff --git a/VERSION b/VERSION index 92fe7407f5b..a7d5dde36dc 100644 --- a/VERSION +++ b/VERSION @@ -1,6 +1,6 @@ MAJOR = 2 MINOR = 1 -PATCH = 0-rc15 +PATCH = 0-rc16 # A specific DATE (YYYY-MM-DD) fixes an official release, otherwise # it is considered Development version. DATE = diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1b3ffc7e464..6fbe1fb5466 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -106,24 +106,27 @@ add_fypp_sources(DBCSR_SRCS work/dbcsr_work_operations.F ) -set(DBCSR_ACC_SRCS - acc/acc_dev.cpp - acc/acc_error.cpp - acc/acc_event.cpp - acc/acc_init.cpp - acc/acc_mem.cpp - acc/acc_stream.cpp - ) - set(DBCSR_CUDA_SRCS acc/cuda/acc_cuda.cpp acc/cuda/dbcsr_cuda_nvtx_cu.cu acc/cublaswrap/cublas.cu + acc/cuda/acc_dev.cpp + acc/cuda/acc_error.cpp + acc/cuda/acc_event.cpp + acc/cuda/acc_init.cpp + acc/cuda/acc_mem.cpp + acc/cuda/acc_stream.cpp ) set(DBCSR_HIP_SRCS acc/hip/acc_hip.cpp acc/hipblaswrap/hipblas.cpp + acc/cuda/acc_dev.cpp + acc/cuda/acc_error.cpp + acc/cuda/acc_event.cpp + acc/cuda/acc_init.cpp + acc/cuda/acc_mem.cpp + acc/cuda/acc_stream.cpp ) # set the __SHORT_FILE__ per file for dbcsr sources @@ -193,7 +196,7 @@ if (MPI_FOUND) endif () # ================================================================================================= -# DBCSR's OpenMP/offload backend +# Link OpenMP runtime library even if DBCSR main code is not built with OpenMP if (OpenMP_FOUND) target_link_libraries(dbcsr PRIVATE OpenMP::OpenMP_Fortran) @@ -243,7 +246,7 @@ if (USE_CUDA) target_link_libraries(dbcsr PRIVATE nvrtc) # Complete list of GPU-support sources - set(DBCSR_ACC_SRCS ${DBCSR_ACC_SRCS} ${DBCSR_CUDA_SRCS}) + set(DBCSR_ACC_SRCS ${DBCSR_CUDA_SRCS}) # Make an object library add_library(acc OBJECT ${DBCSR_ACC_SRCS}) @@ -284,7 +287,7 @@ if (USE_HIP) target_link_libraries(dbcsr PUBLIC ${ROCM_HIPRTC_LIB}) # Complete list of GPU-support sources - set(DBCSR_ACC_SRCS ${DBCSR_ACC_SRCS} ${DBCSR_HIP_SRCS}) + set(DBCSR_ACC_SRCS ${DBCSR_HIP_SRCS}) # Compile the rest of the HIP source files into a static library set_source_files_properties(${DBCSR_ACC_SRCS} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) diff --git a/src/acc/PACKAGE b/src/acc/PACKAGE index 7e5b5cee6dd..c75a1ba4006 100644 --- a/src/acc/PACKAGE +++ b/src/acc/PACKAGE @@ -1,5 +1,5 @@ { -"description": "Generic Accelerator API", +"description": "Generic accelerator API", "archive": "libdbcsr", -"requires": ["../base", "include", "cuda", "hip", "libsmm_acc"] +"requires": ["../base", "cuda", "hip", "libsmm_acc"] } diff --git a/src/acc/include/acc.h b/src/acc/acc.h similarity index 95% rename from src/acc/include/acc.h rename to src/acc/acc.h index e32f51bb059..b334d414884 100644 --- a/src/acc/include/acc.h +++ b/src/acc/acc.h @@ -11,7 +11,7 @@ #include -#ifdef __cplusplus +#if defined(__cplusplus) extern "C" { #endif @@ -28,10 +28,10 @@ typedef enum acc_data_t { ACC_DATA_UNKNOWN = -1 } acc_data_t; -/** accelerator driver initialization and finalization */ +/** initialization and finalization */ int acc_init(void); int acc_finalize(void); -int acc_clear_errors(void); +void acc_clear_errors(void); /** devices */ int acc_get_ndevices(int* n_devices); @@ -63,7 +63,7 @@ int acc_memcpy_d2d(const void* devmem_src, void* devmem_dst, size_t count, acc_s int acc_memset_zero(void* dev_mem, size_t offset, size_t length, acc_stream_t* stream); int acc_dev_mem_info(size_t* mem_free, size_t* mem_total); -#ifdef __cplusplus +#if defined(__cplusplus) } #endif diff --git a/src/acc/include/acc_libsmm.h b/src/acc/acc_libsmm.h similarity 
index 94% rename from src/acc/include/acc_libsmm.h rename to src/acc/acc_libsmm.h index c121b437e0b..9f4e9c69cb7 100644 --- a/src/acc/include/acc_libsmm.h +++ b/src/acc/acc_libsmm.h @@ -11,7 +11,7 @@ #include "acc.h" -#ifdef __cplusplus +#if defined(__cplusplus) extern "C" { #endif @@ -21,7 +21,7 @@ typedef struct libsmm_acc_stack_descriptor_type { } libsmm_acc_stack_descriptor_type; int libsmm_acc_init(void); -int libsmm_acc_is_thread_safe(void); +acc_bool_t libsmm_acc_is_thread_safe(void); int libsmm_acc_transpose(const int* dev_trs_stack, int offset, int nblks, void* dev_data, acc_data_t datatype, int m, int n, acc_stream_t* stream); @@ -30,7 +30,7 @@ int libsmm_acc_process(const libsmm_acc_stack_descriptor_type* dev_param_stack, int nparams, acc_data_t datatype, const void* dev_a_data, const void* dev_b_data, void* dev_c_data, int m_max, int n_max, int k_max, acc_bool_t def_mnk, acc_stream_t* stream); -#ifdef __cplusplus +#if defined(__cplusplus) } #endif diff --git a/src/acc/cublaswrap/cublas.cu b/src/acc/cublaswrap/cublas.cu index 28a9801af34..fdfacd6ee11 100644 --- a/src/acc/cublaswrap/cublas.cu +++ b/src/acc/cublaswrap/cublas.cu @@ -9,7 +9,7 @@ #include #include "cublas_v2.h" -#include "../acc_error.h" +#include "../cuda/acc_error.h" /****************************************************************************/ diff --git a/src/acc/cuda/PACKAGE b/src/acc/cuda/PACKAGE index b8deaad8a96..53587ad1bab 100644 --- a/src/acc/cuda/PACKAGE +++ b/src/acc/cuda/PACKAGE @@ -1,5 +1,5 @@ { -"description": "Cuda backend for accelerator api", +"description": "Cuda backend for accelerator API", "archive":"libdbcsr", -"requires": ["../../base", "../include"] +"requires": ["../../base", ".."] } diff --git a/src/acc/acc_dev.cpp b/src/acc/cuda/acc_dev.cpp similarity index 91% rename from src/acc/acc_dev.cpp rename to src/acc/cuda/acc_dev.cpp index 682f51f3a6d..b795fe319c4 100644 --- a/src/acc/acc_dev.cpp +++ b/src/acc/cuda/acc_dev.cpp @@ -7,16 +7,17 @@ * SPDX-License-Identifier: GPL-2.0+ * *------------------------------------------------------------------------------------------------*/ -#ifdef __CUDA -#include "cuda/acc_cuda.h" -#else -#include "hip/acc_hip.h" +#if defined(__CUDA) +# include "acc_cuda.h" +#elif defined(__HIP) +# include "../hip/acc_hip.h" #endif +#include "acc_error.h" +#include "../acc.h" + #include #include -#include "acc_error.h" -#include "include/acc.h" // for debug purpose static const int verbose_print = 1; @@ -42,7 +43,7 @@ extern "C" int acc_set_active_device(int device_id){ // establish context ACC_API_CALL(Free, (0)); -#ifdef __HIP_PLATFORM_NVCC__ +#if defined(__HIP_PLATFORM_NVCC__) if (verbose_print){ ACC_API_CALL(DeviceSetLimit, (ACC(LimitPrintfFifoSize), (size_t) 1000000000)); } diff --git a/src/acc/acc_error.cpp b/src/acc/cuda/acc_error.cpp similarity index 92% rename from src/acc/acc_error.cpp rename to src/acc/cuda/acc_error.cpp index c334071c804..4c9804c7c99 100644 --- a/src/acc/acc_error.cpp +++ b/src/acc/cuda/acc_error.cpp @@ -7,15 +7,16 @@ * SPDX-License-Identifier: GPL-2.0+ * *------------------------------------------------------------------------------------------------*/ -#ifdef __CUDA -#include "cuda/acc_cuda.h" -#else -#include "hip/acc_hip.h" +#if defined(__CUDA) +# include "acc_cuda.h" +#elif defined(__HIP) +# include "../hip/acc_hip.h" #endif +#include "acc_error.h" + #include #include -#include "acc_error.h" /****************************************************************************/ int acc_error_check (ACC(Error_t) error){ diff --git 
a/src/acc/acc_error.h b/src/acc/cuda/acc_error.h similarity index 90% rename from src/acc/acc_error.h rename to src/acc/cuda/acc_error.h index 5b3500f9a56..32e57c96a65 100644 --- a/src/acc/acc_error.h +++ b/src/acc/cuda/acc_error.h @@ -7,10 +7,10 @@ * SPDX-License-Identifier: GPL-2.0+ * *------------------------------------------------------------------------------------------------*/ -#ifdef __CUDA -#include "cuda/acc_cuda.h" -#else -#include "hip/acc_hip.h" +#if defined(__CUDA) +# include "acc_cuda.h" +#elif defined(__HIP) +# include "../hip/acc_hip.h" #endif int acc_error_check (ACC(Error_t) acc_error); diff --git a/src/acc/acc_event.cpp b/src/acc/cuda/acc_event.cpp similarity index 96% rename from src/acc/acc_event.cpp rename to src/acc/cuda/acc_event.cpp index 9bf389ba208..0a8745b685b 100644 --- a/src/acc/acc_event.cpp +++ b/src/acc/cuda/acc_event.cpp @@ -7,16 +7,17 @@ * SPDX-License-Identifier: GPL-2.0+ * *------------------------------------------------------------------------------------------------*/ -#ifdef __CUDA -#include "cuda/acc_cuda.h" -#else -#include "hip/acc_hip.h" +#if defined(__CUDA) +# include "acc_cuda.h" +#elif defined(__HIP) +# include "../hip/acc_hip.h" #endif +#include "acc_error.h" +#include "../acc.h" + #include #include -#include "acc_error.h" -#include "include/acc.h" static const int verbose_print = 0; diff --git a/src/acc/acc_init.cpp b/src/acc/cuda/acc_init.cpp similarity index 89% rename from src/acc/acc_init.cpp rename to src/acc/cuda/acc_init.cpp index 8d1690ab05e..0b578d6ef10 100644 --- a/src/acc/acc_init.cpp +++ b/src/acc/cuda/acc_init.cpp @@ -7,18 +7,19 @@ * SPDX-License-Identifier: GPL-2.0+ * *------------------------------------------------------------------------------------------------*/ -#ifdef __CUDA -#include "cuda/acc_cuda.h" -#else -#include "hip/acc_hip.h" +#if defined(__CUDA) +# include "acc_cuda.h" +#elif defined(__HIP) +# include "../hip/acc_hip.h" #endif +#include "../acc.h" +#include "../acc_libsmm.h" + #include -#include "include/acc.h" -#include "include/acc_libsmm.h" -#ifdef __CUDA_PROFILING -#include +#if defined(__CUDA_PROFILING) +# include #endif /****************************************************************************/ diff --git a/src/acc/acc_mem.cpp b/src/acc/cuda/acc_mem.cpp similarity index 97% rename from src/acc/acc_mem.cpp rename to src/acc/cuda/acc_mem.cpp index 2e4ea8d991e..52168e020a9 100644 --- a/src/acc/acc_mem.cpp +++ b/src/acc/cuda/acc_mem.cpp @@ -7,16 +7,17 @@ * SPDX-License-Identifier: GPL-2.0+ * *------------------------------------------------------------------------------------------------*/ -#ifdef __CUDA -#include "cuda/acc_cuda.h" -#else -#include "hip/acc_hip.h" +#if defined(__CUDA) +# include "acc_cuda.h" +#elif defined(__HIP) +# include "../hip/acc_hip.h" #endif +#include "acc_error.h" +#include "../acc.h" + #include #include -#include "acc_error.h" -#include "include/acc.h" static const int verbose_print = 0; diff --git a/src/acc/acc_stream.cpp b/src/acc/cuda/acc_stream.cpp similarity index 92% rename from src/acc/acc_stream.cpp rename to src/acc/cuda/acc_stream.cpp index 39bb0813d27..4379d234b55 100644 --- a/src/acc/acc_stream.cpp +++ b/src/acc/cuda/acc_stream.cpp @@ -7,19 +7,20 @@ * SPDX-License-Identifier: GPL-2.0+ * *------------------------------------------------------------------------------------------------*/ -#ifdef __CUDA -#include "cuda/acc_cuda.h" -#else -#include "hip/acc_hip.h" +#if defined(__CUDA) +# include "acc_cuda.h" +#elif defined(__HIP) +# include "../hip/acc_hip.h" #endif 
+#include "acc_error.h" +#include "../acc.h" + #include #include -#include "acc_error.h" -#include "include/acc.h" -#ifdef __CUDA_PROFILING -#include +#if defined(__CUDA_PROFILING) +# include #endif static const int verbose_print = 0; @@ -53,7 +54,7 @@ extern "C" int acc_stream_create(void** stream_p, const char* name, int priority if (acc_error_check(cErr)) return -1; if (acc_error_check(ACC(GetLastError)())) return -1; -#ifdef __CUDA_PROFILING +#if defined(__CUDA_PROFILING) nvtxNameCudaStreamA(*acc_stream, name); #endif diff --git a/src/acc/hip/PACKAGE b/src/acc/hip/PACKAGE index 5f1b63a7522..a7cfed379b1 100644 --- a/src/acc/hip/PACKAGE +++ b/src/acc/hip/PACKAGE @@ -1,5 +1,5 @@ { -"description": "HIP backend for accelerator api", +"description": "HIP backend for accelerator API", "archive":"libdbcsr", -"requires": ["../../base", "../include"] +"requires": ["../../base", "..", "../cuda"] } diff --git a/src/acc/hipblaswrap/hipblas.cpp b/src/acc/hipblaswrap/hipblas.cpp index 0bb6beb7193..8e0ad1ae03b 100644 --- a/src/acc/hipblaswrap/hipblas.cpp +++ b/src/acc/hipblaswrap/hipblas.cpp @@ -11,7 +11,7 @@ #include "hipblas_v2.h" #include #include -#include "../acc_error.h" +#include "../cuda/acc_error.h" /****************************************************************************/ diff --git a/src/acc/include/PACKAGE b/src/acc/include/PACKAGE deleted file mode 100644 index 56dc03ff445..00000000000 --- a/src/acc/include/PACKAGE +++ /dev/null @@ -1,5 +0,0 @@ -{ -"description": "Accelerator backend API and API for accelerated LIBSMM", -"archive": "libdbcsr", -"requires": [] -} diff --git a/src/acc/libsmm_acc/PACKAGE b/src/acc/libsmm_acc/PACKAGE index 7b314532e6c..9d4dcf3960c 100644 --- a/src/acc/libsmm_acc/PACKAGE +++ b/src/acc/libsmm_acc/PACKAGE @@ -1,5 +1,5 @@ { "description": "Generic GPU-accelerated library for small matrix multiplications", "archive": "libdbcsr", -"requires": ["../include", "../cuda", "../hip"] +"requires": ["..", "../cuda", "../hip"] } diff --git a/src/acc/libsmm_acc/kernels/smm_acc_common.h b/src/acc/libsmm_acc/kernels/smm_acc_common.h index f49053207ce..766b8c1055e 100644 --- a/src/acc/libsmm_acc/kernels/smm_acc_common.h +++ b/src/acc/libsmm_acc/kernels/smm_acc_common.h @@ -7,10 +7,8 @@ * SPDX-License-Identifier: GPL-2.0+ * *------------------------------------------------------------------------------------------------*/ -#ifdef __HIP -#if not defined(__HIP_PLATFORM_NVCC__) -#include -#endif +#if defined(__HIP) && !defined(__HIP_PLATFORM_NVCC__) +# include #endif #define MAX(x, y) (((x) > (y)) ? (x) : (y)) @@ -23,7 +21,7 @@ * There is no native support for atomicAdd on doubles in Cuda 5.0. However * * the following implementation is provided in the CUDA C Programing guide. * ******************************************************************************/ -#ifdef __CUDA +#if defined(__CUDA) #if (__CUDACC_VER_MAJOR__<8) || ( defined(__CUDA_ARCH__) && (__CUDA_ARCH__<600) ) static __device__ double atomicAdd(double *address, double val) { unsigned long long int *address_as_ull = @@ -43,7 +41,7 @@ static __device__ double atomicAdd(double *address, double val) { /****************************************************************************** * A simple __ldg replacement for older cuda devices. 
* ******************************************************************************/ -#ifdef __CUDA +#if defined(__CUDA) #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 350) #define __ldg(x) (*(x)) #endif diff --git a/src/acc/libsmm_acc/kernels/smm_acc_predict.py b/src/acc/libsmm_acc/kernels/smm_acc_predict.py index ddda9f1b858..235438a9238 100644 --- a/src/acc/libsmm_acc/kernels/smm_acc_predict.py +++ b/src/acc/libsmm_acc/kernels/smm_acc_predict.py @@ -435,38 +435,45 @@ def __init__( # found over all algorithms for this given (m, n, k) if not partial_initialization: - assert "threads" in params_df.columns.values, ( - "Missing column: threads. Available columns:\n" - + str(params_df.columns.values) + assert ( + "threads" in params_df.columns.values + ), "Missing column: threads. Available columns:\n" + str( + params_df.columns.values ) - assert "grouping" in params_df.columns.values, ( - "Missing column: grouping. Available columns:\n" - + str(params_df.columns.values) + assert ( + "grouping" in params_df.columns.values + ), "Missing column: grouping. Available columns:\n" + str( + params_df.columns.values ) - assert "minblocks" in params_df.columns.values, ( - "Missing column: minblocks. Available columns:\n" - + str(params_df.columns.values) + assert ( + "minblocks" in params_df.columns.values + ), "Missing column: minblocks. Available columns:\n" + str( + params_df.columns.values ) algos = np.unique(params_df["algorithm"].values) assert len(algos) == 1 algo = algos[0] if algo in ["small", "medium", "largeDB1", "largeDB2"]: - assert "tile_m" in params_df.columns.values, ( - "Missing column: tile_m. Available columns:\n" - + str(params_df.columns.values) + assert ( + "tile_m" in params_df.columns.values + ), "Missing column: tile_m. Available columns:\n" + str( + params_df.columns.values ) - assert "tile_n" in params_df.columns.values, ( - "Missing column: tile_n. Available columns:\n" - + str(params_df.columns.values) + assert ( + "tile_n" in params_df.columns.values + ), "Missing column: tile_n. Available columns:\n" + str( + params_df.columns.values ) if algo in ["largeDB1", "largeDB2"]: - assert "w" in params_df.columns.values, ( - "Missing column: w. Available columns:\n" - + str(params_df.columns.values) + assert ( + "w" in params_df.columns.values + ), "Missing column: w. Available columns:\n" + str( + params_df.columns.values ) - assert "v" in params_df.columns.values, ( - "Missing column: v. Available columns:\n" - + str(params_df.columns.values) + assert ( + "v" in params_df.columns.values + ), "Missing column: v. 
Available columns:\n" + str( params_df.columns.values ) self.params = params_df diff --git a/src/acc/libsmm_acc/kernels/smm_acc_transpose.h b/src/acc/libsmm_acc/kernels/smm_acc_transpose.h index c0eddafd231..7d083d288ac 100644 --- a/src/acc/libsmm_acc/kernels/smm_acc_transpose.h +++ b/src/acc/libsmm_acc/kernels/smm_acc_transpose.h @@ -14,19 +14,52 @@ #include "smm_acc_common.h" -template < int m, int n> +/* + * Execution configuration: + * gridDim.x = number of matrix blocks in this batched matrix transpose + * = length of the batched transpose stack + * blockIdx.x = {0, ..., gridDim.x-1} + * blockDim.x = 128 or the smallest multiple of warp_size larger than or equal to m*n + * threadIdx.x = {0, ..., blockDim.x-1} + + * Execute batched matrix transpose in place + + * Template parameters + * --- m, n: pair of integers characterising the dimensions of the matrix to transpose + + * Function arguments + * --- trs_stack: transpose stack (pointer to global memory): + * array of stack entries (indices), indicating where each matrix to transpose starts in the "mat" array + * --- mat (pointer to global memory): + * array containing the values of the matrices to be transposed + * mat is column major: m = number of rows, n = number of columns + + * Algorithm specificities: + * - the temporary buffer (of size m * n * 8 bytes) in which matrix elements are stored has to fit entirely into shared memory. Therefore, this kernel cannot be run for matrix sizes such that m * n * 8 bytes > available shared memory per block. + */ + +template <int m, int n> __global__ void transpose_d(int *trs_stack, double* mat){ __shared__ double buf[m*n]; + + /* Get the offset in the transpose-stack that this block ID should handle */ int offset = trs_stack[blockIdx.x]; + + /* Loop over m*n matrix elements */ for(int i=threadIdx.x; i < m*n; i+=blockDim.x){ + /* Load matrix elements into a temporary buffer */ buf[i] = mat[offset + i]; } syncthreads(); + /* Loop over elements of the matrix to be overwritten */ for(int i=threadIdx.x; i < m*n; i+=blockDim.x){ + /* Compute the row and column index of this element in the transposed matrix */ int r_out = i % n; int c_out = i / n; + /* Compute the corresponding old 1D index of matrix element */ int idx = r_out * m + c_out; + /* Overwrite the matrix element */ mat[offset + i] = buf[idx]; } diff --git a/src/acc/libsmm_acc/libsmm_acc.cpp b/src/acc/libsmm_acc/libsmm_acc.cpp index 4b068c1d40c..f213fba981f 100644 --- a/src/acc/libsmm_acc/libsmm_acc.cpp +++ b/src/acc/libsmm_acc/libsmm_acc.cpp @@ -21,7 +21,7 @@ #include #include -#if defined _OPENMP +#if defined(_OPENMP) #include #endif @@ -97,12 +97,12 @@ inline void validate_kernel(ACC_DRV(function)& kern_func, ACC_DRV(stream) stream ACC_API_CALL(Memcpy, (h->mat_c, h->d_mat_c, h->n_c * m * n * sizeof(double), ACC(MemcpyDeviceToHost))); // Validate the kernel based on results - double sumGPU = checkSum(h->mat_c, h->n_c, m, n); + double sumGPU = checkSum(h->mat_c, h->n_c, m, n); + libsmm_acc_benchmark_finalize(h); if(sumGPU != sumCPU){ - printf("Kernel validation failed for kernel %ix%ix%i\nchecksum_diff: %g\nthreads: %i, grouping: %i\n", m, n, k, sumGPU-sumCPU, threads, grouping); + printf("Kernel validation failed for multiplication kernel %ix%ix%i\nchecksum CPU: %g, checksum GPU: %g\nchecksum_diff: %g\n", m, n, k, sumCPU, sumGPU, sumGPU-sumCPU); exit(1); } - libsmm_acc_benchmark_finalize(h); } @@ -291,6 +291,39 @@ extern "C" int libsmm_acc_process (const libsmm_acc_stack_descriptor_type *param }; +//===========================================================================
+inline void validate_transpose_kernel(ACC_DRV(function)& kern_func, int threads, ACC_DRV(stream) stream, int m, int n){ + + libsmm_acc_benchmark_t* h; + libsmm_acc_benchmark_init(&h, test, m, 0, n); + + // Initialize arrays + matInit(h->mat_a, h->n_a, m, n, 42); + memset(h->mat_trs_a, 0, h->n_a * m * n * sizeof(double)); + stackInitTransp(h->stack_trs_a, h->n_stack_trs_a, m, n); + + // Run the batched transpose on the CPU + stackTransp(h->stack_trs_a, h->n_stack_trs_a, h->mat_a, h->mat_trs_a, m, n); + double sumCPU = checkSumTransp(h->mat_trs_a, h->n_stack_trs_a, m, n); + + // Run the batched transpose kernel on the GPU + ACC_API_CALL(Memcpy, (h->d_mat_a, h->mat_a, h->n_a * m * n * sizeof(double), ACC(MemcpyHostToDevice))); + ACC_API_CALL(Memcpy, (h->d_stack_trs_a, h->stack_trs_a, h->n_stack_trs_a * sizeof(int), ACC(MemcpyHostToDevice))); + + void *args[] = {&h->d_stack_trs_a, &h->d_mat_a}; + launch_kernel_from_handle(kern_func, h->n_stack_trs_a, threads, stream, args); + ACC_API_CALL(Memcpy, (h->mat_trs_a, h->d_mat_a, h->n_a * m * n * sizeof(double), ACC(MemcpyDeviceToHost))); + + // Validate the kernel based on results + double sumGPU = checkSumTransp(h->mat_trs_a, h->n_stack_trs_a, m, n); + libsmm_acc_benchmark_finalize(h); + if(sumGPU != sumCPU){ + printf("Kernel validation failed for transpose kernel %ix%i\nchecksum CPU: %g, checksum GPU: %g\nchecksum_diff: %g\n", m, n, sumCPU, sumGPU, sumGPU-sumCPU); + exit(1); + } +} + + //=========================================================================== void jit_transpose_handle(ACC_DRV(function)& kern_func, int m, int n){ @@ -346,10 +379,14 @@ void jit_transpose_handle(ACC_DRV(function)& kern_func, int m, int n){ //=========================================================================== -int libsmm_acc_transpose_d(const int *trs_stack, int offset, int nblks, +int libsmm_acc_transpose_d(const int *trs_stack, int offset, int stack_size, double *buffer, int m, int n, ACC_DRV(stream) stream) { ACC_DRV(function) kern_func; + int threads = 128; + if (m * n + warp_size <= 128){ + threads = m*n - (m*n % warp_size) + warp_size; + } // Look up the kernel in the table of already JITed kernels Triplet h_mnk = { m, n, 0 }; @@ -363,6 +400,7 @@ int libsmm_acc_transpose_d(const int *trs_stack, int offset, int nblks, // JIT and store a kernel for this transposition jit_transpose_handle(kern_func, m, n); + validate_transpose_kernel(kern_func, threads, stream, m, n); transpose_handles.emplace(h_mnk, kern_func); kernel_it = transpose_handles.find(h_mnk); @@ -373,18 +411,18 @@ int libsmm_acc_transpose_d(const int *trs_stack, int offset, int nblks, // Construct argument pointer list and launch function kern_func = kernel_it->second; // retrieve handle const int* trs_stack_ = trs_stack + offset; - void *args[] = { &trs_stack_, &buffer}; + void *args[] = {&trs_stack_, &buffer}; - return launch_kernel_from_handle(kern_func, nblks, 128, stream, args); + return launch_kernel_from_handle(kern_func, stack_size, threads, stream, args); } //=========================================================================== -extern "C" int libsmm_acc_transpose (const int *trs_stack, int offset, int nblks, void *buffer, acc_data_t datatype, int m, int n, acc_stream_t* stream) { +extern "C" int libsmm_acc_transpose (const int *trs_stack, int offset, int stack_size, void *buffer, acc_data_t datatype, int m, int n, acc_stream_t* stream) { if(datatype != dbcsr_type_real_8) return 0; // transpose not needed if(m>MAX_BLOCK_DIM || n>MAX_BLOCK_DIM) return 0;
// maximum size over any dimension - return libsmm_acc_transpose_d(trs_stack, offset, nblks, (double *) buffer, m, n, *((ACC_DRV(stream) *) stream)); + return libsmm_acc_transpose_d(trs_stack, offset, stack_size, (double *) buffer, m, n, *((ACC_DRV(stream) *) stream)); } diff --git a/src/acc/libsmm_acc/libsmm_acc.h b/src/acc/libsmm_acc/libsmm_acc.h index 1eaac455d07..e0e7ba00a34 100644 --- a/src/acc/libsmm_acc/libsmm_acc.h +++ b/src/acc/libsmm_acc/libsmm_acc.h @@ -10,13 +10,13 @@ #ifndef LIBSMM_ACC_H #define LIBSMM_ACC_H -#ifdef __CUDA +#if defined(__CUDA) # include "../cuda/acc_cuda.h" -#else +#elif defined(__HIP) # include "../hip/acc_hip.h" #endif -#include "../include/acc_libsmm.h" +#include "../acc_libsmm.h" #include "parameters_utils.h" #include diff --git a/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp b/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp index f62eae448b2..9f8dade4cd9 100644 --- a/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp +++ b/src/acc/libsmm_acc/libsmm_acc_benchmark.cpp @@ -214,14 +214,19 @@ double checkSum(double* mat_c, int n_c, int mat_m, int mat_n){ //=========================================================================== -double checkSumTransp(double* mat, int n, int n_stack, int mat_m, int mat_n){ +double checkSumTransp(double* mat, int n_stack, int mat_m, int mat_n){ // for transposition, a regular checkSum does not inform about the // transpose's correctness. Instead, we perform a checkSum on a // sample of elements. double res = 0; int size = mat_m * mat_n; int n_samples = size / 3; - int step = size / n_samples; + int step = size; + + if(n_samples > 0){ + step = size / n_samples; + } + for(int s=0; s < n_stack; s++){ int offset = s * size; for(int idx=s%step; idx < size; idx+=step) @@ -394,7 +399,7 @@ int libsmm_acc_benchmark_transpose_(int n_stack, int* stack, int* d_stack, // Reference result on CPU stackTransp(stack, n_stack, mat, mat_trs, mat_m, mat_n); - sumCPU = checkSumTransp(mat_trs, n, n_stack, mat_m, mat_n); + sumCPU = checkSumTransp(mat_trs, n_stack, mat_m, mat_n); // Compute on GPU ACC_API_CALL(Memcpy, (d_mat, mat, n * mat_m * mat_n * sizeof(double), ACC(MemcpyHostToDevice))); @@ -418,7 +423,7 @@ int libsmm_acc_benchmark_transpose_(int n_stack, int* stack, int* d_stack, ACC_API_CALL(Memcpy, (mat_trs, d_mat, n * mat_m * mat_n * sizeof(double), ACC(MemcpyDeviceToHost))); clean_string(kernel_descr[0], descr); - sumGPU = checkSumTransp(mat_trs, n, n_stack, mat_m, mat_n); + sumGPU = checkSumTransp(mat_trs, n_stack, mat_m, mat_n); if(sumGPU != sumCPU){ printf("%sERROR %s checksum_diff: %g\n", msg_prefix, descr, sumGPU-sumCPU); error_counter++; diff --git a/src/acc/libsmm_acc/libsmm_acc_benchmark.h b/src/acc/libsmm_acc/libsmm_acc_benchmark.h index 4a738f2c8b5..b37516f2178 100644 --- a/src/acc/libsmm_acc/libsmm_acc_benchmark.h +++ b/src/acc/libsmm_acc/libsmm_acc_benchmark.h @@ -10,10 +10,10 @@ #ifndef LIBSMM_ACC_BENCHMARK_H #define LIBSMM_ACC_BENCHMARK_H -#ifdef __CUDA -#include "../cuda/acc_cuda.h" -#else -#include "../hip/acc_hip.h" +#if defined(__CUDA) +# include "../cuda/acc_cuda.h" +#elif defined(__HIP) +# include "../hip/acc_hip.h" #endif #define MAX_BLOCK_DIM 80 @@ -59,7 +59,7 @@ void stackTransp(int* stack, int n_stack, double *mat_a, double* mat_atrs, int mat_m, int mat_n); double checkSum(double* mat_c, int n_c, int mat_m, int mat_n); -double checkSumTransp(double* mat, int n, int n_stack, int mat_m, int mat_n); +double checkSumTransp(double* mat, int n_stack, int mat_m, int mat_n); void libsmm_acc_benchmark_init(libsmm_acc_benchmark_t** 
handle, benchmark_mode mode, int max_m, int max_n, int max_k); diff --git a/src/acc/libsmm_acc/libsmm_acc_init.cpp b/src/acc/libsmm_acc/libsmm_acc_init.cpp index 3fb1ed3f294..75a9d21fa48 100644 --- a/src/acc/libsmm_acc/libsmm_acc_init.cpp +++ b/src/acc/libsmm_acc/libsmm_acc_init.cpp @@ -11,7 +11,7 @@ #include "libsmm_acc_init.h" #include "parameters.h" -#if defined _OPENMP +#if defined(_OPENMP) #include #endif @@ -41,7 +41,7 @@ int acc_get_gpu_warp_size() { //=========================================================================== extern "C" int libsmm_acc_is_thread_safe() { -#if defined _OPENMP +#if defined(_OPENMP) return 1; // i.e. true, libsmm_acc is threaded #else return 0; // i.e. false, libsmm_acc is not threaded diff --git a/src/acc/libsmm_acc/predict/prepare_training_data.py b/src/acc/libsmm_acc/predict/prepare_training_data.py index ad1435ebce2..37a795437b8 100755 --- a/src/acc/libsmm_acc/predict/prepare_training_data.py +++ b/src/acc/libsmm_acc/predict/prepare_training_data.py @@ -41,9 +41,6 @@ def update_maximums(dictionnary_to_update, dictionnary_partial): # =============================================================================== def get_idx_baseline(data_mnk, algorithm, baseline_pars): - """ - - """ if algorithm in ["tiny"]: idx_baseline = data_mnk[ (data_mnk.m == baseline_pars["m"]) diff --git a/src/block/dbcsr_block_access.F b/src/block/dbcsr_block_access.F index d8efe8e46cc..251f09b1fdf 100644 --- a/src/block/dbcsr_block_access.F +++ b/src/block/dbcsr_block_access.F @@ -885,9 +885,14 @@ SUBROUTINE dbcsr_reserve_block2d_${nametype1}$ (matrix, row, col, block, & data_block, found, data_block2) IF (.NOT. found) THEN +#if defined(_OPENMP) && (200711 <= _OPENMP) +!$OMP ATOMIC WRITE + matrix%valid = .FALSE. +#else !$OMP CRITICAL (critical_reserve_block2d) matrix%valid = .FALSE. !$OMP END CRITICAL (critical_reserve_block2d) +#endif matrix%wms(iw)%lastblk = matrix%wms(iw)%lastblk + 1 matrix%wms(iw)%datasize = matrix%wms(iw)%datasize + row_size*col_size ELSE @@ -1093,9 +1098,8 @@ SUBROUTINE dbcsr_put_block_${nametype1}$ (matrix, row, col, block, lb_row_col, t IF (.NOT. found) THEN matrix%wms(iw)%datasize = matrix%wms(iw)%datasize + nze ENDIF -!$OMP CRITICAL (dbcsr_put_block_critical) +!$OMP ATOMIC WRITE matrix%valid = .FALSE. -!$OMP END CRITICAL (dbcsr_put_block_critical) ENDIF IF (PRESENT(flop)) flop = flop + my_flop END SUBROUTINE dbcsr_put_block_${nametype1}$ diff --git a/src/block/dbcsr_iterator_operations.F b/src/block/dbcsr_iterator_operations.F index 8c3d1d8d4f5..45a366006e7 100644 --- a/src/block/dbcsr_iterator_operations.F +++ b/src/block/dbcsr_iterator_operations.F @@ -443,7 +443,7 @@ SUBROUTINE iterator_advance(iterator) TYPE(dbcsr_iterator), INTENT(INOUT) :: iterator !! the iterator - INTEGER :: ithread, my_old_row, next_row, p + INTEGER :: ithread, my_old_row, p LOGICAL :: advance, jumped_row ! --------------------------------------------------------------------------- @@ -466,9 +466,7 @@ SUBROUTINE iterator_advance(iterator) IF (jumped_row) THEN !$OMP CRITICAL (crit_common_pos) ! Set the common_pos to the next available row. - next_row = MAX(iterator%common_pos + 1, iterator%row) - next_row = next_row - iterator%common_pos - iterator%common_pos = iterator%common_pos + next_row + iterator%common_pos = MAX(iterator%row, iterator%common_pos + 1) iterator%row = iterator%common_pos !$OMP END CRITICAL (crit_common_pos) IF (iterator%row .GT. 
iterator%nblkrows_total) THEN diff --git a/src/core/dbcsr_config.F b/src/core/dbcsr_config.F index 6bf64129fb5..1b06ec2b6ab 100644 --- a/src/core/dbcsr_config.F +++ b/src/core/dbcsr_config.F @@ -16,6 +16,7 @@ MODULE dbcsr_config dp USE dbcsr_ptr_util, ONLY: dbcsr_data_allocation, & dbcsr_data_allocation_type + USE dbcsr_kinds, ONLY: real_8 #include "base/dbcsr_base_uses.f90" !$ USE OMP_LIB, ONLY: omp_get_max_threads @@ -115,6 +116,7 @@ MODULE dbcsr_config INTEGER :: accdrv_binning_nbins = 4096 INTEGER :: accdrv_binning_binsize = 16 LOGICAL :: use_mempools_cpu = .FALSE. + REAL(KIND=real_8) :: tas_split_factor = 3.0_real_8 END TYPE dbcsr_config_type TYPE(dbcsr_config_type), PROTECTED, SAVE :: dbcsr_cfg = dbcsr_config_type() ! defaults @@ -164,7 +166,8 @@ SUBROUTINE dbcsr_set_config( & accdrv_do_inhomogenous, & accdrv_binning_nbins, & accdrv_binning_binsize, & - use_mempools_cpu) + use_mempools_cpu, & + tas_split_factor) CHARACTER(len=*), INTENT(IN), OPTIONAL :: mm_driver LOGICAL, INTENT(IN), OPTIONAL :: use_mpi_allocator @@ -192,6 +195,7 @@ SUBROUTINE dbcsr_set_config( & INTEGER, INTENT(IN), OPTIONAL :: accdrv_binning_nbins, & accdrv_binning_binsize LOGICAL, INTENT(IN), OPTIONAL :: use_mempools_cpu + REAL(KIND=real_8), INTENT(IN), OPTIONAL :: tas_split_factor INTEGER :: nthreads @@ -215,6 +219,7 @@ SUBROUTINE dbcsr_set_config( & IF (PRESENT(accdrv_binning_nbins)) dbcsr_cfg%accdrv_binning_nbins = accdrv_binning_nbins IF (PRESENT(accdrv_binning_binsize)) dbcsr_cfg%accdrv_binning_binsize = accdrv_binning_binsize IF (PRESENT(use_mempools_cpu)) dbcsr_cfg%use_mempools_cpu = use_mempools_cpu + IF (PRESENT(tas_split_factor)) dbcsr_cfg%tas_split_factor = tas_split_factor IF (PRESENT(comm_thread_load)) THEN dbcsr_cfg%comm_thread_load = comm_thread_load @@ -255,7 +260,7 @@ SUBROUTINE dbcsr_set_config( & IF (.NOT. has_xsmm) DBCSR_ABORT("Support for libxsmm not compiled in.") dbcsr_cfg%mm_driver = mm_driver_xsmm ELSE - DBCSR_ABORT("Unkown MM driver: "//TRIM(mm_driver)) + DBCSR_ABORT("Unknown MM driver: "//TRIM(mm_driver)) ENDIF ENDIF @@ -284,7 +289,8 @@ SUBROUTINE dbcsr_get_default_config( & accdrv_do_inhomogenous, & accdrv_binning_nbins, & accdrv_binning_binsize, & - use_mempools_cpu) + use_mempools_cpu, & + tas_split_factor) ! 
LOGICAL, INTENT(OUT), OPTIONAL :: use_mpi_allocator INTEGER, INTENT(OUT), OPTIONAL :: mm_stack_size, avg_elements_images, & @@ -304,6 +310,7 @@ SUBROUTINE dbcsr_get_default_config( & INTEGER, INTENT(OUT), OPTIONAL :: accdrv_binning_nbins, & accdrv_binning_binsize LOGICAL, INTENT(OUT), OPTIONAL :: use_mempools_cpu + REAL(KIND=real_8), INTENT(OUT), OPTIONAL :: tas_split_factor TYPE(dbcsr_config_type) :: default_cfg TYPE(dbcsr_data_allocation_type) :: default_data_allocation @@ -330,6 +337,7 @@ SUBROUTINE dbcsr_get_default_config( & IF (PRESENT(accdrv_binning_nbins)) accdrv_binning_nbins = default_cfg%accdrv_binning_nbins IF (PRESENT(accdrv_binning_binsize)) accdrv_binning_binsize = default_cfg%accdrv_binning_binsize IF (PRESENT(use_mempools_cpu)) use_mempools_cpu = default_cfg%use_mempools_cpu + IF (PRESENT(tas_split_factor)) tas_split_factor = default_cfg%tas_split_factor DBCSR_ASSERT(default_cfg%nm_stacks == default_cfg%nn_stacks) DBCSR_ASSERT(default_cfg%nm_stacks == default_cfg%nk_stacks) @@ -350,7 +358,7 @@ SUBROUTINE dbcsr_print_config(unit_nr) CASE (mm_driver_smm); mm_name = mm_name_smm CASE (mm_driver_xsmm); mm_name = mm_name_xsmm CASE DEFAULT - DBCSR_ABORT("Unkown MM driver") + DBCSR_ABORT("Unknown MM driver") END SELECT WRITE (UNIT=unit_nr, FMT='(1X,A,T41,A40)') & "DBCSR| CPU Multiplication driver", ADJUSTR(mm_name(1:40)) @@ -434,6 +442,9 @@ SUBROUTINE dbcsr_print_config(unit_nr) END IF ENDIF + WRITE (UNIT=unit_nr, FMT='(1X,A,T74,ES7.1)') & + "DBCSR| Split modifier for TAS multiplication algorithm", dbcsr_cfg%tas_split_factor + END SUBROUTINE dbcsr_print_config FUNCTION get_accdrv_active_device_id() diff --git a/src/data/dbcsr_data_methods.F b/src/data/dbcsr_data_methods.F index 04c6f28da83..e5341164394 100644 --- a/src/data/dbcsr_data_methods.F +++ b/src/data/dbcsr_data_methods.F @@ -26,7 +26,7 @@ MODULE dbcsr_data_methods dbcsr_get_data_p_z, dbcsr_scalar, dbcsr_scalar_are_equal, dbcsr_scalar_fill_all, & dbcsr_scalar_get_type, dbcsr_scalar_get_value, dbcsr_scalar_negative, dbcsr_scalar_one, & dbcsr_scalar_set_type, dbcsr_scalar_zero, dbcsr_type_1d_to_2d, dbcsr_type_2d_to_1d, & - dbcsr_type_is_2d, internal_data_allocate, internal_data_deallocate + dbcsr_type_is_2d, internal_data_allocate, internal_data_deallocate, dbcsr_scalar_multiply USE dbcsr_data_types, ONLY: & dbcsr_data_obj, dbcsr_datatype_sizeof, dbcsr_memtype_default, dbcsr_memtype_type, & dbcsr_type_complex_4, dbcsr_type_complex_8, dbcsr_type_int_4, dbcsr_type_int_8, & @@ -55,7 +55,7 @@ MODULE dbcsr_data_methods dbcsr_scalar_are_equal, dbcsr_scalar_negative, & dbcsr_scalar_get_type, dbcsr_scalar_set_type, & dbcsr_scalar_fill_all, dbcsr_scalar_get_value, & - dbcsr_data_valid, dbcsr_data_exists + dbcsr_data_valid, dbcsr_data_exists, dbcsr_scalar_multiply PUBLIC :: dbcsr_data_init, dbcsr_data_new, dbcsr_data_hold, & dbcsr_data_release, dbcsr_data_get_size, dbcsr_data_get_type PUBLIC :: dbcsr_get_data, & diff --git a/src/data/dbcsr_data_methods_low.F b/src/data/dbcsr_data_methods_low.F index 559be8ddb5a..70bebfc6dc5 100644 --- a/src/data/dbcsr_data_methods_low.F +++ b/src/data/dbcsr_data_methods_low.F @@ -43,7 +43,7 @@ MODULE dbcsr_data_methods_low dbcsr_scalar_are_equal, dbcsr_scalar_negative, & dbcsr_scalar_get_type, dbcsr_scalar_set_type, & dbcsr_scalar_fill_all, dbcsr_scalar_get_value, & - dbcsr_data_valid + dbcsr_data_valid, dbcsr_scalar_multiply PUBLIC :: dbcsr_data_init, dbcsr_data_hold, & dbcsr_data_get_size, dbcsr_data_get_type PUBLIC :: dbcsr_get_data, & diff --git a/src/dbcsr.h b/src/dbcsr.h index 
087d1f2d0eb..b0a769d9349 100644 --- a/src/dbcsr.h +++ b/src/dbcsr.h @@ -13,7 +13,7 @@ #include #include /* we need bool from C99 */ -#ifdef __cplusplus +#if defined(__cplusplus) extern "C" { #endif void c_dbcsr_init_lib_internal(MPI_Fint* fcomm, int* io_unit); @@ -56,7 +56,7 @@ extern "C" { void c_free_string(char** c_string); -#ifdef __cplusplus +#if defined(__cplusplus) } #endif diff --git a/src/mpi/dbcsr_mp_methods.F b/src/mpi/dbcsr_mp_methods.F index 2328cf95cca..a47675683e8 100644 --- a/src/mpi/dbcsr_mp_methods.F +++ b/src/mpi/dbcsr_mp_methods.F @@ -267,8 +267,6 @@ PURE SUBROUTINE dbcsr_mp_hold(mp_env) TYPE(dbcsr_mp_obj), INTENT(INOUT) :: mp_env !! multiprocessor environment -! --------------------------------------------------------------------------- - mp_env%mp%refcount = mp_env%mp%refcount + 1 END SUBROUTINE dbcsr_mp_hold @@ -286,60 +284,70 @@ FUNCTION dbcsr_mp_pgrid(mp_env) RESULT(pgrid) pgrid => mp_env%mp%pgrid END FUNCTION dbcsr_mp_pgrid + PURE FUNCTION dbcsr_mp_numnodes(mp_env) RESULT(numnodes) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp_env INTEGER :: numnodes numnodes = mp_env%mp%numnodes END FUNCTION dbcsr_mp_numnodes + PURE FUNCTION dbcsr_mp_mynode(mp_env) RESULT(mynode) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp_env INTEGER :: mynode mynode = mp_env%mp%mynode END FUNCTION dbcsr_mp_mynode + PURE FUNCTION dbcsr_mp_group(mp_env) RESULT(mp_group) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp_env INTEGER :: mp_group mp_group = mp_env%mp%mp_group END FUNCTION dbcsr_mp_group + PURE FUNCTION dbcsr_mp_nprows(mp_env) RESULT(nprows) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp_env INTEGER :: nprows nprows = SIZE(mp_env%mp%pgrid, 1) END FUNCTION dbcsr_mp_nprows + PURE FUNCTION dbcsr_mp_npcols(mp_env) RESULT(npcols) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp_env INTEGER :: npcols npcols = SIZE(mp_env%mp%pgrid, 2) END FUNCTION dbcsr_mp_npcols + PURE FUNCTION dbcsr_mp_myprow(mp_env) RESULT(myprow) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp_env INTEGER :: myprow myprow = mp_env%mp%myprow END FUNCTION dbcsr_mp_myprow + PURE FUNCTION dbcsr_mp_mypcol(mp_env) RESULT(mypcol) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp_env INTEGER :: mypcol mypcol = mp_env%mp%mypcol END FUNCTION dbcsr_mp_mypcol + PURE FUNCTION dbcsr_mp_has_subgroups(mp_env) RESULT(has_subgroups) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp_env LOGICAL :: has_subgroups has_subgroups = mp_env%mp%subgroups_defined END FUNCTION dbcsr_mp_has_subgroups + PURE FUNCTION dbcsr_mp_my_row_group(mp_env) RESULT(row_group) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp_env INTEGER :: row_group row_group = mp_env%mp%prow_group END FUNCTION dbcsr_mp_my_row_group + PURE FUNCTION dbcsr_mp_my_col_group(mp_env) RESULT(col_group) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp_env INTEGER :: col_group @@ -355,8 +363,6 @@ SUBROUTINE dbcsr_mp_new_transposed(mp_t, mp) TYPE(dbcsr_mp_obj), INTENT(IN) :: mp !! original multiprocessor environment -! --------------------------------------------------------------------------- - CALL dbcsr_mp_new(mp_t, dbcsr_mp_group(mp), & TRANSPOSE(dbcsr_mp_pgrid(mp)), & dbcsr_mp_mynode(mp), dbcsr_mp_numnodes(mp), & diff --git a/src/ops/dbcsr_operations.F b/src/ops/dbcsr_operations.F index dfe20922ff7..44b36b72069 100644 --- a/src/ops/dbcsr_operations.F +++ b/src/ops/dbcsr_operations.F @@ -121,6 +121,7 @@ MODULE dbcsr_operations PUBLIC :: dbcsr_crop_matrix PUBLIC :: dbcsr_get_info, dbcsr_may_be_dense, dbcsr_get_occupation PUBLIC :: dbcsr_clear, dbcsr_add_block_node, dbcsr_conform_scalar + PUBLIC :: dbcsr_zero ! The interfaces for the generic routines found in the generated ! generic files. 
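Usage note (illustrative sketch, not part of the patch): the dbcsr_config.F hunks above add an optional tas_split_factor argument (REAL(KIND=real_8), default 3.0_real_8) to dbcsr_set_config and dbcsr_get_default_config, which feeds split_factor_estimate in dbcsr_tas_mm.F further below. A minimal Fortran sketch of how the option could be driven, assuming both routines are public in dbcsr_config as used elsewhere in DBCSR; the subroutine name and the value 4.0_real_8 are hypothetical:

   SUBROUTINE tune_tas_split_example()
      USE dbcsr_config, ONLY: dbcsr_set_config, dbcsr_get_default_config
      USE dbcsr_kinds, ONLY: real_8
      REAL(KIND=real_8) :: default_split_factor
      ! Raise the split modifier used when estimating the TAS split factor
      CALL dbcsr_set_config(tas_split_factor=4.0_real_8)
      ! Query the compiled-in default (3.0_real_8 per dbcsr_config_type above)
      CALL dbcsr_get_default_config(tas_split_factor=default_split_factor)
   END SUBROUTINE tune_tas_split_example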
diff --git a/src/tas/dbcsr_tas_base.F b/src/tas/dbcsr_tas_base.F index d5027f86aef..90c1c9ee51b 100644 --- a/src/tas/dbcsr_tas_base.F +++ b/src/tas/dbcsr_tas_base.F @@ -85,6 +85,8 @@ MODULE dbcsr_tas_base dbcsr_tas_nblkcols_total, & dbcsr_tas_nblkrows_local, & dbcsr_tas_nblkrows_total, & + dbcsr_tas_nfullrows_total, & + dbcsr_tas_nfullcols_total, & dbcsr_tas_put_block, & dbcsr_tas_reserve_blocks, & dbcsr_tas_set, & diff --git a/src/tas/dbcsr_tas_mm.F b/src/tas/dbcsr_tas_mm.F index 4e298f0f441..2f4d4c55405 100644 --- a/src/tas/dbcsr_tas_mm.F +++ b/src/tas/dbcsr_tas_mm.F @@ -19,7 +19,7 @@ MODULE dbcsr_tas_mm #:include "dbcsr_tas.fypp" USE dbcsr_data_methods, ONLY: & - dbcsr_scalar_zero, dbcsr_scalar + dbcsr_scalar_zero, dbcsr_scalar, dbcsr_scalar_multiply USE dbcsr_data_types, ONLY: & dbcsr_scalar_type, dbcsr_type_real_8, dbcsr_type_real_4, dbcsr_type_complex_8, dbcsr_type_complex_4 USE dbcsr_multiply_api, ONLY: dbcsr_multiply @@ -28,7 +28,8 @@ MODULE dbcsr_tas_mm dbcsr_tas_get_data_type, dbcsr_tas_info, dbcsr_tas_nblkcols_total, & dbcsr_tas_nblkrows_total, dbcsr_tas_filter, dbcsr_tas_get_info, dbcsr_tas_iterator_blocks_left, & dbcsr_tas_get_nze_total, dbcsr_tas_reserve_blocks, dbcsr_tas_iterator_start, dbcsr_tas_iterator_next_block, & - dbcsr_tas_iterator_stop, dbcsr_tas_copy, dbcsr_tas_get_block_p, dbcsr_tas_clear, dbcsr_tas_get_num_blocks + dbcsr_tas_iterator_stop, dbcsr_tas_copy, dbcsr_tas_get_block_p, dbcsr_tas_clear, dbcsr_tas_get_num_blocks, & + dbcsr_tas_nfullrows_total, dbcsr_tas_nfullcols_total USE dbcsr_tas_types, ONLY: & dbcsr_tas_distribution_type, dbcsr_tas_split_info, dbcsr_tas_type, dbcsr_tas_iterator USE dbcsr_tas_global, ONLY: & @@ -38,7 +39,7 @@ MODULE dbcsr_tas_mm dbcsr_tas_merge, dbcsr_tas_replicate, dbcsr_tas_reshape USE dbcsr_tas_split, ONLY: & rowsplit, colsplit, dbcsr_tas_get_split_info, dbcsr_tas_create_split, dbcsr_tas_mp_comm, & - dbcsr_tas_release_info, accept_pgrid_dims, dbcsr_tas_info_hold, default_nsplit_accept_ratio_1 + dbcsr_tas_release_info, accept_pgrid_dims, dbcsr_tas_info_hold, default_nsplit_accept_ratio USE dbcsr_tas_util, ONLY: & swap, invert_transpose_flag, array_eq, dbcsr_mp_environ USE dbcsr_types, ONLY: & @@ -47,16 +48,17 @@ MODULE dbcsr_tas_mm USE dbcsr_kinds, ONLY: & int_8, real_8, real_4, default_string_length USE dbcsr_mpiwrap, ONLY: & - mp_comm_compare, mp_environ, mp_sum, mp_comm_free, mp_cart_create + mp_comm_compare, mp_environ, mp_sum, mp_comm_free, mp_cart_create, mp_max, mp_sync USE dbcsr_operations, ONLY: & - dbcsr_scale, dbcsr_get_info, dbcsr_copy, dbcsr_clear, dbcsr_add + dbcsr_scale, dbcsr_get_info, dbcsr_copy, dbcsr_clear, dbcsr_add, dbcsr_zero USE dbcsr_tas_io, ONLY: & dbcsr_tas_write_dist, dbcsr_tas_write_matrix_info, dbcsr_tas_write_split_info, prep_output_unit USE dbcsr_work_operations, ONLY: dbcsr_create, dbcsr_finalize USE dbcsr_transformations, ONLY: dbcsr_redistribute USE dbcsr_dist_methods, ONLY: dbcsr_distribution_new USE dbcsr_methods, ONLY: & - dbcsr_mp_release, dbcsr_release, dbcsr_distribution_release, dbcsr_get_nze + dbcsr_mp_release, dbcsr_release, dbcsr_distribution_release, dbcsr_get_nze, dbcsr_nfullrows_total, dbcsr_nfullcols_total + USE dbcsr_config, ONLY: dbcsr_cfg #include "base/dbcsr_base_uses.f90" IMPLICIT NONE @@ -68,7 +70,9 @@ MODULE dbcsr_tas_mm dbcsr_tas_multiply, & dbcsr_tas_batched_mm_init, & dbcsr_tas_batched_mm_finalize, & - dbcsr_tas_result_index + dbcsr_tas_result_index, & + dbcsr_tas_set_batched_state, & + dbcsr_tas_batched_mm_complete CONTAINS @@ -86,10 +90,9 @@ RECURSIVE SUBROUTINE 
dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, !! Whether distribution should be optimized internally. In the current implementation this guarantees optimal parameters !! only for dense matrices. TYPE(dbcsr_tas_split_info), INTENT(OUT), & - POINTER, OPTIONAL :: split_opt + OPTIONAL :: split_opt !! optionally return split info containing optimal grid and split parameters. This can be used to choose optimal process - !! grids for subsequent matrix multiplications with matrices of similar shape and sparsity. under some conditions, - !! split_opt can not be returned, in this case the pointer is not associated + !! grids for subsequent matrix multiplications with matrices of similar shape and sparsity. REAL(KIND=real_8), INTENT(IN), OPTIONAL :: filter_eps INTEGER(KIND=int_8), INTENT(OUT), OPTIONAL :: flop LOGICAL, INTENT(IN), OPTIONAL :: move_data_a, move_data_b, simple_split, retain_sparsity @@ -110,23 +113,26 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, INTEGER, DIMENSION(2) :: pdims, pcoord, pcoord_sub, pdims_sub INTEGER(KIND=int_8), DIMENSION(3) :: dims INTEGER :: max_mm_dim, data_type, mp_comm, comm_tmp, & - handle, handle2, unit_nr_prv, nsplit, nsplit_opt, numproc, numproc_sub, iproc, & + handle, handle2, handle3, handle4, & + unit_nr_prv, nsplit, nsplit_opt, numproc, numproc_sub, iproc, & mp_comm_group, mp_comm_mm, split_rc, split_a, split_b, split_c, & mp_comm_opt, batched_repl, max_mm_dim_batched, nsplit_batched CHARACTER(LEN=1) :: tr_case, transa_prv, transb_prv, transc_prv TYPE(dbcsr_scalar_type) :: zero LOGICAL :: new_a, new_b, new_c, simple_split_prv, opt_pgrid, & - move_a, move_b, do_batched, simple_split_save, & + move_a, move_b, do_batched, & nodata_3 TYPE(dbcsr_tas_split_info) :: info, info_a, info_b, info_c CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_tas_multiply', & routineP = moduleN//':'//routineN INTEGER(KIND=int_8) :: nze_a, nze_b, nze_c - TYPE(dbcsr_type), POINTER :: matrix_a_mm, matrix_b_mm, matrix_c_mm + TYPE(dbcsr_type) :: matrix_a_mm, matrix_b_mm, matrix_c_mm CALL timeset(routineN, handle) + CALL mp_sync(matrix_a%dist%info%mp_comm) + CALL timeset("dbcsr_tas_total", handle2) - NULLIFY (matrix_b_rs, matrix_a_rs, matrix_c_rs, matrix_a_mm, matrix_b_mm, matrix_c_mm) + NULLIFY (matrix_b_rs, matrix_a_rs, matrix_c_rs) unit_nr_prv = prep_output_unit(unit_nr) @@ -136,7 +142,7 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, simple_split_prv = .FALSE. info_a = dbcsr_tas_info(matrix_a); info_b = dbcsr_tas_info(matrix_b); info_c = dbcsr_tas_info(matrix_c) - IF (info_a%strict_split .OR. info_b%strict_split .OR. info_c%strict_split) simple_split_prv = .TRUE. + IF (info_a%strict_split(1) .OR. info_b%strict_split(1) .OR. info_c%strict_split(1)) simple_split_prv = .TRUE. ENDIF nodata_3 = .TRUE. @@ -149,7 +155,7 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, do_batched = .FALSE. IF (matrix_a%do_batched > 0) THEN do_batched = .TRUE. - IF (matrix_a%do_batched == 2) THEN + IF (matrix_a%do_batched == 3) THEN DBCSR_ASSERT(batched_repl == 0) batched_repl = 1 CALL dbcsr_tas_get_split_info( & @@ -162,7 +168,7 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, IF (matrix_b%do_batched > 0) THEN do_batched = .TRUE. 
- IF (matrix_b%do_batched == 2) THEN + IF (matrix_b%do_batched == 3) THEN DBCSR_ASSERT(batched_repl == 0) batched_repl = 2 CALL dbcsr_tas_get_split_info( & @@ -175,7 +181,7 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, IF (matrix_c%do_batched > 0) THEN do_batched = .TRUE. - IF (matrix_c%do_batched == 2) THEN + IF (matrix_c%do_batched == 3) THEN DBCSR_ASSERT(batched_repl == 0) batched_repl = 3 CALL dbcsr_tas_get_split_info( & @@ -186,11 +192,6 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, ENDIF ENDIF - IF (batched_repl > 0) THEN - simple_split_save = simple_split_prv - simple_split_prv = .TRUE. - ENDIF - move_a = .FALSE. move_b = .FALSE. @@ -251,12 +252,16 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, CALL mp_environ(numproc, iproc, mp_comm) ! derive optimal matrix layout and split factor from occupancies + nze_a = dbcsr_tas_get_nze_total(matrix_a) + nze_b = dbcsr_tas_get_nze_total(matrix_b) + IF (.NOT. simple_split_prv) THEN - nze_a = dbcsr_tas_get_nze_total(matrix_a) - nze_b = dbcsr_tas_get_nze_total(matrix_b) CALL dbcsr_tas_result_index(transa, transb, transc, matrix_a, matrix_b, matrix_c, filter_eps, & blk_ind=result_index, nze=nze_c, retain_sparsity=retain_sparsity) + IF (PRESENT(result_index)) THEN + CALL mp_sync(matrix_a%dist%info%mp_comm) + CALL timestop(handle2) CALL timestop(handle) RETURN ENDIF @@ -278,7 +283,6 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, nsplit = nsplit_batched nsplit_opt = nsplit max_mm_dim = max_mm_dim_batched - simple_split_prv = simple_split_save IF (unit_nr_prv > 0) THEN WRITE (unit_nr_prv, "(T2,A)") & "MM PARAMETERS" @@ -308,7 +312,7 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, CALL dbcsr_tas_get_split_info(info, split_rowcol=split_rc, mp_comm=mp_comm) new_b = .FALSE. - IF (matrix_b%do_batched <= 1) THEN + IF (matrix_b%do_batched <= 2) THEN ALLOCATE (matrix_b_rs) CALL reshape_mm_small(mp_comm, matrix_b, matrix_b_rs, transb_prv == dbcsr_transpose, transb_prv, move_data=move_b) new_b = .TRUE. @@ -338,13 +342,22 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, info = dbcsr_tas_info(matrix_a_rs) CALL dbcsr_tas_get_split_info(info, split_rowcol=split_rc, mp_comm=mp_comm) - IF (matrix_c%do_batched <= 1) THEN + IF (matrix_c%do_batched == 1) THEN + matrix_c%mm_storage%batched_beta = beta + ELSEIF (matrix_c%do_batched > 1) THEN + matrix_c%mm_storage%batched_beta = & + dbcsr_scalar_multiply(matrix_c%mm_storage%batched_beta, beta) + ENDIF + + IF (matrix_c%do_batched <= 2) THEN ALLOCATE (matrix_c_rs) CALL reshape_mm_small(mp_comm, matrix_c, matrix_c_rs, transc_prv == dbcsr_transpose, transc_prv, nodata=nodata_3) - IF (matrix_c%do_batched == 1) THEN - matrix_c%mm_storage%store_batched => matrix_c_rs - ENDIF - ELSEIF (matrix_c%do_batched == 2) THEN + + ! just leave sparsity structure for retain sparsity but no values + IF (.NOT. nodata_3) CALL dbcsr_zero(matrix_c_rs%matrix) + + IF (matrix_c%do_batched >= 1) matrix_c%mm_storage%store_batched => matrix_c_rs + ELSEIF (matrix_c%do_batched == 3) THEN matrix_c_rs => matrix_c%mm_storage%store_batched ENDIF @@ -373,7 +386,7 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, CALL dbcsr_tas_get_split_info(info, split_rowcol=split_rc, mp_comm=mp_comm) new_a = .FALSE. 
- IF (matrix_a%do_batched <= 1) THEN + IF (matrix_a%do_batched <= 2) THEN ALLOCATE (matrix_a_rs) CALL reshape_mm_small(mp_comm, matrix_a, matrix_a_rs, transa_prv == dbcsr_transpose, transa_prv, move_data=move_a) new_a = .TRUE. @@ -396,11 +409,7 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, CALL mp_environ(numproc, pdims, pcoord, mp_comm) CALL mp_environ(numproc_sub, pdims_sub, pcoord_sub, mp_comm_group) - IF (.NOT. simple_split_prv) THEN - opt_pgrid = .NOT. accept_pgrid_dims(pdims_sub, relative=.TRUE.) - ELSE - opt_pgrid = .FALSE. - ENDIF + opt_pgrid = .NOT. accept_pgrid_dims(pdims_sub, relative=.TRUE.) IF (PRESENT(filter_eps)) THEN filter_eps_prv = filter_eps @@ -433,14 +442,14 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, ! Convert DBCSR submatrices to optimized process grids and multiply SELECT CASE (max_mm_dim) CASE (1) - IF (matrix_b%do_batched <= 1) THEN + IF (matrix_b%do_batched <= 2) THEN ALLOCATE (matrix_b_rep) CALL dbcsr_tas_replicate(matrix_b_rs%matrix, dbcsr_tas_info(matrix_a_rs), matrix_b_rep, move_data=.TRUE.) - IF (matrix_b%do_batched == 1) THEN + IF (matrix_b%do_batched == 1 .or. matrix_b%do_batched == 2) THEN matrix_b%mm_storage%store_batched_repl => matrix_b_rep - matrix_b%do_batched = 2 + CALL dbcsr_tas_set_batched_state(matrix_b, state=3) ENDIF - ELSEIF (matrix_b%do_batched == 2) THEN + ELSEIF (matrix_b%do_batched == 3) THEN matrix_b_rep => matrix_b%mm_storage%store_batched_repl ENDIF @@ -455,58 +464,64 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, CALL convert_to_new_pgrid(mp_comm_mm, matrix_a_rs%matrix, matrix_a_mm, optimize_pgrid=opt_pgrid, move_data=move_a) - IF (new_a .AND. opt_pgrid) THEN + ! keep communicators alive even after releasing TAS matrices (communicator management does not work between DBCSR and TAS) + info_a = dbcsr_tas_info(matrix_a_rs) + CALL dbcsr_tas_info_hold(info_a) + + IF (new_a) THEN CALL dbcsr_tas_destroy(matrix_a_rs) DEALLOCATE (matrix_a_rs) ENDIF CALL convert_to_new_pgrid(mp_comm_mm, matrix_b_rep%matrix, matrix_b_mm, optimize_pgrid=opt_pgrid, & move_data=matrix_b%do_batched == 0) - IF (opt_pgrid .AND. 
matrix_b%do_batched == 0) THEN + info_b = dbcsr_tas_info(matrix_b_rep) + CALL dbcsr_tas_info_hold(info_b) + + IF (matrix_b%do_batched == 0) THEN CALL dbcsr_tas_destroy(matrix_b_rep) DEALLOCATE (matrix_b_rep) ENDIF CALL convert_to_new_pgrid(mp_comm_mm, matrix_c_rs%matrix, matrix_c_mm, nodata=nodata_3, optimize_pgrid=opt_pgrid) + info_c = dbcsr_tas_info(matrix_c_rs) + CALL dbcsr_tas_info_hold(info_c) + + CALL mp_sync(matrix_a%dist%info%mp_comm) + CALL timeset("dbcsr_tas_dbcsr", handle4) SELECT CASE (tr_case) CASE (dbcsr_no_transpose) - CALL timeset(routineN//"_mm_1N", handle2) + CALL timeset("dbcsr_tas_mm_1N", handle3) CALL dbcsr_multiply(transa=dbcsr_no_transpose, transb=dbcsr_no_transpose, alpha=alpha, & matrix_a=matrix_a_mm, matrix_b=matrix_b_mm, beta=beta, matrix_c=matrix_c_mm, & filter_eps=filter_eps_prv, retain_sparsity=retain_sparsity, flop=flop) - CALL timestop(handle2) + CALL timestop(handle3) CASE (dbcsr_transpose) - CALL timeset(routineN//"_mm_1T", handle2) + CALL timeset("dbcsr_tas_mm_1T", handle3) CALL dbcsr_multiply(transa=dbcsr_transpose, transb=dbcsr_no_transpose, alpha=alpha, & matrix_a=matrix_b_mm, matrix_b=matrix_a_mm, beta=beta, matrix_c=matrix_c_mm, & filter_eps=filter_eps_prv, retain_sparsity=retain_sparsity, flop=flop) - CALL timestop(handle2) + CALL timestop(handle3) END SELECT + CALL mp_sync(matrix_a%dist%info%mp_comm) + CALL timestop(handle4) - IF (opt_pgrid) THEN - CALL dbcsr_release(matrix_a_mm) - CALL dbcsr_release(matrix_b_mm) - IF (.NOT. new_c) THEN - CALL redistribute_and_sum(matrix_c_mm, matrix_c_rs%matrix, alpha=beta) - ELSE - CALL redistribute_and_sum(matrix_c_mm, matrix_c_rs%matrix) - ENDIF + CALL dbcsr_release(matrix_a_mm) + CALL dbcsr_release(matrix_b_mm) - CALL dbcsr_release(matrix_c_mm) + nze_c = dbcsr_get_nze(matrix_c_mm) - DEALLOCATE (matrix_a_mm, matrix_b_mm, matrix_c_mm) + IF (.NOT. new_c) THEN + CALL redistribute_and_sum(matrix_c_mm, matrix_c_rs%matrix, local_copy=.NOT. opt_pgrid, alpha=beta) ELSE - IF (new_a) CALL dbcsr_tas_destroy(matrix_a_rs) - IF (new_a) DEALLOCATE (matrix_a_rs) - IF (matrix_b%do_batched == 0) THEN - CALL dbcsr_tas_destroy(matrix_b_rep) - DEALLOCATE (matrix_b_rep) - ENDIF + CALL redistribute_and_sum(matrix_c_mm, matrix_c_rs%matrix, local_copy=.NOT. opt_pgrid) ENDIF + CALL dbcsr_release(matrix_c_mm) + IF (PRESENT(filter_eps)) CALL dbcsr_tas_filter(matrix_c_rs, filter_eps) IF (unit_nr_prv /= 0) THEN @@ -519,9 +534,16 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, CALL dbcsr_tas_replicate(matrix_c_rs%matrix, dbcsr_tas_info(matrix_a_rs), matrix_c_rep, nodata=nodata_3) IF (matrix_c%do_batched == 1) THEN matrix_c%mm_storage%store_batched_repl => matrix_c_rep - matrix_c%do_batched = 2 + CALL dbcsr_tas_set_batched_state(matrix_c, state=3) ENDIF ELSEIF (matrix_c%do_batched == 2) THEN + ALLOCATE (matrix_c_rep) + CALL dbcsr_tas_replicate(matrix_c_rs%matrix, dbcsr_tas_info(matrix_a_rs), matrix_c_rep, nodata=nodata_3) + ! just leave sparsity structure for retain sparsity but no values + IF (.not. nodata_3) CALL dbcsr_zero(matrix_c_rep%matrix) + matrix_c%mm_storage%store_batched_repl => matrix_c_rep + CALL dbcsr_tas_set_batched_state(matrix_c, state=3) + ELSEIF (matrix_c%do_batched == 3) THEN matrix_c_rep => matrix_c%mm_storage%store_batched_repl ENDIF @@ -531,38 +553,49 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, ENDIF CALL convert_to_new_pgrid(mp_comm_mm, matrix_a_rs%matrix, matrix_a_mm, optimize_pgrid=opt_pgrid, move_data=move_a) - IF (new_a .AND. 
opt_pgrid) THEN + + ! keep communicators alive even after releasing TAS matrices (communicator management does not work between DBCSR and TAS) + info_a = dbcsr_tas_info(matrix_a_rs) + CALL dbcsr_tas_info_hold(info_a) + + IF (new_a) THEN CALL dbcsr_tas_destroy(matrix_a_rs) DEALLOCATE (matrix_a_rs) ENDIF CALL convert_to_new_pgrid(mp_comm_mm, matrix_b_rs%matrix, matrix_b_mm, optimize_pgrid=opt_pgrid, move_data=move_b) - IF (new_b .AND. opt_pgrid) THEN + + info_b = dbcsr_tas_info(matrix_b_rs) + CALL dbcsr_tas_info_hold(info_b) + + IF (new_b) THEN CALL dbcsr_tas_destroy(matrix_b_rs) DEALLOCATE (matrix_b_rs) ENDIF CALL convert_to_new_pgrid(mp_comm_mm, matrix_c_rep%matrix, matrix_c_mm, nodata=nodata_3, optimize_pgrid=opt_pgrid) - CALL timeset(routineN//"_mm_2", handle2) + info_c = dbcsr_tas_info(matrix_c_rep) + CALL dbcsr_tas_info_hold(info_c) + + CALL mp_sync(matrix_a%dist%info%mp_comm) + CALL timeset("dbcsr_tas_dbcsr", handle4) + CALL timeset("dbcsr_tas_mm_2", handle3) CALL dbcsr_multiply(transa=transa_prv, transb=transb_prv, alpha=alpha, matrix_a=matrix_a_mm, & matrix_b=matrix_b_mm, beta=beta, matrix_c=matrix_c_mm, & filter_eps=filter_eps_prv/REAL(nsplit, KIND=real_8), retain_sparsity=retain_sparsity, flop=flop) - CALL timestop(handle2) + CALL mp_sync(matrix_a%dist%info%mp_comm) + CALL timestop(handle3) + CALL timestop(handle4) - IF (opt_pgrid) THEN - CALL dbcsr_release(matrix_a_mm) - CALL dbcsr_release(matrix_b_mm) - CALL redistribute_and_sum(matrix_c_mm, matrix_c_rep%matrix, alpha=beta) - CALL dbcsr_release(matrix_c_mm) + CALL dbcsr_release(matrix_a_mm) + CALL dbcsr_release(matrix_b_mm) - DEALLOCATE (matrix_a_mm, matrix_b_mm, matrix_c_mm) - ELSE - IF (new_a) CALL dbcsr_tas_destroy(matrix_a_rs) - IF (new_a) DEALLOCATE (matrix_a_rs) - IF (new_b) CALL dbcsr_tas_destroy(matrix_b_rs) - IF (new_b) DEALLOCATE (matrix_b_rs) - ENDIF + nze_c = dbcsr_get_nze(matrix_c_mm) + + CALL redistribute_and_sum(matrix_c_mm, matrix_c_rep%matrix, local_copy=.NOT. opt_pgrid, alpha=beta) + + CALL dbcsr_release(matrix_c_mm) IF (unit_nr_prv /= 0) THEN CALL dbcsr_tas_write_dist(matrix_c_rep, unit_nr_prv, full_info=log_verbose) @@ -580,14 +613,14 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, ENDIF IF (PRESENT(filter_eps)) CALL dbcsr_tas_filter(matrix_c_rs, filter_eps) CASE (3) - IF (matrix_a%do_batched <= 1) THEN + IF (matrix_a%do_batched <= 2) THEN ALLOCATE (matrix_a_rep) CALL dbcsr_tas_replicate(matrix_a_rs%matrix, dbcsr_tas_info(matrix_b_rs), matrix_a_rep, move_data=.TRUE.) - IF (matrix_a%do_batched == 1) THEN + IF (matrix_a%do_batched == 1 .or. matrix_a%do_batched == 2) THEN matrix_a%mm_storage%store_batched_repl => matrix_a_rep - matrix_a%do_batched = 2 + CALL dbcsr_tas_set_batched_state(matrix_a, state=3) ENDIF - ELSEIF (matrix_a%do_batched == 2) THEN + ELSEIF (matrix_a%do_batched == 3) THEN matrix_a_rep => matrix_a%mm_storage%store_batched_repl ENDIF @@ -603,56 +636,61 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, CALL convert_to_new_pgrid(mp_comm_mm, matrix_a_rep%matrix, matrix_a_mm, optimize_pgrid=opt_pgrid, & move_data=matrix_a%do_batched == 0) - IF (opt_pgrid .AND. matrix_a%do_batched == 0) THEN + ! 
keep communicators alive even after releasing TAS matrices (communicator management does not work between DBCSR and TAS) + info_a = dbcsr_tas_info(matrix_a_rep) + CALL dbcsr_tas_info_hold(info_a) + + IF (matrix_a%do_batched == 0) THEN CALL dbcsr_tas_destroy(matrix_a_rep) DEALLOCATE (matrix_a_rep) ENDIF CALL convert_to_new_pgrid(mp_comm_mm, matrix_b_rs%matrix, matrix_b_mm, optimize_pgrid=opt_pgrid, move_data=move_b) - IF (new_b .AND. opt_pgrid) THEN + info_b = dbcsr_tas_info(matrix_b_rs) + CALL dbcsr_tas_info_hold(info_b) + + IF (new_b) THEN CALL dbcsr_tas_destroy(matrix_b_rs) DEALLOCATE (matrix_b_rs) ENDIF CALL convert_to_new_pgrid(mp_comm_mm, matrix_c_rs%matrix, matrix_c_mm, nodata=nodata_3, optimize_pgrid=opt_pgrid) + info_c = dbcsr_tas_info(matrix_c_rs) + CALL dbcsr_tas_info_hold(info_c) + + CALL mp_sync(matrix_a%dist%info%mp_comm) + CALL timeset("dbcsr_tas_dbcsr", handle4) SELECT CASE (tr_case) CASE (dbcsr_no_transpose) - CALL timeset(routineN//"_mm_3N", handle2) + CALL timeset("dbcsr_tas_mm_3N", handle3) CALL dbcsr_multiply(transa=dbcsr_no_transpose, transb=dbcsr_no_transpose, alpha=alpha, & matrix_a=matrix_a_mm, matrix_b=matrix_b_mm, beta=beta, matrix_c=matrix_c_mm, & filter_eps=filter_eps_prv, retain_sparsity=retain_sparsity, flop=flop) - CALL timestop(handle2) + CALL timestop(handle3) CASE (dbcsr_transpose) - CALL timeset(routineN//"_mm_3T", handle2) + CALL timeset("dbcsr_tas_mm_3T", handle3) CALL dbcsr_multiply(transa=dbcsr_no_transpose, transb=dbcsr_transpose, alpha=alpha, & matrix_a=matrix_b_mm, matrix_b=matrix_a_mm, beta=beta, matrix_c=matrix_c_mm, & filter_eps=filter_eps_prv, retain_sparsity=retain_sparsity, flop=flop) - CALL timestop(handle2) + CALL timestop(handle3) END SELECT + CALL mp_sync(matrix_a%dist%info%mp_comm) + CALL timestop(handle4) - IF (opt_pgrid) THEN - CALL dbcsr_release(matrix_a_mm) - CALL dbcsr_release(matrix_b_mm) + CALL dbcsr_release(matrix_a_mm) + CALL dbcsr_release(matrix_b_mm) - IF (.NOT. new_c) THEN - CALL redistribute_and_sum(matrix_c_mm, matrix_c_rs%matrix, alpha=beta) - ELSE - CALL redistribute_and_sum(matrix_c_mm, matrix_c_rs%matrix) - ENDIF + nze_c = dbcsr_get_nze(matrix_c_mm) - CALL dbcsr_release(matrix_c_mm) - - DEALLOCATE (matrix_a_mm, matrix_b_mm, matrix_c_mm) + IF (.NOT. new_c) THEN + CALL redistribute_and_sum(matrix_c_mm, matrix_c_rs%matrix, local_copy=.NOT. opt_pgrid, alpha=beta) ELSE - IF (new_b) CALL dbcsr_tas_destroy(matrix_b_rs) - IF (new_b) DEALLOCATE (matrix_b_rs) - IF (matrix_a%do_batched == 0) THEN - CALL dbcsr_tas_destroy(matrix_a_rep) - DEALLOCATE (matrix_a_rep) - ENDIF + CALL redistribute_and_sum(matrix_c_mm, matrix_c_rs%matrix, local_copy=.NOT. opt_pgrid) ENDIF + CALL dbcsr_release(matrix_c_mm) + IF (PRESENT(filter_eps)) CALL dbcsr_tas_filter(matrix_c_rs, filter_eps) IF (unit_nr_prv /= 0) THEN @@ -662,14 +700,31 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, CALL mp_comm_free(mp_comm_mm) - CALL dbcsr_tas_get_split_info(dbcsr_tas_info(matrix_c), mp_comm=mp_comm) + CALL dbcsr_tas_get_split_info(info_c, mp_comm=mp_comm) - IF (PRESENT(split_opt) .AND. .NOT. simple_split_prv) THEN + IF (PRESENT(split_opt)) THEN + SELECT CASE (max_mm_dim) + CASE (1, 3) + CALL mp_sum(nze_c, mp_comm) + CASE (2) + CALL dbcsr_tas_get_split_info(info_c, mp_comm=mp_comm, mp_comm_group=mp_comm_group) + CALL mp_sum(nze_c, mp_comm_group) + CALL mp_max(nze_c, mp_comm) + END SELECT + nsplit_opt = split_factor_estimate(max_mm_dim, nze_a, nze_b, nze_c, numproc) ! 
ideally we should rederive the split factor from the actual sparsity of C, but ! due to parameter beta, we can not get the sparsity of AxB from DBCSR if not new_c - ALLOCATE (split_opt) mp_comm_opt = dbcsr_tas_mp_comm(mp_comm, split_rc, nsplit_opt) CALL dbcsr_tas_create_split(split_opt, mp_comm_opt, split_rc, nsplit_opt, own_comm=.TRUE.) + IF (unit_nr_prv > 0) THEN + WRITE (unit_nr_prv, "(T2,A)") & + "MM PARAMETERS" + WRITE (unit_nr_prv, "(T4,A,T68,I13)") "Number of matrix elements per CPU of result matrix:", & + (nze_c + numproc - 1)/numproc + + WRITE (unit_nr_prv, "(T4,A,T68,I13)") "Optimal split factor:", nsplit_opt + ENDIF + ENDIF IF (new_c) THEN @@ -681,7 +736,6 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, IF (PRESENT(filter_eps)) CALL dbcsr_tas_filter(matrix_c, filter_eps) ELSEIF (matrix_c%do_batched > 0) THEN IF (matrix_c%mm_storage%batched_out) THEN - matrix_c%mm_storage%batched_beta = beta matrix_c%mm_storage%batched_trans = transc_prv /= transc ENDIF ENDIF @@ -707,20 +761,38 @@ RECURSIVE SUBROUTINE dbcsr_tas_multiply(transa, transb, transc, alpha, matrix_a, WRITE (unit_nr_prv, '(A)') repeat("-", 80) ENDIF + CALL dbcsr_tas_release_info(info_a) + CALL dbcsr_tas_release_info(info_b) + CALL dbcsr_tas_release_info(info_c) + + CALL mp_sync(matrix_a%dist%info%mp_comm) + CALL timestop(handle2) CALL timestop(handle) END SUBROUTINE - SUBROUTINE redistribute_and_sum(matrix_in, matrix_out, alpha) + SUBROUTINE redistribute_and_sum(matrix_in, matrix_out, local_copy, alpha) TYPE(dbcsr_type), INTENT(IN) :: matrix_in TYPE(dbcsr_type), INTENT(INOUT) :: matrix_out + LOGICAL, INTENT(IN), OPTIONAL :: local_copy TYPE(dbcsr_scalar_type), INTENT(IN), OPTIONAL :: alpha TYPE(dbcsr_type) :: matrix_tmp + LOGICAL :: local_copy_prv + + IF (PRESENT(local_copy)) THEN + local_copy_prv = local_copy + ELSE + local_copy_prv = .FALSE. + ENDIF - CALL dbcsr_create(matrix_tmp, matrix_out) - CALL dbcsr_redistribute(matrix_in, matrix_tmp) - CALL dbcsr_add(matrix_out, matrix_tmp, alpha_scalar=alpha) - CALL dbcsr_release(matrix_tmp) + IF (.NOT. local_copy_prv) THEN + CALL dbcsr_create(matrix_tmp, matrix_out) + CALL dbcsr_redistribute(matrix_in, matrix_tmp) + CALL dbcsr_add(matrix_out, matrix_tmp, alpha_scalar=alpha) + CALL dbcsr_release(matrix_tmp) + ELSE + CALL dbcsr_add(matrix_out, matrix_in, alpha_scalar=alpha) + ENDIF END SUBROUTINE @@ -1059,6 +1131,7 @@ SUBROUTINE change_split(matrix_in, matrix_out, nsplit, split_rowcol, is_new, opt CLASS(dbcsr_tas_rowcol_data), ALLOCATABLE :: rbsize, cbsize CHARACTER(LEN=*), PARAMETER :: routineN = 'change_split', & routineP = moduleN//':'//routineN + NULLIFY (matrix_out) is_new = .TRUE. @@ -1358,8 +1431,11 @@ FUNCTION split_factor_estimate(max_mm_dim, nze_a, nze_b, nze_c, numnodes) RESULT !! number of non-zeroes in C INTEGER, INTENT(IN) :: numnodes !! number of MPI ranks - INTEGER :: nsplit, nsplit_comm, nsplit_memory + INTEGER :: nsplit INTEGER(KIND=int_8) :: max_nze, min_nze + REAL(real_8) :: s_opt_factor + + s_opt_factor = dbcsr_cfg%tas_split_factor SELECT CASE (max_mm_dim) CASE (1) @@ -1375,15 +1451,8 @@ FUNCTION split_factor_estimate(max_mm_dim, nze_a, nze_b, nze_c, numnodes) RESULT DBCSR_ABORT("") END SELECT - nsplit_comm = NINT((REAL(nze_a + nze_b, real_8)/(2*min_nze))**(2._real_8/3)*REAL(numnodes, real_8)**(1._real_8/3)) - IF (nsplit_comm == 0) nsplit_comm = 1 - - ! nsplit_memory protects against excess memory usage - ! actual split factor may be up to default_nsplit_accept_ratio_1 times larger, so the largest nsplit - ! 
that fits into memory used by A or B needs to be divided by this factor - nsplit_memory = CEILING(REAL((max_nze - 1)/min_nze + 1, real_8)/default_nsplit_accept_ratio_1) - - nsplit = MIN(nsplit_comm, nsplit_memory) + nsplit = INT(MIN(INT(numnodes, KIND=int_8), NINT(REAL(max_nze, real_8)/(REAL(min_nze, real_8)*s_opt_factor), KIND=int_8))) + IF (nsplit == 0) nsplit = 1 END FUNCTION @@ -1449,8 +1518,8 @@ SUBROUTINE convert_to_new_pgrid(mp_comm_cart, matrix_in, matrix_out, move_data, INTEGER, INTENT(IN) :: mp_comm_cart !! new process grid - TYPE(dbcsr_type), INTENT(INOUT), TARGET :: matrix_in - TYPE(dbcsr_type), INTENT(OUT), POINTER :: matrix_out + TYPE(dbcsr_type), INTENT(INOUT) :: matrix_in + TYPE(dbcsr_type), INTENT(OUT) :: matrix_out LOGICAL, INTENT(IN), OPTIONAL :: move_data, nodata !! memory optimization: move data such that matrix_in is empty on return. !! Data of matrix_in should not be copied to matrix_out @@ -1470,26 +1539,26 @@ SUBROUTINE convert_to_new_pgrid(mp_comm_cart, matrix_in, matrix_out, move_data, NULLIFY (row_dist, col_dist, rbsize, rcsize) + CALL timeset(routineN, handle) + IF (PRESENT(optimize_pgrid)) THEN optimize_pgrid_prv = optimize_pgrid ELSE optimize_pgrid_prv = .TRUE. ENDIF - IF (.NOT. optimize_pgrid_prv) THEN - matrix_out => matrix_in - RETURN - ENDIF - - CALL timeset(routineN, handle) - IF (PRESENT(nodata)) THEN nodata_prv = nodata ELSE nodata_prv = .FALSE. ENDIF - ALLOCATE (matrix_out) + IF (.NOT. optimize_pgrid_prv) THEN + CALL dbcsr_create(matrix_out, template=matrix_in) + IF (.NOT. nodata_prv) CALL dbcsr_copy(matrix_out, matrix_in) + CALL timestop(handle) + RETURN + ENDIF CALL dbcsr_get_info(matrix_in, nblkrows_total=nbrows, nblkcols_total=nbcols, & row_blk_size=rbsize, col_blk_size=rcsize, & @@ -1519,19 +1588,88 @@ SUBROUTINE convert_to_new_pgrid(mp_comm_cart, matrix_in, matrix_out, move_data, SUBROUTINE dbcsr_tas_batched_mm_init(matrix) TYPE(dbcsr_tas_type), INTENT(INOUT) :: matrix - matrix%do_batched = 1 + CALL dbcsr_tas_set_batched_state(matrix, state=1) ALLOCATE (matrix%mm_storage) matrix%mm_storage%batched_out = .FALSE. END SUBROUTINE SUBROUTINE dbcsr_tas_batched_mm_finalize(matrix) TYPE(dbcsr_tas_type), INTENT(INOUT) :: matrix + INTEGER :: handle + + CALL mp_sync(matrix%dist%info%mp_comm) + CALL timeset("dbcsr_tas_total", handle) + + IF (matrix%do_batched == 0) RETURN + + IF (matrix%mm_storage%batched_out) THEN + CALL dbcsr_scale(matrix%matrix, matrix%mm_storage%batched_beta) + ENDIF + + CALL dbcsr_tas_batched_mm_complete(matrix) + + matrix%mm_storage%batched_out = .FALSE. + + DEALLOCATE (matrix%mm_storage) + CALL dbcsr_tas_set_batched_state(matrix, state=0) + + CALL mp_sync(matrix%dist%info%mp_comm) + CALL timestop(handle) + + END SUBROUTINE + + SUBROUTINE dbcsr_tas_set_batched_state(matrix, state, opt_grid) + !! set state flags during batched multiplication + + TYPE(dbcsr_tas_type), INTENT(INOUT) :: matrix + LOGICAL, INTENT(IN), OPTIONAL :: opt_grid + !! whether process grid was already optimized and should not be changed + INTEGER, INTENT(IN), OPTIONAL :: state + !! - 0 no batched MM + !! - 1 batched MM but mm_storage not yet initialized + !! - 2 batched MM and mm_storage requires update + !! - 3 batched MM and mm_storage initialized + + IF (PRESENT(opt_grid)) THEN + matrix%has_opt_pgrid = opt_grid + matrix%dist%info%strict_split(1) = .TRUE. + ENDIF + + IF (PRESENT(state)) THEN + matrix%do_batched = state + SELECT CASE (state) + CASE (0, 1) + ! reset to default + IF (matrix%has_opt_pgrid) THEN + matrix%dist%info%strict_split(1) = .TRUE. 
+ ELSE + matrix%dist%info%strict_split(1) = matrix%dist%info%strict_split(2) + ENDIF + CASE (2, 3) + matrix%dist%info%strict_split(1) = .TRUE. + CASE DEFAULT + DBCSR_ABORT("should not happen") + END SELECT + ENDIF + END SUBROUTINE + + SUBROUTINE dbcsr_tas_batched_mm_complete(matrix, warn) + TYPE(dbcsr_tas_type), INTENT(INOUT) :: matrix + LOGICAL, INTENT(IN), OPTIONAL :: warn IF (matrix%do_batched == 0) RETURN ASSOCIATE (storage => matrix%mm_storage) - IF (storage%batched_out) THEN - CALL dbcsr_tas_merge(storage%store_batched%matrix, storage%store_batched_repl, move_data=.TRUE.) - CALL dbcsr_scale(matrix%matrix, storage%batched_beta) + IF (PRESENT(warn)) THEN + IF (warn .AND. matrix%do_batched == 3) THEN + CALL dbcsr_warn(__LOCATION__, & + "Optimizations for batched multiplication are disabled because of conflicting data access") + ENDIF + ENDIF + IF (storage%batched_out .AND. matrix%do_batched == 3) THEN + + CALL dbcsr_tas_merge(storage%store_batched%matrix, & + storage%store_batched_repl, move_data=.TRUE.) + CALL dbcsr_tas_reshape(storage%store_batched, matrix, summation=.TRUE., & transposed=storage%batched_trans, move_data=.TRUE.) CALL dbcsr_tas_destroy(storage%store_batched) @@ -1542,12 +1680,9 @@ SUBROUTINE dbcsr_tas_batched_mm_finalize(matrix) CALL dbcsr_tas_destroy(storage%store_batched_repl) DEALLOCATE (storage%store_batched_repl) ENDIF - - storage%batched_out = .FALSE. END ASSOCIATE - DEALLOCATE (matrix%mm_storage) - matrix%do_batched = 0 + CALL dbcsr_tas_set_batched_state(matrix, state=2) END SUBROUTINE diff --git a/src/tas/dbcsr_tas_reshape_ops.F b/src/tas/dbcsr_tas_reshape_ops.F index 08c08664cb4..da0c6be98c4 100644 --- a/src/tas/dbcsr_tas_reshape_ops.F +++ b/src/tas/dbcsr_tas_reshape_ops.F @@ -473,11 +473,12 @@ SUBROUTINE dbcsr_tas_replicate(matrix_in, info, matrix_out, nodata, move_data) END SUBROUTINE - SUBROUTINE dbcsr_tas_merge(matrix_out, matrix_in, move_data) + SUBROUTINE dbcsr_tas_merge(matrix_out, matrix_in, summation, move_data) !! Merge submatrices of matrix_in to matrix_out by sum TYPE(dbcsr_type), INTENT(INOUT) :: matrix_out TYPE(dbcsr_tas_type), INTENT(INOUT) :: matrix_in + LOGICAL, INTENT(IN), OPTIONAL :: summation LOGICAL, INTENT(IN), OPTIONAL :: move_data !! memory optimization: move data to matrix_out such that matrix_in is empty on return @@ -500,7 +501,11 @@ SUBROUTINE dbcsr_tas_merge(matrix_out, matrix_in, move_data) CALL timeset(routineN, handle) - CALL dbcsr_clear(matrix_out) + IF (PRESENT(summation)) THEN + IF (.NOT. 
summation) CALL dbcsr_clear(matrix_out) + ELSE + CALL dbcsr_clear(matrix_out) + ENDIF IF (PRESENT(move_data)) THEN move_prv = move_data diff --git a/src/tas/dbcsr_tas_split.F b/src/tas/dbcsr_tas_split.F index b4ed0cd8786..30fa8fe3312 100644 --- a/src/tas/dbcsr_tas_split.F +++ b/src/tas/dbcsr_tas_split.F @@ -41,19 +41,20 @@ MODULE dbcsr_tas_split dbcsr_tas_release_info, & dbcsr_tas_create_split, & dbcsr_tas_create_split_rows_or_cols, & + dbcsr_tas_set_strict_split, & group_to_mrowcol, & group_to_world_proc_map, & rowsplit, & world_to_group_proc_map, & accept_pgrid_dims, & - default_nsplit_accept_ratio_1 + default_nsplit_accept_ratio, & + default_pdims_accept_ratio CHARACTER(len=*), PARAMETER, PRIVATE :: moduleN = 'dbcsr_tas_split' INTEGER, PARAMETER :: rowsplit = 1, colsplit = 2 REAL(real_8), PARAMETER :: default_pdims_accept_ratio = 1.2_real_8 - REAL(real_8), PARAMETER :: default_nsplit_accept_ratio_1 = 1.5_real_8 - REAL(real_8), PARAMETER :: default_nsplit_accept_ratio_2 = 2.5_real_8 + REAL(real_8), PARAMETER :: default_nsplit_accept_ratio = 1.5_real_8 INTERFACE dbcsr_tas_mp_comm MODULE PROCEDURE dbcsr_tas_mp_comm @@ -148,7 +149,7 @@ FUNCTION dbcsr_tas_mp_comm(mp_comm, split_rowcol, nsplit) INTEGER, INTENT(IN) :: mp_comm INTEGER, INTENT(IN) :: split_rowcol - INTEGER, INTENT(INOUT) :: nsplit + INTEGER, INTENT(IN) :: nsplit INTEGER :: dbcsr_tas_mp_comm CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_tas_mp_comm', & @@ -219,7 +220,6 @@ FUNCTION get_opt_nsplit(numproc, nsplit, split_pgrid, pdim_nonsplit) INTEGER :: lb, ub, count, count_square, count_accept, split, & minpos, get_opt_nsplit INTEGER, DIMENSION(2) :: dims_sub - REAL(KIND=real_8) :: nsplit_accept_ratio DBCSR_ASSERT(nsplit > 0) @@ -227,18 +227,8 @@ FUNCTION get_opt_nsplit(numproc, nsplit, split_pgrid, pdim_nonsplit) DBCSR_ASSERT(PRESENT(pdim_nonsplit)) ENDIF - IF (split_pgrid) THEN - ! if process grid dimensions are fixed, we need more flexibility in split factor in order - ! to obtain balanced grid dimensions --> larger accept ratio - nsplit_accept_ratio = default_nsplit_accept_ratio_2 - ELSE - ! if process grid dimensions are flexible, less flexibility in split factor sufficient in - ! order to obtain balanced grid dimensions --> smaller accept ratio - nsplit_accept_ratio = default_nsplit_accept_ratio_1 - ENDIF - - lb = CEILING(REAL(nsplit, real_8)/nsplit_accept_ratio) - ub = FLOOR(REAL(nsplit, real_8)*nsplit_accept_ratio) + lb = CEILING(REAL(nsplit, real_8)/default_nsplit_accept_ratio) + ub = FLOOR(REAL(nsplit, real_8)*default_nsplit_accept_ratio) IF (ub < lb) ub = lb @@ -373,7 +363,10 @@ SUBROUTINE dbcsr_tas_create_split(split_info, mp_comm, split_rowcol, nsplit, own igroup = pcoord(split_rowcol)/pdims_group(split_rowcol) CALL dbcsr_tas_create_split_rows_or_cols(split_info, mp_comm, nsplit_opt, igroup, split_rowcol, own_comm=own_comm) - IF (.NOT. opt_nsplit_prv) split_info%strict_split = .TRUE. + + IF (nsplit > 0) THEN + ALLOCATE (split_info%ngroup_opt, SOURCE=nsplit) + ENDIF CALL timestop(handle) @@ -451,6 +444,8 @@ SUBROUTINE dbcsr_tas_release_info(split_info) ENDIF split_info%pdims = 0 + + IF (ALLOCATED(split_info%ngroup_opt)) DEALLOCATE (split_info%ngroup_opt) END SUBROUTINE SUBROUTINE dbcsr_tas_info_hold(split_info) @@ -644,4 +639,10 @@ SUBROUTINE group_to_mrowcol(info, rowcol_dist, igroup, rowcols) CALL sort(rowcols, nrowcols, sort_indices) END SUBROUTINE + SUBROUTINE dbcsr_tas_set_strict_split(info) + !! 
freeze current split factor such that it is never changed during multiplication + TYPE(dbcsr_tas_split_info), INTENT(INOUT) :: info + info%strict_split = [.TRUE., .TRUE.] + END SUBROUTINE + END MODULE diff --git a/src/tas/dbcsr_tas_types.F b/src/tas/dbcsr_tas_types.F index 0f01c81aa67..18bb42785a7 100644 --- a/src/tas/dbcsr_tas_types.F +++ b/src/tas/dbcsr_tas_types.F @@ -28,7 +28,8 @@ MODULE dbcsr_tas_types dbcsr_tas_distribution_type, & dbcsr_tas_iterator, & dbcsr_tas_split_info, & - dbcsr_tas_type + dbcsr_tas_type, & + dbcsr_tas_mm_storage ! info on MPI Cartesian grid that is split on MPI subgroups. ! info on distribution of matrix rows / columns to different subgroups. @@ -41,7 +42,9 @@ MODULE dbcsr_tas_types INTEGER :: pgrid_split_size ! how many process rows/cols in subgroups INTEGER :: group_size ! group size (how many cores) of subgroups INTEGER :: mp_comm_group ! sub communicator - LOGICAL :: strict_split = .FALSE. ! if .true., split factor should not be modified (for testing only) + INTEGER, ALLOCATABLE :: ngroup_opt ! optimal number of groups (split factor) + LOGICAL, DIMENSION(2) :: strict_split = [.FALSE., .FALSE.] + ! if .true., split factor should not be modified (2 parameters for current and general settings) INTEGER, POINTER :: refcount => NULL() ! lightweight reference counting for communicators END TYPE @@ -77,12 +80,9 @@ MODULE dbcsr_tas_types LOGICAL :: valid = .FALSE. ! has been created? ! storage and flags for batched matrix multiplication - INTEGER :: do_batched = 0 ! whether we are doing batched MM: - ! - 0 no batched MM - ! - 1 batched MM but mm_storage not yet initialized - ! - 2 batched MM and mm_storage initialized + INTEGER :: do_batched = 0 ! state flag for batched multiplication TYPE(dbcsr_tas_mm_storage), ALLOCATABLE :: mm_storage ! storage for batched processing of matrix matrix multiplication. - + LOGICAL :: has_opt_pgrid = .FALSE. ! 
whether pgrid was automatically optimized END TYPE TYPE dbcsr_tas_iterator diff --git a/src/tensors/dbcsr_tensor.F b/src/tensors/dbcsr_tensor.F index dce5dbd1f55..d5ebcd1446a 100644 --- a/src/tensors/dbcsr_tensor.F +++ b/src/tensors/dbcsr_tensor.F @@ -21,7 +21,8 @@ MODULE dbcsr_tensor USE dbcsr_allocate_wrap, ONLY: & allocate_any USE dbcsr_array_list_methods, ONLY: & - get_arrays, reorder_arrays, get_ith_array, array_list, array_sublist, check_equal, array_eq_i + get_arrays, reorder_arrays, get_ith_array, array_list, array_sublist, check_equal, array_eq_i, & + create_array_list, destroy_array_list, sizes_of_arrays USE dbcsr_api, ONLY: & dbcsr_type, dbcsr_iterator_type, dbcsr_iterator_blocks_left, & dbcsr_iterator_next_block, dbcsr_iterator_start, dbcsr_iterator_stop, & @@ -33,14 +34,15 @@ MODULE dbcsr_tensor USE dbcsr_tas_base, ONLY: & dbcsr_tas_copy, dbcsr_tas_finalize, dbcsr_tas_get_data_type, dbcsr_tas_get_info, dbcsr_tas_info USE dbcsr_tas_mm, ONLY: & - dbcsr_tas_multiply, dbcsr_tas_batched_mm_init, dbcsr_tas_batched_mm_finalize, dbcsr_tas_result_index + dbcsr_tas_multiply, dbcsr_tas_batched_mm_init, dbcsr_tas_batched_mm_finalize, dbcsr_tas_result_index, & + dbcsr_tas_batched_mm_complete, dbcsr_tas_set_batched_state USE dbcsr_tensor_block, ONLY: & dbcsr_t_iterator_type, dbcsr_t_get_block, dbcsr_t_put_block, dbcsr_t_iterator_start, & dbcsr_t_iterator_blocks_left, dbcsr_t_iterator_stop, dbcsr_t_iterator_next_block, & ndims_iterator, dbcsr_t_reserve_blocks, block_nd, destroy_block USE dbcsr_tensor_index, ONLY: & dbcsr_t_get_mapping_info, nd_to_2d_mapping, dbcsr_t_inverse_order, permute_index, get_nd_indices_tensor, & - ndims_mapping_row, ndims_mapping_column + ndims_mapping_row, ndims_mapping_column, ndims_mapping USE dbcsr_tensor_types, ONLY: & dbcsr_t_create, dbcsr_t_get_data_type, dbcsr_t_type, ndims_tensor, dims_tensor, & dbcsr_t_distribution_type, dbcsr_t_distribution, dbcsr_t_nd_mp_comm, dbcsr_t_destroy, & @@ -48,25 +50,25 @@ MODULE dbcsr_tensor blk_dims_tensor, dbcsr_t_hold, dbcsr_t_pgrid_type, mp_environ_pgrid, dbcsr_t_filter, & dbcsr_t_clear, dbcsr_t_finalize, dbcsr_t_get_num_blocks, dbcsr_t_scale, & dbcsr_t_get_num_blocks_total, dbcsr_t_get_info, ndims_matrix_row, ndims_matrix_column, & - dbcsr_t_max_nblks_local, dbcsr_t_default_distvec + dbcsr_t_max_nblks_local, dbcsr_t_default_distvec, dbcsr_t_contraction_storage, dbcsr_t_nblks_total, & + dbcsr_t_distribution_new, dbcsr_t_copy_contraction_storage, dbcsr_t_pgrid_destroy USE dbcsr_kinds, ONLY: & ${uselist(dtype_float_prec)}$, default_string_length, int_8, dp USE dbcsr_mpiwrap, ONLY: & - mp_environ, mp_max, mp_sum + mp_environ, mp_max, mp_sum, mp_comm_free, mp_cart_create, mp_sync USE dbcsr_toollib, ONLY: & sort USE dbcsr_tensor_reshape, ONLY: & dbcsr_t_reshape - USE dbcsr_mpiwrap, ONLY: & - mp_comm_free USE dbcsr_tas_split, ONLY: & - dbcsr_tas_mp_comm, rowsplit, colsplit, dbcsr_tas_info_hold, dbcsr_tas_release_info + dbcsr_tas_mp_comm, rowsplit, colsplit, dbcsr_tas_info_hold, dbcsr_tas_release_info, default_nsplit_accept_ratio, & + default_pdims_accept_ratio, dbcsr_tas_create_split USE dbcsr_data_types, ONLY: & dbcsr_scalar_type USE dbcsr_tensor_split, ONLY: & dbcsr_t_split_copyback, dbcsr_t_make_compatible_blocks, dbcsr_t_crop USE dbcsr_tensor_io, ONLY: & - dbcsr_t_write_tensor_info, dbcsr_t_write_tensor_dist, prep_output_unit + dbcsr_t_write_tensor_info, dbcsr_t_write_tensor_dist, prep_output_unit, dbcsr_t_write_split_info USE dbcsr_dist_operations, ONLY: & checker_tr USE dbcsr_toollib, ONLY: & @@ -121,6 +123,29 @@ SUBROUTINE 
dbcsr_t_copy(tensor_in, tensor_out, order, summation, bounds, move_da INTENT(IN), OPTIONAL :: bounds !! crop tensor data: start and end index for each tensor dimension INTEGER, INTENT(IN), OPTIONAL :: unit_nr + INTEGER :: handle + + CALL mp_sync(tensor_in%pgrid%mp_comm_2d) + CALL timeset("dbcsr_t_total", handle) + + ! make sure that it is safe to use dbcsr_t_copy during a batched contraction + CALL dbcsr_tas_batched_mm_complete(tensor_in%matrix_rep, warn=.TRUE.) + CALL dbcsr_tas_batched_mm_complete(tensor_out%matrix_rep, warn=.TRUE.) + + CALL dbcsr_t_copy_expert(tensor_in, tensor_out, order, summation, bounds, move_data, unit_nr) + CALL mp_sync(tensor_in%pgrid%mp_comm_2d) + CALL timestop(handle) + END SUBROUTINE + + SUBROUTINE dbcsr_t_copy_expert(tensor_in, tensor_out, order, summation, bounds, move_data, unit_nr) + !! expert routine for copying a tensor. For internal use only. + TYPE(dbcsr_t_type), INTENT(INOUT), TARGET :: tensor_in, tensor_out + INTEGER, DIMENSION(ndims_tensor(tensor_in)), & + INTENT(IN), OPTIONAL :: order + LOGICAL, INTENT(IN), OPTIONAL :: summation, move_data + INTEGER, DIMENSION(2, ndims_tensor(tensor_in)), & + INTENT(IN), OPTIONAL :: bounds + INTEGER, INTENT(IN), OPTIONAL :: unit_nr TYPE(dbcsr_t_type), POINTER :: in_tmp_1, in_tmp_2, & in_tmp_3, out_tmp_1 @@ -405,31 +430,28 @@ SUBROUTINE dbcsr_t_contract(alpha, tensor_1, tensor_2, beta, tensor_3, & !! * tensor_2(contract_2, notcontract_2) !! + beta * tensor_3(map_1, map_2) !! - !! note: block sizes of the corresponding indices need to be the same in all tensors. + !! @note + !! note 1: block sizes of the corresponding indices need to be the same in all tensors. + !! + !! note 2: for best performance the tensors should have been created in matrix layouts + !! compatible with the contraction, e.g. tensor_1 should have been created with either + !! map1_2d == contract_1 and map2_2d == notcontract_1 or map1_2d == notcontract_1 and + !! map2_2d == contract_1 (the same with tensor_2 and contract_2 / notcontract_2 and with + !! tensor_3 and map_1 / map_2). + !! Furthermore the two largest tensors involved in the contraction should map both to either + !! tall or short matrices: the largest matrix dimension should be "on the same side" + !! and should have identical distribution (which is always the case if the distributions were + !! obtained with dbcsr_t_default_distvec). !! - !! note: for best performance the tensors should have been created with consistent map1_2d and map2_2d - !! arguments as follows: - !! tensor_1: - !! map1_2d == contract_1 - !! map2_2d == notcontract_1 - !! tensor_2: - !! map1_2d == contract_2 - !! map2_2d == notcontract_2 - !! tensor_3 - !! map1_2d == map_1 - !! map2_2d == map_2 - !! or alternatively: - !! tensor_1: - !! map1_2d == notcontract_1 - !! map2_2d == contract_1 - !! tensor_2: - !! map1_2d == notcontract_2 - !! map2_2d == contract_2 - !! tensor_3 - !! map1_2d == map_1 - !! map2_2d == map_2 - !! this is an optimization, not a requirement. in general this condition can not always be satisfied - !! (if the same tensor appears in more than one contraction). + !! note 3: if the same tensor occurs in multiple contractions, a different tensor object should + !! be created for each contraction and the data should be copied between the tensors by use of + !! dbcsr_t_copy. If the same tensor object is used in multiple contractions, matrix layouts are + !! not compatible for all contractions (see note 2). + !! + !! 
note 4: automatic optimizations are enabled by using the feature of batched contraction, see + !! dbcsr_t_batched_contract_init, dbcsr_t_batched_contract_finalize. The arguments bounds_1, + !! bounds_2, bounds_3 give the index ranges of the batches. + !! @endnote TYPE(dbcsr_scalar_type), INTENT(IN) :: alpha TYPE(dbcsr_t_type), INTENT(INOUT), TARGET :: tensor_1 @@ -453,13 +475,16 @@ SUBROUTINE dbcsr_t_contract(alpha, tensor_1, tensor_2, beta, tensor_3, & !! contracted tensor (out) INTEGER, DIMENSION(2, SIZE(contract_1)), & INTENT(IN), OPTIONAL :: bounds_1 - !! bounds corresponding to contract_1 AKA contract_2 + !! bounds corresponding to contract_1 AKA contract_2: start and end index of an index range over + !! which to contract. For use in batched contraction. INTEGER, DIMENSION(2, SIZE(notcontract_1)), & INTENT(IN), OPTIONAL :: bounds_2 - !! bounds corresponding to notcontract_1 + !! bounds corresponding to notcontract_1: start and end index of an index range. + !! For use in batched contraction. INTEGER, DIMENSION(2, SIZE(notcontract_2)), & INTENT(IN), OPTIONAL :: bounds_3 - !! bounds corresponding to notcontract_2 + !! bounds corresponding to notcontract_2: start and end index of an index range. + !! For use in batched contraction. LOGICAL, INTENT(IN), OPTIONAL :: optimize_dist !! Whether distribution should be optimized internally. In the current implementation this guarantees optimal parameters !! only for dense matrices. @@ -494,7 +519,8 @@ SUBROUTINE dbcsr_t_contract(alpha, tensor_1, tensor_2, beta, tensor_3, & INTEGER :: handle - CALL timeset(routineN, handle) + CALL mp_sync(tensor_1%pgrid%mp_comm_2d) + CALL timeset("dbcsr_t_total", handle) CALL dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & contract_1, notcontract_1, & contract_2, notcontract_2, & @@ -512,6 +538,7 @@ SUBROUTINE dbcsr_t_contract(alpha, tensor_1, tensor_2, beta, tensor_3, & retain_sparsity=retain_sparsity, & unit_nr=unit_nr, & log_verbose=log_verbose) + CALL mp_sync(tensor_1%pgrid%mp_comm_2d) CALL timestop(handle) END SUBROUTINE @@ -565,11 +592,13 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & TYPE(dbcsr_t_type), POINTER :: tensor_contr_1, tensor_contr_2, tensor_contr_3 TYPE(dbcsr_t_type), TARGET :: tensor_algn_1, tensor_algn_2, tensor_algn_3 TYPE(dbcsr_t_type), POINTER :: tensor_crop_1, tensor_crop_2 + TYPE(dbcsr_t_type), POINTER :: tensor_small, tensor_large INTEGER(int_8), DIMENSION(:, :), ALLOCATABLE :: result_index_2d - LOGICAL :: assert_stmt + LOGICAL :: assert_stmt, tensors_remapped INTEGER :: data_type, max_mm_dim, max_tensor, mp_comm, & - iblk, nblk, unit_nr_prv + iblk, nblk, unit_nr_prv, ref_tensor, mp_comm_opt, & + handle INTEGER, DIMENSION(SIZE(contract_1)) :: contract_1_mod INTEGER, DIMENSION(SIZE(notcontract_1)) :: notcontract_1_mod INTEGER, DIMENSION(SIZE(contract_2)) :: contract_2_mod @@ -582,7 +611,7 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & INTEGER :: occ_1, occ_2 INTEGER, DIMENSION(:), ALLOCATABLE :: dims1, dims2, dims3 - CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_t_contract_expert', & + CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_t_contract', & routineP = moduleN//':'//routineN CHARACTER(LEN=1), DIMENSION(:), ALLOCATABLE :: indchar1, indchar2, indchar3, indchar1_mod, & indchar2_mod, indchar3_mod @@ -590,10 +619,17 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o'] INTEGER, 
DIMENSION(2, ndims_tensor(tensor_1)) :: bounds_t1 INTEGER, DIMENSION(2, ndims_tensor(tensor_2)) :: bounds_t2 - LOGICAL :: do_crop_1, do_crop_2, do_write_3, nodata_3 - TYPE(dbcsr_tas_split_info), POINTER :: split_opt + LOGICAL :: do_crop_1, do_crop_2, do_write_3, nodata_3, do_batched, pgrid_changed, & + pgrid_changed_any, do_change_pgrid(2) + TYPE(dbcsr_tas_split_info) :: split_opt, split, split_opt_avg + INTEGER, DIMENSION(2) :: pdims_2d_opt, pdims_2d, pcoord_2d, pdims_sub, pdims_sub_opt + LOGICAL, DIMENSION(2) :: periods_2d + REAL(real_8) :: pdim_ratio, pdim_ratio_opt - NULLIFY (tensor_contr_1, tensor_contr_2, tensor_contr_3, tensor_crop_1, tensor_crop_2, split_opt) + NULLIFY (tensor_contr_1, tensor_contr_2, tensor_contr_3, tensor_crop_1, tensor_crop_2, & + tensor_small) + + CALL timeset(routineN, handle) DBCSR_ASSERT(tensor_1%valid) DBCSR_ASSERT(tensor_2%valid) @@ -681,6 +717,8 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & CALL dbcsr_t_destroy(tensor_crop_2) DEALLOCATE (tensor_crop_2) ENDIF + + CALL timestop(handle) RETURN ENDIF @@ -768,12 +806,20 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & CALL reshape_mm_compatible(tensor_algn_1, tensor_algn_3, tensor_contr_1, tensor_contr_3, & contract_1_mod, notcontract_1_mod, map_2_mod, map_1_mod, & - trans_1, trans_3, new_1, new_3, nodata2=nodata_3, optimize_dist=optimize_dist, & + trans_1, trans_3, new_1, new_3, ref_tensor, nodata2=nodata_3, optimize_dist=optimize_dist, & move_data_1=move_data_1, unit_nr=unit_nr_prv) CALL reshape_mm_small(tensor_algn_2, contract_2_mod, notcontract_2_mod, tensor_contr_2, trans_2, & new_2, move_data=move_data_2, unit_nr=unit_nr_prv) + SELECT CASE (ref_tensor) + CASE (1) + tensor_large => tensor_contr_1 + CASE (2) + tensor_large => tensor_contr_3 + END SELECT + tensor_small => tensor_contr_2 + CASE (2) IF (unit_nr_prv > 0) THEN WRITE (unit_nr_prv, '(T2,A)') "large tensors: 1, 2; small tensor: 3" @@ -793,13 +839,21 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & CALL reshape_mm_compatible(tensor_algn_1, tensor_algn_2, tensor_contr_1, tensor_contr_2, & notcontract_1_mod, contract_1_mod, notcontract_2_mod, contract_2_mod, & - trans_1, trans_2, new_1, new_2, optimize_dist=optimize_dist, & + trans_1, trans_2, new_1, new_2, ref_tensor, optimize_dist=optimize_dist, & move_data_1=move_data_1, move_data_2=move_data_2, unit_nr=unit_nr_prv) CALL invert_transpose_flag(trans_1) CALL reshape_mm_small(tensor_algn_3, map_1_mod, map_2_mod, tensor_contr_3, trans_3, & new_3, nodata=nodata_3, unit_nr=unit_nr_prv) + SELECT CASE (ref_tensor) + CASE (1) + tensor_large => tensor_contr_1 + CASE (2) + tensor_large => tensor_contr_2 + END SELECT + tensor_small => tensor_contr_3 + CASE (3) IF (unit_nr_prv > 0) THEN WRITE (unit_nr_prv, '(T2,A)') "large tensors: 2, 3; small tensor: 1" @@ -818,7 +872,7 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & CALL reshape_mm_compatible(tensor_algn_2, tensor_algn_3, tensor_contr_2, tensor_contr_3, & contract_2_mod, notcontract_2_mod, map_1_mod, map_2_mod, & - trans_2, trans_3, new_2, new_3, nodata2=nodata_3, optimize_dist=optimize_dist, & + trans_2, trans_3, new_2, new_3, ref_tensor, nodata2=nodata_3, optimize_dist=optimize_dist, & move_data_1=move_data_2, unit_nr=unit_nr_prv) CALL invert_transpose_flag(trans_2) @@ -827,6 +881,14 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & CALL reshape_mm_small(tensor_algn_1, 
notcontract_1_mod, contract_1_mod, tensor_contr_1, & trans_1, new_1, move_data=move_data_1, unit_nr=unit_nr_prv) + SELECT CASE (ref_tensor) + CASE (1) + tensor_large => tensor_contr_2 + CASE (2) + tensor_large => tensor_contr_3 + END SELECT + tensor_small => tensor_contr_1 + END SELECT IF (unit_nr_prv /= 0) CALL dbcsr_t_print_contraction_index(tensor_contr_1, indchar1_mod, & @@ -848,6 +910,7 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & split_opt=split_opt, & move_data_a=move_data_1, move_data_b=move_data_2, retain_sparsity=retain_sparsity) ELSE + CALL dbcsr_tas_result_index(trans_1, trans_2, trans_3, tensor_contr_1%matrix_rep, tensor_contr_2%matrix_rep, & tensor_contr_3%matrix_rep, filter_eps=filter_eps, blk_ind=result_index_2d) @@ -883,42 +946,77 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & DEALLOCATE (tensor_crop_2) ENDIF - IF (ASSOCIATED(split_opt)) THEN - CALL dbcsr_tas_release_info(split_opt) - DEALLOCATE (split_opt) - ENDIF - CALL dbcsr_t_destroy(tensor_algn_1) CALL dbcsr_t_destroy(tensor_algn_2) CALL dbcsr_t_destroy(tensor_algn_3) + CALL timestop(handle) RETURN ENDIF - IF (PRESENT(pgrid_opt_1) .AND. ASSOCIATED(split_opt)) THEN + IF (PRESENT(pgrid_opt_1)) THEN IF (.NOT. new_1) THEN ALLOCATE (pgrid_opt_1) pgrid_opt_1 = opt_pgrid(tensor_1, split_opt) ENDIF ENDIF - IF (PRESENT(pgrid_opt_2) .AND. ASSOCIATED(split_opt)) THEN + IF (PRESENT(pgrid_opt_2)) THEN IF (.NOT. new_2) THEN ALLOCATE (pgrid_opt_2) pgrid_opt_2 = opt_pgrid(tensor_2, split_opt) ENDIF ENDIF - IF (PRESENT(pgrid_opt_3) .AND. ASSOCIATED(split_opt)) THEN + IF (PRESENT(pgrid_opt_3)) THEN IF (.NOT. new_3) THEN ALLOCATE (pgrid_opt_3) pgrid_opt_3 = opt_pgrid(tensor_3, split_opt) ENDIF ENDIF - IF (ASSOCIATED(split_opt)) THEN - CALL dbcsr_tas_release_info(split_opt) - DEALLOCATE (split_opt) + do_batched = tensor_small%matrix_rep%do_batched > 0 + + tensors_remapped = .FALSE. + IF (new_1 .OR. new_2 .OR. new_3) tensors_remapped = .TRUE. + + IF (tensors_remapped .AND. do_batched) THEN + CALL dbcsr_warn(__LOCATION__, & + "Internal process grid optimization disabled because tensors are not in contraction-compatible format") + ENDIF + + CALL mp_environ(tensor_large%pgrid%mp_comm_2d, 2, pdims_2d, pcoord_2d, periods_2d) + + ! optimize process grid during batched contraction + do_change_pgrid(:) = .FALSE. + IF ((.NOT. tensors_remapped) .AND. do_batched) THEN + ASSOCIATE (storage => tensor_small%contraction_storage) + DBCSR_ASSERT(storage%static) + split = dbcsr_tas_info(tensor_large%matrix_rep) + do_change_pgrid(:) = & + update_contraction_storage(storage, split_opt, split) + + IF (ANY(do_change_pgrid)) THEN + mp_comm_opt = dbcsr_tas_mp_comm(tensor_small%pgrid%mp_comm_2d, split_opt%split_rowcol, NINT(storage%nsplit_avg)) + CALL dbcsr_tas_create_split(split_opt_avg, mp_comm_opt, split_opt%split_rowcol, & + NINT(storage%nsplit_avg), own_comm=.TRUE.) + CALL mp_environ(split_opt_avg%mp_comm, 2, pdims_2d_opt, pcoord_2d, periods_2d) + ENDIF + + END ASSOCIATE + + IF (do_change_pgrid(1) .AND. .NOT. do_change_pgrid(2)) THEN + ! 
check if new grid has better subgrid, if not there is no need to change process grid + CALL mp_environ(split_opt_avg%mp_comm_group, 2, pdims_sub_opt, pcoord_2d, periods_2d) + CALL mp_environ(split%mp_comm_group, 2, pdims_sub, pcoord_2d, periods_2d) + + pdim_ratio = MAXVAL(REAL(pdims_sub, real_8))/MINVAL(pdims_sub) + pdim_ratio_opt = MAXVAL(REAL(pdims_sub_opt, real_8))/MINVAL(pdims_sub_opt) + IF (pdim_ratio/pdim_ratio_opt <= default_pdims_accept_ratio**2) THEN + do_change_pgrid(1) = .FALSE. + CALL dbcsr_tas_release_info(split_opt_avg) + ENDIF + ENDIF ENDIF IF (unit_nr_prv /= 0) THEN @@ -935,12 +1033,17 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & IF (new_3) THEN ! need redistribute if we created new tensor for tensor 3 CALL dbcsr_t_scale(tensor_algn_3, beta) - CALL dbcsr_t_copy(tensor_contr_3, tensor_algn_3, summation=.TRUE., move_data=.TRUE.) + CALL dbcsr_t_copy_expert(tensor_contr_3, tensor_algn_3, summation=.TRUE., move_data=.TRUE.) IF (PRESENT(filter_eps)) CALL dbcsr_t_filter(tensor_algn_3, filter_eps) ! tensor_3 automatically has correct data because tensor_algn_3 contains a matrix ! pointer to data of tensor_3 ENDIF + ! transfer contraction storage + CALL dbcsr_t_copy_contraction_storage(tensor_contr_1, tensor_1) + CALL dbcsr_t_copy_contraction_storage(tensor_contr_2, tensor_2) + CALL dbcsr_t_copy_contraction_storage(tensor_contr_3, tensor_3) + IF (unit_nr_prv /= 0) THEN IF (new_3 .AND. do_write_3) CALL dbcsr_t_write_tensor_info(tensor_3, unit_nr_prv, full_info=log_verbose) IF (new_3 .AND. do_write_3) CALL dbcsr_t_write_tensor_dist(tensor_3, unit_nr_prv) @@ -951,23 +1054,11 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & CALL dbcsr_t_destroy(tensor_algn_3) IF (do_crop_1) THEN - IF (tensor_1%matrix_rep%do_batched > 0) THEN - ! transfer data for batched contraction - DEALLOCATE (tensor_1%matrix_rep%mm_storage) - ALLOCATE (tensor_1%matrix_rep%mm_storage, SOURCE=tensor_crop_1%matrix_rep%mm_storage) - tensor_1%matrix_rep%do_batched = tensor_crop_1%matrix_rep%do_batched - ENDIF CALL dbcsr_t_destroy(tensor_crop_1) DEALLOCATE (tensor_crop_1) ENDIF IF (do_crop_2) THEN - IF (tensor_2%matrix_rep%do_batched > 0) THEN - ! transfer data for batched contraction - DEALLOCATE (tensor_2%matrix_rep%mm_storage) - ALLOCATE (tensor_2%matrix_rep%mm_storage, SOURCE=tensor_crop_2%matrix_rep%mm_storage) - tensor_2%matrix_rep%do_batched = tensor_crop_2%matrix_rep%do_batched - ENDIF CALL dbcsr_t_destroy(tensor_crop_2) DEALLOCATE (tensor_crop_2) ENDIF @@ -998,6 +1089,80 @@ SUBROUTINE dbcsr_t_contract_expert(alpha, tensor_1, tensor_2, beta, tensor_3, & WRITE (unit_nr_prv, '(A)') repeat("-", 80) ENDIF + IF (ANY(do_change_pgrid)) THEN + pgrid_changed_any = .FALSE. + SELECT CASE (max_mm_dim) + CASE (1) + IF (ALLOCATED(tensor_1%contraction_storage) .AND. ALLOCATED(tensor_3%contraction_storage)) THEN + CALL dbcsr_t_change_pgrid_2d(tensor_1, tensor_1%pgrid%mp_comm_2d, pdims=pdims_2d_opt, & + nsplit=split_opt_avg%ngroup, dimsplit=split_opt_avg%split_rowcol, & + pgrid_changed=pgrid_changed, & + unit_nr=unit_nr_prv) + IF (pgrid_changed) pgrid_changed_any = .TRUE. + CALL dbcsr_t_change_pgrid_2d(tensor_3, tensor_3%pgrid%mp_comm_2d, pdims=pdims_2d_opt, & + nsplit=split_opt_avg%ngroup, dimsplit=split_opt_avg%split_rowcol, & + pgrid_changed=pgrid_changed, & + unit_nr=unit_nr_prv) + IF (pgrid_changed) pgrid_changed_any = .TRUE. + ENDIF + IF (pgrid_changed_any) THEN + IF (tensor_2%matrix_rep%do_batched == 3) THEN + ! 
set flag that process grid has been optimized to make sure that no grid optimizations are done + ! in TAS multiply algorithm + CALL dbcsr_tas_batched_mm_complete(tensor_2%matrix_rep) + ENDIF + ENDIF + CASE (2) + IF (ALLOCATED(tensor_1%contraction_storage) .AND. ALLOCATED(tensor_2%contraction_storage)) THEN + CALL dbcsr_t_change_pgrid_2d(tensor_1, tensor_1%pgrid%mp_comm_2d, pdims=pdims_2d_opt, & + nsplit=split_opt_avg%ngroup, dimsplit=split_opt_avg%split_rowcol, & + pgrid_changed=pgrid_changed, & + unit_nr=unit_nr_prv) + IF (pgrid_changed) pgrid_changed_any = .TRUE. + CALL dbcsr_t_change_pgrid_2d(tensor_2, tensor_2%pgrid%mp_comm_2d, pdims=pdims_2d_opt, & + nsplit=split_opt_avg%ngroup, dimsplit=split_opt_avg%split_rowcol, & + pgrid_changed=pgrid_changed, & + unit_nr=unit_nr_prv) + IF (pgrid_changed) pgrid_changed_any = .TRUE. + ENDIF + IF (pgrid_changed_any) THEN + IF (tensor_3%matrix_rep%do_batched == 3) THEN + CALL dbcsr_tas_batched_mm_complete(tensor_3%matrix_rep) + ENDIF + ENDIF + CASE (3) + IF (ALLOCATED(tensor_2%contraction_storage) .AND. ALLOCATED(tensor_3%contraction_storage)) THEN + CALL dbcsr_t_change_pgrid_2d(tensor_2, tensor_2%pgrid%mp_comm_2d, pdims=pdims_2d_opt, & + nsplit=split_opt_avg%ngroup, dimsplit=split_opt_avg%split_rowcol, & + pgrid_changed=pgrid_changed, & + unit_nr=unit_nr_prv) + IF (pgrid_changed) pgrid_changed_any = .TRUE. + CALL dbcsr_t_change_pgrid_2d(tensor_3, tensor_3%pgrid%mp_comm_2d, pdims=pdims_2d_opt, & + nsplit=split_opt_avg%ngroup, dimsplit=split_opt_avg%split_rowcol, & + pgrid_changed=pgrid_changed, & + unit_nr=unit_nr_prv) + IF (pgrid_changed) pgrid_changed_any = .TRUE. + ENDIF + IF (pgrid_changed_any) THEN + IF (tensor_1%matrix_rep%do_batched == 3) THEN + CALL dbcsr_tas_batched_mm_complete(tensor_1%matrix_rep) + ENDIF + ENDIF + END SELECT + CALL dbcsr_tas_release_info(split_opt_avg) + ENDIF + + IF ((.NOT. tensors_remapped) .AND. do_batched) THEN + ! freeze TAS process grids if tensor grids were optimized + CALL dbcsr_tas_set_batched_state(tensor_1%matrix_rep, opt_grid=.TRUE.) + CALL dbcsr_tas_set_batched_state(tensor_2%matrix_rep, opt_grid=.TRUE.) + CALL dbcsr_tas_set_batched_state(tensor_3%matrix_rep, opt_grid=.TRUE.) + ENDIF + + CALL dbcsr_tas_release_info(split_opt) + + CALL timestop(handle) + END SUBROUTINE SUBROUTINE align_tensor(tensor_in, contract_in, notcontract_in, & @@ -1022,7 +1187,8 @@ SUBROUTINE align_tensor(tensor_in, contract_in, notcontract_in, & END SUBROUTINE SUBROUTINE reshape_mm_compatible(tensor1, tensor2, tensor1_out, tensor2_out, ind1_free, ind1_linked, & - ind2_free, ind2_linked, trans1, trans2, new1, new2, nodata1, nodata2, move_data_1, & + ind2_free, ind2_linked, trans1, trans2, new1, new2, ref_tensor, & + nodata1, nodata2, move_data_1, & move_data_2, optimize_dist, unit_nr) !! Prepare tensor for contraction: redistribute to a 2d format which can be contracted by !! matrix multiplication. This routine reshapes the two largest of the three tensors. Redistribution @@ -1046,6 +1212,7 @@ SUBROUTINE reshape_mm_compatible(tensor1, tensor2, tensor1_out, tensor2_out, ind LOGICAL, INTENT(OUT) :: new1, new2 !! whether a new tensor 1 was created !! whether a new tensor 2 was created + INTEGER, INTENT(OUT) :: ref_tensor LOGICAL, INTENT(IN), OPTIONAL :: nodata1, nodata2 !! don't copy data of tensor 1 !! don't copy data of tensor 2 @@ -1056,19 +1223,24 @@ SUBROUTINE reshape_mm_compatible(tensor1, tensor2, tensor1_out, tensor2_out, ind !! experimental: optimize distribution INTEGER, INTENT(IN), OPTIONAL :: unit_nr !! 
output unit - INTEGER :: ref_tensor, compat1, compat1_old, compat2, compat2_old, & + INTEGER :: compat1, compat1_old, compat2, compat2_old, & comm_2d, unit_nr_prv TYPE(array_list) :: dist_list - INTEGER, DIMENSION(:), ALLOCATABLE :: mp_dims, dims + INTEGER, DIMENSION(:), ALLOCATABLE :: mp_dims TYPE(dbcsr_t_distribution_type) :: dist_in INTEGER(KIND=int_8) :: nblkrows, nblkcols LOGICAL :: optimize_dist_prv + INTEGER, DIMENSION(ndims_tensor(tensor1)) :: dims1 + INTEGER, DIMENSION(ndims_tensor(tensor2)) :: dims2 NULLIFY (tensor1_out, tensor2_out) unit_nr_prv = prep_output_unit(unit_nr) - IF (SIZE(ind1_free) .GE. SIZE(ind2_free)) THEN + CALL blk_dims_tensor(tensor1, dims1) + CALL blk_dims_tensor(tensor2, dims2) + + IF (PRODUCT(int(dims1, int_8)) .GE. PRODUCT(int(dims2, int_8))) THEN ref_tensor = 1 ELSE ref_tensor = 2 @@ -1120,10 +1292,8 @@ SUBROUTINE reshape_mm_compatible(tensor1, tensor2, tensor1_out, tensor2_out, ind IF (ref_tensor == 1) THEN ! tensor 1 is reference and tensor 2 is reshaped compatible with tensor 1 IF (compat1 == 0 .OR. optimize_dist_prv) THEN ! tensor 1 is not contraction compatible --> reshape IF (unit_nr_prv > 0) WRITE (unit_nr_prv, '(T2,A,1X,A)') "Redistribution of", TRIM(tensor1%name) - ALLOCATE (dims(ndims_tensor(tensor1))) - CALL blk_dims_tensor(tensor1, dims) - nblkrows = PRODUCT(INT(dims(ind1_linked), KIND=int_8)) - nblkcols = PRODUCT(INT(dims(ind1_free), KIND=int_8)) + nblkrows = PRODUCT(INT(dims1(ind1_linked), KIND=int_8)) + nblkcols = PRODUCT(INT(dims1(ind1_free), KIND=int_8)) comm_2d = dbcsr_tas_mp_comm(tensor1%pgrid%mp_comm_2d, nblkrows, nblkcols) ALLOCATE (tensor1_out) CALL dbcsr_t_remap(tensor1, ind1_linked, ind1_free, tensor1_out, comm_2d=comm_2d, & @@ -1166,10 +1336,8 @@ SUBROUTINE reshape_mm_compatible(tensor1, tensor2, tensor1_out, tensor2_out, ind ELSE ! tensor 2 is reference and tensor 1 is reshaped compatible with tensor 2 IF (compat2 == 0 .OR. optimize_dist_prv) THEN ! tensor 2 is not contraction compatible --> reshape IF (unit_nr_prv > 0) WRITE (unit_nr_prv, '(T2,A,1X,A)') "Redistribution of", TRIM(tensor2%name) - ALLOCATE (dims(ndims_tensor(tensor2))) - CALL blk_dims_tensor(tensor2, dims) - nblkrows = PRODUCT(INT(dims(ind2_linked), KIND=int_8)) - nblkcols = PRODUCT(INT(dims(ind2_free), KIND=int_8)) + nblkrows = PRODUCT(INT(dims2(ind2_linked), KIND=int_8)) + nblkcols = PRODUCT(INT(dims2(ind2_free), KIND=int_8)) comm_2d = dbcsr_tas_mp_comm(tensor2%pgrid%mp_comm_2d, nblkrows, nblkcols) ALLOCATE (tensor2_out) CALL dbcsr_t_remap(tensor2, ind2_linked, ind2_free, tensor2_out, nodata=nodata2, move_data=move_data_2) @@ -1306,10 +1474,7 @@ SUBROUTINE reshape_mm_small(tensor_in, ind1, ind2, tensor_out, trans, new, nodat ALLOCATE (tensor_out) CALL dbcsr_t_remap(tensor_in, ind1, ind2, tensor_out, nodata=nodata, move_data=move_data) - IF (tensor_in%matrix_rep%do_batched > 0) THEN - ALLOCATE (tensor_out%matrix_rep%mm_storage, SOURCE=tensor_in%matrix_rep%mm_storage) - tensor_out%matrix_rep%do_batched = tensor_in%matrix_rep%do_batched - ENDIF + CALL dbcsr_t_copy_contraction_storage(tensor_in, tensor_out) compat1 = 1 compat2 = 2 new = .TRUE. @@ -1341,6 +1506,52 @@ SUBROUTINE reshape_mm_small(tensor_in, ind1, ind2, tensor_out, trans, new, nodat END SUBROUTINE + FUNCTION update_contraction_storage(storage, split_opt, split) RESULT(do_change_pgrid) + !! update contraction storage that keeps track of process grids during a batched contraction + !! 
and decide if tensor process grid needs to be optimized + TYPE(dbcsr_t_contraction_storage), INTENT(INOUT) :: storage + TYPE(dbcsr_tas_split_info), INTENT(IN) :: split_opt + !! optimized TAS process grid + TYPE(dbcsr_tas_split_info), INTENT(IN) :: split + !! current TAS process grid + INTEGER, DIMENSION(2) :: pdims_opt, coor, pdims, pdims_sub + LOGICAL, DIMENSION(2) :: periods + LOGICAL, DIMENSION(2) :: do_change_pgrid + REAL(kind=real_8) :: change_criterion, pdims_ratio + INTEGER :: nsplit_opt, nsplit + + DBCSR_ASSERT(ALLOCATED(split_opt%ngroup_opt)) + nsplit_opt = split_opt%ngroup_opt + nsplit = split%ngroup + + CALL mp_environ(split_opt%mp_comm, 2, pdims_opt, coor, periods) + CALL mp_environ(split%mp_comm, 2, pdims, coor, periods) + + storage%ibatch = storage%ibatch + 1 + + storage%nsplit_avg = (storage%nsplit_avg*REAL(storage%ibatch - 1, real_8) + REAL(nsplit_opt, real_8)) & + /REAL(storage%ibatch, real_8) + + SELECT CASE (split_opt%split_rowcol) + CASE (rowsplit) + pdims_ratio = REAL(pdims(1), real_8)/pdims(2) + CASE (colsplit) + pdims_ratio = REAL(pdims(2), real_8)/pdims(1) + END SELECT + + do_change_pgrid(:) = .FALSE. + + ! check for process grid dimensions + CALL mp_environ(split%mp_comm_group, 2, pdims_sub, coor, periods) + change_criterion = MAXVAL(REAL(pdims_sub, real_8))/MINVAL(pdims_sub) + IF (change_criterion > default_pdims_accept_ratio**2) do_change_pgrid(1) = .TRUE. + + ! check for split factor + change_criterion = MAX(REAL(nsplit, real_8)/storage%nsplit_avg, REAL(storage%nsplit_avg, real_8)/nsplit) + IF (change_criterion > default_nsplit_accept_ratio) do_change_pgrid(2) = .TRUE. + + END FUNCTION + FUNCTION compat_map(nd_index, compat_ind) !! Check if 2d index is compatible with tensor index TYPE(nd_to_2d_mapping), INTENT(IN) :: nd_index @@ -1492,7 +1703,7 @@ SUBROUTINE dbcsr_t_remap(tensor_in, map1_2d, map2_2d, tensor_out, comm_2d, dist1 nodata_prv = .FALSE. ENDIF - IF (.NOT. nodata_prv) CALL dbcsr_t_copy(tensor_in, tensor_out, move_data=move_data) + IF (.NOT. nodata_prv) CALL dbcsr_t_copy_expert(tensor_in, tensor_out, move_data=move_data) CALL dbcsr_t_distribution_destroy(dist) CALL timestop(handle) @@ -1568,6 +1779,12 @@ SUBROUTINE dbcsr_t_permute_index(tensor_in, tensor_out, order) tensor_out%name = tensor_in%name tensor_out%valid = .TRUE. + IF (ALLOCATED(tensor_in%contraction_storage)) THEN + ALLOCATE (tensor_out%contraction_storage, SOURCE=tensor_in%contraction_storage) + CALL destroy_array_list(tensor_out%contraction_storage%batch_ranges) + CALL reorder_arrays(tensor_in%contraction_storage%batch_ranges, tensor_out%contraction_storage%batch_ranges, order) + ENDIF + CALL timestop(handle) END SUBROUTINE @@ -1785,17 +2002,82 @@ SUBROUTINE dbcsr_t_print_contraction_index(tensor_1, indchar1, tensor_2, indchar END SUBROUTINE - SUBROUTINE dbcsr_t_batched_contract_init(tensor) - !! initialize batched contraction for this tensor. This ensures that no unnecessary communication takes place - !! in a series of batched contractions (i.e. partial contraction with specification of bounds - !! in order to reduce memory), under the following conditions: - !! - the same tensor is used in multiple contraction calls and its data is not changed via calls - !! to routines other than dbcsr_t_contract - !! - if tensor is the result of a contraction, it must not be reused as input for another contraction - !! If tensor is the result of a contraction, its publicly accessible data will not be updated until - !! dbcsr_t_batched_contract_finalize has been called. 
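Editor's sketch (not part of the patch): the batched-contraction API documented around here is easiest to see as a call sequence. The fragment below assumes three already created and filled 2d tensors t1(i,j), t2(j,k), t3(i,k), contracted over j in batches over k; the names t1, t2, t3, batch_range_k, batch_start, batch_end, bounds_batch, nbatch and ibatch are illustrative only, and alpha/beta are assumed to be prepared elsewhere. Only dbcsr_t_batched_contract_init, dbcsr_t_contract and dbcsr_t_batched_contract_finalize are taken from the patch.

   TYPE(dbcsr_t_type)                 :: t1, t2, t3              ! created and filled elsewhere (not shown)
   TYPE(dbcsr_scalar_type)            :: alpha, beta             ! set elsewhere (not shown)
   INTEGER, DIMENSION(:), ALLOCATABLE :: batch_range_k           ! block index starting each batch, plus end+1
   INTEGER, DIMENSION(:), ALLOCATABLE :: batch_start, batch_end  ! per-batch index bounds along k (not shown)
   INTEGER, DIMENSION(2, 1)           :: bounds_batch
   INTEGER                            :: ibatch, nbatch

   ! k is dimension 2 of both t2 and t3; t1 is not batched
   CALL dbcsr_t_batched_contract_init(t1)
   CALL dbcsr_t_batched_contract_init(t2, batch_range_2=batch_range_k)
   CALL dbcsr_t_batched_contract_init(t3, batch_range_2=batch_range_k)
   DO ibatch = 1, nbatch
      bounds_batch(1, 1) = batch_start(ibatch)
      bounds_batch(2, 1) = batch_end(ibatch)
      ! bounds_3 restricts the non-contracted index of t2 (second index of t3) to one batch
      CALL dbcsr_t_contract(alpha, t1, t2, beta, t3, &
                            contract_1=[2], notcontract_1=[1], &
                            contract_2=[1], notcontract_2=[2], &
                            map_1=[1], map_2=[2], &
                            bounds_3=bounds_batch)
   ENDDO
   CALL dbcsr_t_batched_contract_finalize(t1)
   CALL dbcsr_t_batched_contract_finalize(t2)
   CALL dbcsr_t_batched_contract_finalize(t3)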
+ SUBROUTINE dbcsr_t_batched_contract_init(tensor, ${varlist("batch_range")}$) + !! Initialize batched contraction for this tensor. + !! + !! Explanation: A batched contraction is a contraction performed in several consecutive steps by + !! specification of bounds in dbcsr_t_contract. This can be used to reduce memory by a large factor. + !! The routines dbcsr_t_batched_contract_init and dbcsr_t_batched_contract_finalize should be + !! called to define the scope of a batched contraction, as this enables important optimizations + !! (adapting the communication scheme to the batches and the process grid to the multiplication algorithm). + !! The routines dbcsr_t_batched_contract_init and dbcsr_t_batched_contract_finalize must be called + !! before the first and after the last contraction step on all 3 tensors. + !! + !! Requirements: + !! - the tensors are in a compatible matrix layout (see documentation of `dbcsr_t_contract`, notes 2 & 3). + !! If they are not, process grid optimizations are disabled and a warning is issued. + !! - within the scope of a batched contraction, it is not allowed to access or change tensor data + !! except by calling the routines dbcsr_t_contract & dbcsr_t_copy. + !! - the bounds affecting indices of the smallest tensor must not change in the course of a batched + !! contraction (todo: get rid of this requirement). + !! + !! Side effects: + !! - the parallel layout (process grid and distribution) of all tensors may change. In order to + !! disable the process grid optimization including this side effect, call this routine only on the + !! smallest of the 3 tensors. + !! + !! @note + !! Note 1: for an example of batched contraction see `examples/dbcsr_tensor_example.F`. + !! (todo: the example is outdated and should be updated). + !! + !! Note 2: it is also meaningful to use this feature if the contraction consists of one batch only, + !! provided that multiple contractions involving the same 3 tensors are performed + !! (batched_contract_init and batched_contract_finalize must then be called before/after each + !! contraction call). The process grid is then optimized after the first contraction + !! and future contractions may profit from this optimization. + !! @endnote TYPE(dbcsr_t_type), INTENT(INOUT) :: tensor - CALL dbcsr_tas_batched_mm_init(tensor%matrix_rep) + INTEGER, DIMENSION(:), OPTIONAL, INTENT(IN) :: ${varlist("batch_range")}$ + !! For internal load balancing optimizations, optionally specify the index ranges of + !! batched contraction. + !! batch_range_i refers to the ith tensor dimension and contains all block indices starting + !! a new range. The size should be the number of ranges plus one, the last element being the + !! block index plus one of the last block in the last range. + INTEGER, DIMENSION(ndims_tensor(tensor)) :: tdims + INTEGER, DIMENSION(:), ALLOCATABLE :: ${varlist("batch_range_prv")}$ + LOGICAL :: static_range + + CALL dbcsr_t_get_info(tensor, nblks_total=tdims) + + static_range = .TRUE. +#:for idim in range(1, maxdim+1) + IF (ndims_tensor(tensor) >= ${idim}$) THEN + IF (PRESENT(batch_range_${idim}$)) THEN + ALLOCATE (batch_range_prv_${idim}$, SOURCE=batch_range_${idim}$) + static_range = .FALSE.
+ ELSE + ALLOCATE (batch_range_prv_${idim}$ (2)) + batch_range_prv_${idim}$ (1) = 1 + batch_range_prv_${idim}$ (2) = tdims(${idim}$) + 1 + ENDIF + ENDIF +#:endfor + + ALLOCATE (tensor%contraction_storage) + tensor%contraction_storage%static = static_range + IF (static_range) THEN + CALL dbcsr_tas_batched_mm_init(tensor%matrix_rep) + ENDIF + tensor%contraction_storage%nsplit_avg = 0.0_real_8 + tensor%contraction_storage%ibatch = 0 + +#:for ndim in range(1, maxdim+1) + IF (ndims_tensor(tensor) == ${ndim}$) THEN + CALL create_array_list(tensor%contraction_storage%batch_ranges, ${ndim}$, & + ${varlist("batch_range_prv", nmax=ndim)}$) + ENDIF +#:endfor + END SUBROUTINE SUBROUTINE dbcsr_t_batched_contract_finalize(tensor, unit_nr) @@ -1804,16 +2086,20 @@ SUBROUTINE dbcsr_t_batched_contract_finalize(tensor, unit_nr) TYPE(dbcsr_t_type), INTENT(INOUT) :: tensor INTEGER, INTENT(IN), OPTIONAL :: unit_nr LOGICAL :: do_write - INTEGER :: unit_nr_prv + INTEGER :: unit_nr_prv, handle + CALL mp_sync(tensor%pgrid%mp_comm_2d) + CALL timeset("dbcsr_t_total", handle) unit_nr_prv = prep_output_unit(unit_nr) do_write = .FALSE. - IF (tensor%matrix_rep%do_batched > 0) THEN - IF (tensor%matrix_rep%mm_storage%batched_out) do_write = .TRUE. - ENDIF - CALL dbcsr_tas_batched_mm_finalize(tensor%matrix_rep) + IF (tensor%contraction_storage%static) THEN + IF (tensor%matrix_rep%do_batched > 0) THEN + IF (tensor%matrix_rep%mm_storage%batched_out) do_write = .TRUE. + ENDIF + CALL dbcsr_tas_batched_mm_finalize(tensor%matrix_rep) + ENDIF IF (do_write .AND. unit_nr_prv /= 0) THEN IF (unit_nr_prv > 0) THEN @@ -1824,6 +2110,221 @@ SUBROUTINE dbcsr_t_batched_contract_finalize(tensor, unit_nr) CALL dbcsr_t_write_tensor_dist(tensor, unit_nr_prv) ENDIF + CALL destroy_array_list(tensor%contraction_storage%batch_ranges) + DEALLOCATE (tensor%contraction_storage) + CALL mp_sync(tensor%pgrid%mp_comm_2d) + CALL timestop(handle) + + END SUBROUTINE + + SUBROUTINE dbcsr_t_change_pgrid(tensor, pgrid, ${varlist("batch_range")}$, & + nodata, pgrid_changed, unit_nr) + !! change the process grid of a tensor + TYPE(dbcsr_t_type), INTENT(INOUT) :: tensor + TYPE(dbcsr_t_pgrid_type), INTENT(IN) :: pgrid + INTEGER, DIMENSION(:), OPTIONAL, INTENT(IN) :: ${varlist("batch_range")}$ + !! For internal load balancing optimizations, optionally specify the index ranges of + !! batched contraction. + !! batch_range_i refers to the ith tensor dimension and contains all block indices starting + !! a new range. The size should be the number of ranges plus one, the last element being the + !! block index plus one of the last block in the last range. + LOGICAL, INTENT(IN), OPTIONAL :: nodata + !! optionally don't copy the tensor data (then the tensor is empty on return) + LOGICAL, INTENT(OUT), OPTIONAL :: pgrid_changed + INTEGER, INTENT(IN), OPTIONAL :: unit_nr + CHARACTER(LEN=*), PARAMETER :: routineN = 'dbcsr_t_change_pgrid' + CHARACTER(default_string_length) :: name + INTEGER :: handle + INTEGER, ALLOCATABLE, DIMENSION(:) :: ${varlist("bs")}$, & + ${varlist("dist")}$ + INTEGER, DIMENSION(ndims_tensor(tensor)) :: pcoord, pcoord_ref, pdims, pdims_ref, & + tdims + TYPE(dbcsr_t_type) :: t_tmp + TYPE(dbcsr_t_distribution_type) :: dist + INTEGER, DIMENSION(ndims_matrix_row(tensor)) :: map1 + INTEGER, & + DIMENSION(ndims_matrix_column(tensor)) :: map2 + LOGICAL, DIMENSION(ndims_tensor(tensor)) :: mem_aware + INTEGER, DIMENSION(ndims_tensor(tensor)) :: nbatch + INTEGER :: ind1, ind2, batch_size, ibatch + + IF (PRESENT(pgrid_changed)) pgrid_changed = .FALSE.
+ CALL mp_environ_pgrid(pgrid, pdims, pcoord) + CALL mp_environ_pgrid(tensor%pgrid, pdims_ref, pcoord_ref) + + IF (ALL(pdims == pdims_ref)) THEN + IF (ALLOCATED(pgrid%tas_split_info) .AND. ALLOCATED(tensor%pgrid%tas_split_info)) THEN + IF (pgrid%tas_split_info%ngroup == tensor%pgrid%tas_split_info%ngroup) THEN + RETURN + ENDIF + ENDIF + ENDIF + + CALL timeset(routineN, handle) + +#:for idim in range(1, maxdim+1) + IF (ndims_tensor(tensor) >= ${idim}$) THEN + mem_aware(${idim}$) = PRESENT(batch_range_${idim}$) + IF (mem_aware(${idim}$)) nbatch(${idim}$) = SIZE(batch_range_${idim}$) - 1 + ENDIF +#:endfor + + CALL dbcsr_t_get_info(tensor, nblks_total=tdims, name=name) + +#:for idim in range(1, maxdim+1) + IF (ndims_tensor(tensor) >= ${idim}$) THEN + ALLOCATE (bs_${idim}$ (dbcsr_t_nblks_total(tensor, ${idim}$))) + CALL get_ith_array(tensor%blk_sizes, ${idim}$, bs_${idim}$) + ALLOCATE (dist_${idim}$ (tdims(${idim}$))) + dist_${idim}$ = 0 + IF (mem_aware(${idim}$)) THEN + DO ibatch = 1, nbatch(${idim}$) + ind1 = batch_range_${idim}$ (ibatch) + ind2 = batch_range_${idim}$ (ibatch + 1) - 1 + batch_size = ind2 - ind1 + 1 + CALL dbcsr_t_default_distvec(batch_size, pdims(${idim}$), & + bs_${idim}$ (ind1:ind2), dist_${idim}$ (ind1:ind2)) + ENDDO + ELSE + CALL dbcsr_t_default_distvec(tdims(${idim}$), pdims(${idim}$), bs_${idim}$, dist_${idim}$) + ENDIF + ENDIF +#:endfor + + CALL dbcsr_t_get_mapping_info(tensor%nd_index_blk, map1_2d=map1, map2_2d=map2) +#:for ndim in ndims + IF (ndims_tensor(tensor) == ${ndim}$) THEN + CALL dbcsr_t_distribution_new(dist, pgrid, ${varlist("dist", nmax=ndim)}$) + CALL dbcsr_t_create(t_tmp, name, dist, map1, map2, dbcsr_type_real_8, ${varlist("bs", nmax=ndim)}$) + ENDIF +#:endfor + CALL dbcsr_t_distribution_destroy(dist) + + IF (PRESENT(nodata)) THEN + IF (.NOT. nodata) CALL dbcsr_t_copy_expert(tensor, t_tmp, move_data=.TRUE.) + ELSE + CALL dbcsr_t_copy_expert(tensor, t_tmp, move_data=.TRUE.) + ENDIF + + CALL dbcsr_t_copy_contraction_storage(tensor, t_tmp) + + CALL dbcsr_t_destroy(tensor) + tensor = t_tmp + + IF (PRESENT(unit_nr)) THEN + IF (unit_nr > 0) THEN + WRITE (unit_nr, "(T2,A,1X,A)") "OPTIMIZED PGRID INFO FOR", TRIM(tensor%name) + WRITE (unit_nr, "(T4,A,1X,3I6)") "process grid dimensions:", pdims + CALL dbcsr_t_write_split_info(pgrid, unit_nr) + ENDIF + ENDIF + + IF (PRESENT(pgrid_changed)) pgrid_changed = .TRUE. + + CALL timestop(handle) + END SUBROUTINE + + SUBROUTINE dbcsr_t_change_pgrid_2d(tensor, mp_comm, pdims, nodata, nsplit, dimsplit, pgrid_changed, unit_nr) + !! map tensor to a new 2d process grid for the matrix representation. 
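Editor's note (not part of the patch), clarifying the batch_range_i convention used by dbcsr_t_batched_contract_init and dbcsr_t_change_pgrid above: each array lists the first block index of every range plus one final "one past the last block" entry, so its size is the number of ranges plus one. For example, a dimension with 10 blocks split into the three batches 1-3, 4-6 and 7-10 would be described by

   batch_range_k = [1, 4, 7, 11]   ! hypothetical name; 3 ranges, SIZE(batch_range_k) = 4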
+      TYPE(dbcsr_t_type), INTENT(INOUT) :: tensor
+      INTEGER, INTENT(IN) :: mp_comm
+      INTEGER, DIMENSION(2), INTENT(IN), OPTIONAL :: pdims
+      LOGICAL, INTENT(IN), OPTIONAL :: nodata
+      INTEGER, INTENT(IN), OPTIONAL :: nsplit, dimsplit
+      LOGICAL, INTENT(OUT), OPTIONAL :: pgrid_changed
+      INTEGER, INTENT(IN), OPTIONAL :: unit_nr
+      INTEGER, DIMENSION(ndims_matrix_row(tensor)) :: map1
+      INTEGER, DIMENSION(ndims_matrix_column(tensor)) :: map2
+      INTEGER, DIMENSION(ndims_tensor(tensor)) :: dims, nbatches
+      TYPE(dbcsr_t_pgrid_type) :: pgrid
+      INTEGER, DIMENSION(:), ALLOCATABLE :: ${varlist("batch_range")}$
+      INTEGER, DIMENSION(:), ALLOCATABLE :: array
+      INTEGER :: idim
+
+      CALL dbcsr_t_get_mapping_info(tensor%pgrid%nd_index_grid, map1_2d=map1, map2_2d=map2)
+      CALL blk_dims_tensor(tensor, dims)
+
+      IF (ALLOCATED(tensor%contraction_storage)) THEN
+         ASSOCIATE (batch_ranges => tensor%contraction_storage%batch_ranges)
+            nbatches = sizes_of_arrays(tensor%contraction_storage%batch_ranges) - 1
+            ! for good load balancing, the process grid dimensions should be adapted to the
+            ! tensor dimensions. For batched contraction the tensor dimensions should be divided by
+            ! the number of batches (number of index ranges).
+            DO idim = 1, ndims_tensor(tensor)
+               CALL get_ith_array(tensor%contraction_storage%batch_ranges, idim, array)
+               dims(idim) = array(nbatches(idim) + 1) - array(1)
+               DEALLOCATE (array)
+               dims(idim) = dims(idim)/nbatches(idim)
+               IF (dims(idim) <= 0) dims(idim) = 1
+            ENDDO
+         END ASSOCIATE
+      ENDIF
+
+      pgrid = dbcsr_t_nd_mp_comm(mp_comm, map1, map2, pdims_2d=pdims, tdims=dims, nsplit=nsplit, dimsplit=dimsplit)
+      IF (ALLOCATED(tensor%contraction_storage)) THEN
+#:for ndim in range(1, maxdim+1)
+         IF (ndims_tensor(tensor) == ${ndim}$) THEN
+            CALL get_arrays(tensor%contraction_storage%batch_ranges, ${varlist("batch_range", nmax=ndim)}$)
+            CALL dbcsr_t_change_pgrid(tensor, pgrid, ${varlist("batch_range", nmax=ndim)}$, &
+                                      nodata=nodata, pgrid_changed=pgrid_changed, unit_nr=unit_nr)
+         ENDIF
+#:endfor
+      ELSE
+         CALL dbcsr_t_change_pgrid(tensor, pgrid, nodata=nodata, pgrid_changed=pgrid_changed, unit_nr=unit_nr)
+      ENDIF
+      CALL dbcsr_t_pgrid_destroy(pgrid)
+
+   END SUBROUTINE
+
+   SUBROUTINE pdims_realtoint(nproc, pdims_real, pdims_int, pdims_min, pdims_max)
+      INTEGER, INTENT(IN) :: nproc
+      REAL(real_8), DIMENSION(2), INTENT(IN) :: pdims_real
+      INTEGER, DIMENSION(2), INTENT(OUT) :: pdims_int, pdims_min, pdims_max
+      INTEGER, DIMENSION(2) :: ind
+
+      IF (pdims_real(1) > pdims_real(2)) THEN
+         ind = [2, 1]
+      ELSE
+         ind = [1, 2]
+      ENDIF
+
+      CALL nearest_divisor(nproc, pdims_real(ind(1)), pdims_min(1), pdims_max(1), pdims_int(1))
+      pdims_int(2) = nproc/pdims_int(1)
+      DBCSR_ASSERT(pdims_int(1)*pdims_int(2) == nproc)
+      pdims_min(2) = nproc/pdims_min(1)
+      DBCSR_ASSERT(pdims_min(1)*pdims_min(2) == nproc)
+      pdims_max(2) = nproc/pdims_max(1)
+      DBCSR_ASSERT(pdims_max(1)*pdims_max(2) == nproc)
+
+      pdims_int(ind) = pdims_int
+      pdims_min(ind) = pdims_min
+      pdims_max(ind) = pdims_max
+   END SUBROUTINE
+
+   SUBROUTINE nearest_divisor(n, div_approx, div_min, div_max, div_near)
+      INTEGER, INTENT(IN) :: n
+      REAL(real_8), INTENT(IN) :: div_approx
+      INTEGER :: div_min, div_max, div_near
+
+      div_min = FLOOR(div_approx)
+      div_min = MAX(1, div_min)
+      div_max = div_min + 1
+      div_max = MIN(n, div_max)
+
+      DO WHILE (MOD(n, div_min) .NE. 0)
+         div_min = div_min - 1
+      ENDDO
+
+      DO WHILE (MOD(n, div_max) .NE.
0) + div_max = div_max + 1 + ENDDO + + IF (div_approx/REAL(div_min, real_8) < REAL(div_max, real_8)/div_approx) THEN + div_near = div_min + ELSE + div_near = div_max + ENDIF + END SUBROUTINE END MODULE diff --git a/src/tensors/dbcsr_tensor.h b/src/tensors/dbcsr_tensor.h index 5a378bc0cad..cd67a361a28 100644 --- a/src/tensors/dbcsr_tensor.h +++ b/src/tensors/dbcsr_tensor.h @@ -17,7 +17,7 @@ #:set ndims = range(2,maxrank+1) #:set ddims = range(1,maxrank+1) -#ifdef __cplusplus +#if defined(__cplusplus) extern "C" { #endif @@ -194,7 +194,7 @@ extern "C" { long long int c_dbcsr_t_max_nblks_local(const void* c_tensor); -#ifdef __cplusplus +#if defined(__cplusplus) } #endif diff --git a/src/tensors/dbcsr_tensor_split.F b/src/tensors/dbcsr_tensor_split.F index d40799de5db..051458f80d6 100644 --- a/src/tensors/dbcsr_tensor_split.F +++ b/src/tensors/dbcsr_tensor_split.F @@ -42,7 +42,8 @@ MODULE dbcsr_tensor_split dbcsr_t_blk_sizes, & ndims_matrix_row, & ndims_matrix_column, & - dbcsr_t_filter + dbcsr_t_filter, & + dbcsr_t_copy_contraction_storage USE dbcsr_api, ONLY: ${uselist(dtype_float_param)}$ USE dbcsr_kinds, ONLY: ${uselist(dtype_float_prec)}$, dp @@ -690,10 +691,7 @@ SUBROUTINE dbcsr_t_crop(tensor_in, tensor_out, bounds, move_data) IF (move_data_prv) CALL dbcsr_t_clear(tensor_in) ! transfer data for batched contraction - IF (tensor_in%matrix_rep%do_batched > 0) THEN - ALLOCATE (tensor_out%matrix_rep%mm_storage, SOURCE=tensor_in%matrix_rep%mm_storage) - tensor_out%matrix_rep%do_batched = tensor_in%matrix_rep%do_batched - ENDIF + CALL dbcsr_t_copy_contraction_storage(tensor_in, tensor_out) END SUBROUTINE diff --git a/src/tensors/dbcsr_tensor_types.F b/src/tensors/dbcsr_tensor_types.F index 3ce31c4511e..30339e568a2 100644 --- a/src/tensors/dbcsr_tensor_types.F +++ b/src/tensors/dbcsr_tensor_types.F @@ -31,14 +31,15 @@ MODULE dbcsr_tensor_types dbcsr_tas_get_num_blocks, dbcsr_tas_get_num_blocks_total, dbcsr_tas_get_data_size, dbcsr_tas_get_nze, & dbcsr_tas_get_nze_total, dbcsr_tas_clear, dbcsr_tas_get_data_type USE dbcsr_tas_types, ONLY: & - dbcsr_tas_type, dbcsr_tas_distribution_type, dbcsr_tas_split_info + dbcsr_tas_type, dbcsr_tas_distribution_type, dbcsr_tas_split_info, dbcsr_tas_mm_storage + USE dbcsr_tas_mm, ONLY: dbcsr_tas_set_batched_state USE dbcsr_tensor_index, ONLY: & get_2d_indices_tensor, get_nd_indices_pgrid, create_nd_to_2d_mapping, destroy_nd_to_2d_mapping, & dbcsr_t_get_mapping_info, nd_to_2d_mapping, split_tensor_index, combine_tensor_index, combine_pgrid_index, & split_pgrid_index, ndims_mapping, ndims_mapping_row, ndims_mapping_column USE dbcsr_tas_split, ONLY: & dbcsr_tas_create_split_rows_or_cols, dbcsr_tas_release_info, dbcsr_tas_info_hold, & - dbcsr_tas_create_split, dbcsr_tas_get_split_info + dbcsr_tas_create_split, dbcsr_tas_get_split_info, dbcsr_tas_set_strict_split USE dbcsr_kinds, ONLY: default_string_length, int_8, dp USE dbcsr_mpiwrap, ONLY: & mp_cart_create, mp_cart_rank, mp_environ, mp_dims_create, mp_comm_free, mp_comm_dup, mp_sum, mp_max @@ -84,6 +85,7 @@ MODULE dbcsr_tensor_types dbcsr_t_pgrid_create_expert, & dbcsr_t_pgrid_destroy, & dbcsr_t_pgrid_type, & + dbcsr_t_pgrid_set_strict_split, & dbcsr_t_scale, & dbcsr_t_set, & dbcsr_t_type, & @@ -96,7 +98,9 @@ MODULE dbcsr_tensor_types dbcsr_t_nblks_total, & dbcsr_t_blk_size, & dbcsr_t_max_nblks_local, & - dbcsr_t_default_distvec + dbcsr_t_default_distvec, & + dbcsr_t_contraction_storage, & + dbcsr_t_copy_contraction_storage TYPE dbcsr_t_pgrid_type TYPE(nd_to_2d_mapping) :: nd_index_grid @@ -105,6 +109,13 @@ MODULE 
dbcsr_tensor_types INTEGER :: nproc END TYPE + TYPE dbcsr_t_contraction_storage + REAL(real_8) :: nsplit_avg + INTEGER :: ibatch + TYPE(array_list) :: batch_ranges + LOGICAL :: static + END TYPE + TYPE dbcsr_t_type TYPE(dbcsr_tas_type), POINTER :: matrix_rep => NULL() TYPE(nd_to_2d_mapping) :: nd_index_blk @@ -121,6 +132,7 @@ MODULE dbcsr_tensor_types CHARACTER(LEN=default_string_length) :: name ! lightweight reference counting for communicators: INTEGER, POINTER :: refcount => NULL() + TYPE(dbcsr_t_contraction_storage), ALLOCATABLE :: contraction_storage END TYPE dbcsr_t_type TYPE dbcsr_t_distribution_type @@ -359,7 +371,8 @@ FUNCTION tas_blk_size_t(t, rowcol) END FUNCTION - FUNCTION dbcsr_t_nd_mp_comm(comm_2d, map1_2d, map2_2d, dims_nd, dims1_nd, dims2_nd, pdims_2d, tdims) + FUNCTION dbcsr_t_nd_mp_comm(comm_2d, map1_2d, map2_2d, dims_nd, dims1_nd, dims2_nd, pdims_2d, tdims, & + nsplit, dimsplit) !! Create a default nd process topology that is consistent with a given 2d topology. !! Purpose: a nd tensor defined on the returned process grid can be represented as a DBCSR !! matrix with the given 2d topology. @@ -383,6 +396,7 @@ FUNCTION dbcsr_t_nd_mp_comm(comm_2d, map1_2d, map2_2d, dims_nd, dims1_nd, dims2_ !! tensor block dimensions. If present, process grid dimensions are created such that good !! load balancing is ensured even if some of the tensor dimensions are small (i.e. on the same order !! or smaller than nproc**(1/ndim) where ndim is the tensor rank) + INTEGER, INTENT(IN), OPTIONAL :: nsplit, dimsplit INTEGER :: ndim1, ndim2 INTEGER :: numtask INTEGER, DIMENSION(2) :: dims_2d, task_coor @@ -436,7 +450,7 @@ FUNCTION dbcsr_t_nd_mp_comm(comm_2d, map1_2d, map2_2d, dims_nd, dims1_nd, dims2_ ENDIF CALL dbcsr_t_pgrid_create_expert(comm_2d, dims_nd_prv, dbcsr_t_nd_mp_comm, & - tensor_dims=tdims, map1_2d=map1_2d, map2_2d=map2_2d) + tensor_dims=tdims, map1_2d=map1_2d, map2_2d=map2_2d, nsplit=nsplit, dimsplit=dimsplit) CALL timestop(handle) @@ -643,6 +657,8 @@ SUBROUTINE dbcsr_t_distribution_new_expert(dist, pgrid, map1_2d, map2_2d, ${varl CALL dbcsr_tas_distribution_new(dist%dist, comm_2d, row_dist_obj, col_dist_obj, split_info=pgrid_prv%tas_split_info) ELSE CALL dbcsr_tas_distribution_new(dist%dist, comm_2d, row_dist_obj, col_dist_obj) + ALLOCATE (pgrid_prv%tas_split_info, SOURCE=dist%dist%info) + CALL dbcsr_tas_info_hold(pgrid_prv%tas_split_info) ENDIF dist%nd_dist = nd_dist @@ -1241,6 +1257,12 @@ SUBROUTINE dbcsr_t_pgrid_destroy(pgrid, keep_comm) ENDIF END SUBROUTINE + SUBROUTINE dbcsr_t_pgrid_set_strict_split(pgrid) + !! freeze current split factor such that it is never changed during contraction + TYPE(dbcsr_t_pgrid_type), INTENT(INOUT) :: pgrid + IF (ALLOCATED(pgrid%tas_split_info)) CALL dbcsr_tas_set_strict_split(pgrid%tas_split_info) + END SUBROUTINE + SUBROUTINE dbcsr_t_pgrid_remap(pgrid_in, map1_2d, map2_2d, pgrid_out) !! remap a process grid (needed when mapping between tensor and matrix index is changed) @@ -1576,4 +1598,26 @@ SUBROUTINE dbcsr_t_default_distvec(nblk, nproc, blk_size, dist) END SUBROUTINE + SUBROUTINE dbcsr_t_copy_contraction_storage(tensor_in, tensor_out) + TYPE(dbcsr_t_type), INTENT(IN) :: tensor_in + TYPE(dbcsr_t_type), INTENT(INOUT) :: tensor_out + TYPE(dbcsr_t_contraction_storage), ALLOCATABLE :: tensor_storage_tmp + TYPE(dbcsr_tas_mm_storage), ALLOCATABLE :: tas_storage_tmp + + IF (tensor_in%matrix_rep%do_batched > 0) THEN + ALLOCATE (tas_storage_tmp, SOURCE=tensor_in%matrix_rep%mm_storage) + ! 
transfer data for batched contraction + IF (ALLOCATED(tensor_out%matrix_rep%mm_storage)) DEALLOCATE (tensor_out%matrix_rep%mm_storage) + CALL move_alloc(tas_storage_tmp, tensor_out%matrix_rep%mm_storage) + ENDIF + CALL dbcsr_tas_set_batched_state(tensor_out%matrix_rep, state=tensor_in%matrix_rep%do_batched, & + opt_grid=tensor_in%matrix_rep%has_opt_pgrid) + IF (ALLOCATED(tensor_in%contraction_storage)) THEN + ALLOCATE (tensor_storage_tmp, SOURCE=tensor_in%contraction_storage) + ENDIF + IF (ALLOCATED(tensor_out%contraction_storage)) DEALLOCATE (tensor_out%contraction_storage) + IF (ALLOCATED(tensor_storage_tmp)) CALL move_alloc(tensor_storage_tmp, tensor_out%contraction_storage) + + END SUBROUTINE + END MODULE
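
The batch_range_i arguments documented in dbcsr_t_change_pgrid encode contiguous block-index ranges as an array of length nbatch + 1: each entry is the first block index of a range, and the last entry is the final block index plus one. A minimal, self-contained sketch of that encoding (no DBCSR calls; the names and numbers are illustrative only), mirroring the ind1/ind2 loop in dbcsr_t_change_pgrid:

```fortran
! Standalone sketch: how a batch_range array of size nbatch+1 encodes
! contiguous block-index ranges. All names and values are illustrative.
PROGRAM batch_range_demo
   IMPLICIT NONE
   INTEGER, PARAMETER :: nbatch = 3
   ! 15 blocks split into 3 ranges: [1,5], [6,10], [11,15];
   ! the last entry is "last block index + 1".
   INTEGER, PARAMETER :: batch_range(nbatch + 1) = [1, 6, 11, 16]
   INTEGER :: ibatch, ind1, ind2, batch_size

   DO ibatch = 1, nbatch
      ind1 = batch_range(ibatch)             ! first block of this range
      ind2 = batch_range(ibatch + 1) - 1     ! last block of this range
      batch_size = ind2 - ind1 + 1
      WRITE (*, '(A,I0,A,I0,A,I0,A,I0,A)') &
         "batch ", ibatch, ": blocks ", ind1, "..", ind2, " (", batch_size, " blocks)"
   ENDDO
END PROGRAM batch_range_demo
```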
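pdims_realtoint and nearest_divisor turn a real-valued "ideal" 2D grid shape into integer factors of nproc: the target for one dimension is bracketed between the closest divisors of nproc below and above it, and the candidate with the smaller relative deviation wins. A standalone sketch of the same bracketing logic (illustrative only, not the library routine; nproc and the target value are made up):

```fortran
! Standalone sketch of the divisor-bracketing idea: given nproc and a real
! target for one grid dimension, find the nearest divisors below and above,
! then keep the one whose ratio to the target is smaller.
PROGRAM nearest_divisor_demo
   IMPLICIT NONE
   INTEGER, PARAMETER :: dp = SELECTED_REAL_KIND(14, 200)
   INTEGER, PARAMETER :: nproc = 48
   REAL(dp), PARAMETER :: div_approx = 6.9_dp
   INTEGER :: div_min, div_max, div_near

   div_min = MAX(1, FLOOR(div_approx))
   div_max = MIN(nproc, div_min + 1)
   DO WHILE (MOD(nproc, div_min) /= 0)   ! walk down to a divisor
      div_min = div_min - 1
   ENDDO
   DO WHILE (MOD(nproc, div_max) /= 0)   ! walk up to a divisor
      div_max = div_max + 1
   ENDDO
   IF (div_approx/REAL(div_min, dp) < REAL(div_max, dp)/div_approx) THEN
      div_near = div_min
   ELSE
      div_near = div_max
   ENDIF
   ! e.g. nproc=48, target 6.9 -> candidates 6 and 8, chosen 6, grid 6 x 8
   WRITE (*, '(A,I0,A,I0,A,I0,A,I0)') "candidates: ", div_min, " and ", div_max, &
      "; chosen: ", div_near, " x ", nproc/div_near
END PROGRAM nearest_divisor_demo
```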
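dbcsr_t_copy_contraction_storage replaces an allocatable component by first cloning the source into a temporary via sourced allocation and only then MOVE_ALLOCing the temporary into the destination, so the clone is complete before the destination is deallocated. A minimal standalone sketch of that pattern with a toy type (illustrative; not the DBCSR derived type):

```fortran
! Standalone sketch of the clone-then-move pattern: sourced allocation makes a
! deep copy of a derived type with allocatable components, MOVE_ALLOC installs
! it into the destination without a second copy. Toy type only.
PROGRAM copy_storage_demo
   IMPLICIT NONE
   TYPE :: storage_t
      INTEGER, ALLOCATABLE :: batch_ranges(:)
      LOGICAL :: static = .FALSE.
   END TYPE storage_t
   TYPE(storage_t), ALLOCATABLE :: src, dst, tmp

   ALLOCATE (src)
   src%batch_ranges = [1, 6, 11, 16]
   src%static = .TRUE.

   ! clone src first, so the copy exists before dst is touched
   IF (ALLOCATED(src)) ALLOCATE (tmp, SOURCE=src)
   IF (ALLOCATED(dst)) DEALLOCATE (dst)
   IF (ALLOCATED(tmp)) CALL MOVE_ALLOC(tmp, dst)

   WRITE (*, '(A,4I4,L3)') "dst: ", dst%batch_ranges, dst%static
END PROGRAM copy_storage_demo
```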