From a2798b2a38e5a4e5fe0a9dce1682528e597c2e21 Mon Sep 17 00:00:00 2001 From: dzzz2001 <153698752+dzzz2001@users.noreply.github.com> Date: Sun, 12 May 2024 15:37:47 +0800 Subject: [PATCH] Feature: compute module_gint by GPU (#4109) * add CUDA code for module_gint and fix bug about cusolver * add comments for code_gen.cpp * add integrated test cases for CUDA gint and cusolver * add some gint test cases * modify cuda tests * modify the location of the CUDA test folder * modify some GPU test cases * modify some STRU and INPUT files for GPU test cases to reduce testing time * format INPUT file in GPU test cases * format some code * add curly brackets to if and for statements * add const to some function arguments * fix comments * fix error in gint force * remove const in gint_rho * Update input_conv_test.cpp modify input test about GPU * remove time.sh in tests/integrate * modify INPUT file in GPU test cases * remove inappropriate changes in tests/performance. * remove USE_CUSOLVER_LCAO flag * enable check in non-debug enviroment * Use abbreviations instead of unreasonable naming * fix error in debug * modify the doc about ks_solver * change the default ks_solver to cusolver * modify cuda.md --------- Co-authored-by: A-006 <3158793232@qq.com> Co-authored-by: Mohan Chen --- CMakeLists.txt | 49 +- docs/advanced/acceleration/cuda.md | 18 +- docs/advanced/input_files/input-main.md | 13 +- examples/gpu/si16_lcao/INPUT | 29 + examples/gpu/si16_lcao/KPT | 4 + examples/gpu/si16_lcao/STRU | 37 + source/module_base/global_variable.cpp | 1 + source/module_base/global_variable.h | 3 + source/module_base/scalapack_connector.h | 12 +- source/module_basis/module_ao/ORB_control.cpp | 9 +- .../hamilt_lcaodft/local_orbital_wfc.cpp | 2 +- .../module_deepks/test/CMakeLists.txt | 2 +- .../module_gint/CMakeLists.txt | 22 + .../module_hamilt_lcao/module_gint/gint.cpp | 393 +- .../module_gint/gint_force.h | 264 + .../module_gint/gint_force_gpu.cu | 248 + 
.../module_hamilt_lcao/module_gint/gint_rho.h | 114 + .../module_gint/gint_rho_gpu.cu | 381 ++ .../module_hamilt_lcao/module_gint/gint_vl.h | 56 + .../module_gint/gint_vl_gpu.cu | 362 ++ .../module_gint/grid_technique.cpp | 1079 ++-- .../module_gint/grid_technique.h | 290 +- .../module_gint/gtask_force.cpp | 263 + .../module_gint/gtask_rho.cpp | 219 + .../module_gint/gtask_vl.cpp | 201 + .../module_gint/kernels/cuda/code_gen.cpp | 4448 +++++++++++++++++ .../module_gint/kernels/cuda/cuda_tools.cu | 213 + .../module_gint/kernels/cuda/cuda_tools.cuh | 78 + .../module_gint/kernels/cuda/gint_force.cu | 620 +++ .../module_gint/kernels/cuda/gint_force.cuh | 120 + .../module_gint/kernels/cuda/gint_rho.cu | 84 + .../module_gint/kernels/cuda/gint_rho.cuh | 66 + .../module_gint/kernels/cuda/gint_vl.cu | 68 + .../module_gint/kernels/cuda/gint_vl.cuh | 37 + .../module_gint/kernels/cuda/interp.cuh | 144 + .../module_gint/kernels/cuda/sph.cuh | 520 ++ .../kernels/cuda/vbatch_matrix_mul.cu | 659 +++ .../kernels/cuda/vbatch_matrix_mul.cuh | 115 + .../module_gint/test/CMakeLists.txt | 7 + .../module_gint/test/test_sph.cpp | 600 +++ .../module_gint/test/test_sph.cu | 138 + .../module_gint/test/test_sph.h | 19 + source/module_hsolver/CMakeLists.txt | 2 +- source/module_hsolver/diago_cusolver.cpp | 192 +- source/module_hsolver/diago_cusolver.h | 5 + source/module_hsolver/hsolver_lcao.cpp | 6 +- .../kernels/cuda/diag_cusolver.cu | 2 +- .../kernels/cuda/diag_cusolver.cuh | 11 + source/module_hsolver/test/CMakeLists.txt | 2 +- .../test/diago_lcao_cusolver_test.cpp | 8 +- source/module_io/input.cpp | 14 + source/module_io/input.h | 2 +- source/module_io/input_conv.cpp | 7 +- source/module_io/test/input_conv_test.cpp | 10 +- source/module_io/test/input_test_para.cpp | 1 + source/module_io/test/write_input_test.cpp | 1 + source/module_io/write_input.cpp | 1 + source/module_psi/kernels/device.cpp | 60 +- source/module_psi/kernels/device.h | 2 +- tests/PP_ORB/H_gga_8au_100Ry_1s.orb | 621 +++ 
tests/PP_ORB/Si_gga_8au_100Ry_3s3p2d.orb | 1637 ++++++ tests/integrate/930_NO_BI2SE2CU2O2_GPU/INPUT | 26 + tests/integrate/930_NO_BI2SE2CU2O2_GPU/KPT | 4 + tests/integrate/930_NO_BI2SE2CU2O2_GPU/STRU | 47 + .../930_NO_BI2SE2CU2O2_GPU/result.ref | 8 + tests/integrate/931_NO_H20_GPU/INPUT | 29 + tests/integrate/931_NO_H20_GPU/KPT | 4 + tests/integrate/931_NO_H20_GPU/STRU | 29 + tests/integrate/931_NO_H20_GPU/result.ref | 5 + tests/integrate/932_NO_H2_dzp_GPU/INPUT | 27 + tests/integrate/932_NO_H2_dzp_GPU/KPT | 4 + tests/integrate/932_NO_H2_dzp_GPU/STRU | 22 + tests/integrate/932_NO_H2_dzp_GPU/result.ref | 8 + tests/integrate/932_NO_H2_dzp_ns2_GPU/INPUT | 28 + tests/integrate/932_NO_H2_dzp_ns2_GPU/KPT | 4 + tests/integrate/932_NO_H2_dzp_ns2_GPU/STRU | 22 + .../932_NO_H2_dzp_ns2_GPU/result.ref | 8 + tests/integrate/932_NO_H2_sz_GPU/INPUT | 27 + tests/integrate/932_NO_H2_sz_GPU/KPT | 4 + tests/integrate/932_NO_H2_sz_GPU/STRU | 22 + tests/integrate/932_NO_H2_sz_GPU/result.ref | 8 + tests/integrate/932_NO_H2_sz_ns2_GPU/INPUT | 28 + tests/integrate/932_NO_H2_sz_ns2_GPU/KPT | 4 + tests/integrate/932_NO_H2_sz_ns2_GPU/STRU | 22 + .../integrate/932_NO_H2_sz_ns2_GPU/result.ref | 8 + tests/integrate/933_NO_H_dzp_GPU/INPUT | 27 + tests/integrate/933_NO_H_dzp_GPU/KPT | 4 + tests/integrate/933_NO_H_dzp_GPU/STRU | 21 + tests/integrate/933_NO_H_dzp_GPU/result.ref | 8 + tests/integrate/933_NO_H_dzp_ns2_GPU/INPUT | 31 + tests/integrate/933_NO_H_dzp_ns2_GPU/KPT | 4 + tests/integrate/933_NO_H_dzp_ns2_GPU/STRU | 21 + .../integrate/933_NO_H_dzp_ns2_GPU/result.ref | 8 + tests/integrate/934_NO_Si2_dzp_GPU/INPUT | 27 + tests/integrate/934_NO_Si2_dzp_GPU/KPT | 4 + tests/integrate/934_NO_Si2_dzp_GPU/STRU | 21 + tests/integrate/934_NO_Si2_dzp_GPU/result.ref | 8 + tests/integrate/934_NO_Si2_dzp_neq_GPU/INPUT | 27 + tests/integrate/934_NO_Si2_dzp_neq_GPU/KPT | 4 + tests/integrate/934_NO_Si2_dzp_neq_GPU/STRU | 21 + .../934_NO_Si2_dzp_neq_GPU/result.ref | 8 + .../934_NO_Si2_dzp_neq_ns2_GPU/INPUT | 
28 + .../integrate/934_NO_Si2_dzp_neq_ns2_GPU/KPT | 4 + .../integrate/934_NO_Si2_dzp_neq_ns2_GPU/STRU | 21 + .../934_NO_Si2_dzp_neq_ns2_GPU/result.ref | 8 + tests/integrate/934_NO_Si2_dzp_ns2_GPU/INPUT | 28 + tests/integrate/934_NO_Si2_dzp_ns2_GPU/KPT | 4 + tests/integrate/934_NO_Si2_dzp_ns2_GPU/STRU | 21 + .../934_NO_Si2_dzp_ns2_GPU/result.ref | 8 + tests/integrate/934_NO_Si2_tzdp_GPU/INPUT | 27 + tests/integrate/934_NO_Si2_tzdp_GPU/KPT | 4 + tests/integrate/934_NO_Si2_tzdp_GPU/STRU | 21 + .../integrate/934_NO_Si2_tzdp_GPU/result.ref | 8 + tests/integrate/934_NO_Si2_tzdp_neq_GPU/INPUT | 27 + tests/integrate/934_NO_Si2_tzdp_neq_GPU/KPT | 4 + tests/integrate/934_NO_Si2_tzdp_neq_GPU/STRU | 21 + .../934_NO_Si2_tzdp_neq_GPU/result.ref | 8 + .../934_NO_Si2_tzdp_neq_ns2_GPU/INPUT | 28 + .../integrate/934_NO_Si2_tzdp_neq_ns2_GPU/KPT | 4 + .../934_NO_Si2_tzdp_neq_ns2_GPU/STRU | 21 + .../934_NO_Si2_tzdp_neq_ns2_GPU/result.ref | 8 + tests/integrate/934_NO_Si2_tzdp_ns2_GPU/INPUT | 28 + tests/integrate/934_NO_Si2_tzdp_ns2_GPU/KPT | 4 + tests/integrate/934_NO_Si2_tzdp_ns2_GPU/STRU | 21 + .../934_NO_Si2_tzdp_ns2_GPU/result.ref | 8 + tests/integrate/CASES_GPU.txt | 16 + 126 files changed, 14911 insertions(+), 702 deletions(-) create mode 100644 examples/gpu/si16_lcao/INPUT create mode 100644 examples/gpu/si16_lcao/KPT create mode 100644 examples/gpu/si16_lcao/STRU create mode 100644 source/module_hamilt_lcao/module_gint/gint_force.h create mode 100644 source/module_hamilt_lcao/module_gint/gint_force_gpu.cu create mode 100644 source/module_hamilt_lcao/module_gint/gint_rho.h create mode 100644 source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu create mode 100644 source/module_hamilt_lcao/module_gint/gint_vl.h create mode 100644 source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu create mode 100644 source/module_hamilt_lcao/module_gint/gtask_force.cpp create mode 100644 source/module_hamilt_lcao/module_gint/gtask_rho.cpp create mode 100644 
source/module_hamilt_lcao/module_gint/gtask_vl.cpp create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cpp create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cu create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cu create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cuh create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/sph.cuh create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cu create mode 100644 source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh create mode 100644 source/module_hamilt_lcao/module_gint/test/CMakeLists.txt create mode 100644 source/module_hamilt_lcao/module_gint/test/test_sph.cpp create mode 100644 source/module_hamilt_lcao/module_gint/test/test_sph.cu create mode 100644 source/module_hamilt_lcao/module_gint/test/test_sph.h create mode 100755 tests/PP_ORB/H_gga_8au_100Ry_1s.orb create mode 100644 tests/PP_ORB/Si_gga_8au_100Ry_3s3p2d.orb create mode 100644 tests/integrate/930_NO_BI2SE2CU2O2_GPU/INPUT create mode 100644 tests/integrate/930_NO_BI2SE2CU2O2_GPU/KPT create mode 100644 tests/integrate/930_NO_BI2SE2CU2O2_GPU/STRU create mode 100644 tests/integrate/930_NO_BI2SE2CU2O2_GPU/result.ref create mode 100644 tests/integrate/931_NO_H20_GPU/INPUT create mode 100644 tests/integrate/931_NO_H20_GPU/KPT create mode 100644 tests/integrate/931_NO_H20_GPU/STRU create mode 100644 
tests/integrate/931_NO_H20_GPU/result.ref create mode 100644 tests/integrate/932_NO_H2_dzp_GPU/INPUT create mode 100644 tests/integrate/932_NO_H2_dzp_GPU/KPT create mode 100644 tests/integrate/932_NO_H2_dzp_GPU/STRU create mode 100644 tests/integrate/932_NO_H2_dzp_GPU/result.ref create mode 100644 tests/integrate/932_NO_H2_dzp_ns2_GPU/INPUT create mode 100644 tests/integrate/932_NO_H2_dzp_ns2_GPU/KPT create mode 100644 tests/integrate/932_NO_H2_dzp_ns2_GPU/STRU create mode 100644 tests/integrate/932_NO_H2_dzp_ns2_GPU/result.ref create mode 100644 tests/integrate/932_NO_H2_sz_GPU/INPUT create mode 100644 tests/integrate/932_NO_H2_sz_GPU/KPT create mode 100644 tests/integrate/932_NO_H2_sz_GPU/STRU create mode 100644 tests/integrate/932_NO_H2_sz_GPU/result.ref create mode 100644 tests/integrate/932_NO_H2_sz_ns2_GPU/INPUT create mode 100644 tests/integrate/932_NO_H2_sz_ns2_GPU/KPT create mode 100644 tests/integrate/932_NO_H2_sz_ns2_GPU/STRU create mode 100644 tests/integrate/932_NO_H2_sz_ns2_GPU/result.ref create mode 100644 tests/integrate/933_NO_H_dzp_GPU/INPUT create mode 100644 tests/integrate/933_NO_H_dzp_GPU/KPT create mode 100644 tests/integrate/933_NO_H_dzp_GPU/STRU create mode 100644 tests/integrate/933_NO_H_dzp_GPU/result.ref create mode 100644 tests/integrate/933_NO_H_dzp_ns2_GPU/INPUT create mode 100644 tests/integrate/933_NO_H_dzp_ns2_GPU/KPT create mode 100644 tests/integrate/933_NO_H_dzp_ns2_GPU/STRU create mode 100644 tests/integrate/933_NO_H_dzp_ns2_GPU/result.ref create mode 100644 tests/integrate/934_NO_Si2_dzp_GPU/INPUT create mode 100644 tests/integrate/934_NO_Si2_dzp_GPU/KPT create mode 100644 tests/integrate/934_NO_Si2_dzp_GPU/STRU create mode 100644 tests/integrate/934_NO_Si2_dzp_GPU/result.ref create mode 100644 tests/integrate/934_NO_Si2_dzp_neq_GPU/INPUT create mode 100644 tests/integrate/934_NO_Si2_dzp_neq_GPU/KPT create mode 100644 tests/integrate/934_NO_Si2_dzp_neq_GPU/STRU create mode 100644 
tests/integrate/934_NO_Si2_dzp_neq_GPU/result.ref create mode 100644 tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/INPUT create mode 100644 tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/KPT create mode 100644 tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/STRU create mode 100644 tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/result.ref create mode 100644 tests/integrate/934_NO_Si2_dzp_ns2_GPU/INPUT create mode 100644 tests/integrate/934_NO_Si2_dzp_ns2_GPU/KPT create mode 100644 tests/integrate/934_NO_Si2_dzp_ns2_GPU/STRU create mode 100644 tests/integrate/934_NO_Si2_dzp_ns2_GPU/result.ref create mode 100644 tests/integrate/934_NO_Si2_tzdp_GPU/INPUT create mode 100644 tests/integrate/934_NO_Si2_tzdp_GPU/KPT create mode 100644 tests/integrate/934_NO_Si2_tzdp_GPU/STRU create mode 100644 tests/integrate/934_NO_Si2_tzdp_GPU/result.ref create mode 100644 tests/integrate/934_NO_Si2_tzdp_neq_GPU/INPUT create mode 100644 tests/integrate/934_NO_Si2_tzdp_neq_GPU/KPT create mode 100644 tests/integrate/934_NO_Si2_tzdp_neq_GPU/STRU create mode 100644 tests/integrate/934_NO_Si2_tzdp_neq_GPU/result.ref create mode 100644 tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/INPUT create mode 100644 tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/KPT create mode 100644 tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/STRU create mode 100644 tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/result.ref create mode 100644 tests/integrate/934_NO_Si2_tzdp_ns2_GPU/INPUT create mode 100644 tests/integrate/934_NO_Si2_tzdp_ns2_GPU/KPT create mode 100644 tests/integrate/934_NO_Si2_tzdp_ns2_GPU/STRU create mode 100644 tests/integrate/934_NO_Si2_tzdp_ns2_GPU/result.ref diff --git a/CMakeLists.txt b/CMakeLists.txt index ea15ac34f7..2ce46a2e5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,9 +13,8 @@ project( option(ENABLE_LCAO "Enable LCAO calculation." ON) option(ENABLE_DEEPKS "Enable DeePKS functionality" OFF) option(ENABLE_LIBXC "Enable LibXC functionality" OFF) -option(USE_CUDA "Enable support to CUDA for PW." 
OFF) +option(USE_CUDA "Enable support to CUDA for ABACUS." OFF) option(ENABLE_FLOAT_FFTW "Enable support to single precision FFTW library." OFF) -# option(USE_CUSOLVER_LCAO "Enable support to CUSOLVER for LCAO." OFF) option(USE_ROCM "Enable support to ROCm." OFF) option(USE_OPENMP "Enable OpenMP in ABACUS." ON) option(ENABLE_ASAN "Enable AddressSanitizer" OFF) @@ -68,11 +67,6 @@ if(ENABLE_RAPIDJSON) include_directories(${RapidJSON_INCLUDE_PATH}) endif() -if(USE_CUDA) - set(USE_CUSOLVER_LCAO ON) -else() - set(USE_CUSOLVER_LCAO OFF) -endif() # get commit info if(COMMIT_INFO) find_package(Git) @@ -247,37 +241,28 @@ endif() include(CheckLanguage) check_language(CUDA) if(CMAKE_CUDA_COMPILER) - if(NOT DEFINED USE_CUDA OR NOT DEFINED USE_CUSOLVER_LCAO) - if(NOT DEFINED USE_CUDA AND NOT DEFINED USE_CUSOLVER_LCAO) - message( - "CUDA components detected. \nWill build the CUDA for PW version of ABACUS by default." - ) - set(USE_CUDA ON) - set(USE_CUSOLVER_LCAO OFF) - elseif(NOT DEFINED USE_CUDA) - set(USE_CUDA OFF) - else() - set(USE_CUSOLVER_LCAO OFF) - endif() + if(NOT DEFINED USE_CUDA) + message( + "CUDA components detected. \nWill build the CUDA version of ABACUS by default." + ) + set(USE_CUDA ON) else() - if(NOT USE_CUDA AND NOT USE_CUSOLVER_LCAO) + if(NOT USE_CUDA) message( STATUS - "CUDA components detected, but both USE_CUDA and USE_CUSOLVER_LCAO set to OFF. NOT building CUDA version of ABACUS." + "CUDA components detected, but USE_CUDA is set to OFF. NOT building CUDA version of ABACUS." 
) endif() endif() else() # CUDA not found - if(USE_CUDA OR USE_CUSOLVER_LCAO) + if(USE_CUDA) message( FATAL_ERROR - "USE_CUDA or USE_CUSOLVER_LCAO set but no CUDA components found.") - set(USE_CUDA OFF) - set(USE_CUSOLVER_LCAO OFF) + "USE_CUDA is set but no CUDA components found.") endif() endif() -if(USE_CUDA OR USE_CUSOLVER_LCAO) +if(USE_CUDA) cmake_minimum_required(VERSION 3.18) # required by `CUDA_ARCHITECTURES` below set_if_higher(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_EXTENSIONS ON) @@ -317,12 +302,12 @@ if(USE_CUDA OR USE_CUSOLVER_LCAO) if(USE_CUDA) add_compile_definitions(__CUDA) add_compile_definitions(__UT_USE_CUDA) - if(CMAKE_BUILD_TYPE STREQUAL "Debug") - set(CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS} "-g -G") + if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G" CACHE STRING "CUDA flags for debug build" FORCE) + endif() + if (USE_OPENMP AND OpenMP_CXX_FOUND) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS}" CACHE STRING "CUDA flags" FORCE) endif() - endif() - if(USE_CUSOLVER_LCAO) - add_compile_definitions(__CUSOLVER_LCAO) endif() endif() @@ -716,7 +701,7 @@ if(ENABLE_LCAO) if(USE_ELPA) target_link_libraries(${ABACUS_BIN_NAME} genelpa) endif() - if(USE_CUSOLVER_LCAO) + if(USE_CUDA) target_link_libraries(diag_cusolver) endif() endif() diff --git a/docs/advanced/acceleration/cuda.md b/docs/advanced/acceleration/cuda.md index bafe979e7b..ebf8d1d52e 100644 --- a/docs/advanced/acceleration/cuda.md +++ b/docs/advanced/acceleration/cuda.md @@ -1,7 +1,6 @@ # CUDA GPU Implementations -In ABACUS, we provide the option to use the GPU devices to accelerate the performance. -And it has the following general features: +In ABACUS, we provide the option to use GPU devices to accelerate performance. The implementation of GPU acceleration differs between PW basis and LCAO basis. 
Specifically, under PW basis, it has the following features: - **Full gpu implementations**: During the SCF progress, `Psi`, `Hamilt`, `Hsolver`, `DiagCG`, and `DiagoDavid` classes are stored or calculated by the GPU devices. @@ -13,6 +12,8 @@ And it has the following general features: - **Parallel strategy**: K point parallel. +Unlike PW basis, only the grid integration module (module_gint) and the diagonalization of the Hamiltonian matrix (module_hsolver) have been implemented with GPU acceleration under LCAO basis, and the acceleration is limited to gamma only calculation. Additionally, LCAO basis does not support multi-GPU acceleration. Both the grid integration module and the Hamiltonian matrix solver only support acceleration on a single GPU. + ## Required hardware/software To compile and use ABACUS in CUDA mode, you currently need to have an NVIDIA GPU and install the corresponding NVIDIA CUDA toolkit software on your system (this is only tested on Linux and unsupported on Windows): @@ -36,14 +37,11 @@ In `INPUT` file we need to set the value keyword [device](../input_files/input-m We provides [examples](https://github.com/deepmodeling/abacus-develop/tree/develop/examples/gpu) of gpu calculations. ## Known limitations - -- CG, BPCG and Davidson methods are supported, so the input keyword `ks_solver` can take the values `cg`, `bpcg` or `dav`, -- Only PW basis is supported, so the input keyword `basis_type` can only take the value `pw`, +PW basis: +- CG, BPCG and Davidson methods are supported, so the input keyword `ks_solver` can take the values `cg`, `bpcg` or `dav`. - Only k point parallelization is supported, so the input keyword `kpar` will be set to match the number of MPI tasks automatically. - By default, CUDA architectures 60, 70, 75, 80, 86, and 89 are compiled (if supported). 
It can be overriden using the CMake variable [`CMAKE_CUDA_ARCHITECTURES`](https://cmake.org/cmake/help/latest/variable/CMAKE_CUDA_ARCHITECTURES.html) or the environmental variable [`CUDAARCHS`](https://cmake.org/cmake/help/latest/envvar/CUDAARCHS.html). -## FAQ -``` -Q: Does the GPU implementations support atomic orbital basis sets? -A: Currently no. -``` +LCAO basis: +- Does not support multi-k calculation, so if the input keyword `device` is set to `gpu`, the input keyword `gamma_only` can only take the value `1`. +- Does not support multi-GPU acceleration. diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md index 164bdd7bf0..a3055c70ec 100644 --- a/docs/advanced/input_files/input-main.md +++ b/docs/advanced/input_files/input-main.md @@ -61,6 +61,7 @@ - [search\_radius](#search_radius) - [search\_pbc](#search_pbc) - [bx, by, bz](#bx-by-bz) + - [num\_stream](#num_stream) - [Electronic structure](#electronic-structure) - [basis\_type](#basis_type) - [ks\_solver](#ks_solver) @@ -643,10 +644,8 @@ If only one value is set (such as `kspacing 0.5`), then kspacing values of a/b/c - cpu: for CPUs via Intel, AMD, or Other supported CPU devices - gpu: for GPUs via CUDA or ROCm. - Known limitations: + Known limitations: If using the pw basis, the ks_solver must be cg/bpcg/dav to support `gpu` acceleration. If using the lcao basis, `gamma_only` must be set to `1`, as multi-k calculation is currently not supported for `gpu`. lcao_in_pw also does not support `gpu`. - - pw basis: required by the `gpu` acceleration options - - cg/bpcg/dav ks_solver: required by the `gpu` acceleration options - **Default**: cpu ### precision @@ -883,6 +882,12 @@ These variables are used to control the numerical atomic orbitals related parame - **Description**: In the matrix operation of grid integral, bx/by/bz grids (in x, y, z directions) are treated as a whole as a matrix element. A different value will affect the calculation speed. 
The default is 0, which means abacus will automatically calculate these values. - **Default**: 0 +### num_stream + +- **Type**: int +- **Description**: Sets the number of GPU streams used for `LCAO` calculations. The effect may vary between devices. For most devices, performance is +sufficient when the number of streams is greater than 2. +- **Default**: 4 [back to top](#full-list-of-input-keywords) ## Electronic structure @@ -914,7 +919,7 @@ calculations. - **genelpa**: This method should be used if you choose localized orbitals. - **scalapack_gvx**: Scalapack can also be used for localized orbitals. - - **cusolver**: (Unavailable currently, it will be fixed in future versions) This method needs building with the cusolver component for lcao and at least one gpu is available. + - **cusolver**: This method needs building with CUDA and at least one gpu is available. If you set ks_solver=`genelpa` for basis_type=`pw`, the program will be stopped with an error message: diff --git a/examples/gpu/si16_lcao/INPUT b/examples/gpu/si16_lcao/INPUT new file mode 100644 index 0000000000..57a4223a38 --- /dev/null +++ b/examples/gpu/si16_lcao/INPUT @@ -0,0 +1,29 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf +device gpu +gamma_only 1 # GPU acceleration currently only support gamma_only set to 1. +ks_solver cusolver # if not set, the default ks_solver is cusolver, + # you can also choose genelpa or scalapack_gvx. 
+ +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 100 +scf_thr 1e-6 +scf_nmax 100 +cal_force 1 +cal_stress 1 + +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 diff --git a/examples/gpu/si16_lcao/KPT b/examples/gpu/si16_lcao/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/examples/gpu/si16_lcao/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/examples/gpu/si16_lcao/STRU b/examples/gpu/si16_lcao/STRU new file mode 100644 index 0000000000..8162035857 --- /dev/null +++ b/examples/gpu/si16_lcao/STRU @@ -0,0 +1,37 @@ +ATOMIC_SPECIES +Si 14.000 ../../../tests/PP_ORB/Si_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../../tests/PP_ORB/Si_gga_8au_100Ry_2s2p1d.orb + +LATTICE_CONSTANT +0.999660 + +LATTICE_VECTORS + 10.20000 10.20000 0.00000 + 10.20000 0.00000 10.20000 + 0.00000 10.20000 10.20000 + +ATOMIC_POSITIONS +Direct + +Si +0.0 +16 + 0.0000000 0.0000000 0.0000000 1 1 1 + 0.1250000 0.1250000 0.1250000 1 1 1 + 0.0000000 0.0000000 0.5000000 1 1 1 + 0.1250000 0.1250000 0.6250000 1 1 1 + 0.0000000 0.5000000 0.0000000 1 1 1 + 0.1250000 0.6250000 0.1250000 1 1 1 + 0.0000000 0.5000000 0.5000000 1 1 1 + 0.1250000 0.6250000 0.6250000 1 1 1 + 0.5000000 0.0000000 0.0000000 1 1 1 + 0.6250000 0.1250000 0.1250000 1 1 1 + 0.5000000 0.0000000 0.5000000 1 1 1 + 0.6250000 0.1250000 0.6250000 1 1 1 + 0.5000000 0.5000000 0.0000000 1 1 1 + 0.6250000 0.6250000 0.1250000 1 1 1 + 0.5000000 0.5000000 0.5000000 1 1 1 + 0.6250000 0.6250000 0.6250000 1 1 1 + diff --git a/source/module_base/global_variable.cpp b/source/module_base/global_variable.cpp index 84ae03d359..926c949c09 100644 --- a/source/module_base/global_variable.cpp +++ b/source/module_base/global_variable.cpp @@ -47,6 +47,7 @@ int CURRENT_K = 0; int CAL_FORCE = 0; // if cal_force >1, means do the grid integration 'cal_force' times. 
double FORCE_THR = 1.0e-3; bool CAL_STRESS = false; +int NUM_STREAM = 4; double PRESS1 = 0.0; double PRESS2 = 0.0; double PRESS3 = 0.0; diff --git a/source/module_base/global_variable.h b/source/module_base/global_variable.h index bf181c84d9..b9168664f1 100644 --- a/source/module_base/global_variable.h +++ b/source/module_base/global_variable.h @@ -47,6 +47,9 @@ extern int CURRENT_K; // 8 extern int CAL_FORCE; // 8.1 extern double FORCE_THR; // 8.2 extern bool CAL_STRESS; // 8.25 calcualte the stress + +extern int NUM_STREAM; + extern double PRESS1; extern double PRESS2; extern double PRESS3; diff --git a/source/module_base/scalapack_connector.h b/source/module_base/scalapack_connector.h index 0294b63f1f..d8723d8880 100644 --- a/source/module_base/scalapack_connector.h +++ b/source/module_base/scalapack_connector.h @@ -99,12 +99,22 @@ extern "C" const std::complex *beta, const std::complex *c, const int *ic, const int *jc, const int *descc); - void pztranc_( + void pztranc_( const int *M, const int *N, const std::complex *alpha, const std::complex *A, const int *IA, const int *JA, const int *DESCA, const std::complex *beta, std::complex *C, const int *IC, const int *JC, const int *DESCC); + + void pdgemr2d_(const int *M, const int *N, + double *A, const int *IA, const int *JA, const int *DESCA, + double *B, const int *IB, const int *JB, const int *DESCB, + const int *ICTXT); + + void pzgemr2d_(const int *M, const int *N, + std::complex *A, const int *IA, const int *JA, const int *DESCA, + std::complex *B, const int *IB, const int *JB, const int *DESCB, + const int *ICTXT); } class ScalapackConnector diff --git a/source/module_basis/module_ao/ORB_control.cpp b/source/module_basis/module_ao/ORB_control.cpp index 6966ca99cf..7e1f3636c4 100644 --- a/source/module_basis/module_ao/ORB_control.cpp +++ b/source/module_basis/module_ao/ORB_control.cpp @@ -340,12 +340,7 @@ void ORB_control::divide_HS_2d( pv->dim0 = (int)sqrt((double)dsize); // mohan update 2012/01/13 // while 
(GlobalV::NPROC_IN_POOL%dim0!=0) - if (ks_solver == "cusolver") - { - pv->dim0 = 1; pv->dim1 = dsize; - } // Xu Shu add 2022-03-25 - else - pv->set_proc_dim(dsize); + pv->set_proc_dim(dsize); if (pv->testpb) ModuleBase::GlobalFunc::OUT(ofs_running, "dim0", pv->dim0); @@ -359,8 +354,6 @@ assert(nb2d > 0); #endif pv->set_block_size(nb2d); // mohan add 2010-06-28 - if (ks_solver == "cusolver") - pv->set_block_size(1); // Xu Shu add 2022-03-25 ModuleBase::GlobalFunc::OUT(ofs_running, "nb2d", pv->get_block_size()); this->set_parameters(ofs_running, ofs_warning); diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/local_orbital_wfc.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/local_orbital_wfc.cpp index cbf4ab56e7..fa651b1509 100644 --- a/source/module_hamilt_lcao/hamilt_lcaodft/local_orbital_wfc.cpp +++ b/source/module_hamilt_lcao/hamilt_lcaodft/local_orbital_wfc.cpp @@ -45,7 +45,7 @@ void Local_Orbital_wfc::gamma_file(psi::Psi* psid, elecstate::ElecState* || GlobalV::KS_SOLVER == "lapack_gvx" || GlobalV::KS_SOLVER == "scalapack_gvx" || GlobalV::KS_SOLVER == "cg_in_lcao" -#ifdef __CUSOLVER_LCAO +#ifdef __CUDA || GlobalV::KS_SOLVER == "cusolver" #endif ) diff --git a/source/module_hamilt_lcao/module_deepks/test/CMakeLists.txt b/source/module_hamilt_lcao/module_deepks/test/CMakeLists.txt index 48d6201514..9e140406d6 100644 --- a/source/module_hamilt_lcao/module_deepks/test/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_deepks/test/CMakeLists.txt @@ -31,7 +31,7 @@ if(USE_ELPA) genelpa ) endif() -if(USE_CUSOLVER_LCAO) +if(USE_CUDA) target_link_libraries(diag_cusolver) endif() diff --git a/source/module_hamilt_lcao/module_gint/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/CMakeLists.txt index 07b4fcddba..e0fb786318 100644 --- a/source/module_hamilt_lcao/module_gint/CMakeLists.txt +++ b/source/module_hamilt_lcao/module_gint/CMakeLists.txt @@ -1,3 +1,5 @@ +#add_subdirectory(kernels/cuda) + list(APPEND objects gint.cpp gint_gamma.cpp @@ -20,6 +22,22 @@ 
list(APPEND objects grid_technique.cpp ) +if(USE_CUDA) + list(APPEND objects + kernels/cuda/cuda_tools.cu + kernels/cuda/vbatch_matrix_mul.cu + kernels/cuda/gint_vl.cu + kernels/cuda/gint_rho.cu + kernels/cuda/gint_force.cu + gint_vl_gpu.cu + gint_rho_gpu.cu + gint_force_gpu.cu + gtask_vl.cpp + gtask_rho.cpp + gtask_force.cpp + ) +endif() + add_library( gint OBJECT @@ -29,3 +47,7 @@ add_library( if(ENABLE_COVERAGE) add_coverage(gint) endif() + +IF (BUILD_TESTING) + add_subdirectory(test) +endif() \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp index 8ec71c1939..2392d21df2 100644 --- a/source/module_hamilt_lcao/module_gint/gint.cpp +++ b/source/module_hamilt_lcao/module_gint/gint.cpp @@ -1,11 +1,16 @@ #include "gint.h" +#if ((defined __CUDA)) +#include "gint_force.h" +#include "gint_rho.h" +#include "gint_vl.h" +#endif + #include "module_base/memory.h" #include "module_base/timer.h" #include "module_basis/module_ao/ORB_read.h" -#include "module_hamilt_pw/hamilt_pwdft/global.h" #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h" - +#include "module_hamilt_pw/hamilt_pwdft/global.h" #ifdef _OPENMP #include #endif @@ -28,7 +33,7 @@ Gint::~Gint() #endif } -void Gint::cal_gint(Gint_inout *inout) +void Gint::cal_gint(Gint_inout* inout) { ModuleBase::timer::tick("Gint_interface", "cal_gint"); @@ -63,28 +68,130 @@ void Gint::cal_gint(Gint_inout *inout) ModuleBase::timer::tick("Gint_interface","cal_gint_force_meta"); } - const int max_size = this->gridt->max_atom; - const int LD_pool = max_size*GlobalC::ucell.nwmax; + const int max_size = this->gridt->max_atom; + const int LD_pool = max_size * GlobalC::ucell.nwmax; const int lgd = this->gridt->lgd; const int nnrg = this->gridt->nnrg; - if(max_size!=0) + if (max_size != 0) { +#ifdef __CUDA + if (GlobalV::device_flag == "gpu" && GlobalV::GAMMA_ONLY_LOCAL + && lgd > 0) + { + double ylmcoef[100]; + 
ModuleBase::GlobalFunc::ZEROS(ylmcoef, 100); + for (int i = 0; i < 100; i++) + { + ylmcoef[i] = ModuleBase::Ylm::ylmcoef[i]; + } + + const int ntype = GlobalC::ORB.get_ntype(); + double* rcut = new double[ntype]; + for (int it = 0; it < ntype; it++) + { + rcut[it] = GlobalC::ORB.Phi[it].getRcut(); + } + + const double dr = GlobalC::ORB.dr_uniform; + + if (inout->job == Gint_Tools::job_type::vlocal) + { + GintKernel::gint_gamma_vl_gpu(this->hRGint, + lgd, + max_size, + GlobalC::ucell.omega + / this->ncxyz, + inout->vl, + ylmcoef, + this->nplane, + this->nbxx, + dr, + rcut, + *this->gridt, + GlobalC::ucell); + } + else if (inout->job == Gint_Tools::job_type::rho) + { + int nrxx = this->gridt->ncx * this->gridt->ncy * this->nplane; + for (int is = 0; is < GlobalV::NSPIN; ++is) + { + ModuleBase::GlobalFunc::ZEROS(inout->rho[is], nrxx); + GintKernel::gint_gamma_rho_gpu(this->DMRGint[is], + this->nplane, + ylmcoef, + dr, + rcut, + *this->gridt, + GlobalC::ucell, + inout->rho[is]); + } + } + else if (inout->job == Gint_Tools::job_type::force) + { + const int ncyz = this->ny * this->nplane; + int nat = GlobalC::ucell.nat; + // for (int is = 0; is < GlobalV::NSPIN; ++is) + // { + double *force = new double[GlobalC::ucell.nat * 3]; + for (int i = 0; i < nat * 3; i++) + { + force[i] = 0.0; + } + double *stress = new double[6]; + for (int i = 0; i < 6; i++) + { + stress[i] = 0.0; + } + GintKernel::gint_gamma_force_gpu(this->DMRGint[inout->ispin], + GlobalC::ucell.omega + / this->ncxyz, + inout->vl, + force, + stress, + this->nplane, + dr, + rcut, + *this->gridt, + GlobalC::ucell); + for (int iat = 0; iat < nat; iat++) + { + inout->fvl_dphi[0](iat, 0) += force[iat * 3]; + inout->fvl_dphi[0](iat, 1) += force[iat * 3 + 1]; + inout->fvl_dphi[0](iat, 2) += force[iat * 3 + 2]; + } + inout->svl_dphi[0](0, 0) += stress[0]; + inout->svl_dphi[0](0, 1) += stress[1]; + inout->svl_dphi[0](0, 2) += stress[2]; + inout->svl_dphi[0](1, 1) += stress[3]; + inout->svl_dphi[0](1, 2) += stress[4]; 
+ inout->svl_dphi[0](2, 2) += stress[5]; + + delete[] force; + delete[] stress; + // } + } + } + else +#endif + { #ifdef __MKL - const int mkl_threads = mkl_get_max_threads(); - mkl_set_num_threads(1); + const int mkl_threads = mkl_get_max_threads(); + mkl_set_num_threads(1); #endif #ifdef _OPENMP - #pragma omp parallel +#pragma omp parallel #endif - { - //prepare some constants - const int ncyz = this->ny*this->nplane; // mohan add 2012-03-25 - const double dv = GlobalC::ucell.omega/this->ncxyz; + { + // prepare some constants + const int ncyz + = this->ny * this->nplane; // mohan add 2012-03-25 + const double dv = GlobalC::ucell.omega / this->ncxyz; - // it's a uniform grid to save orbital values, so the delta_r is a constant. - const double delta_r = GlobalC::ORB.dr_uniform; + // it's a uniform grid to save orbital values, so the delta_r is + // a constant. + const double delta_r = GlobalC::ORB.dr_uniform; if((inout->job==Gint_Tools::job_type::vlocal || inout->job==Gint_Tools::job_type::vlocal_meta) @@ -100,18 +207,25 @@ void Gint::cal_gint(Gint_inout *inout) } } - if(inout->job==Gint_Tools::job_type::dvlocal) - { - if(GlobalV::GAMMA_ONLY_LOCAL) - { - ModuleBase::WARNING_QUIT("Gint_interface::cal_gint","dvlocal only for k point!"); - } - ModuleBase::GlobalFunc::ZEROS(this->pvdpRx_reduced[inout->ispin], nnrg); - ModuleBase::GlobalFunc::ZEROS(this->pvdpRy_reduced[inout->ispin], nnrg); - ModuleBase::GlobalFunc::ZEROS(this->pvdpRz_reduced[inout->ispin], nnrg); - } + if (inout->job == Gint_Tools::job_type::dvlocal) + { + if (GlobalV::GAMMA_ONLY_LOCAL) + { + ModuleBase::WARNING_QUIT("Gint_interface::cal_gint", + "dvlocal only for k point!"); + } + ModuleBase::GlobalFunc::ZEROS( + this->pvdpRx_reduced[inout->ispin], + nnrg); + ModuleBase::GlobalFunc::ZEROS( + this->pvdpRy_reduced[inout->ispin], + nnrg); + ModuleBase::GlobalFunc::ZEROS( + this->pvdpRz_reduced[inout->ispin], + nnrg); + } - //perpare auxiliary arrays to store thread-specific values + // perpare auxiliary 
arrays to store thread-specific values #ifdef _OPENMP double* pvpR_thread = nullptr; hamilt::HContainer* hRGint_thread = nullptr;// auxiliary pointer for multi-threading @@ -160,13 +274,14 @@ void Gint::cal_gint(Gint_inout *inout) } } - #pragma omp for +#pragma omp for #endif - // entering the main loop of grid points - for(int grid_index = 0; grid_index < this->nbxx; grid_index++) - { - // get the value: how many atoms has orbital value on this grid. - const int na_grid = this->gridt->how_many_atoms[ grid_index ]; + // entering the main loop of grid points + for (int grid_index = 0; grid_index < this->nbxx; grid_index++) + { + // get the value: how many atoms has orbital value on this + // grid. + const int na_grid = this->gridt->how_many_atoms[grid_index]; if(na_grid==0) { @@ -312,47 +427,59 @@ void Gint::cal_gint(Gint_inout *inout) } // int grid_index #ifdef _OPENMP - if(inout->job==Gint_Tools::job_type::vlocal || inout->job==Gint_Tools::job_type::vlocal_meta) - { - if(GlobalV::GAMMA_ONLY_LOCAL && lgd>0) + if (inout->job == Gint_Tools::job_type::vlocal + || inout->job == Gint_Tools::job_type::vlocal_meta) { - #pragma omp critical(gint_gamma) + if (GlobalV::GAMMA_ONLY_LOCAL && lgd > 0) { - BlasConnector::axpy(this->hRGint->get_nnr(), 1.0, hRGint_thread->get_wrapper(), 1, this->hRGint->get_wrapper(), 1); +#pragma omp critical(gint_gamma) + { + BlasConnector::axpy(this->hRGint->get_nnr(), + 1.0, + hRGint_thread->get_wrapper(), + 1, + this->hRGint->get_wrapper(), + 1); + } + delete hRGint_thread; + } + if (!GlobalV::GAMMA_ONLY_LOCAL) + { +#pragma omp critical(gint_k) + { + BlasConnector::axpy(nnrg, + 1.0, + pvpR_thread, + 1, + pvpR_reduced[inout->ispin], + 1); + } + delete[] pvpR_thread; } - delete hRGint_thread; } - if(!GlobalV::GAMMA_ONLY_LOCAL) + +#pragma omp critical(gint) + if (inout->job == Gint_Tools::job_type::force + || inout->job == Gint_Tools::job_type::force_meta) { - #pragma omp critical(gint_k) - { - BlasConnector::axpy(nnrg, 1.0, pvpR_thread, 1, 
pvpR_reduced[inout->ispin], 1); - } - delete[] pvpR_thread; + if (inout->isforce) + { + inout->fvl_dphi[0] += fvl_dphi_thread; + } + if (inout->isstress) + { + inout->svl_dphi[0] += svl_dphi_thread; + } } - } - - #pragma omp critical(gint) - if(inout->job==Gint_Tools::job_type::force || inout->job==Gint_Tools::job_type::force_meta) - { - if(inout->isforce) - { - inout->fvl_dphi[0]+=fvl_dphi_thread; - } - if(inout->isstress) - { - inout->svl_dphi[0]+=svl_dphi_thread; - } - } #endif - } // end of #pragma omp parallel - + } // end of #pragma omp parallel #ifdef __MKL - mkl_set_num_threads(mkl_threads); + mkl_set_num_threads(mkl_threads); #endif - } // end of if (max_size) + } - ModuleBase::timer::tick("Gint_interface", "cal_gint"); + } // end of if (max_size) + ModuleBase::timer::tick("Gint_interface", "cal_gint"); if(inout->job==Gint_Tools::job_type::vlocal) ModuleBase::timer::tick("Gint_interface", "cal_gint_vlocal"); if(inout->job==Gint_Tools::job_type::vlocal_meta) ModuleBase::timer::tick("Gint_interface","cal_gint_vlocal_meta"); @@ -364,29 +491,28 @@ void Gint::cal_gint(Gint_inout *inout) return; } -void Gint::prep_grid( - const Grid_Technique& gt, - const int& nbx_in, - const int &nby_in, - const int &nbz_in, - const int &nbz_start_in, - const int& ncxyz_in, - const int& bx_in, - const int& by_in, - const int& bz_in, - const int& bxyz_in, - const int& nbxx_in, - const int& ny_in, - const int& nplane_in, - const int& startz_current_in) +void Gint::prep_grid(const Grid_Technique& gt, + const int& nbx_in, + const int& nby_in, + const int& nbz_in, + const int& nbz_start_in, + const int& ncxyz_in, + const int& bx_in, + const int& by_in, + const int& bz_in, + const int& bxyz_in, + const int& nbxx_in, + const int& ny_in, + const int& nplane_in, + const int& startz_current_in) { - ModuleBase::TITLE(GlobalV::ofs_running,"Gint_k","prep_grid"); + ModuleBase::TITLE(GlobalV::ofs_running, "Gint_k", "prep_grid"); this->gridt = > this->nbx = nbx_in; - this->nby = nby_in; - 
this->nbz = nbz_in; - this->ncxyz = ncxyz_in; + this->nby = nby_in; + this->nbz = nbz_in; + this->ncxyz = ncxyz_in; this->nbz_start = nbz_start_in; this->bx = bx_in; this->by = by_in; @@ -397,8 +523,8 @@ void Gint::prep_grid( this->nplane = nplane_in; this->startz_current = startz_current_in; assert(nbx > 0); - assert(nby>0); - assert(nbz>=0); + assert(nby > 0); + assert(nbz >= 0); assert(ncxyz > 0); assert(bx > 0); assert(by > 0); @@ -409,16 +535,14 @@ void Gint::prep_grid( assert(nplane >= 0); assert(startz_current >= 0); - assert( GlobalC::ucell.omega > 0.0); + assert(GlobalC::ucell.omega > 0.0); - return; + return; } -void Gint::initialize_pvpR( - const UnitCell& ucell_in, - Grid_Driver* gd) +void Gint::initialize_pvpR(const UnitCell& ucell_in, Grid_Driver* gd) { - ModuleBase::TITLE("Gint","initialize_pvpR"); + ModuleBase::TITLE("Gint", "initialize_pvpR"); int npol = 1; // there is the only resize code of DMRGint @@ -457,45 +581,47 @@ void Gint::initialize_pvpR( } this->DMRGint_full = new hamilt::HContainer(ucell_in.nat); #endif - } + } - // prepare the row_index and col_index for construct AtomPairs, they are same, name as orb_index - std::vector orb_index(ucell_in.nat + 1); - orb_index[0] = 0; - for(int i=1;i orb_index_npol; - if(npol == 2) - { - orb_index_npol.resize(ucell_in.nat + 1); - orb_index_npol[0] = 0; - for(int i=1;i orb_index(ucell_in.nat + 1); + orb_index[0] = 0; + for (int i = 1; i < orb_index.size(); i++) + { + int type = ucell_in.iat2it[i - 1]; + orb_index[i] = orb_index[i - 1] + ucell_in.atoms[type].nw; + } + std::vector orb_index_npol; + if (npol == 2) + { + orb_index_npol.resize(ucell_in.nat + 1); + orb_index_npol[0] = 0; + for (int i = 1; i < orb_index_npol.size(); i++) + { + int type = ucell_in.iat2it[i - 1]; + orb_index_npol[i] + = orb_index_npol[i - 1] + ucell_in.atoms[type].nw * npol; + } + } - if(GlobalV::GAMMA_ONLY_LOCAL && GlobalV::NSPIN != 4) - { - this->hRGint->fix_gamma(); - } - for (int T1 = 0; T1 < ucell_in.ntype; ++T1) - { - 
const Atom* atom1 = &(ucell_in.atoms[T1]); - for (int I1 = 0; I1 < atom1->na; ++I1) - { - auto& tau1 = atom1->tau[I1]; + if (GlobalV::GAMMA_ONLY_LOCAL && GlobalV::NSPIN != 4) + { + this->hRGint->fix_gamma(); + } + for (int T1 = 0; T1 < ucell_in.ntype; ++T1) + { + const Atom* atom1 = &(ucell_in.atoms[T1]); + for (int I1 = 0; I1 < atom1->na; ++I1) + { + auto& tau1 = atom1->tau[I1]; - gd->Find_atom(ucell_in, tau1, T1, I1); + gd->Find_atom(ucell_in, tau1, T1, I1); - const int iat1 = ucell_in.itia2iat(T1,I1); + const int iat1 = ucell_in.itia2iat(T1, I1); - // for grid integration (on FFT box), - // we only need to consider , + // for grid integration (on FFT box), + // we only need to consider , // whether this atom is in this processor. if(this->gridt->in_this_processor[iat1]) @@ -584,20 +710,19 @@ void Gint::initialize_pvpR( this->DMRGint_full->allocate(nullptr, 0); ModuleBase::Memory::record("Gint::DMRGint_full",this->DMRGint_full->get_memory_size()); #endif - } - + } } void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) { - ModuleBase::TITLE("Gint","transfer_DMR"); - ModuleBase::timer::tick("Gint","transfer_DMR"); - if(GlobalV::NSPIN != 4) - { - for (int is = 0; is < this->DMRGint.size(); is++) - { + ModuleBase::TITLE("Gint", "transfer_DMR"); + ModuleBase::timer::tick("Gint", "transfer_DMR"); + if (GlobalV::NSPIN != 4) + { + for (int is = 0; is < this->DMRGint.size(); is++) + { #ifdef __MPI - hamilt::transferParallels2Serials(*DM2D[is], DMRGint[is]); + hamilt::transferParallels2Serials(*DM2D[is], DMRGint[is]); #else this->DMRGint[is]->set_zero(); this->DMRGint[is]->add(*DM2D[is]); @@ -607,7 +732,7 @@ void Gint::transfer_DM2DtoGrid(std::vector*> DM2D) else // NSPIN=4 case { #ifdef __MPI - hamilt::transferParallels2Serials(*DM2D[0], this->DMRGint_full); + hamilt::transferParallels2Serials(*DM2D[0], this->DMRGint_full); #else this->DMRGint_full = DM2D[0]; #endif diff --git a/source/module_hamilt_lcao/module_gint/gint_force.h 
b/source/module_hamilt_lcao/module_gint/gint_force.h new file mode 100644 index 0000000000..6b6718e142 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/gint_force.h @@ -0,0 +1,264 @@ +#ifndef GINT_FORCE_H +#define GINT_FORCE_H + +#include "module_hamilt_lcao/module_gint/gint.h" +#include "module_hamilt_lcao/module_gint/grid_technique.h" +namespace GintKernel +{ + +typedef struct +{ + int stream_num; + double* input_dou; + int* input_int; + int* num_psir; + int* atom_pair_A_m; + int* atom_pair_B_n; + int* atom_pair_K; + int* atom_pair_lda; + int* atom_pair_ldb; + int* atom_pair_ldc; + double* input_double_g; + int* input_int_g; + int* num_psir_g; + double* psir_dm_device; + double* psir_r_device; + double* psir_lx_device; + double* psir_ly_device; + double* psir_lz_device; + double* psir_lxx_device; + double* psir_lxy_device; + double* psir_lxz_device; + double* psir_lyy_device; + double* psir_lyz_device; + double* psir_lzz_device; + int* A_m_device; + int* B_n_device; + int* K_device; + int* lda_device; + int* ldb_device; + int* ldc_device; + double** matrix_A; + double** matrix_B; + double** matrix_C; + double** matrix_A_device; + double** matrix_B_device; + double** matrix_C_device; +} SGridParameter; + +typedef struct +{ + double* stress_device; + double* stress_host; + double* force_device; + double* force_host; + int* iat_device; + int* iat_host; + +} ForceStressIat; + +typedef struct +{ + double* stress_global; + double* force_global; + int* iat_global; +} ForceStressIatGlobal; + +typedef struct +{ + double* density_mat_h; + double* density_mat_d; +} DensityMat; + +/** + * @brief Calculate forces using GPU. + * + * This function calculates forces and stress for a given set of parameters. + * + * @param dm A pointer to hamilt::HContainer. + * @param vfactor Scaling factor for forces. + * @param vlocal Local potential values. + * @param force Output array for forces. + * @param stress Output array for stress. + * @param nczp Size parameter. 
+ * @param ylmcoef_now Coefficients for spherical harmonics. + * @param gridt Reference to Grid_Technique object. + */ +void gint_gamma_force_gpu(hamilt::HContainer* dm, + const double vfactor, + const double* vlocal, + double* force, + double* stress, + const int nczp, + double dr, + double* rcut, + const Grid_Technique& gridt, + const UnitCell& ucell); + +/** + * @brief GPU task generator for forces. + * + * This function generates GPU tasks for force calculations. + * + * @param gridt Reference to Grid_Technique object. + * @param i Value of i, stands for the x-axis grid. + * @param j Value of j, stands for the y-axis grid. + * @param psi_size_max Maximum size of psi. + * @param max_size Maximum size of atoms on a grid. + * @param nczp Size parameter, stands for the current z-axis grids. + * @param vfactor Scaling factor for the local potential. + * @param rcut Cutoff radius of the orbitals of each atom type. + * @param vlocal_global_value Global values of local potential. + * @param iat_per_nbz save the number of the iat on per nbz grids. + * @param lgd Value of lgd, stands for the local grid dimension. + * @param num_psir Array for num_psir values, contains the number of the + * atom psir on a grid. + * @param dm_matrix_g GPU array for dm_matrix, sent as the density matrix. + * @param max_m Maximum value of m, stands for the max number of mat_m. + * @param max_n Maximum value of n, stands for the max number of mat_n. + * @param atom_pair_num Number of atom pairs, stands for the number of matrix multiplications.
+ * @param para Grid parameter in task generator. + */ + +void gpu_task_generator_force(const Grid_Technique& gridt, + const UnitCell& ucell, + const int i, + const int j, + const int psi_size_max, + const int max_size, + const int nczp, + const double vfactor, + double* ruct, + const double* vlocal_global_value, + int* iat_per_nbz, + const int lgd, + double* dm_matrix_g, + int& max_m, + int& max_n, + int& atom_pair_num, + SGridParameter& para); +/** + * @brief Density matrix, force, stress and iat initialization + * + * Using structure to init the parameter + * + * @param denstiy_mat DensityMat, contains the device density matrix and + * the host density matrix + * @param f_s_iat_dev ForceStressIatGlobal, contains the force, stress and + * iat number + * @param dm hamilt::HContainer, density stored in the HContainer + * @param gridt Grid_Technique, stores the major methods used in the gint. + * @param ucell UnitCell, stores the cell tools + * @param lgd Value of lgd, stands for the local grid dimension. + * @param cuda_block in stress computation, used for block nums + * @param atom_num_grid in force calculation, used for block nums + */ +void calculateInit(DensityMat& denstiy_mat, + ForceStressIatGlobal& f_s_iat_dev, + hamilt::HContainer* dm, + const Grid_Technique& gridt, + const UnitCell& ucell, + const int lgd, + const int cuda_block, + const int atom_num_grid); + +/** + * @brief Density matrix, from HContainer to structure + * + * Using structure to init the parameter + * + * @param matrix_host double*, holds the host density matrix + * @param dm hamilt::HContainer, density stored in the HContainer + * @param gridt Grid_Technique, stores the major methods used in the gint.
+ */ +void allocateDm(double* matrix_host, + hamilt::HContainer* dm, + const Grid_Technique& gridt, + const UnitCell& ucell); + +/** + * @brief grid parameter Init + * + * GridParameter init + * + * @param para double *,contained the destiyMatHost + * @param iter_num int , used for calcute the stream + * @param nbz int,stand for the number of Z-axis + * @param gridt Grid_Technique,stored the major method in the the gint. + */ +void para_init(SGridParameter& para, + const int iter_num, + const int nbz, + const Grid_Technique& gridt); +/** + * @brief ForceStressIat on host and device Init + * + * GridParameter init + * + * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Host + * @param stream_num int , record the stream in GPU + * @param cuda_block in stress compute,used for Block nums + * @param atom_num_grid in force calculate,used for Block nums + * @param max_size Maximum size of atoms on a grid. + * @param ForceStressIatGlobal ForceStressIatGlobal,contains the Force Stree Iat on Host + */ +void cal_init(ForceStressIat& f_s_iat, + const int stream_num, + const int cuda_block, + const int atom_num_grid, + const int max_size, + const ForceStressIatGlobal& f_s_iatg); +/** + * @brief GridParameter memCpy,from Host to Device + * + * parameter init,which contains the gpu task and multi matrix multiplication + * + * @param para Grid parameter in task generator, + * @param gridt Grid_Technique,stored the major method in the the gint. + * @param nbz int,stand for the number of Z-axis + * @param atom_num_grid in force calculate,used for Block nums + */ +void para_mem_copy(SGridParameter& para, + const Grid_Technique& gridt, + const int nbz, + const int atom_num_grid); +/** + * @brief Force Stress Force Iat memCpy,from Host to Device + * + * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Device + * and Host + * @param gridt Grid_Technique,stored the major method in the the gint. 
+ * @param atom_num_grid in force calculation, used for block nums + * @param cuda_block in stress computation, used for block nums + * @param stream_num int, records the stream in GPU + */ +void cal_mem_cpy(ForceStressIat& f_s_iat, + const Grid_Technique& gridt, + const int atom_num_grid, + const int cuda_block, + const int stream_num); +/** + * @brief Force Calculate on Host + * + * @param f_s_iat ForceStressIat, contains the force, stress and iat on device + * and host + * @param force stores the force for each atom in each direction + * @param atom_num_grid in force calculation, used for block nums + */ +void cal_force_add(ForceStressIat& f_s_iat, + double* force, + const int atom_num_grid); +/** + * @brief Stress Calculate on Host + * + * @param f_s_iat ForceStressIat, contains the force, stress and iat on device + * and host + * @param stress stores the stress for each direction + * @param cuda_block in stress computation, used for block nums + */ +void cal_stress_add(ForceStressIat& f_s_iat, + double* stress, + const int cuda_block); +} // namespace GintKernel +#endif diff --git a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu new file mode 100644 index 0000000000..30ea8fa6ab --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu @@ -0,0 +1,248 @@ +#include + +#include +#include + +#include "gint_force.h" +#include "kernels/cuda/cuda_tools.cuh" +#include "kernels/cuda/gint_force.cuh" +#include "module_base/ylm.h" +#include "module_hamilt_lcao/module_gint/gint_tools.h" + +namespace GintKernel +{ + +// Function to calculate forces using GPU-accelerated gamma point Gint +/** + * @brief Calculate forces and stresses for the `gint_gamma_force_gpu` function. + * + * This function calculates forces and stresses based on given parameters. + * + * @param dm A pointer to the HContainer object. + * @param vfactor The scaling factor for some calculation.
+ * @param vlocal A pointer to an array of doubles. + * @param force A pointer to an array to store the calculated forces. + * @param stress A pointer to an array to store the calculated stresses. + * @param nczp An integer representing a parameter. + * @param ylmcoef_now A pointer to an array of doubles representing Ylm + * coefficients. + * @param gridt A reference to a Grid_Technique object. + */ +/** + * Function to calculate forces using GPU-accelerated gamma point Gint + * @brief Calculate forces and stresses for the `gint_gamma_force_gpu` function. + * + * This function calculates forces and stresses based on given parameters. + * + * @param dm Pointer to the HContainer object. + * @param vfactor The scaling factor for the grid calculation. + * @param vlocal One-dimensional array that holds the local potential of each + * grid. + * @param force One-dimensional array that holds the force on each atom (3 components per atom). + * @param stress One-dimensional array that holds the 6 independent stress components. + * @param nczp The number of grid layers in the C direction. + * @param dr Uniform radial grid spacing of the tabulated orbitals. + * @param rcut Cutoff radius of the orbitals of each atom type. + * @param gridt The Grid_Technique object containing grid information. + * + * @note The grid integration on the GPU is mainly divided into the following + * steps: + * 1. Use the CPU to divide the grid integration into subtasks. + * 2. Copy the subtask information to the GPU. + * 3. Calculate the matrix elements on the GPU. + * 4. Perform matrix multiplication on the GPU. + * 5. force dot on the GPU. + * 6. stress dot on the GPU. + * 7. Copy the results back to the host.
+ */ +void gint_gamma_force_gpu(hamilt::HContainer* dm, + const double vfactor, + const double* vlocal, + double* force, + double* stress, + const int nczp, + double dr, + double* rcut, + const Grid_Technique& gridt, + const UnitCell& ucell) +{ + const int nbz = gridt.nbzp; + const int lgd = gridt.lgd; + const int max_size = gridt.max_atom; + const int nwmax = ucell.nwmax; + const int bxyz = gridt.bxyz; + const int atom_num_grid = nbz * bxyz * max_size; + const int cuda_threads = 256; + const int cuda_block + = std::min(64, (gridt.psir_size + cuda_threads - 1) / cuda_threads); + int iter_num = 0; + DensityMat denstiy_mat; + ForceStressIatGlobal f_s_iat_dev; + SGridParameter para; + ForceStressIat f_s_iat; + + calculateInit(denstiy_mat, + f_s_iat_dev, + dm, + gridt, + ucell, + lgd, + cuda_block, + atom_num_grid); + /*cuda stream allocate */ + for (int i = 0; i < gridt.nstreams; i++) + { + checkCuda(cudaStreamSynchronize(gridt.streams[i])); + } + + /*compute the psi*/ + for (int i = 0; i < gridt.nbx; i++) + { + for (int j = 0; j < gridt.nby; j++) + { + + int max_m = 0; + int max_n = 0; + int atom_pair_num = 0; + dim3 grid_psi(nbz, 8); + dim3 block_psi(64); + dim3 grid_dot_force(cuda_block); + dim3 block_dot_force(cuda_threads); + dim3 grid_dot(cuda_block); + dim3 block_dot(cuda_threads); + + para_init(para, iter_num, nbz, gridt); + cal_init(f_s_iat, + para.stream_num, + cuda_block, + atom_num_grid, + max_size, + f_s_iat_dev); + checkCuda(cudaStreamSynchronize(gridt.streams[para.stream_num])); + + /*gpu task compute in CPU */ + gpu_task_generator_force(gridt, + ucell, + i, + j, + gridt.psi_size_max_z, + max_size, + nczp, + vfactor, + rcut, + vlocal, + f_s_iat.iat_host, + lgd, + denstiy_mat.density_mat_d, + max_m, + max_n, + atom_pair_num, + para); + /*variables memcpy to gpu host*/ + para_mem_copy(para, + gridt, + nbz, + atom_num_grid); + cal_mem_cpy(f_s_iat, + gridt, + atom_num_grid, + cuda_block, + para.stream_num); + 
checkCuda(cudaStreamSynchronize(gridt.streams[para.stream_num])); + /* cuda stream compute and Multiplication of multinomial matrices */ + get_psi_force<<>>( + gridt.ylmcoef_g, + dr, + gridt.bxyz, + ucell.nwmax, + para.input_double_g, + para.input_int_g, + para.num_psir_g, + gridt.psi_size_max_z, + gridt.atom_nwl_g, + gridt.atom_new_g, + gridt.atom_ylm_g, + gridt.atom_l_g, + gridt.atom_nw_g, + gridt.nr_max, + gridt.psi_u_g, + para.psir_r_device, + para.psir_lx_device, + para.psir_ly_device, + para.psir_lz_device, + para.psir_lxx_device, + para.psir_lxy_device, + para.psir_lxz_device, + para.psir_lyy_device, + para.psir_lyz_device, + para.psir_lzz_device); + checkCudaLastError(); + gridt.fastest_matrix_mul(max_m, + max_n, + para.A_m_device, + para.B_n_device, + para.K_device, + para.matrix_A_device, + para.lda_device, + para.matrix_B_device, + para.ldb_device, + para.matrix_C_device, + para.ldc_device, + atom_pair_num, + gridt.streams[para.stream_num], + nullptr); + + checkCuda(cudaStreamSynchronize(gridt.streams[para.stream_num])); + /* force compute in GPU */ + dot_product_force<<>>( + para.psir_lx_device, + para.psir_ly_device, + para.psir_lz_device, + para.psir_dm_device, + f_s_iat.force_device, + f_s_iat.iat_device, + nwmax, + max_size, + gridt.psir_size / nwmax); + /* force compute in CPU*/ + cal_force_add(f_s_iat, force, atom_num_grid); + + /*stress compute in GPU*/ + dot_product_stress<<>>( + para.psir_lxx_device, + para.psir_lxy_device, + para.psir_lxz_device, + para.psir_lyy_device, + para.psir_lyz_device, + para.psir_lzz_device, + para.psir_dm_device, + f_s_iat.stress_device, + gridt.psir_size); + /* stress compute in CPU*/ + cal_stress_add(f_s_iat, stress, cuda_block); + iter_num++; + } + } + // cudaFree(f_s_iat.stress_device); + // cudaFree(f_s_iat.force_device); + // cudaFree(f_s_iat.iat_device); + delete[] f_s_iat.stress_host; + delete[] f_s_iat.force_host; + delete[] f_s_iat.iat_host; + /*free variables in CPU host*/ + for (int i = 0; i < 
gridt.nstreams; i++) + { + checkCuda(cudaStreamSynchronize(gridt.streams[i])); + } +} + +} // namespace GintKernel diff --git a/source/module_hamilt_lcao/module_gint/gint_rho.h b/source/module_hamilt_lcao/module_gint/gint_rho.h new file mode 100644 index 0000000000..bcb1925645 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/gint_rho.h @@ -0,0 +1,114 @@ +#ifndef GINT_RHO_H +#define GINT_RHO_H +#include +#include // for CUDA_VERSION +#include + +#include "module_hamilt_lcao/module_gint/gint.h" +#include "module_hamilt_lcao/module_gint/grid_technique.h" + +cudaError_t checkCuda(cudaError_t result); +namespace GintKernel +{ + +/** + * calculate the rho by GPU + * + * @param dm density matrix. + * @param nczp number of meshcells along the z-axis on this processor. + * @param ylmcoef_now coefficients for the spherical harmonics expansion. + * @param dr The grid spacing. + * @param rcut Pointer to the cutoff radius array. + * @param gridt Grid_Technique object containing grid information. + * @param ucell UnitCell. + * @param rho rho. + */ +void gint_gamma_rho_gpu(const hamilt::HContainer* dm, + const int nczp, + const double* ylmcoef_now, + const double dr, + const double* rcut, + const Grid_Technique& gridt, + const UnitCell& ucell, + double* rho); + +/** + * generate GPU tasks for computing the rho. + * the computation task can be divided into psir calculation, matrix + * multiplication and vector dot product. the matrix multiplication is mat_dm * + * mat_psir, and the vector dot product is psir * psir_dm. This function will be + * split into three separate functions, which are calculating psir, matrix + * multiplication, and vector dot product. + * + * @param gridt Grid_Technique object containing grid information. + * @param i X index of the bigcell. + * @param j Y index of the bigcell. + * @param max_size maximum number of atoms on a meshcell. + * @param nczp number of meshcells along the z-axis on this processor. 
+ * @param ucell UnitCell object containing unit cell information. + * @param rcut Pointer to the cutoff radius array. + * @param input_double `double` type data used for calculating psir. + * @param input_int `int` type data used for calculating psir. + * @param num_psir number of atoms on each bigcell. + * @param lgd lgd. + * @param psir_ylm_g one-dimensional array storing psir. + * @param psir_dm_g one-dimensional array storing psir_dm. + * @param dm_matrix_g one-dimensional array storing mat_dm. + * @param mat_alpha alpha values for matrix multiplication. + * @param mat_m numbers of rows in mat_dm. + * @param mat_n numbers of columns in mat_psir. + * @param mat_k numbers of columns in mat_dm, + * which equal to the numbers of rows in mat_psir. + * @param mat_lda leading dimension of mat_dm. + * @param mat_ldb leading dimension of mat_psir. + * @param mat_ldc leading dimension of mat_psir_dm. + * @param mat_A pointers to mat_dm. + * @param mat_B pointers to mat_psir. + * @param mat_C pointers to mat_psir_dm. + * @param max_m maximum value of m. + * @param max_n maximum value of n. + * @param atom_pair_num total count of atom pairs, + * which is also the number of mat mul operations. + * @param rho_g rho. + * @param vec_l pointers to psir_ylm for vec dot product. + * @param vec_r pointers to psir_dm for vec dot product. + * @param dot_product pointers to the result of dot product. + * @param vec_len vector lengths for each dot product. + * @param dot_count total count of dot products. 
+ */
+void gtask_rho(const Grid_Technique& gridt,
+               const int i,
+               const int j,
+               const int max_size,
+               const int nczp,
+               const UnitCell& ucell,
+               const double* rcut,
+               double* input_double,
+               int* input_int,
+               int* num_psir,
+               const int lgd,
+               double* const psir_ylm_g,
+               double* const psir_dm_g,
+               double* const dm_matrix_g,
+               double* mat_alpha,
+               int* mat_m,
+               int* mat_n,
+               int* mat_k,
+               int* mat_lda,
+               int* mat_ldb,
+               int* mat_ldc,
+               double** mat_A,
+               double** mat_B,
+               double** mat_C,
+               int& max_m,
+               int& max_n,
+               int& atom_pair_num,
+               double* rho_g,
+               double** vec_l,
+               double** vec_r,
+               double** dot_product,
+               int* vec_len,
+               int& dot_count);
+
+} // namespace GintKernel
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
new file mode 100644
index 0000000000..a598720c8c
--- /dev/null
+++ b/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
@@ -0,0 +1,332 @@
+#include "kernels/cuda/cuda_tools.cuh"
+#include "kernels/cuda/vbatch_matrix_mul.cuh"
+#include "module_base/ylm.h"
+#include "module_hamilt_lcao/module_gint/gint_rho.h"
+#include "module_hamilt_lcao/module_gint/gint_tools.h"
+#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh"
+
+namespace GintKernel
+{
+
+/**
+ * Gamma-point charge-density grid integration on the GPU.
+ *
+ * A dense lgd x lgd density matrix is assembled from dm on the host and
+ * uploaded. Every (i, j) big-cell column is then assigned round-robin to one
+ * of gridt.nstreams CUDA streams, on which get_psi, the batched matrix
+ * multiply and psir_dot run in order. Afterwards the first nczp * ncx * ncy
+ * entries of gridt.rho_g are copied into rho.
+ *
+ * @param dm          Density matrix stored per atom pair.
+ * @param nczp        Forwarded to gtask_rho; also sizes the copy into rho.
+ * @param ylmcoef_now Not read here; get_psi receives gridt.ylmcoef_g.
+ * @param dr          Passed to the get_psi kernel.
+ * @param rcut        Forwarded to gtask_rho.
+ * @param gridt       Grid description with the per-stream host/device work
+ *                    buffers and the CUDA streams.
+ * @param ucell       Unit cell (atom/type index maps, nwmax).
+ * @param rho         Host output, at least nczp * ncx * ncy doubles.
+ */
+void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
+                        const int nczp,
+                        const double* ylmcoef_now,
+                        const double dr,
+                        const double* rcut,
+                        const Grid_Technique& gridt,
+                        const UnitCell& ucell,
+                        double* rho)
+{
+    const int nbz = gridt.nbzp;
+    const int lgd = gridt.lgd;
+    const int max_size = gridt.max_atom;
+
+    // Sizes are formed in size_t so that lgd * lgd and the grid volume
+    // cannot overflow int.
+    const size_t dm_size = static_cast<size_t>(lgd) * lgd;
+    const size_t dm_bytes = dm_size * sizeof(double);
+    const size_t rho_bytes
+        = static_cast<size_t>(nczp) * gridt.ncx * gridt.ncy * sizeof(double);
+
+    checkCuda(cudaMemset(gridt.rho_g, 0, gridt.ncxyz * sizeof(double)));
+
+    // Assemble the dense density matrix on the host. The trailing "()"
+    // value-initialises the array, so entries of absent pairs stay zero.
+    double* dm_matrix_h = new double[dm_size]();
+    for (int iat1 = 0; iat1 < ucell.nat; iat1++)
+    {
+        const int it1 = ucell.iat2it[iat1];
+        const int lo1
+            = gridt.trace_lo[ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0)];
+        for (int iat2 = 0; iat2 < ucell.nat; iat2++)
+        {
+            hamilt::AtomPair<double>* tmp_ap = dm->find_pair(iat1, iat2);
+            if (tmp_ap == nullptr)
+            {
+                continue;
+            }
+            const int it2 = ucell.iat2it[iat2];
+            const int lo2
+                = gridt.trace_lo[ucell.itiaiw2iwt(it2, ucell.iat2ia[iat2], 0)];
+
+            // The pair block is consumed in row-major order.
+            const int nrow = tmp_ap->get_row_size();
+            const int ncol = tmp_ap->get_col_size();
+            const double* ap_data = tmp_ap->get_pointer(0);
+            for (int orb_i = 0; orb_i < nrow; orb_i++)
+            {
+                double* dst_row
+                    = dm_matrix_h + static_cast<size_t>(lo1 + orb_i) * lgd + lo2;
+                for (int orb_j = 0; orb_j < ncol; orb_j++)
+                {
+                    dst_row[orb_j] = ap_data[orb_i * ncol + orb_j];
+                }
+            }
+        }
+    }
+
+    // Upload the density matrix with a blocking copy.
+    double* dm_matrix_g = nullptr;
+    checkCuda(cudaMalloc((void**)&dm_matrix_g, dm_bytes));
+    checkCuda(
+        cudaMemcpy(dm_matrix_g, dm_matrix_h, dm_bytes, cudaMemcpyHostToDevice));
+
+    for (int i = 0; i < gridt.nstreams; i++)
+    {
+        checkCuda(cudaStreamSynchronize(gridt.streams[i]));
+    }
+
+    // Process every (i, j) column of big cells.
+    int iter_num = 0;
+    for (int i = 0; i < gridt.nbx; i++)
+    {
+        for (int j = 0; j < gridt.nby; j++)
+        {
+            const int stream_num = iter_num % gridt.nstreams;
+            const cudaStream_t stream = gridt.streams[stream_num];
+
+            // Start of this stream's slice in each pooled buffer.
+            const auto psi_off = gridt.psi_size_max * stream_num;
+            const auto ap_off = gridt.atom_pair_nbz * stream_num;
+            const auto psir_off = gridt.psir_size * stream_num;
+            const auto mcell_off = gridt.num_mcell * stream_num;
+
+            // Inputs of get_psi. The suffix "_g" marks device memory;
+            // everything else lives on the host.
+            double* input_double = &gridt.psi_dbl_gbl[psi_off * 5];
+            int* input_int = &gridt.psi_int_gbl[psi_off * 2];
+            double* input_double_g = &gridt.psi_dbl_gbl_g[psi_off * 5];
+            int* input_int_g = &gridt.psi_int_gbl_g[psi_off * 2];
+
+            // num_psir represents the number of atoms in each bigcell.
+            int* num_psir = &gridt.num_psir_gbl[nbz * stream_num];
+            int* num_psir_g = &gridt.num_psir_gbl_g[nbz * stream_num];
+
+            // coefficient alpha in alpha * mat_DM * mat_psir
+            double* ap_alpha = &gridt.alpha_global[ap_off];
+            double* ap_alpha_g = &gridt.alpha_global_g[ap_off];
+
+            // m, n, k, lda, ldb, ldc of every batched multiplication
+            int* atom_pair_A_m = &gridt.l_info_global[ap_off];
+            int* atom_pair_B_n = &gridt.r_info_global[ap_off];
+            int* atom_pair_k = &gridt.k_info_global[ap_off];
+            int* atom_pair_lda = &gridt.lda_info_global[ap_off];
+            int* atom_pair_ldb = &gridt.ldb_info_global[ap_off];
+            int* atom_pair_ldc = &gridt.ldc_info_global[ap_off];
+
+            int* atom_pair_A_m_g = &gridt.l_info_global_g[ap_off];
+            int* atom_pair_B_n_g = &gridt.r_info_global_g[ap_off];
+            int* atom_pair_k_g = &gridt.k_info_global_g[ap_off];
+            int* atom_pair_lda_g = &gridt.lda_info_gbl_g[ap_off];
+            int* atom_pair_ldb_g = &gridt.ldb_info_gbl_g[ap_off];
+            int* atom_pair_ldc_g = &gridt.ldc_info_gbl_g[ap_off];
+
+            // operand pointer tables A, B, C of the multiplication
+            double** matrix_A = &gridt.ap_left_gbl[ap_off];
+            double** matrix_B = &gridt.ap_right_gbl[ap_off];
+            double** matrix_C = &gridt.ap_output_gbl[ap_off];
+
+            double** matrix_A_g = &gridt.ap_left_gbl_g[ap_off];
+            double** matrix_B_g = &gridt.ap_right_gbl_g[ap_off];
+            double** matrix_C_g = &gridt.ap_output_gbl_g[ap_off];
+
+            // psir_ylm_left_g holds the psi values, psir_r_g holds psir_dm,
+            // the product mat_DM * mat_psir.
+            double* psir_ylm_left_g = &gridt.left_global_g[psir_off];
+            double* psir_r_g = &gridt.right_global_g[psir_off];
+            double* rho_g = gridt.rho_g;
+
+            // bookkeeping for the dot products psir * psir_dm
+            int dot_count = 0;
+            int* vec_len = &gridt.vec_len[mcell_off];
+            double** vec_l = &gridt.vec_l[mcell_off];
+            double** vec_r = &gridt.vec_r[mcell_off];
+            double** dot_product = &gridt.dot_product[mcell_off];
+
+            int* vec_len_g = &gridt.vec_len_g[mcell_off];
+            double** vec_l_g = &gridt.vec_l_g[mcell_off];
+            double** vec_r_g = &gridt.vec_r_g[mcell_off];
+            double** dot_product_g = &gridt.dot_product_g[mcell_off];
+
+            int max_m = 0;
+            int max_n = 0;
+            int atom_pair_num = 0;
+
+            // The host staging buffers are reused per stream: wait until
+            // the previous work on this stream has finished with them.
+            checkCuda(cudaStreamSynchronize(stream));
+
+            // Fill the host-side task description: psir inputs, matrix
+            // multiplications and dot products.
+            gtask_rho(gridt,
+                      i,
+                      j,
+                      max_size,
+                      nczp,
+                      ucell,
+                      rcut,
+                      input_double,
+                      input_int,
+                      num_psir,
+                      lgd,
+                      psir_ylm_left_g,
+                      psir_r_g,
+                      dm_matrix_g,
+                      ap_alpha,
+                      atom_pair_A_m,
+                      atom_pair_B_n,
+                      atom_pair_k,
+                      atom_pair_lda,
+                      atom_pair_ldb,
+                      atom_pair_ldc,
+                      matrix_A,
+                      matrix_B,
+                      matrix_C,
+                      max_m,
+                      max_n,
+                      atom_pair_num,
+                      rho_g,
+                      vec_l,
+                      vec_r,
+                      dot_product,
+                      vec_len,
+                      dot_count);
+
+            // Stage the task description on the device.
+            auto copy_h2d
+                = [stream](void* dst, const void* src, const size_t bytes) {
+                      checkCuda(cudaMemcpyAsync(dst,
+                                                src,
+                                                bytes,
+                                                cudaMemcpyHostToDevice,
+                                                stream));
+                  };
+            const size_t ap_int_bytes = gridt.atom_pair_nbz * sizeof(int);
+            const size_t ap_ptr_bytes = gridt.atom_pair_nbz * sizeof(double*);
+            const size_t mcell_ptr_bytes = gridt.num_mcell * sizeof(double*);
+
+            copy_h2d(input_double_g,
+                     input_double,
+                     gridt.psi_size_max * 5 * sizeof(double));
+            copy_h2d(input_int_g,
+                     input_int,
+                     gridt.psi_size_max * 2 * sizeof(int));
+            copy_h2d(num_psir_g, num_psir, nbz * sizeof(int));
+
+            copy_h2d(ap_alpha_g,
+                     ap_alpha,
+                     gridt.atom_pair_nbz * sizeof(double));
+            copy_h2d(atom_pair_A_m_g, atom_pair_A_m, ap_int_bytes);
+            copy_h2d(atom_pair_B_n_g, atom_pair_B_n, ap_int_bytes);
+            copy_h2d(atom_pair_k_g, atom_pair_k, ap_int_bytes);
+            copy_h2d(atom_pair_lda_g, atom_pair_lda, ap_int_bytes);
+            copy_h2d(atom_pair_ldb_g, atom_pair_ldb, ap_int_bytes);
+            copy_h2d(atom_pair_ldc_g, atom_pair_ldc, ap_int_bytes);
+
+            copy_h2d(matrix_A_g, matrix_A, ap_ptr_bytes);
+            copy_h2d(matrix_B_g, matrix_B, ap_ptr_bytes);
+            copy_h2d(matrix_C_g, matrix_C, ap_ptr_bytes);
+
+            copy_h2d(vec_len_g, vec_len, gridt.num_mcell * sizeof(int));
+            copy_h2d(vec_l_g, vec_l, mcell_ptr_bytes);
+            copy_h2d(vec_r_g, vec_r, mcell_ptr_bytes);
+            copy_h2d(dot_product_g, dot_product, mcell_ptr_bytes);
+
+            checkCuda(cudaMemsetAsync(psir_ylm_left_g,
+                                      0,
+                                      gridt.psir_size * sizeof(double),
+                                      stream));
+            checkCuda(cudaMemsetAsync(psir_r_g,
+                                      0,
+                                      gridt.psir_size * sizeof(double),
+                                      stream));
+
+            // psi values on the grid. The launch runs on this iteration's
+            // stream so it is ordered after the copies queued above.
+            // NOTE(review): dynamic shared memory size 0 is assumed.
+            const dim3 grid_psi(nbz, 8);
+            const dim3 block_psi(64);
+            get_psi<<<grid_psi, block_psi, 0, stream>>>(gridt.ylmcoef_g,
+                                                        dr,
+                                                        gridt.bxyz,
+                                                        ucell.nwmax,
+                                                        input_double_g,
+                                                        input_int_g,
+                                                        num_psir_g,
+                                                        gridt.psi_size_max_z,
+                                                        gridt.atom_nwl_g,
+                                                        gridt.atom_new_g,
+                                                        gridt.atom_ylm_g,
+                                                        gridt.atom_nw_g,
+                                                        gridt.nr_max,
+                                                        gridt.psi_u_g,
+                                                        psir_ylm_left_g);
+            checkCudaLastError();
+
+            // batched alpha * mat_dm * mat_psir
+            gridt.fastest_matrix_mul(max_m,
+                                     max_n,
+                                     atom_pair_A_m_g,
+                                     atom_pair_B_n_g,
+                                     atom_pair_k_g,
+                                     matrix_A_g,
+                                     atom_pair_lda_g,
+                                     matrix_B_g,
+                                     atom_pair_ldb_g,
+                                     matrix_C_g,
+                                     atom_pair_ldc_g,
+                                     atom_pair_num,
+                                     stream,
+                                     ap_alpha_g);
+
+            // dot products psir * psir_dm
+            const dim3 grid_dot(64);
+            const dim3 block_dot(64);
+            const int incx = 1;
+            const int incy = 1;
+            psir_dot<<<grid_dot, block_dot, 0, stream>>>(vec_len_g,
+                                                         vec_l_g,
+                                                         incx,
+                                                         vec_r_g,
+                                                         incy,
+                                                         dot_product_g,
+                                                         dot_count);
+            // A bad launch is only visible through the last-error state.
+            checkCudaLastError();
+
+            iter_num++;
+        }
+    }
+
+    for (int i = 0; i < gridt.nstreams; i++)
+    {
+        checkCuda(cudaStreamSynchronize(gridt.streams[i]));
+    }
+
+    // Copy rho back to the host (blocking).
+    checkCuda(cudaMemcpy(rho, gridt.rho_g, rho_bytes, cudaMemcpyDeviceToHost));
+
+    checkCuda(cudaFree(dm_matrix_g));
+    delete[] dm_matrix_h;
+}
+
+} // namespace GintKernel
diff --git a/source/module_hamilt_lcao/module_gint/gint_vl.h b/source/module_hamilt_lcao/module_gint/gint_vl.h
new file mode 100644
index 0000000000..9d75c41e03
--- /dev/null
+++ b/source/module_hamilt_lcao/module_gint/gint_vl.h
@@ -0,0 +1,56 @@
+#ifndef GINT_VL_H
+#define GINT_VL_H
+#include <cublas_v2.h>
+#include <cuda.h> // for CUDA_VERSION
+#include <cuda_runtime.h>
+
+#include "module_hamilt_lcao/module_gint/gint.h"
+#include "module_hamilt_lcao/module_gint/grid_technique.h"
+
+cudaError_t checkCuda(cudaError_t result);
+
+namespace GintKernel
+{
+
+void gint_gamma_vl_gpu(hamilt::HContainer<double>* hRGint,
+                       const int lgd_now,
+                       const int max_size,
+                       double vfactor,
+                       const double* vlocal,
+                       const double* ylmcoef_now,
+                       const int pwnczp,
+                       const int nbxx,
+                       const double dr,
+                       const double* rcut,
+                       const Grid_Technique& gridt,
+                       const UnitCell& ucell);
+
+void gtask_vlocal(const Grid_Technique& gridt,
+                  const double* rcut,
+                  const UnitCell& ucell,
+                  const int i,
+                  const int j,
+                  const int max_size,
+                  const int nczp,
+                  const double vfactor,
+                  const double* vlocal_global_value,
+                  double* psir_ylm_left,
+                  double* psir_r,
+                  double* input_double,
+                  int* input_int,
+                  int* num_psir,
+                  int* atom_pair_left_info,
+                  int* atom_pair_right_info,
+                  int* atom_pair_lda,
+                  int* atom_pair_ldb,
+                  int* atom_pair_ldc,
+                  double** atom_pair_left_v2,
+                  double** atom_pair_right_v2,
+                  double** atom_pair_output_v2,
+                  int& atom_pair_num,
+                  int& max_m,
+                  int& max_n);
+
+} // namespace GintKernel
+
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu
new file mode 100644
index 0000000000..2c3d5b3922
--- /dev/null
+++ b/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu
@@ -0,0 +1,319 @@
+#include <omp.h>
+
+#include "kernels/cuda/cuda_tools.cuh"
+#include "kernels/cuda/vbatch_matrix_mul.cuh"
+#include "module_base/ylm.h"
+#include "module_hamilt_lcao/module_gint/gint_tools.h"
+#include "module_hamilt_lcao/module_gint/gint_vl.h"
+#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cuh"
+
+namespace GintKernel
+{
+
+/**
+ * Computes the gamma component of the VL (Vlocal) integral on the GPU.
+ *
+ * @param hRGint Pointer to the HContainer object to store the computed
+ * integrals.
+ * @param lgd Dimension information for the computation results. Not read
+ * by this function.
+ * @param max_size The maximum number of neighboring atoms for a grid point.
+ * @param vfactor Related to volume. The scaling factor for the Vlocal
+ * integrals.
+ * @param vlocal Pointer to the Vlocal array.
+ * @param ylmcoef_now Pointer to the Ylm coefficients array. Not read by this
+ * function; the kernel receives gridt.ylmcoef_g.
+ * @param nczp The number of grid layers in the C direction.
+ * @param nbxx The total number of grid points. Not read by this function.
+ * @param dr The grid spacing.
+ * @param rcut Pointer to the cutoff radius array.
+ * @param gridt The Grid_Technique object containing grid information.
+ * @param ucell The UnitCell object containing unit cell information.
+ *
+ * @note The grid integration on the GPU is mainly divided into the following
+ * steps:
+ * 1. Use the CPU to divide the grid integration into subtasks.
+ * 2. Copy the subtask information to the GPU.
+ * 3. Calculate the matrix elements on the GPU.
+ * 4. Perform matrix multiplication on the GPU.
+ * 5. Copy the results back to the host.
+ */
+void gint_gamma_vl_gpu(hamilt::HContainer<double>* hRGint,
+                       const int lgd,
+                       const int max_size,
+                       const double vfactor,
+                       const double* vlocal,
+                       const double* ylmcoef_now,
+                       const int nczp,
+                       const int nbxx,
+                       const double dr,
+                       const double* rcut,
+                       const Grid_Technique& gridt,
+                       const UnitCell& ucell)
+{
+    const int nbz = gridt.nbzp;
+    checkCuda(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+
+    // Give every stored atom pair with lo1 <= lo2 a zeroed device block.
+    // Blocks are allocated on first use and cached in gridt.grid_vlocal_g.
+    {
+        int iter_num = 0;
+        for (int iat1 = 0; iat1 < ucell.nat; iat1++)
+        {
+            const int it1 = ucell.iat2it[iat1];
+            const int lo1
+                = gridt.trace_lo[ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0)];
+            for (int iat2 = 0; iat2 < ucell.nat; iat2++)
+            {
+                const int it2 = ucell.iat2it[iat2];
+                const int lo2 = gridt.trace_lo[ucell.itiaiw2iwt(it2,
+                                                                 ucell.iat2ia[iat2],
+                                                                 0)];
+                if (lo1 > lo2)
+                {
+                    continue;
+                }
+                hamilt::AtomPair<double>* tmp_ap
+                    = hRGint->find_pair(iat1, iat2);
+                if (tmp_ap == nullptr)
+                {
+                    continue;
+                }
+                const cudaStream_t stream
+                    = gridt.streams[iter_num % gridt.nstreams];
+                const int pair_index = iat1 * ucell.nat + iat2;
+                const int atom_pair_nw
+                    = ucell.atoms[it1].nw * ucell.atoms[it2].nw;
+                if (gridt.grid_vlocal_g[pair_index] == nullptr)
+                {
+                    checkCuda(cudaMallocAsync(
+                        (void**)&gridt.grid_vlocal_g[pair_index],
+                        atom_pair_nw * sizeof(double),
+                        stream));
+                }
+                checkCuda(cudaMemsetAsync(gridt.grid_vlocal_g[pair_index],
+                                          0,
+                                          atom_pair_nw * sizeof(double),
+                                          stream));
+                iter_num++;
+            }
+        }
+    }
+    for (int i = 0; i < gridt.nstreams; i++)
+    {
+        checkCuda(cudaStreamSynchronize(gridt.streams[i]));
+    }
+
+    // One OpenMP thread drives one CUDA stream, so the thread id doubles as
+    // the stream id and as the index of the per-stream work buffers.
+#pragma omp parallel for num_threads(gridt.nstreams) collapse(2)
+    for (int i = 0; i < gridt.nbx; i++)
+    {
+        for (int j = 0; j < gridt.nby; j++)
+        {
+            const int stream_num = omp_get_thread_num();
+            const cudaStream_t stream = gridt.streams[stream_num];
+            // The host staging buffers of this stream are about to be
+            // overwritten: wait for the work queued by the previous task.
+            checkCuda(cudaStreamSynchronize(stream));
+
+            // Start of this stream's slice in each pooled buffer.
+            const auto psi_off = gridt.psi_size_max * stream_num;
+            const auto ap_off = gridt.atom_pair_nbz * stream_num;
+            const auto psir_off = gridt.psir_size * stream_num;
+
+            // host-side task description
+            double* input_double = &gridt.psi_dbl_gbl[psi_off * 5];
+            int* input_int = &gridt.psi_int_gbl[psi_off * 2];
+            int* num_psir = &gridt.num_psir_gbl[nbz * stream_num];
+            int* atom_pair_A_m = &gridt.l_info_global[ap_off];
+            int* atom_pair_B_n = &gridt.r_info_global[ap_off];
+            int* atom_pair_k = &gridt.k_info_global[ap_off];
+            int* atom_pair_lda = &gridt.lda_info_global[ap_off];
+            int* atom_pair_ldb = &gridt.ldb_info_global[ap_off];
+            int* atom_pair_ldc = &gridt.ldc_info_global[ap_off];
+            double** matrix_A = &gridt.ap_left_gbl[ap_off];
+            double** matrix_B = &gridt.ap_right_gbl[ap_off];
+            double** matrix_C = &gridt.ap_output_gbl[ap_off];
+
+            // device-side mirrors (suffix "_g")
+            double* input_double_g = &gridt.psi_dbl_gbl_g[psi_off * 5];
+            int* input_int_g = &gridt.psi_int_gbl_g[psi_off * 2];
+            int* num_psir_g = &gridt.num_psir_gbl_g[nbz * stream_num];
+            double* psir_ylm_left_g = &gridt.left_global_g[psir_off];
+            double* psir_r_g = &gridt.right_global_g[psir_off];
+            int* atom_pair_A_m_g = &gridt.l_info_global_g[ap_off];
+            int* atom_pair_B_n_g = &gridt.r_info_global_g[ap_off];
+            int* atom_pair_k_g = &gridt.k_info_global_g[ap_off];
+            int* atom_pair_lda_g = &gridt.lda_info_gbl_g[ap_off];
+            int* atom_pair_ldb_g = &gridt.ldb_info_gbl_g[ap_off];
+            int* atom_pair_ldc_g = &gridt.ldc_info_gbl_g[ap_off];
+            double** matrix_A_g = &gridt.ap_left_gbl_g[ap_off];
+            double** matrix_B_g = &gridt.ap_right_gbl_g[ap_off];
+            double** matrix_C_g = &gridt.ap_output_gbl_g[ap_off];
+
+            int atom_pair_num = 0;
+            int max_m = 0;
+            int max_n = 0;
+
+            gtask_vlocal(gridt,
+                         rcut,
+                         ucell,
+                         i,
+                         j,
+                         max_size,
+                         nczp,
+                         vfactor,
+                         vlocal,
+                         psir_ylm_left_g,
+                         psir_r_g,
+                         input_double,
+                         input_int,
+                         num_psir,
+                         atom_pair_A_m,
+                         atom_pair_B_n,
+                         atom_pair_lda,
+                         atom_pair_ldb,
+                         atom_pair_ldc,
+                         matrix_A,
+                         matrix_B,
+                         matrix_C,
+                         atom_pair_num,
+                         max_m,
+                         max_n);
+
+            // gtask_vlocal does not receive the k array; every product uses
+            // k = bxyz.
+            for (int z = 0; z < gridt.atom_pair_nbz; z++)
+            {
+                atom_pair_k[z] = gridt.bxyz;
+            }
+
+            // Stage the task description on the device.
+            auto copy_h2d
+                = [stream](void* dst, const void* src, const size_t bytes) {
+                      checkCuda(cudaMemcpyAsync(dst,
+                                                src,
+                                                bytes,
+                                                cudaMemcpyHostToDevice,
+                                                stream));
+                  };
+            const size_t ap_int_bytes = gridt.atom_pair_nbz * sizeof(int);
+            const size_t ap_ptr_bytes = gridt.atom_pair_nbz * sizeof(double*);
+
+            copy_h2d(input_double_g,
+                     input_double,
+                     gridt.psi_size_max * 5 * sizeof(double));
+            copy_h2d(input_int_g,
+                     input_int,
+                     gridt.psi_size_max * 2 * sizeof(int));
+            copy_h2d(num_psir_g, num_psir, nbz * sizeof(int));
+
+            copy_h2d(atom_pair_A_m_g, atom_pair_A_m, ap_int_bytes);
+            copy_h2d(atom_pair_B_n_g, atom_pair_B_n, ap_int_bytes);
+            copy_h2d(atom_pair_k_g, atom_pair_k, ap_int_bytes);
+            copy_h2d(atom_pair_lda_g, atom_pair_lda, ap_int_bytes);
+            copy_h2d(atom_pair_ldb_g, atom_pair_ldb, ap_int_bytes);
+            copy_h2d(atom_pair_ldc_g, atom_pair_ldc, ap_int_bytes);
+
+            copy_h2d(matrix_A_g, matrix_A, ap_ptr_bytes);
+            copy_h2d(matrix_B_g, matrix_B, ap_ptr_bytes);
+            copy_h2d(matrix_C_g, matrix_C, ap_ptr_bytes);
+
+            checkCuda(cudaMemsetAsync(psir_ylm_left_g,
+                                      0,
+                                      gridt.psir_size * sizeof(double),
+                                      stream));
+            checkCuda(cudaMemsetAsync(psir_r_g,
+                                      0,
+                                      gridt.psir_size * sizeof(double),
+                                      stream));
+
+            // Launched on this task's stream so it is ordered after the
+            // copies queued above.
+            // NOTE(review): dynamic shared memory size 0 is assumed.
+            const dim3 grid_psi(nbz, 8);
+            const dim3 block_psi(64);
+            get_psi_and_vldr3<<<grid_psi, block_psi, 0, stream>>>(
+                gridt.ylmcoef_g,
+                dr,
+                gridt.bxyz,
+                ucell.nwmax,
+                input_double_g,
+                input_int_g,
+                num_psir_g,
+                gridt.psi_size_max_z,
+                gridt.atom_nwl_g,
+                gridt.atom_new_g,
+                gridt.atom_ylm_g,
+                gridt.atom_nw_g,
+                gridt.nr_max,
+                gridt.psi_u_g,
+                psir_ylm_left_g,
+                psir_r_g);
+            checkCudaLastError();
+
+            gridt.fastest_matrix_mul(max_m,
+                                     max_n,
+                                     atom_pair_A_m_g,
+                                     atom_pair_B_n_g,
+                                     atom_pair_k_g,
+                                     matrix_A_g,
+                                     atom_pair_lda_g,
+                                     matrix_B_g,
+                                     atom_pair_ldb_g,
+                                     matrix_C_g,
+                                     atom_pair_ldc_g,
+                                     atom_pair_num,
+                                     stream,
+                                     nullptr);
+        }
+    }
+    for (int i = 0; i < gridt.nstreams; i++)
+    {
+        checkCuda(cudaStreamSynchronize(gridt.streams[i]));
+    }
+
+    // Copy every accumulated pair block back into hRGint.
+    {
+        int iter_num = 0;
+        for (int iat1 = 0; iat1 < ucell.nat; iat1++)
+        {
+            const int it1 = ucell.iat2it[iat1];
+            const int lo1
+                = gridt.trace_lo[ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0)];
+            for (int iat2 = 0; iat2 < ucell.nat; iat2++)
+            {
+                const int it2 = ucell.iat2it[iat2];
+                const int lo2 = gridt.trace_lo[ucell.itiaiw2iwt(it2,
+                                                                 ucell.iat2ia[iat2],
+                                                                 0)];
+                if (lo1 > lo2)
+                {
+                    continue;
+                }
+                hamilt::AtomPair<double>* tmp_ap
+                    = hRGint->find_pair(iat1, iat2);
+                if (tmp_ap == nullptr)
+                {
+                    continue;
+                }
+                const int atom_pair_nw
+                    = ucell.atoms[it1].nw * ucell.atoms[it2].nw;
+                checkCuda(cudaMemcpyAsync(
+                    tmp_ap->get_pointer(0),
+                    gridt.grid_vlocal_g[iat1 * ucell.nat + iat2],
+                    atom_pair_nw * sizeof(double),
+                    cudaMemcpyDeviceToHost,
+                    gridt.streams[iter_num % gridt.nstreams]));
+                iter_num++;
+            }
+        }
+    }
+    for (int i = 0; i < gridt.nstreams; i++)
+    {
+        checkCuda(cudaStreamSynchronize(gridt.streams[i]));
+    }
+}
+
+} // namespace GintKernel
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_gint/grid_technique.cpp b/source/module_hamilt_lcao/module_gint/grid_technique.cpp
index f0ad2eaee2..691650aa63 100644
--- a/source/module_hamilt_lcao/module_gint/grid_technique.cpp
+++ b/source/module_hamilt_lcao/module_gint/grid_technique.cpp
@@ -4,26 +4,30 @@
 #include "module_base/parallel_reduce.h"
 #include "module_base/timer.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
-
+#include "module_hsolver/kernels/cuda/helper_cuda.h"
 Grid_Technique::Grid_Technique()
 {
-    this->nlocdimg = nullptr;
-    this->nlocstartg = nullptr;
-    this->nad = nullptr;
+    this->nlocdimg = nullptr;
+    this->nlocstartg = nullptr;
+    this->nad = nullptr;
     this->how_many_atoms = nullptr;
-    this->start_ind = nullptr;
-    this->which_atom = nullptr;
-    this->which_bigcell = nullptr;
-    this->which_unitcell = nullptr;
- 
this->bcell_start = nullptr; - this->in_this_processor = nullptr; - this->trace_lo = nullptr; - - this->total_atoms_on_grid = 0; + this->start_ind = nullptr; + this->which_atom = nullptr; + this->which_bigcell = nullptr; + this->which_unitcell = nullptr; + this->bcell_start = nullptr; + this->in_this_processor = nullptr; + this->trace_lo = nullptr; + this->total_atoms_on_grid = 0; allocate_find_R2 = false; +#if ((defined __CUDA) /* || (defined __ROCM) */) + if(GlobalV::device_flag == "gpu") + { + is_malloced = false; + } +#endif } - Grid_Technique::~Grid_Technique() { if(nlocdimg!=nullptr) @@ -82,103 +86,125 @@ Grid_Technique::~Grid_Technique() } if (allocate_find_R2) - { - for(int iat=0; iatset_grid_dim( - ncx_in,ncy_in,ncz_in, - bx_in,by_in,bz_in, - nbx_in,nby_in,nbz_in, - nbxx_in,nbzp_start_in,nbzp_in); + // (1) init_meshcell cell and big cell. + this->set_grid_dim(ncx_in, + ncy_in, + ncz_in, + bx_in, + by_in, + bz_in, + nbx_in, + nby_in, + nbz_in, + nbxx_in, + nbzp_start_in, + nbzp_in); - this->init_latvec(); + this->init_latvec(); - this->init_big_latvec(); + this->init_big_latvec(); - this->init_meshcell_pos(); + this->init_meshcell_pos(); - // (2) expand the grid - this->init_grid_expansion(); + // (2) expand the grid + this->init_grid_expansion(); - // (3) calculate the extended grid. - this->cal_extended_cell(this->dxe, this->dye, this->dze); + // (3) calculate the extended grid. 
+ this->cal_extended_cell(this->dxe, this->dye, this->dze); - this->init_tau_in_bigcell(); + this->init_tau_in_bigcell(); - // init meshball - this->delete_meshball_positions(); //LiuXh add 2018-12-14 + // init meshball + this->delete_meshball_positions(); // LiuXh add 2018-12-14 - this->init_meshball(); + this->init_meshball(); - this->init_atoms_on_grid(ny, nplane, startz_current); + this->init_atoms_on_grid(ny, nplane, startz_current); - this->cal_trace_lo(); + this->cal_trace_lo(); +#if ((defined __CUDA) /* || (defined __ROCM) */) + if(GlobalV::device_flag == "gpu") + { + this->init_gpu_gint_variables(); + } +#endif - ModuleBase::timer::tick("Grid_Technique","init"); - return; + ModuleBase::timer::tick("Grid_Technique", "init"); + return; } -void Grid_Technique::get_startind(const int& ny, const int& nplane, const int& startz_current) +void Grid_Technique::get_startind(const int& ny, + const int& nplane, + const int& startz_current) { - ModuleBase::TITLE("Grid_Technique","get_startind"); - - assert(nbxx>=0); - // calculates start_ind, which stores the - // starting index of each bigcell - - delete[] this->start_ind; - if(nbxx > 0) - { - this->start_ind = new int[nbxx]; - ModuleBase::Memory::record("GT::start_ind", sizeof(int) * nbxx); - ModuleBase::GlobalFunc::ZEROS(start_ind, nbxx); - } - else - { - this->start_ind = nullptr; - return; - } + ModuleBase::TITLE("Grid_Technique", "get_startind"); + + assert(nbxx >= 0); + // calculates start_ind, which stores the + // starting index of each bigcell + + delete[] this->start_ind; + if (nbxx > 0) + { + this->start_ind = new int[nbxx]; + ModuleBase::Memory::record("GT::start_ind", sizeof(int) * nbxx); + ModuleBase::GlobalFunc::ZEROS(start_ind, nbxx); + } + else + { + this->start_ind = nullptr; + return; + } for(int i=0;ibx; - iy = iby * this->by; - iz = (ibz + nbzp_start) * this->bz - startz_current; + ibx = i / (nby * nbzp); + iby = (i - ibx * nby * nbzp) / nbzp; + ibz = i % nbzp; - int ind = iz + iy * nplane + ix * 
ny*nplane; - - start_ind[i] = ind; - } + ix = ibx * this->bx; + iy = iby * this->by; + iz = (ibz + nbzp_start) * this->bz - startz_current; - return; + int ind = iz + iy * nplane + ix * ny * nplane; + + start_ind[i] = ind; + } + + return; } // PLEASE update this 'init_atoms_on_grid' to make // it adapted to 'cuboid' shape of grid // mohan add 2021-04-06 -void Grid_Technique::init_atoms_on_grid(const int& ny, const int& nplane, const int& startz_current) +void Grid_Technique::init_atoms_on_grid(const int& ny, + const int& nplane, + const int& startz_current) { - ModuleBase::TITLE("Grid_Technique","init_atoms_on_grid"); - - assert(nbxx>=0); - this->get_startind(ny, nplane, startz_current); - - // (1) prepare data. - // counting the number of atoms whose orbitals have - // values on the bigcell. - delete[] this->how_many_atoms; - if(nbxx > 0) - { - this->how_many_atoms = new int[nbxx]; - ModuleBase::Memory::record("GT::how_many_atoms", sizeof(int) * nbxx); - ModuleBase::GlobalFunc::ZEROS(how_many_atoms, nbxx); - } - else - { - this->how_many_atoms = nullptr; - } - - // (2) information about gloabl grid - // and local grid. - // mohan add 2010-07-02 - int *ind_bigcell; - bool *bigcell_on_processor; // normal local form. - this->check_bigcell(ind_bigcell, bigcell_on_processor); - - // (3) Find the atoms using - // when doing grid integration. - delete[] in_this_processor; - this->in_this_processor = new bool[GlobalC::ucell.nat]; - for(int i=0; inxyze > 0); - int* index2normal = new int[this->nxyze]; - assert( index2normal != NULL ); - ModuleBase::Memory::record("GT::index2normal", sizeof(int) * this->nxyze); - this->grid_expansion_index(1,index2normal); + ModuleBase::TITLE("Grid_Technique", "init_atoms_on_grid"); + + assert(nbxx >= 0); + this->get_startind(ny, nplane, startz_current); + + // (1) prepare data. + // counting the number of atoms whose orbitals have + // values on the bigcell. 
+ delete[] this->how_many_atoms; + if (nbxx > 0) + { + this->how_many_atoms = new int[nbxx]; + ModuleBase::Memory::record("GT::how_many_atoms", sizeof(int) * nbxx); + ModuleBase::GlobalFunc::ZEROS(how_many_atoms, nbxx); + } + else + { + this->how_many_atoms = nullptr; + } + + // (2) information about gloabl grid + // and local grid. + // mohan add 2010-07-02 + int* ind_bigcell; + bool* bigcell_on_processor; // normal local form. + this->check_bigcell(ind_bigcell, bigcell_on_processor); + + // (3) Find the atoms using + // when doing grid integration. + delete[] in_this_processor; + this->in_this_processor = new bool[GlobalC::ucell.nat]; + for (int i = 0; i < GlobalC::ucell.nat; i++) + { + in_this_processor[i] = false; + } + + // init atoms on grid + assert(this->nxyze > 0); + int* index2normal = new int[this->nxyze]; + assert(index2normal != NULL); + ModuleBase::Memory::record("GT::index2normal", sizeof(int) * this->nxyze); + this->grid_expansion_index(1, index2normal); // (5) record how many atoms on // each local grid point (ix,iy,iz) @@ -270,75 +298,84 @@ void Grid_Technique::init_atoms_on_grid(const int& ny, const int& nplane, const // ball[im]: relative position of adjacent bcell. 
normal = index2normal[ this->index_atom[iat] + this->index_ball[im] ]; - if(normal >= nbxyz) - { - std::cout << " index_atom=" << index_atom[iat] << std::endl; - std::cout << " index_ball=" << index_ball[im] << std::endl; - std::cout << " normal=" << normal << std::endl; - std::cout << " nbxyz=" << nbxyz << std::endl; - ModuleBase::WARNING_QUIT("Grid_Technique::init_atoms_on_grid","normal >= nbxyz"); - } - - assert(normal>=0); - - int f = ind_bigcell[normal]; - if(!bigcell_on_processor[normal]) continue; - - ++how_many_atoms[f]; - ++total_atoms_on_grid; - - this->in_this_processor[iat] = true; - } - if(this->in_this_processor[iat]) ++nat_local; - ++iat; - } - } - - delete[] ind_bigcell; - delete[] bigcell_on_processor; - - if(GlobalV::test_gridt)ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running,"Total_atoms_on_grid",total_atoms_on_grid); - - int stop = 0; - if(total_atoms_on_grid == 0) - { - GlobalV::ofs_running << " No atoms on this sub-FFT-mesh." << std::endl; - stop = 1; - } + if (normal >= nbxyz) + { + std::cout << " index_atom=" << index_atom[iat] << std::endl; + std::cout << " index_ball=" << index_ball[im] << std::endl; + std::cout << " normal=" << normal << std::endl; + std::cout << " nbxyz=" << nbxyz << std::endl; + ModuleBase::WARNING_QUIT( + "Grid_Technique::init_atoms_on_grid", + "normal >= nbxyz"); + } + + assert(normal >= 0); + + int f = ind_bigcell[normal]; + if (!bigcell_on_processor[normal]) + continue; + + ++how_many_atoms[f]; + ++total_atoms_on_grid; + + this->in_this_processor[iat] = true; + } + if (this->in_this_processor[iat]) + ++nat_local; + ++iat; + } + } + + delete[] ind_bigcell; + delete[] bigcell_on_processor; + + if (GlobalV::test_gridt) + ModuleBase::GlobalFunc::OUT(GlobalV::ofs_running, + "Total_atoms_on_grid", + total_atoms_on_grid); + + int stop = 0; + if (total_atoms_on_grid == 0) + { + GlobalV::ofs_running << " No atoms on this sub-FFT-mesh." 
<< std::endl; + stop = 1; + } Parallel_Reduce::reduce_all(stop); - if(stop) - { - ModuleBase::WARNING("Grid_Technique::init_atoms_on_grid","No atom on this sub-FFT-mesh."); - } - - // calculate the trach of local ia to global iat - if(nat_local>0) - { - this->trace_iat.resize(nat_local); - for(int iat=GlobalC::ucell.nat-1; iat>=0;iat--) - { - if(this->in_this_processor[iat]) - { - this->trace_iat[--nat_local] = iat; - } - } - } - - // need how_many_atoms first. - this->cal_grid_integration_index(); - // bcell_start is needed. - this->init_atoms_on_grid2(index2normal); - delete[] index2normal; - return; + if (stop) + { + ModuleBase::WARNING("Grid_Technique::init_atoms_on_grid", + "No atom on this sub-FFT-mesh."); + } + + // calculate the trach of local ia to global iat + if (nat_local > 0) + { + this->trace_iat.resize(nat_local); + for (int iat = GlobalC::ucell.nat - 1; iat >= 0; iat--) + { + if (this->in_this_processor[iat]) + { + this->trace_iat[--nat_local] = iat; + } + } + } + + // need how_many_atoms first. + this->cal_grid_integration_index(); + // bcell_start is needed. 
+ this->init_atoms_on_grid2(index2normal); + delete[] index2normal; + return; } -void Grid_Technique::check_bigcell(int* &ind_bigcell, bool* &bigcell_on_processor) +void Grid_Technique::check_bigcell(int*& ind_bigcell, + bool*& bigcell_on_processor) { - //check if a given bigcell is treated on this processor - const int zstart = nbzp_start; - const int zend = nbzp + zstart; - const int nbyz = nby * nbz; - const int nz = nbzp; + // check if a given bigcell is treated on this processor + const int zstart = nbzp_start; + const int zend = nbzp + zstart; + const int nbyz = nby * nbz; + const int nz = nbzp; int iz_now=0; int ix=0; @@ -347,41 +384,42 @@ void Grid_Technique::check_bigcell(int* &ind_bigcell, bool* &bigcell_on_processo int ind=0; bool flag=false; - ind_bigcell = new int[nbxyz]; - bigcell_on_processor=new bool[nbxyz]; - for(int i=0;i=zend) - { - flag=false; - } - else - { - flag=true; - ix = i / nbyz; - iy = ( i - ix * nbyz ) / nbz; - iz = iz_now - zstart; - ind = ix * nby * nz + iy * nz + iz; - //no need to calculate index if bigcell is - //not on this processor - } - - ind_bigcell[i]=ind; - bigcell_on_processor[i]=flag; - } - return; + ind_bigcell = new int[nbxyz]; + bigcell_on_processor = new bool[nbxyz]; + for (int i = 0; i < nbxyz; i++) + { + int iz_now = i % nbz; + if (iz_now < zstart || iz_now >= zend) + { + flag = false; + } + else + { + flag = true; + ix = i / nbyz; + iy = (i - ix * nbyz) / nbz; + iz = iz_now - zstart; + ind = ix * nby * nz + iy * nz + iz; + // no need to calculate index if bigcell is + // not on this processor + } + + ind_bigcell[i] = ind; + bigcell_on_processor[i] = flag; + } + return; } void Grid_Technique::init_atoms_on_grid2(const int* index2normal) -{ - ModuleBase::TITLE("Grid_Techinique","init_atoms_on_grid2"); +{ + ModuleBase::TITLE("Grid_Techinique", "init_atoms_on_grid2"); - if(total_atoms_on_grid==0) - { - ModuleBase::WARNING("Grid_Technique::init_atoms_on_grid2","no atom on this sub FFT grid."); - return; - } + if 
(total_atoms_on_grid == 0) + { + ModuleBase::WARNING("Grid_Technique::init_atoms_on_grid2", + "no atom on this sub FFT grid."); + return; + } int* index2ucell = new int[this->nxyze]; assert( index2ucell != NULL ); @@ -392,18 +430,20 @@ void Grid_Technique::init_atoms_on_grid2(const int* index2normal) bool *bigcell_on_processor; // normal local form. this->check_bigcell(ind_bigcell, bigcell_on_processor); - //-------------------------------------- - // save which atom is in the bigcell. - //-------------------------------------- - delete[] which_atom; - this->which_atom = new int[total_atoms_on_grid]; - assert( which_atom != 0); - ModuleBase::Memory::record("GT::which_atom", sizeof(int) * total_atoms_on_grid); - - delete[] which_bigcell; - this->which_bigcell = new int[total_atoms_on_grid]; - assert( which_bigcell != 0); - ModuleBase::Memory::record("GT::which_bigcell", sizeof(int) * total_atoms_on_grid); + //-------------------------------------- + // save which atom is in the bigcell. + //-------------------------------------- + delete[] which_atom; + this->which_atom = new int[total_atoms_on_grid]; + assert(which_atom != 0); + ModuleBase::Memory::record("GT::which_atom", + sizeof(int) * total_atoms_on_grid); + + delete[] which_bigcell; + this->which_bigcell = new int[total_atoms_on_grid]; + assert(which_bigcell != 0); + ModuleBase::Memory::record("GT::which_bigcell", + sizeof(int) * total_atoms_on_grid); delete[] which_unitcell; this->which_unitcell = new int[total_atoms_on_grid]; @@ -465,43 +505,46 @@ void Grid_Technique::init_atoms_on_grid2(const int* index2normal) void Grid_Technique::cal_grid_integration_index(void) { - // save the start - delete[] this->bcell_start; - if(nbxx > 0) - { - this->bcell_start = new int[nbxx]; - ModuleBase::Memory::record("GT::bcell_start", sizeof(int) * nbxx); - this->bcell_start[0] = 0; - for(int i=1; ibcell_start[i] = this->bcell_start[i-1] + this->how_many_atoms[i-1]; - } - } - else - { - this->bcell_start = nullptr; - } - // 
calculate which grid has the largest number of atoms, - // and how many atoms. - this->max_atom = 0; - for(int i=0; imax_atom = std::max( this->max_atom, this->how_many_atoms[i]); - } + // save the start + delete[] this->bcell_start; + if (nbxx > 0) + { + this->bcell_start = new int[nbxx]; + ModuleBase::Memory::record("GT::bcell_start", sizeof(int) * nbxx); + this->bcell_start[0] = 0; + for (int i = 1; i < nbxx; i++) + { + this->bcell_start[i] + = this->bcell_start[i - 1] + this->how_many_atoms[i - 1]; + } + } + else + { + this->bcell_start = nullptr; + } + // calculate which grid has the largest number of atoms, + // and how many atoms. + this->max_atom = 0; + for (int i = 0; i < nbxx; i++) + { + this->max_atom = std::max(this->max_atom, this->how_many_atoms[i]); + } #ifdef __MPI - int* all = new int[GlobalV::NPROC]; - ModuleBase::GlobalFunc::ZEROS(all, GlobalV::NPROC); - Parallel_Reduce::gather_int_all(max_atom,all); - if(GlobalV::MY_RANK==0) - { - GlobalV::ofs_warning << std::setw(15) << "Processor" << std::setw(15) << "Atom" << std::endl; - for(int i=0; itrace_lo = new int[GlobalV::NLOCAL]; - for(int i=0; itrace_lo[i] = -1; - } - ModuleBase::Memory::record("GT::trace_lo", sizeof(int) * GlobalV::NLOCAL); - - this->lnat = 0; - this->lgd = 0; - int iat = 0; - int iw_all=0; - int iw_local=0; +{ + ModuleBase::TITLE("Grid_Technique", "cal_trace_lo"); + // save the atom information in trace_lo, + // in fact the trace_lo dimension can be reduced + // to GlobalC::ucell.nat, but I think this is another way. 
+ delete[] trace_lo; + this->trace_lo = new int[GlobalV::NLOCAL]; + for (int i = 0; i < GlobalV::NLOCAL; i++) + { + this->trace_lo[i] = -1; + } + ModuleBase::Memory::record("GT::trace_lo", sizeof(int) * GlobalV::NLOCAL); + + this->lnat = 0; + this->lgd = 0; + int iat = 0; + int iw_all = 0; + int iw_local = 0; for(int it=0; it max_cut) + { + max_cut = GlobalC::ORB.Phi[i].getRcut(); + } + } + + int atom_nw_now[GlobalC::ucell.ntype]; + int ucell_atom_nwl_now[GlobalC::ucell.ntype]; + for (int i = 0; i < GlobalC::ucell.ntype; i++) + { + atom_nw_now[i] = GlobalC::ucell.atoms[i].nw; + ucell_atom_nwl_now[i] = GlobalC::ucell.atoms[i].nwl; + } + + nr_max = static_cast(1000 * max_cut) + 10; + // double psi_u_now[GlobalC::ucell.ntype * GlobalC::ucell.nwmax * nr_max * + // 2]; + double* psi_u_now + = (double*)malloc(GlobalC::ucell.ntype * GlobalC::ucell.nwmax * nr_max + * 2 * sizeof(double)); + memset(psi_u_now, + 0, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * nr_max * 2 + * sizeof(double)); + bool* atom_iw2_new_now = (bool*)malloc( + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * sizeof(bool)); + memset(atom_iw2_new_now, + 0, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * sizeof(bool)); + int* atom_iw2_ylm_now = (int*)malloc(GlobalC::ucell.ntype + * GlobalC::ucell.nwmax * sizeof(int)); + memset(atom_iw2_ylm_now, + 0, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * sizeof(int)); + int* atom_iw2_l_now = (int*)malloc(GlobalC::ucell.ntype + * GlobalC::ucell.nwmax * sizeof(int)); + memset(atom_iw2_l_now, + 0, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * sizeof(int)); + + Atom* atomx; + for (int i = 0; i < GlobalC::ucell.ntype; i++) + { + atomx = &GlobalC::ucell.atoms[i]; + for (int j = 0; j < GlobalC::ucell.nwmax; j++) + { + if (j < atomx->nw) + { + atom_iw2_new_now[i * GlobalC::ucell.nwmax + j] + = atomx->iw2_new[j]; + atom_iw2_ylm_now[i * GlobalC::ucell.nwmax + j] + = atomx->iw2_ylm[j]; + atom_iw2_l_now[i * GlobalC::ucell.nwmax + j] = atomx->iw2l[j]; + pointer = 
&GlobalC::ORB.Phi[i].PhiLN(atomx->iw2l[j], + atomx->iw2n[j]); + for (int k = 0; k < nr_max; k++) + { + int index_temp + = (i * GlobalC::ucell.nwmax * nr_max + j * nr_max + k) + * 2; + if (k < pointer->nr_uniform) + { + psi_u_now[index_temp] = pointer->psi_uniform[k]; + psi_u_now[index_temp + 1] = pointer->dpsi_uniform[k]; + } + } + } + } + } + + checkCudaErrors( + cudaMalloc((void**)&atom_nw_g, GlobalC::ucell.ntype * sizeof(int))); + checkCudaErrors(cudaMemcpy(atom_nw_g, + atom_nw_now, + GlobalC::ucell.ntype * sizeof(int), + cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMalloc((void**)&atom_nwl_g, + GlobalC::ucell.ntype * sizeof(int))); + checkCudaErrors(cudaMemcpy(atom_nwl_g, + ucell_atom_nwl_now, + GlobalC::ucell.ntype * sizeof(int), + cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMalloc((void**)&psi_u_g, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax + * nr_max * sizeof(double) * 2)); + checkCudaErrors(cudaMemcpy(psi_u_g, + psi_u_now, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax + * nr_max * sizeof(double) * 2, + cudaMemcpyHostToDevice)); + + checkCudaErrors( + cudaMalloc((void**)&atom_new_g, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * sizeof(bool))); + checkCudaErrors( + cudaMalloc((void**)&atom_ylm_g, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * sizeof(int))); + checkCudaErrors( + cudaMalloc((void**)&atom_l_g, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * sizeof(int))); + + checkCudaErrors( + cudaMemcpy(atom_new_g, + atom_iw2_new_now, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * sizeof(bool), + cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(atom_ylm_g, + atom_iw2_ylm_now, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * sizeof(int), + cudaMemcpyHostToDevice)); + checkCudaErrors( + cudaMemcpy(atom_l_g, + atom_iw2_l_now, + GlobalC::ucell.ntype * GlobalC::ucell.nwmax * sizeof(int), + cudaMemcpyHostToDevice)); + + const int max_atom_pair_number = GlobalC::ucell.nat * GlobalC::ucell.nat; + checkCudaErrors(cudaMallocHost( + 
(void**)&grid_vlocal_g, + max_atom_pair_number + * sizeof(double*))); // the points to gpu memory, but + // gpu memory address save on host + + for (int iat = 0; iat < max_atom_pair_number; iat++) + { + grid_vlocal_g[iat] = nullptr; + } + + psir_size = nbzp * max_atom * bxyz * GlobalC::ucell.nwmax; + + checkCudaErrors(cudaMalloc((void**)&left_global_g, + psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMalloc((void**)&d_left_x_g, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMalloc((void**)&d_left_y_g, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMalloc((void**)&d_left_z_g, psir_size * nstreams * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&dd_left_xx_g, + psir_size * nstreams * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&dd_left_xy_g, + psir_size * nstreams * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&dd_left_xz_g, + psir_size * nstreams * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&dd_left_yy_g, + psir_size * nstreams * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&dd_left_yz_g, + psir_size * nstreams * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&dd_left_zz_g, + psir_size * nstreams * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&right_global_g, + psir_size * nstreams * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&right_global_g, + psir_size * nstreams * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&dm_global_g, + psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(left_global_g, 0, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(right_global_g, 0, psir_size * nstreams * sizeof(double))); + + atom_pair_mesh = max_atom * max_atom; + atom_pair_nbz = atom_pair_mesh * nbzp; + + checkCudaErrors(cudaMallocHost((void**)&alpha_global, + atom_pair_nbz * nstreams * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&alpha_global_g, + atom_pair_nbz * nstreams * 
sizeof(double))); + + checkCudaErrors(cudaMallocHost((void**)&l_info_global, + atom_pair_nbz * nstreams * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&l_info_global_g, + atom_pair_nbz * nstreams * sizeof(int))); + + checkCudaErrors( + cudaMemset(d_left_x_g, 0, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(d_left_y_g, 0, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(d_left_z_g, 0, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(dd_left_xx_g, 0, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(dd_left_xy_g, 0, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(dd_left_xz_g, 0, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(dd_left_yy_g, 0, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(dd_left_yz_g, 0, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(dd_left_zz_g, 0, psir_size * nstreams * sizeof(double))); + checkCudaErrors( + cudaMemset(dm_global_g, 0, psir_size * nstreams * sizeof(double))); + + checkCudaErrors(cudaMallocHost((void**)&r_info_global, + atom_pair_nbz * nstreams * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&r_info_global_g, + atom_pair_nbz * nstreams * sizeof(int))); + + checkCudaErrors(cudaMallocHost((void**)&k_info_global, + atom_pair_nbz * nstreams * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&k_info_global_g, + atom_pair_nbz * nstreams * sizeof(int))); + + checkCudaErrors(cudaMallocHost((void**)&lda_info_global, + atom_pair_nbz * nstreams * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&lda_info_gbl_g, + atom_pair_nbz * nstreams * sizeof(int))); + + checkCudaErrors(cudaMallocHost((void**)&ldb_info_global, + atom_pair_nbz * nstreams * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&ldb_info_gbl_g, + atom_pair_nbz * nstreams * sizeof(int))); + + checkCudaErrors(cudaMallocHost((void**)&ldc_info_global, + 
atom_pair_nbz * nstreams * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&ldc_info_gbl_g, + atom_pair_nbz * nstreams * sizeof(int))); + + checkCudaErrors(cudaMallocHost((void**)&ap_left_gbl, + atom_pair_nbz * nstreams * sizeof(double*))); + checkCudaErrors(cudaMallocHost((void**)&ap_right_gbl, + atom_pair_nbz * nstreams * sizeof(double*))); + checkCudaErrors(cudaMallocHost((void**)&ap_output_gbl, + atom_pair_nbz * nstreams * sizeof(double*))); + + checkCudaErrors(cudaMalloc((void**)&ap_left_gbl_g, + atom_pair_nbz * nstreams * sizeof(double*))); + checkCudaErrors(cudaMalloc((void**)&ap_right_gbl_g, + atom_pair_nbz * nstreams * sizeof(double*))); + + checkCudaErrors(cudaMalloc((void**)&ap_output_gbl_g, + atom_pair_nbz * nstreams * sizeof(double*))); + + psi_size_max = max_atom * bxyz * nbzp; + psi_size_max_z = max_atom * bxyz; + + checkCudaErrors( + cudaMallocHost((void**)&psi_dbl_gbl, + psi_size_max * nstreams * 5 * sizeof(double))); + checkCudaErrors(cudaMalloc((void**)&psi_dbl_gbl_g, + psi_size_max * nstreams * 5 * sizeof(double))); + + checkCudaErrors(cudaMallocHost((void**)&psi_int_gbl, + psi_size_max * nstreams * 2 * sizeof(int))); + checkCudaErrors(cudaMalloc((void**)&psi_int_gbl_g, + psi_size_max * nstreams * 2 * sizeof(int))); + + checkCudaErrors( + cudaMallocHost((void**)&num_psir_gbl, nbzp * nstreams * sizeof(int))); + checkCudaErrors( + cudaMalloc((void**)&num_psir_gbl_g, nbzp * nstreams * sizeof(int))); + + num_mcell = nbzp * bxyz; + checkCudaErrors(cudaMalloc((void**)&rho_g, this->ncxyz * sizeof(double))); + checkCudaErrors(cudaMemset(rho_g, 0, this->ncxyz * sizeof(double))); + checkCudaErrors( + cudaMallocHost((void**)&vec_l, num_mcell * nstreams * sizeof(double*))); + checkCudaErrors( + cudaMalloc((void**)&vec_l_g, num_mcell * nstreams * sizeof(double*))); + checkCudaErrors( + cudaMallocHost((void**)&vec_r, num_mcell * nstreams * sizeof(double*))); + checkCudaErrors( + cudaMalloc((void**)&vec_r_g, num_mcell * nstreams * sizeof(double*))); + 
checkCudaErrors(cudaMallocHost((void**)&dot_product, + num_mcell * nstreams * sizeof(double*))); + checkCudaErrors(cudaMalloc((void**)&dot_product_g, + num_mcell * nstreams * sizeof(double*))); + checkCudaErrors( + cudaMallocHost((void**)&vec_len, num_mcell * nstreams * sizeof(int))); + checkCudaErrors( + cudaMalloc((void**)&vec_len_g, num_mcell * nstreams * sizeof(int))); + + for (int i = 0; i < nstreams; ++i) + { + checkCudaErrors(cudaStreamCreate(&streams[i])); + } + + gemm_algo_selector(bxyz, fastest_matrix_mul); + + is_malloced = true; + + free(psi_u_now); + free(atom_iw2_new_now); + free(atom_iw2_ylm_now); +} + +void Grid_Technique::free_gpu_gint_variables() +{ + if (!is_malloced) + { + return; + } + for (int i = 0; i < nstreams; ++i) + checkCudaErrors(cudaStreamDestroy(streams[i])); + + checkCudaErrors(cudaFree(ylmcoef_g)); + checkCudaErrors(cudaFree(atom_nwl_g)); + checkCudaErrors(cudaFree(psi_u_g)); + checkCudaErrors(cudaFree(atom_new_g)); + checkCudaErrors(cudaFree(atom_ylm_g)); + checkCudaErrors(cudaFree(atom_nw_g)); + checkCudaErrors(cudaFree(atom_l_g)); + + checkCudaErrors(cudaFreeHost(psi_dbl_gbl)); + checkCudaErrors(cudaFreeHost(psi_int_gbl)); + checkCudaErrors(cudaFreeHost(num_psir_gbl)); + + checkCudaErrors(cudaFree(psi_dbl_gbl_g)); + checkCudaErrors(cudaFree(psi_int_gbl_g)); + checkCudaErrors(cudaFree(num_psir_gbl_g)); + checkCudaErrors(cudaFree(left_global_g)); + checkCudaErrors(cudaFree(right_global_g)); + + checkCudaErrors(cudaFreeHost(l_info_global)); + checkCudaErrors(cudaFree(l_info_global_g)); + + checkCudaErrors(cudaFreeHost(r_info_global)); + checkCudaErrors(cudaFree(r_info_global_g)); + + checkCudaErrors(cudaFreeHost(alpha_global)); + checkCudaErrors(cudaFree(alpha_global_g)); + + checkCudaErrors(cudaFreeHost(k_info_global)); + checkCudaErrors(cudaFree(k_info_global_g)); + + checkCudaErrors(cudaFreeHost(lda_info_global)); + checkCudaErrors(cudaFree(lda_info_gbl_g)); + + checkCudaErrors(cudaFreeHost(ldb_info_global)); + 
checkCudaErrors(cudaFree(ldb_info_gbl_g)); + + checkCudaErrors(cudaFreeHost(ldc_info_global)); + checkCudaErrors(cudaFree(ldc_info_gbl_g)); + + checkCudaErrors(cudaFreeHost(ap_left_gbl)); + checkCudaErrors(cudaFreeHost(ap_right_gbl)); + checkCudaErrors(cudaFreeHost(ap_output_gbl)); + + checkCudaErrors(cudaFree(ap_left_gbl_g)); + checkCudaErrors(cudaFree(d_left_x_g)); + checkCudaErrors(cudaFree(d_left_y_g)); + checkCudaErrors(cudaFree(d_left_z_g)); + checkCudaErrors(cudaFree(dd_left_xx_g)); + checkCudaErrors(cudaFree(dd_left_xy_g)); + checkCudaErrors(cudaFree(dd_left_xz_g)); + checkCudaErrors(cudaFree(dd_left_yy_g)); + checkCudaErrors(cudaFree(dd_left_yz_g)); + checkCudaErrors(cudaFree(dd_left_zz_g)); + checkCudaErrors(cudaFree(ap_right_gbl_g)); + checkCudaErrors(cudaFree(dm_global_g)); + checkCudaErrors(cudaFree(ap_output_gbl_g)); + + checkCudaErrors(cudaFreeHost(vec_len)); + checkCudaErrors(cudaFreeHost(vec_l)); + checkCudaErrors(cudaFreeHost(vec_r)); + checkCudaErrors(cudaFreeHost(dot_product)); + + checkCudaErrors(cudaFree(vec_len_g)); + checkCudaErrors(cudaFree(vec_l_g)); + checkCudaErrors(cudaFree(vec_r_g)); + checkCudaErrors(cudaFree(dot_product_g)); + checkCudaErrors(cudaFree(rho_g)); + + const int max_atom_pair_number = GlobalC::ucell.nat * GlobalC::ucell.nat; + for (int i = 0; i < max_atom_pair_number; i++) + { + if (grid_vlocal_g[i] != nullptr) + { + checkCudaErrors(cudaFree(grid_vlocal_g[i])); + } + } + checkCudaErrors(cudaFreeHost(grid_vlocal_g)); + + is_malloced = false; } +#endif diff --git a/source/module_hamilt_lcao/module_gint/grid_technique.h b/source/module_hamilt_lcao/module_gint/grid_technique.h index ede87210b5..a32eb7feb2 100644 --- a/source/module_hamilt_lcao/module_gint/grid_technique.h +++ b/source/module_hamilt_lcao/module_gint/grid_technique.h @@ -1,126 +1,226 @@ #ifndef GRID_TECHNIQUE_H #define GRID_TECHNIQUE_H -#include "grid_meshball.h" #include "grid_index.h" +#include "grid_meshball.h" #include 
"module_basis/module_ao/parallel_orbitals.h" +#if ((defined __CUDA) /* || (defined __ROCM) */) +#include + +#include "kernels/cuda/cuda_tools.cuh" +#include "kernels/cuda/vbatch_matrix_mul.cuh" +#endif // Author: mohan // Date: 2009-10-17 class Grid_Technique : public Grid_MeshBall { - // public variables. - public: - - //------------------------------------ - // 1: Info about atom number on grid. - //------------------------------------ - // record how many atoms on each grid. - int* how_many_atoms; - // max atom on grid - int max_atom; - // sum of how_many_atoms - int total_atoms_on_grid; - - int* start_ind; - - //------------------------------------ - // 2: Info about which atom on grid. - //------------------------------------ - // save the start position of each big cell's adjacent - // atoms in 1D grid. - int* bcell_start; - // save the 'iat' atom. - // dim: total_atoms_on_grid. - int* which_atom; - - //-------------------------------------- - // save the bigcell index in meshball. - // dim: total_atoms_on_grid. - //-------------------------------------- - int* which_bigcell; - int* which_unitcell; - - //------------------------------------ - // 3: which atom on local grid. - //------------------------------------ - bool* in_this_processor; - std::vector trace_iat; - int lnat; // local nat. - int lgd; // local grid dimension. lgd * lgd symmetry matrix. - int* trace_lo; // trace local orbital. + // public variables. + public: + //------------------------------------ + // 1: Info about atom number on grid. + //------------------------------------ + // record how many atoms on each grid. + int* how_many_atoms; + // max atom on grid + int max_atom; + // sum of how_many_atoms + int total_atoms_on_grid; + + int* start_ind; + + //------------------------------------ + // 2: Info about which atom on grid. + //------------------------------------ + // save the start position of each big cell's adjacent + // atoms in 1D grid. 
+ int* bcell_start; + // save the 'iat' atom. + // dim: total_atoms_on_grid. + int* which_atom; + + //-------------------------------------- + // save the bigcell index in meshball. + // dim: total_atoms_on_grid. + //-------------------------------------- + int* which_bigcell; + int* which_unitcell; + + //------------------------------------ + // 3: which atom on local grid. + //------------------------------------ + bool* in_this_processor; + std::vector trace_iat; + int lnat; // local nat. + int lgd; // local grid dimension. lgd * lgd symmetry matrix. + int* trace_lo; // trace local orbital. //--------------------------------------- - // nnrg: number of matrix elements on - // each processor's real space grid. - // use: GridT.in_this_processor - //--------------------------------------- - int nnrg; - int *nlocdimg; - int *nlocstartg; - + // nnrg: number of matrix elements on + // each processor's real space grid. + // use: GridT.in_this_processor + //--------------------------------------- + int nnrg; + int* nlocdimg; + int* nlocstartg; + int* nad; // number of adjacent atoms for each atom. 
- int **find_R2; - int **find_R2_sorted_index; - int **find_R2st; + int** find_R2; + int** find_R2_sorted_index; + int** find_R2st; bool allocate_find_R2; - int binary_search_find_R2_offset(int val, int iat) const; + int binary_search_find_R2_offset(int val, int iat) const; + + // indexes for nnrg -> orbital index + R index + std::vector nnrg_index; - //indexes for nnrg -> orbital index + R index - std::vector nnrg_index; - // public functions - public: - - Grid_Technique(); - ~Grid_Technique(); - - void set_pbc_grid( - const int &ncx_in, - const int &ncy_in, - const int &ncz_in, - const int &bx_in, - const int &by_in, - const int &bz_in, - const int &nbx_in, - const int &nby_in, - const int &nbz_in, - const int &nbxx_in, - const int &nbzp_start_in, - const int& nbzp_in, - const int& ny, - const int& nplane, - const int& startz_current); + public: + Grid_Technique(); + ~Grid_Technique(); + + void set_pbc_grid(const int& ncx_in, + const int& ncy_in, + const int& ncz_in, + const int& bx_in, + const int& by_in, + const int& bz_in, + const int& nbx_in, + const int& nby_in, + const int& nbz_in, + const int& nbxx_in, + const int& nbzp_start_in, + const int& nbzp_in, + const int& ny, + const int& nplane, + const int& startz_current); /// number of elements(basis-pairs) in this processon /// on all adjacent atoms-pairs(Grid division) void cal_nnrg(Parallel_Orbitals* pv); - int cal_RindexAtom(const int& u1, const int& u2, const int& u3, const int& iat2) const; - -private: + int cal_RindexAtom(const int& u1, + const int& u2, + const int& u3, + const int& iat2) const; + private: void cal_max_box_index(void); - + int maxB1; int maxB2; - int maxB3; + int maxB3; - int minB1; - int minB2; - int minB3; + int minB1; + int minB2; + int minB3; - int nB1; - int nB2; - int nB3; + int nB1; + int nB2; + int nB3; int nbox; - // atoms on meshball - void init_atoms_on_grid(const int& ny, const int& nplane, const int& startz_current); - void init_atoms_on_grid2(const int* index2normal); - 
void cal_grid_integration_index(void); - void cal_trace_lo(void); - void check_bigcell(int* &ind_bigcell, bool* &bigcell_on_processor); - void get_startind(const int& ny, const int& nplane, const int& startz_current); + // atoms on meshball + void init_atoms_on_grid(const int& ny, + const int& nplane, + const int& startz_current); + void init_atoms_on_grid2(const int* index2normal); + void cal_grid_integration_index(void); + void cal_trace_lo(void); + void check_bigcell(int*& ind_bigcell, bool*& bigcell_on_processor); + void get_startind(const int& ny, + const int& nplane, + const int& startz_current); + +#if ((defined __CUDA) /* || (defined __ROCM) */) + public: + double* ylmcoef_g; + bool is_malloced; + + int* atom_nw_g; + int* atom_nwl_g; + double* psi_u_g; + bool* atom_new_g; + int* atom_ylm_g; + int* atom_l_g; + double** grid_vlocal_g; + int nr_max; + int psi_size_max; + int psi_size_max_z; + int psir_size; + int atom_pair_mesh; + int atom_pair_nbz; + + const int nstreams = 4; + cudaStream_t streams[4]; + // streams[nstreams] + // TODO it needs to be implemented through configuration files + + double* left_global_g; + double* d_left_x_g; + double* d_left_y_g; + double* d_left_z_g; + + double* dd_left_xx_g; + double* dd_left_xy_g; + double* dd_left_xz_g; + double* dd_left_yy_g; + double* dd_left_yz_g; + double* dd_left_zz_g; + double* right_global_g; + double* dm_global_g; + + double* alpha_global; + double* alpha_global_g; + int* l_info_global; + int* l_info_global_g; + int* r_info_global; + int* r_info_global_g; + int* k_info_global; + int* k_info_global_g; + + int* lda_info_global; + int* lda_info_gbl_g; + int* ldb_info_global; + int* ldb_info_gbl_g; + int* ldc_info_global; + int* ldc_info_gbl_g; + + double** ap_left_gbl; + double** ap_right_gbl; + double** ap_output_gbl; + + double** ap_left_gbl_g; + double** ap_right_gbl_g; + double** ap_output_gbl_g; + + double* psi_dbl_gbl; + double* psi_dbl_gbl_g; + + int* psi_int_gbl; + int* psi_int_gbl_g; + + int* 
num_psir_gbl; + int* num_psir_gbl_g; + + // additional variables for rho calculating + int num_mcell; + double* rho_g; + int* vec_len; + int* vec_len_g; + double** vec_l; + double** vec_l_g; + double** vec_r; + double** vec_r_g; + double** dot_product; + double** dot_product_g; + + matrix_multiple_func_type fastest_matrix_mul; + + private: + void init_gpu_gint_variables(); + void free_gpu_gint_variables(); + +#endif }; #endif diff --git a/source/module_hamilt_lcao/module_gint/gtask_force.cpp b/source/module_hamilt_lcao/module_gint/gtask_force.cpp new file mode 100644 index 0000000000..7c30d3db83 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/gtask_force.cpp @@ -0,0 +1,263 @@ +#include + +#include "gint_force.h" +#include "module_base/ylm.h" +#include "module_hamilt_lcao/module_gint/gint_tools.h" +namespace GintKernel +{ + +/** + * @brief Description of the function. + * + * Detailed description of the function. + * + * @param gridt The Grid_Technique object. + * @param i The integer parameter i. + * @param j The integer parameter j. + * @param psiSizeMax The maximum size of psi. + * @param max_size The maximum size. + * @param nczp The nczp parameter. + * @param vfactor The vfactor parameter. + * @param vlocal_global_value The array of vlocal_global_value. + * @param iat_per_nbz The array of iat_per_nbz. + * @param input_dou The double array of psi_input. + * @param psiInputInt The integer array of psi_input. + * @param num_psir The array of num_psir. + * @param lgd The lgd parameter. + * @param psir_ylm_g The double array of psir_ylm_g. + * @param psir_zeros_g The double array of psir_zeros_g. + * @param dm_matrix_g The double array of dm_matrix_g. + * @param mat_m The array of mat_m. + * @param mat_n The array of mat_n. + * @param mat_k The array of mat_k. + * @param mat_lda The array of mat_lda. + * @param mat_ldb The array of mat_ldb. + * @param mat_ldc The array of mat_ldc. + * @param mat_A The pointer to mat_A. 
+ * @param mat_B The pointer to mat_B. + * @param mat_C The pointer to mat_C. + * @param max_m The reference to max_m. + * @param max_n The reference to max_n. + * @param atom_pair_num The reference to atom_pair_num. + */ +void gpu_task_generator_force(const Grid_Technique& gridt, + const UnitCell& ucell, + const int i, + const int j, + const int psiSizeMax, + const int max_size, + const int nczp, + const double vfactor, + double* rcut, + const double* vlocal_global_value, + int* iat_per_nbz, + const int lgd, + double* dm_matrix_g, + int& max_m, + int& max_n, + int& atom_pair_num, + SGridParameter& para) +{ + const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp; + const int nwmax = ucell.nwmax; + bool* gpu_mat_cal_flag = new bool[max_size * gridt.nbzp]; + + for (int i = 0; i < max_size * gridt.nbzp; i++) + { + gpu_mat_cal_flag[i] = false; + } + // psir generate + for (int z_index = 0; z_index < gridt.nbzp; z_index++) + { + int num_get_psi = 0; + int grid_index = grid_index_ij + z_index; + int num_psi_pos = psiSizeMax * z_index; + int calc_flag_index = max_size * z_index; + int bcell_start_index = gridt.bcell_start[grid_index]; + int na_grid = gridt.how_many_atoms[grid_index]; + + for (int id = 0; id < na_grid; id++) + { + int ib = 0; + int mcell_index = bcell_start_index + id; + int imcell = gridt.which_bigcell[mcell_index]; + int iat = gridt.which_atom[mcell_index]; + int it_temp = ucell.iat2it[iat]; + int start_ind_grid = gridt.start_ind[grid_index]; + + for (int bx_index = 0; bx_index < gridt.bx; bx_index++) + { + for (int by_index = 0; by_index < gridt.by; by_index++) + { + for (int bz_index = 0; bz_index < gridt.bz; bz_index++) + { + double dr_temp[3]; + dr_temp[0] = gridt.meshcell_pos[ib][0] + + gridt.meshball_positions[imcell][0] + - gridt.tau_in_bigcell[iat][0]; + dr_temp[1] = gridt.meshcell_pos[ib][1] + + gridt.meshball_positions[imcell][1] + - gridt.tau_in_bigcell[iat][1]; + dr_temp[2] = gridt.meshcell_pos[ib][2] + + 
gridt.meshball_positions[imcell][2] + - gridt.tau_in_bigcell[iat][2]; + /* compute distance in and allocate the paramter in + * z_index */ + double distance = sqrt(dr_temp[0] * dr_temp[0] + + dr_temp[1] * dr_temp[1] + + dr_temp[2] * dr_temp[2]); + if (distance <= rcut[it_temp]) + { + gpu_mat_cal_flag[calc_flag_index + id] = true; + int pos_temp_double = num_psi_pos + num_get_psi; + int pos_temp_int = pos_temp_double * 2; + pos_temp_double *= 5; + if (distance < 1.0E-9) + { + distance += 1.0E-9; + } + para.input_dou[pos_temp_double] = dr_temp[0]; + para.input_dou[pos_temp_double + 1] = dr_temp[1]; + para.input_dou[pos_temp_double + 2] = dr_temp[2]; + para.input_dou[pos_temp_double + 3] = distance; + int vindex_global = bx_index * gridt.ncy * nczp + + by_index * nczp + bz_index + + start_ind_grid; + para.input_dou[pos_temp_double + 4] + = vlocal_global_value[vindex_global] * vfactor; + + para.input_int[pos_temp_int] = it_temp; + para.input_int[pos_temp_int + 1] + = (z_index * gridt.bxyz + ib) * max_size * nwmax + + id * nwmax; + iat_per_nbz[z_index * gridt.bxyz * max_size + + ib * max_size + id] + = iat; + num_get_psi++; + } + ib++; + } + } + } + } + para.num_psir[z_index] = num_get_psi; + } + + /* allocate the Multiplication of multinomial matrices */ + int tid = 0; + max_m = 0; + max_n = 0; + + for (int z_index = 0; z_index < gridt.nbzp; z_index++) + { + int grid_index = grid_index_ij + z_index; + int calc_flag_index = max_size * z_index; + int bcell_start_index = gridt.bcell_start[grid_index]; + int bcell_start_psir = z_index * gridt.bxyz * max_size * nwmax; + + for (int atom1 = 0; atom1 < gridt.how_many_atoms[grid_index]; atom1++) + { + if (!gpu_mat_cal_flag[calc_flag_index + atom1]) + { + continue; + } + const int mcell_index1 = bcell_start_index + atom1; + int iat1 = gridt.which_atom[mcell_index1]; + int it1 = ucell.iat2it[iat1]; + int lo1 + = gridt.trace_lo[ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0)]; + int nw1 = ucell.atoms[it1].nw; + + for (int atom2 = 0; 
atom2 < gridt.how_many_atoms[grid_index]; + atom2++) + { + if (!gpu_mat_cal_flag[calc_flag_index + atom2]) + { + continue; + } + const int mcell_index2 = bcell_start_index + atom2; + int iat2 = gridt.which_atom[mcell_index2]; + int it2 = ucell.iat2it[iat2]; + int lo2 = gridt.trace_lo[ucell.itiaiw2iwt(it2, + ucell.iat2ia[iat2], + 0)]; + int nw2 = ucell.atoms[it2].nw; + + int mat_A_idx = bcell_start_psir + atom2 * nwmax; + int mat_B_idx = lgd * lo1 + lo2; + int mat_C_idx = bcell_start_psir + atom1 * nwmax; + para.atom_pair_A_m[tid] = gridt.bxyz; + para.atom_pair_B_n[tid] = nw1; + para.atom_pair_K[tid] = nw2; + para.atom_pair_lda[tid] = nwmax * max_size; + para.atom_pair_ldb[tid] = lgd; + para.atom_pair_ldc[tid] = nwmax * max_size; + para.matrix_A[tid] = para.psir_r_device + mat_A_idx; + para.matrix_B[tid] = dm_matrix_g + mat_B_idx; + para.matrix_C[tid] = para.psir_dm_device + mat_C_idx; + + if (para.atom_pair_A_m[tid] > max_m) + { + max_m = para.atom_pair_A_m[tid]; + } + + if (para.atom_pair_B_n[tid] > max_n) + { + max_n = para.atom_pair_B_n[tid]; + } + + tid++; + } + } + } + atom_pair_num = tid; + + delete[] gpu_mat_cal_flag; +} + +void allocateDm(double* matrixHost, + hamilt::HContainer* dm, + const Grid_Technique& gridt, + const UnitCell& ucell) +{ + ModuleBase::GlobalFunc::ZEROS(matrixHost, gridt.lgd * gridt.lgd); + for (int iatRow = 0; iatRow < ucell.nat; iatRow++) + { + for (int iatColumn = 0; iatColumn < ucell.nat; iatColumn++) + { + int indexTypeRow = ucell.iat2it[iatRow]; + int indexTypeColumn = ucell.iat2it[iatColumn]; + int localOrbitRow + = gridt.trace_lo[ucell.itiaiw2iwt(indexTypeRow, + ucell.iat2ia[iatRow], + 0)]; + int localOrbitColumn + = gridt.trace_lo[ucell.itiaiw2iwt(indexTypeColumn, + ucell.iat2ia[iatColumn], + 0)]; + hamilt::AtomPair* tmpAtomPair + = dm->find_pair(iatRow, iatColumn); + int orbitIndex = 0; + if (tmpAtomPair == NULL) + { + continue; + } + for (int orbitRow = 0; orbitRow < tmpAtomPair->get_row_size(); + orbitRow++) + { + for (int 
orbitColumn = 0; + orbitColumn < tmpAtomPair->get_col_size(); + orbitColumn++) + { + matrixHost[(localOrbitRow + orbitRow) * gridt.lgd + + (localOrbitColumn + orbitColumn)] + = tmpAtomPair->get_pointer(0)[orbitIndex]; + orbitIndex++; + } + } + } + } + return; +} + +} // namespace GintKernel diff --git a/source/module_hamilt_lcao/module_gint/gtask_rho.cpp b/source/module_hamilt_lcao/module_gint/gtask_rho.cpp new file mode 100644 index 0000000000..d9949a147a --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/gtask_rho.cpp @@ -0,0 +1,219 @@ +#include "gint_rho.h" +#include "module_base/ylm.h" +#include "module_hamilt_lcao/module_gint/gint_tools.h" +#include "omp.h" +namespace GintKernel +{ + +void gtask_rho(const Grid_Technique& gridt, + const int i, + const int j, + const int max_size, + const int nczp, + const UnitCell& ucell, + const double* rcut, + double* input_double, + int* input_int, + int* num_psir, + const int lgd, + double* const psir_ylm_g, + double* const psir_dm_g, + double* const dm_matrix_g, + double* mat_alpha, + int* mat_m, + int* mat_n, + int* mat_k, + int* mat_lda, + int* mat_ldb, + int* mat_ldc, + double** mat_A, + double** mat_B, + double** mat_C, + int& max_m, + int& max_n, + int& atom_pair_num, + double* rho_g, + double** vec_l, + double** vec_r, + double** dot_product, + int* vec_len, + int& dot_count) +{ + const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp; + const int nwmax = ucell.nwmax; + const int psi_size_max = max_size * gridt.bxyz; + + // record whether mat_psir is a zero matrix or not. 
+ bool* gpu_mat_cal_flag = new bool[max_size * gridt.nbzp]; + + for (int i = 0; i < max_size * gridt.nbzp; i++) + { + gpu_mat_cal_flag[i] = false; + } + dot_count = 0; + + // generate data for calculating psir + for (int z_index = 0; z_index < gridt.nbzp; z_index++) + { + int num_get_psi = 0; + int grid_index = grid_index_ij + z_index; + int num_psi_pos = psi_size_max * z_index; + int calc_flag_index = max_size * z_index; + int bcell_start_index = gridt.bcell_start[grid_index]; + int na_grid = gridt.how_many_atoms[grid_index]; + + for (int id = 0; id < na_grid; id++) + { + int ib = 0; + int mcell_index = bcell_start_index + id; + int imcell = gridt.which_bigcell[mcell_index]; + int iat = gridt.which_atom[mcell_index]; + int it_temp = ucell.iat2it[iat]; + int start_ind_grid = gridt.start_ind[grid_index]; + + for (int bx_index = 0; bx_index < gridt.bx; bx_index++) + { + for (int by_index = 0; by_index < gridt.by; by_index++) + { + for (int bz_index = 0; bz_index < gridt.bz; bz_index++) + { + double dr_temp[3]; + dr_temp[0] = gridt.meshcell_pos[ib][0] + + gridt.meshball_positions[imcell][0] + - gridt.tau_in_bigcell[iat][0]; + dr_temp[1] = gridt.meshcell_pos[ib][1] + + gridt.meshball_positions[imcell][1] + - gridt.tau_in_bigcell[iat][1]; + dr_temp[2] = gridt.meshcell_pos[ib][2] + + gridt.meshball_positions[imcell][2] + - gridt.tau_in_bigcell[iat][2]; + + double distance = sqrt(dr_temp[0] * dr_temp[0] + + dr_temp[1] * dr_temp[1] + + dr_temp[2] * dr_temp[2]); + if (distance <= rcut[it_temp]) + { + gpu_mat_cal_flag[calc_flag_index + id] = true; + int pos_temp_double = num_psi_pos + num_get_psi; + int pos_temp_int = pos_temp_double * 2; + pos_temp_double *= 5; + if (distance < 1.0E-9) + { + distance += 1.0E-9; + } + input_double[pos_temp_double] + = dr_temp[0] / distance; + input_double[pos_temp_double + 1] + = dr_temp[1] / distance; + input_double[pos_temp_double + 2] + = dr_temp[2] / distance; + input_double[pos_temp_double + 3] = distance; + + input_int[pos_temp_int] = 
it_temp; // atom type + input_int[pos_temp_int + 1] + = (z_index * gridt.bxyz + ib) * max_size * nwmax + + id * nwmax; // psir index in psir_ylm + num_get_psi++; + } + ib++; + } + } + } + } + num_psir[z_index] = num_get_psi; + } + + int tid = 0; + max_m = 0; + max_n = 0; + + // generate matrix multiplication tasks + for (int z_index = 0; z_index < gridt.nbzp; z_index++) + { + int grid_index = grid_index_ij + z_index; + int calc_flag_index = max_size * z_index; + int bcell_start_index = gridt.bcell_start[grid_index]; + int bcell_start_psir = z_index * gridt.bxyz * max_size * nwmax; + + for (int atom1 = 0; atom1 < gridt.how_many_atoms[grid_index]; atom1++) + { + if (!gpu_mat_cal_flag[calc_flag_index + atom1]) + { + continue; + } + int mcell_index1 = bcell_start_index + atom1; + int iat1 = gridt.which_atom[mcell_index1]; + int it1 = ucell.iat2it[iat1]; + int lo1 + = gridt.trace_lo[ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0)]; + int nw1 = ucell.atoms[it1].nw; + + for (int atom2 = atom1; atom2 < gridt.how_many_atoms[grid_index]; + atom2++) + { + if (!gpu_mat_cal_flag[calc_flag_index + atom2]) + { + continue; + } + int mcell_index2 = bcell_start_index + atom2; + int iat2 = gridt.which_atom[mcell_index2]; + int it2 = ucell.iat2it[iat2]; + int lo2 = gridt.trace_lo[ucell.itiaiw2iwt(it2, + ucell.iat2ia[iat2], + 0)]; + int nw2 = ucell.atoms[it2].nw; + + int mat_A_idx = bcell_start_psir + atom2 * nwmax; + int mat_B_idx = lgd * lo1 + lo2; + int mat_C_idx = bcell_start_psir + atom1 * nwmax; + + mat_alpha[tid] = atom2 == atom1 ? 
1 : 2; + mat_m[tid] = gridt.bxyz; + mat_n[tid] = nw1; + mat_k[tid] = nw2; + mat_lda[tid] = nwmax * max_size; + mat_ldb[tid] = lgd; + mat_ldc[tid] = nwmax * max_size; + mat_A[tid] = psir_ylm_g + mat_A_idx; + mat_B[tid] = dm_matrix_g + mat_B_idx; + mat_C[tid] = psir_dm_g + mat_C_idx; + + if (mat_m[tid] > max_m) + { + max_m = mat_m[tid]; + } + + if (mat_n[tid] > max_n) + { + max_n = mat_n[tid]; + } + + tid++; + } + } + + // generate vec dot product tasks + int* vindex = Gint_Tools::get_vindex(gridt.bxyz, + gridt.bx, + gridt.by, + gridt.bz, + nczp, + gridt.start_ind[grid_index], + gridt.ncy * nczp); + for (int i = 0; i < gridt.bxyz; i++) + { + vec_l[dot_count] + = psir_ylm_g + (bcell_start_psir + i * max_size * nwmax); + vec_r[dot_count] + = psir_dm_g + (bcell_start_psir + i * max_size * nwmax); + dot_product[dot_count] = rho_g + vindex[i]; + vec_len[dot_count] = nwmax * max_size; + dot_count++; + } + } + atom_pair_num = tid; + + delete[] gpu_mat_cal_flag; +} + +} // namespace GintKernel \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/gtask_vl.cpp b/source/module_hamilt_lcao/module_gint/gtask_vl.cpp new file mode 100644 index 0000000000..930cecb61a --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/gtask_vl.cpp @@ -0,0 +1,201 @@ +#include + +#include "gint_vl.h" +#include "module_base/ylm.h" +#include "module_hamilt_lcao/module_gint/gint_tools.h" +namespace GintKernel +{ + +void gtask_vlocal(const Grid_Technique& gridt, + const double* rcut, + const UnitCell& ucell, + const int i, + const int j, + const int max_size, + const int nczp, + const double vfactor, + const double* vlocal_global_value, + double* psir_ylm_left, + double* psir_r, + double* input_double, + int* input_int, + int* num_psir, + int* atom_pair_A_m, + int* atom_pair_B_n, + int* atom_pair_lda, + int* atom_pair_ldb, + int* atom_pair_ldc, + double** atom_pair_mat_A, + double** atom_pair_mat_B, + double** atom_pair_mat_C, + int& atom_pair_num, + int& max_m, + int& 
max_n) +{ + + const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp; + const int nwmax = ucell.nwmax; + bool* gpu_matrix_calc_flag = new bool[max_size * gridt.nbzp]; + for (int i = 0; i < max_size * gridt.nbzp; i++) + { + gpu_matrix_calc_flag[i] = false; + } + for (int z_index = 0; z_index < gridt.nbzp; z_index++) + { + int num_get_psi = 0; + int grid_index = grid_index_ij + z_index; + int num_psi_pos = gridt.psi_size_max_z * z_index; + int calc_flag_index = max_size * z_index; + int bcell_start_index = gridt.bcell_start[grid_index]; + + for (int id = 0; id < gridt.how_many_atoms[grid_index]; id++) + { + int ib = 0; + int mcell_index = bcell_start_index + id; + int imcell = gridt.which_bigcell[mcell_index]; + int iat = gridt.which_atom[mcell_index]; + int it_temp = ucell.iat2it[iat]; + int start_ind_grid = gridt.start_ind[grid_index]; + for (int bx_index = 0; bx_index < gridt.bx; bx_index++) + { + for (int by_index = 0; by_index < gridt.by; by_index++) + { + for (int bz_index = 0; bz_index < gridt.bz; bz_index++) + { + double dr_temp[3]; + dr_temp[0] = gridt.meshcell_pos[ib][0] + + gridt.meshball_positions[imcell][0] + - gridt.tau_in_bigcell[iat][0]; + dr_temp[1] = gridt.meshcell_pos[ib][1] + + gridt.meshball_positions[imcell][1] + - gridt.tau_in_bigcell[iat][1]; + dr_temp[2] = gridt.meshcell_pos[ib][2] + + gridt.meshball_positions[imcell][2] + - gridt.tau_in_bigcell[iat][2]; + + double distance = sqrt(dr_temp[0] * dr_temp[0] + + dr_temp[1] * dr_temp[1] + + dr_temp[2] * dr_temp[2]); + if (distance <= rcut[it_temp]) + { + gpu_matrix_calc_flag[calc_flag_index + id] = true; + int pos_temp_double = num_psi_pos + num_get_psi; + int pos_temp_int = pos_temp_double * 2; + pos_temp_double *= 5; + if (distance < 1.0E-9) + { + distance += 1.0E-9; + } + input_double[pos_temp_double] + = dr_temp[0] / distance; + input_double[pos_temp_double + 1] + = dr_temp[1] / distance; + input_double[pos_temp_double + 2] + = dr_temp[2] / distance; + 
input_double[pos_temp_double + 3] = distance; + + int vindex_global = bx_index * gridt.ncy * nczp + + by_index * nczp + bz_index + + start_ind_grid; + input_double[pos_temp_double + 4] + = vlocal_global_value[vindex_global] * vfactor; + + input_int[pos_temp_int] = it_temp; + input_int[pos_temp_int + 1] + = ((z_index * max_size + id) * gridt.bxyz) + * nwmax + + ib; + num_get_psi++; + } + ib++; + } + } + } + } + num_psir[z_index] = num_get_psi; + } + + atom_pair_num = 0; + max_m = 0; + max_n = 0; + for (int z_index = 0; z_index < gridt.nbzp; z_index++) + { + int grid_index = grid_index_ij + z_index; + int atom_num = gridt.how_many_atoms[grid_index]; + int vldr3_index = z_index * max_size * nwmax * gridt.bxyz; + int bcell_start_index = gridt.bcell_start[grid_index]; + int calc_flag_index = max_size * z_index; + for (int atom1 = 0; atom1 < atom_num; atom1++) + { + + int iat1 = gridt.which_atom[bcell_start_index + atom1]; + int it1 = ucell.iat2it[iat1]; + int lo1 + = gridt.trace_lo[ucell.itiaiw2iwt(it1, ucell.iat2ia[iat1], 0)]; + if (gpu_matrix_calc_flag[calc_flag_index + atom1] == false) + { + continue; + } + for (int atom2 = 0; atom2 < atom_num; atom2++) + { + if (gpu_matrix_calc_flag[calc_flag_index + atom2] == false) + continue; + + int iat2 = gridt.which_atom[bcell_start_index + atom2]; + int it2 = ucell.iat2it[iat2]; + int lo2 = gridt.trace_lo[ucell.itiaiw2iwt(it2, + ucell.iat2ia[iat2], + 0)]; + if (lo1 <= lo2) + { + int atom_pair_nw + = ucell.atoms[it1].nw * ucell.atoms[it2].nw; + if (gridt.grid_vlocal_g[iat1 * ucell.nat + iat2] == nullptr) + { + // Note that this situation occurs here because the + // logic in hcontainer and + // grid integration is different. + // In hcontainer, it is iat1<=iat2, and in grid + // integral, it is lo1<=lo2. This is not entirely + // equivalent in practice. We need to investigate + // what's going on later. 
+ // TODO + continue; + // std::cout << "Error: GridVlocal did not malloc" << + // std::endl; + } + + int calc_index1 = vldr3_index + atom1 * nwmax * gridt.bxyz; + int calc_index2 = vldr3_index + atom2 * nwmax * gridt.bxyz; + + atom_pair_mat_A[atom_pair_num] + = psir_ylm_left + calc_index1; + atom_pair_mat_B[atom_pair_num] + = psir_r + calc_index2; + atom_pair_mat_C[atom_pair_num] + = gridt.grid_vlocal_g[iat1 * ucell.nat + iat2]; + + atom_pair_lda[atom_pair_num] = gridt.bxyz; + atom_pair_ldb[atom_pair_num] = gridt.bxyz; + atom_pair_ldc[atom_pair_num] = ucell.atoms[it2].nw; + + atom_pair_A_m[atom_pair_num] = ucell.atoms[it1].nw; + atom_pair_B_n[atom_pair_num] = ucell.atoms[it2].nw; + if (atom_pair_A_m[atom_pair_num] > max_m) + { + max_m = atom_pair_A_m[atom_pair_num]; + } + if (atom_pair_B_n[atom_pair_num] > max_n) + { + max_n = atom_pair_B_n[atom_pair_num]; + } + atom_pair_num++; + } + } + } + } + + delete[] gpu_matrix_calc_flag; +} + +} // namespace GintKernel \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cpp b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cpp new file mode 100644 index 0000000000..4edfce05cb --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/code_gen.cpp @@ -0,0 +1,4448 @@ +// Generate and test the efficiency of matrix multiplication functions with different parameters +// This file takes a long time to compile + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + 
+gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + 
d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + 
fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, 
+ d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + 
d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, 
+ h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + 
d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + 
temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + 
+gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + 
d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + 
fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, 
+ d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + 
d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, 
+ h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + 
d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + 
temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + 
+gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + 
d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + 
fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, 
+ d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + 
d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, 
+ h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + 
d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + 
temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + 
+gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + 
d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + 
fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, 
+ d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + 
d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, 
+ h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + 
d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + temp_stream, + fastest_time, + fastest_algo, + cpu_result, + h_global_C, + d_global_C); + +gemm_time_measure(max_m, + max_n, + d_m, + d_n, + d_k, + d_global_A_array, + d_global_lda, + d_global_B_array, + d_global_ldb, + d_global_C_array, + d_global_ldc, + batchCount, + 
temp_stream,
+                  fastest_time,
+                  fastest_algo,
+                  cpu_result,
+                  h_global_C,
+                  d_global_C);
+
+gemm_time_measure(max_m,
+                  max_n,
+                  d_m,
+                  d_n,
+                  d_k,
+                  d_global_A_array,
+                  d_global_lda,
+                  d_global_B_array,
+                  d_global_ldb,
+                  d_global_C_array,
+                  d_global_ldc,
+                  batchCount,
+                  temp_stream,
+                  fastest_time,
+                  fastest_algo,
+                  cpu_result,
+                  h_global_C,
+                  d_global_C);
+
+gemm_time_measure(max_m,
+                  max_n,
+                  d_m,
+                  d_n,
+                  d_k,
+                  d_global_A_array,
+                  d_global_lda,
+                  d_global_B_array,
+                  d_global_ldb,
+                  d_global_C_array,
+                  d_global_ldc,
+                  batchCount,
+                  temp_stream,
+                  fastest_time,
+                  fastest_algo,
+                  cpu_result,
+                  h_global_C,
+                  d_global_C);
+
+gemm_time_measure(max_m,
+                  max_n,
+                  d_m,
+                  d_n,
+                  d_k,
+                  d_global_A_array,
+                  d_global_lda,
+                  d_global_B_array,
+                  d_global_ldb,
+                  d_global_C_array,
+                  d_global_ldc,
+                  batchCount,
+                  temp_stream,
+                  fastest_time,
+                  fastest_algo,
+                  cpu_result,
+                  h_global_C,
+                  d_global_C);
diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu
new file mode 100644
index 0000000000..7c4b2289f2
--- /dev/null
+++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cu
@@ -0,0 +1,213 @@
+#include <iostream>
+
+#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh"
+
+/**
+ * @brief Report a failed CUDA runtime call.
+ *
+ * Prints the CUDA error string to stderr and then asserts. The assert is
+ * compiled out when NDEBUG is defined, so in such builds the error is only
+ * logged and the status is handed back to the caller.
+ *
+ * @param result status returned by a CUDA runtime call
+ * @return the same status, unchanged
+ */
+cudaError_t checkCuda(cudaError_t result)
+{
+    if (result != cudaSuccess)
+    {
+        fprintf(stderr, "CUDA Runtime Error: %s\n", cudaGetErrorString(result));
+        assert(result == cudaSuccess);
+    }
+    return result;
+}
+
+/**
+ * @brief Fetch (and clear) the last CUDA runtime error and report it through
+ * checkCuda().
+ */
+cudaError_t checkCudaLastError()
+{
+    return checkCuda(cudaGetLastError());
+}
+
+/**
+ * @brief Debug helper: copy a width x hight array of doubles from the device
+ * and write it to a text file, one value per line.
+ *
+ * @param cuda_array device pointer to width * hight doubles (row-major,
+ *                   row index = "hight", column index = "width")
+ * @param width      number of columns
+ * @param hight      number of rows
+ * @param filename   path of the text file to (over)write
+ */
+void dump_cuda_array_to_file(double* cuda_array,
+                             int width,
+                             int hight,
+                             const std::string& filename)
+{
+    // Do the multiplication in size_t so a large array cannot overflow int.
+    const size_t count = static_cast<size_t>(width) * static_cast<size_t>(hight);
+    double* h_data = new double[count];
+    checkCuda(cudaMemcpy(h_data,
+                         cuda_array,
+                         count * sizeof(double),
+                         cudaMemcpyDeviceToHost));
+
+    std::ofstream outFile(filename);
+    if (!outFile.is_open())
+    {
+        std::cerr << "Failed to open file for writing." << std::endl;
+        // Nothing can be written; release the staging buffer and stop.
+        delete[] h_data;
+        return;
+    }
+    for (int j = 0; j < hight; ++j)
+    {
+        for (int i = 0; i < width; ++i)
+        {
+            outFile << "hight" << j << " width:" << i << " "
+                    << h_data[static_cast<size_t>(j) * width + i] << std::endl;
+        }
+    }
+    outFile.close();
+    delete[] h_data;
+}
+
+/**
+ * @brief Allocate stream_number_in slices of one_stream_size_aligned_in
+ * elements on the device (zero-filled) and, optionally, a pinned host mirror
+ * of the same shape (also zero-filled).
+ *
+ * Only the first one_stream_size_in elements of every slice are touched by
+ * the copy / memset helpers; the aligned size is the slice stride.
+ */
+template <typename T>
+Cuda_Mem_Wrapper<T>::Cuda_Mem_Wrapper(int one_stream_size_in,
+                                      int one_stream_size_aligned_in,
+                                      int stream_number_in,
+                                      bool malloc_host_in)
+{
+    this->stream_number = stream_number_in;
+    this->one_stream_size = one_stream_size_in;
+    this->one_stream_size_aligned = one_stream_size_aligned_in;
+    this->total_size_aligned
+        = this->one_stream_size_aligned * this->stream_number;
+
+    // Start from a well-defined state so free_all() is safe even if an
+    // allocation below fails and checkCuda() does not abort.
+    this->device_pointer = nullptr;
+    this->host_pointer = nullptr;
+
+    checkCuda(cudaMalloc((void**)&this->device_pointer,
+                         this->total_size_aligned * sizeof(T)));
+    checkCuda(cudaMemset(this->device_pointer,
+                         0,
+                         this->total_size_aligned * sizeof(T)));
+
+    if (malloc_host_in)
+    {
+        checkCuda(cudaMallocHost((void**)&this->host_pointer,
+                                 this->total_size_aligned * sizeof(T)));
+        memset(this->host_pointer, 0, this->total_size_aligned * sizeof(T));
+    }
+}
+
+/// Convenience constructor: slice stride equals the slice size.
+template <typename T>
+Cuda_Mem_Wrapper<T>::Cuda_Mem_Wrapper(int one_stream_size_in,
+                                      int stream_number_in,
+                                      bool malloc_host_in)
+    : Cuda_Mem_Wrapper(one_stream_size_in,
+                       one_stream_size_in,
+                       stream_number_in,
+                       malloc_host_in)
+{
+}
+
+/**
+ * @brief Release the device buffer and the pinned host buffer.
+ *
+ * The pointers are reset to nullptr afterwards, so calling free_all()
+ * explicitly and then letting the destructor run does not free twice.
+ */
+template <typename T>
+void Cuda_Mem_Wrapper<T>::free_all()
+{
+    if (this->device_pointer != nullptr)
+    {
+        checkCuda(cudaFree(this->device_pointer));
+        this->device_pointer = nullptr;
+    }
+    if (this->host_pointer != nullptr)
+    {
+        checkCuda(cudaFreeHost(this->host_pointer));
+        this->host_pointer = nullptr;
+    }
+}
+
+template <typename T>
+Cuda_Mem_Wrapper<T>::~Cuda_Mem_Wrapper()
+{
+    this->free_all();
+}
+
+/// Blocking copy of slice stream_id from the host mirror to the device.
+template <typename T>
+void Cuda_Mem_Wrapper<T>::copy_host_to_device_sync(int stream_id)
+{
+    if (this->host_pointer == nullptr || this->device_pointer == nullptr)
+    {
+        std::cerr << "host_pointer is nullptr, can not copy host to device"
+                  << std::endl;
+        exit(1);
+    }
+    checkCuda(cudaMemcpy(
+        this->device_pointer + stream_id * this->one_stream_size_aligned,
+        this->host_pointer + stream_id * this->one_stream_size_aligned,
+        this->one_stream_size * sizeof(T),
+        cudaMemcpyHostToDevice));
+}
+
+/// Copy of slice stream_id from host to device, enqueued on `stream`.
+template <typename T>
+void Cuda_Mem_Wrapper<T>::copy_host_to_device_async(cudaStream_t stream,
+                                                    int stream_id)
+{
+    if (this->host_pointer == nullptr || this->device_pointer == nullptr)
+    {
+        std::cerr << "host_pointer is nullptr, can not copy host to device"
+                  << std::endl;
+        exit(1);
+    }
+    checkCuda(cudaMemcpyAsync(
+        this->device_pointer + stream_id * this->one_stream_size_aligned,
+        this->host_pointer + stream_id * this->one_stream_size_aligned,
+        this->one_stream_size * sizeof(T),
+        cudaMemcpyHostToDevice,
+        stream));
+}
+
+/// Blocking copy of slice stream_id from the device to the host mirror.
+template <typename T>
+void Cuda_Mem_Wrapper<T>::copy_device_to_host_sync(int stream_id)
+{
+    if (this->host_pointer == nullptr || this->device_pointer == nullptr)
+    {
+        std::cerr << "host_pointer is nullptr, can not copy device to host"
+                  << std::endl;
+        exit(1);
+    }
+    checkCuda(cudaMemcpy(
+        this->host_pointer + stream_id * this->one_stream_size_aligned,
+        this->device_pointer + stream_id * this->one_stream_size_aligned,
+        this->one_stream_size * sizeof(T),
+        cudaMemcpyDeviceToHost));
+}
+
+/// Copy of slice stream_id from device to host, enqueued on `stream`.
+template <typename T>
+void Cuda_Mem_Wrapper<T>::copy_device_to_host_async(cudaStream_t stream,
+                                                    int stream_id)
+{
+    if (this->host_pointer == nullptr || this->device_pointer == nullptr)
+    {
+        std::cerr << "host_pointer is nullptr, can not copy device to host"
+                  << std::endl;
+        exit(1);
+    }
+    checkCuda(cudaMemcpyAsync(
+        this->host_pointer + stream_id * this->one_stream_size_aligned,
+        this->device_pointer + stream_id * this->one_stream_size_aligned,
+        this->one_stream_size * sizeof(T),
+        cudaMemcpyDeviceToHost,
+        stream));
+}
+
+/// Blocking byte-wise fill (cudaMemset semantics) of device slice stream_id.
+template <typename T>
+void Cuda_Mem_Wrapper<T>::memset_device_sync(int stream_id, int value)
+{
+    checkCuda(cudaMemset(this->device_pointer
+                             + stream_id * this->one_stream_size_aligned,
+                         value,
+                         this->one_stream_size * sizeof(T)));
+}
+
+/// Byte-wise fill of device slice stream_id, enqueued on `stream`.
+template <typename T>
+void Cuda_Mem_Wrapper<T>::memset_device_async(cudaStream_t stream,
+                                              int stream_id,
+                                              int value)
+{
+    checkCuda(cudaMemsetAsync(this->device_pointer
+                                  + stream_id * this->one_stream_size_aligned,
+                              value,
+                              this->one_stream_size * sizeof(T),
+                              stream));
+}
+
+/// Byte-wise fill of host slice stream_id; requires the host mirror.
+template <typename T>
+void Cuda_Mem_Wrapper<T>::memset_host(int stream_id, int value)
+{
+    // The host mirror is optional (malloc_host == false); refuse to write
+    // through a null pointer, mirroring the checks in the copy helpers.
+    if (this->host_pointer == nullptr)
+    {
+        std::cerr << "host_pointer is nullptr, can not memset host"
+                  << std::endl;
+        exit(1);
+    }
+    memset(this->host_pointer + stream_id * this->one_stream_size_aligned,
+           value,
+           this->one_stream_size * sizeof(T));
+}
+
+/// Device address of the first element of slice stream_id.
+template <typename T>
+T* Cuda_Mem_Wrapper<T>::get_device_pointer(int stream_id)
+{
+    return this->device_pointer + stream_id * this->one_stream_size_aligned;
+}
+
+/// Host address of the first element of slice stream_id, or nullptr when no
+/// host mirror was allocated.
+template <typename T>
+T* Cuda_Mem_Wrapper<T>::get_host_pointer(int stream_id)
+{
+    if (this->host_pointer == nullptr)
+    {
+        return nullptr;
+    }
+    return this->host_pointer + stream_id * this->one_stream_size_aligned;
+}
+// NOTE(review): the template arguments of the three explicit instantiations
+// below are missing in this copy of the patch; restore them from upstream.
+template class Cuda_Mem_Wrapper;
+template class Cuda_Mem_Wrapper;
+template class Cuda_Mem_Wrapper;
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh
new file mode 100644
index 0000000000..e0d7b4d2c0
--- /dev/null
+++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh
@@ -0,0 +1,78 @@
+#ifndef CUDA_TOOLS_CUH
+#define CUDA_TOOLS_CUH
+#include // for assert
+#include
+#include // for CUDA_VERSION
+#include
+
+#include
+#include
+#include
+cudaError_t checkCuda(cudaError_t result);
+cudaError_t checkCudaLastError();
+
+void dump_cuda_array_to_file(double* cuda_array,
+                             int width,
+                             int hight,
+                             const std::string& filename);
+
+/*
+ * @brief: A simple wrapper for cudaMalloc and cudaFree, sync and async CUDA
+ * memory copy
+ * @param: T: the type of the data
+ *
+ * @note:
+ * Manual management of CUDA memory is a very delicate task; complex pointers
+ * and malloc/free operations make it easy for us to encounter memory bugs. The
+ * severity of the issues increases significantly when introducing multi-node,
+ * multi-GPU, and multi-stream parallelism.
+ * Debugging after encountering bugs is also very difficult; finding the leaking
+ * pointer among dozens of variables can be quite a headache.
+ * Therefore, considering that our use and management of memory have some
+ * homogeneity, we have abstracted these needs into the following encapsulations
+ * to reduce the cost of maintenance and development. The memory is allocated in
+ * the constructor and freed in the destructor.
+ *
+ * The following interface is primarily designed for the following requirements:
+ * 1. We need to split a large task into multiple subtasks to run on multiple
+ * streams across multiple GPUs on multiple nodes.
+ * 2. It is necessary to allocate memory of the same shape on both host and
+ * device.
+ * 3. Data copying between host and device, sync or async, is required.
+ *
+ * Layout: the buffer holds stream_number slices; slice i starts at element
+ * i * one_stream_size_aligned and the copy/memset helpers touch its first
+ * one_stream_size elements.
+ *
+ * NOTE(review): both constructors accept a call with exactly two int
+ * arguments (the trailing parameters are defaulted) - confirm that no caller
+ * relies on such a call, since it would match both overloads.
+ */
+
+template
+class Cuda_Mem_Wrapper
+{
+  public:
+    Cuda_Mem_Wrapper(int one_stream_size,
+                     int one_stream_size_aligned,
+                     int stream_number = 1,
+                     bool malloc_host = true);
+    Cuda_Mem_Wrapper(int one_stream_size,
+                     int stream_number = 1,
+                     bool malloc_host = true);
+    ~Cuda_Mem_Wrapper(); // releases both buffers through free_all()
+    void copy_host_to_device_sync(int stream_id = 0);
+    void copy_host_to_device_async(cudaStream_t stream, int stream_id);
+    void copy_device_to_host_sync(int stream_id = 0);
+    void copy_device_to_host_async(cudaStream_t stream, int stream_id);
+    void memset_device_sync(int stream_id = 0, int value = 0);
+    void memset_device_async(cudaStream_t stream,
+                             int stream_id = 0,
+                             int value = 0);
+    void memset_host(int stream_id = 0, int value = 0);
+    T* get_device_pointer(int stream_id = 0);
+    T* get_host_pointer(int stream_id = 0);
+    void free_all();
+
+  private:
+    T* device_pointer; // cudaMalloc'ed buffer of total_size_aligned elements
+    T* host_pointer; // cudaMallocHost'ed mirror, nullptr if malloc_host is false
+    int one_stream_size; // elements copied / set per slice
+    int one_stream_size_aligned; // element stride between consecutive slices
+    int stream_number; // number of slices
+    int total_size_aligned; // one_stream_size_aligned * stream_number
+};
+
+#endif // CUDA_TOOLS_CUH
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu
new file mode 100644
index 0000000000..bbc1d7dcb8
--- /dev/null
+++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu
@@ -0,0 +1,620 @@
+#include "gint_force.cuh"
+#include "interp.cuh"
+#include "module_hamilt_lcao/module_gint/gint_force.h"
+#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh"
+#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh"
+#include "module_hamilt_lcao/module_gint/kernels/cuda/sph.cuh"
+// CUDA kernels and host helpers for the grid-integration force and stress
+namespace GintKernel
+{
+
+/*!
+ * \file
+ * \brief CUDA kernel to calculate psi and force
+ *
+ * CUDA kernel that performs calculations on psi and force.
+ *
+ * \param ylmcoef Pointer to the Ylm coefficients
+ * \param delta_r_g Delta r value
+ * \param bxyz_g Bxyz value
+ * \param nwmax_g Nwmax value
+ * \param input_double Array of double input values (five per psi entry)
+ * \param input_int Array of int input values (two per psi entry)
+ * \param num_psir Array containing the number of psi for each block
+ * \param psi_size_max Maximum size of psi
+ * \param ucell_atom_nwl Array containing Ucell atom nwl values
+ * \param atom_iw2_new Array indicating whether atom_iw2 is new
+ * \param atom_iw2_ylm Array of atom_iw2 Ylm values
+ * \param atom_iw2_l Array of atom_iw2 l values
+ * \param atom_nw Array of atom_nw values
+ * \param nr_max Maximum nr value
+ * \param psi_u Array for psi_u values, recording the spherical harmonics from psi
+ * \param psir_r Array for psir_r values, records the distance from psi
+ * \param psir_lx Array for psir_lx values, records the force left in x
+ * \param psir_ly Array for psir_ly values, records the force left in y
+ * \param psir_lz Array for psir_lz values, records the force left in z
+ * \param psir_lxx Array for psir_lxx values, records the stress left in xx
+ * \param psir_lxy Array for psir_lxy values, records the stress left in xy
+ * \param psir_lxz Array for psir_lxz values, records the stress left
in xz
+ * \param psir_lyy Array for psir_lyy values, records the stress left in yy
+ * \param psir_lyz Array for psir_lyz values, records the stress left in yz
+ * \param psir_lzz Array for psir_lzz values, records the stress left in zz
+ *
+ * Indexing used by the kernel body: blockIdx.x selects a segment of
+ * psi_size_max entries, of which the first num_psir[blockIdx.x] are valid;
+ * threadIdx.x and blockIdx.y walk that segment with a stride of
+ * blockDim.x * gridDim.y. bxyz_g is not read by the body.
+ */
+
+__global__ void get_psi_force(double* ylmcoef,
+                              double delta_r_g,
+                              int bxyz_g,
+                              double nwmax_g,
+                              double* input_double,
+                              int* input_int,
+                              int* num_psir,
+                              int psi_size_max,
+                              int* ucell_atom_nwl,
+                              bool* atom_iw2_new,
+                              int* atom_iw2_ylm,
+                              int* atom_iw2_l,
+                              int* atom_nw,
+                              int nr_max,
+                              double* psi_u,
+                              double* psir_r,
+                              double* psir_lx,
+                              double* psir_ly,
+                              double* psir_lz,
+                              double* psir_lxx,
+                              double* psir_lxy,
+                              double* psir_lxz,
+                              double* psir_lyy,
+                              double* psir_lyz,
+                              double* psir_lzz)
+{
+    // Number of valid psi entries in the segment owned by this blockIdx.x
+    int size = num_psir[blockIdx.x];
+    int start_index = psi_size_max * blockIdx.x;
+    int end_index = start_index + size;
+    start_index += threadIdx.x + blockDim.x * blockIdx.y;
+    // Strided loop over the valid entries of the segment
+    for (int index = start_index; index < end_index;
+         index += blockDim.x * gridDim.y)
+    {
+        // Each entry owns five doubles: dr[0..2], a distance and vlbr3_value
+        double dr[3];
+        int index_double = index * 5;
+        dr[0] = input_double[index_double];
+        dr[1] = input_double[index_double + 1];
+        dr[2] = input_double[index_double + 2];
+        double distance = input_double[index_double + 3];
+        distance = distance * distance; // the helpers below receive the square
+        double vlbr3_value = input_double[index_double + 4];
+        // begin calculation
+        double ylma[49]; // 49 = 7 * 7 slots. Attention: at present only L=5
+                         // at most is used, which needs (L+1) * (L+1)=36
+        double grly[49][3];
+        int index_int = index * 2; // two ints per entry: it and dist_tmp
+        int it = input_int[index_int];
+        int dist_tmp = input_int[index_int + 1];
+
+        int nwl = ucell_atom_nwl[it];
+        spherical_harmonics_d(dr, distance, grly, nwl, ylma, ylmcoef);
+
+        interpolate_f(distance,
+                      delta_r_g,
+                      it,
+                      nwmax_g,
+                      nr_max,
+                      atom_nw,
+                      atom_iw2_new,
+                      psi_u,
+                      atom_iw2_l,
+                      atom_iw2_ylm,
+                      psir_r,
+                      dist_tmp,
+                      ylma,
+                      vlbr3_value,
+                      psir_lx,
+                      dr,
+                      grly,
+                      psir_ly,
+                      psir_lz,
+                      psir_lxx,
+                      psir_lxy,
+                      psir_lxz,
+                      psir_lyy,
+                      psir_lyz,
+                      psir_lzz);
+    }
+}
+
+
+/**
+ * \brief Compute dot product of stress components and partial derivatives.
+ *
+ * This CUDA kernel computes the dot product of stress components and partial
+ * derivatives based on the input arrays.
+ *
+ * \param psir_lxx Array of psir_lxx values.
+ * \param psir_lxy Array of psir_lxy values.
+ * \param psir_lxz Array of psir_lxz values.
+ * \param psir_lyy Array of psir_lyy values.
+ * \param psir_lyz Array of psir_lyz values.
+ * \param psir_lzz Array of psir_lzz values.
+ * \param psir_ylm_dm Array of psir_ylm_dm values.
+ * \param stress_dot Output array for the dot product of stress components.
+ * \param elements_num Number of elements in the input arrays.
+ */
+
+// Launch contract: blockDim.x must not exceed 256, the number of rows of the
+// shared-memory cache. Every block writes six partial sums; component c of
+// block b is stored at stress_dot[b + gridDim.x * c], so stress_dot needs
+// 6 * gridDim.x elements. Each product is multiplied by 2 before summation.
+__global__ void dot_product_stress(double* psir_lxx,
+                                   double* psir_lxy,
+                                   double* psir_lxz,
+                                   double* psir_lyy,
+                                   double* psir_lyz,
+                                   double* psir_lzz,
+                                   double* psir_ylm_dm,
+                                   double* stress_dot,
+                                   int elements_num)
+{
+    __shared__ double cache[256][6]; // one row per thread of the block
+    const unsigned int lane = threadIdx.x;
+    double partial[6] = {0.0};
+
+    // Per-thread partial sums over the elements this thread visits.
+    for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < elements_num;
+         tid += blockDim.x * gridDim.x)
+    {
+        partial[0] += psir_lxx[tid] * psir_ylm_dm[tid] * 2;
+        partial[1] += psir_lxy[tid] * psir_ylm_dm[tid] * 2;
+        partial[2] += psir_lxz[tid] * psir_ylm_dm[tid] * 2;
+        partial[3] += psir_lyy[tid] * psir_ylm_dm[tid] * 2;
+        partial[4] += psir_lyz[tid] * psir_ylm_dm[tid] * 2;
+        partial[5] += psir_lzz[tid] * psir_ylm_dm[tid] * 2;
+    }
+
+    for (int c = 0; c < 6; c++)
+    {
+        cache[lane][c] = partial[c];
+    }
+    __syncthreads();
+
+    // Tree reduction in shared memory. The stride starts from the smallest
+    // power of two >= blockDim.x and the partner index is bounds-checked, so
+    // no row is skipped when blockDim.x is not a power of two. For
+    // power-of-two block sizes the additions are the same as halving
+    // blockDim.x directly.
+    unsigned int stride = 1;
+    while (stride < blockDim.x)
+    {
+        stride <<= 1;
+    }
+    for (stride >>= 1; stride != 0; stride >>= 1)
+    {
+        if (lane < stride && lane + stride < blockDim.x)
+        {
+            for (int c = 0; c < 6; c++)
+            {
+                cache[lane][c] += cache[lane + stride][c];
+            }
+        }
+        // Reached by every thread of the block, outside the branch above.
+        __syncthreads();
+    }
+
+    if (lane == 0)
+    {
+        for (int c = 0; c < 6; c++)
+        {
+            stress_dot[blockIdx.x + gridDim.x * c] = cache[0][c];
+        }
+    }
+}
+
+/**
+ * @brief Calculate the dot product force.
+ *
+ * This function calculates the dot product force based on the provided
+ * parameters.
+ *
+ * @param psir_lx Pointer to the array of psir_lx values.
+ * @param psir_ly Pointer to the array of psir_ly values.
+ * @param psir_lz Pointer to the array of psir_lz values.
+ * @param psir_ylm_dm Pointer to the array of psir_ylm_dm values.
+ * @param force_dot Pointer to the array where the calculated force will be
+ * stored.
+ * @param iat Pointer to the array of iat values.
+ * @param nwmax Maximum value for nwmax.
+ * @param max_size Maximum size for arrays.
+ * @param elements_num Number of elements to process.
+ */
+
+__global__ void dot_product_force(double* psir_lx,
+                                  double* psir_ly,
+                                  double* psir_lz,
+                                  double* psir_ylm_dm,
+                                  double* force_dot,
+                                  int* iat,
+                                  int nwmax,
+                                  int max_size,
+                                  int elements_num)
+{
+    // One element per thread per pass; max_size is not read by the body.
+    for (int tid = threadIdx.x + blockIdx.x * blockDim.x; tid < elements_num;
+         tid += blockDim.x * gridDim.x)
+    {
+        // Negative entries mark slots without an atom; nothing is written.
+        if (iat[tid] <= -1)
+        {
+            continue;
+        }
+
+        const int row = tid * nwmax; // first of the nwmax values of this slot
+        double acc[3] = {0.0};
+        for (int iw = 0; iw < nwmax; iw++)
+        {
+            acc[0] += psir_lx[row + iw] * psir_ylm_dm[row + iw] * 2;
+            acc[1] += psir_ly[row + iw] * psir_ylm_dm[row + iw] * 2;
+            acc[2] += psir_lz[row + iw] * psir_ylm_dm[row + iw] * 2;
+        }
+
+        // x, y, z components are stored contiguously per slot.
+        const int out = tid * 3;
+        for (int axis = 0; axis < 3; axis++)
+        {
+            force_dot[out + axis] = acc[axis];
+        }
+    }
+}
+/**
+ * @brief Allocate and initialise the buffers shared by all streams.
+ *
+ * Allocates the lgd x lgd host density matrix, fills it through allocateDm(),
+ * uploads it to the device, and allocates zero-filled device buffers for the
+ * per-stream stress partial sums (6 * cuda_block doubles per stream), the
+ * per-stream forces (3 * atom_num_grid doubles per stream) and the per-stream
+ * atom indices (atom_num_grid ints per stream).
+ *
+ * The buffers are not released here; ownership passes to the caller.
+ */
+void calculateInit(DensityMat& denstiy_mat,
+                   ForceStressIatGlobal& f_s_iat_dev,
+                   hamilt::HContainer* dm,
+                   const Grid_Technique& gridt,
+                   const UnitCell& ucell,
+                   int lgd,
+                   int cuda_block,
+                   int atom_num_grid)
+{
+    // Element counts are computed in size_t: lgd * lgd overflows int once
+    // lgd exceeds 46340.
+    const size_t dm_count = static_cast<size_t>(lgd) * static_cast<size_t>(lgd);
+    const size_t nstreams = static_cast<size_t>(gridt.nstreams);
+    const size_t stress_count = 6 * static_cast<size_t>(cuda_block) * nstreams;
+    const size_t force_count
+        = 3 * static_cast<size_t>(atom_num_grid) * nstreams;
+    const size_t iat_count = static_cast<size_t>(atom_num_grid) * nstreams;
+
+    denstiy_mat.density_mat_h = new double[dm_count];
+    allocateDm(denstiy_mat.density_mat_h, dm, gridt, ucell);
+
+    checkCuda(cudaMalloc((void**)&denstiy_mat.density_mat_d,
+                         dm_count * sizeof(double)));
+    checkCuda(cudaMemcpy(denstiy_mat.density_mat_d,
+                         denstiy_mat.density_mat_h,
+                         dm_count * sizeof(double),
+                         cudaMemcpyHostToDevice));
+
+    checkCuda(cudaMalloc((void**)&f_s_iat_dev.stress_global,
+                         stress_count * sizeof(double)));
+    checkCuda(cudaMemset(f_s_iat_dev.stress_global,
+                         0,
+                         stress_count * sizeof(double)));
+
+    checkCuda(cudaMalloc((void**)&f_s_iat_dev.force_global,
+                         force_count * sizeof(double)));
+    checkCuda(cudaMemset(f_s_iat_dev.force_global,
+                         0,
+                         force_count * sizeof(double)));
+
+    checkCuda(cudaMalloc((void**)&f_s_iat_dev.iat_global,
+                         iat_count * sizeof(int)));
+    checkCuda(cudaMemset(f_s_iat_dev.iat_global,
+                         0,
+                         iat_count * sizeof(int)));
+}
+
+/**
+ * @brief Point a SGridParameter at the slices of one stream.
+ *
+ * Picks the stream as iter_num % gridt.nstreams and stores, for every host
+ * and device array kept in gridt, the address of the slice belonging to that
+ * stream. Nothing is allocated or copied.
+ *
+ * @param para parameter set to fill in
+ * @param iter_num iteration counter, used to choose the stream
+ * @param nbz int, stands for the number of Z-axis
+ * @param gridt Grid_Technique, stores the major methods of the gint.
+ */
+void para_init(SGridParameter& para,
+               int iter_num,
+               int nbz,
+               const Grid_Technique& gridt)
+{
+    para.stream_num = iter_num % gridt.nstreams;
+
+    // Slice offsets of this stream inside the global arrays.
+    const auto psi_off = gridt.psi_size_max * para.stream_num;
+    const auto nbz_off = nbz * para.stream_num;
+    const auto pair_off = gridt.atom_pair_nbz * para.stream_num;
+    const auto psir_off = gridt.psir_size * para.stream_num;
+
+    // Host-side inputs of the spherical-harmonics kernel.
+    para.input_dou = &gridt.psi_dbl_gbl[psi_off * 5];
+    para.input_int = &gridt.psi_int_gbl[psi_off * 2];
+    para.num_psir = &gridt.num_psir_gbl[nbz_off];
+    // Host-side sizes and leading dimensions of the batched matrix products.
+    para.atom_pair_A_m = &gridt.l_info_global[pair_off];
+    para.atom_pair_B_n = &gridt.r_info_global[pair_off];
+    para.atom_pair_K = &gridt.k_info_global[pair_off];
+    para.atom_pair_lda = &gridt.lda_info_global[pair_off];
+    para.atom_pair_ldb = &gridt.ldb_info_global[pair_off];
+    para.atom_pair_ldc = &gridt.ldc_info_global[pair_off];
+    // Device-side inputs of the spherical-harmonics kernel.
+    para.input_double_g = &gridt.psi_dbl_gbl_g[psi_off * 5];
+    para.input_int_g = &gridt.psi_int_gbl_g[psi_off * 2];
+    para.num_psir_g = &gridt.num_psir_gbl_g[nbz_off];
+    para.psir_dm_device = &gridt.dm_global_g[psir_off];
+    para.psir_r_device = &gridt.right_global_g[psir_off];
+    // Device buffers for the force (x, y, z) and stress (six components).
+    para.psir_lx_device = &gridt.d_left_x_g[psir_off];
+    para.psir_ly_device = &gridt.d_left_y_g[psir_off];
+    para.psir_lz_device = &gridt.d_left_z_g[psir_off];
+    para.psir_lxx_device = &gridt.dd_left_xx_g[psir_off];
+    para.psir_lxy_device = &gridt.dd_left_xy_g[psir_off];
+    para.psir_lxz_device = &gridt.dd_left_xz_g[psir_off];
+    para.psir_lyy_device = &gridt.dd_left_yy_g[psir_off];
+    para.psir_lyz_device = &gridt.dd_left_yz_g[psir_off];
+    para.psir_lzz_device = &gridt.dd_left_zz_g[psir_off];
+    // Device-side sizes and leading dimensions of the batched products.
+    para.A_m_device = &gridt.l_info_global_g[pair_off];
+    para.B_n_device = &gridt.r_info_global_g[pair_off];
+    para.K_device = &gridt.k_info_global_g[pair_off];
+    para.lda_device = &gridt.lda_info_gbl_g[pair_off];
+    para.ldb_device = &gridt.ldb_info_gbl_g[pair_off];
+    para.ldc_device = &gridt.ldc_info_gbl_g[pair_off];
+    // Pointer tables of the batched products, host and device copies.
+    para.matrix_A = &gridt.ap_left_gbl[pair_off];
+    para.matrix_B = &gridt.ap_right_gbl[pair_off];
+    para.matrix_C = &gridt.ap_output_gbl[pair_off];
+    para.matrix_A_device = &gridt.ap_left_gbl_g[pair_off];
+    para.matrix_B_device = &gridt.ap_right_gbl_g[pair_off];
+    para.matrix_C_device = &gridt.ap_output_gbl_g[pair_off];
+}
+/**
+ * @brief ForceStressIat on host and device Init
+ *
+ * GridParameter init
+ *
+ * @param ForceStressIat ForceStressIat,contains the Force Stress
Iat on Host
+ * @param stream_num int, index of the GPU stream whose slices are used
+ * @param cuda_block in stress compute, used for Block nums
+ * @param atom_num_grid in force calculate, used for Block nums
+ * @param max_size Maximum size of atoms on a grid.
+ * @param ForceStressIatGlobal ForceStressIatGlobal, holds the global force,
+ * stress and iat buffers from which the per-stream device slices are taken
+ */
+void cal_init(ForceStressIat& f_s_iat,
+              const int stream_num,
+              const int cuda_block,
+              const int atom_num_grid,
+              const int max_size,
+              const ForceStressIatGlobal& f_s_iat_dev)
+{
+    // Host staging buffers; they are handed to the caller, not freed here.
+    f_s_iat.stress_host = new double[6 * cuda_block];
+    f_s_iat.iat_host = new int[atom_num_grid];
+    f_s_iat.force_host = new double[3 * atom_num_grid];
+
+    // Every slot starts with a negative atom index (-max_size - 1).
+    const int iat_min = -max_size - 1;
+    for (int slot = 0; slot != atom_num_grid; ++slot)
+    {
+        f_s_iat.iat_host[slot] = iat_min;
+    }
+    ModuleBase::GlobalFunc::ZEROS(f_s_iat.force_host,
+                                  3 * atom_num_grid);
+
+    // Device views: the slice of each global buffer owned by stream_num.
+    f_s_iat.stress_device
+        = &f_s_iat_dev.stress_global[6 * cuda_block * stream_num];
+    f_s_iat.force_device
+        = &f_s_iat_dev.force_global[3 * atom_num_grid * stream_num];
+    f_s_iat.iat_device
+        = &f_s_iat_dev.iat_global[atom_num_grid * stream_num];
+}
+
+/**
+ * @brief GridParameter memCpy,from Host to Device
+ *
+ * parameter init,which contains the gpu task and multi matrix multiplication
+ *
+ * @param para Grid parameter in task generator,
+ * @param gridt Grid_Technique,stored the major method in the the gint.
+ * @param nbz int,stand for the number of Z-axis
+ * @param atom_num_grid in force calculate,used for Block nums
+ */
+void para_mem_copy(SGridParameter& para,
+                   const Grid_Technique& gridt,
+                   const int nbz,
+                   const int atom_num_grid)
+{
+    // All transfers and fills are enqueued on the stream chosen for `para`;
+    // atom_num_grid is not used by this function.
+    const cudaStream_t stream = gridt.streams[para.stream_num];
+
+    // Asynchronous host-to-device copy on `stream`.
+    auto upload = [&](void* dst, const void* src, size_t bytes) {
+        checkCuda(
+            cudaMemcpyAsync(dst, src, bytes, cudaMemcpyHostToDevice, stream));
+    };
+
+    // Inputs of the psi kernel.
+    upload(para.input_double_g,
+           para.input_dou,
+           gridt.psi_size_max * 5 * sizeof(double));
+    upload(para.input_int_g,
+           para.input_int,
+           gridt.psi_size_max * 2 * sizeof(int));
+    upload(para.num_psir_g, para.num_psir, nbz * sizeof(int));
+
+    // Sizes and leading dimensions of the batched matrix products.
+    const size_t info_bytes = gridt.atom_pair_nbz * sizeof(int);
+    upload(para.A_m_device, para.atom_pair_A_m, info_bytes);
+    upload(para.B_n_device, para.atom_pair_B_n, info_bytes);
+    upload(para.K_device, para.atom_pair_K, info_bytes);
+    upload(para.lda_device, para.atom_pair_lda, info_bytes);
+    upload(para.ldb_device, para.atom_pair_ldb, info_bytes);
+    upload(para.ldc_device, para.atom_pair_ldc, info_bytes);
+
+    // Pointer tables of the batched matrix products.
+    const size_t table_bytes = gridt.atom_pair_nbz * sizeof(double*);
+    upload(para.matrix_A_device, para.matrix_A, table_bytes);
+    upload(para.matrix_B_device, para.matrix_B, table_bytes);
+    upload(para.matrix_C_device, para.matrix_C, table_bytes);
+
+    // Clear every psir work buffer of this stream, in the original order.
+    void* const work_buffers[] = {para.psir_dm_device,
+                                  para.psir_r_device,
+                                  para.psir_lx_device,
+                                  para.psir_ly_device,
+                                  para.psir_lz_device,
+                                  para.psir_lxx_device,
+                                  para.psir_lxy_device,
+                                  para.psir_lxz_device,
+                                  para.psir_lyy_device,
+                                  para.psir_lyz_device,
+                                  para.psir_lzz_device};
+    const size_t psir_bytes = gridt.psir_size * sizeof(double);
+    for (void* buffer : work_buffers)
+    {
+        checkCuda(cudaMemsetAsync(buffer, 0, psir_bytes, stream));
+    }
+}
+/**
+ * @brief Force Stress Force Iat memCpy,from Host to Device
+ *
+ * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Device
+ * and Host
+ * @param gridt Grid_Technique,stored the major method in the the gint.
+ * @param atom_num_grid in force calculate,used for Block nums + * @param cuda_block in stress compute,used for Block nums + * @param stream_num int , record the stream in GPU + */ +void cal_mem_cpy(ForceStressIat& f_s_iat, + const Grid_Technique& gridt, + const int atom_num_grid, + const int cuda_block, + const int stream_num) +{ + checkCuda(cudaMemcpyAsync(f_s_iat.iat_device, + f_s_iat.iat_host, + atom_num_grid * sizeof(int), + cudaMemcpyHostToDevice, + gridt.streams[stream_num])); + checkCuda(cudaMemsetAsync(f_s_iat.stress_device, + 0, + 6 * cuda_block * sizeof(double), + gridt.streams[stream_num])); + checkCuda(cudaMemsetAsync(f_s_iat.force_device, + 0, + 3 * atom_num_grid * sizeof(double), + gridt.streams[stream_num])); +} +/* + * @brief Force Calculate on Host + * + * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Device + * and Host + * @param force stored the force for each atom on each directions + * @param atom_num_grid in force calculate,used for Block nums + */ +void cal_force_add(ForceStressIat& f_s_iat, + double* force, + const int atom_num_grid) +{ + checkCuda(cudaMemcpy(f_s_iat.force_host, + f_s_iat.force_device, + 3 * atom_num_grid * sizeof(double), + cudaMemcpyDeviceToHost)); + for (int index1 = 0; index1 < atom_num_grid; index1++) + { + int iat1 = f_s_iat.iat_host[index1]; + if (iat1 >= 0) + { + for (int index2 = 0; index2 < 3; index2++) + { + force[iat1 * 3 + index2] + += f_s_iat.force_host[index1 * 3 + index2]; + } + } + } +} +/** + * @brief Stress Calculate on Host + * + * @param ForceStressIat ForceStressIat,contains the Force Stree Iat on Device + * and Host + * @param stress stored the stress for each directions + * @param cuda_block in stress compute,used for Block nums + */ +void cal_stress_add(ForceStressIat& f_s_iat, + double* stress, + const int cuda_block) +{ + checkCuda(cudaMemcpy(f_s_iat.stress_host, + f_s_iat.stress_device, + 6 * cuda_block * sizeof(double), + cudaMemcpyDeviceToHost)); + for (int i = 0; i < 
6; i++) + { + for (int index = 0; index < cuda_block; index++) + { + // printf("the stress is %f\n",stress[i]); + stress[i] += f_s_iat.stress_host[i * cuda_block + index]; + } + } +} +} // namespace GintKernel diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh new file mode 100644 index 0000000000..ee199f90bf --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh @@ -0,0 +1,120 @@ +#ifndef GINT_FORCE_CUH +#define GINT_FORCE_CUH + +#include <cuda_runtime.h> +namespace GintKernel +{ + +/** + * @brief GPU kernel to calculate the force. + * + * This kernel calculates the force based on provided input parameters. + * + * @param ylmcoef Coefficients for Ylm. + * @param delta_r_g Delta r value. + * @param bxyz_g Bxyz values. + * @param nwmax_g Maximum nw value. + * @param input_double Array of input double values. + * @param input_int Array of input int values. + * @param num_psir Array representing the number of psir. + * @param psi_size_max Maximum size of psi. + * @param ucell_atom_nwl Array representing the unit cell atom nwl. + * @param atom_iw2_new Array representing the atom iw2 new. + * @param atom_iw2_ylm Array representing the atom iw2 ylm. + * @param atom_iw2_l Array representing the atom iw2 l. + * @param atom_nw Array representing the atom nw. + * @param nr_max Maximum nr value. + * @param psi_u Array representing psi_u values. + * @param psir_r Array representing psir ylm right values. + * @param psir_lx Array representing dpsir ylm left x values. + * @param psir_ly Array representing dpsir ylm left y values. + * @param psir_lz Array representing dpsir ylm left z values. + * @param psir_lxx Array representing ddpsir ylm left xx values. + * @param psir_lxy Array representing ddpsir ylm left xy values. + * @param psir_lxz Array representing ddpsir ylm left xz values. + * @param psir_lyy Array representing ddpsir ylm left yy values. 
+ * @param psir_lyz Array representing ddpsir ylm left yz values. + * @param psir_lzz Array representing ddpsir ylm left zz values. + */ +__global__ void get_psi_force(double* ylmcoef, + double delta_r_g, + int bxyz_g, + double nwmax_g, + double* input_double, + int* input_int, + int* num_psir, + int psi_size_max, + int* ucell_atom_nwl, + bool* atom_iw2_new, + int* atom_iw2_ylm, + int* atom_iw2_l, + int* atom_nw, + int nr_max, + double* psi_u, + double* psir_r, + double* psir_lx, + double* psir_ly, + double* psir_lz, + double* psir_lxx, + double* psir_lxy, + double* psir_lxz, + double* psir_lyy, + double* psir_lyz, + double* psir_lzz); + + + +/** + * @brief GPU kernel to calculate the dot product for stress. + * + * This kernel calculates the dot product for stress based on provided input + * parameters. + * + * @param psir_lxx Array representing ddpsir ylm left xx values. + * @param psir_lxy Array representing ddpsir ylm left xy values. + * @param psir_lxz Array representing ddpsir ylm left xz values. + * @param psir_lyy Array representing ddpsir ylm left yy values. + * @param psir_lyz Array representing ddpsir ylm left yz values. + * @param psir_lzz Array representing ddpsir ylm left zz values. + * @param psir_ylm_dm Array representing psir ylm dm values. + * @param stress_dot Array representing stress dot values. + * @param elements_num Number of elements. + */ +__global__ void dot_product_stress(double* psir_lxx, + double* psir_lxy, + double* psir_lxz, + double* psir_lyy, + double* psir_lyz, + double* psir_lzz, + double* psir_ylm_dm, + double* stress_dot, + int elements_num); + +/** + * @brief GPU kernel to calculate the dot product for force. + * + * This kernel calculates the dot product for force based on provided input + * parameters. + * + * @param psir_lx Array representing dpsir ylm left x values. + * @param psir_ly Array representing dpsir ylm left y values. + * @param psir_lz Array representing dpsir ylm left z values. 
+ * @param psir_ylm_dm Array representing psir ylm dm values. + * @param force_dot Array representing force dot values. + * @param iat Array representing iat values. + * @param nwmax Maximum nw value. + * @param max_size Maximum size value. + * @param elements_num Number of elements. + */ +__global__ void dot_product_force(double* psir_lx, + double* psir_ly, + double* psir_lz, + double* psir_ylm_dm, + double* force_dot, + int* iat, + int nwmax, + int max_size, + int elements_num); + +} // namespace GintKernel +#endif // GINT_FORCE_CUH diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cu new file mode 100644 index 0000000000..d7dcf4e6d7 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cu @@ -0,0 +1,84 @@ +#include "interp.cuh" +#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh" +#include "sph.cuh" + +namespace GintKernel +{ + +__global__ void get_psi(const double* const ylmcoef, + double delta_r_g, + int bxyz_g, + double nwmax_g, + const double* const input_double, + const int* const input_int, + const int* const num_psir, + int psi_size_max, + const int* const ucell_atom_nwl, + const bool* const atom_iw2_new, + const int* const atom_iw2_ylm, + const int* const atom_nw, + int nr_max, + const double* const psi_u, + double* psir_ylm) +{ + int size = num_psir[blockIdx.x]; + int start_index = psi_size_max * blockIdx.x; + int end_index = start_index + size; + start_index += threadIdx.x + blockDim.x * blockIdx.y; + for (int index = start_index; index < end_index; + index += blockDim.x * gridDim.y) + { + double dr[3]; + int index_double = index * 5; + dr[0] = input_double[index_double]; + dr[1] = input_double[index_double + 1]; + dr[2] = input_double[index_double + 2]; + double distance = input_double[index_double + 3]; + double ylma[49]; + int index_int = index * 2; + int it = input_int[index_int]; + int dist_tmp = input_int[index_int + 1]; 
+ int nwl = ucell_atom_nwl[it]; + + spherical_harmonics(dr, distance, nwl, ylma, ylmcoef); + + interpolate(distance, + delta_r_g, + it, + nwmax_g, + nr_max, + atom_nw, + atom_iw2_new, + psi_u, + ylma, + atom_iw2_ylm, + psir_ylm, + dist_tmp, + 1); + } +} + +__global__ void psir_dot(const int* n, + double** vec_l_g, + int incl, + double** vec_r_g, + int incr, + double** results_g, + int batchcount) +{ + int id = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = id; i < batchcount; i += stride) + { + double* sum = results_g[i]; + double* x = vec_l_g[i]; + double* y = vec_r_g[i]; + + for (int j = 0; j < n[i]; j++) + { + sum[0] += x[j * incl] * y[j * incr]; + } + } +} + +} // namespace GintKernel \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh new file mode 100644 index 0000000000..958b598954 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh @@ -0,0 +1,66 @@ +#ifndef GINT_RHO_CUH +#define GINT_RHO_CUH + +#include <cuda_runtime.h> +namespace GintKernel +{ + +/** + * @brief CUDA kernel to calculate psir. + * + * This kernel calculates the wave function psir using the provided input + * parameters. + * + * @param ylmcoef pointer to the array of Ylm coefficients. + * @param delta_r_g value of delta_r_g. + * @param bxyz_g number of meshcells in a bigcell. + * @param nwmax_g maximum nw. + * @param input_double `double` type datas used to calculate psir. + * @param input_int `int` type datas used to calculate psir. + * @param num_psir number of atoms on each bigcell. + * @param psi_size_max maximum number of atoms on bigcell. + * @param ucell_atom_nwl nw of each type of atom. + * @param atom_iw2_new + * @param atom_iw2_ylm + * @param atom_nw pointer to the array of atom_nw values. 
+ * @param nr_max + * @param psi_u + * @param psir_ylm + */ +__global__ void get_psi(const double* const ylmcoef, + double delta_r_g, + int bxyz_g, + double nwmax_g, + const double* const input_double, + const int* const input_int, + const int* const num_psir, + int psi_size_max, + const int* const ucell_atom_nwl, + const bool* const atom_iw2_new, + const int* const atom_iw2_ylm, + const int* const atom_nw, + int nr_max, + const double* const psi_u, + double* psir_ylm); + +/** + * @brief Kernel function to calculate batch vector dot products. + * + * @param n vector length. + * @param vec_l_g pointers to left vec. + * @param incl stride between consecutive elements in the `vec_l_g`. + * @param vec_r_g pointers to right vec. + * @param incr stride between consecutive elements in the `vec_r_g`. + * @param results_g dot product results. + * @param batchcount total count of dot products to compute. + */ +__global__ void psir_dot(const int* n, + double** vec_l_g, + int incl, + double** vec_r_g, + int incr, + double** results_g, + int batchcount); + +} // namespace GintKernel +#endif // GINT_RHO_CUH \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cu new file mode 100644 index 0000000000..62edcc7f42 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cu @@ -0,0 +1,68 @@ +#include "gint_vl.cuh" +#include "interp.cuh" +#include "module_hamilt_lcao/module_gint/kernels/cuda/cuda_tools.cuh" +#include "sph.cuh" +namespace GintKernel +{ + +__global__ void get_psi_and_vldr3(double* ylmcoef, + double delta_r_g, + int bxyz_g, + double nwmax_g, + double* input_double, + int* input_int, + int* num_psir, + int psi_size_max, + int* ucell_atom_nwl, + bool* atom_iw2_new, + int* atom_iw2_ylm, + int* atom_nw, + int nr_max, + double* psi_u, + double* psir_ylm_left, + double* psir_r) +{ + int size = num_psir[blockIdx.x]; + int start_index = 
psi_size_max * blockIdx.x; + int end_index = start_index + size; + start_index += threadIdx.x + blockDim.x * blockIdx.y; + for (int index = start_index; index < end_index; + index += blockDim.x * gridDim.y) + { + double dr[3]; + int index_double = index * 5; + dr[0] = input_double[index_double]; + dr[1] = input_double[index_double + 1]; + dr[2] = input_double[index_double + 2]; + double distance = input_double[index_double + 3]; + double vlbr3_value = input_double[index_double + 4]; + double ylma[49]; + int index_int = index * 2; + int it = input_int[index_int]; + int dist_tmp = input_int[index_int + 1]; + int nwl = ucell_atom_nwl[it]; + spherical_harmonics(dr, distance, nwl, ylma, ylmcoef); + + interpolate(distance, + delta_r_g, + it, + nwmax_g, + nr_max, + atom_nw, + atom_iw2_new, + psi_u, + ylma, + atom_iw2_ylm, + psir_ylm_left, + dist_tmp, + bxyz_g); + + for (int iw = 0; iw < atom_nw[it]; ++iw) + { + psir_r[dist_tmp] = psir_ylm_left[dist_tmp] * vlbr3_value; + dist_tmp += bxyz_g; + } + } +} + +} // namespace GintKernel \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cuh new file mode 100644 index 0000000000..afac41d36b --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cuh @@ -0,0 +1,37 @@ +#ifndef GINT_VL_CUH +#define GINT_VL_CUH + +#include <cuda_runtime.h> +namespace GintKernel +{ +/* + * @brief: compute psi (left matrix elements) and psi scaled by the fifth + * input_double value (right matrix elements) on the grid points + * + * @note the left and right matrix elements of the grid point integral. + * We can understand the grid point integral of the local potential term + * as the following operation: + * H = psi * vlocal * psi * dr^3. + * Here, the matrix element of the left matrix is psi, and the matrix + * element of the right matrix is vlocal * psi * dr^3. 
+ */ + +__global__ void get_psi_and_vldr3(double* ylmcoef, + double delta_r_g, + int bxyz_g, + double nwmax_g, + double* input_double, + int* input_int, + int* num_psir, + int psi_size_max, + int* ucell_atom_nwl, + bool* atom_iw2_new, + int* atom_iw2_ylm, + int* atom_nw, + int nr_max, + double* psi_u, + double* psir_ylm_left, + double* psir_r); + +} // namespace GintKernel +#endif // GINT_VL_CUH \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh new file mode 100644 index 0000000000..042254e04d --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh @@ -0,0 +1,144 @@ +#ifndef INTERP_CUH +#define INTERP_CUH + +#include <cuda_runtime.h> + +namespace GintKernel +{ +static __device__ void interpolate(double distance, + double delta_r_g, + int it, + double nwmax_g, + int nr_max, + const int* const atom_nw, + const bool* const atom_iw2_new, + const double* const psi_u, + const double ylma[49], + const int* const atom_iw2_ylm, + double* psir_ylm_left, + int dist_tmp, + int stride) +{ + distance /= delta_r_g; + + int ip = (int)(distance); + double dx = distance - ip; + double dx2 = dx * dx; + double dx3 = dx2 * dx; + + double c3 = 3.0 * dx2 - 2.0 * dx3; + double c1 = 1.0 - c3; + double c2 = (dx - 2.0 * dx2 + dx3) * delta_r_g; + double c4 = (dx3 - dx2) * delta_r_g; + + double phi = 0.0; + int it_nw = it * nwmax_g; + int iw_nr = (it_nw * nr_max + ip) * 2; + int it_nw_iw = it_nw; + for (int iw = 0; iw < atom_nw[it]; ++iw) + { + if (atom_iw2_new[it_nw_iw]) + { + phi = c1 * psi_u[iw_nr] + c2 * psi_u[iw_nr + 1] + + c3 * psi_u[iw_nr + 2] + c4 * psi_u[iw_nr + 3]; + } + psir_ylm_left[dist_tmp] = phi * ylma[atom_iw2_ylm[it_nw_iw]]; + dist_tmp += stride; + iw_nr += 2 * nr_max; + it_nw_iw++; + } +} + +static __device__ void interpolate_f(double distance, + double delta_r_g, + int it, + double nwmax_g, + int nr_max, + const int* const atom_nw, + const bool* const 
atom_iw2_new, + const double* const psi_u, + const int* const atom_iw2_l, + const int* const atom_iw2_ylm, + double* psir_r, + int dist_tmp, + const double ylma[49], + double vlbr3_value, + double* psir_lx, + const double dr[3], + const double grly[49][3], + double* psir_ly, + double* psir_lz, + double* psir_lxx, + double* psir_lxy, + double* psir_lxz, + double* psir_lyy, + double* psir_lyz, + double* psir_lzz) +{ + // Calculate normalized position for interpolation + distance = sqrt(distance); + const double postion = distance / delta_r_g; + // Extract integer part and fractional part of the position + const double ip = static_cast<int>(postion); + const double x0 = postion - ip; + const double x1 = 1.0 - x0; + const double x2 = 2.0 - x0; + const double x3 = 3.0 - x0; + const double x12 = x1 * x2 / 6; + const double x03 = x0 * x3 / 2; + // Interpolated value and derivative; zero-initialised (like phi in + // interpolate) because they are carried over to orbitals whose + // atom_iw2_new flag is false + double tmp = 0.0, dtmp = 0.0; + // Loop over the atom_nw[it] orbitals of atom type it + int it_nw = it * nwmax_g; + int iw_nr = (it_nw * nr_max + ip) * 2; + int it_nw_iw = it_nw; + for (int iw = 0; iw < atom_nw[it]; ++iw) + { + if (atom_iw2_new[it_nw_iw]) + { + // Cubic interpolation with 4-point Lagrange weights; even + // psi_u offsets feed tmp, odd offsets feed dtmp + tmp = x12 * (psi_u[iw_nr] * x3 + psi_u[iw_nr + 6] * x0) + + x03 * (psi_u[iw_nr + 2] * x2 - psi_u[iw_nr + 4] * x1); + dtmp = x12 * (psi_u[iw_nr + 1] * x3 + psi_u[iw_nr + 7] * x0) + + x03 * (psi_u[iw_nr + 3] * x2 - psi_u[iw_nr + 5] * x1); + } + // Extract information from atom_iw2_* arrays + const int ll = atom_iw2_l[it_nw_iw]; + + const int idx_lm = atom_iw2_ylm[it_nw_iw]; + + const double rl = pow(distance, ll); + + // Compute right-hand side of the equation + psir_r[dist_tmp] = tmp * ylma[idx_lm] / rl * vlbr3_value; + // Compute derivatives with respect to spatial + // coordinates + const double tmpdphi_rly + = (dtmp - tmp * ll / distance) / rl * ylma[idx_lm] / distance; + const double tmprl = tmp / rl; + psir_lx[dist_tmp] + = tmpdphi_rly * dr[0] + tmprl * grly[idx_lm][0]; + + 
psir_ly[dist_tmp] + = tmpdphi_rly * dr[1] + tmprl * grly[idx_lm][1]; + psir_lz[dist_tmp] + = tmpdphi_rly * dr[2] + tmprl * grly[idx_lm][2]; + + psir_lxx[dist_tmp] = psir_lx[dist_tmp] * dr[0]; + psir_lxy[dist_tmp] = psir_lx[dist_tmp] * dr[1]; + psir_lxz[dist_tmp] = psir_lx[dist_tmp] * dr[2]; + psir_lyy[dist_tmp] = psir_ly[dist_tmp] * dr[1]; + psir_lyz[dist_tmp] = psir_ly[dist_tmp] * dr[2]; + psir_lzz[dist_tmp] = psir_lz[dist_tmp] * dr[2]; + + // Update loop counters and indices + dist_tmp += 1; + iw_nr += nr_max; + iw_nr += nr_max; + it_nw_iw++; + } +} +} // namespace GintKernel + +#endif \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/sph.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/sph.cuh new file mode 100644 index 0000000000..0c05e24bfe --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/sph.cuh @@ -0,0 +1,520 @@ +#ifndef SPH_CUH +#define SPH_CUH + +#include "cuda_runtime.h" +#include "device_launch_parameters.h" + +namespace GintKernel +{ + +static __device__ void spherical_harmonics(const double* const dr, + double distance, + int nwl, + double (&ylma)[49], + const double* const ylmcoef) +{ + /*************************** + L = 0 + ***************************/ + ylma[0] = ylmcoef[0]; // l=0, m=0 + double tmp0; + if (nwl == 0) + return; + + /*************************** + L = 1 + ***************************/ + ylma[1] = ylmcoef[1] * dr[2]; // l=1, m=0 + ylma[2] = -ylmcoef[1] * dr[0]; // l=1, m=1 + ylma[3] = -ylmcoef[1] * dr[1]; // l=1, m=-1 + if (nwl == 1) + return; + + /*************************** + L = 2 + ***************************/ + tmp0=ylmcoef[3] * ylma[0]; + ylma[4] = ylmcoef[2] * dr[2] * ylma[1] - tmp0 ; // l=2, m=0 + tmp0 = ylmcoef[4] * dr[2]; + ylma[5] = tmp0 * ylma[2]; // l=2,m=1 + ylma[6] = tmp0 * ylma[3]; // l=2,m=-1 + + tmp0 = ylmcoef[4] * dr[0]; + ylma[7] = ylmcoef[5] * ylma[4] - ylmcoef[6] * ylma[0] + - tmp0 * ylma[2]; // l=2,m=2 + ylma[8] = -tmp0 * ylma[3]; + if (nwl == 
2) + return; + + /*************************** + L = 3 + ***************************/ + tmp0=ylmcoef[8] * ylma[1]; + ylma[9] = ylmcoef[7] * dr[2] * ylma[4] - tmp0; // l=3, m=0 + + tmp0 = ylmcoef[9] * dr[2]; + ylma[10] = tmp0 * ylma[5] - ylmcoef[10] * ylma[2]; // l=3,m=1 + ylma[11] = tmp0 * ylma[6] - ylmcoef[10] * ylma[3]; // l=3,m=-1 + + tmp0 = ylmcoef[11] * dr[2]; + ylma[12] = tmp0 * ylma[7]; // l=3,m=2 + ylma[13] = tmp0 * ylma[8]; // l=3,m=-2 + + tmp0 = ylmcoef[14] * dr[0]; + ylma[14] = ylmcoef[12] * ylma[10] - ylmcoef[13] * ylma[2] + - tmp0 * ylma[7]; // l=3,m=3 + ylma[15] = ylmcoef[12] * ylma[11] - ylmcoef[13] * ylma[3] + - tmp0 * ylma[8]; // l=3,m=-3 + if (nwl == 3) + return; + + /*************************** + L = 4 + ***************************/ + tmp0=ylmcoef[16] * ylma[4]; + ylma[16] = ylmcoef[15] * dr[2] * ylma[9] - tmp0; // l=4,m=0 + + tmp0 = ylmcoef[17] * dr[2]; + ylma[17] = tmp0 * ylma[10] - ylmcoef[18] * ylma[5]; // l=4,m=1 + ylma[18] = tmp0 * ylma[11] - ylmcoef[18] * ylma[6]; // l=4,m=-1 + + tmp0 = ylmcoef[19] * dr[2]; + ylma[19] = tmp0 * ylma[12] - ylmcoef[20] * ylma[7]; // l=4,m=2 + ylma[20] = tmp0 * ylma[13] - ylmcoef[20] * ylma[8]; // l=4,m=-2 + + tmp0 = 3.0 * dr[2]; + ylma[21] = tmp0 * ylma[14]; // l=4,m=3 + ylma[22] = tmp0 * ylma[15]; // l=4,m=-3 + + tmp0 = ylmcoef[23] * dr[0]; + ylma[23] = ylmcoef[21] * ylma[19] - ylmcoef[22] * ylma[7] + - tmp0 * ylma[14]; // l=4,m=4 + ylma[24] = ylmcoef[21] * ylma[20] - ylmcoef[22] * ylma[8] + - tmp0 * ylma[15]; // l=4,m=-4 + if (nwl == 4) + return; + + /*************************** + L = 5 + ***************************/ + tmp0=ylmcoef[25] * ylma[9]; + ylma[25] + = ylmcoef[24] * dr[2] * ylma[16] - tmp0; // l=5,m=0 + + tmp0 = ylmcoef[26] * dr[2]; + ylma[26] = tmp0 * ylma[17] - ylmcoef[27] * ylma[10]; // l=5,m=1 + ylma[27] = tmp0 * ylma[18] - ylmcoef[27] * ylma[11]; // l=5,m=-1 + + tmp0 = ylmcoef[28] * dr[2]; + ylma[28] = tmp0 * ylma[19] - ylmcoef[29] * ylma[12]; // l=5,m=2 + ylma[29] = tmp0 * ylma[20] - 
ylmcoef[29] * ylma[13]; // l=5,m=-2 + + tmp0 = ylmcoef[30] * dr[2]; + ylma[30] = tmp0 * ylma[21] - ylmcoef[31] * ylma[14]; // l=5,m=3 + ylma[31] = tmp0 * ylma[22] - ylmcoef[31] * ylma[15]; // l=5,m=-3 + + tmp0 = ylmcoef[32] * dr[2]; + ylma[32] = tmp0 * ylma[23]; // l=5,m=4 + ylma[33] = tmp0 * ylma[24]; // l=5,m=-4 + + tmp0 = ylmcoef[35] * dr[0]; + ylma[34] = ylmcoef[33] * ylma[30] - ylmcoef[34] * ylma[14] + - tmp0 * ylma[23]; // l=5,m=5 + ylma[35] = ylmcoef[33] * ylma[31] - ylmcoef[34] * ylma[15] + - tmp0 * ylma[24]; // l=5,m=-5 + if (nwl == 5) + return; + /* + // if nwl > 5 + for (int il = 6; il <= nwl; il++) + { + int istart = il * il; + int istart1 = (il - 1) * (il - 1); + int istart2 = (il - 2) * (il - 2); + + double fac2 = sqrt(4.0 * istart - 1.0); + double fac4 = sqrt(4.0 * istart1 - 1.0); + + for (int im = 0; im < 2 * il - 1; im++) + { + int imm = (im + 1) / 2; + ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (dr[2] + * ylma[istart1 + im] - sqrt((double)istart1 - imm * imm) / fac4 * + ylma[istart2 + im]); + } + + double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); + double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); + double bl3 = sqrt(2.0) / fac2; + + ylma[istart + 2 * il - 1] = (bl3 * ylma[istart + 2 * il - 5] - bl2 * + ylma[istart2 + 2 * il - 5] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 3]) / + bl1; ylma[istart + 2 * il] = (bl3 * ylma[istart + 2 * il - 4] - bl2 * + ylma[istart2 + 2 * il - 4] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 2]) / + bl1; + }*/ +} + +static __device__ void spherical_harmonics_d(const double* const dr, + double distance, + double (&grly)[49][3], + int nwl, + double (&ylma)[49], + const double* const ylmcoef) +{ + double tmp0; + double tx = 2.0 * dr[0]; + double ty = 2.0 * dr[1]; + double tz = 2.0 * dr[2]; + ylma[0] = ylmcoef[0]; // l=0, m=0 + grly[0][0] = grly[0][1] = grly[0][2] = 0.0; + if (nwl == 0) + return; + + /*************************** + L = 1 + ***************************/ + ylma[1] = ylmcoef[1] * dr[2]; // l=1, 
m=0 + grly[1][0] = grly[1][1] = 0.0; + grly[1][2] = ylmcoef[1]; + ylma[2] = -ylmcoef[1] * dr[0]; // l=1, m=1 + grly[2][1] = grly[2][2] = 0.0; + grly[2][0] = -ylmcoef[1]; + ylma[3] = -ylmcoef[1] * dr[1]; // l=1, m=-1 + grly[3][0] = grly[3][2] = 0.0; + grly[3][1] = -ylmcoef[1]; + if (nwl == 1) + return; + + /*************************** + L = 2 + ***************************/ + ylma[4] = ylmcoef[2] * dr[2] * ylma[1] + - ylmcoef[3] * ylma[0] * distance; // l=2, m=0 + grly[4][0] + = ylmcoef[2] * dr[2] * grly[1][0] + - ylmcoef[3] * (grly[0][0] * distance + ylma[0] * tx); // l=2, m=0 + grly[4][1] + = ylmcoef[2] * dr[2] * grly[1][1] + - ylmcoef[3] * (grly[0][1] * distance + ylma[0] * ty); // l=2, m=0 + grly[4][2] + = ylmcoef[2] * (dr[2] * grly[1][2] + ylma[1]) + - ylmcoef[3] * (grly[0][2] * distance + ylma[0] * tz); // l=2, m=0 + + tmp0 = ylmcoef[4] * dr[2]; + ylma[5] = tmp0 * ylma[2]; // l=2,m=1 + grly[5][0] = tmp0 * grly[2][0]; + grly[5][1] = tmp0 * grly[2][1]; + grly[5][2] = ylmcoef[4] * (ylma[2] + dr[2] * grly[2][2]); + + ylma[6] = tmp0 * ylma[3]; // l=2,m=-1 + grly[6][0] = tmp0 * grly[3][0]; + grly[6][1] = tmp0 * grly[3][1]; + grly[6][2] = ylmcoef[4] * (ylma[3] + dr[2] * grly[3][2]); + + tmp0 = ylmcoef[4] * dr[0]; + ylma[7] = ylmcoef[5] * ylma[4] - ylmcoef[6] * ylma[0] * distance + - tmp0 * ylma[2]; // l=2,m=2 + grly[7][0] = ylmcoef[5] * grly[4][0] + - ylmcoef[6] * (ylma[0] * tx + grly[0][0] * distance) + - ylmcoef[4] * (dr[0] * grly[2][0] + ylma[2]); + grly[7][1] = ylmcoef[5] * grly[4][1] + - ylmcoef[6] * (ylma[0] * ty + grly[0][1] * distance) + - tmp0 * grly[2][1]; + grly[7][2] = ylmcoef[5] * grly[4][2] + - ylmcoef[6] * (ylma[0] * tz + grly[0][2] * distance) + - tmp0 * grly[2][2]; + + ylma[8] = -tmp0 * ylma[3]; + grly[8][0] = -ylmcoef[4] * (ylma[3] + dr[0] * grly[3][0]); + grly[8][1] = -tmp0 * grly[3][1]; + grly[8][2] = -tmp0 * grly[3][2]; + if (nwl == 2) + return; + + /*************************** + L = 3 + ***************************/ + ylma[9] = ylmcoef[7] * dr[2] 
* ylma[4] + - ylmcoef[8] * ylma[1] * distance; // l=3, m=0 + grly[9][0] = ylmcoef[7] * dr[2] * grly[4][0] + - ylmcoef[8] * (ylma[1] * tx + grly[1][0] * distance); + grly[9][1] = ylmcoef[7] * dr[2] * grly[4][1] + - ylmcoef[8] * (ylma[1] * ty + grly[1][1] * distance); + grly[9][2] = ylmcoef[7] * (ylma[4] + dr[2] * grly[4][2]) + - ylmcoef[8] * (ylma[1] * tz + grly[1][2] * distance); + + tmp0 = ylmcoef[9] * dr[2]; + ylma[10] = tmp0 * ylma[5] - ylmcoef[10] * ylma[2] * distance; // l=3,m=1 + grly[10][0] = tmp0 * grly[5][0] + - ylmcoef[10] * (grly[2][0] * distance + ylma[2] * tx); + grly[10][1] = tmp0 * grly[5][1] + - ylmcoef[10] * (grly[2][1] * distance + ylma[2] * ty); + grly[10][2] = ylmcoef[9] * (dr[2] * grly[5][2] + ylma[5]) + - ylmcoef[10] * (grly[2][2] * distance + ylma[2] * tz); + + ylma[11] = tmp0 * ylma[6] - ylmcoef[10] * ylma[3] * distance; // l=3,m=-1 + grly[11][0] = tmp0 * grly[6][0] + - ylmcoef[10] * (grly[3][0] * distance + ylma[3] * tx); + grly[11][1] = tmp0 * grly[6][1] + - ylmcoef[10] * (grly[3][1] * distance + ylma[3] * ty); + grly[11][2] = ylmcoef[9] * (dr[2] * grly[6][2] + ylma[6]) + - ylmcoef[10] * (grly[3][2] * distance + ylma[3] * tz); + + tmp0 = ylmcoef[11] * dr[2]; + ylma[12] = tmp0 * ylma[7]; // l=3,m=2 + grly[12][0] = tmp0 * grly[7][0]; + grly[12][1] = tmp0 * grly[7][1]; + grly[12][2] = ylmcoef[11] * (dr[2] * grly[7][2] + ylma[7]); + + ylma[13] = tmp0 * ylma[8]; // l=3,m=-2 + grly[13][0] = tmp0 * grly[8][0]; + grly[13][1] = tmp0 * grly[8][1]; + grly[13][2] = ylmcoef[11] * (dr[2] * grly[8][2] + ylma[8]); + + tmp0 = ylmcoef[14] * dr[0]; + ylma[14] = ylmcoef[12] * ylma[10] - ylmcoef[13] * ylma[2] * distance + - tmp0 * ylma[7]; // l=3,m=3 + grly[14][0] = ylmcoef[12] * grly[10][0] + - ylmcoef[13] * (ylma[2] * tx + grly[2][0] * distance) + - ylmcoef[14] * (ylma[7] + dr[0] * grly[7][0]); + grly[14][1] = ylmcoef[12] * grly[10][1] + - ylmcoef[13] * (ylma[2] * ty + grly[2][1] * distance) + - tmp0 * grly[7][1]; + grly[14][2] = ylmcoef[12] * grly[10][2] + 
- ylmcoef[13] * (ylma[2] * tz + grly[2][2] * distance) + - tmp0 * grly[7][2]; + + ylma[15] = ylmcoef[12] * ylma[11] - ylmcoef[13] * ylma[3] * distance + - tmp0 * ylma[8]; // l=3,m=-3 + grly[15][0] = ylmcoef[12] * grly[11][0] + - ylmcoef[13] * (ylma[3] * tx + grly[3][0] * distance) + - ylmcoef[14] * (ylma[8] + dr[0] * grly[8][0]); + grly[15][1] = ylmcoef[12] * grly[11][1] + - ylmcoef[13] * (ylma[3] * ty + grly[3][1] * distance) + - tmp0 * grly[8][1]; + grly[15][2] = ylmcoef[12] * grly[11][2] + - ylmcoef[13] * (ylma[3] * tz + grly[3][2] * distance) + - tmp0 * grly[8][2]; + if (nwl == 3) + return; + + /*************************** + L = 4 + ***************************/ + ylma[16] = ylmcoef[15] * dr[2] * ylma[9] + - ylmcoef[16] * ylma[4] * distance; // l=4,m=0 + grly[16][0] = ylmcoef[15] * dr[2] * grly[9][0] + - ylmcoef[16] * (ylma[4] * tx + grly[4][0] * distance); + grly[16][1] = ylmcoef[15] * dr[2] * grly[9][1] + - ylmcoef[16] * (ylma[4] * ty + grly[4][1] * distance); + grly[16][2] = ylmcoef[15] * (dr[2] * grly[9][2] + ylma[9]) + - ylmcoef[16] * (ylma[4] * tz + grly[4][2] * distance); + + tmp0 = ylmcoef[17] * dr[2]; + ylma[17] = tmp0 * ylma[10] - ylmcoef[18] * ylma[5] * distance; // l=4,m=1 + grly[17][0] = tmp0 * grly[10][0] + - ylmcoef[18] * (ylma[5] * tx + grly[5][0] * distance); + grly[17][1] = tmp0 * grly[10][1] + - ylmcoef[18] * (ylma[5] * ty + grly[5][1] * distance); + grly[17][2] = ylmcoef[17] * (dr[2] * grly[10][2] + ylma[10]) + - ylmcoef[18] * (ylma[5] * tz + grly[5][2] * distance); + + ylma[18] = tmp0 * ylma[11] - ylmcoef[18] * ylma[6] * distance; // l=4,m=-1 + grly[18][0] = tmp0 * grly[11][0] + - ylmcoef[18] * (ylma[6] * tx + grly[6][0] * distance); + grly[18][1] = tmp0 * grly[11][1] + - ylmcoef[18] * (ylma[6] * ty + grly[6][1] * distance); + grly[18][2] = ylmcoef[17] * (dr[2] * grly[11][2] + ylma[11]) + - ylmcoef[18] * (ylma[6] * tz + grly[6][2] * distance); + + tmp0 = ylmcoef[19] * dr[2]; + ylma[19] = tmp0 * ylma[12] - ylmcoef[20] * ylma[7] * distance; // 
l=4,m=2 + grly[19][0] = tmp0 * grly[12][0] + - ylmcoef[20] * (ylma[7] * tx + grly[7][0] * distance); + grly[19][1] = tmp0 * grly[12][1] + - ylmcoef[20] * (ylma[7] * ty + grly[7][1] * distance); + grly[19][2] = ylmcoef[19] * (dr[2] * grly[12][2] + ylma[12]) + - ylmcoef[20] * (ylma[7] * tz + grly[7][2] * distance); + + ylma[20] = tmp0 * ylma[13] - ylmcoef[20] * ylma[8] * distance; // l=4,m=-2 + grly[20][0] = tmp0 * grly[13][0] + - ylmcoef[20] * (ylma[8] * tx + grly[8][0] * distance); + grly[20][1] = tmp0 * grly[13][1] + - ylmcoef[20] * (ylma[8] * ty + grly[8][1] * distance); + grly[20][2] = ylmcoef[19] * (dr[2] * grly[13][2] + ylma[13]) + - ylmcoef[20] * (ylma[8] * tz + grly[8][2] * distance); + + tmp0 = 3.0 * dr[2]; + ylma[21] = tmp0 * ylma[14]; // l=4,m=3 + grly[21][0] = tmp0 * grly[14][0]; + grly[21][1] = tmp0 * grly[14][1]; + grly[21][2] = 3.0 * (dr[2] * grly[14][2] + ylma[14]); + + ylma[22] = tmp0 * ylma[15]; // l=4,m=-3 + grly[22][0] = tmp0 * grly[15][0]; + grly[22][1] = tmp0 * grly[15][1]; + grly[22][2] = 3.0 * (dr[2] * grly[15][2] + ylma[15]); + + tmp0 = ylmcoef[23] * dr[0]; + ylma[23] = ylmcoef[21] * ylma[19] - ylmcoef[22] * ylma[7] * distance + - tmp0 * ylma[14]; // l=4,m=4 + grly[23][0] = ylmcoef[21] * grly[19][0] + - ylmcoef[22] * (ylma[7] * tx + grly[7][0] * distance) + - ylmcoef[23] * (dr[0] * grly[14][0] + ylma[14]); + grly[23][1] = ylmcoef[21] * grly[19][1] + - ylmcoef[22] * (ylma[7] * ty + grly[7][1] * distance) + - tmp0 * grly[14][1]; + grly[23][2] = ylmcoef[21] * grly[19][2] + - ylmcoef[22] * (ylma[7] * tz + grly[7][2] * distance) + - tmp0 * grly[14][2]; + + ylma[24] = ylmcoef[21] * ylma[20] - ylmcoef[22] * ylma[8] * distance + - tmp0 * ylma[15]; // l=4,m=-4 + grly[24][0] = ylmcoef[21] * grly[20][0] + - ylmcoef[22] * (ylma[8] * tx + grly[8][0] * distance) + - ylmcoef[23] * (dr[0] * grly[15][0] + ylma[15]); + grly[24][1] = ylmcoef[21] * grly[20][1] + - ylmcoef[22] * (ylma[8] * ty + grly[8][1] * distance) + - tmp0 * grly[15][1]; + grly[24][2] = 
ylmcoef[21] * grly[20][2] + - ylmcoef[22] * (ylma[8] * tz + grly[8][2] * distance) + - tmp0 * grly[15][2]; + if (nwl == 4) + return; + + /*************************** + L = 5 + ***************************/ + ylma[25] = ylmcoef[24] * dr[2] * ylma[16] + - ylmcoef[25] * ylma[9] * distance; // l=5,m=0 + grly[25][0] = ylmcoef[24] * dr[2] * grly[16][0] + - ylmcoef[25] * (ylma[9] * tx + grly[9][0] * distance); + grly[25][1] = ylmcoef[24] * dr[2] * grly[16][1] + - ylmcoef[25] * (ylma[9] * ty + grly[9][1] * distance); + grly[25][2] = ylmcoef[24] * (dr[2] * grly[16][2] + ylma[16]) + - ylmcoef[25] * (ylma[9] * tz + grly[9][2] * distance); + + tmp0 = ylmcoef[26] * dr[2]; + ylma[26] = tmp0 * ylma[17] - ylmcoef[27] * ylma[10] * distance; // l=5,m=1 + grly[26][0] = tmp0 * grly[17][0] + - ylmcoef[27] * (ylma[10] * tx + grly[10][0] * distance); + grly[26][1] = tmp0 * grly[17][1] + - ylmcoef[27] * (ylma[10] * ty + grly[10][1] * distance); + grly[26][2] = ylmcoef[26] * (dr[2] * grly[17][2] + ylma[17]) + - ylmcoef[27] * (ylma[10] * tz + grly[10][2] * distance); + + ylma[27] = tmp0 * ylma[18] - ylmcoef[27] * ylma[11] * distance; // l=5,m=-1 + grly[27][0] = tmp0 * grly[18][0] + - ylmcoef[27] * (ylma[11] * tx + grly[11][0] * distance); + grly[27][1] = tmp0 * grly[18][1] + - ylmcoef[27] * (ylma[11] * ty + grly[11][1] * distance); + grly[27][2] = ylmcoef[26] * (dr[2] * grly[18][2] + ylma[18]) + - ylmcoef[27] * (ylma[11] * tz + grly[11][2] * distance); + + tmp0 = ylmcoef[28] * dr[2]; + ylma[28] = tmp0 * ylma[19] - ylmcoef[29] * ylma[12] * distance; // l=5,m=2 + grly[28][0] = tmp0 * grly[19][0] + - ylmcoef[29] * (ylma[12] * tx + grly[12][0] * distance); + grly[28][1] = tmp0 * grly[19][1] + - ylmcoef[29] * (ylma[12] * ty + grly[12][1] * distance); + grly[28][2] = ylmcoef[28] * (dr[2] * grly[19][2] + ylma[19]) + - ylmcoef[29] * (ylma[12] * tz + grly[12][2] * distance); + + ylma[29] = tmp0 * ylma[20] - ylmcoef[29] * ylma[13] * distance; // l=5,m=-2 + grly[29][0] = tmp0 * grly[20][0] + - 
ylmcoef[29] * (ylma[13] * tx + grly[13][0] * distance); + grly[29][1] = tmp0 * grly[20][1] + - ylmcoef[29] * (ylma[13] * ty + grly[13][1] * distance); + grly[29][2] = ylmcoef[28] * (dr[2] * grly[20][2] + ylma[20]) + - ylmcoef[29] * (ylma[13] * tz + grly[13][2] * distance); + + tmp0 = ylmcoef[30] * dr[2]; + ylma[30] = tmp0 * ylma[21] - ylmcoef[31] * ylma[14] * distance; // l=5,m=3 + grly[30][0] = tmp0 * grly[21][0] + - ylmcoef[31] * (grly[14][0] * distance + ylma[14] * tx); + grly[30][1] = tmp0 * grly[21][1] + - ylmcoef[31] * (grly[14][1] * distance + ylma[14] * ty); + grly[30][2] = ylmcoef[30] * (dr[2] * grly[21][2] + ylma[21]) + - ylmcoef[31] * (ylma[14] * tz + grly[14][2] * distance); + + ylma[31] = tmp0 * ylma[22] - ylmcoef[31] * ylma[15] * distance; // l=5,m=-3 + grly[31][0] = tmp0 * grly[22][0] + - ylmcoef[31] * (grly[15][0] * distance + ylma[15] * tx); + grly[31][1] = tmp0 * grly[22][1] + - ylmcoef[31] * (grly[15][1] * distance + ylma[15] * ty); + grly[31][2] = ylmcoef[30] * (dr[2] * grly[22][2] + ylma[22]) + - ylmcoef[31] * (ylma[15] * tz + grly[15][2] * distance); + + tmp0 = ylmcoef[32] * dr[2]; + ylma[32] = tmp0 * ylma[23]; // l=5,m=4 + grly[32][0] = tmp0 * grly[23][0]; + grly[32][1] = tmp0 * grly[23][1]; + grly[32][2] = ylmcoef[32] * (ylma[23] + dr[2] * grly[23][2]); + + ylma[33] = tmp0 * ylma[24]; // l=5,m=-4 + grly[33][0] = tmp0 * grly[24][0]; + grly[33][1] = tmp0 * grly[24][1]; + grly[33][2] = ylmcoef[32] * (ylma[24] + dr[2] * grly[24][2]); + + tmp0 = ylmcoef[35] * dr[0]; + ylma[34] = ylmcoef[33] * ylma[30] - ylmcoef[34] * ylma[14] * distance + - tmp0 * ylma[23]; // l=5,m=5 + grly[34][0] = ylmcoef[33] * grly[30][0] + - ylmcoef[34] * (ylma[14] * tx + grly[14][0] * distance) + - ylmcoef[35] * (dr[0] * grly[23][0] + ylma[23]); + grly[34][1] = ylmcoef[33] * grly[30][1] + - ylmcoef[34] * (ylma[14] * ty + grly[14][1] * distance) + - tmp0 * grly[23][1]; + grly[34][2] = ylmcoef[33] * grly[30][2] + - ylmcoef[34] * (ylma[14] * tz + grly[14][2] * distance) + - 
tmp0 * grly[23][2]; + + ylma[35] = ylmcoef[33] * ylma[31] - ylmcoef[34] * ylma[15] * distance + - tmp0 * ylma[24]; // l=5,m=-5 + grly[35][0] = ylmcoef[33] * grly[31][0] + - ylmcoef[34] * (ylma[15] * tx + grly[15][0] * distance) + - ylmcoef[35] * (dr[0] * grly[24][0] + ylma[24]); + grly[35][1] = ylmcoef[33] * grly[31][1] + - ylmcoef[34] * (ylma[15] * ty + grly[15][1] * distance) + - tmp0 * grly[24][1]; + grly[35][2] = ylmcoef[33] * grly[31][2] + - ylmcoef[34] * (ylma[15] * tz + grly[15][2] * distance) + - tmp0 * grly[24][2]; + + if (nwl == 5) + return; + /* + // if nwl > 5 + for (int il = 6; il <= nwl; il++) + { + int istart = il * il; + int istart1 = (il - 1) * (il - 1); + int istart2 = (il - 2) * (il - 2); + + double fac2 = sqrt(4.0 * istart - 1.0); + double fac4 = sqrt(4.0 * istart1 - 1.0); + + for (int im = 0; im < 2 * il - 1; im++) + { + int imm = (im + 1) / 2; + ylma[istart + im] = fac2 / sqrt((double)istart - imm * imm) * (dr[2] + * ylma[istart1 + im] - sqrt((double)istart1 - imm * imm) / fac4 * + ylma[istart2 + im]); + } + + double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); + double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); + double bl3 = sqrt(2.0) / fac2; + + ylma[istart + 2 * il - 1] = (bl3 * ylma[istart + 2 * il - 5] - bl2 * + ylma[istart2 + 2 * il - 5] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 3]) / + bl1; ylma[istart + 2 * il] = (bl3 * ylma[istart + 2 * il - 4] - bl2 * + ylma[istart2 + 2 * il - 4] - 2.0 * dr[0] * ylma[istart1 + 2 * il - 2]) / + bl1; + }*/ +} + +} // namespace GintKernel + +#endif \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cu new file mode 100644 index 0000000000..ed710edf5a --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cu @@ -0,0 +1,659 @@ +#include + +#include "cuda_tools.cuh" +#include "module_base/blas_connector.h" +#include 
"module_hamilt_pw/hamilt_pwdft/global.h" +#include "vbatch_matrix_mul.cuh" + +#define sA(i, j) sA[(j)*slda + (i)] +#define sB(i, j) sB[(j)*sldb + (i)] +#define fetch(A, m, n, bound) offs_d##A[min(n * LD##A + m, bound)] + +template +static __device__ void vbatched_gemm_device(int M, + int N, + int K, + T* __restrict__ A, + int LDA, + T* __restrict__ B, + int LDB, + T* __restrict__ C, + int LDC, + T* sA, + int slda, + T* sB, + int sldb, + T alpha) +{ + int idx = threadIdx.x; // thread's m dimension + int idy = threadIdx.y; // thread's n dimension + + int idt = DIM_X * idy + idx; // thread's global number + + int idxA = idt % DIM_XA; // idx within A + int idyA = idt / DIM_XA; // idy within A + + int idxB = idt % DIM_XB; // idx within B + int idyB = idt / DIM_XB; // idy within B + + int blx = blockIdx.x; // block's m dimension + int bly = blockIdx.y; // block's n dimension + + // Registers for the innermost loop + T rC[THR_N][THR_M]; + T rA[THR_M]; + T rB[THR_N]; + + // Registers for the dev->shmem copy + T ra[BLK_M / DIM_YA][BLK_K / DIM_XA]; + T rb[BLK_N / DIM_YB][BLK_K / DIM_XB]; + + // bound is the correction to offs_d in order to not get out of memory bound + // so bound could be negative value since offs_d could be out of bound + T* offs_dA = A + blx * BLK_M * LDA + idyA * LDA + idxA; + int boundA + = (LDA * (M - 1) + K) - (blx * BLK_M * LDA + idyA * LDA + idxA) - 1; + + T* offs_dB = B + bly * BLK_N * LDB + idyB * LDB + idxB; + int boundB + = (LDB * (N - 1) + K) - (bly * BLK_N * LDB + idyB * LDB + idxB) - 1; + + int m, n, k, kk; + +// Zero C +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] = 0.0; + } + } + +// Load A dev->shmem +#pragma unroll + for (n = 0; n < BLK_M; n += DIM_YA) + { +#pragma unroll + for (m = 0; m < BLK_K; m += DIM_XA) + { + sA(n + idyA, m + idxA) = fetch(A, m, n, boundA); + } + } + +#pragma unroll + for (n = 0; n < BLK_N; n += DIM_YB) + { +#pragma unroll + for (m = 0; m < BLK_K; m += 
DIM_XB) + { + sB(m + idxB, n + idyB) = fetch(B, m, n, boundB); + } + } + + __syncthreads(); + + for (kk = 0; kk < K - BLK_K; kk += BLK_K) + { + offs_dA += BLK_K; + boundA -= BLK_K; + + offs_dB += BLK_K; + boundB -= BLK_K; + +// Load A dev->regs +#pragma unroll + for (n = 0; n < BLK_M / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_K / DIM_XA; m++) + { + ra[n][m] = fetch(A, m * DIM_XA, n * DIM_YA, boundA); + } + } + +// Load B dev->regs +#pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + rb[n][m] = fetch(B, m * DIM_XB, n * DIM_YB, boundB); + } + } + +// Multiply +#pragma unroll + for (k = 0; k < BLK_K; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + + __syncthreads(); + +// Load A regs->shmem +#pragma unroll + for (n = 0; n < BLK_M / DIM_YA; n++) + { +#pragma unroll + for (m = 0; m < BLK_K / DIM_XA; m++) + { + sA(n * DIM_YA + idyA, m * DIM_XA + idxA) = ra[n][m]; + } + } + +// Load B regs->shmem +#pragma unroll + for (n = 0; n < BLK_N / DIM_YB; n++) + { +#pragma unroll + for (m = 0; m < BLK_K / DIM_XB; m++) + { + sB(m * DIM_XB + idxB, n * DIM_YB + idyB) = rb[n][m]; + } + } + __syncthreads(); + } + + // Multiply last full (BLK_K) or partial block of + // columns of op(A) and rows of op(B). + // It's okay that m,n exceed matrix bounds as all work is in registers + // or shared memory, and out-of-bounds rC[n][m] will not be saved later. 
+ kk = K - kk; +#pragma unroll + for (k = 0; k < kk; k++) + { +// Load A shmem->regs +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rA[m] = sA(m * DIM_X + idx, k); + } + +// Load B shmem->regs +#pragma unroll + for (n = 0; n < THR_N; n++) + { + rB[n] = sB(k, n * DIM_Y + idy); + } + +// Compute +#pragma unroll + for (n = 0; n < THR_N; n++) + { +#pragma unroll + for (m = 0; m < THR_M; m++) + { + rC[n][m] += rA[m] * rB[n]; + } + } + } + +// Store C regs->dev +#pragma unroll + for (n = 0; n < THR_N; n++) + { + int coord_dCn = bly * BLK_N + n * DIM_Y + idy; +#pragma unroll + for (m = 0; m < THR_M; m++) + { + int coord_dCm = blx * BLK_M + m * DIM_X + idx; + if (coord_dCm < M && coord_dCn < N) + { + int offsC = coord_dCn * LDC + coord_dCm; + + atomicAdd(C + offsC, rC[n][m] * alpha); + } + } + } +} + +/******************************************************************************/ +/** + * Batched GEMM kernel. blockIdx.z selects the batch entry, and + * (blockIdx.x, blockIdx.y) select one BLK_M x BLK_N tile of that entry. + * Blocks whose tile lies outside the entry's local M x N extent return + * before doing any work. + * Dynamic shared memory holds the A tile ((BLK_M + 1) * BLK_K elements of T) + * immediately followed by the B tile. + * alpha is an optional array of per-batch scaling factors; a factor of 1 is + * used when it is nullptr. + */ +template +static __global__ void vbatched_gemm_kernel(int* M, + int* N, + int* K, + T** global_A_array, + int* global_lda, + T** global_B_array, + int* global_ldb, + T** global_C_array, + int* global_ldc, + T* alpha) +{ + extern __shared__ __align__(sizeof(T)) unsigned char smem[]; + T* shared_mem = reinterpret_cast(smem); + + int batchid = blockIdx.z; + int local_M = (int)M[batchid]; + int local_N = (int)N[batchid]; + int local_K = (int)K[batchid]; + + // Both tests depend only on blockIdx, so a whole block exits together. + if (blockIdx.x >= (local_M + BLK_M - 1) / BLK_M) + return; + if (blockIdx.y >= (local_N + BLK_N - 1) / BLK_N) + return; + + int shared_lda = BLK_M + 1; + int shared_ldb = BLK_K + 1; + T* shared_A = (T*)shared_mem; + T* shared_B = shared_A + shared_lda * BLK_K; + // Keep the scaling factor in T, the type of the alpha array and of the + // alpha parameter of vbatched_gemm_device. + T alpha_tmp = static_cast<T>(1.0); + if (alpha != nullptr) + { + alpha_tmp = alpha[batchid]; + } + vbatched_gemm_device(local_M, + local_N, + local_K, + global_A_array[batchid], + (int)global_lda[batchid], + global_B_array[batchid], + (int)global_ldb[batchid], + global_C_array[batchid], + (int)global_ldc[batchid], + shared_A, + shared_lda, + shared_B, + shared_ldb, + alpha_tmp); +} + +static 
inline int ceildiv(int x, int y) +{ + return (x + y - 1) / y; +} + +template +void vbatched_gemm_impl(int max_m, + int max_n, + int* m, + int* n, + int* k, + T** global_A_array, + int* global_lda, + T** global_B_array, + int* global_ldb, + T** global_C_array, + int* global_ldc, + int batchCount, + cudaStream_t stream, + T* alpha) +{ + // The positions of A and B have been swapped here. + // This is because the original code is for column-major matrices. + // We use row-major matrices, so we need to swap A and B. + // The vbatched_gemm_impl is for C = trans(A) * B + C, but we need trans(C). + // Which means: trans(C) = trans(trans(A)*B + C) = trans(B) * A + trans(C) + // Then, ldc should be N, lda and ldb should be K + + size_t shared_mem_size = 0; + shared_mem_size += (BLK_M + 1) * BLK_K * sizeof(T); + shared_mem_size += (BLK_K + 1) * BLK_N * sizeof(T); + dim3 dimBlock(DIM_X, DIM_Y); + const int max_batch_count = 32768; + const int loop_num = batchCount / max_batch_count; + const int remain_num = batchCount % max_batch_count; + + for (int i = 0; i < loop_num; ++i) + { + dim3 dimGrid(ceildiv(max_n, BLK_M), + ceildiv(max_m, BLK_N), + max_batch_count); + T* alpha_tmp = nullptr; + if (alpha != nullptr) + { + alpha_tmp = alpha + i * max_batch_count; + } + + vbatched_gemm_kernel + <<>>( + n + i * max_batch_count, + m + i * max_batch_count, + k + i * max_batch_count, + global_B_array + i * max_batch_count, + global_ldb + i * max_batch_count, + global_A_array + i * max_batch_count, + global_lda + i * max_batch_count, + global_C_array + i * max_batch_count, + global_ldc + i * max_batch_count, + alpha_tmp); + checkCudaLastError(); + } + if (remain_num > 0) + { + dim3 dimGrid(ceildiv(max_n, BLK_M), ceildiv(max_m, BLK_N), remain_num); + T* alpha_tmp = nullptr; + if (alpha != nullptr) + { + alpha_tmp = alpha + loop_num * max_batch_count; + } + vbatched_gemm_kernel + <<>>( + n + loop_num * max_batch_count, + m + loop_num * max_batch_count, + k + loop_num * max_batch_count, + 
global_B_array + loop_num * max_batch_count, + global_ldb + loop_num * max_batch_count, + global_A_array + loop_num * max_batch_count, + global_lda + loop_num * max_batch_count, + global_C_array + loop_num * max_batch_count, + global_ldc + loop_num * max_batch_count, + alpha_tmp); + checkCudaLastError(); + } +} + +/** + * Times one vbatched_gemm_impl instantiation on the given batch, copies the + * result back to the host and compares it with cpu_result. + * When the run is faster than fast_time, reported no CUDA error and matches + * the reference, fast_time and fastest_algo are updated to this instantiation. + * d_global_C is zeroed before the run; d_global_C and h_global_C are both + * accessed as batchCount * max_m * max_n doubles. + */ +template +void gemm_time_measure(int max_m, + int max_n, + int* m, + int* n, + int* k, + T** global_A_array, + int* global_lda, + T** global_B_array, + int* global_ldb, + T** global_C_array, + int* global_ldc, + int batchCount, + cudaStream_t stream, + float& fast_time, + matrix_multiple_func_type& fastest_algo, + double* cpu_result, + double* h_global_C, + double* d_global_C) +{ + cudaEvent_t start, stop; + checkCuda( + cudaMemset(d_global_C, 0, batchCount * max_m * max_n * sizeof(double))); + checkCuda(cudaEventCreate(&start)); + checkCuda(cudaEventCreate(&stop)); + checkCuda(cudaEventRecord(start, stream)); + vbatched_gemm_impl(max_m, + max_n, + m, + n, + k, + global_A_array, + global_lda, + global_B_array, + global_ldb, + global_C_array, + global_ldc, + batchCount, + stream); + checkCuda(cudaEventRecord(stop, stream)); + cudaError_t cuda_status = cudaGetLastError(); + checkCuda(cudaStreamSynchronize(stream)); + float milliseconds = 0; + checkCuda(cudaEventElapsedTime(&milliseconds, start, stop)); + // The events are only needed for this one measurement; release them so + // that repeated calls do not leak event handles. + checkCuda(cudaEventDestroy(start)); + checkCuda(cudaEventDestroy(stop)); + + // WARNING !!!!! Here we assume that all m and n are the same + checkCuda(cudaMemcpy(h_global_C, + d_global_C, + batchCount * max_m * max_n * sizeof(double), + cudaMemcpyDeviceToHost)); + bool check_result = true; + for (int i = 0; i < batchCount * max_m * max_n; ++i) + { + // Written as !(diff <= tol) so that a NaN result is rejected too. + if (!(abs(cpu_result[i] - h_global_C[i]) <= 0.001)) + { + check_result = false; + break; + } + } + if (milliseconds < fast_time && cuda_status == cudaSuccess && check_result) + { + fast_time = milliseconds; + fastest_algo = vbatched_gemm_impl; +#ifdef __DEBUG + std::cout << "found! 
fastest time: " << fast_time << std::endl; + std::cout << DIM_X << "," << DIM_Y << "," << BLK_M << "," << BLK_N + << "," << BLK_K << "," << DIM_XA << "," << DIM_YA << "," + << DIM_XB << "," << DIM_YB << std::endl; +#endif + } +} + +/* + * Here we have utilized a very straightforward and brute-force method to select + * the optimal matrix multiplication kernel for a given scale of computation: we + * compute with all scales of kernels under the current computational task to + * find the fastest parameter combination. This approach can lead to an increase + * in compilation time (TODO: so in the future, it will be necessary to split + * this large section of code into multiple files, multiple compilation units). + */ +void gemm_algo_selector(int matrix_k, matrix_multiple_func_type& fastest_algo) +{ + int batchCount_per_type = 32; + int batchCount + = batchCount_per_type * GlobalC::ucell.ntype * GlobalC::ucell.ntype; + + Cuda_Mem_Wrapper m(batchCount); + Cuda_Mem_Wrapper n(batchCount); + Cuda_Mem_Wrapper k(batchCount); + + int max_m = GlobalC::ucell.nwmax, max_n = GlobalC::ucell.nwmax; + + Cuda_Mem_Wrapper A(batchCount * max_m * matrix_k); + Cuda_Mem_Wrapper B(batchCount * max_n * matrix_k); + Cuda_Mem_Wrapper C(batchCount * max_m * max_n); + + Cuda_Mem_Wrapper lda(batchCount); + Cuda_Mem_Wrapper ldb(batchCount); + Cuda_Mem_Wrapper ldc(batchCount); + + Cuda_Mem_Wrapper A_array(batchCount); + Cuda_Mem_Wrapper B_array(batchCount); + Cuda_Mem_Wrapper C_array(batchCount); + + for (int i = 0; i < batchCount * max_m * matrix_k; ++i) + { + A.get_host_pointer()[i] = i * 0.001; + } + for (int i = 0; i < batchCount * max_n * matrix_k; ++i) + { + B.get_host_pointer()[i] = i * 0.002; + } + + double* cpu_result = new double[batchCount * max_m * max_n]; + memset(cpu_result, 0, batchCount * max_m * max_n * sizeof(double)); + int index = 0; + for (int i = 0; i < batchCount_per_type; ++i) + { + for (int j = 0; j < GlobalC::ucell.ntype; j++) + { + for (int l = 0; l < 
GlobalC::ucell.ntype; l++) + { + m.get_host_pointer()[index] = GlobalC::ucell.atoms[j].nw; + n.get_host_pointer()[index] = GlobalC::ucell.atoms[l].nw; + k.get_host_pointer()[index] = matrix_k; + + lda.get_host_pointer()[index] = matrix_k; + ldb.get_host_pointer()[index] = matrix_k; + ldc.get_host_pointer()[index] = GlobalC::ucell.atoms[l].nw; + + A_array.get_host_pointer()[index] + = &A.get_device_pointer()[index * max_m * matrix_k]; + B_array.get_host_pointer()[index] + = &B.get_device_pointer()[index * max_n * matrix_k]; + C_array.get_host_pointer()[index] + = &C.get_device_pointer()[index * max_n + * max_m]; // test atom add + BlasConnector::gemm( + 'N', + 'T', + m.get_host_pointer()[index], + n.get_host_pointer()[index], + matrix_k, + 1.0, + &A.get_host_pointer()[index * max_m * matrix_k], + matrix_k, + &B.get_host_pointer()[index * max_n * matrix_k], + matrix_k, + 1.0, + &cpu_result[index * max_m * max_n], + n.get_host_pointer()[index]); + index++; + } + } + } + + m.copy_host_to_device_sync(); + n.copy_host_to_device_sync(); + k.copy_host_to_device_sync(); + + lda.copy_host_to_device_sync(); + ldb.copy_host_to_device_sync(); + ldc.copy_host_to_device_sync(); + + A.copy_host_to_device_sync(); + B.copy_host_to_device_sync(); + A_array.copy_host_to_device_sync(); + B_array.copy_host_to_device_sync(); + C_array.copy_host_to_device_sync(); + + cudaStream_t temp_stream; + checkCuda(cudaStreamCreate(&temp_stream)); + + float fastest_time = 1000000; + fastest_algo = vbatched_gemm_impl; + + int* d_m = m.get_device_pointer(); + int* d_n = n.get_device_pointer(); + int* d_k = k.get_device_pointer(); + + double** d_global_A_array = A_array.get_device_pointer(); + double** d_global_B_array = B_array.get_device_pointer(); + double** d_global_C_array = C_array.get_device_pointer(); + + double* h_global_C = C.get_host_pointer(); + double* d_global_C = C.get_device_pointer(); + + int* d_global_lda = lda.get_device_pointer(); + int* d_global_ldb = ldb.get_device_pointer(); + 
int* d_global_ldc = ldc.get_device_pointer(); + +/* + * Please do not manually modify the code in the following file; + * it should simply be generated through a loop using a short Python program. + */ +#include "code_gen.cpp" + checkCuda(cudaStreamDestroy(temp_stream)); + std::cout << " gemm_algo_selector::Fastest time: " << fastest_time << " ms" + << std::endl; + // fastest_algo = vbatched_gemm_impl; + delete[] cpu_result; +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh new file mode 100644 index 0000000000..3972918675 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/vbatch_matrix_mul.cuh @@ -0,0 +1,115 @@ +#ifndef VBATCH_MATRIX_MUL_H +#define VBATCH_MATRIX_MUL_H +#include // for assert +#include +#include // for CUDA_VERSION +#include +#include // for fprintf and stderr + +#include + +/** + * Performs a batched matrix multiplication using the vbatched_gemm_impl + * function. + * + * C = alpha * A * B + C + * @tparam T The data type of the matrices. + * @tparam DIM_X The number of threads in the x-dimension of each block. + * @tparam DIM_Y The number of threads in the y-dimension of each block. + * @tparam BLK_M The number of rows processed by each thread block. + * @tparam BLK_N The number of columns processed by each thread block. + * @tparam BLK_K The number of elements processed by each thread block along the + * K dimension. + * @tparam DIM_XA The number of threads in the x-dimension used for loading + * matrix A. + * @tparam DIM_YA The number of threads in the y-dimension used for loading + * matrix A. + * @tparam DIM_XB The number of threads in the x-dimension used for loading + * matrix B. + * @tparam DIM_YB The number of threads in the y-dimension used for loading + * matrix B. + * @param max_m The maximum number of rows in the matrices. 
+ * @param max_n The maximum number of columns in the matrices. + * @param m An array with one entry per matrix in the batch: the number of rows + * of that matrix. + * @param n An array with one entry per matrix in the batch: the number of + * columns of that matrix. + * @param k An array with one entry per matrix in the batch: the number of + * elements of that matrix along the K dimension. + * @param global_A_array An array of pointers to the input matrices A. + * @param global_lda An array of leading dimensions for the input matrices A. + * @param global_B_array An array of pointers to the input matrices B. + * @param global_ldb An array of leading dimensions for the input matrices B. + * @param global_C_array An array of pointers to the output matrices C. + * @param global_ldc An array of leading dimensions for the output matrices C. + * @param batchCount The number of matrices in the batch. + * @param stream The CUDA stream to use for the computation. + * @param alpha An optional array of scaling factors, one per matrix in the + * batch (default is nullptr). + * + * Note: this comment was generated by Copilot. + */ + +/* + * Why do we need to implement our own matrix multiplication based on the MAGMA + * code? There are two main reasons. First, in batched matrix multiplication we + * need to accumulate the results of several multiplications, so the same + * memory address of matrix C has to be passed to different multiplications. + * This way, the accumulation can be done directly through atomic operations + * during the matrix multiplication, avoiding a reduction step after the + * multiplication. Second, when calculating the charge density, where + * C = alpha * A * B + C, the value of alpha may differ between matrices of + * the same batch. Using the standard matrix multiplication interface would + * require breaking the batched multiplication down into smaller batches, and + * in practice it is then difficult to accumulate a batch of matrices. 
+ * + * Moreover, taking into account the specific requirements of our application, + * especially the fact that we can relatively easily control the arrangement of + * the matrix elements, we have only implemented one type of requirement for + * matrix transposition. That is, we have implemented the operation C = alpha * + * trans(A) * B + C under the constraint of column-major order. + * + * Finally, we would like to thank Magma for its contributions to the field of + * scientific computing. + */ + +template +void vbatched_gemm_impl(int max_m, + int max_n, + int* m, + int* n, + int* k, + T** global_A_array, + int* global_lda, + T** global_B_array, + int* global_ldb, + T** global_C_array, + int* global_ldc, + int batchCount, + cudaStream_t stream, + T* alpha = nullptr); + +typedef std::function + matrix_multiple_func_type; + +void gemm_algo_selector(int k, matrix_multiple_func_type& func); +#endif // VBATCH_MATRIX_MUL_H \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/test/CMakeLists.txt b/source/module_hamilt_lcao/module_gint/test/CMakeLists.txt new file mode 100644 index 0000000000..386f904f67 --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/test/CMakeLists.txt @@ -0,0 +1,7 @@ +if(ENABLE_LCAO AND USE_CUDA) + AddTest( + TARGET gint_gpu_test + LIBS ${math_libs} psi base device + SOURCES test_sph.cu test_sph.cpp +) +endif() \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/test/test_sph.cpp b/source/module_hamilt_lcao/module_gint/test/test_sph.cpp new file mode 100644 index 0000000000..167a4ab30b --- /dev/null +++ b/source/module_hamilt_lcao/module_gint/test/test_sph.cpp @@ -0,0 +1,600 @@ +#include "test_sph.h" +using namespace std; + +void sph_harm(const int& Lmax, // max momentum of l + const double& xdr, + const double& ydr, + const double& zdr, + std::vector& rly, + double* ylmcoef) +{ + + // begin calculation + /*************************** + L = 0 + ***************************/ + rly[0] = 
ylmcoef[0]; // l=0, m=0 + if (Lmax == 0) + return; + + /*************************** + L = 1 + ***************************/ + rly[1] = ylmcoef[1] * zdr; // l=1, m=0 + rly[2] = -ylmcoef[1] * xdr; // l=1, m=1 + rly[3] = -ylmcoef[1] * ydr; // l=1, m=-1 + if (Lmax == 1) + return; + + /*************************** + L = 2 + ***************************/ + double tmp0 = ylmcoef[3] * rly[0]; + rly[4] = ylmcoef[2] * zdr * rly[1] - tmp0; // l=2, m=0 + + tmp0 = ylmcoef[4] * zdr; + rly[5] = tmp0 * rly[2]; // l=2,m=1 + rly[6] = tmp0 * rly[3]; // l=2,m=-1 + + double tmp2 = ylmcoef[4] * xdr; + rly[7] + = ylmcoef[5] * rly[4] - ylmcoef[6] * rly[0] - tmp2 * rly[2]; // l=2,m=2 + rly[8] = -tmp2 * rly[3]; + // rly[8] = tmp1+tmp2*rly[3];//l=2,m=-2 + if (Lmax == 2) + return; + + /*************************** + L = 3 + ***************************/ + tmp0 = ylmcoef[8] * rly[1]; + rly[9] = ylmcoef[7] * zdr * rly[4] - tmp0; // l=3, m=0 + + double tmp3 = ylmcoef[9] * zdr; + rly[10] = tmp3 * rly[5] - ylmcoef[10] * rly[2]; // l=3,m=1 + rly[11] = tmp3 * rly[6] - ylmcoef[10] * rly[3]; // l=3,m=-1 + + double tmp4 = ylmcoef[11] * zdr; + rly[12] = tmp4 * rly[7]; // l=3,m=2 + rly[13] = tmp4 * rly[8]; // l=3,m=-2 + + double tmp5 = ylmcoef[14] * xdr; + rly[14] = ylmcoef[12] * rly[10] - ylmcoef[13] * rly[2] + - tmp5 * rly[7]; // l=3,m=3 + rly[15] = ylmcoef[12] * rly[11] - ylmcoef[13] * rly[3] + - tmp5 * rly[8]; // l=3,m=-3 + if (Lmax == 3) + return; + + /*************************** + L = 4 + ***************************/ + tmp0 = ylmcoef[16] * rly[4]; + rly[16] = ylmcoef[15] * zdr * rly[9] - tmp0; // l=4,m=0 + + double tmp6 = ylmcoef[17] * zdr; + rly[17] = tmp6 * rly[10] - ylmcoef[18] * rly[5]; // l=4,m=1 + rly[18] = tmp6 * rly[11] - ylmcoef[18] * rly[6]; // l=4,m=-1 + + double tmp7 = ylmcoef[19] * zdr; + rly[19] = tmp7 * rly[12] - ylmcoef[20] * rly[7]; // l=4,m=2 + rly[20] = tmp7 * rly[13] - ylmcoef[20] * rly[8]; // l=4,m=-2 + + double tmp8 = 3.0 * zdr; + rly[21] = tmp8 * rly[14]; // l=4,m=3 + rly[22] = 
tmp8 * rly[15]; // l=4,m=-3 + + double tmp9 = ylmcoef[23] * xdr; + rly[23] = ylmcoef[21] * rly[19] - ylmcoef[22] * rly[7] + - tmp9 * rly[14]; // l=4,m=4 + rly[24] = ylmcoef[21] * rly[20] - ylmcoef[22] * rly[8] + - tmp9 * rly[15]; // l=4,m=-4 + if (Lmax == 4) + return; + + /*************************** + L = 5 + ***************************/ + tmp0 = ylmcoef[25] * rly[9]; + rly[25] = ylmcoef[24] * zdr * rly[16] - tmp0; // l=5,m=0 + + double tmp10 = ylmcoef[26] * zdr; + rly[26] = tmp10 * rly[17] - ylmcoef[27] * rly[10]; // l=5,m=1 + rly[27] = tmp10 * rly[18] - ylmcoef[27] * rly[11]; // l=5,m=-1 + + double tmp11 = ylmcoef[28] * zdr; + rly[28] = tmp11 * rly[19] - ylmcoef[29] * rly[12]; // l=5,m=2 + rly[29] = tmp11 * rly[20] - ylmcoef[29] * rly[13]; // l=5,m=-2 + + double tmp12 = ylmcoef[30] * zdr; + rly[30] = tmp12 * rly[21] - ylmcoef[31] * rly[14]; // l=5,m=3 + rly[31] = tmp12 * rly[22] - ylmcoef[31] * rly[15]; // l=5,m=-3 + + double tmp13 = ylmcoef[32] * zdr; + rly[32] = tmp13 * rly[23]; // l=5,m=4 + rly[33] = tmp13 * rly[24]; // l=5,m=-4 + + double tmp14 = ylmcoef[35] * xdr; + rly[34] = ylmcoef[33] * rly[30] - ylmcoef[34] * rly[14] + - tmp14 * rly[23]; // l=5,m=5 + rly[35] = ylmcoef[33] * rly[31] - ylmcoef[34] * rly[15] + - tmp14 * rly[24]; // l=5,m=-5 + if (Lmax == 5) + return; + + // if Lmax > 5 + for (int il = 6; il <= Lmax; il++) + { + int istart = il * il; + int istart1 = (il - 1) * (il - 1); + int istart2 = (il - 2) * (il - 2); + + double fac2 = sqrt(4.0 * istart - 1.0); + double fac4 = sqrt(4.0 * istart1 - 1.0); + + for (int im = 0; im < 2 * il - 1; im++) + { + int imm = (im + 1) / 2; + // if (im % 2 == 0) imm *= -1; + + rly[istart + im] = fac2 / sqrt((double)istart - imm * imm) + * (zdr * rly[istart1 + im] + - sqrt((double)istart1 - imm * imm) / fac4 + * rly[istart2 + im]); + } + + double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); + double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); + double bl3 = sqrt(2.0) / fac2; + + rly[istart + 2 * il - 1] + = (bl3 * 
rly[istart + 2 * il - 5] - bl2 * rly[istart2 + 2 * il - 5] + - 2.0 * xdr * rly[istart1 + 2 * il - 3]) + / bl1; + rly[istart + 2 * il] + = (bl3 * rly[istart + 2 * il - 4] - bl2 * rly[istart2 + 2 * il - 4] + - 2.0 * xdr * rly[istart1 + 2 * il - 2]) + / bl1; + } + + return; +} +void grad_rl_sph_harm(const int& Lmax, // max momentum of L + const double& x, + const double& y, + const double& z, + std::vector& rly, + std::vector>& grly, + const double* ylmcoef) +{ + rly.resize((Lmax + 1) * (Lmax + 1)); + grly.resize((Lmax + 1) * (Lmax + 1), std::vector(3)); + + double radius2 = x * x + y * y + z * z; + double tx = 2.0 * x; + double ty = 2.0 * y; + double tz = 2.0 * z; + + // begin calculation + /*************************** + L = 0 + ***************************/ + rly[0] = ylmcoef[0]; // l=0, m=0 + grly[0][0] = grly[0][1] = grly[0][2] = 0.0; + if (Lmax == 0) + return; + + /*************************** + L = 1 + ***************************/ + rly[1] = ylmcoef[1] * z; // l=1, m=0 + grly[1][0] = grly[1][1] = 0.0; + grly[1][2] = ylmcoef[1]; + + rly[2] = -ylmcoef[1] * x; // l=1, m=1 + grly[2][1] = grly[2][2] = 0.0; + grly[2][0] = -ylmcoef[1]; + + rly[3] = -ylmcoef[1] * y; // l=1, m=-1 + grly[3][0] = grly[3][2] = 0.0; + grly[3][1] = -ylmcoef[1]; + + if (Lmax == 1) + return; + + /*************************** + L = 2 + ***************************/ + rly[4] + = ylmcoef[2] * z * rly[1] - ylmcoef[3] * rly[0] * radius2; // l=2, m=0 + grly[4][0] + = ylmcoef[2] * z * grly[1][0] + - ylmcoef[3] * (grly[0][0] * radius2 + rly[0] * tx); // l=2, m=0 + grly[4][1] + = ylmcoef[2] * z * grly[1][1] + - ylmcoef[3] * (grly[0][1] * radius2 + rly[0] * ty); // l=2, m=0 + grly[4][2] + = ylmcoef[2] * (z * grly[1][2] + rly[1]) + - ylmcoef[3] * (grly[0][2] * radius2 + rly[0] * tz); // l=2, m=0 + + double tmp0 = ylmcoef[4] * z; + rly[5] = tmp0 * rly[2]; // l=2,m=1 + grly[5][0] = tmp0 * grly[2][0]; + grly[5][1] = tmp0 * grly[2][1]; + grly[5][2] = ylmcoef[4] * (rly[2] + z * grly[2][2]); + + rly[6] = tmp0 * 
rly[3]; // l=2,m=-1 + grly[6][0] = tmp0 * grly[3][0]; + grly[6][1] = tmp0 * grly[3][1]; + grly[6][2] = ylmcoef[4] * (rly[3] + z * grly[3][2]); + + double tmp2 = ylmcoef[4] * x; + rly[7] = ylmcoef[5] * rly[4] - ylmcoef[6] * rly[0] * radius2 + - tmp2 * rly[2]; // l=2,m=2 + grly[7][0] = ylmcoef[5] * grly[4][0] + - ylmcoef[6] * (rly[0] * tx + grly[0][0] * radius2) + - ylmcoef[4] * (x * grly[2][0] + rly[2]); + + // std::cout << "\np1 = "<< ylmcoef[5]*grly[4][0] << " p2 = " << + //-ylmcoef[6]*rly[0]*tx + // << " p3 = " << -ylmcoef[4]*x*grly[2][0] << " p4 = " + //<< -ylmcoef[4]*rly[2] << std::endl; + + grly[7][1] = ylmcoef[5] * grly[4][1] + - ylmcoef[6] * (rly[0] * ty + grly[0][1] * radius2) + - tmp2 * grly[2][1]; + grly[7][2] = ylmcoef[5] * grly[4][2] + - ylmcoef[6] * (rly[0] * tz + grly[0][2] * radius2) + - tmp2 * grly[2][2]; + + rly[8] = -tmp2 * rly[3]; + grly[8][0] = -ylmcoef[4] * (rly[3] + x * grly[3][0]); + grly[8][1] = -tmp2 * grly[3][1]; + grly[8][2] = -tmp2 * grly[3][2]; + // rly[8] = tmp1+tmp2*rly[3];//l=2,m=-2 + if (Lmax == 2) + return; + + /*************************** + L = 3 + ***************************/ + rly[9] + = ylmcoef[7] * z * rly[4] - ylmcoef[8] * rly[1] * radius2; // l=3, m=0 + grly[9][0] = ylmcoef[7] * z * grly[4][0] + - ylmcoef[8] * (rly[1] * tx + grly[1][0] * radius2); + grly[9][1] = ylmcoef[7] * z * grly[4][1] + - ylmcoef[8] * (rly[1] * ty + grly[1][1] * radius2); + grly[9][2] = ylmcoef[7] * (rly[4] + z * grly[4][2]) + - ylmcoef[8] * (rly[1] * tz + grly[1][2] * radius2); + + double tmp3 = ylmcoef[9] * z; + rly[10] = tmp3 * rly[5] - ylmcoef[10] * rly[2] * radius2; // l=3,m=1 + grly[10][0] = tmp3 * grly[5][0] + - ylmcoef[10] * (grly[2][0] * radius2 + rly[2] * tx); + grly[10][1] = tmp3 * grly[5][1] + - ylmcoef[10] * (grly[2][1] * radius2 + rly[2] * ty); + grly[10][2] = ylmcoef[9] * (z * grly[5][2] + rly[5]) + - ylmcoef[10] * (grly[2][2] * radius2 + rly[2] * tz); + + rly[11] = tmp3 * rly[6] - ylmcoef[10] * rly[3] * radius2; // l=3,m=-1 + grly[11][0] 
= tmp3 * grly[6][0] + - ylmcoef[10] * (grly[3][0] * radius2 + rly[3] * tx); + grly[11][1] = tmp3 * grly[6][1] + - ylmcoef[10] * (grly[3][1] * radius2 + rly[3] * ty); + grly[11][2] = ylmcoef[9] * (z * grly[6][2] + rly[6]) + - ylmcoef[10] * (grly[3][2] * radius2 + rly[3] * tz); + + double tmp4 = ylmcoef[11] * z; + rly[12] = tmp4 * rly[7]; // l=3,m=2 + grly[12][0] = tmp4 * grly[7][0]; + grly[12][1] = tmp4 * grly[7][1]; + grly[12][2] = ylmcoef[11] * (z * grly[7][2] + rly[7]); + + rly[13] = tmp4 * rly[8]; // l=3,m=-2 + grly[13][0] = tmp4 * grly[8][0]; + grly[13][1] = tmp4 * grly[8][1]; + grly[13][2] = ylmcoef[11] * (z * grly[8][2] + rly[8]); + + double tmp5 = ylmcoef[14] * x; + rly[14] = ylmcoef[12] * rly[10] - ylmcoef[13] * rly[2] * radius2 + - tmp5 * rly[7]; // l=3,m=3 + grly[14][0] = ylmcoef[12] * grly[10][0] + - ylmcoef[13] * (rly[2] * tx + grly[2][0] * radius2) + - ylmcoef[14] * (rly[7] + x * grly[7][0]); + grly[14][1] = ylmcoef[12] * grly[10][1] + - ylmcoef[13] * (rly[2] * ty + grly[2][1] * radius2) + - tmp5 * grly[7][1]; + grly[14][2] = ylmcoef[12] * grly[10][2] + - ylmcoef[13] * (rly[2] * tz + grly[2][2] * radius2) + - tmp5 * grly[7][2]; + + rly[15] = ylmcoef[12] * rly[11] - ylmcoef[13] * rly[3] * radius2 + - tmp5 * rly[8]; // l=3,m=-3 + grly[15][0] = ylmcoef[12] * grly[11][0] + - ylmcoef[13] * (rly[3] * tx + grly[3][0] * radius2) + - ylmcoef[14] * (rly[8] + x * grly[8][0]); + grly[15][1] = ylmcoef[12] * grly[11][1] + - ylmcoef[13] * (rly[3] * ty + grly[3][1] * radius2) + - tmp5 * grly[8][1]; + grly[15][2] = ylmcoef[12] * grly[11][2] + - ylmcoef[13] * (rly[3] * tz + grly[3][2] * radius2) + - tmp5 * grly[8][2]; + if (Lmax == 3) + return; + + /*************************** + L = 4 + ***************************/ + rly[16] + = ylmcoef[15] * z * rly[9] - ylmcoef[16] * rly[4] * radius2; // l=4,m=0 + grly[16][0] = ylmcoef[15] * z * grly[9][0] + - ylmcoef[16] * (rly[4] * tx + grly[4][0] * radius2); + grly[16][1] = ylmcoef[15] * z * grly[9][1] + - ylmcoef[16] * (rly[4] * 
ty + grly[4][1] * radius2); + grly[16][2] = ylmcoef[15] * (z * grly[9][2] + rly[9]) + - ylmcoef[16] * (rly[4] * tz + grly[4][2] * radius2); + + double tmp6 = ylmcoef[17] * z; + rly[17] = tmp6 * rly[10] - ylmcoef[18] * rly[5] * radius2; // l=4,m=1 + grly[17][0] = tmp6 * grly[10][0] + - ylmcoef[18] * (rly[5] * tx + grly[5][0] * radius2); + grly[17][1] = tmp6 * grly[10][1] + - ylmcoef[18] * (rly[5] * ty + grly[5][1] * radius2); + grly[17][2] = ylmcoef[17] * (z * grly[10][2] + rly[10]) + - ylmcoef[18] * (rly[5] * tz + grly[5][2] * radius2); + + rly[18] = tmp6 * rly[11] - ylmcoef[18] * rly[6] * radius2; // l=4,m=-1 + grly[18][0] = tmp6 * grly[11][0] + - ylmcoef[18] * (rly[6] * tx + grly[6][0] * radius2); + grly[18][1] = tmp6 * grly[11][1] + - ylmcoef[18] * (rly[6] * ty + grly[6][1] * radius2); + grly[18][2] = ylmcoef[17] * (z * grly[11][2] + rly[11]) + - ylmcoef[18] * (rly[6] * tz + grly[6][2] * radius2); + + double tmp7 = ylmcoef[19] * z; + rly[19] = tmp7 * rly[12] - ylmcoef[20] * rly[7] * radius2; // l=4,m=2 + grly[19][0] = tmp7 * grly[12][0] + - ylmcoef[20] * (rly[7] * tx + grly[7][0] * radius2); + grly[19][1] = tmp7 * grly[12][1] + - ylmcoef[20] * (rly[7] * ty + grly[7][1] * radius2); + grly[19][2] = ylmcoef[19] * (z * grly[12][2] + rly[12]) + - ylmcoef[20] * (rly[7] * tz + grly[7][2] * radius2); + + rly[20] = tmp7 * rly[13] - ylmcoef[20] * rly[8] * radius2; // l=4,m=-2 + grly[20][0] = tmp7 * grly[13][0] + - ylmcoef[20] * (rly[8] * tx + grly[8][0] * radius2); + grly[20][1] = tmp7 * grly[13][1] + - ylmcoef[20] * (rly[8] * ty + grly[8][1] * radius2); + grly[20][2] = ylmcoef[19] * (z * grly[13][2] + rly[13]) + - ylmcoef[20] * (rly[8] * tz + grly[8][2] * radius2); + + double tmp8 = 3.0 * z; + rly[21] = tmp8 * rly[14]; // l=4,m=3 + grly[21][0] = tmp8 * grly[14][0]; + grly[21][1] = tmp8 * grly[14][1]; + grly[21][2] = 3.0 * (z * grly[14][2] + rly[14]); + + rly[22] = tmp8 * rly[15]; // l=4,m=-3 + grly[22][0] = tmp8 * grly[15][0]; + grly[22][1] = tmp8 * grly[15][1]; + 
grly[22][2] = 3.0 * (z * grly[15][2] + rly[15]); + + double tmp9 = ylmcoef[23] * x; + rly[23] = ylmcoef[21] * rly[19] - ylmcoef[22] * rly[7] * radius2 + - tmp9 * rly[14]; // l=4,m=4 + grly[23][0] = ylmcoef[21] * grly[19][0] + - ylmcoef[22] * (rly[7] * tx + grly[7][0] * radius2) + - ylmcoef[23] * (x * grly[14][0] + rly[14]); + grly[23][1] = ylmcoef[21] * grly[19][1] + - ylmcoef[22] * (rly[7] * ty + grly[7][1] * radius2) + - tmp9 * grly[14][1]; + grly[23][2] = ylmcoef[21] * grly[19][2] + - ylmcoef[22] * (rly[7] * tz + grly[7][2] * radius2) + - tmp9 * grly[14][2]; + + rly[24] = ylmcoef[21] * rly[20] - ylmcoef[22] * rly[8] * radius2 + - tmp9 * rly[15]; // l=4,m=-4 + grly[24][0] = ylmcoef[21] * grly[20][0] + - ylmcoef[22] * (rly[8] * tx + grly[8][0] * radius2) + - ylmcoef[23] * (x * grly[15][0] + rly[15]); + grly[24][1] = ylmcoef[21] * grly[20][1] + - ylmcoef[22] * (rly[8] * ty + grly[8][1] * radius2) + - tmp9 * grly[15][1]; + grly[24][2] = ylmcoef[21] * grly[20][2] + - ylmcoef[22] * (rly[8] * tz + grly[8][2] * radius2) + - tmp9 * grly[15][2]; + + if (Lmax == 4) + return; + + /*************************** + L = 5 + ***************************/ + rly[25] + = ylmcoef[24] * z * rly[16] - ylmcoef[25] * rly[9] * radius2; // l=5,m=0 + grly[25][0] = ylmcoef[24] * z * grly[16][0] + - ylmcoef[25] * (rly[9] * tx + grly[9][0] * radius2); + grly[25][1] = ylmcoef[24] * z * grly[16][1] + - ylmcoef[25] * (rly[9] * ty + grly[9][1] * radius2); + grly[25][2] = ylmcoef[24] * (z * grly[16][2] + rly[16]) + - ylmcoef[25] * (rly[9] * tz + grly[9][2] * radius2); + + double tmp10 = ylmcoef[26] * z; + rly[26] = tmp10 * rly[17] - ylmcoef[27] * rly[10] * radius2; // l=5,m=1 + grly[26][0] = tmp10 * grly[17][0] + - ylmcoef[27] * (rly[10] * tx + grly[10][0] * radius2); + grly[26][1] = tmp10 * grly[17][1] + - ylmcoef[27] * (rly[10] * ty + grly[10][1] * radius2); + grly[26][2] = ylmcoef[26] * (z * grly[17][2] + rly[17]) + - ylmcoef[27] * (rly[10] * tz + grly[10][2] * radius2); + + rly[27] = tmp10 * 
rly[18] - ylmcoef[27] * rly[11] * radius2; // l=5,m=-1 + grly[27][0] = tmp10 * grly[18][0] + - ylmcoef[27] * (rly[11] * tx + grly[11][0] * radius2); + grly[27][1] = tmp10 * grly[18][1] + - ylmcoef[27] * (rly[11] * ty + grly[11][1] * radius2); + grly[27][2] = ylmcoef[26] * (z * grly[18][2] + rly[18]) + - ylmcoef[27] * (rly[11] * tz + grly[11][2] * radius2); + + double tmp11 = ylmcoef[28] * z; + rly[28] = tmp11 * rly[19] - ylmcoef[29] * rly[12] * radius2; // l=5,m=2 + grly[28][0] = tmp11 * grly[19][0] + - ylmcoef[29] * (rly[12] * tx + grly[12][0] * radius2); + grly[28][1] = tmp11 * grly[19][1] + - ylmcoef[29] * (rly[12] * ty + grly[12][1] * radius2); + grly[28][2] = ylmcoef[28] * (z * grly[19][2] + rly[19]) + - ylmcoef[29] * (rly[12] * tz + grly[12][2] * radius2); + + rly[29] = tmp11 * rly[20] - ylmcoef[29] * rly[13] * radius2; // l=5,m=-2 + grly[29][0] = tmp11 * grly[20][0] + - ylmcoef[29] * (rly[13] * tx + grly[13][0] * radius2); + grly[29][1] = tmp11 * grly[20][1] + - ylmcoef[29] * (rly[13] * ty + grly[13][1] * radius2); + grly[29][2] = ylmcoef[28] * (z * grly[20][2] + rly[20]) + - ylmcoef[29] * (rly[13] * tz + grly[13][2] * radius2); + + double tmp12 = ylmcoef[30] * z; + rly[30] = tmp12 * rly[21] - ylmcoef[31] * rly[14] * radius2; // l=5,m=3 + grly[30][0] = tmp12 * grly[21][0] + - ylmcoef[31] * (grly[14][0] * radius2 + rly[14] * tx); + grly[30][1] = tmp12 * grly[21][1] + - ylmcoef[31] * (grly[14][1] * radius2 + rly[14] * ty); + grly[30][2] = ylmcoef[30] * (z * grly[21][2] + rly[21]) + - ylmcoef[31] * (grly[14][2] * radius2 + rly[14] * tz); + + rly[31] = tmp12 * rly[22] - ylmcoef[31] * rly[15] * radius2; // l=5,m=-3 + grly[31][0] = tmp12 * grly[22][0] + - ylmcoef[31] * (grly[15][0] * radius2 + rly[15] * tx); + grly[31][1] = tmp12 * grly[22][1] + - ylmcoef[31] * (grly[15][1] * radius2 + rly[15] * ty); + grly[31][2] = ylmcoef[30] * (z * grly[22][2] + rly[22]) + - ylmcoef[31] * (grly[15][2] * radius2 + rly[15] * tz); + + double tmp13 = ylmcoef[32] * z; + rly[32] = 
tmp13 * rly[23]; // l=5,m=4 + grly[32][0] = tmp13 * grly[23][0]; + grly[32][1] = tmp13 * grly[23][1]; + grly[32][2] = ylmcoef[32] * (rly[23] + z * grly[23][2]); + + rly[33] = tmp13 * rly[24]; // l=5,m=-4 + grly[33][0] = tmp13 * grly[24][0]; + grly[33][1] = tmp13 * grly[24][1]; + grly[33][2] = ylmcoef[32] * (rly[24] + z * grly[24][2]); + + double tmp14 = ylmcoef[35] * x; + rly[34] = ylmcoef[33] * rly[30] - ylmcoef[34] * rly[14] * radius2 + - tmp14 * rly[23]; // l=5,m=5 + grly[34][0] = ylmcoef[33] * grly[30][0] + - ylmcoef[34] * (rly[14] * tx + grly[14][0] * radius2) + - ylmcoef[35] * (x * grly[23][0] + rly[23]); + grly[34][1] = ylmcoef[33] * grly[30][1] + - ylmcoef[34] * (rly[14] * ty + grly[14][1] * radius2) + - tmp14 * grly[23][1]; + grly[34][2] = ylmcoef[33] * grly[30][2] + - ylmcoef[34] * (rly[14] * tz + grly[14][2] * radius2) + - tmp14 * grly[23][2]; + + rly[35] = ylmcoef[33] * rly[31] - ylmcoef[34] * rly[15] * radius2 + - tmp14 * rly[24]; // l=5,m=-5 + grly[35][0] = ylmcoef[33] * grly[31][0] + - ylmcoef[34] * (rly[15] * tx + grly[15][0] * radius2) + - ylmcoef[35] * (x * grly[24][0] + rly[24]); + grly[35][1] = ylmcoef[33] * grly[31][1] + - ylmcoef[34] * (rly[15] * ty + grly[15][1] * radius2) + - tmp14 * grly[24][1]; + grly[35][2] = ylmcoef[33] * grly[31][2] + - ylmcoef[34] * (rly[15] * tz + grly[15][2] * radius2) + - tmp14 * grly[24][2]; + + if (Lmax == 5) + return; + + // if Lmax > 5 + for (int il = 6; il <= Lmax; il++) + { + int istart = il * il; + int istart1 = (il - 1) * (il - 1); + int istart2 = (il - 2) * (il - 2); + + double fac2 = sqrt(4.0 * istart - 1.0); + double fac4 = sqrt(4.0 * istart1 - 1.0); + + for (int im = 0; im < 2 * il - 1; im++) + { + int imm = (im + 1) / 2; + // if (im % 2 == 0) imm *= -1; + + double var1 = fac2 / sqrt((double)istart - imm * imm); + double var2 = sqrt((double)istart1 - imm * imm) / fac4; + + rly[istart + im] = var1 + * (z * rly[istart1 + im] + - var2 * rly[istart2 + im] * radius2); + + grly[istart + im][0] + = var1 + * (z 
* grly[istart1 + im][0] + - var2 + * (rly[istart2 + im] * tx + + grly[istart2 + im][0] * radius2)); + grly[istart + im][1] + = var1 + * (z * grly[istart1 + im][1] + - var2 + * (rly[istart2 + im] * ty + + grly[istart2 + im][1] * radius2)); + grly[istart + im][2] + = var1 + * (z * grly[istart1 + im][2] + rly[istart1 + im] + - var2 + * (rly[istart2 + im] * tz + + grly[istart2 + im][2] * radius2)); + } + + double bl1 = sqrt(2.0 * il / (2.0 * il + 1.0)); + double bl2 = sqrt((2.0 * il - 2.0) / (2.0 * il - 1.0)); + double bl3 = sqrt(2.0) / fac2; + + int id1 = istart + 2 * il - 1; + int id2 = istart + 2 * il - 5; + int id3 = istart2 + 2 * il - 5; + int id4 = istart1 + 2 * il - 3; + + rly[id1] + = (bl3 * rly[id2] - bl2 * rly[id3] * radius2 - 2.0 * x * rly[id4]) + / bl1; + grly[id1][0] = (bl3 * grly[id2][0] + - bl2 * (grly[id3][0] * radius2 + rly[id3] * tx) + - 2.0 * (rly[id4] + x * grly[id4][0])) + / bl1; + grly[id1][1] = (bl3 * grly[id2][1] + - bl2 * (grly[id3][1] * radius2 + rly[id3] * ty) + - 2.0 * x * grly[id4][1]) + / bl1; + grly[id1][2] = (bl3 * grly[id2][2] + - bl2 * (grly[id3][2] * radius2 + rly[id3] * tz) + - 2.0 * x * grly[id4][2]) + / bl1; + + rly[id1 + 1] = (bl3 * rly[id2 + 1] - bl2 * rly[id3 + 1] * radius2 + - 2.0 * x * rly[id4 + 1]) + / bl1; + grly[id1 + 1][0] + = (bl3 * grly[id2 + 1][0] + - bl2 * (grly[id3 + 1][0] * radius2 + rly[id3 + 1] * tx) + - 2.0 * (rly[id4 + 1] + x * grly[id4 + 1][0])) + / bl1; + grly[id1 + 1][1] + = (bl3 * grly[id2 + 1][1] + - bl2 * (grly[id3 + 1][1] * radius2 + rly[id3 + 1] * ty) + - 2.0 * x * grly[id4 + 1][1]) + / bl1; + grly[id1 + 1][2] + = (bl3 * grly[id2 + 1][2] + - bl2 * (grly[id3 + 1][2] * radius2 + rly[id3 + 1] * tz) + - 2.0 * x * grly[id4 + 1][2]) + / bl1; + } + + return; +} \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_gint/test/test_sph.cu b/source/module_hamilt_lcao/module_gint/test/test_sph.cu new file mode 100644 index 0000000000..5476ac88fa --- /dev/null +++ 
b/source/module_hamilt_lcao/module_gint/test/test_sph.cu
@@ -0,0 +1,156 @@
+#include <cmath>
+#include <random>
+#include <vector>
+#include "../kernels/cuda/sph.cuh"
+
+#include "float.h"
+#include "cuda_runtime.h"
+#include "device_functions.h"
+#include "device_launch_parameters.h"
+#include "gtest/gtest.h"
+#include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
+#include "test_sph.h"
+using namespace std;
+
+// Empty fixture: the single test below keeps all of its state local.
+class gintTest : public ::testing::Test
+{
+  public:
+};
+
+// Device-side wrapper: evaluates GintKernel::spherical_harmonics into a local
+// 49-element buffer and copies that buffer to ylma_g. Every thread would write
+// the same values, so the test launches it as <<<1, 1>>>.
+__global__ void cuda_test(double* dr, double distance, int nwl, double* ylma_g, double* ylmcoef)
+{
+    double ylma[49] = {0.0};
+    GintKernel::spherical_harmonics(dr, distance, nwl, ylma, ylmcoef);
+    for (int i = 0; i < 49; i++)
+    {
+        ylma_g[i] = ylma[i];
+    }
+}
+
+// Same wrapper for GintKernel::spherical_harmonics_d. Only the 49 harmonic
+// values are copied out; the gradients written to grly are discarded.
+__global__ void cuda_test2(double* dr, double distance, int nwl, double* dylma_g, double* ylmcoef)
+{
+    double ylma[49] = {0.0};
+    double grly[49][3] = {0.0};
+    GintKernel::spherical_harmonics_d(dr, distance, grly, nwl, ylma, ylmcoef);
+    for (int i = 0; i < 49; i++)
+    {
+        dylma_g[i] = ylma[i];
+    }
+}
+
+// Fill result[0 .. length) with uniform random doubles drawn from [min, max).
+void get_random_double(int min, int max, double* result, int length)
+{
+    std::random_device rd;
+    std::default_random_engine eng(rd());
+    std::uniform_real_distribution<double> distribution(min, max);
+    for (int i = 0; i < length; i++)
+    {
+        result[i] = distribution(eng);
+    }
+}
+
+// Store one uniform random integer from [min, max] in result.
+void get_random_int(int min, int max, int& result)
+{
+    std::random_device rd;
+    std::default_random_engine eng(rd());
+    std::uniform_int_distribution<int> distribution(min, max);
+    result = distribution(eng);
+}
+
+// Compare the two device kernels with the host reference functions declared in test_sph.h.
+TEST_F(gintTest, test)
+{
+    const int nwl = 3;
+    // Accumulated with += below, so it has to start from zero.
+    // NOTE(review): this ends up as the squared length of dr (no sqrt) -- confirm that is what the kernels expect.
+    double distance = 0.0;
+
+    double dr[3];
+    double* dr_g = nullptr;
+
+    // Zero-filled so that a failed device copy can never leave indeterminate values behind.
+    double ylma[49] = {0.0};
+    double dylma[49] = {0.0};
+    double ylma_ans[49] = {0.0};
+
+    double* ylmcoef_g = nullptr;
+    double* ylma_g = nullptr;
+    double* dylma_g = nullptr;
+    std::vector<double> ylmcoef(100);
+
+    std::vector<double> ylma_cpu(49, 0.0);
+    std::vector<double> ylma_cpu_dpsir(49, 0.0);
+    std::vector<std::vector<double>> ylma_cpu_ddpsir(49, std::vector<double>(3, 0.0));
+
+    for (int i = 0; i < 3; i++)
+    {
+        dr[i] = i * 1.0;
+        distance += dr[i] * dr[i];
+    }
+    for (int i = 0; i < 100; i++)
+    {
+        ylmcoef[i] = i * 0.1;
+    }
+
+    ASSERT_EQ(cudaMalloc((void**)&ylmcoef_g, 100 * sizeof(double)), cudaSuccess);
+    ASSERT_EQ(cudaMalloc((void**)&dr_g, 3 * sizeof(double)), cudaSuccess);
+    ASSERT_EQ(cudaMalloc((void**)&ylma_g, 49 * sizeof(double)), cudaSuccess);
+    ASSERT_EQ(cudaMalloc((void**)&dylma_g, 49 * 3 * sizeof(double)), cudaSuccess);
+
+    ASSERT_EQ(cudaMemcpy(ylmcoef_g, ylmcoef.data(), 100 * sizeof(double), cudaMemcpyHostToDevice), cudaSuccess);
+    ASSERT_EQ(cudaMemcpy(dr_g, dr, 3 * sizeof(double), cudaMemcpyHostToDevice), cudaSuccess);
+    ASSERT_EQ(cudaMemset(ylma_g, 0, 49 * sizeof(double)), cudaSuccess);
+    // Clear the whole allocation, not just its first 49 entries.
+    ASSERT_EQ(cudaMemset(dylma_g, 0, 49 * 3 * sizeof(double)), cudaSuccess);
+
+    cuda_test<<<1, 1>>>(dr_g, distance, nwl, ylma_g, ylmcoef_g);
+    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+    cuda_test2<<<1, 1>>>(dr_g, distance, nwl, dylma_g, ylmcoef_g);
+    ASSERT_EQ(cudaGetLastError(), cudaSuccess);
+    sph_harm(nwl, dr[0], dr[1], dr[2], ylma_cpu, ylmcoef.data());
+    grad_rl_sph_harm(nwl, dr[0], dr[1], dr[2], ylma_cpu_dpsir, ylma_cpu_ddpsir, ylmcoef.data());
+    // The blocking copies also surface any fault raised while the kernels were running.
+    ASSERT_EQ(cudaMemcpy(ylma, ylma_g, 49 * sizeof(double), cudaMemcpyDeviceToHost), cudaSuccess);
+    ASSERT_EQ(cudaMemcpy(dylma, dylma_g, 49 * sizeof(double), cudaMemcpyDeviceToHost), cudaSuccess);
+    ASSERT_EQ(cudaDeviceReset(), cudaSuccess);
+
+    for (int i = 0; i < 49; i++)
+    {
+        // x == x is false only for NaN; entries that are zero or NaN are skipped.
+        ylma_ans[i] = ylma_cpu[i];
+        if ((std::abs(ylma[i]) != 0) && (ylma_ans[i] == ylma_ans[i]) && (ylma[i] == ylma[i]))
+        {
+            EXPECT_LT(std::abs(ylma_ans[i] - ylma[i]) / std::abs(ylma[i]), 1e-15);
+        }
+        ylma_ans[i] = ylma_cpu_dpsir[i];
+        if ((std::abs(dylma[i]) != 0) && (ylma_ans[i] == ylma_ans[i]) && (dylma[i] == dylma[i]))
+        {
+            EXPECT_LT(std::abs(ylma_ans[i] - dylma[i]) / std::abs(dylma[i]), 1e-15);
+        }
+    }
+}
+
+int main(int argc, char** argv)
+{
+#ifdef __MPI
+    MPI_Init(&argc, &argv);
+    MPI_Comm_size(MPI_COMM_WORLD, &GlobalV::NPROC);
+    MPI_Comm_rank(MPI_COMM_WORLD, &GlobalV::MY_RANK);
+#endif
+    testing::InitGoogleTest(&argc, argv);
+    int result = RUN_ALL_TESTS();
+
+#ifdef __MPI
+    MPI_Finalize();
+#endif
+
+    return result;
+}
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_gint/test/test_sph.h b/source/module_hamilt_lcao/module_gint/test/test_sph.h
new file mode 100644
index 0000000000..1f7aab9878
--- /dev/null
+++ b/source/module_hamilt_lcao/module_gint/test/test_sph.h
@@
-0,0 +1,19 @@
+#ifndef TEST_SPH_H
+#define TEST_SPH_H
+#include <vector>
+// using namespace std;
+void sph_harm(const int& Lmax,
+              const double& xdr,
+              const double& ydr,
+              const double& zdr,
+              std::vector<double>& rly,
+              double* ylmcoef);
+
+void grad_rl_sph_harm(const int& Lmax, // max momentum of L
+                      const double& x,
+                      const double& y,
+                      const double& z,
+                      std::vector<double>& rly,
+                      std::vector<std::vector<double>>& grly,
+                      const double* ylmcoef);
+#endif
\ No newline at end of file
diff --git a/source/module_hsolver/CMakeLists.txt b/source/module_hsolver/CMakeLists.txt
index ec64e49f5f..eac67c9b32 100644
--- a/source/module_hsolver/CMakeLists.txt
+++ b/source/module_hsolver/CMakeLists.txt
@@ -21,7 +21,7 @@ if(ENABLE_LCAO)
     )
   endif ()
 
-  if(USE_CUSOLVER_LCAO AND USE_CUDA)
+  if(USE_CUDA)
     list(APPEND objects
         ./kernels/math_kernel_op.cpp
         ./kernels/dngvd_op.cpp
diff --git a/source/module_hsolver/diago_cusolver.cpp b/source/module_hsolver/diago_cusolver.cpp
index 871783334b..185788c07b 100644
--- a/source/module_hsolver/diago_cusolver.cpp
+++ b/source/module_hsolver/diago_cusolver.cpp
@@ -1,92 +1,199 @@
 #include "diago_cusolver.h"
 #include "module_base/global_variable.h"
-#include "module_base/lapack_connector.h"
 #include "module_base/timer.h"
-#include "module_base/tool_quit.h"
-
-extern "C"
-{
 #include "module_base/blacs_connector.h"
 #include "module_base/scalapack_connector.h"
-}
 
-// Define matrix types for real and complex numbers
-typedef hamilt::MatrixBlock<double> matd;
-typedef hamilt::MatrixBlock<std::complex<double>> matcd;
+#include <algorithm>
+#include <type_traits>
+
+using complex = std::complex<double>;
 
 // Namespace for the diagonalization solver
 namespace hsolver
 {
 // Initialize the DecomposedState variable for real and complex numbers
-    template <>
-    int DiagoCusolver<double>::DecomposedState = 0;
-    template <>
-    int DiagoCusolver<std::complex<double>>::DecomposedState = 0;
-
-    // Diagonalization function for complex numbers
-    template <>
-    void DiagoCusolver<std::complex<double>>::diag(hamilt::Hamilt<std::complex<double>>* phm_in,
-                                                   psi::Psi<std::complex<double>>& psi,
-                                                   Real* eigenvalue_in)
-    {
-        // Output the title for the current operation
-        ModuleBase::TITLE("DiagoCusolver", "diag");
+    template <typename T>
+    int DiagoCusolver<T>::DecomposedState = 0;
 
-        // Create matrices for the Hamiltonian and overlap
-        matcd h_mat, s_mat;
-        phm_in->matrix(h_mat, s_mat);
+    template <typename T>
+    DiagoCusolver<T>::DiagoCusolver(const Parallel_Orbitals* ParaV)
+    {
+        this->ParaV = ParaV;
+    }
 
-        // Calculate the size based on the number of bands and basis functions
-        int size = psi.get_nbands() * psi.get_nbasis();
+    template <typename T>
+    DiagoCusolver<T>::~DiagoCusolver()
+    {
+    }
+
+    // Wrapper for pdgemr2d and pzgemr2d
+    static inline void Cpxgemr2d(
+        const int M, const int N,
+        double *a, const int ia, const int ja, const int *desca,
+        double *b, const int ib, const int jb, const int *descb,
+        const int blacs_ctxt)
+    {
+        pdgemr2d_(&M, &N,
+                  a, &ia, &ja, desca,
+                  b, &ib, &jb, descb,
+                  &blacs_ctxt);
+    }
 
-        // Allocate memory for eigenvalues and eigenvectors
-        std::vector<double> eigen(GlobalV::NLOCAL, 0.0);
-        std::complex<double>* eigenvectors = new std::complex<double>[h_mat.row * h_mat.col];
+    static inline void Cpxgemr2d(
+        const int M, const int N,
+        complex *a, const int ia, const int ja, const int *desca,
+        complex *b, const int ib, const int jb, const int *descb,
+        const int blacs_ctxt)
+    {
+        pzgemr2d_(&M, &N,
+                  a, &ia, &ja, desca,
+                  b, &ib, &jb, descb,
+                  &blacs_ctxt);
+    }
 
-        // Start the timer for the cusolver operation
-        ModuleBase::timer::tick("DiagoCusolver", "cusolver");
+    // Use Cpxgemr2d to collect matrices from all processes to root process.
+    // mat_g.p is allocated here with new[] (nrows * ncols elements on the root
+    // process, a single placeholder element everywhere else) and has to be
+    // released by the caller with delete[].
+    template <typename mat>
+    static void gatherMatrix(const int myid,
+                             const int root_proc,
+                             const mat& mat_l,
+                             mat& mat_g)
+    {
+        auto a = mat_l.p;
+        decltype(a) b;
+        const int* desca = mat_l.desc;
+        int ctxt = desca[1];
+        int nrows = desca[2];
+        int ncols = desca[3];
+
+        if (myid == root_proc)
+            b = new typename std::remove_reference<decltype(*a)>::type[nrows * ncols];
+        else
+            b = new typename std::remove_reference<decltype(*a)>::type[1];
+
+        // Set descb, which has all elements in the only block in the root process
+        int descb[9] = {1, ctxt, nrows, ncols, nrows, ncols, 0, 0, nrows};
+
+        // descb lives on this stack frame, so it is only handed to Cpxgemr2d below;
+        // storing it in mat_g would leave a dangling pointer once this function returns.
+        mat_g.desc = nullptr;
+        mat_g.row = nrows;
+        mat_g.col = ncols;
+        mat_g.p = b;
+
+        Cpxgemr2d(nrows, ncols, a, 1, 1, desca, b, 1, 1, descb, ctxt);
+    }
 
-        // Call the dense complex diagonalization routine
-        this->dc.Dngvd_complex(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), eigenvectors);
+    // Convert the Psi to a 2D block storage format: psi_g is described as a single
+    // nrows x ncols block owned by process (rsrc, csrc) and is redistributed into
+    // psi according to desc_psi.
+    template <typename T>
+    static void distributePsi(const int* desc_psi, T* psi, T* psi_g)
+    {
+        int ctxt = desc_psi[1];
+        int nrows = desc_psi[2];
+        int ncols = desc_psi[3];
+        int rsrc = desc_psi[6];
+        int csrc = desc_psi[7];
 
-        // Stop the timer for the cusolver operation
-        ModuleBase::timer::tick("DiagoCusolver", "cusolver");
+        int descg[9] = {1, ctxt, nrows, ncols, nrows, ncols, rsrc, csrc, nrows};
+        int descl[9];
 
-        // Copy the eigenvalues and eigenvectors to the output arrays
-        const int inc = 1;
-        BlasConnector::copy(GlobalV::NBANDS, eigen.data(), inc, eigenvalue_in, inc);
-        BlasConnector::copy(size, eigenvectors, inc, psi.get_pointer(), inc);
+        std::copy(desc_psi, desc_psi + 9, descl);
 
-        // Free allocated memory
-        delete[] eigenvectors;
+        Cpxgemr2d(nrows, ncols, psi_g, 1, 1, descg, psi, 1, 1, descl, ctxt);
     }
 
-    // Diagonalization function for real numbers
-    template <>
-    void DiagoCusolver<double>::diag(hamilt::Hamilt<double>* phm_in, psi::Psi<double>& psi, Real* eigenvalue_in)
+    // Diagonalization function. With __MPI the local H and S blocks are gathered on
+    // root_proc, solved there with this->dc.Dngvd, and the results are sent back:
+    // the eigenvalues by MPI_Bcast and the eigenvectors by distributePsi. Without
+    // __MPI the local matrices are passed to this->dc.Dngvd directly.
+    // NOTE(review): the __MPI path dereferences ParaV, which the constructor accepts
+    // as a null pointer by default -- confirm every caller passes a valid one.
+    // NOTE(review): root_proc is a BLACS process number but is also used as the
+    // MPI_COMM_WORLD root in MPI_Bcast -- confirm the two numberings agree.
+    template <typename T>
+    void DiagoCusolver<T>::diag(hamilt::Hamilt<T>* phm_in,
+                                psi::Psi<T>& psi,
+                                Real* eigenvalue_in)
     {
         // Output the title for the current operation
         ModuleBase::TITLE("DiagoCusolver", "diag");
 
         // Create matrices for the Hamiltonian and overlap
-        matd h_mat, s_mat;
+        hamilt::MatrixBlock<T> h_mat, s_mat;
         phm_in->matrix(h_mat, s_mat);
 
+#ifdef __MPI
+        // global matrix
+        hamilt::MatrixBlock<T> h_mat_g, s_mat_g;
+
+        // global psi for distribute
+        T* psi_g;
+
+        // get the context and process information
+        int ctxt = ParaV->blacs_ctxt;
+        int nprows, npcols, myprow, mypcol;
+        Cblacs_gridinfo(ctxt, &nprows, &npcols, &myprow, &mypcol);
+        int myid = Cblacs_pnum(ctxt, myprow, mypcol);
+        const int root_proc = Cblacs_pnum(ctxt, ParaV->desc[6], ParaV->desc[7]);
+
+#endif
+
         // Allocate memory for eigenvalues
         std::vector<double> eigen(GlobalV::NLOCAL, 0.0);
 
         // Start the timer for the cusolver operation
         ModuleBase::timer::tick("DiagoCusolver", "cusolver");
 
-        // Call the dense double diagonalization routine
-        this->dc.Dngvd_double(h_mat.col, h_mat.row, h_mat.p, s_mat.p, eigen.data(), psi.get_pointer());
-
+#ifdef __MPI
+        // gather matrices from processes to root process
+        gatherMatrix(myid, root_proc, h_mat, h_mat_g);
+        gatherMatrix(myid, root_proc, s_mat, s_mat_g);
+#endif
+
+        // Call the dense diagonalization routine
+#ifdef __MPI
+        MPI_Barrier(MPI_COMM_WORLD);
+        if (myid == root_proc)
+        {
+            psi_g = new T[h_mat_g.row * h_mat_g.col];
+            this->dc.Dngvd(h_mat_g.col, h_mat_g.row, h_mat_g.p, s_mat_g.p, eigen.data(), psi_g);
+        }
+        else
+        {
+            psi_g = new T[1];
+        }
+        MPI_Barrier(MPI_COMM_WORLD);
+        // broadcast eigenvalues to all processes
+        MPI_Bcast(eigen.data(), GlobalV::NBANDS, MPI_DOUBLE, root_proc, MPI_COMM_WORLD);
+
+        // distribute psi to all processes
+        distributePsi(this->ParaV->desc_wfc, psi.get_pointer(), psi_g);
+#else
+        // Call the dense diagonalization routine
+        this->dc.Dngvd(h_mat.row, h_mat.col, h_mat.p, s_mat.p, eigen.data(), psi.get_pointer());
+#endif
         // Stop the timer for the cusolver operation
         ModuleBase::timer::tick("DiagoCusolver", "cusolver");
 
-        // Copy the eigenvalues to the output array
+        // Copy the eigenvalues and eigenvectors to the output arrays
         const int inc = 1;
         BlasConnector::copy(GlobalV::NBANDS, eigen.data(), inc, eigenvalue_in, inc);
+
+        // Free allocated memory
+#ifdef __MPI
+        delete[] h_mat_g.p;
+        delete[] s_mat_g.p;
+        delete[] psi_g;
+#endif
     }
 
+    // Explicit instantiation of the DiagoCusolver class for real and complex numbers
+    template class DiagoCusolver<double>;
+    template class DiagoCusolver<complex>;
+
 } // namespace hsolver
diff --git a/source/module_hsolver/diago_cusolver.h
b/source/module_hsolver/diago_cusolver.h index 5501c25fac..0b0d1f5fba 100644 --- a/source/module_hsolver/diago_cusolver.h +++ b/source/module_hsolver/diago_cusolver.h @@ -16,8 +16,13 @@ class DiagoCusolver : public DiagH private: // Real is the real part of the complex type T using Real = typename GetTypeReal::type; + Parallel_Orbitals const * ParaV; public: + + DiagoCusolver(const Parallel_Orbitals* ParaV = nullptr); + ~DiagoCusolver(); + // Override the diag function for CUSOLVER diagonalization void diag(hamilt::Hamilt* phm_in, psi::Psi& psi, Real* eigenvalue_in) override; diff --git a/source/module_hsolver/hsolver_lcao.cpp b/source/module_hsolver/hsolver_lcao.cpp index e9f00534fc..703430bbf0 100644 --- a/source/module_hsolver/hsolver_lcao.cpp +++ b/source/module_hsolver/hsolver_lcao.cpp @@ -13,7 +13,7 @@ #ifdef __ELPA #include "diago_elpa.h" #endif -#ifdef __CUSOLVER_LCAO +#ifdef __CUDA #include "diago_cusolver.h" #endif @@ -67,7 +67,7 @@ void HSolverLCAO::solveTemplate(hamilt::Hamilt* pHamilt, } } #endif -#ifdef __CUSOLVER_LCAO +#ifdef __CUDA else if (this->method == "cusolver") { if (this->pdiagh != nullptr) @@ -80,7 +80,7 @@ void HSolverLCAO::solveTemplate(hamilt::Hamilt* pHamilt, } if (this->pdiagh == nullptr) { - this->pdiagh = new DiagoCusolver(); + this->pdiagh = new DiagoCusolver(this->ParaV); this->pdiagh->method = this->method; } } diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cu b/source/module_hsolver/kernels/cuda/diag_cusolver.cu index b094abfa4f..90548f1c9b 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cu +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cu @@ -40,7 +40,7 @@ void Diag_Cusolver_gvd::finalize(){ Diag_Cusolver_gvd::~Diag_Cusolver_gvd(){ finalize(); if (cusolverH) {checkCudaErrors( cusolverDnDestroy(cusolverH) ); cusolverH = NULL;} - checkCudaErrors( cudaDeviceReset() ); + //checkCudaErrors( cudaDeviceReset() ); } diff --git a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh 
b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh index 434abcf938..ad9ba46eeb 100644 --- a/source/module_hsolver/kernels/cuda/diag_cusolver.cuh +++ b/source/module_hsolver/kernels/cuda/diag_cusolver.cuh @@ -74,6 +74,17 @@ public: void Dngvd_double(int N, int M, double *A, double *B, double *W, double *V); void Dngvd_complex(int N, int M, std::complex *A, std::complex *B, double *W, std::complex *V); + + void Dngvd(int N, int M, double *A, double *B, double *W, double *V) + { + return Dngvd_double(N, M, A, B, W, V); + }; + + void Dngvd(int N, int M, std::complex *A, std::complex *B, double *W, std::complex *V) + { + return Dngvd_complex(N, M, A, B, W, V); + }; + }; #endif diff --git a/source/module_hsolver/test/CMakeLists.txt b/source/module_hsolver/test/CMakeLists.txt index 90dcddbacd..795854fcc9 100644 --- a/source/module_hsolver/test/CMakeLists.txt +++ b/source/module_hsolver/test/CMakeLists.txt @@ -94,7 +94,7 @@ if(ENABLE_LCAO) ) endif() endif() -if (USE_CUDA AND USE_CUSOLVER_LCAO) +if (USE_CUDA) AddTest( TARGET HSolver_LCAO_cusolver LIBS ${math_libs} base psi device diff --git a/source/module_hsolver/test/diago_lcao_cusolver_test.cpp b/source/module_hsolver/test/diago_lcao_cusolver_test.cpp index a039d4f2bf..3f29d823d4 100644 --- a/source/module_hsolver/test/diago_lcao_cusolver_test.cpp +++ b/source/module_hsolver/test/diago_lcao_cusolver_test.cpp @@ -8,7 +8,7 @@ #ifdef __ELPA #include "module_hsolver/diago_elpa.h" #endif -#ifdef __CUSOLVER_LCAO +#ifdef __CUDA #include "module_hsolver/diago_cusolver.h" #endif @@ -77,7 +77,7 @@ class DiagoPrepare if (ks_solver == "scalapack_gvx") dh = new hsolver::DiagoBlas; -#ifdef __CUSOLVER_LCAO +#ifdef __CUDA else if (ks_solver == "cusolver") dh = new hsolver::DiagoCusolver; #endif @@ -303,7 +303,7 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( // int nlocal, int nbands, int nb2d, int sparsity, std::string ks_solver_in, std::string hfname, // std::string sfname DiagoPrepare(0, 0, 1, 0, "genelpa", 
"H-GammaOnly-Si2.dat", // "S-GammaOnly-Si2.dat") -#ifdef __CUSOLVER_LCAO +#ifdef __CUDA DiagoPrepare(0, 0, 32, 0, "cusolver", "H-GammaOnly-Si64.dat", "S-GammaOnly-Si64.dat"), #endif DiagoPrepare(0, 0, 1, 0, "scalapack_gvx", "H-GammaOnly-Si2.dat", "S-GammaOnly-Si2.dat"), @@ -332,7 +332,7 @@ INSTANTIATE_TEST_SUITE_P( DiagoKPointsTest, ::testing::Values( // int nlocal, int nbands, int nb2d, int sparsity, std::string ks_solver_in, std::string hfname, // std::string sfname DiagoPrepare>(800, 400, 32, 7, "genelpa", "", ""), -#ifdef __CUSOLVER_LCAO +#ifdef __CUDA DiagoPrepare>(0, 0, 1, 0, "cusolver", "H-KPoints-Si2.dat", "S-KPoints-Si2.dat"), #endif // DiagoPrepare>(0, 0, 32, 0, "genelpa", "H-KPoints-Si64.dat", "S-KPoints-Si64.dat"), diff --git a/source/module_io/input.cpp b/source/module_io/input.cpp index 042de80af9..e7ee360a28 100644 --- a/source/module_io/input.cpp +++ b/source/module_io/input.cpp @@ -375,6 +375,7 @@ void Input::Default(void) lcao_dr = 0.01; lcao_rmax = 30; // (a.u.) onsite_radius = 0; // (a.u.) 
+ nstream=4; //---------------------------------------------------------- // efield and dipole correction Yu Liu add 2022-05-18 //---------------------------------------------------------- @@ -1534,6 +1535,10 @@ bool Input::Read(const std::string& fn) { read_value(ifs, onsite_radius); } + else if (strcmp("num_stream",word)==0) + { + read_value(ifs,nstream); + } //---------------------------------------------------------- // Molecule Dynamics // Yu Liu add 2021-07-30 @@ -3057,6 +3062,13 @@ void Input::Default_2(void) // jiyy add 2019-08-04 { if (ks_solver == "default") { + if(device == "gpu") + { + ks_solver = "cusolver"; + ModuleBase::GlobalFunc::AUTO_SET("ks_solver", "cusolver"); + } + else + { #ifdef __ELPA ks_solver = "genelpa"; ModuleBase::GlobalFunc::AUTO_SET("ks_solver", "genelpa"); @@ -3064,6 +3076,7 @@ void Input::Default_2(void) // jiyy add 2019-08-04 ks_solver = "scalapack_gvx"; ModuleBase::GlobalFunc::AUTO_SET("ks_solver", "scalapack_gvx"); #endif + } } if (lcao_ecut == 0) { @@ -3311,6 +3324,7 @@ void Input::Bcast() Parallel_Common::bcast_string(basis_type); // xiaohui add 2013-09-01 Parallel_Common::bcast_string(ks_solver); // xiaohui add 2013-09-01 + Parallel_Common::bcast_int(nstream); Parallel_Common::bcast_double(search_radius); Parallel_Common::bcast_bool(search_pbc); Parallel_Common::bcast_double(search_radius); diff --git a/source/module_io/input.h b/source/module_io/input.h index 03a6295bde..1f2f81903a 100644 --- a/source/module_io/input.h +++ b/source/module_io/input.h @@ -137,7 +137,7 @@ class Input double press2; double press3; bool cal_stress; // calculate the stress - + int nstream; std::string fixed_axes; // which axes are fixed bool fixed_ibrav; //whether to keep type of lattice; must be used along with latname bool fixed_atoms; //whether to fix atoms during vc-relax diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp index 8d27db476f..755067f29b 100644 --- a/source/module_io/input_conv.cpp +++ 
b/source/module_io/input_conv.cpp @@ -308,10 +308,9 @@ void Input_Conv::Convert(void) GlobalV::MIN_DIST_COEF = INPUT.min_dist_coef; GlobalV::NBANDS = INPUT.nbands; GlobalV::NBANDS_ISTATE = INPUT.nbands_istate; + GlobalV::device_flag = psi::device::get_device_flag(INPUT.device, INPUT.ks_solver, INPUT.basis_type, INPUT.gamma_only_local); - GlobalV::device_flag = psi::device::get_device_flag(INPUT.device, INPUT.ks_solver, INPUT.basis_type); - - if (GlobalV::device_flag == "gpu") + if (GlobalV::device_flag == "gpu" && INPUT.basis_type == "pw") { GlobalV::KPAR = psi::device::get_device_kpar(INPUT.kpar); } @@ -363,6 +362,8 @@ void Input_Conv::Convert(void) GlobalV::CAL_STRESS = INPUT.cal_stress; + GlobalV::NUM_STREAM = INPUT.nstream; + GlobalV::RELAX_METHOD = INPUT.relax_method; GlobalV::relax_scale_force = INPUT.relax_scale_force; GlobalV::relax_new = INPUT.relax_new; diff --git a/source/module_io/test/input_conv_test.cpp b/source/module_io/test/input_conv_test.cpp index 818cbbf0cf..bd135d19d5 100644 --- a/source/module_io/test/input_conv_test.cpp +++ b/source/module_io/test/input_conv_test.cpp @@ -192,6 +192,8 @@ TEST_F(InputConvTest, Conv) EXPECT_EQ(GlobalV::sc_file, "sc.json"); EXPECT_EQ(GlobalV::MIXING_RESTART,0.0); EXPECT_EQ(GlobalV::MIXING_DMR,false); + + EXPECT_EQ(GlobalV::NUM_STREAM,4); } TEST_F(InputConvTest, ConvRelax) @@ -267,13 +269,7 @@ TEST_F(InputConvTest, ConvRelax) testing::internal::CaptureStdout(); EXPECT_EXIT(Input_Conv::Convert(), ::testing::ExitedWithCode(0),""); output2 = testing::internal::GetCapturedStdout(); - EXPECT_THAT(output2,testing::HasSubstr("INPUT device setting does not match the request!" 
- "\n Input device = gpu" - "\n Input basis_type = pw" - "\n Input ks_solver = cg" - "\n Compile setting = host" - "\n Environment device_num = -1" - "\n")); + EXPECT_THAT(output2,testing::HasSubstr("The GPU is not supported in this build!")); } TEST_F(InputConvTest, dftplus) diff --git a/source/module_io/test/input_test_para.cpp b/source/module_io/test/input_test_para.cpp index c0cf0ed2c7..28a4562947 100644 --- a/source/module_io/test/input_test_para.cpp +++ b/source/module_io/test/input_test_para.cpp @@ -389,6 +389,7 @@ TEST_F(InputParaTest, Bcast) EXPECT_EQ(INPUT.mixing_dmr,false); EXPECT_EQ(INPUT.out_bandgap, 0); EXPECT_EQ(INPUT.out_mat_t, 0); + EXPECT_EQ(INPUT.nstream,4); /* I need to test qo_switch, qo_strategy, qo_screening_coeff, qo_thr and qo_basis */ EXPECT_EQ(INPUT.qo_switch, 0); diff --git a/source/module_io/test/write_input_test.cpp b/source/module_io/test/write_input_test.cpp index 8f9059a67e..ef9c6a3d28 100644 --- a/source/module_io/test/write_input_test.cpp +++ b/source/module_io/test/write_input_test.cpp @@ -742,6 +742,7 @@ TEST_F(write_input, BerryWannier17) EXPECT_THAT( output, testing::HasSubstr("out_wannier_wvfn_formatted 1 #output UNK. 
file in text format or in binary format"));
+    EXPECT_THAT(output, testing::HasSubstr(""));
     ifs.close();
     remove("write_input_test.log");
diff --git a/source/module_io/write_input.cpp b/source/module_io/write_input.cpp
index fbe46f0451..a9754d8939 100644
--- a/source/module_io/write_input.cpp
+++ b/source/module_io/write_input.cpp
@@ -246,6 +246,7 @@ ModuleBase::GlobalFunc::OUTP(ofs, "out_bandgap", out_bandgap, "if true, print ou
     ModuleBase::GlobalFunc::OUTP(ofs, "bx", bx, "division of an element grid in FFT grid along x");
     ModuleBase::GlobalFunc::OUTP(ofs, "by", by, "division of an element grid in FFT grid along y");
     ModuleBase::GlobalFunc::OUTP(ofs, "bz", bz, "division of an element grid in FFT grid along z");
+    ModuleBase::GlobalFunc::OUTP(ofs, "num_stream",nstream,"the nstream in compute the LCAO with CUDA");
 
     ofs << "\n#Parameters (6.Smearing)" << std::endl;
     ModuleBase::GlobalFunc::OUTP(ofs,
diff --git a/source/module_psi/kernels/device.cpp b/source/module_psi/kernels/device.cpp
index 5cfeef47e2..3b717ee36b 100644
--- a/source/module_psi/kernels/device.cpp
+++ b/source/module_psi/kernels/device.cpp
@@ -556,42 +556,44 @@ int get_node_rank() {
 }
 #endif
 
-std::string get_device_flag(const std::string& device, const std::string& ks_solver, const std::string& basis_type) {
-    std::string str = "gpu";
-    std::string env = "host";
-    int device_num = -1;
-#if ((defined __CUDA) || (defined __ROCM))
-    device_num = device::get_device_num();
-    if (device_num <= 0) {
-        str = "cpu";
+// Validate the requested device and return the flag to run with: "cpu" when
+// device == "cpu", "gpu" when device == "gpu" and none of the checks below rejects
+// it. Every rejected combination is reported through ModuleBase::WARNING_QUIT.
+// ks_solver is kept in the signature but is not consulted by this function.
+std::string get_device_flag(const std::string& device, const std::string& ks_solver,
+                            const std::string& basis_type, const bool& gamma_only) {
+    if (device == "cpu") {
+        return "cpu";
     }
-    env = "device";
+    else if (device == "gpu") {
+#if ((defined __CUDA) || (defined __ROCM))
+        int device_num = device::get_device_num();
+        if (device_num <= 0) {
+            std::string msg = "Cannot find GPU on this computer!";
+            ModuleBase::WARNING_QUIT("device", msg);
+            return "unknown";
+        }
 #else
-    str = "cpu";
+        std::string msg = "The GPU is not supported in this build!";
+        ModuleBase::WARNING_QUIT("device", msg);
+        return "unknown";
 #endif
-    if (ks_solver != "cg" &&
-        ks_solver != "dav" &&
-        ks_solver != "dav_subspace" &&
-        ks_solver != "bpcg")
-    {
-        str = "cpu";
-    }
-    if (basis_type != "pw") {
-        str = "cpu";
-    }
-    if (device == "cpu") {
-        str = "cpu";
-    }
-    if (str == device) {
-        return str;
+        if (basis_type == "lcao_in_pw") {
+            std::string msg = "The GPU currently does not support the basis type \"lcao_in_pw\"!";
+            ModuleBase::WARNING_QUIT("device", msg);
+            return "unknown";
+        }
+        else if (basis_type == "lcao" && gamma_only == false) {
+            std::string msg = "The GPU currently does not support the basis type \"lcao\" with \"gamma_only\" set to \"0\"!";
+            ModuleBase::WARNING_QUIT("device", msg);
+            return "unknown";
+        }
+        else {
+            return "gpu";
+        }
     }
     else {
-        std::string msg = "INPUT device setting does not match the request!";
-        msg += "\n Input device = " + device;
-        msg += "\n Input basis_type = " + basis_type;
-        msg += "\n Input ks_solver = " + ks_solver;
-        msg += "\n Compile setting = " + env;
-        msg += "\n Environment device_num = " + std::to_string(device_num) + "\n";
+        std::string msg = "INPUT device can only be set to \"cpu\" or \"gpu\"!";
         ModuleBase::WARNING_QUIT("device", msg);
         return "unknown";
     }
diff --git a/source/module_psi/kernels/device.h b/source/module_psi/kernels/device.h
index ea364e508a..aacac89199 100644
--- a/source/module_psi/kernels/device.h
+++ b/source/module_psi/kernels/device.h
@@ -37,7 +37,7 @@ template void record_device_memory (const Device* dev, std::ofs
 
 std::string get_device_info(std::string device_flag);
 int get_device_kpar(const int& kpar);
-std::string get_device_flag(const std::string& device, const std::string& ks_solver, const std::string& basis_type);
+std::string get_device_flag(const std::string& device, const std::string& ks_solver, const std::string& basis_type, const bool& gamma_only);
 
 #if __MPI
 int get_node_rank();
diff --git
a/tests/PP_ORB/H_gga_8au_100Ry_1s.orb b/tests/PP_ORB/H_gga_8au_100Ry_1s.orb new file mode 100755 index 0000000000..524b5c600b --- /dev/null +++ b/tests/PP_ORB/H_gga_8au_100Ry_1s.orb @@ -0,0 +1,621 @@ +--------------------------------------------------------------------------- +Element H +Energy Cutoff(Ry) 100 +Radius Cutoff(a.u.) 8 +Lmax 0 +Number of Sorbital--> 1 +Number of Porbital--> 0 +--------------------------------------------------------------------------- +SUMMARY END + +Mesh 801 +dr 0.01 + Type L N + 0 0 0 +1.90744135096601e+00 1.90714415834810e+00 1.90625308197139e+00 1.90476962465789e+00 +1.90269628575461e+00 1.90003655312386e+00 1.89679489197906e+00 1.89297673061510e+00 +1.88858844309606e+00 1.88363732897602e+00 1.87813159014190e+00 1.87208030487923e+00 +1.86549339927410e+00 1.85838161607520e+00 1.85075648115102e+00 1.84263026768663e+00 +1.83401595827414e+00 1.82492720505869e+00 1.81537828810982e+00 1.80538407219431e+00 +1.79495996213244e+00 1.78412185692442e+00 1.77288610283736e+00 1.76126944564628e+00 +1.74928898222428e+00 1.73696211167796e+00 1.72430648622422e+00 1.71133996200332e+00 +1.69808055002128e+00 1.68454636741156e+00 1.67075558920237e+00 1.65672640077097e+00 +1.64247695116102e+00 1.62802530743244e+00 1.61338941020624e+00 1.59858703055897e+00 +1.58363572841279e+00 1.56855281255805e+00 1.55335530243583e+00 1.53805989179737e+00 +1.52268291434693e+00 1.50724031146361e+00 1.49174760208638e+00 1.47621985483495e+00 +1.46067166242759e+00 1.44511711844513e+00 1.42956979647843e+00 1.41404273168500e+00 +1.39854840476859e+00 1.38309872838402e+00 1.36770503595819e+00 1.35237807290724e+00 +1.33712799021870e+00 1.32196434035774e+00 1.30689607544598e+00 1.29193154765252e+00 +1.27707851172739e+00 1.26234412959971e+00 1.24773497695472e+00 1.23325705169705e+00 +1.21891578420074e+00 1.20471604924122e+00 1.19066217949892e+00 1.17675798052007e+00 +1.16300674701651e+00 1.14941128038354e+00 1.13597390731232e+00 1.12269649937228e+00 +1.10958049343792e+00 
1.09662691283445e+00 1.08383638907732e+00 1.07120918408205e+00 +1.05874521272254e+00 1.04644406561873e+00 1.03430503203750e+00 1.02232712279451e+00 +1.01050909304857e+00 9.98849464885200e-01 9.87346549590601e-01 9.75998469523295e-01 +9.64803179495990e-01 9.53758487586675e-01 9.42862075304184e-01 9.32111517040047e-01 +9.21504298745216e-01 9.11037835777091e-01 9.00709489869184e-01 8.90516585182730e-01 +8.80456423406465e-01 8.70526297877679e-01 8.60723506704410e-01 8.51045364875273e-01 +8.41489215349882e-01 8.32052439129051e-01 8.22732464309953e-01 8.13526774137160e-01 +8.04432914065864e-01 7.95448497858720e-01 7.86571212742464e-01 7.77798823654847e-01 +7.69129176616447e-01 7.60560201265489e-01 7.52089912597048e-01 7.43716411950765e-01 +7.35437887293628e-01 7.27252612846305e-01 7.19158948103107e-01 7.11155336296780e-01 +7.03240302360128e-01 6.95412450436802e-01 6.87670460993604e-01 6.80013087586304e-01 +6.72439153330262e-01 6.64947547126112e-01 6.57537219689449e-01 6.50207179431858e-01 +6.42956488238762e-01 6.35784257187444e-01 6.28689642246331e-01 6.21671839994114e-01 +6.14730083394654e-01 6.07863637660835e-01 6.01071796237651e-01 5.94353876931886e-01 +5.87709218212691e-01 5.81137175704386e-01 5.74637118889711e-01 5.68208428038824e-01 +5.61850491376267e-01 5.55562702495311e-01 5.49344458026195e-01 5.43195155562093e-01 +5.37114191844028e-01 5.31100961203474e-01 5.25154854259099e-01 5.19275256861895e-01 +5.13461549281012e-01 5.07713105620732e-01 5.02029293457474e-01 4.96409473684204e-01 +4.90853000548461e-01 4.85359221869088e-01 4.79927479415968e-01 4.74557109436364e-01 +4.69247443311000e-01 4.63997808322744e-01 4.58807528520617e-01 4.53675925661948e-01 +4.48602320215676e-01 4.43586032410208e-01 4.38626383309728e-01 4.33722695903521e-01 +4.28874296193613e-01 4.24080514266916e-01 4.19340685339017e-01 4.14654150757771e-01 +4.10020258956013e-01 4.05438366343786e-01 4.00907838131742e-01 3.96428049078550e-01 +3.91998384156400e-01 3.87618239129914e-01 3.83287021044989e-01 
3.79004148625304e-01 +3.74769052575401e-01 3.70581175790304e-01 3.66439973472807e-01 3.62344913160466e-01 +3.58295474665325e-01 3.54291149930260e-01 3.50331442806576e-01 3.46415868758214e-01 +3.42543954498532e-01 3.38715237566107e-01 3.34929265846474e-01 3.31185597047008e-01 +3.27483798132417e-01 3.23823444728448e-01 3.20204120501475e-01 3.16625416521590e-01 +3.13086930616735e-01 3.09588266725206e-01 3.06129034253590e-01 3.02708847446882e-01 +2.99327324777149e-01 2.95984088356609e-01 2.92678763380583e-01 2.89410977605146e-01 +2.86180360863828e-01 2.82986544627041e-01 2.79829161607343e-01 2.76707845413006e-01 +2.73622230251711e-01 2.70571950685589e-01 2.67556641438180e-01 2.64575937253278e-01 +2.61629472805067e-01 2.58716882658363e-01 2.55837801277271e-01 2.52991863080056e-01 +2.50178702537599e-01 2.47397954312383e-01 2.44649253434599e-01 2.41932235511661e-01 +2.39246536967145e-01 2.36591795304972e-01 2.33967649394495e-01 2.31373739772061e-01 +2.28809708954547e-01 2.26275201760404e-01 2.23769865633779e-01 2.21293350967376e-01 +2.18845311419903e-01 2.16425404224097e-01 2.14033290481572e-01 2.11668635441015e-01 +2.09331108756501e-01 2.07020384723086e-01 2.04736142487133e-01 2.02478066229223e-01 +2.00245845317865e-01 1.98039174432621e-01 1.95857753655660e-01 1.93701288531132e-01 +1.91569490092165e-01 1.89462074855675e-01 1.87378764785523e-01 1.85319287224947e-01 +1.83283374799505e-01 1.81270765292101e-01 1.79281201491935e-01 1.77314431019501e-01 +1.75370206129966e-01 1.73448283497483e-01 1.71548423983134e-01 1.69670392389350e-01 +1.67813957203735e-01 1.65978890335290e-01 1.64164966846051e-01 1.62371964681139e-01 +1.60599664400201e-01 1.58847848913098e-01 1.57116303222643e-01 1.55404814176993e-01 +1.53713170234189e-01 1.52041161241111e-01 1.50388578228921e-01 1.48755213226817e-01 +1.47140859095699e-01 1.45545309383072e-01 1.43968358200220e-01 1.42409800122453e-01 +1.40869430112909e-01 1.39347043470114e-01 1.37842435799251e-01 1.36355403006779e-01 +1.34885741317806e-01 
1.33433247315354e-01 1.31997718000411e-01 1.30578950871462e-01 +1.29176744021959e-01 1.27790896254038e-01 1.26421207206597e-01 1.25067477495756e-01 +1.23729508865558e-01 1.22407104346729e-01 1.21100068421232e-01 1.19808207190317e-01 +1.18531328543789e-01 1.17269242328193e-01 1.16021760511708e-01 1.14788697343580e-01 +1.13569869506034e-01 1.12365096256718e-01 1.11174199559877e-01 1.09997004204587e-01 +1.08833337908594e-01 1.07683031406451e-01 1.06545918520865e-01 1.05421836216376e-01 +1.04310624634710e-01 1.03212127111347e-01 1.02126190173120e-01 1.01052663516819e-01 +9.99913999690639e-02 9.89422554278694e-02 9.79050887865829e-02 9.68797618410371e-02 +9.58661391809758e-02 9.48640880669702e-02 9.38734782942084e-02 9.28941820446811e-02 +9.19260737294109e-02 9.09690298224744e-02 9.00229286886530e-02 8.90876504066055e-02 +8.81630765895008e-02 8.72490902050614e-02 8.63455753969695e-02 8.54524173095576e-02 +8.45695019176609e-02 8.36967158634388e-02 8.28339463018876e-02 8.19810807566576e-02 +8.11380069876688e-02 8.03046128718737e-02 7.94807862983706e-02 7.86664150788980e-02 +7.78613868745711e-02 7.70655891395314e-02 7.62789090819951e-02 7.55012336429879e-02 +7.47324494928582e-02 7.39724430454642e-02 7.32211004897359e-02 7.24783078381228e-02 +7.17439509912543e-02 7.10179158179657e-02 7.03000882496753e-02 6.95903543879471e-02 +6.88886006239332e-02 6.81947137682637e-02 6.75085811898452e-02 6.68300909619366e-02 +6.61591320137967e-02 6.54955942861453e-02 6.48393688886428e-02 6.41903482575767e-02 +6.35484263119503e-02 6.29134986061875e-02 6.22854624777147e-02 6.16642171877373e-02 +6.10496640536096e-02 6.04417065712910e-02 5.98402505264925e-02 5.92452040932439e-02 +5.86564779187496e-02 5.80739851935497e-02 5.74976417061689e-02 5.69273658815928e-02 +5.63630788030980e-02 5.58047042171302e-02 5.52521685211136e-02 5.47054007342494e-02 +5.41643324515488e-02 5.36288977815150e-02 5.30990332680676e-02 5.25746777974615e-02 +5.20557724911139e-02 5.15422605853936e-02 5.10340872995653e-02 
5.05311996932000e-02 +5.00335465144709e-02 4.95410780408436e-02 4.90537459137457e-02 4.85715029688600e-02 +4.80943030637231e-02 4.76221009043360e-02 4.71548518724977e-02 4.66925118555591e-02 +4.62350370802628e-02 4.57823839522863e-02 4.53345089030417e-02 4.48913682452013e-02 +4.44529180383256e-02 4.40191139658545e-02 4.35899112246054e-02 4.31652644277810e-02 +4.27451275223506e-02 4.23294537215084e-02 4.19181954527615e-02 4.15113043220239e-02 +4.11087310939351e-02 4.07104256884435e-02 4.03163371935289e-02 3.99264138937696e-02 +3.95406033142948e-02 3.91588522795033e-02 3.87811069857804e-02 3.84073130872983e-02 +3.80374157938534e-02 3.76713599795738e-02 3.73090903012173e-02 3.69505513246897e-02 +3.65956876583272e-02 3.62444440914258e-02 3.58967657364495e-02 3.55525981733160e-02 +3.52118875941445e-02 3.48745809468492e-02 3.45406260759832e-02 3.42099718592680e-02 +3.38825683382995e-02 3.35583668419832e-02 3.32373201013368e-02 3.29193823543901e-02 +3.26045094400245e-02 3.22926588797097e-02 3.19837899462309e-02 3.16778637186356e-02 +3.13748431227779e-02 3.10746929569936e-02 3.07773799025929e-02 3.04828725190235e-02 +3.01911412237139e-02 2.99021582567730e-02 2.96158976308761e-02 2.93323350668273e-02 +2.90514479154355e-02 2.87732150664845e-02 2.84976168457132e-02 2.82246349008448e-02 +2.79542520778192e-02 2.76864522884822e-02 2.74212203710753e-02 2.71585419449412e-02 +2.68984032609211e-02 2.66407910489634e-02 2.63856923644883e-02 2.61330944350695e-02 +2.58829845089835e-02 2.56353497071615e-02 2.53901768800391e-02 2.51474524707479e-02 +2.49071623860259e-02 2.46692918761437e-02 2.44338254250464e-02 2.42007466518065e-02 +2.39700382243658e-02 2.37416817864129e-02 2.35156578981101e-02 2.32919459912376e-02 +2.30705243391755e-02 2.28513700419862e-02 2.26344590267112e-02 2.24197660628292e-02 +2.22072647926769e-02 2.19969277764700e-02 2.17887265514184e-02 2.15826317042805e-02 +2.13786129565670e-02 2.11766392614701e-02 2.09766789114802e-02 2.07786996555356e-02 +2.05826688244595e-02 
2.03885534633510e-02 2.01963204695257e-02 2.00059367345468e-02 +1.98173692888461e-02 1.96305854474076e-02 1.94455529549764e-02 1.92622401292625e-02 +1.90806160006299e-02 1.89006504467963e-02 1.87223143211248e-02 1.85455795731509e-02 +1.83704193600722e-02 1.81968081480180e-02 1.80247218020249e-02 1.78541376637587e-02 +1.76850346161485e-02 1.75173931342345e-02 1.73511953216744e-02 1.71864249324951e-02 +1.70230673778351e-02 1.68611097175701e-02 1.67005406368740e-02 1.65413504079211e-02 +1.63835308370844e-02 1.62270751981387e-02 1.60719781521166e-02 1.59182356546037e-02 +1.57658448513877e-02 1.56148039634978e-02 1.54651121627741e-02 1.53167694392114e-02 +1.51697764613983e-02 1.50241344314517e-02 1.48798449358963e-02 1.47369097939852e-02 +1.45953309049839e-02 1.44551100959504e-02 1.43162489715391e-02 1.41787487673404e-02 +1.40426102082267e-02 1.39078333731324e-02 1.37744175676243e-02 1.36423612055454e-02 +1.35116617009219e-02 1.33823153712186e-02 1.32543173529167e-02 1.31276615302595e-02 +1.30023404778825e-02 1.28783454179013e-02 1.27556661918862e-02 1.26342912480018e-02 +1.25142076434351e-02 1.23954010620839e-02 1.22778558473197e-02 1.21615550494898e-02 +1.20464804876707e-02 1.19326128250454e-02 1.18199316571329e-02 1.17084156119753e-02 +1.15980424612592e-02 1.14887892412437e-02 1.13806323822626e-02 1.12735478454853e-02 +1.11675112655430e-02 1.10624980975713e-02 1.09584837671706e-02 1.08554438217587e-02 +1.07533540817736e-02 1.06521907901868e-02 1.05519307587999e-02 1.04525515098355e-02 +1.03540314113731e-02 1.02563498052457e-02 1.01594871260900e-02 1.00634250103266e-02 +9.96814639395133e-03 9.87363559813070e-03 9.77987840171545e-03 9.68686209991889e-03 +9.59457554854560e-03 9.50300919330178e-03 9.41215508386842e-03 9.32200687257104e-03 +9.23255979763860e-03 9.14381065119522e-03 9.05575773228555e-03 8.96840078538575e-03 +8.88174092499742e-03 8.79578054706426e-03 8.71052322808392e-03 8.62597361291426e-03 +8.54213729238506e-03 8.45902067193404e-03 8.37663083257503e-03 
8.29497538558643e-03 +8.21406232237167e-03 8.13389986099568e-03 8.05449629093464e-03 7.97585981759707e-03 +7.89799840817896e-03 7.82091964040363e-03 7.74463055566894e-03 7.66913751808464e-03 +7.59444608082327e-03 7.52056086113558e-03 7.44748542529944e-03 7.37522218466907e-03 +7.30377230388427e-03 7.23313562217610e-03 7.16331058857753e-03 7.09429421170730e-03 +7.02608202464902e-03 6.95866806529852e-03 6.89204487239618e-03 6.82620349730270e-03 +6.76113353142113e-03 6.69682314900736e-03 6.63325916495911e-03 6.57042710701898e-03 +6.50831130168490e-03 6.44689497297848e-03 6.38616035309441e-03 6.32608880382997e-03 +6.26666094758616e-03 6.20785680663021e-03 6.14965594922738e-03 6.09203764117518e-03 +6.03498100121735e-03 5.97846515877334e-03 5.92246941239161e-03 5.86697338732423e-03 +5.81195719062748e-03 5.75740156221294e-03 5.70328802031077e-03 5.64959899986066e-03 +5.59631798241292e-03 5.54342961620389e-03 5.49091982516573e-03 5.43877590573865e-03 +5.38698661047357e-03 5.33554221754226e-03 5.28443458541266e-03 5.23365719209262e-03 +5.18320515849891e-03 5.13307525566736e-03 5.08326589567937e-03 5.03377710634419e-03 +4.98461048983915e-03 4.93576916567104e-03 4.88725769847946e-03 4.83908201135748e-03 +4.79124928551065e-03 4.74376784721584e-03 4.69664704316940e-03 4.64989710543662e-03 +4.60352900731916e-03 4.55755431155405e-03 4.51198501233828e-03 4.46683337273993e-03 +4.42211175910693e-03 4.37783247412167e-03 4.33400759016676e-03 4.29064878467104e-03 +4.24776717909051e-03 4.20537318314833e-03 4.16347634591093e-03 4.12208521521519e-03 +4.08120720688331e-03 4.04084848507066e-03 4.00101385498381e-03 3.96170666908961e-03 +3.92292874780475e-03 3.88468031551433e-03 3.84695995262057e-03 3.80976456416331e-03 +3.77308936539371e-03 3.73692788451341e-03 3.70127198262330e-03 3.66611189075383e-03 +3.63143626367926e-03 3.59723225004981e-03 3.56348557821269e-03 3.53018065693255e-03 +3.49730069007350e-03 3.46482780415933e-03 3.43274318759838e-03 3.40102724023744e-03 +3.36965973180068e-03 
3.33861996767614e-03 3.30788696043076e-03 3.27743960537292e-03 +3.24725685843175e-03 3.21731791459253e-03 3.18760238511387e-03 3.15809047175479e-03 +3.12876313626176e-03 3.09960226340408e-03 3.07059081590089e-03 3.04171297965506e-03 +3.01295429779673e-03 2.98430179214278e-03 2.95574407079412e-03 2.92727142072394e-03 +2.89887588435188e-03 2.87055131924971e-03 2.84229344028809e-03 2.81409984370137e-03 +2.78597001272207e-03 2.75790530461769e-03 2.72990891914193e-03 2.70198584859664e-03 +2.67414280988143e-03 2.64638815908692e-03 2.61873178936241e-03 2.59118501295514e-03 +2.56376042848091e-03 2.53647177463419e-03 2.50933377168767e-03 2.48236195225667e-03 +2.45557248291956e-03 2.42898197838259e-03 2.40260730996182e-03 2.37646541022061e-03 +2.35057307565055e-03 2.32494676931406e-03 2.29960242538039e-03 2.27455525747993e-03 +2.24981957277749e-03 2.22540859362253e-03 2.20133428857240e-03 2.17760721450603e-03 +2.15423637145007e-03 2.13122907162684e-03 2.10859082410688e-03 2.08632523630688e-03 +2.06443393342099e-03 2.04291649670723e-03 2.02177042137597e-03 2.00099109464481e-03 +1.98057179433305e-03 1.96050370817563e-03 1.94077597383923e-03 1.92137573942313e-03 +1.90228824403188e-03 1.88349691781112e-03 1.86498350064698e-03 1.84672817854711e-03 +1.82870973654477e-03 1.81090572680154e-03 1.79329265043109e-03 1.77584615142361e-03 +1.75854122092579e-03 1.74135241001769e-03 1.72425404903628e-03 1.70722047141659e-03 +1.69022623996493e-03 1.67324637344047e-03 1.65625657130073e-03 1.63923343446962e-03 +1.62215468000730e-03 1.60499934760110e-03 1.58774799586043e-03 1.57038288647527e-03 +1.55288815440014e-03 1.53524996233898e-03 1.51745663794234e-03 1.49949879227469e-03 +1.48136941827531e-03 1.46306396811094e-03 1.44458040850593e-03 1.42591925333371e-03 +1.40708357295730e-03 1.38807898001789e-03 1.36891359158655e-03 1.34959796780879e-03 +1.33014502739283e-03 1.31056994050335e-03 1.29088999983854e-03 1.27112447086995e-03 +1.25129442242595e-03 1.23142253898585e-03 1.21153291622901e-03 
1.19165084154900e-03 +1.17180256138911e-03 1.15201503739167e-03 1.13231569346832e-03 1.11273215599497e-03 +1.09329198941464e-03 1.07402242958782e-03 1.05495011726704e-03 1.03610083408499e-03 +1.01749924344142e-03 9.99168638639969e-04 9.81130700574411e-04 9.63405267185630e-04 +9.46010116807453e-04 9.28960767389201e-04 9.12270293424172e-04 8.95949162219249e-04 +8.80005090907775e-04 8.64442925328641e-04 8.49264541552996e-04 8.34468770424584e-04 +8.20051344973520e-04 8.06004869934624e-04 7.92318811833215e-04 7.78979507159545e-04 +7.65970185011967e-04 7.53270999237310e-04 7.40859063522746e-04 7.28708481141292e-04 +7.16790359182660e-04 7.05072795263805e-04 6.93520823127625e-04 6.82096302537586e-04 +6.70757738909506e-04 6.59460019762246e-04 6.48154059000508e-04 6.36786347006711e-04 +6.25298415246795e-04 6.13626239188753e-04 6.01699623079563e-04 5.89441634339956e-04 +5.76768183086631e-04 5.63587871740572e-04 5.49802267883250e-04 5.35306776411516e-04 +5.19992299495392e-04 5.03747869040363e-04 4.86464410347862e-04 4.68039742194362e-04 +4.48384834111137e-04 4.27431225662146e-04 4.05139368428294e-04 3.81507487571789e-04 +3.56580389919150e-04 3.30457488066075e-04 3.03299187431014e-04 2.75330719426666e-04 +2.46842521560822e-04 2.18186381938982e-04 1.89766790293003e-04 1.62027267368842e-04 +1.35431862269941e-04 1.10442481612373e-04 8.74932006070271e-05 6.69631508432783e-05 +4.91499257824303e-05 3.42456399879478e-05 2.23177809105838e-05 1.32967803981724e-05 +6.97180875556661e-06 2.99568503828600e-06 8.99056043772297e-07 1.13195668714447e-07 +0.00000000000000e+00 + Type L N + 0 0 1 +-2.95749301594759e+00 -2.95674715019790e+00 -2.95451087001972e+00 -2.95078812249282e+00 +-2.94558547232415e+00 -2.93891208127024e+00 -2.93077967945052e+00 -2.92120252867311e+00 +-2.91019737792800e+00 -2.89778341123545e+00 -2.88398218806936e+00 -2.86881757660607e+00 +-2.85231568007867e+00 -2.83450475654480e+00 -2.81541513240287e+00 -2.79507911001602e+00 +-2.77353086982671e+00 -2.75080636736561e+00 
-2.72694322557808e+00 -2.70198062290806e+00 +-2.67595917759471e+00 -2.64892082864941e+00 -2.62090871399127e+00 -2.59196704622747e+00 +-2.56214098657024e+00 -2.53147651738606e+00 -2.50002031387315e+00 -2.46781961536243e+00 +-2.43492209673325e+00 -2.40137574042919e+00 -2.36722870955111e+00 -2.33252922249439e+00 +-2.29732542958443e+00 -2.26166529215041e+00 -2.22559646446072e+00 -2.18916617892497e+00 +-2.15242113494814e+00 -2.11540739180010e+00 -2.07817026584156e+00 -2.04075423242238e+00 +-2.00320283274330e+00 -1.96555858594527e+00 -1.92786290666317e+00 -1.89015602825261e+00 +-1.85247693186972e+00 -1.81486328155466e+00 -1.77735136544026e+00 -1.73997604317751e+00 +-1.70277069964023e+00 -1.66576720494206e+00 -1.62899588076989e+00 -1.59248547300936e+00 +-1.55626313061055e+00 -1.52035439061476e+00 -1.48478316923725e+00 -1.44957175887581e+00 +-1.41474083089114e+00 -1.38030944398229e+00 -1.34629505795913e+00 -1.31271355269407e+00 +-1.27957925201676e+00 -1.24690495229892e+00 -1.21470195546107e+00 -1.18298010612002e+00 +-1.15174783258370e+00 -1.12101219139057e+00 -1.09077891508237e+00 -1.06105246289282e+00 +-1.03183607403022e+00 -1.00313182322931e+00 -9.74940678246450e-01 -9.47262558973192e-01 +-9.20096397845391e-01 -8.93440201229186e-01 -8.67291111470610e-01 -8.41645469302695e-01 +-8.16498876312343e-01 -7.91846257179057e-01 -7.67681921408647e-01 -7.43999624297248e-01 +-7.20792626874196e-01 -6.98053754586555e-01 -6.75775454503142e-01 -6.53949850831660e-01 +-6.32568798559035e-01 -6.11623935041948e-01 -5.91106729391966e-01 -5.71008529517284e-01 +-5.51320606700974e-01 -5.32034197613519e-01 -5.13140543675348e-01 -4.94630927702780e-01 +-4.76496707788382e-01 -4.58729348383868e-01 -4.41320448570481e-01 -4.24261767518043e-01 +-4.07545247149507e-01 -3.91163032042877e-01 -3.75107486616592e-01 -3.59371209657937e-01 +-3.43947046266677e-01 -3.28828097297737e-01 -3.14007726397578e-01 -2.99479564738627e-01 +-2.85237513564889e-01 -2.71275744669571e-01 -2.57588698932254e-01 
-2.44171083048711e-01 +-2.31017864591087e-01 -2.18124265539630e-01 -2.05485754429685e-01 -1.93098037259113e-01 +-1.80957047301803e-01 -1.69058933972458e-01 -1.57400050886502e-01 -1.45976943256630e-01 +-1.34786334764517e-01 -1.23825114042282e-01 -1.13090320893741e-01 -1.02579132380254e-01 +-9.22888488900691e-02 -8.22168803037166e-02 -7.23607323610866e-02 -6.27179933285523e-02 +-5.32863210568552e-02 -4.40634305125243e-02 -3.50470818574746e-02 -2.62350691431274e-02 +-1.76252096769981e-02 -9.21533411130605e-03 -1.00327729474762e-03 7.01313007966986e-03 +1.48360690060055e-02 2.24677388591814e-02 2.99103632420717e-02 3.71661949841796e-02 +4.42375202972276e-02 5.11266620996901e-02 5.78359825285329e-02 6.43678846622972e-02 +7.07248134849844e-02 7.69092561251113e-02 8.29237414086500e-02 8.87708387684490e-02 +9.44531565560483e-02 9.99733398045991e-02 1.05334067493905e-01 1.10538049370311e-01 +1.15588022375437e-01 1.20486746738460e-01 1.25237001786905e-01 1.29841581530657e-01 +1.34303290073247e-01 1.38624936903336e-01 1.42809332117830e-01 1.46859281626183e-01 +1.50777582383199e-01 1.54567017695170e-01 1.58230352641336e-01 1.61770329649636e-01 +1.65189664262476e-01 1.68491041124812e-01 1.71677110223298e-01 1.74750483401609e-01 +1.77713731173318e-01 1.80569379849988e-01 1.83319908998359e-01 1.85967749236854e-01 +1.88515280377944e-01 1.90964829919395e-01 1.93318671883974e-01 1.95579026003945e-01 +1.97748057243524e-01 1.99827875649604e-01 2.01820536518312e-01 2.03728040862477e-01 +2.05552336162857e-01 2.07295317383958e-01 2.08958828233539e-01 2.10544662643427e-01 +2.12054566448034e-01 2.13490239236041e-01 2.14853336350026e-01 2.16145471008400e-01 +2.17368216523844e-01 2.18523108592555e-01 2.19611647628907e-01 2.20635301120704e-01 +2.21595505980992e-01 2.22493670873319e-01 2.23331178488543e-01 2.24109387752575e-01 +2.24829635945923e-01 2.25493240717520e-01 2.26101501977004e-01 2.26655703651460e-01 +2.27157115294481e-01 2.27606993537349e-01 2.28006583374097e-01 2.28357119274190e-01 
+2.28659826118530e-01 2.28915919956434e-01 2.29126608583143e-01 2.29293091939269e-01 +2.29416562335347e-01 2.29498204506367e-01 2.29539195502722e-01 2.29540704425492e-01 +2.29503892015350e-01 2.29429910105555e-01 2.29319900950615e-01 2.29174996443104e-01 +2.28996317231939e-01 2.28784971756001e-01 2.28542055207514e-01 2.28268648439881e-01 +2.27965816834863e-01 2.27634609143993e-01 2.27276056319019e-01 2.26891170345855e-01 +2.26480943096172e-01 2.26046345210188e-01 2.25588325023606e-01 2.25107807550857e-01 +2.24605693535998e-01 2.24082858581639e-01 2.23540152365260e-01 2.22978397951232e-01 +2.22398391205667e-01 2.21800900320102e-01 2.21186665448788e-01 2.20556398463146e-01 +2.19910782825724e-01 2.19250473584802e-01 2.18576097489550e-01 2.17888253224524e-01 +2.17187511761145e-01 2.16474416822709e-01 2.15749485458513e-01 2.15013208721671e-01 +2.14266052444382e-01 2.13508458103570e-01 2.12740843769137e-01 2.11963605126459e-01 +2.11177116564211e-01 2.10381732318211e-01 2.09577787661635e-01 2.08765600131730e-01 +2.07945470783041e-01 2.07117685457127e-01 2.06282516058823e-01 2.05440221829250e-01 +2.04591050606057e-01 2.03735240061681e-01 2.02873018910861e-01 2.02004608079094e-01 +2.01130221824323e-01 2.00250068804720e-01 1.99364353086133e-01 1.98473275083445e-01 +1.97577032430889e-01 1.96675820777105e-01 1.95769834501543e-01 1.94859267349642e-01 +1.93944312985012e-01 1.93025165457680e-01 1.92102019588265e-01 1.91175071268717e-01 +1.90244517681041e-01 1.89310557436130e-01 1.88373390635549e-01 1.87433218859714e-01 +1.86490245086565e-01 1.85544673545305e-01 1.84596709510321e-01 1.83646559040773e-01 +1.82694428671734e-01 1.81740525063016e-01 1.80785054612073e-01 1.79828223037501e-01 +1.78870234939748e-01 1.77911293345670e-01 1.76951599243511e-01 1.75991351114766e-01 +1.75030744469250e-01 1.74069971389386e-01 1.73109220089529e-01 1.72148674495727e-01 +1.71188513850965e-01 1.70228912350526e-01 1.69270038811591e-01 1.68312056380764e-01 +1.67355122282621e-01 1.66399387611905e-01 
1.65444997171374e-01 1.64492089356795e-01 +1.63540796089972e-01 1.62591242800166e-01 1.61643548453668e-01 1.60697825630800e-01 +1.59754180649051e-01 1.58812713730582e-01 1.57873519211863e-01 1.56936685792764e-01 +1.56002296822034e-01 1.55070430615726e-01 1.54141160804835e-01 1.53214556708125e-01 +1.52290683725899e-01 1.51369603750311e-01 1.50451375587667e-01 1.49536055388107e-01 +1.48623697078024e-01 1.47714352790601e-01 1.46808073289924e-01 1.45904908384236e-01 +1.45004907324080e-01 1.44108119181253e-01 1.43214593204778e-01 1.42324379150333e-01 +1.41437527579950e-01 1.40554090129082e-01 1.39674119738541e-01 1.38797670849197e-01 +1.37924799557717e-01 1.37055563732073e-01 1.36190023085940e-01 1.35328239211581e-01 +1.34470275571206e-01 1.33616197447251e-01 1.32766071852430e-01 1.31919967400809e-01 +1.31077954141565e-01 1.30240103357434e-01 1.29406487330206e-01 1.28577179075960e-01 +1.27752252052977e-01 1.26931779845579e-01 1.26115835827295e-01 1.25304492807015e-01 +1.24497822661861e-01 1.23695895960676e-01 1.22898781582052e-01 1.22106546330878e-01 +1.21319254557356e-01 1.20536967782390e-01 1.19759744333175e-01 1.18987638992661e-01 +1.18220702666439e-01 1.17458982070373e-01 1.16702519442103e-01 1.15951352279268e-01 +1.15205513107042e-01 1.14465029277251e-01 1.13729922801047e-01 1.13000210216757e-01 +1.12275902494167e-01 1.11557004976195e-01 1.10843517358466e-01 1.10135433706999e-01 +1.09432742513833e-01 1.08735426790039e-01 1.08043464195241e-01 1.07356827202427e-01 +1.06675483296482e-01 1.05999395204609e-01 1.05328521156478e-01 1.04662815171709e-01 +1.04002227372042e-01 1.03346704315355e-01 1.02696189348476e-01 1.02050622975641e-01 +1.01409943239263e-01 1.00774086109656e-01 1.00142985880266e-01 9.95165755649462e-02 +9.88947872938504e-02 9.82775527045302e-02 9.76648033249298e-02 9.70564709450648e-02 +9.64524879743119e-02 9.58527877814058e-02 9.52573050144273e-02 9.46659758982853e-02 +9.40787385074282e-02 9.34955330117819e-02 9.29163018941769e-02 9.23409901378186e-02 
+9.17695453826491e-02 9.12019180497527e-02 9.06380614332678e-02 9.00779317595794e-02 +8.95214882138738e-02 8.89686929344474e-02 8.84195109754569e-02 8.78739102390882e-02 +8.73318613784026e-02 8.67933376723729e-02 8.62583148748767e-02 8.57267710396330e-02 +8.51986863232757e-02 8.46740427689396e-02 8.41528240728889e-02 8.36350153368549e-02 +8.31206028088478e-02 8.26095736152897e-02 8.21019154873652e-02 8.15976164845037e-02 +8.10966647179055e-02 8.05990480769869e-02 8.01047539615597e-02 7.96137690224728e-02 +7.91260789133315e-02 7.86416680557720e-02 7.81605194206152e-02 7.76826143270397e-02 +7.72079322617189e-02 7.67364507196531e-02 7.62681450681995e-02 7.58029884355604e-02 +7.53409516247445e-02 7.48820030537505e-02 7.44261087224713e-02 7.39732322065421e-02 +7.35233346781009e-02 7.30763749531606e-02 7.26323095650444e-02 7.21910928630798e-02 +7.17526771355175e-02 7.13170127554038e-02 7.08840483479364e-02 7.04537309776254e-02 +7.00260063534104e-02 6.96008190497225e-02 6.91781127413405e-02 6.87578304497768e-02 +6.83399147988287e-02 6.79243082768638e-02 6.75109535033584e-02 6.70997934971830e-02 +6.66907719441315e-02 6.62838334612118e-02 6.58789238552618e-02 6.54759903735283e-02 +6.50749819439323e-02 6.46758494028581e-02 6.42785457084363e-02 6.38830261374365e-02 +6.34892484640532e-02 6.30971731190492e-02 6.27067633279112e-02 6.23179852268808e-02 +6.19308079559308e-02 6.15452037279853e-02 6.11611478738974e-02 6.07786188629336e-02 +6.03975982987364e-02 6.00180708909675e-02 5.96400244030500e-02 5.92634495766506e-02 +5.88883400337499e-02 5.85146921573450e-02 5.81425049520201e-02 5.77717798857926e-02 +5.74025207148041e-02 5.70347332925684e-02 5.66684253656152e-02 5.63036063574778e-02 +5.59402871430620e-02 5.55784798155038e-02 5.52181974476708e-02 5.48594538504968e-02 +5.45022633303407e-02 5.41466404475544e-02 5.37925997784116e-02 5.34401556824946e-02 +5.30893220775707e-02 5.27401122238960e-02 5.23925385197837e-02 5.20466123101462e-02 +5.17023437095904e-02 5.13597414414889e-02 
5.10188126942937e-02 5.06795629961831e-02 +5.03419961089540e-02 5.00061139418832e-02 4.96719164860924e-02 4.93394017697504e-02 +4.90085658342553e-02 4.86794027313404e-02 4.83519045408550e-02 4.80260614087812e-02 +4.77018616048644e-02 4.73792915990600e-02 4.70583361558324e-02 4.67389784451830e-02 +4.64212001691439e-02 4.61049817023404e-02 4.57903022451082e-02 4.54771399875511e-02 +4.51654722828391e-02 4.48552758279765e-02 4.45465268502194e-02 4.42392012972873e-02 +4.39332750294960e-02 4.36287240119419e-02 4.33255245048848e-02 4.30236532505105e-02 +4.27230876543111e-02 4.24238059593848e-02 4.21257874120421e-02 4.18290124172033e-02 +4.15334626821828e-02 4.12391213475780e-02 4.09459731041162e-02 4.06540042944534e-02 +4.03632029990746e-02 4.00735591055977e-02 3.97850643609506e-02 3.94977124060553e-02 +3.92114987928221e-02 3.89264209834232e-02 3.86424783319874e-02 3.83596720490149e-02 +3.80780051489796e-02 3.77974823817309e-02 3.75181101484640e-02 3.72398964031604e-02 +3.69628505405275e-02 3.66869832715911e-02 3.64123064881949e-02 3.61388331177551e-02 +3.58665769697013e-02 3.55955525750984e-02 3.53257750209944e-02 3.50572597810769e-02 +3.47900225442429e-02 3.45240790426897e-02 3.42594448811263e-02 3.39961353686819e-02 +3.37341653550478e-02 3.34735490723347e-02 3.32142999840643e-02 3.29564306426319e-02 +3.26999525564859e-02 3.24448760681712e-02 3.21912102442654e-02 3.19389627781238e-02 +3.16881399062131e-02 3.14387463386861e-02 3.11907852047041e-02 3.09442580128756e-02 +3.06991646270314e-02 3.04555032574113e-02 3.02132704671912e-02 2.99724611941351e-02 +2.97330687870176e-02 2.94950850563257e-02 2.92585003386146e-02 2.90233035737755e-02 +2.87894823943521e-02 2.85570232259386e-02 2.83259113975943e-02 2.80961312611265e-02 +2.78676663180150e-02 2.76404993526953e-02 2.74146125708612e-02 2.71899877414175e-02 +2.69666063406856e-02 2.67444496974582e-02 2.65234991374994e-02 2.63037361261074e-02 +2.60851424073815e-02 2.58677001388823e-02 2.56513920204255e-02 2.54362014158149e-02 
+2.52221124664002e-02 2.50091101954275e-02 2.47971806022502e-02 2.45863107455691e-02 +2.43764888149831e-02 2.41677041902486e-02 2.39599474877679e-02 2.37532105939524e-02 +2.35474866852356e-02 2.33427702346390e-02 2.31390570049263e-02 2.29363440285064e-02 +2.27346295743766e-02 2.25339131025140e-02 2.23341952062499e-02 2.21354775432645e-02 +2.19377627559540e-02 2.17410543820144e-02 2.15453567561783e-02 2.13506749041219e-02 +2.11570144296271e-02 2.09643813961459e-02 2.07727822039622e-02 2.05822234641807e-02 +2.03927118708021e-02 2.02042540721530e-02 2.00168565429422e-02 1.98305254582050e-02 +1.96452665703712e-02 1.94610850906635e-02 1.92779855759827e-02 1.90959718223830e-02 +1.89150467661742e-02 1.87352123936121e-02 1.85564696600526e-02 1.83788184193559e-02 +1.82022573642251e-02 1.80267839780597e-02 1.78523944987953e-02 1.76790838950849e-02 +1.75068458550600e-02 1.73356727877939e-02 1.71655558374644e-02 1.69964849101014e-02 +1.68284487126783e-02 1.66614348041997e-02 1.64954296583198e-02 1.63304187369228e-02 +1.61663865739929e-02 1.60033168690096e-02 1.58411925890137e-02 1.56799960784114e-02 +1.55197091755148e-02 1.53603133347531e-02 1.52017897534412e-02 1.50441195019497e-02 +1.48872836560904e-02 1.47312634305145e-02 1.45760403119099e-02 1.44215961907914e-02 +1.42679134906875e-02 1.41149752935572e-02 1.39627654603024e-02 1.38112687452907e-02 +1.36604709038583e-02 1.35103587918271e-02 1.33609204561469e-02 1.32121452158504e-02 +1.30640237326053e-02 1.29165480702351e-02 1.27697117426872e-02 1.26235097500298e-02 +1.24779386021666e-02 1.23329963300737e-02 1.21886824844728e-02 1.20449981219687e-02 +1.19019457787948e-02 1.17595294324166e-02 1.16177544513589e-02 1.14766275337212e-02 +1.13361566349513e-02 1.11963508855409e-02 1.10572204993944e-02 1.09187766737104e-02 +1.07810314812827e-02 1.06439977561987e-02 1.05076889739695e-02 1.03721191271728e-02 +1.02373025977287e-02 1.01032540269565e-02 9.96998818457696e-03 9.83751983783450e-03 +9.70586362190462e-03 9.57503391274574e-03 
9.44504470352249e-03 9.31590948569876e-03 +9.18764113585194e-03 9.06025180920617e-03 8.93375284082184e-03 8.80815465530633e-03 +8.68346668583326e-03 8.55969730317261e-03 8.43685375534222e-03 8.31494211839601e-03 +8.19396725876075e-03 8.07393280743139e-03 7.95484114622624e-03 7.83669340619418e-03 +7.71948947815842e-03 7.60322803527082e-03 7.48790656734184e-03 7.37352142660890e-03 +7.26006788449994e-03 7.14754019885161e-03 7.03593169094949e-03 6.92523483166723e-03 +6.81544133590164e-03 6.70654226442698e-03 6.59852813222347e-03 6.49138902227947e-03 +6.38511470381432e-03 6.27969475383344e-03 6.17511868089216e-03 6.07137604992887e-03 +5.96845660701466e-03 5.86635040286717e-03 5.76504791398802e-03 5.66454016030023e-03 +5.56481881819379e-03 5.46587632792647e-03 5.36770599437344e-03 5.27030208018011e-03 +5.17365989043326e-03 5.07777584804267e-03 4.98264755910166e-03 4.88827386758532e-03 +4.79465489883213e-03 4.70179209135477e-03 4.60968821662530e-03 4.51834738658188e-03 +4.42777504871108e-03 4.33797796866640e-03 4.24896420048856e-03 4.16074304460320e-03 +4.07332499387108e-03 3.98672166807363e-03 3.90094573731016e-03 3.81601083488329e-03 +3.73193146033331e-03 3.64872287337037e-03 3.56640097952928e-03 3.48498220844135e-03 +3.40448338568225e-03 3.32492159920766e-03 3.24631406143470e-03 3.16867796806446e-03 +3.09203035476681e-03 3.01638795286601e-03 2.94176704517239e-03 2.86818332310232e-03 +2.79565174621044e-03 2.72418640523262e-03 2.65380038969485e-03 2.58450566108593e-03 +2.51631293251339e-03 2.44923155566273e-03 2.38326941574637e-03 2.31843283495790e-03 +2.25472648472286e-03 2.19215330674173e-03 2.13071444243772e-03 2.07040916992343e-03 +2.01123484695242e-03 1.95318685750395e-03 1.89625855862100e-03 1.84044122286552e-03 +1.78572397026385e-03 1.73209368190343e-03 1.67953488547675e-03 1.62802960116294e-03 +1.57755713449499e-03 1.52809380157585e-03 1.47961257158673e-03 1.43208261250382e-03 +1.38546872894161e-03 1.33973068678322e-03 1.29482242847035e-03 1.25069119616257e-03 
+1.20727659787493e-03 1.16450967423410e-03 1.12231205015679e-03 1.08059528529883e-03 +1.03926056736656e-03 9.98198920133108e-04 9.57292119036522e-04 9.16414516490553e-04 +8.75435970946489e-04 8.34226042813870e-04 7.92659561924850e-04 7.50623582438776e-04 +7.08025621837986e-04 6.64802934564381e-04 6.20932405927554e-04 5.76440480916654e-04 +5.31412382595296e-04 4.85999746389171e-04 4.40425721891095e-04 3.94986594054725e-04 +3.50049068162956e-04 3.06042558067924e-04 2.63446115293318e-04 2.22770025989944e-04 +1.84532558835772e-04 1.49232832955747e-04 1.17321244335680e-04 8.91692902454584e-05 +6.50409124734892e-05 4.50675972376493e-05 2.92293913375038e-05 1.73437081096514e-05 +9.06331296769202e-06 3.88422986243556e-06 1.16355102168367e-06 1.46332911175313e-07 +-0.00000000000000e+00 + Type L N + 0 1 0 +0.00000000000000e+00 -2.39657616218608e-02 -4.79049458534175e-02 -7.17910542392746e-02 +-9.55977458072436e-02 -1.19298914845828e-01 -1.42868767531499e-01 -1.66281897033512e-01 +-1.89513356733507e-01 -2.12538731208813e-01 -2.35334204642262e-01 -2.57876626337243e-01 +-2.80143573034587e-01 -3.02113407747584e-01 -3.23765334852798e-01 -3.45079451197269e-01 +-3.66036793006959e-01 -3.86619378406801e-01 -4.06810245389234e-01 -4.26593485095408e-01 +-4.45954270301337e-01 -4.64878879029665e-01 -4.83354713236487e-01 -5.01370312551469e-01 +-5.18915363078165e-01 -5.35980701289795e-01 -5.52558313083610e-01 -5.68641328084083e-01 +-5.84224009311499e-01 -5.99301738357690e-01 -6.13870996234735e-01 -6.27929340085108e-01 +-6.41475375962892e-01 -6.54508727915275e-01 -6.67030003611279e-01 -6.79040756780668e-01 +-6.90543446739952e-01 -7.01541395294434e-01 -7.12038741315138e-01 -7.22040393297315e-01 +-7.31551980212866e-01 -7.40579800972572e-01 -7.49130772815388e-01 -7.57212378941331e-01 +-7.64832615701645e-01 -7.71999939655072e-01 -7.78723214792212e-01 -7.85011660221207e-01 +-7.90874798597455e-01 -7.96322405567834e-01 -8.01364460486057e-01 -8.06011098640540e-01 +-8.10272565219535e-01 
-8.14159171220498e-01 -8.17681251491857e-01 -8.20849125075621e-01 +-8.23673057998903e-01 -8.26163228641410e-01 -8.28329695784620e-01 -8.30182369426787e-01 +-8.31730984426222e-01 -8.32985077013790e-01 -8.33953964194219e-01 -8.34646726034961e-01 +-8.35072190820982e-01 -8.35238923034236e-01 -8.35155214097766e-01 -8.34829075806524e-01 +-8.34268236350227e-01 -8.33480138817936e-01 -8.32471942059754e-01 -8.31250523767982e-01 +-8.29822485628598e-01 -8.28194160383725e-01 -8.26371620637276e-01 -8.24360689228916e-01 +-8.22166950996011e-01 -8.19795765739427e-01 -8.17252282206699e-01 -8.14541452905384e-01 +-8.11668049560160e-01 -8.08636679029528e-01 -8.05451799501603e-01 -8.02117736793582e-01 +-7.98638700585733e-01 -7.95018800428312e-01 -7.91262061368436e-01 -7.87372439053570e-01 +-7.83353834178835e-01 -7.79210106156681e-01 -7.74945085899492e-01 -7.70562587618249e-01 +-7.66066419553410e-01 -7.61460393567499e-01 -7.56748333542425e-01 -7.51934082538166e-01 +-7.47021508683031e-01 -7.42014509779099e-01 -7.36917016619607e-01 -7.31732995027816e-01 +-7.26466446639142e-01 -7.21121408460102e-01 -7.15701951248637e-01 -7.10212176770699e-01 +-7.04656213997482e-01 -6.99038214316286e-01 -6.93362345835693e-01 -6.87632786872433e-01 +-6.81853718712995e-01 -6.76029317747716e-01 -6.70163747078640e-01 -6.64261147704988e-01 +-6.58325629391555e-01 -6.52361261325753e-01 -6.46372062668448e-01 -6.40361993102118e-01 +-6.34334943477332e-01 -6.28294726655078e-01 -6.22245068638171e-01 -6.16189600079857e-01 +-6.10131848251897e-01 -6.04075229547943e-01 -5.98023042590949e-01 -5.91978462005771e-01 +-5.85944532910188e-01 -5.79924166169198e-01 -5.73920134448960e-01 -5.67935069097975e-01 +-5.61971457874391e-01 -5.56031643529475e-01 -5.50117823248704e-01 -5.44232048943374e-01 +-5.38376228377442e-01 -5.32552127106400e-01 -5.26761371197472e-01 -5.21005450693402e-01 +-5.15285723775591e-01 -5.09603421576378e-01 -5.03959653584981e-01 -4.98355413586893e-01 +-4.92791586072639e-01 -4.87268953048454e-01 
-4.81788201179008e-01 -4.76349929190446e-01 +-4.70954655461046e-01 -4.65602825726445e-01 -4.60294820826875e-01 -4.55030964424937e-01 +-4.49811530624348e-01 -4.44636751422505e-01 -4.39506823932879e-01 -4.34421917316904e-01 +-4.29382179369234e-01 -4.24387742704954e-01 -4.19438730502401e-01 -4.14535261760775e-01 +-4.09677456037455e-01 -4.04865437635968e-01 -4.00099339221777e-01 -3.95379304849315e-01 +-3.90705492390088e-01 -3.86078075357982e-01 -3.81497244134193e-01 -3.76963206600274e-01 +-3.72476188193770e-01 -3.68036431406513e-01 -3.63644194751073e-01 -3.59299751225795e-01 +-3.55003386313517e-01 -3.50755395553174e-01 -3.46556081727226e-01 -3.42405751711014e-01 +-3.38304713032800e-01 -3.34253270195386e-01 -3.30251720811752e-01 -3.26300351608143e-01 +-3.22399434348468e-01 -3.18549221733740e-01 -3.14749943329608e-01 -3.11001801573790e-01 +-3.07304967913505e-01 -3.03659579120764e-01 -3.00065733830678e-01 -2.96523489344869e-01 +-2.93032858738511e-01 -2.89593808305733e-01 -2.86206255373915e-01 -2.82870066513038e-01 +-2.79585056161589e-01 -2.76350985685780e-01 -2.73167562883922e-01 -2.70034441942867e-01 +-2.66951223848478e-01 -2.63917457247164e-01 -2.60932639750733e-01 -2.57996219672108e-01 +-2.55107598174987e-01 -2.52266131816283e-01 -2.49471135456147e-01 -2.46721885506761e-01 +-2.44017623487710e-01 -2.41357559852781e-01 -2.38740878050491e-01 -2.36166738778472e-01 +-2.33634284390133e-01 -2.31142643410761e-01 -2.28690935119390e-01 -2.26278274152440e-01 +-2.23903775085197e-01 -2.21566556947795e-01 -2.19265747633308e-01 -2.17000488157025e-01 +-2.14769936727769e-01 -2.12573272594326e-01 -2.10409699632647e-01 -2.08278449642321e-01 +-2.06178785324038e-01 -2.04110002913186e-01 -2.02071434448383e-01 -2.00062449657580e-01 +-1.98082457448375e-01 -1.96130906993224e-01 -1.94207288404397e-01 -1.92311132997663e-01 +-1.90442013147802e-01 -1.88599541743115e-01 -1.86783371250001e-01 -1.84993192402517e-01 +-1.83228732535376e-01 -1.81489753582282e-01 -1.79776049764579e-01 
-1.78087444998078e-01 +-1.76423790048449e-01 -1.74784959467757e-01 -1.73170848346630e-01 -1.71581368917969e-01 +-1.70016447049300e-01 -1.68476018661537e-01 -1.66960026112306e-01 -1.65468414581924e-01 +-1.64001128499683e-01 -1.62558108047303e-01 -1.61139285775233e-01 -1.59744583365965e-01 +-1.58373908576660e-01 -1.57027152391249e-01 -1.55704186409697e-01 -1.54404860499444e-01 +-1.53129000731077e-01 -1.51876407617184e-01 -1.50646854670047e-01 -1.49440087290398e-01 +-1.48255821995981e-01 -1.47093745995075e-01 -1.45953517106558e-01 -1.44834764024529e-01 +-1.43737086921981e-01 -1.42660058384605e-01 -1.41603224662485e-01 -1.40566107224312e-01 +-1.39548204595760e-01 -1.38548994460936e-01 -1.37567936003279e-01 -1.36604472460058e-01 +-1.35658033862620e-01 -1.34728039932907e-01 -1.33813903105365e-01 -1.32915031642380e-01 +-1.32030832810651e-01 -1.31160716085572e-01 -1.30304096350653e-01 -1.29460397059347e-01 +-1.28629053327284e-01 -1.27809514923871e-01 -1.27001249133494e-01 -1.26203743458130e-01 +-1.25416508134996e-01 -1.24639078444964e-01 -1.23871016789784e-01 -1.23111914518685e-01 +-1.22361393487618e-01 -1.21619107337261e-01 -1.20884742478848e-01 -1.20158018779985e-01 +-1.19438689945653e-01 -1.18726543592800e-01 -1.18021401019992e-01 -1.17323116676700e-01 +-1.16631577339810e-01 -1.15946701007831e-01 -1.15268435526079e-01 -1.14596756958722e-01 +-1.13931667726006e-01 -1.13273194527233e-01 -1.12621386072033e-01 -1.11976310644274e-01 +-1.11338053524416e-01 -1.10706714297362e-01 -1.10082404073774e-01 -1.09465242653499e-01 +-1.08855355660067e-01 -1.08252871675301e-01 -1.07657919402800e-01 -1.07070624888574e-01 +-1.06491108826223e-01 -1.05919483973017e-01 -1.05355852701838e-01 -1.04800304712377e-01 +-1.04252914923140e-01 -1.03713741563785e-01 -1.03182824485106e-01 -1.02660183701613e-01 +-1.02145818179119e-01 -1.01639704877179e-01 -1.01141798053477e-01 -1.00652028834552e-01 +-1.00170305054454e-01 -9.96965113601664e-02 -9.92305095799041e-02 -9.87721393476963e-02 
+-9.83212189751103e-02 -9.78775465584754e-02 -9.74409013076410e-02 -9.70110450801259e-02 +-9.65877241025249e-02 -9.61706708592445e-02 -9.57596061270662e-02 -9.53542411326968e-02 +-9.49542798093638e-02 -9.45594211276747e-02 -9.41693614753744e-02 -9.37837970603238e-02 +-9.34024263109629e-02 -9.30249522487501e-02 -9.26510848075357e-02 -9.22805430755678e-02 +-9.19130574368007e-02 -9.15483715893943e-02 -9.11862444207184e-02 -9.08264517198204e-02 +-9.04687877101303e-02 -9.01130663871702e-02 -8.97591226481682e-02 -8.94068132027306e-02 +-8.90560172560838e-02 -8.87066369588185e-02 -8.83585976195503e-02 -8.80118476794011e-02 +-8.76663584497072e-02 -8.73221236168149e-02 -8.69791585202450e-02 -8.66374992128291e-02 +-8.62972013136578e-02 -8.59583386667803e-02 -8.56210018205560e-02 -8.52852963443535e-02 +-8.49513410009030e-02 -8.46192657940224e-02 -8.42892099126413e-02 -8.39613195930249e-02 +-8.36357459218499e-02 -8.33126426032962e-02 -8.29921637135815e-02 -8.26744614663988e-02 +-8.23596840124928e-02 -8.20479732961575e-02 -8.17394629907492e-02 -8.14342765343903e-02 +-8.11325252859151e-02 -8.08343068197769e-02 -8.05397033771145e-02 -8.02487804884952e-02 +-7.99615857820026e-02 -7.96781479883718e-02 -7.93984761527816e-02 -7.91225590607463e-02 +-7.88503648832977e-02 -7.85818410443712e-02 -7.83169143109944e-02 -7.80554911045789e-02 +-7.77974580293386e-02 -7.75426826116300e-02 -7.72910142418602e-02 -7.70422853085487e-02 +-7.67963125121853e-02 -7.65528983447214e-02 -7.63118327188764e-02 -7.60728947299511e-02 +-7.58358545315451e-02 -7.56004753054626e-02 -7.53665153051963e-02 -7.51337299516918e-02 +-7.49018739596362e-02 -7.46707034722708e-02 -7.44399781827256e-02 -7.42094634200766e-02 +-7.39789321787768e-02 -7.37481670707562e-02 -7.35169621803571e-02 -7.32851248033328e-02 +-7.30524770523934e-02 -7.28188573132072e-02 -7.25841215363575e-02 -7.23481443524834e-02 +-7.21108199996870e-02 -7.18720630542544e-02 -7.16318089577753e-02 -7.13900143358618e-02 +-7.11466571058121e-02 
-7.09017363727355e-02 -7.06552721158229e-02 -7.04073046685876e-02 +-7.01578939989967e-02 -6.99071187974396e-02 -6.96550753824243e-02 -6.94018764357127e-02 +-6.91476495803253e-02 -6.88925358163924e-02 -6.86366878312458e-02 -6.83802682013660e-02 +-6.81234475048523e-02 -6.78664023639320e-02 -6.76093134376792e-02 -6.73523633855498e-02 +-6.70957348225748e-02 -6.68396082870652e-02 -6.65841602414885e-02 -6.63295611267662e-02 +-6.60759734896324e-02 -6.58235502018778e-02 -6.55724327893077e-02 -6.53227498870580e-02 +-6.50746158365705e-02 -6.48281294380323e-02 -6.45833728704455e-02 -6.43404107897481e-02 +-6.40992896135489e-02 -6.38600369991077e-02 -6.36226615191941e-02 -6.33871525384249e-02 +-6.31534802906222e-02 -6.29215961556781e-02 -6.26914331323811e-02 -6.24629065016681e-02 +-6.22359146728386e-02 -6.20103402034238e-02 -6.17860509816565e-02 -6.15629015588679e-02 +-6.13407346176405e-02 -6.11193825602088e-02 -6.08986692004204e-02 -6.06784115415641e-02 +-6.04584216215522e-02 -6.02385084063140e-02 -6.00184797118288e-02 -5.97981441349924e-02 +-5.95773129734864e-02 -5.93558021149961e-02 -5.91334338764946e-02 -5.89100387748867e-02 +-5.86854572110640e-02 -5.84595410503660e-02 -5.82321550835547e-02 -5.80031783536811e-02 +-5.77725053356435e-02 -5.75400469567777e-02 -5.73057314484926e-02 -5.70695050207146e-02 +-5.68313323527503e-02 -5.65911968960728e-02 -5.63491009864740e-02 -5.61050657649930e-02 +-5.58591309089775e-02 -5.56113541765924e-02 -5.53618107699745e-02 -5.51105925240866e-02 +-5.48578069300833e-02 -5.46035760036731e-02 -5.43480350105186e-02 -5.40913310621464e-02 +-5.38336215971288e-02 -5.35750727634321e-02 -5.33158577187926e-02 -5.30561548667790e-02 +-5.27961460468038e-02 -5.25360146967735e-02 -5.22759440072908e-02 -5.20161150863602e-02 +-5.17567051533865e-02 -5.14978857809049e-02 -5.12398212019403e-02 -5.09826667001719e-02 +-5.07265670991841e-02 -5.04716553660209e-02 -5.02180513430507e-02 -4.99658606207896e-02 +-4.97151735628532e-02 -4.94660644926126e-02 
-4.92185910494436e-02 -4.89727937206973e-02 +-4.87286955536940e-02 -4.84863020501804e-02 -4.82456012438126e-02 -4.80065639593332e-02 +-4.77691442502521e-02 -4.75332800100062e-02 -4.72988937498007e-02 -4.70658935346308e-02 +-4.68341740673771e-02 -4.66036179093602e-02 -4.63740968243627e-02 -4.61454732318789e-02 +-4.59176017542615e-02 -4.56903308414922e-02 -4.54635044565410e-02 -4.52369638036857e-02 +-4.50105490817540e-02 -4.47841012440293e-02 -4.45574637465256e-02 -4.43304842664912e-02 +-4.41030163733401e-02 -4.38749211347306e-02 -4.36460686412123e-02 -4.34163394337281e-02 +-4.31856258192866e-02 -4.29538330612983e-02 -4.27208804323836e-02 -4.24867021188980e-02 +-4.22512479679668e-02 -4.20144840694624e-02 -4.17763931670670e-02 -4.15369748943440e-02 +-4.12962458335467e-02 -4.10542393967303e-02 -4.08110055305680e-02 -4.05666102480926e-02 +-4.03211349923665e-02 -4.00746758388179e-02 -3.98273425446378e-02 -3.95792574552064e-02 +-3.93305542789851e-02 -3.90813767436581e-02 -3.88318771475298e-02 -3.85822148212476e-02 +-3.83325545158442e-02 -3.80830647338401e-02 -3.78339160207266e-02 -3.75852792345505e-02 +-3.73373238115414e-02 -3.70902160457514e-02 -3.68441174005339e-02 -3.65991828693460e-02 +-3.63555594028543e-02 -3.61133844186294e-02 -3.58727844088661e-02 -3.56338736605512e-02 +-3.53967531013413e-02 -3.51615092831197e-02 -3.49282135137807e-02 -3.46969211462653e-02 +-3.44676710322556e-02 -3.42404851462384e-02 -3.40153683838985e-02 -3.37923085370109e-02 +-3.35712764451828e-02 -3.33522263229812e-02 -3.31350962591774e-02 -3.29198088830702e-02 +-3.27062721911306e-02 -3.24943805255625e-02 -3.22840156948087e-02 -3.20750482245708e-02 +-3.18673387265671e-02 -3.16607393710344e-02 -3.14550954479131e-02 -3.12502470007320e-02 +-3.10460305164559e-02 -3.08422806539749e-02 -3.06388319935077e-02 -3.04355207889634e-02 +-3.02321867052614e-02 -3.00286745227570e-02 -2.98248357912342e-02 -2.96205304164352e-02 +-2.94156281627741e-02 -2.92100100567210e-02 -2.90035696763492e-02 
-2.87962143136904e-02 +-2.85878659978281e-02 -2.83784623680753e-02 -2.81679573881021e-02 -2.79563218934976e-02 +-2.77435439669435e-02 -2.75296291369324e-02 -2.73146003977612e-02 -2.70984980503551e-02 +-2.68813793653028e-02 -2.66633180713025e-02 -2.64444036740045e-02 -2.62247406119718e-02 +-2.60044472581471e-02 -2.57836547768094e-02 -2.55625058474795e-02 -2.53411532686126e-02 +-2.51197584551569e-02 -2.48984898451554e-02 -2.46775212315197e-02 -2.44570300358885e-02 +-2.42371955420946e-02 -2.40181971072067e-02 -2.38002123683640e-02 -2.35834154636923e-02 +-2.33679752854759e-02 -2.31540537834616e-02 -2.29418043356867e-02 -2.27313702035674e-02 +-2.25228830871538e-02 -2.23164617954658e-02 -2.21122110456824e-02 -2.19102204036690e-02 +-2.17105633769151e-02 -2.15132966694282e-02 -2.13184596064979e-02 -2.11260737355428e-02 +-2.09361426074675e-02 -2.07486517411434e-02 -2.05635687717668e-02 -2.03808437819920e-02 +-2.02004098128739e-02 -2.00221835498336e-02 -1.98460661770721e-02 -1.96719443921399e-02 +-1.94996915707290e-02 -1.93291690702129e-02 -1.91602276590311e-02 -1.89927090577125e-02 +-1.88264475761701e-02 -1.86612718308945e-02 -1.84970065248255e-02 -1.83334742720092e-02 +-1.81704974486570e-02 -1.80079000519057e-02 -1.78455095474652e-02 -1.76831586873996e-02 +-1.75206872795484e-02 -1.73579438905340e-02 -1.71947874649312e-02 -1.70310888439706e-02 +-1.68667321681241e-02 -1.67016161490488e-02 -1.65356551976436e-02 -1.63687803963902e-02 +-1.62009403056859e-02 -1.60321015955178e-02 -1.58622494955679e-02 -1.56913880586400e-02 +-1.55195402341686e-02 -1.53467477504680e-02 -1.51730708062990e-02 -1.49985875742506e-02 +-1.48233935203271e-02 -1.46476005459958e-02 -1.44713359607466e-02 -1.42947412949416e-02 +-1.41179709643619e-02 -1.39411907993866e-02 -1.37645764531264e-02 -1.35883117041006e-02 +-1.34125866701427e-02 -1.32375959511684e-02 -1.30635367192027e-02 -1.28906067746488e-02 +-1.27190025881860e-02 -1.25489173478803e-02 -1.23805390311148e-02 -1.22140485207520e-02 
+-1.20496177845721e-02 -1.18874081364560e-02 -1.17275685970332e-02 -1.15702343705790e-02 +-1.14155254538441e-02 -1.12635453912405e-02 -1.11143801893956e-02 -1.09680974025456e-02 +-1.08247453985796e-02 -1.06843528137799e-02 -1.05469282024559e-02 -1.04124598857519e-02 +-1.02809160019442e-02 -1.01522447585488e-02 -1.00263748845530e-02 -9.90321627909459e-03 +-9.78266085094326e-03 -9.66458354122278e-03 -9.54884351996865e-03 -9.43528554534746e-03 +-9.32374147271410e-03 -9.21403189913927e-03 -9.10596792764380e-03 -8.99935303412230e-03 +-8.89398501885290e-03 -8.78965802357524e-03 -8.68616459438835e-03 -8.58329777018271e-03 +-8.48085317597783e-03 -8.37863110039570e-03 -8.27643853656437e-03 -8.17409116601047e-03 +-8.07141526556524e-03 -7.96824951797419e-03 -7.86444670775348e-03 -7.75987528487627e-03 +-7.65442078008361e-03 -7.54798705698995e-03 -7.44049738767929e-03 -7.33189534014615e-03 +-7.22214546771971e-03 -7.11123379248833e-03 -6.99916807671315e-03 -6.88597787825244e-03 +-6.77171438809915e-03 -6.65645005024223e-03 -6.54027796617568e-03 -6.42331108848030e-03 +-6.30568120996727e-03 -6.18753775688461e-03 -6.06904639662566e-03 -5.95038747222129e-03 +-5.83175427763261e-03 -5.71335118946721e-03 -5.59539167220078e-03 -5.47809617529496e-03 +-5.36168994172760e-03 -5.24640074840774e-03 -5.13245659969708e-03 -5.02008339581809e-03 +-4.90950259827096e-03 -4.80092891451162e-03 -4.69456802405312e-03 -4.59061436783500e-03 +-4.48924902215930e-03 -4.39063767770680e-03 -4.29492874311418e-03 -4.20225159129656e-03 +-4.11271496511737e-03 -4.02640555710284e-03 -3.94338677562674e-03 -3.86369770728293e-03 +-3.78735228193430e-03 -3.71433864306323e-03 -3.64461872142481e-03 -3.57812800446641e-03 +-3.51477548738126e-03 -3.45444378387834e-03 -3.39698936572294e-03 -3.34224288987837e-03 +-3.29000956093733e-03 -3.24006946502756e-03 -3.19217780052001e-03 -3.14606492219549e-03 +-3.10143611126606e-03 -3.05797098675461e-03 -3.01532248789178e-03 -2.97311538665309e-03 +-2.93094433881708e-03 
-2.88837155511671e-03 -2.84492427409561e-03 -2.80009234577079e-03 +-2.75332638708186e-03 -2.70403713830627e-03 -2.65159681983708e-03 -2.59534343975040e-03 +-2.53458910637960e-03 -2.46863342320280e-03 -2.39678294962950e-03 -2.31837746657468e-03 +-2.23282336375606e-03 -2.13963385515171e-03 -2.03847494089996e-03 -1.92921510734253e-03 +-1.81197576313001e-03 -1.68717845113431e-03 -1.55558408214493e-03 -1.41831895019764e-03 +-1.27688225149128e-03 -1.13313035573195e-03 -9.89234239938452e-04 -8.47608291496192e-04 +-7.10811037207587e-04 -5.81421086687402e-04 -4.61894437974251e-04 -3.54411965955773e-04 +-2.60728058562509e-04 -1.82032657071334e-04 -1.18839135396120e-04 -7.09093682914694e-05 +-3.72249821693773e-05 -1.60103058245582e-05 -4.80824581306925e-06 -6.05630463922488e-07 +0.00000000000000e+00 diff --git a/tests/PP_ORB/Si_gga_8au_100Ry_3s3p2d.orb b/tests/PP_ORB/Si_gga_8au_100Ry_3s3p2d.orb new file mode 100644 index 0000000000..326b8d6f49 --- /dev/null +++ b/tests/PP_ORB/Si_gga_8au_100Ry_3s3p2d.orb @@ -0,0 +1,1637 @@ +--------------------------------------------------------------------------- +Element Si +Energy Cutoff(Ry) 100 +Radius Cutoff(a.u.) 
8 +Lmax 2 +Number of Sorbital--> 3 +Number of Porbital--> 3 +Number of Dorbital--> 2 +--------------------------------------------------------------------------- +SUMMARY END + +Mesh 801 +dr 0.01 + Type L N + 0 0 0 +9.79216937054568e-02 9.79816864317061e-02 9.81616189465364e-02 9.84613543015764e-02 +9.88806643941938e-02 9.94192301846584e-02 1.00076642000530e-01 1.00852399928618e-01 +1.01745914294944e-01 1.02756506233202e-01 1.03883408342277e-01 1.05125765433411e-01 +1.06482635367613e-01 1.07952989983937e-01 1.09535716119191e-01 1.11229616719608e-01 +1.13033412044928e-01 1.14945740965229e-01 1.16965162350737e-01 1.19090156554657e-01 +1.21319126988905e-01 1.23650401792401e-01 1.26082235591352e-01 1.28612811350686e-01 +1.31240242315514e-01 1.33962574041209e-01 1.36777786510339e-01 1.39683796334366e-01 +1.42678459037680e-01 1.45759571421129e-01 1.48924874001889e-01 1.52172053526077e-01 +1.55498745550171e-01 1.58902537086909e-01 1.62380969310945e-01 1.65931540319203e-01 +1.69551707940491e-01 1.73238892588612e-01 1.76990480152890e-01 1.80803824919720e-01 +1.84676252518508e-01 1.88605062885098e-01 1.92587533235595e-01 1.96620921043320e-01 +2.00702467011499e-01 2.04829398034182e-01 2.08998930137865e-01 2.13208271396248e-01 +2.17454624810638e-01 2.21735191148546e-01 2.26047171733207e-01 2.30387771176888e-01 +2.34754200051097e-01 2.39143677487075e-01 2.43553433700253e-01 2.47980712432735e-01 +2.52422773308252e-01 2.56876894094482e-01 2.61340372868096e-01 2.65810530078417e-01 +2.70284710506097e-01 2.74760285113804e-01 2.79234652786499e-01 2.83705241959468e-01 +2.88169512132947e-01 2.92624955272759e-01 2.97069097097060e-01 3.01499498249919e-01 +3.05913755363089e-01 3.10309502007973e-01 3.14684409540380e-01 3.19036187841278e-01 +3.23362585957337e-01 3.27661392645563e-01 3.31930436826886e-01 3.36167587954007e-01 +3.40370756299268e-01 3.44537893168700e-01 3.48666991048751e-01 3.52756083692490e-01 +3.56803246152347e-01 3.60806594766622e-01 3.64764287107145e-01 3.68674521895533e-01 
+3.72535538895536e-01 3.76345618788891e-01 3.80103083042041e-01 3.83806293770875e-01 +3.87453653610493e-01 3.91043605596661e-01 3.94574633065351e-01 3.98045259576369e-01 +4.01454048866623e-01 4.04799604838170e-01 4.08080571585607e-01 4.11295633466860e-01 +4.14443515220834e-01 4.17522982134759e-01 4.20532840263432e-01 4.23471936701922e-01 +4.26339159912576e-01 4.29133440106544e-01 4.31853749679312e-01 4.34499103699055e-01 +4.37068560445954e-01 4.39561221999950e-01 4.41976234873745e-01 4.44312790687264e-01 +4.46570126879167e-01 4.48747527450451e-01 4.50844323734644e-01 4.52859895188623e-01 +4.54793670197635e-01 4.56645126887709e-01 4.58413793938323e-01 4.60099251387896e-01 +4.61701131424453e-01 4.63219119153652e-01 4.64652953336253e-01 4.66002427087069e-01 +4.67267388527480e-01 4.68447741383635e-01 4.69543445522665e-01 4.70554517419403e-01 +4.71481030546398e-01 4.72323115680323e-01 4.73080961118277e-01 4.73754812797908e-01 +4.74344974315766e-01 4.74851806838850e-01 4.75275728904867e-01 4.75617216107345e-01 +4.75876800662383e-01 4.76055070854520e-01 4.76152670359862e-01 4.76170297445373e-01 +4.76108704043931e-01 4.75968694705508e-01 4.75751125425565e-01 4.75456902352499e-01 +4.75086980376707e-01 4.74642361604528e-01 4.74124093721057e-01 4.73533268246463e-01 +4.72871018691091e-01 4.72138518615257e-01 4.71336979600186e-01 4.70467649137080e-01 +4.69531808441801e-01 4.68530770203056e-01 4.67465876272372e-01 4.66338495304468e-01 +4.65150020356887e-01 4.63901866457987e-01 4.62595468152503e-01 4.61232277034014e-01 +4.59813759273658e-01 4.58341393154406e-01 4.56816666620135e-01 4.55241074848549e-01 +4.53616117856843e-01 4.51943298148685e-01 4.50224118410825e-01 4.48460079267252e-01 +4.46652677098396e-01 4.44803401932459e-01 4.42913735415401e-01 4.40985148865664e-01 +4.39019101419055e-01 4.37017038268713e-01 4.34980389004417e-01 4.32910566054891e-01 +4.30808963236109e-01 4.28676954407979e-01 4.26515892241096e-01 4.24327107094654e-01 +4.22111906005917e-01 4.19871571791065e-01 
4.17607362256577e-01 4.15320509519742e-01 +4.13012219436312e-01 4.10683671132773e-01 4.08336016640178e-01 4.05970380626043e-01 +4.03587860220335e-01 4.01189524931200e-01 3.98776416645712e-01 3.96349549710621e-01 +3.93909911087786e-01 3.91458460578792e-01 3.88996131113037e-01 3.86523829093467e-01 +3.84042434794068e-01 3.81552802803141e-01 3.79055762506463e-01 3.76552118604433e-01 +3.74042651657431e-01 3.71528118653753e-01 3.69009253594648e-01 3.66486768091213e-01 +3.63961351968136e-01 3.61433673869554e-01 3.58904381862611e-01 3.56374104034594e-01 +3.53843449079927e-01 3.51313006873597e-01 3.48783349028036e-01 3.46255029430804e-01 +3.43728584760867e-01 3.41204534981608e-01 3.38683383809139e-01 3.36165619154855e-01 +3.33651713541544e-01 3.31142124492753e-01 3.28637294895431e-01 3.26137653336257e-01 +3.23643614412324e-01 3.21155579017165e-01 3.18673934603389e-01 3.16199055423419e-01 +3.13731302750036e-01 3.11271025078641e-01 3.08818558313270e-01 3.06374225938556e-01 +3.03938339179892e-01 3.01511197154142e-01 2.99093087013253e-01 2.96684284083139e-01 +2.94285052000170e-01 2.91895642847555e-01 2.89516297293801e-01 2.87147244735358e-01 +2.84788703445385e-01 2.82440880730457e-01 2.80103973096827e-01 2.77778166427709e-01 +2.75463636172782e-01 2.73160547550969e-01 2.70869055767266e-01 2.68589306244188e-01 +2.66321434868160e-01 2.64065568250941e-01 2.61821824005933e-01 2.59590311039004e-01 +2.57371129853206e-01 2.55164372866591e-01 2.52970124742079e-01 2.50788462728175e-01 +2.48619457009142e-01 2.46463171063071e-01 2.44319662026168e-01 2.42188981061436e-01 +2.40071173729842e-01 2.37966280361992e-01 2.35874336428249e-01 2.33795372905241e-01 +2.31729416636665e-01 2.29676490686314e-01 2.27636614681315e-01 2.25609805143582e-01 +2.23596075807617e-01 2.21595437922864e-01 2.19607900538946e-01 2.17633470772265e-01 +2.15672154052597e-01 2.13723954348469e-01 2.11788874370314e-01 2.09866915750580e-01 +2.07958079200158e-01 2.06062364640740e-01 2.04179771312883e-01 2.02310297859820e-01 
+2.00453942387240e-01 1.98610702499482e-01 1.96780575312804e-01 1.94963557446582e-01 +1.93159644993491e-01 1.91368833469901e-01 1.89591117747894e-01 1.87826491970462e-01 +1.86074949451592e-01 1.84336482563057e-01 1.82611082609845e-01 1.80898739696252e-01 +1.79199442584709e-01 1.77513178549484e-01 1.75839933227409e-01 1.74179690467776e-01 +1.72532432183571e-01 1.70898138206111e-01 1.69276786145154e-01 1.67668351256430e-01 +1.66072806318457e-01 1.64490121520400e-01 1.62920264362581e-01 1.61363199571117e-01 +1.59818889027982e-01 1.58287291717637e-01 1.56768363691179e-01 1.55262058048758e-01 +1.53768324940826e-01 1.52287111588577e-01 1.50818362323726e-01 1.49362018647560e-01 +1.47918019309021e-01 1.46486300401338e-01 1.45066795476563e-01 1.43659435677158e-01 +1.42264149883618e-01 1.40880864876924e-01 1.39509505514492e-01 1.38149994918124e-01 +1.36802254672356e-01 1.35466205031465e-01 1.34141765133347e-01 1.32828853218366e-01 +1.31527386851238e-01 1.30237283143995e-01 1.28958458978022e-01 1.27690831223215e-01 +1.26434316952296e-01 1.25188833648378e-01 1.23954299403960e-01 1.22730633109570e-01 +1.21517754630424e-01 1.20315584969553e-01 1.19124046415990e-01 1.17943062676760e-01 +1.16772558991577e-01 1.15612462229307e-01 1.14462700965444e-01 1.13323205540029e-01 +1.12193908095643e-01 1.11074742595269e-01 1.09965644820051e-01 1.08866552347139e-01 +1.07777404508039e-01 1.06698142328028e-01 1.05628708447418e-01 1.04569047025614e-01 +1.03519103629063e-01 1.02478825104356e-01 1.01448159437898e-01 1.00427055603660e-01 +9.94154634006602e-02 9.84133332819113e-02 9.74206161766506e-02 9.64372633077214e-02 +9.54632260060286e-02 9.44984555240057e-02 9.35429028500391e-02 9.25965185257762e-02 +9.16592524682147e-02 9.07310537984143e-02 8.98118706786051e-02 8.89016501593748e-02 +8.80003380385202e-02 8.71078787330215e-02 8.62242151654715e-02 8.53492886661417e-02 +8.44830388917138e-02 8.36254037615350e-02 8.27763194120833e-02 8.19357201701465e-02 +8.11035385450357e-02 8.02797052399604e-02 
7.94641491825090e-02 7.86567975739877e-02 +7.78575759571841e-02 7.70664083019436e-02 7.62832171077713e-02 7.55079235225047e-02 +7.47404474759472e-02 7.39807078272083e-02 7.32286225243572e-02 7.24841087748874e-02 +7.17470832253756e-02 7.10174621486379e-02 7.02951616366098e-02 6.95800977971245e-02 +6.88721869527292e-02 6.81713458396589e-02 6.74774918050881e-02 6.67905430008034e-02 +6.61104185714691e-02 6.54370388357238e-02 6.47703254584057e-02 6.41102016123018e-02 +6.34565921279167e-02 6.28094236298728e-02 6.21686246586889e-02 6.15341257768253e-02 +6.09058596580387e-02 6.02837611592515e-02 5.96677673743139e-02 5.90578176692085e-02 +5.84538536984333e-02 5.78558194024751e-02 5.72636609864740e-02 5.66773268803578e-02 +5.60967676809029e-02 5.55219360763581e-02 5.49527867544253e-02 5.43892762945598e-02 +5.38313630456957e-02 5.32790069906435e-02 5.27321695985328e-02 5.21908136667837e-02 +5.16549031541904e-02 5.11244030067805e-02 5.05992789781823e-02 5.00794974462756e-02 +4.95650252279419e-02 4.90558293937313e-02 4.85518770842728e-02 4.80531353302207e-02 +4.75595708774996e-02 4.70711500195477e-02 4.65878384381899e-02 4.61096010546799e-02 +4.56364018923502e-02 4.51682039521894e-02 4.47049691025368e-02 4.42466579839439e-02 +4.37932299301009e-02 4.33446429055647e-02 4.29008534608627e-02 4.24618167053689e-02 +4.20274862981800e-02 4.15978144570354e-02 4.11727519851539e-02 4.07522483156778e-02 +4.03362515732494e-02 3.99247086520705e-02 3.95175653096403e-02 3.91147662752120e-02 +3.87162553718636e-02 3.83219756509514e-02 3.79318695375884e-02 3.75458789856903e-02 +3.71639456410344e-02 3.67860110107037e-02 3.64120166372266e-02 3.60419042756796e-02 +3.56756160719925e-02 3.53130947406877e-02 3.49542837402916e-02 3.45991274446822e-02 +3.42475713086785e-02 3.38995620262362e-02 3.35550476796879e-02 3.32139778785575e-02 +3.28763038865805e-02 3.25419787356799e-02 3.22109573257777e-02 3.18831965094601e-02 +3.15586551606668e-02 3.12372942267295e-02 3.09190767632513e-02 3.06039679514849e-02 
+3.02919350980423e-02 2.99829476169399e-02 2.96769769941569e-02 2.93739967350558e-02 +2.90739822951832e-02 2.87769109951285e-02 2.84827619202764e-02 2.81915158064358e-02 +2.79031549124640e-02 2.76176628811338e-02 2.73350245896032e-02 2.70552259909502e-02 +2.67782539483184e-02 2.65040960632959e-02 2.62327405001973e-02 2.59641758079660e-02 +2.56983907414286e-02 2.54353740836435e-02 2.51751144710705e-02 2.49176002232608e-02 +2.46628191787212e-02 2.44107585385426e-02 2.41614047193093e-02 2.39147432167094e-02 +2.36707584811629e-02 2.34294338066652e-02 2.31907512339126e-02 2.29546914686350e-02 +2.27212338159122e-02 2.24903561310915e-02 2.22620347877621e-02 2.20362446630731e-02 +2.18129591405096e-02 2.15921501300735e-02 2.13737881056413e-02 2.11578421591018e-02 +2.09442800707147e-02 2.07330683949682e-02 2.05241725610635e-02 2.03175569870087e-02 +2.01131852061717e-02 1.99110200050185e-02 1.97110235706527e-02 1.95131576466771e-02 +1.93173836958131e-02 1.91236630676501e-02 1.89319571698439e-02 1.87422276410495e-02 +1.85544365238557e-02 1.83685464359891e-02 1.81845207380699e-02 1.80023236962367e-02 +1.78219206380055e-02 1.76432780997954e-02 1.74663639646333e-02 1.72911475886457e-02 +1.71175999150564e-02 1.69456935745284e-02 1.67754029708251e-02 1.66067043509050e-02 +1.64395758587222e-02 1.62739975721577e-02 1.61099515226789e-02 1.59474216974882e-02 +1.57863940240970e-02 1.56268563374326e-02 1.54687983297585e-02 1.53122114838553e-02 +1.51570889900775e-02 1.50034256480583e-02 1.48512177539874e-02 1.47004629745288e-02 +1.45511602085796e-02 1.44033094381897e-02 1.42569115700731e-02 1.41119682692332e-02 +1.39684817863067e-02 1.38264547802926e-02 1.36858901383832e-02 1.35467907946416e-02 +1.34091595492884e-02 1.32729988903547e-02 1.31383108194397e-02 1.30050966832725e-02 +1.28733570127267e-02 1.27430913708640e-02 1.26142982114982e-02 1.24869747496732e-02 +1.23611168453312e-02 1.22367189013239e-02 1.21137737767803e-02 1.19922727166979e-02 +1.18722052984657e-02 1.17535593958664e-02 
1.16363211609324e-02 1.15204750238609e-02 +1.14060037110147e-02 1.12928882808598e-02 1.11811081775180e-02 1.10706413014374e-02 +1.09614640965181e-02 1.08535516528685e-02 1.07468778242131e-02 1.06414153588286e-02 +1.05371360427516e-02 1.04340108538778e-02 1.03320101254645e-02 1.02311037174549e-02 +1.01312611939601e-02 1.00324520051740e-02 9.93464567194867e-03 9.83781197122650e-03 +9.74192112051579e-03 9.64694395959917e-03 9.55285212768884e-03 9.45961823428081e-03 +9.36721602202022e-03 9.27562051996120e-03 9.18480818569480e-03 9.09475703492419e-03 +9.00544675718343e-03 8.91685881652846e-03 8.82897653617152e-03 8.74178516618419e-03 +8.65527193355614e-03 8.56942607406617e-03 8.48423884559779e-03 8.39970352271044e-03 +8.31581537245895e-03 8.23257161163629e-03 8.14997134579551e-03 8.06801549058358e-03 +7.98670667609464e-03 7.90604913511608e-03 7.82604857629898e-03 7.74671204343580e-03 +7.66804776216387e-03 7.59006497554139e-03 7.51277377005300e-03 7.43618489370138e-03 +7.36030956792150e-03 7.28515929512027e-03 7.21074566369238e-03 7.13708015239361e-03 +7.06417393596388e-03 6.99203769388871e-03 6.92068142416123e-03 6.85011426386598e-03 +6.78034431834556e-03 6.71137850063381e-03 6.64322238274607e-03 6.57588006030713e-03 +6.50935403187465e-03 6.44364509417780e-03 6.37875225434108e-03 6.31467266000409e-03 +6.25140154807721e-03 6.18893221269602e-03 6.12725599275458e-03 6.06636227920938e-03 +6.00623854215526e-03 5.94687037748481e-03 5.88824157275188e-03 5.83033419167340e-03 +5.77312867652205e-03 5.71660396748588e-03 5.66073763790459e-03 5.60550604413351e-03 +5.55088448864153e-03 5.49684739481446e-03 5.44336849181719e-03 5.39042100776341e-03 +5.33797786935474e-03 5.28601190607999e-03 5.23449605701374e-03 5.18340357821980e-03 +5.13270824874919e-03 5.08238457322848e-03 5.03240797905590e-03 4.98275500626736e-03 +4.93340348819389e-03 4.88433272111310e-03 4.83552362119358e-03 4.78695886714471e-03 +4.73862302711464e-03 4.69050266852211e-03 4.64258644966503e-03 4.59486519211856e-03 
+4.54733193311238e-03 4.49998195726651e-03 4.45281280725805e-03 4.40582427319053e-03 +4.35901836064090e-03 4.31239923756153e-03 4.26597316041852e-03 4.21974838014760e-03 +4.17373502870430e-03 4.12794498717501e-03 4.08239173659630e-03 4.03709019280106e-03 +3.99205652677006e-03 3.94730797211317e-03 3.90286262143688e-03 3.85873921347039e-03 +3.81495691292129e-03 3.77153508511271e-03 3.72849306751525e-03 3.68584994032979e-03 +3.64362429829835e-03 3.60183402592334e-03 3.56049607825496e-03 3.51962626936890e-03 +3.47923907059615e-03 3.43934742048784e-03 3.39996254840021e-03 3.36109381346818e-03 +3.32274856060258e-03 3.28493199499629e-03 3.24764707646039e-03 3.21089443473384e-03 +3.17467230672028e-03 3.13897649640689e-03 3.10380035801242e-03 3.06913480269733e-03 +3.03496832895135e-03 3.00128707655295e-03 2.96807490377394e-03 2.93531348728409e-03 +2.90298244399472e-03 2.87105947387044e-03 2.83952052253698e-03 2.80833996232024e-03 +2.77749079017125e-03 2.74694484076385e-03 2.71667301289950e-03 2.68664550721692e-03 +2.65683207308494e-03 2.62720226245789e-03 2.59772568839139e-03 2.56837228585770e-03 +2.53911257246068e-03 2.50991790663471e-03 2.48076074091622e-03 2.45161486790516e-03 +2.42245565658250e-03 2.39326027672100e-03 2.36400790921871e-03 2.33467994029742e-03 +2.30526013763949e-03 2.27573480668720e-03 2.24609292549588e-03 2.21632625671481e-03 +2.18642943546683e-03 2.15640003210658e-03 2.12623858905741e-03 2.09594863115463e-03 +2.06553664915799e-03 2.03501205633516e-03 2.00438711825976e-03 1.97367685620875e-03 +1.94289892478452e-03 1.91207346462092e-03 1.88122293126349e-03 1.85037190153353e-03 +1.81954685889648e-03 1.78877595955323e-03 1.75808878115625e-03 1.72751605622146e-03 +1.69708939245550e-03 1.66684098235153e-03 1.63680330451696e-03 1.60700881928674e-03 +1.57748966124409e-03 1.54827733131539e-03 1.51940239112714e-03 1.49089416230915e-03 +1.46278043340207e-03 1.43508717697224e-03 1.40783827945951e-03 1.38105528617714e-03 +1.35475716374852e-03 1.32896008209993e-03 
1.30367721792882e-03 1.27891858132550e-03 +1.25469086693658e-03 1.23099733070809e-03 1.20783769281885e-03 1.18520806688976e-03 +1.16310091490793e-03 1.14150502650104e-03 1.12040552020684e-03 1.09978386316556e-03 +1.07961790418951e-03 1.05988191341678e-03 1.04054661973534e-03 1.02157923492656e-03 +1.00294345112131e-03 9.84599395895200e-04 9.66503527452629e-04 9.48608451313849e-04 +9.30862640322061e-04 9.13210042378052e-04 8.95589565970295e-04 8.77934443238570e-04 +8.60171484890407e-04 8.42220261486939e-04 8.23992271701546e-04 8.05390189759837e-04 +7.86307320046672e-04 7.66627424299346e-04 7.46225121966167e-04 7.24967091872979e-04 +7.02714316706293e-04 6.79325603600776e-04 6.54662576878862e-04 6.28596266330583e-04 +6.01015302325872e-04 5.71835577325015e-04 5.41011047000141e-04 5.08545134448645e-04 +4.74501985724313e-04 4.39016628113471e-04 4.02302932692286e-04 3.64658209883649e-04 +3.26463299028232e-04 2.88177171837701e-04 2.50325365249602e-04 2.13481986704443e-04 +1.78245571519429e-04 1.45209676941273e-04 1.14929713715307e-04 8.78880756707166e-05 +6.44600593528432e-05 4.48833031541250e-05 2.92334679271831e-05 1.74086018597671e-05 +9.12408482242341e-06 3.91926768825791e-06 1.17597750546750e-06 1.48041720711579e-07 +-0.00000000000000e+00 + Type L N + 0 0 1 +-7.83974201966619e-01 -7.83806368914025e-01 -7.83303045744561e-01 -7.82464759879517e-01 +-7.81292388545333e-01 -7.79787156071118e-01 -7.77950630127283e-01 -7.75784716927193e-01 +-7.73291655419739e-01 -7.70474010506517e-01 -7.67334665322851e-01 -7.63876812627174e-01 +-7.60103945348196e-01 -7.56019846343930e-01 -7.51628577430841e-01 -7.46934467745147e-01 +-7.41942101501722e-01 -7.36656305218908e-01 -7.31082134479947e-01 -7.25224860303717e-01 +-7.19089955198810e-01 -7.12683078975929e-01 -7.06010064393945e-01 -6.99076902714821e-01 +-6.91889729241963e-01 -6.84454808915396e-01 -6.76778522035532e-01 -6.68867350185148e-01 +-6.60727862416629e-01 -6.52366701768511e-01 -6.43790572171927e-01 -6.35006225803745e-01 
+-6.26020450939072e-01 -6.16840060351247e-01 -6.07471880302806e-01 -5.97922740165805e-01 +-5.88199462704774e-01 -5.78308855050188e-01 -5.68257700384887e-01 -5.58052750360315e-01 +-5.47700718253899e-01 -5.37208272873289e-01 -5.26582033207686e-01 -5.15828563821053e-01 +-5.04954370976697e-01 -4.93965899477617e-01 -4.82869530202069e-01 -4.71671578309153e-01 +-4.60378292084821e-01 -4.48995852394608e-01 -4.37530372705665e-01 -4.25987899637226e-01 +-4.14374413995682e-01 -4.02695832247772e-01 -3.90958008383203e-01 -3.79166736116265e-01 +-3.67327751374587e-01 -3.55446735022345e-01 -3.43529315764706e-01 -3.31581073180248e-01 +-3.19607540828533e-01 -3.07614209380755e-01 -2.95606529722632e-01 -2.83589915980312e-01 +-2.71569748422029e-01 -2.59551376190558e-01 -2.47540119824198e-01 -2.35541273526933e-01 +-2.23560107151646e-01 -2.11601867863722e-01 -1.99671781456063e-01 -1.87775053290325e-01 +-1.75916868843233e-01 -1.64102393840884e-01 -1.52336773968098e-01 -1.40625134144091e-01 +-1.28972577359940e-01 -1.17384183077426e-01 -1.05865005192997e-01 -9.44200695745064e-02 +-8.30543711823035e-02 -7.17728707898865e-02 -6.05804913228715e-02 -4.94821138382869e-02 +-3.84825731692572e-02 -2.75866532629109e-02 -1.67990822418521e-02 -6.12452722173267e-03 +4.43241108065227e-03 1.48672039127492e-02 2.51754001878776e-02 3.53526322849720e-02 +4.53946219077267e-02 5.52971859641995e-02 6.50562424280281e-02 7.46678161428258e-02 +8.41280445320660e-02 9.34331831777856e-02 1.02579611232771e-01 1.11563836632475e-01 +1.20382501074761e-01 1.29032384737640e-01 1.37510410707463e-01 1.45813649092473e-01 +1.53939320799284e-01 1.61884800952578e-01 1.69647621941209e-01 1.77225476076832e-01 +1.84616217854177e-01 1.91817865805108e-01 1.98828603941649e-01 2.05646782786098e-01 +2.12270919989348e-01 2.18699700541334e-01 2.24931976580302e-01 2.30966766810209e-01 +2.36803255538039e-01 2.42440791345118e-01 2.47878885408645e-01 2.53117209491576e-01 +2.58155593620692e-01 2.62994023474193e-01 2.67632637501392e-01 
2.72071723798116e-01 +2.76311716762174e-01 2.80353193553805e-01 2.84196870386287e-01 2.87843598671944e-01 +2.91294361048589e-01 2.94550267311048e-01 2.97612550271749e-01 3.00482561573548e-01 +3.03161767476913e-01 3.05651744642393e-01 3.07954175927923e-01 3.10070846218996e-01 +3.12003638308123e-01 3.13754528838212e-01 3.15325584322726e-01 3.16718957253544e-01 +3.17936882305546e-01 3.18981672644972e-01 3.19855716346675e-01 3.20561472923421e-01 +3.21101469968547e-01 3.21478299911420e-01 3.21694616883413e-01 3.21753133690465e-01 +3.21656618886737e-01 3.21407893942508e-01 3.21009830498139e-01 3.20465347694867e-01 +3.19777409572236e-01 3.18949022521164e-01 3.17983232781107e-01 3.16883123969323e-01 +3.15651814630031e-01 3.14292455791227e-01 3.12808228517041e-01 3.11202341443855e-01 +3.09478028288914e-01 3.07638545320800e-01 3.05687168781986e-01 3.03627192254659e-01 +3.01461923962103e-01 2.99194683999174e-01 2.96828801486736e-01 2.94367611646375e-01 +2.91814452793187e-01 2.89172663246042e-01 2.86445578156310e-01 2.83636526257677e-01 +2.80748826541298e-01 2.77785784862186e-01 2.74750690484287e-01 2.71646812573246e-01 +2.68477396647331e-01 2.65245660998370e-01 2.61954793095819e-01 2.58607945988272e-01 +2.55208234717722e-01 2.51758732762817e-01 2.48262468528062e-01 2.44722421896519e-01 +2.41141520863960e-01 2.37522638272651e-01 2.33868588663039e-01 2.30182125261436e-01 +2.26465937121524e-01 2.22722646437018e-01 2.18954806042110e-01 2.15164897115550e-01 +2.11355327103154e-01 2.07528427872383e-01 2.03686454111349e-01 1.99831581983113e-01 +1.95965908044606e-01 1.92091448437793e-01 1.88210138358942e-01 1.84323831809984e-01 +1.80434301634058e-01 1.76543239835332e-01 1.72652258181256e-01 1.68762889083334e-01 +1.64876586750592e-01 1.60994728607892e-01 1.57118616969375e-01 1.53249480955446e-01 +1.49388478639945e-01 1.45536699412508e-01 1.41695166539525e-01 1.37864839905718e-01 +1.34046618917041e-01 1.30241345544515e-01 1.26449807487595e-01 1.22672741434925e-01 +1.18910836399692e-01 
1.15164737106377e-01 1.11435047405470e-01 1.07722333692686e-01 +1.04027128309350e-01 1.00349932900998e-01 9.66912217117667e-02 9.30514447928537e-02 +8.94310311042749e-02 8.58303914901772e-02 8.22499215092308e-02 7.86900041030014e-02 +7.51510120867255e-02 7.16333104485720e-02 6.81372584452228e-02 6.46632114834603e-02 +6.12115227793828e-02 5.77825447888548e-02 5.43766304048312e-02 5.09941339192504e-02 +4.76354117492550e-02 4.43008229295319e-02 4.09907293746065e-02 3.77054959168580e-02 +3.44454901279584e-02 3.12110819332224e-02 2.80026430301020e-02 2.48205461236387e-02 +2.16651639931781e-02 1.85368684059801e-02 1.54360288945376e-02 1.23630114154415e-02 +9.31817690847828e-03 6.30187977531507e-03 3.31446629762595e-03 3.56273014803623e-04 +-2.57237491846816e-03 -5.47116537459174e-03 -8.33980087543582e-03 -1.11780001752264e-02 +-1.39854997850619e-02 -1.67620554209638e-02 -1.95074433583150e-02 -2.22214616766537e-02 +-2.49039313800249e-02 -2.75546973794726e-02 -3.01736293257494e-02 -3.27606222819033e-02 +-3.53155972270880e-02 -3.78385013846905e-02 -4.03293083696676e-02 -4.27880181518221e-02 +-4.52146568336117e-02 -4.76092762429312e-02 -4.99719533431623e-02 -5.23027894645849e-02 +-5.46019093630072e-02 -5.68694601131658e-02 -5.91056098460561e-02 -6.13105463408656e-02 +-6.34844754835968e-02 -6.56276196057340e-02 -6.77402157174714e-02 -6.98225136510126e-02 +-7.18747741303054e-02 -7.38972667842679e-02 -7.58902681210852e-02 -7.78540594815077e-02 +-7.97889249892675e-02 -8.16951495167344e-02 -8.35730166837720e-02 -8.54228069074124e-02 +-8.72447955194784e-02 -8.90392509686045e-02 -9.08064331223064e-02 -9.25465916837783e-02 +-9.42599647369997e-02 -9.59467774325187e-02 -9.76072408249344e-02 -9.92415508716727e-02 +-1.00849887601128e-01 -1.02432414456655e-01 -1.03989277821240e-01 -1.05520606726012e-01 +-1.07026512744019e-01 -1.08507090069014e-01 -1.09962415777239e-01 -1.11392550268574e-01 +-1.12797537881707e-01 -1.14177407676458e-01 -1.15532174374832e-01 -1.16861839450969e-01 
+-1.18166392358815e-01 -1.19445811885128e-01 -1.20700067614314e-01 -1.21929121490647e-01 +-1.23132929462568e-01 -1.24311443193090e-01 -1.25464611819816e-01 -1.26592383747682e-01 +-1.27694708457339e-01 -1.28771538312040e-01 -1.29822830345990e-01 -1.30848548017409e-01 +-1.31848662909967e-01 -1.32823156366824e-01 -1.33772021042251e-01 -1.34695262356640e-01 +-1.35592899841714e-01 -1.36464968363860e-01 -1.37311519214709e-01 -1.38132621059403e-01 +-1.38928360734397e-01 -1.39698843888083e-01 -1.40444195459066e-01 -1.41164559988458e-01 +-1.41860101764170e-01 -1.42531004796756e-01 -1.43177472627950e-01 -1.43799727974640e-01 +-1.44398012212519e-01 -1.44972584705178e-01 -1.45523721985797e-01 -1.46051716799976e-01 +-1.46556877019467e-01 -1.47039524437764e-01 -1.47499993459533e-01 -1.47938629696807e-01 +-1.48355788485669e-01 -1.48751833337789e-01 -1.49127134341756e-01 -1.49482066529455e-01 +-1.49817008223038e-01 -1.50132339378060e-01 -1.50428439938320e-01 -1.50705688217708e-01 +-1.50964459324004e-01 -1.51205123639062e-01 -1.51428045369174e-01 -1.51633581178646e-01 +-1.51822078918702e-01 -1.51993876462857e-01 -1.52149300658776e-01 -1.52288666405439e-01 +-1.52412275863175e-01 -1.52520417802749e-01 -1.52613367098352e-01 -1.52691384367834e-01 +-1.52754715762138e-01 -1.52803592904365e-01 -1.52838232977468e-01 -1.52858838958117e-01 +-1.52865599992837e-01 -1.52858691911204e-01 -1.52838277869494e-01 -1.52804509117011e-01 +-1.52757525876109e-01 -1.52697458325875e-01 -1.52624427678466e-01 -1.52538547336262e-01 +-1.52439924117229e-01 -1.52328659535316e-01 -1.52204851122202e-01 -1.52068593776409e-01 +-1.51919981125551e-01 -1.51759106887480e-01 -1.51586066216127e-01 -1.51400957018077e-01 +-1.51203881226287e-01 -1.50994946017799e-01 -1.50774264962969e-01 -1.50541959094428e-01 +-1.50298157884862e-01 -1.50043000123642e-01 -1.49776634683375e-01 -1.49499221168610e-01 +-1.49210930440092e-01 -1.48911945009254e-01 -1.48602459298951e-01 -1.48282679767762e-01 +-1.47952824896591e-01 
-1.47613125037643e-01 -1.47263822127262e-01 -1.46905169265434e-01 +-1.46537430166117e-01 -1.46160878483815e-01 -1.45775797023047e-01 -1.45382476838524e-01 +-1.44981216234895e-01 -1.44572319675936e-01 -1.44156096613927e-01 -1.43732860250737e-01 +-1.43302926242806e-01 -1.42866611362754e-01 -1.42424232130748e-01 -1.41976103429057e-01 +-1.41522537113380e-01 -1.41063840634520e-01 -1.40600315683903e-01 -1.40132256876159e-01 +-1.39659950481619e-01 -1.39183673221086e-01 -1.38703691134583e-01 -1.38220258535095e-01 +-1.37733617057416e-01 -1.37243994811328e-01 -1.36751605647262e-01 -1.36256648541508e-01 +-1.35759307106857e-01 -1.35259749233305e-01 -1.34758126862223e-01 -1.34254575896014e-01 +-1.33749216244026e-01 -1.33242152004099e-01 -1.32733471777841e-01 -1.32223249116397e-01 +-1.31711543092209e-01 -1.31198398991059e-01 -1.30683849117493e-01 -1.30167913705634e-01 +-1.29650601926384e-01 -1.29131912981070e-01 -1.28611837270762e-01 -1.28090357629766e-01 +-1.27567450611188e-01 -1.27043087811952e-01 -1.26517237224319e-01 -1.25989864600672e-01 +-1.25460934818255e-01 -1.24930413230544e-01 -1.24398266992094e-01 -1.23864466343971e-01 +-1.23328985847284e-01 -1.22791805552859e-01 -1.22252912095725e-01 -1.21712299703846e-01 +-1.21169971111383e-01 -1.20625938367694e-01 -1.20080223534362e-01 -1.19532859263585e-01 +-1.18983889252488e-01 -1.18433368569103e-01 -1.17881363847048e-01 -1.17327953347215e-01 +-1.16773226886097e-01 -1.16217285631681e-01 -1.15660241769148e-01 -1.15102218039886e-01 +-1.14543347158579e-01 -1.13983771114325e-01 -1.13423640362855e-01 -1.12863112918043e-01 +-1.12302353351817e-01 -1.11741531712537e-01 -1.11180822372673e-01 -1.10620402817321e-01 +-1.10060452385671e-01 -1.09501150978006e-01 -1.08942677741151e-01 -1.08385209745498e-01 +-1.07828920666825e-01 -1.07273979486083e-01 -1.06720549220138e-01 -1.06168785696177e-01 +-1.05618836382052e-01 -1.05070839284280e-01 -1.04524921924796e-01 -1.03981200406741e-01 +-1.03439778578751e-01 -1.02900747306203e-01 
-1.02364183856892e-01 -1.01830151407431e-01 +-1.01298698675555e-01 -1.00769859682235e-01 -1.00243653646266e-01 -9.97200850127064e-02 +-9.91991436152133e-02 -9.86808049710489e-02 -9.81650307062087e-02 -9.76517691068682e-02 +-9.71409557921037e-02 -9.66325145016629e-02 -9.61263579914291e-02 -9.56223890281764e-02 +-9.51205014742313e-02 -9.46205814517836e-02 -9.41225085757970e-02 -9.36261572438011e-02 +-9.31313979702787e-02 -9.26380987529217e-02 -9.21461264577075e-02 -9.16553482095538e-02 +-9.11656327752446e-02 -9.06768519253842e-02 -9.01888817623290e-02 -8.97016040013626e-02 +-8.92149071928288e-02 -8.87286878734930e-02 -8.82428516360825e-02 -8.77573141067411e-02 +-8.72720018210140e-02 -8.67868529899586e-02 -8.63018181490330e-02 -8.58168606835429e-02 +-8.53319572256228e-02 -8.48470979189620e-02 -8.43622865487706e-02 -8.38775405357792e-02 +-8.33928907943857e-02 -8.29083814563763e-02 -8.24240694629551e-02 -8.19400240290871e-02 +-8.14563259854112e-02 -8.09730670041570e-02 -8.04903487166385e-02 -8.00082817309505e-02 +-7.95269845594652e-02 -7.90465824666114e-02 -7.85672062481996e-02 -7.80889909542275e-02 +-7.76120745676562e-02 -7.71365966520907e-02 -7.66626969816019e-02 -7.61905141661195e-02 +-7.57201842858724e-02 -7.52518395482841e-02 -7.47856069805156e-02 -7.43216071705234e-02 +-7.38599530690317e-02 -7.34007488642454e-02 -7.29440889404286e-02 -7.24900569306742e-02 +-7.20387248732821e-02 -7.15901524801670e-02 -7.11443865246352e-02 -7.07014603547211e-02 +-7.02613935370515e-02 -6.98241916349517e-02 -6.93898461231967e-02 -6.89583344404888e-02 +-6.85296201794024e-02 -6.81036534121966e-02 -6.76803711495725e-02 -6.72596979281515e-02 +-6.68415465211935e-02 -6.64258187658594e-02 -6.60124064991836e-02 -6.56011925938425e-02 +-6.51920520838235e-02 -6.47848533692017e-02 -6.43794594884419e-02 -6.39757294459652e-02 +-6.35735195821557e-02 -6.31726849725436e-02 -6.27730808425914e-02 -6.23745639843260e-02 +-6.19769941610155e-02 -6.15802354861717e-02 -6.11841577633798e-02 
-6.07886377738056e-02 +-6.03935604987094e-02 -5.99988202648935e-02 -5.96043218017355e-02 -5.92099811992834e-02 +-5.88157267578269e-02 -5.84214997203884e-02 -5.80272548806892e-02 -5.76329610603353e-02 +-5.72386014502212e-02 -5.68441738124511e-02 -5.64496905404202e-02 -5.60551785760654e-02 +-5.56606791846789e-02 -5.52662475890558e-02 -5.48719524661164e-02 -5.44778753104825e-02 +-5.40841096707899e-02 -5.36907602657662e-02 -5.32979419882879e-02 -5.29057788067379e-02 +-5.25144025740070e-02 -5.21239517554068e-02 -5.17345700875781e-02 -5.13464051811870e-02 +-5.09596070807796e-02 -5.05743267956238e-02 -5.01907148156935e-02 -4.98089196271328e-02 +-4.94290862415964e-02 -4.90513547537690e-02 -4.86758589411460e-02 -4.83027249197930e-02 +-4.79320698693106e-02 -4.75640008396039e-02 -4.71986136513145e-02 -4.68359919009061e-02 +-4.64762060804241e-02 -4.61193128208836e-02 -4.57653542670718e-02 -4.54143575903200e-02 +-4.50663346444878e-02 -4.47212817690452e-02 -4.43791797417338e-02 -4.40399938818552e-02 +-4.37036743037901e-02 -4.33701563189004e-02 -4.30393609825339e-02 -4.27111957814371e-02 +-4.23855554555163e-02 -4.20623229465645e-02 -4.17413704653257e-02 -4.14225606670893e-02 +-4.11057479249268e-02 -4.07907796886954e-02 -4.04774979170605e-02 -4.01657405690319e-02 +-3.98553431408771e-02 -3.95461402337783e-02 -3.92379671372401e-02 -3.89306614130344e-02 +-3.86240644643964e-02 -3.83180230752564e-02 -3.80123909045099e-02 -3.77070299206865e-02 +-3.74018117628833e-02 -3.70966190144652e-02 -3.67913463768043e-02 -3.64859017312276e-02 +-3.61802070783502e-02 -3.58741993450933e-02 -3.55678310509004e-02 -3.52610708259680e-02 +-3.49539037756862e-02 -3.46463316869189e-02 -3.43383730732441e-02 -3.40300630577955e-02 +-3.37214530938844e-02 -3.34126105251347e-02 -3.31036179883964e-02 -3.27945726642227e-02 +-3.24855853811704e-02 -3.21767795816120e-02 -3.18682901581061e-02 -3.15602621706563e-02 +-3.12528494563796e-02 -3.09462131441935e-02 -3.06405200881079e-02 -3.03359412335584e-02 
+-3.00326499319420e-02 -2.97308202190941e-02 -2.94306250738896e-02 -2.91322346734313e-02 +-2.88358146614315e-02 -2.85415244463644e-02 -2.82495155457986e-02 -2.79599299929819e-02 +-2.76728988212726e-02 -2.73885406413756e-02 -2.71069603255712e-02 -2.68282478122055e-02 +-2.65524770426777e-02 -2.62797050419926e-02 -2.60099711526775e-02 -2.57432964304909e-02 +-2.54796832088977e-02 -2.52191148377500e-02 -2.49615556000283e-02 -2.47069508088593e-02 +-2.44552270853647e-02 -2.42062928162110e-02 -2.39600387880568e-02 -2.37163389944231e-02 +-2.34750516088838e-02 -2.32360201168882e-02 -2.29990745969980e-02 -2.27640331408770e-02 +-2.25307034000075e-02 -2.22988842458536e-02 -2.20683675290447e-02 -2.18389399221380e-02 +-2.16103848296329e-02 -2.13824843481748e-02 -2.11550212592957e-02 -2.09277810366146e-02 +-2.07005538491470e-02 -2.04731365422803e-02 -2.02453345780307e-02 -2.00169639164372e-02 +-1.97878528203449e-02 -1.95578435663972e-02 -1.93267940457788e-02 -1.90945792391283e-02 +-1.88610925510636e-02 -1.86262469909264e-02 -1.83899761876384e-02 -1.81522352279718e-02 +-1.79130013090470e-02 -1.76722741974749e-02 -1.74300764892413e-02 -1.71864536661777e-02 +-1.69414739466529e-02 -1.66952279299460e-02 -1.64478280356007e-02 -1.61994077408991e-02 +-1.59501206214172e-02 -1.57001392014109e-02 -1.54496536225215e-02 -1.51988701409616e-02 +-1.49480094649361e-02 -1.46973049455493e-02 -1.44470006358391e-02 -1.41973492338424e-02 +-1.39486099267323e-02 -1.37010461540530e-02 -1.34549233089137e-02 -1.32105063966747e-02 +-1.29680576711601e-02 -1.27278342687600e-02 -1.24900858609297e-02 -1.22550523455585e-02 +-1.20229615974498e-02 -1.17940272977397e-02 -1.15684468614618e-02 -1.13463994816413e-02 +-1.11280443072611e-02 -1.09135187711544e-02 -1.07029370823262e-02 -1.04963888953278e-02 +-1.02939381670478e-02 -1.00956222085339e-02 -9.90145093610690e-03 -9.71140632189378e-03 +-9.52544203878226e-03 -9.34348328845313e-03 -9.16542679328055e-03 -8.99114092324459e-03 +-8.82046591732402e-03 
-8.65321414499474e-03 -8.48917033753181e-03 -8.32809170118681e-03 +-8.16970780591620e-03 -8.01372012575436e-03 -7.85980109266708e-03 -7.70759251835761e-03 +-7.55670324279652e-03 -7.40670589016039e-03 -7.25713265929078e-03 -7.10747015409586e-03 +-6.95715337646515e-03 -6.80555916551670e-03 -6.65199957428270e-03 -6.49571592479076e-03 +-6.33587456400888e-03 -6.17156563587175e-03 -6.00180645742594e-03 -5.82555129774991e-03 +-5.64170945490644e-03 -5.44917345093214e-03 -5.24685886002099e-03 -5.03375670147445e-03 +-4.80899843652965e-03 -4.57193240703692e-03 -4.32220908527571e-03 -4.05987085783254e-03 +-3.78544038385347e-03 -3.50000003774067e-03 -3.20525379139443e-03 -2.90356234657346e-03 +-2.59794261145427e-03 -2.29202389198140e-03 -1.98995551478319e-03 -1.69626397192401e-03 +-1.41566189757400e-03 -1.15281593020937e-03 -9.12085334605195e-04 -6.97247624126436e-04 +-5.11230777545307e-04 -3.55873472376768e-04 -2.31734664906854e-04 -1.37971629460846e-04 +-7.23012558736875e-05 -3.10532804770985e-05 -9.31671947870908e-06 -1.17280280165167e-06 +-0.00000000000000e+00 + Type L N + 0 0 2 +-2.83066952855300e+00 -2.83032046943513e+00 -2.82927355028300e+00 -2.82752954524026e+00 +-2.82508974301210e+00 -2.82195594456901e+00 -2.81813045994186e+00 -2.81361610411866e+00 +-2.80841619205540e+00 -2.80253453281676e+00 -2.79597542286473e+00 -2.78874363851617e+00 +-2.78084442759262e+00 -2.77228350028837e+00 -2.76306701928507e+00 -2.75320158914360e+00 +-2.74269424500598e+00 -2.73155244064239e+00 -2.71978403588011e+00 -2.70739728345315e+00 +-2.69440081531324e+00 -2.68080362844401e+00 -2.66661507022196e+00 -2.65184482336891e+00 +-2.63650289054190e+00 -2.62059957860705e+00 -2.60414548264521e+00 -2.58715146973736e+00 +-2.56962866257828e+00 -2.55158842296732e+00 -2.53304233522503e+00 -2.51400218958421e+00 +-2.49447996560372e+00 -2.47448781565282e+00 -2.45403804851297e+00 -2.43314311314328e+00 +-2.41181558265454e+00 -2.39006813853556e+00 -2.36791355517412e+00 -2.34536468471300e+00 +-2.32243444227999e+00 
-2.29913579162854e+00 -2.27548173122388e+00 -2.25148528080675e+00 +-2.22715946846486e+00 -2.20251731823930e+00 -2.17757183829071e+00 -2.15233600964694e+00 +-2.12682277555130e+00 -2.10104503142744e+00 -2.07501561547369e+00 -2.04874729989688e+00 +-2.02225278279223e+00 -1.99554468067296e+00 -1.96863552164993e+00 -1.94153773925843e+00 +-1.91426366692627e+00 -1.88682553307391e+00 -1.85923545683463e+00 -1.83150544437937e+00 +-1.80364738582824e+00 -1.77567305272788e+00 -1.74759409607079e+00 -1.71942204483082e+00 +-1.69116830498584e+00 -1.66284415899707e+00 -1.63446076571196e+00 -1.60602916065574e+00 +-1.57756025667531e+00 -1.54906484489730e+00 -1.52055359596102e+00 -1.49203706148606e+00 +-1.46352567573339e+00 -1.43502975741818e+00 -1.40655951163249e+00 -1.37812503183556e+00 +-1.34973630187005e+00 -1.32140319796246e+00 -1.29313549066729e+00 -1.26494284671478e+00 +-1.23683483072379e+00 -1.20882090674244e+00 -1.18091043958096e+00 -1.15311269590289e+00 +-1.12543684504309e+00 -1.09789195952314e+00 -1.07048701523705e+00 -1.04323089128324e+00 +-1.01613236942108e+00 -9.89200133133558e-01 -9.62442766280581e-01 -9.35868751330556e-01 +-9.09486467161260e-01 -8.83304186424248e-01 -8.57330072470471e-01 -8.31572175838189e-01 +-8.06038430307625e-01 -7.80736648530285e-01 -7.55674517244152e-01 -7.30859592089317e-01 +-7.06299292041819e-01 -6.82000893486602e-01 -6.57971523953502e-01 -6.34218155543061e-01 +-6.10747598071668e-01 -5.87566491968087e-01 -5.64681300955782e-01 -5.42098304557590e-01 +-5.19823590461228e-01 -4.97863046785819e-01 -4.76222354291077e-01 -4.54906978571967e-01 +-4.33922162282624e-01 -4.13272917433968e-01 -3.92964017809828e-01 -3.72999991546541e-01 +-3.53385113920803e-01 -3.34123400390112e-01 -3.15218599929447e-01 -2.96674188706804e-01 +-2.78493364139005e-01 -2.60679039367640e-01 -2.43233838193260e-01 -2.26160090503948e-01 +-2.09459828232145e-01 -1.93134781871194e-01 -1.77186377580399e-01 -1.61615734904618e-01 +-1.46423665131377e-01 -1.31610670305412e-01 
-1.17176942917255e-01 -1.03122366279163e-01 +-8.94465155981878e-02 -7.61486597527562e-02 -6.32277637755066e-02 -5.06824920416196e-02 +-3.85112121582904e-02 -2.67119995474623e-02 -1.52826427104481e-02 -4.22064915964631e-03 +6.47674800077162e-03 1.68125828665513e-02 2.67901489218715e-02 3.64129906520616e-02 +4.56848946107054e-02 5.46098799738873e-02 6.31921886167689e-02 7.14362747498529e-02 +7.93467941542810e-02 8.69285930572491e-02 9.41866966901095e-02 1.01126297572996e-01 +1.07752743570787e-01 1.14071525765962e-01 1.20088266194381e-01 1.25808705490228e-01 +1.31238690486300e-01 1.36384161815534e-01 1.41251141559093e-01 1.45845720985509e-01 +1.50174048424374e-01 1.54242317316745e-01 1.58056754482960e-01 1.61623608646860e-01 +1.64949139253481e-01 1.68039605615247e-01 1.70901256419390e-01 1.73540319627008e-01 +1.75962992791558e-01 1.78175433822024e-01 1.80183752213209e-01 1.81994000762829e-01 +1.83612167792180e-01 1.85044169884289e-01 1.86295845150495e-01 1.87372947033496e-01 +1.88281138651982e-01 1.89025987689116e-01 1.89612961824288e-01 1.90047424704810e-01 +1.90334632451602e-01 1.90479730690295e-01 1.90487752096795e-01 1.90363614443992e-01 +1.90112119134155e-01 1.89737950199518e-01 1.89245673751731e-01 1.88639737859131e-01 +1.87924472829317e-01 1.87104091873168e-01 1.86182692125331e-01 1.85164255995236e-01 +1.84052652821983e-01 1.82851640805856e-01 1.81564869188882e-01 1.80195880656653e-01 +1.78748113933662e-01 1.77224906544581e-01 1.75629497714273e-01 1.73965031379856e-01 +1.72234559288843e-01 1.70441044158184e-01 1.68587362870048e-01 1.66676309681239e-01 +1.64710599424401e-01 1.62692870680450e-01 1.60625688903091e-01 1.58511549477765e-01 +1.56352880698925e-01 1.54152046651116e-01 1.51911349981001e-01 1.49633034549114e-01 +1.47319287951793e-01 1.44972243905418e-01 1.42593984486726e-01 1.40186542224606e-01 +1.37751902040352e-01 1.35292003034890e-01 1.32808740122979e-01 1.30303965515778e-01 +1.27779490054503e-01 1.25237084399145e-01 1.22678480077357e-01 
1.20105370399673e-01 +1.17519411248168e-01 1.14922221746499e-01 1.12315384820004e-01 1.09700447655130e-01 +1.07078922067986e-01 1.04452284792185e-01 1.01821977696410e-01 9.91894079423195e-02 +9.65559480934386e-02 9.39229361856420e-02 9.12916757696854e-02 8.86634359359865e-02 +8.60394513315248e-02 8.34209221783001e-02 8.08090143023092e-02 7.82048591814062e-02 +7.56095540198131e-02 7.30241618563351e-02 7.04497117126318e-02 6.78871987871088e-02 +6.53375846992140e-02 6.28017977880984e-02 6.02807334687784e-02 5.77752546480996e-02 +5.52861922019829e-02 5.28143455146037e-02 5.03604830793776e-02 4.79253431608302e-02 +4.55096345157112e-02 4.31140371709994e-02 4.07392032557937e-02 3.83857578834760e-02 +3.60543000799765e-02 3.37454037534738e-02 3.14596187004263e-02 2.91974716424502e-02 +2.69594672882510e-02 2.47460894145701e-02 2.25578019599161e-02 2.03950501247489e-02 +1.82582614717180e-02 1.61478470195775e-02 1.40642023244647e-02 1.20077085423675e-02 +9.97873346678333e-03 7.97763253581952e-03 6.00474980326314e-03 4.06041886848905e-03 +2.14496376043731e-03 2.58699771304491e-04 -1.59806576397665e-03 -3.42503274592510e-03 +-5.22190764934981e-03 -6.98840292531619e-03 -8.72423646821887e-03 -1.04291311494229e-02 +-1.21028144184295e-02 -1.37450179720109e-02 -1.53554774912721e-02 -1.69339324461079e-02 +-1.84801259660639e-02 -1.99938047761739e-02 -2.14747191959249e-02 -2.29226231991156e-02 +-2.43372745320299e-02 -2.57184348870079e-02 -2.70658701282321e-02 -2.83793505662834e-02 +-2.96586512778198e-02 -3.09035524665597e-02 -3.21138398616154e-02 -3.32893051491338e-02 +-3.44297464331470e-02 -3.55349687215124e-02 -3.66047844328455e-02 -3.76390139203982e-02 +-3.86374860089179e-02 -3.96000385406425e-02 -4.05265189267143e-02 -4.14167847004809e-02 +-4.22707040693161e-02 -4.30881564618154e-02 -4.38690330674360e-02 -4.46132373658758e-02 +-4.53206856437435e-02 -4.59913074963034e-02 -4.66250463123349e-02 -4.72218597403950e-02 +-4.77817201350153e-02 -4.83046149816036e-02 -4.87905472990492e-02 
-4.92395360192481e-02 +-4.96516163429702e-02 -5.00268400716717e-02 -5.03652759150372e-02 -5.06670097741729e-02 +-5.09321450005186e-02 -5.11608026306430e-02 -5.13531215971881e-02 -5.15092589162843e-02 +-5.16293898518139e-02 -5.17137080569200e-02 -5.17624256931650e-02 -5.17757735277252e-02 +-5.17540010089834e-02 -5.16973763208191e-02 -5.16061864158465e-02 -5.14807370277616e-02 +-5.13213526628746e-02 -5.11283765708157e-02 -5.09021706942796e-02 -5.06431155975889e-02 +-5.03516103737286e-02 -5.00280725294056e-02 -4.96729378475809e-02 -4.92866602268164e-02 +-4.88697114966946e-02 -4.84225812084818e-02 -4.79457764001396e-02 -4.74398213347327e-02 +-4.69052572112426e-02 -4.63426418467758e-02 -4.57525493291529e-02 -4.51355696388802e-02 +-4.44923082395470e-02 -4.38233856357493e-02 -4.31294368977213e-02 -4.24111111519637e-02 +-4.16690710372760e-02 -4.09039921257502e-02 -4.01165623084475e-02 -3.93074811456619e-02 +-3.84774591818797e-02 -3.76272172257596e-02 -3.67574855956890e-02 -3.58690033317208e-02 +-3.49625173749396e-02 -3.40387817155770e-02 -3.30985565114521e-02 -3.21426071785878e-02 +-3.11717034561137e-02 -3.01866184478337e-02 -2.91881276430916e-02 -2.81770079198110e-02 +-2.71540365328256e-02 -2.61199900908315e-02 -2.50756435254977e-02 -2.40217690564450e-02 +-2.29591351559761e-02 -2.18885055175541e-02 -2.08106380321513e-02 -1.97262837766565e-02 +-1.86361860185808e-02 -1.75410792413150e-02 -1.64416881941826e-02 -1.53387269714722e-02 +-1.42328981245576e-02 -1.31248918110967e-02 -1.20153849851448e-02 -1.09050406318438e-02 +-9.79450705012719e-03 -8.68441718664159e-03 -7.57538802380773e-03 -6.46802002464899e-03 +-5.36289663668559e-03 -4.26058385684581e-03 -3.16162985898280e-03 -2.06656468519760e-03 +-9.75900001768642e-04 1.09871079910729e-04 1.19027411741164e-03 2.26485338680255e-03 +3.33317186849408e-03 4.39481117067023e-03 5.44937139198806e-03 6.49647092562665e-03 +7.53574620714751e-03 8.56685140900396e-03 9.58945808486493e-03 1.06032547672593e-02 +1.16079465223306e-02 
1.26032544657568e-02 1.35889152441269e-02 1.45646804862648e-02 +1.55303162291454e-02 1.64856023231879e-02 1.74303318217789e-02 1.83643103599382e-02 +1.92873555270397e-02 2.01992962384657e-02 2.10999721109946e-02 2.19892328466145e-02 +2.28669376292914e-02 2.37329545390366e-02 2.45871599873854e-02 2.54294381781384e-02 +2.62596805969241e-02 2.70777855328189e-02 2.78836576349048e-02 2.86772075062834e-02 +2.94583513376603e-02 3.02270105822138e-02 3.09831116730333e-02 3.17265857839846e-02 +3.24573686344209e-02 3.31754003377236e-02 3.38806252932164e-02 3.45729921205720e-02 +3.52524536354113e-02 3.59189668643911e-02 3.65724930976872e-02 3.72129979764233e-02 +3.78404516122447e-02 3.84548287359372e-02 3.90561088716968e-02 3.96442765334193e-02 +4.02193214391599e-02 4.07812387397395e-02 4.13300292573394e-02 4.18656997298300e-02 +4.23882630565241e-02 4.28977385410332e-02 4.33941521269357e-02 4.38775366220330e-02 +4.43479319070843e-02 4.48053851250600e-02 4.52499508471446e-02 4.56816912119461e-02 +4.61006760346262e-02 4.65069828829647e-02 4.69006971176901e-02 4.72819118947540e-02 +4.76507281276067e-02 4.80072544079149e-02 4.83516068835754e-02 4.86839090932947e-02 +4.90042917574366e-02 4.93128925252691e-02 4.96098556791757e-02 4.98953317968266e-02 +5.01694773727273e-02 5.04324544009676e-02 5.06844299213963e-02 5.09255755318144e-02 +5.11560668691365e-02 5.13760830627996e-02 5.15858061639863e-02 5.17854205545095e-02 +5.19751123394246e-02 5.21550687276370e-02 5.23254774049307e-02 5.24865259039526e-02 +5.26384009757749e-02 5.27812879676804e-02 5.29153702118159e-02 5.30408284293041e-02 +5.31578401543156e-02 5.32665791824650e-02 5.33672150477311e-02 5.34599125318795e-02 +5.35448312101285e-02 5.36221250365100e-02 5.36919419720728e-02 5.37544236587237e-02 +5.38097051411429e-02 5.38579146388171e-02 5.38991733698238e-02 5.39335954275777e-02 +5.39612877113170e-02 5.39823499106597e-02 5.39968745441185e-02 5.40049470510161e-02 +5.40066459358003e-02 5.40020429633284e-02 5.39912034032693e-02 
5.39741863213681e-02 +5.39510449149375e-02 5.39218268895730e-02 5.38865748737652e-02 5.38453268677684e-02 +5.37981167228204e-02 5.37449746465668e-02 5.36859277303439e-02 5.36210004938116e-02 +5.35502154423068e-02 5.34735936322036e-02 5.33911552395280e-02 5.33029201270793e-02 +5.32089084053454e-02 5.31091409825941e-02 5.30036400996398e-02 5.28924298449483e-02 +5.27755366459512e-02 5.26529897326668e-02 5.25248215700025e-02 5.23910682554102e-02 +5.22517698788951e-02 5.21069708427301e-02 5.19567201386039e-02 5.18010715803215e-02 +5.16400839905837e-02 5.14738213407865e-02 5.13023528432074e-02 5.11257529953743e-02 +5.09441015768294e-02 5.07574835989321e-02 5.05659892087488e-02 5.03697135484778e-02 +5.01687565722448e-02 4.99632228224604e-02 4.97532211682799e-02 4.95388645090125e-02 +4.93202694456199e-02 4.90975559236949e-02 4.88708468515329e-02 4.86402676970991e-02 +4.84059460678393e-02 4.81680112773978e-02 4.79265939033770e-02 4.76818253403097e-02 +4.74338373520070e-02 4.71827616274032e-02 4.69287293439337e-02 4.66718707423661e-02 +4.64123147168433e-02 4.61501884237116e-02 4.58856169124818e-02 4.56187227820179e-02 +4.53496258647712e-02 4.50784429415659e-02 4.48052874891231e-02 4.45302694621591e-02 +4.42534951115367e-02 4.39750668395752e-02 4.36950830932447e-02 4.34136382955881e-02 +4.31308228153268e-02 4.28467229742296e-02 4.25614210914418e-02 4.22749955636169e-02 +4.19875209793369e-02 4.16990682659732e-02 4.14097048668342e-02 4.11194949461479e-02 +4.08284996191703e-02 4.05367772044726e-02 4.02443834952561e-02 3.99513720463725e-02 +3.96577944735877e-02 3.93637007615257e-02 3.90691395766622e-02 3.87741585817073e-02 +3.84788047477237e-02 3.81831246603683e-02 3.78871648167292e-02 3.75909719093385e-02 +3.72945930940960e-02 3.69980762390155e-02 3.67014701509200e-02 3.64048247774509e-02 +3.61081913820230e-02 3.58116226896471e-02 3.55151730018521e-02 3.52188982792641e-02 +3.49228561907476e-02 3.46271061283577e-02 3.43317091877209e-02 3.40367281138208e-02 +3.37422272125301e-02 
3.34482722285933e-02 3.31549301911135e-02 3.28622692279478e-02 +3.25703583507371e-02 3.22792672126168e-02 3.19890658409437e-02 3.16998243476461e-02 +3.14116126200480e-02 3.11244999952376e-02 3.08385549212349e-02 3.05538446083694e-02 +3.02704346744007e-02 2.99883887870031e-02 2.97077683072849e-02 2.94286319380315e-02 +2.91510353803390e-02 2.88750310022496e-02 2.86006675229102e-02 2.83279897156439e-02 +2.80570381331711e-02 2.77878488580179e-02 2.75204532809331e-02 2.72548779098815e-02 +2.69911442119055e-02 2.67292684898501e-02 2.64692617956219e-02 2.62111298813239e-02 +2.59548731892472e-02 2.57004868813490e-02 2.54479609084698e-02 2.51972801191728e-02 +2.49484244077178e-02 2.47013689003078e-02 2.44560841783863e-02 2.42125365374107e-02 +2.39706882791865e-02 2.37304980355276e-02 2.34919211207014e-02 2.32549099098435e-02 +2.30194142402692e-02 2.27853818323825e-02 2.25527587266919e-02 2.23214897332738e-02 +2.20915188898973e-02 2.18627899249279e-02 2.16352467210679e-02 2.14088337759702e-02 +2.11834966557741e-02 2.09591824376641e-02 2.07358401376388e-02 2.05134211197985e-02 +2.02918794836202e-02 2.00711724258739e-02 1.98512605740594e-02 1.96321082884920e-02 +1.94136839304392e-02 1.91959600940200e-02 1.89789137998939e-02 1.87625266491130e-02 +1.85467849358684e-02 1.83316797182285e-02 1.81172068463491e-02 1.79033669480168e-02 +1.76901653717729e-02 1.74776120882489e-02 1.72657215507217e-02 1.70545125162687e-02 +1.68440078292554e-02 1.66342341692342e-02 1.64252217656496e-02 1.62170040820518e-02 +1.60096174727922e-02 1.58031008154227e-02 1.55974951222438e-02 1.53928431346295e-02 +1.51891889039140e-02 1.49865773627443e-02 1.47850538908861e-02 1.45846638795176e-02 +1.43854522980568e-02 1.41874632675385e-02 1.39907396444970e-02 1.37953226192057e-02 +1.36012513319934e-02 1.34085625111835e-02 1.32172901360025e-02 1.30274651275704e-02 +1.28391150708224e-02 1.26522639699259e-02 1.24669320394431e-02 1.22831355331597e-02 +1.21008866121481e-02 1.19201932532734e-02 1.17410591989700e-02 
1.15634839487404e-02 +1.13874627924383e-02 1.12129868850085e-02 1.10400433619795e-02 1.08686154946206e-02 +1.06986828833112e-02 1.05302216873182e-02 1.03632048888361e-02 1.01976025888321e-02 +1.00333823319390e-02 9.87050945737666e-03 9.70894747263249e-03 9.54865844642942e-03 +9.38960341732410e-03 9.23174281413423e-03 9.07503688428581e-03 8.91944612609148e-03 +8.76493172093570e-03 8.61145596134157e-03 8.45898267092507e-03 8.30747761232020e-03 +8.15690887925976e-03 8.00724726914373e-03 7.85846663259793e-03 7.71054419673877e-03 +7.56346085909004e-03 7.41720144936774e-03 7.27175495663742e-03 7.12711471965823e-03 +6.98327857856338e-03 6.84024898637288e-03 6.69803307919582e-03 6.55664270435345e-03 +6.41609440602892e-03 6.27640936843479e-03 6.13761331686651e-03 5.99973637739066e-03 +5.86281289628358e-03 5.72688122069677e-03 5.59198344237017e-03 5.45816510654219e-03 +5.32547488851370e-03 5.19396424060840e-03 5.06368701253089e-03 4.93469904835749e-03 +4.80705776359418e-03 4.68082170590717e-03 4.55605010326752e-03 4.43280240335010e-03 +4.31113780808967e-03 4.19111480731926e-03 4.07279071539649e-03 3.95622121465693e-03 +3.84145990941738e-03 3.72855789407972e-03 3.61756333864661e-03 3.50852109464207e-03 +3.40147232401961e-03 3.29645415311019e-03 3.19349935299038e-03 3.09263604680560e-03 +2.99388744352187e-03 2.89727159627177e-03 2.80280118186861e-03 2.71048329616231e-03 +2.62031925770382e-03 2.53230440970586e-03 2.44642790763094e-03 2.36267247707583e-03 +2.28101412422562e-03 2.20142177943290e-03 2.12385685396539e-03 2.04827269136590e-03 +1.97461389899686e-03 1.90281555310836e-03 1.83280228310201e-03 1.76448725836534e-03 +1.69777112464835e-03 1.63254096648618e-03 1.56866940693531e-03 1.50601399423653e-03 +1.44441706408425e-03 1.38370630184329e-03 1.32369625584077e-03 1.26419106424568e-03 +1.20498864682543e-03 1.14588657190874e-03 1.08668973209755e-03 1.02721984580016e-03 +9.67326645204068e-04 9.06900419321748e-04 8.45885363438846e-04 7.84292960088225e-04 +7.22214403843118e-04 
6.59830909754835e-04 5.97420642499392e-04 5.35360998821627e-04 +4.74125093455818e-04 4.14271553153984e-04 3.56427116556186e-04 3.01262055105794e-04 +2.49459040284683e-04 2.01676736565497e-04 1.58510035509478e-04 1.20449394349500e-04 +8.78421319091020e-05 6.08587042352023e-05 3.94668878894923e-05 2.34164218147664e-05 +1.22360108460612e-05 5.24371904244346e-06 1.57075212911314e-06 1.97540990176965e-07 +0.00000000000000e+00 + Type L N + 0 1 0 +0.00000000000000e+00 -4.54093913860524e-03 -9.07970681469088e-03 -1.36141427090386e-02 +-1.81421087025845e-02 -2.26614997610921e-02 -2.71702545633380e-02 -3.16663657906285e-02 +-3.61478899982646e-02 -4.06129569930355e-02 -4.50597786448995e-02 -4.94866570656897e-02 +-5.38919920929056e-02 -5.82742880223829e-02 -6.26321595398218e-02 -6.69643368077494e-02 +-7.12696696714279e-02 -7.55471309544556e-02 -7.97958188222561e-02 -8.40149581992814e-02 +-8.82039012334648e-02 -9.23621268092178e-02 -9.64892391179846e-02 -1.00584965302991e-01 +-1.04649152202286e-01 -1.08681762221414e-01 -1.12682868374006e-01 -1.16652648535189e-01 +-1.20591378958920e-01 -1.24499427116130e-01 -1.28377243915797e-01 -1.32225355375823e-01 +-1.36044353814732e-01 -1.39834888638713e-01 -1.43597656801476e-01 -1.47333393016592e-01 +-1.51042859803540e-01 -1.54726837449604e-01 -1.58386113969863e-01 -1.62021475147091e-01 +-1.65633694732100e-01 -1.69223524883277e-01 -1.72791686921464e-01 -1.76338862473260e-01 +-1.79865685072024e-01 -1.83372732281585e-01 -1.86860518402844e-01 -1.90329487818106e-01 +-1.93780009022309e-01 -1.97212369384153e-01 -2.00626770673715e-01 -2.04023325386432e-01 +-2.07402053886430e-01 -2.10762882385097e-01 -2.14105641763669e-01 -2.17430067241435e-01 +-2.20735798884027e-01 -2.24022382939224e-01 -2.27289273980862e-01 -2.30535837834748e-01 +-2.33761355254131e-01 -2.36965026306217e-01 -2.40145975425549e-01 -2.43303257084827e-01 +-2.46435862028955e-01 -2.49542724013837e-01 -2.52622726987708e-01 -2.55674712649596e-01 +-2.58697488316981e-01 
-2.61689835032695e-01 -2.64650515839799e-01 -2.67578284152449e-01 +-2.70471892150660e-01 -2.73330099127448e-01 -2.76151679717961e-01 -2.78935431941985e-01 +-2.81680184993548e-01 -2.84384806714249e-01 -2.87048210690357e-01 -2.89669362917645e-01 +-2.92247287982309e-01 -2.94781074711086e-01 -2.97269881248836e-01 -2.99712939527349e-01 +-3.02109559094821e-01 -3.04459130281438e-01 -3.06761126682580e-01 -3.09015106947369e-01 +-3.11220715866558e-01 -3.13377684759972e-01 -3.15485831169930e-01 -3.17545057873092e-01 +-3.19555351229105e-01 -3.21516778890068e-01 -3.23429486900216e-01 -3.25293696220328e-01 +-3.27109698716032e-01 -3.28877852653548e-01 -3.30598577750232e-01 -3.32272349830725e-01 +-3.33899695142427e-01 -3.35481184386415e-01 -3.37017426521776e-01 -3.38509062402702e-01 +-3.39956758308403e-01 -3.41361199426187e-01 -3.42723083347673e-01 -3.44043113637266e-01 +-3.45321993530629e-01 -3.46560419818945e-01 -3.47759076972413e-01 -3.48918631553526e-01 +-3.50039726967392e-01 -3.51122978592692e-01 -3.52168969332790e-01 -3.53178245622152e-01 +-3.54151313918588e-01 -3.55088637706937e-01 -3.55990635034765e-01 -3.56857676595438e-01 +-3.57690084368637e-01 -3.58488130823072e-01 -3.59252038680826e-01 -3.59981981237503e-01 +-3.60678083227217e-01 -3.61340422216481e-01 -3.61969030506222e-01 -3.62563897516661e-01 +-3.63124972625478e-01 -3.63652168425780e-01 -3.64145364366762e-01 -3.64604410736772e-01 +-3.65029132945667e-01 -3.65419336061017e-01 -3.65774809550738e-01 -3.66095332183357e-01 +-3.66380677036058e-01 -3.66630616560223e-01 -3.66844927654101e-01 -3.67023396692736e-01 +-3.67165824466169e-01 -3.67272030978319e-01 -3.67341860060735e-01 -3.67375183757656e-01 +-3.67371906441393e-01 -3.67331968620057e-01 -3.67255350402929e-01 -3.67142074592392e-01 +-3.66992209375189e-01 -3.66805870589863e-01 -3.66583223551508e-01 -3.66324484419329e-01 +-3.66029921097057e-01 -3.65699853660787e-01 -3.65334654313356e-01 -3.64934746868954e-01 +-3.64500605776066e-01 -3.64032754691198e-01 
-3.63531764620026e-01 -3.62998251646551e-01 +-3.62432874274643e-01 -3.61836330409789e-01 -3.61209354012067e-01 -3.60552711454246e-01 +-3.59867197621380e-01 -3.59153631790449e-01 -3.58412853330334e-01 -3.57645717263755e-01 +-3.56853089733788e-01 -3.56035843418051e-01 -3.55194852933803e-01 -3.54330990276886e-01 +-3.53445120336704e-01 -3.52538096528377e-01 -3.51610756581650e-01 -3.50663918524339e-01 +-3.49698376895840e-01 -3.48714899223734e-01 -3.47714222793689e-01 -3.46697051739756e-01 +-3.45664054478868e-01 -3.44615861509796e-01 -3.43553063593161e-01 -3.42476210325258e-01 +-3.41385809114584e-01 -3.40282324565957e-01 -3.39166178273201e-01 -3.38037749017383e-01 +-3.36897373363745e-01 -3.35745346646652e-01 -3.34581924328286e-01 -3.33407323713292e-01 +-3.32221725998327e-01 -3.31025278632441e-01 -3.29818097961383e-01 -3.28600272126489e-01 +-3.27371864186564e-01 -3.26132915429324e-01 -3.24883448837431e-01 -3.23623472672975e-01 +-3.22352984143443e-01 -3.21071973111754e-01 -3.19780425812865e-01 -3.18478328539746e-01 +-3.17165671262120e-01 -3.15842451142421e-01 -3.14508675914691e-01 -3.13164367093826e-01 +-3.11809562984547e-01 -3.10444321461662e-01 -3.09068722495754e-01 -3.07682870401110e-01 +-3.06286895785664e-01 -3.04880957185837e-01 -3.03465242372414e-01 -3.02039969316952e-01 +-3.00605386811672e-01 -2.99161774739244e-01 -2.97709443992396e-01 -2.96248736046722e-01 +-2.94780022193473e-01 -2.93303702442437e-01 -2.91820204108187e-01 -2.90329980096030e-01 +-2.88833506906824e-01 -2.87331282382479e-01 -2.85823823216361e-01 -2.84311662254991e-01 +-2.82795345619269e-01 -2.81275429675072e-01 -2.79752477884354e-01 -2.78227057568812e-01 +-2.76699736618864e-01 -2.75171080180967e-01 -2.73641647356326e-01 -2.72111987943664e-01 +-2.70582639258123e-01 -2.69054123057338e-01 -2.67526942604519e-01 -2.66001579896772e-01 +-2.64478493085109e-01 -2.62958114110514e-01 -2.61440846578134e-01 -2.59927063889182e-01 +-2.58417107647447e-01 -2.56911286354518e-01 -2.55409874404847e-01 
-2.53913111388771e-01 +-2.52421201708530e-01 -2.50934314509154e-01 -2.49452583923033e-01 -2.47976109623842e-01 +-2.46504957682498e-01 -2.45039161714903e-01 -2.43578724308370e-01 -2.42123618711023e-01 +-2.40673790765921e-01 -2.39229161069375e-01 -2.37789627330873e-01 -2.36355066910127e-01 +-2.34925339505233e-01 -2.33500289964556e-01 -2.32079751193951e-01 -2.30663547130137e-01 +-2.29251495750607e-01 -2.27843412090284e-01 -2.26439111235230e-01 -2.25038411264190e-01 +-2.23641136109407e-01 -2.22247118309164e-01 -2.20856201625743e-01 -2.19468243504003e-01 +-2.18083117347521e-01 -2.16700714591191e-01 -2.15320946551330e-01 -2.13943746036695e-01 +-2.12569068706251e-01 -2.11196894162184e-01 -2.09827226769319e-01 -2.08460096194902e-01 +-2.07095557665519e-01 -2.05733691940747e-01 -2.04374605005980e-01 -2.03018427489623e-01 +-2.01665313812585e-01 -2.00315441080611e-01 -1.98969007732488e-01 -1.97626231959520e-01 +-1.96287349913857e-01 -1.94952613725256e-01 -1.93622289347660e-01 -1.92296654258534e-01 +-1.90975995035253e-01 -1.89660604833914e-01 -1.88350780796762e-01 -1.87046821415001e-01 +-1.85749023874038e-01 -1.84457681408243e-01 -1.83173080692047e-01 -1.81895499293703e-01 +-1.80625203217241e-01 -1.79362444557147e-01 -1.78107459289024e-01 -1.76860465217997e-01 +-1.75621660104954e-01 -1.74391219988799e-01 -1.73169297720856e-01 -1.71956021725343e-01 +-1.70751494997510e-01 -1.69555794348578e-01 -1.68368969904126e-01 -1.67191044859967e-01 +-1.66022015496993e-01 -1.64861851453832e-01 -1.63710496253622e-01 -1.62567868078640e-01 +-1.61433860784114e-01 -1.60308345140165e-01 -1.59191170288606e-01 -1.58082165399239e-01 +-1.56981141508362e-01 -1.55887893520462e-01 -1.54802202352525e-01 -1.53723837199054e-01 +-1.52652557894798e-01 -1.51588117351295e-01 -1.50530264042741e-01 -1.49478744516279e-01 +-1.48433305901697e-01 -1.47393698395620e-01 -1.46359677695673e-01 -1.45331007360690e-01 +-1.44307461073868e-01 -1.43288824786898e-01 -1.42274898724315e-01 -1.41265499228860e-01 
+-1.40260460430296e-01 -1.39259635721952e-01 -1.38262899031276e-01 -1.37270145872794e-01 +-1.36281294174087e-01 -1.35296284867718e-01 -1.34315082244391e-01 -1.33337674065064e-01 +-1.32364071432117e-01 -1.31394308422106e-01 -1.30428441485001e-01 -1.29466548617110e-01 +-1.28508728317125e-01 -1.27555098336854e-01 -1.26605794240199e-01 -1.25660967785800e-01 +-1.24720785150453e-01 -1.23785425011939e-01 -1.22855076511214e-01 -1.21929937115030e-01 +-1.21010210400968e-01 -1.20096103787539e-01 -1.19187826232451e-01 -1.18285585922383e-01 +-1.17389587977564e-01 -1.16500032194214e-01 -1.15617110847440e-01 -1.14741006576460e-01 +-1.13871890373113e-01 -1.13009919693494e-01 -1.12155236711237e-01 -1.11307966729454e-01 +-1.10468216766706e-01 -1.09636074330541e-01 -1.08811606390212e-01 -1.07994858558155e-01 +-1.07185854487635e-01 -1.06384595491823e-01 -1.05591060387267e-01 -1.04805205562498e-01 +-1.04026965270235e-01 -1.03256252139405e-01 -1.02492957901014e-01 -1.01736954319797e-01 +-1.00988094321485e-01 -1.00246213303678e-01 -9.95111306164519e-02 -9.87826511972151e-02 +-9.80605673428173e-02 -9.73446606006142e-02 -9.66347037590595e-02 -9.59304629174668e-02 +-9.52316996138709e-02 -9.45381729894065e-02 -9.38496419673395e-02 -9.31658674248163e-02 +-9.24866143355521e-02 -9.18116538620479e-02 -9.11407653765128e-02 -9.04737383904576e-02 +-8.98103743739169e-02 -8.91504884464315e-02 -8.84939109232776e-02 -8.78404887019392e-02 +-8.71900864754828e-02 -8.65425877612730e-02 -8.58978957353719e-02 -8.52559338649454e-02 +-8.46166463330572e-02 -8.39799982523428e-02 -8.33459756661828e-02 -8.27145853381398e-02 +-8.20858543325500e-02 -8.14598293912478e-02 -8.08365761134373e-02 -8.02161779476796e-02 +-7.95987350068253e-02 -7.89843627184631e-02 -7.83731903250747e-02 -7.77653592495447e-02 +-7.71610213429825e-02 -7.65603370329376e-02 -7.59634733910403e-02 -7.53706021398395e-02 +-7.47818976191698e-02 -7.41975347327106e-02 -7.36176868955447e-02 -7.30425240034416e-02 +-7.24722104443088e-02 
-7.19069031717695e-02 -7.13467498601338e-02 -7.07918871591544e-02 +-7.02424390658980e-02 -6.96985154298283e-02 -6.91602106058112e-02 -6.86276022682154e-02 +-6.81007503976228e-02 -6.75796964498888e-02 -6.70644627154282e-02 -6.65550518746653e-02 +-6.60514467535885e-02 -6.55536102813301e-02 -6.50614856496430e-02 -6.45749966721209e-02 +-6.40940483389940e-02 -6.36185275613816e-02 -6.31483040969825e-02 -6.26832316473871e-02 +-6.22231491154844e-02 -6.17678820098618e-02 -6.13172439816449e-02 -6.08710384779326e-02 +-6.04290604948476e-02 -5.99910984122650e-02 -5.95569358915043e-02 -5.91263538166859e-02 +-5.86991322600623e-02 -5.82750524514430e-02 -5.78538987318435e-02 -5.74354604716955e-02 +-5.70195339343648e-02 -5.66059240663221e-02 -5.61944461960976e-02 -5.57849276251132e-02 +-5.53772090946209e-02 -5.49711461142585e-02 -5.45666101391687e-02 -5.41634895841829e-02 +-5.37616906652432e-02 -5.33611380599988e-02 -5.29617753813586e-02 -5.25635654596794e-02 +-5.21664904312111e-02 -5.17705516323773e-02 -5.13757693014300e-02 -5.09821820909579e-02 +-5.05898463966208e-02 -5.01988355093348e-02 -4.98092385998862e-02 -4.94211595466292e-02 +-4.90347156184808e-02 -4.86500360268573e-02 -4.82672603614898e-02 -4.78865369261970e-02 +-4.75080209916660e-02 -4.71318729830956e-02 -4.67582566211722e-02 -4.63873370352857e-02 +-4.60192788681287e-02 -4.56542443908728e-02 -4.52923916479697e-02 -4.49338726502837e-02 +-4.45788316347420e-02 -4.42274034079768e-02 -4.38797117905530e-02 -4.35358681773257e-02 +-4.31959702282714e-02 -4.28601007027872e-02 -4.25283264489845e-02 -4.22006975579096e-02 +-4.18772466909463e-02 -4.15579885868838e-02 -4.12429197533133e-02 -4.09320183451381e-02 +-4.06252442310946e-02 -4.03225392472721e-02 -4.00238276347380e-02 -3.97290166565132e-02 +-3.94379973873428e-02 -3.91506456679663e-02 -3.88668232139475e-02 -3.85863788675711e-02 +-3.83091499798918e-02 -3.80349639087189e-02 -3.77636396171746e-02 -3.74949893564642e-02 +-3.72288204156764e-02 -3.69649369207702e-02 
-3.67031416644403e-02 -3.64432379482605e-02 +-3.61850314184103e-02 -3.59283318763747e-02 -3.56729550462894e-02 -3.54187242810632e-02 +-3.51654721900502e-02 -3.49130421718606e-02 -3.46612898368774e-02 -3.44100843051797e-02 +-3.41593093668489e-02 -3.39088644930422e-02 -3.36586656877378e-02 -3.34086461716801e-02 +-3.31587568917558e-02 -3.29089668508080e-02 -3.26592632547116e-02 -3.24096514753877e-02 +-3.21601548302945e-02 -3.19108141807879e-02 -3.16616873535738e-02 -3.14128483912601e-02 +-3.11643866397357e-02 -3.09164056817504e-02 -3.06690221276139e-02 -3.04223642753691e-02 +-3.01765706541037e-02 -2.99317884652355e-02 -2.96881719376273e-02 -2.94458806132464e-02 +-2.92050775807740e-02 -2.89659276750855e-02 -2.87285956608547e-02 -2.84932444186842e-02 +-2.82600331521291e-02 -2.80291156337566e-02 -2.78006385079836e-02 -2.75747396678458e-02 +-2.73515467221023e-02 -2.71311755681530e-02 -2.69137290851746e-02 -2.66992959606553e-02 +-2.64879496621603e-02 -2.62797475646829e-02 -2.60747302423645e-02 -2.58729209316990e-02 +-2.56743251716077e-02 -2.54789306239767e-02 -2.52867070764283e-02 -2.50976066272557e-02 +-2.49115640506100e-02 -2.47284973382084e-02 -2.45483084120496e-02 -2.43708840008963e-02 +-2.41960966716334e-02 -2.40238060050429e-02 -2.38538599040846e-02 -2.36860960214288e-02 +-2.35203432917874e-02 -2.33564235535305e-02 -2.31941532431719e-02 -2.30333451455752e-02 +-2.28738101821619e-02 -2.27153592190257e-02 -2.25578048766471e-02 -2.24009633228885e-02 +-2.22446560311130e-02 -2.20887114856228e-02 -2.19329668171356e-02 -2.17772693517255e-02 +-2.16214780575202e-02 -2.14654648744751e-02 -2.13091159137225e-02 -2.11523325143018e-02 +-2.09950321465161e-02 -2.08371491527025e-02 -2.06786353178387e-02 -2.05194602641275e-02 +-2.03596116654669e-02 -2.01990952795386e-02 -2.00379347970768e-02 -1.98761715097325e-02 +-1.97138637997733e-02 -1.95510864566632e-02 -1.93879298273125e-02 -1.92244988084745e-02 +-1.90609116913610e-02 -1.88972988700490e-02 -1.87338014266320e-02 
-1.85705696073252e-02 +-1.84077612048425e-02 -1.82455398633235e-02 -1.80840733228859e-02 -1.79235316214981e-02 +-1.77640852723201e-02 -1.76059034349206e-02 -1.74491520988644e-02 -1.72939922980522e-02 +-1.71405783739109e-02 -1.69890563050570e-02 -1.68395621204064e-02 -1.66922204118833e-02 +-1.65471429618980e-02 -1.64044274996233e-02 -1.62641565988224e-02 -1.61263967285703e-02 +-1.59911974666859e-02 -1.58585908840680e-02 -1.57285911064174e-02 -1.56011940580518e-02 +-1.54763773906927e-02 -1.53541005982457e-02 -1.52343053167256e-02 -1.51169158066130e-02 +-1.50018396130862e-02 -1.48889683977775e-02 -1.47781789339649e-02 -1.46693342554518e-02 +-1.45622849478236e-02 -1.44568705693178e-02 -1.43529211872170e-02 -1.42502590144858e-02 +-1.41487001303335e-02 -1.40480562675106e-02 -1.39481366484405e-02 -1.38487498517591e-02 +-1.37497056904924e-02 -1.36508170829428e-02 -1.35519018973885e-02 -1.34527847519192e-02 +-1.33532987511397e-02 -1.32532871420654e-02 -1.31526048723008e-02 -1.30511200345320e-02 +-1.29487151824674e-02 -1.28452885046108e-02 -1.27407548436412e-02 -1.26350465506924e-02 +-1.25281141654493e-02 -1.24199269146993e-02 -1.23104730237756e-02 -1.21997598371882e-02 +-1.20878137466351e-02 -1.19746799265134e-02 -1.18604218789715e-02 -1.17451207924579e-02 +-1.16288747195964e-02 -1.15117975820430e-02 -1.13940180117309e-02 -1.12756780395738e-02 +-1.11569316442573e-02 -1.10379431751840e-02 -1.09188856649407e-02 -1.07999390478080e-02 +-1.06812883018271e-02 -1.05631215327575e-02 -1.04456280189042e-02 -1.03289962362473e-02 +-1.02134118835747e-02 -1.00990559273854e-02 -9.98610268621148e-03 -9.87471797368298e-03 +-9.76505731915248e-03 -9.65726428399385e-03 -9.55146889081165e-03 -9.44778618174344e-03 +-9.34631492082075e-03 -9.24713645398567e-03 -9.15031373885382e-03 -9.05589055468160e-03 +-8.96389090125617e-03 -8.87431859359432e-03 -8.78715705742920e-03 -8.70236932850171e-03 +-8.61989825667412e-03 -8.53966691386652e-03 -8.46157920280256e-03 -8.38552066155821e-03 
+-8.31135945695655e-03 -8.23894755796116e-03 -8.16812207841030e-03 -8.09870677671911e-03 +-8.03051369857926e-03 -7.96334494721444e-03 -7.89699456442705e-03 -7.83125050450390e-03 +-7.76589668205375e-03 -7.70071507403267e-03 -7.63548785558655e-03 -7.56999954890842e-03 +-7.50403916407712e-03 -7.43740231081511e-03 -7.36989326028023e-03 -7.30132693638502e-03 +-7.23153081671616e-03 -7.16034672390183e-03 -7.08763248923777e-03 -7.01326347152452e-03 +-6.93713391538182e-03 -6.85915813477393e-03 -6.77927150909377e-03 -6.69743128089418e-03 +-6.61361714620933e-03 -6.52783163035719e-03 -6.44010024413983e-03 -6.35047141744038e-03 +-6.25901620933587e-03 -6.16582779598291e-03 -6.07102073966719e-03 -5.97473004452034e-03 +-5.87711000647370e-03 -5.77833286702514e-03 -5.67858728231435e-03 -5.57807662082457e-03 +-5.47701710472767e-03 -5.37563581145873e-03 -5.27416855351669e-03 -5.17285765574288e-03 +-5.07194965039661e-03 -4.97169291123620e-03 -4.87233524849552e-03 -4.77412148713028e-03 +-4.67729105097450e-03 -4.58207557549774e-03 -4.48869657168463e-03 -4.39736316315781e-03 +-4.30826991803922e-03 -4.22159479617706e-03 -4.13749723125494e-03 -4.05611636592377e-03 +-3.97756945643826e-03 -3.90195046130020e-03 -3.82932882606418e-03 -3.75974847367683e-03 +-3.69322700641042e-03 -3.62975512149790e-03 -3.56929623784568e-03 -3.51178632552908e-03 +-3.45713392301055e-03 -3.40522031901491e-03 -3.35589986668368e-03 -3.30900038704494e-03 +-3.26432360725074e-03 -3.22164556700392e-03 -3.18071691515725e-03 -3.14126300919561e-03 +-3.10298372551865e-03 -3.06555289119680e-03 -3.02861726202310e-03 -2.99179500169794e-03 +-2.95467366758180e-03 -2.91680778402562e-03 -2.87771618797152e-03 -2.83687946405376e-03 +-2.79373794474877e-03 -2.74769092694146e-03 -2.69809793478074e-03 -2.64428301784688e-03 +-2.58554318421704e-03 -2.52116209504184e-03 -2.45043005295907e-03 -2.37267106522889e-03 +-2.28727732619719e-03 -2.19375083047717e-03 -2.09175100907235e-03 -1.98114631610585e-03 +-1.86206665782953e-03 
-1.73495255512714e-03 -1.60059609997802e-03 -1.46016825466369e-03 +-1.31522699662608e-03 -1.16770135362757e-03 -1.01984757613789e-03 -8.74175558512122e-04 +-7.33346062280289e-04 -6.00042135495552e-04 -4.76821097715059e-04 -3.65956243267344e-04 +-2.69279650361567e-04 -1.88038832207024e-04 -1.22780157814445e-04 -7.32708473556437e-05 +-3.84689013525885e-05 -1.65467114203757e-05 -4.96963888888944e-06 -6.25981473314889e-07 +0.00000000000000e+00 + Type L N + 0 1 1 +0.00000000000000e+00 4.69232677538598e-03 9.38620640957856e-03 1.40831752847260e-02 +1.87847369622547e-02 2.34923461028794e-02 2.82073927799375e-02 3.29311873119893e-02 +3.76649457362469e-02 4.24097760390039e-02 4.71666652528635e-02 5.19364675232755e-02 +5.67198932387388e-02 6.15174993100975e-02 6.63296806747091e-02 7.11566630909937e-02 +7.59984972780630e-02 8.08550544438761e-02 8.57260232337639e-02 9.06109081193169e-02 +9.55090292356335e-02 1.00419523662885e-01 1.05341348136178e-01 1.10273283155875e-01 +1.15213938458970e-01 1.20161759800955e-01 1.25115036986833e-01 1.30071913079783e-01 +1.35030394706413e-01 1.39988363368705e-01 1.44943587664696e-01 1.49893736312758e-01 +1.54836391868062e-01 1.59769065014481e-01 1.64689209310915e-01 1.69594236267738e-01 +1.74481530626890e-01 1.79348465718042e-01 1.84192418763222e-01 1.89010786003399e-01 +1.93800997522641e-01 1.98560531648686e-01 2.03286928812962e-01 2.07977804758338e-01 +2.12630862988965e-01 2.17243906363650e-01 2.21814847741960e-01 2.26341719600881e-01 +2.30822682549064e-01 2.35256032675500e-01 2.39640207679797e-01 2.43973791741947e-01 +2.48255519100482e-01 2.52484276319189e-01 2.56659103233881e-01 2.60779192582120e-01 +2.64843888330068e-01 2.68852682721750e-01 2.72805212086883e-01 2.76701251453834e-01 +2.80540708024369e-01 2.84323613576245e-01 2.88050115868615e-01 2.91720469133328e-01 +2.95335023742641e-01 2.98894215150400e-01 3.02398552209468e-01 3.05848604972959e-01 +3.09244992090628e-01 3.12588367914655e-01 3.15879409430838e-01 3.19118803132074e-01 
+3.22307231950773e-01 3.25445362365664e-01 3.28533831796241e-01 3.31573236394923e-01 +3.34564119342922e-01 3.37506959750769e-01 3.40402162258665e-01 3.43250047425139e-01 +3.46050842985167e-01 3.48804676050897e-01 3.51511566319499e-01 3.54171420343578e-01 +3.56784026910074e-01 3.59349053563678e-01 3.61866044300723e-01 3.64334418449224e-01 +3.66753470740412e-01 3.69122372566811e-01 3.71440174411706e-01 3.73705809424865e-01 +3.75918098109668e-01 3.78075754077500e-01 3.80177390816339e-01 3.82221529412174e-01 +3.84206607154089e-01 3.86130986946777e-01 3.87992967447866e-01 3.89790793841821e-01 +3.91522669157381e-01 3.93186766031551e-01 3.94781238820069e-01 3.96304235952100e-01 +3.97753912425607e-01 3.99128442339518e-01 4.00426031359304e-01 4.01644929014043e-01 +4.02783440725362e-01 4.03839939471759e-01 4.04812876995831e-01 4.05700794466611e-01 +4.06502332514728e-01 4.07216240564162e-01 4.07841385391162e-01 4.08376758848098e-01 +4.08821484697795e-01 4.09174824512025e-01 4.09436182596274e-01 4.09605109911627e-01 +4.09681306973451e-01 4.09664625715527e-01 4.09555070317217e-01 4.09352797000126e-01 +4.09058112809436e-01 4.08671473403575e-01 4.08193479884052e-01 4.07624874705096e-01 +4.06966536710084e-01 4.06219475348617e-01 4.05384824134361e-01 4.04463833409482e-01 +4.03457862486515e-01 4.02368371242823e-01 4.01196911246434e-01 3.99945116494870e-01 +3.98614693850679e-01 3.97207413258679e-01 3.95725097830451e-01 3.94169613881359e-01 +3.92542861004349e-01 3.90846762263008e-01 3.89083254583878e-01 3.87254279424813e-01 +3.85361773792339e-01 3.83407661676500e-01 3.81393845966675e-01 3.79322200906297e-01 +3.77194565138424e-01 3.75012735387726e-01 3.72778460817753e-01 3.70493438095351e-01 +3.68159307186948e-01 3.65777647904104e-01 3.63349977208399e-01 3.60877747278365e-01 +3.58362344333914e-01 3.55805088206614e-01 3.53207232637258e-01 3.50569966275538e-01 +3.47894414350379e-01 3.45181640973571e-01 3.42432652033876e-01 3.39648398633849e-01 +3.36829781017167e-01 3.33977652930361e-01 
3.31092826359631e-01 3.28176076580716e-01 +3.25228147457816e-01 3.22249756926180e-01 3.19241602592263e-01 3.16204367385341e-01 +3.13138725195008e-01 3.10045346430274e-01 3.06924903437792e-01 3.03778075719197e-01 +3.00605554890540e-01 2.97408049330340e-01 2.94186288466761e-01 2.90941026658921e-01 +2.87673046632156e-01 2.84383162432261e-01 2.81072221869243e-01 2.77741108426780e-01 +2.74390742619522e-01 2.71022082786323e-01 2.67636125313602e-01 2.64233904289011e-01 +2.60816490591651e-01 2.57384990430873e-01 2.53940543351421e-01 2.50484319728113e-01 +2.47017517778415e-01 2.43541360126097e-01 2.40057089953619e-01 2.36565966784912e-01 +2.33069261943822e-01 2.29568253736559e-01 2.26064222409083e-01 2.22558444932419e-01 +2.19052189670371e-01 2.15546710985065e-01 2.12043243836119e-01 2.08542998429058e-01 +2.05047154967850e-01 2.01556858565137e-01 1.98073214361949e-01 1.94597282906337e-01 +1.91130075837588e-01 1.87672551919415e-01 1.84225613461867e-01 1.80790103167678e-01 +1.77366801434379e-01 1.73956424138891e-01 1.70559620926394e-01 1.67176974020208e-01 +1.63808997564213e-01 1.60456137504041e-01 1.57118772007940e-01 1.53797212422916e-01 +1.50491704756529e-01 1.47202431669613e-01 1.43929514960257e-01 1.40673018514688e-01 +1.37432951696251e-01 1.34209273139534e-01 1.31001894912933e-01 1.27810687009528e-01 +1.24635482123144e-01 1.21476080663965e-01 1.18332255965921e-01 1.15203759636526e-01 +1.12090326998680e-01 1.08991682573377e-01 1.05907545552111e-01 1.02837635208206e-01 +9.97816761971537e-02 9.67394036974130e-02 9.37105683449828e-02 9.06949409173111e-02 +8.76923167248224e-02 8.47025196714253e-02 8.17254059488197e-02 7.87608673331814e-02 +7.58088340568660e-02 7.28692772320587e-02 6.99422108077971e-02 6.70276930464259e-02 +6.41258275103069e-02 6.12367635543978e-02 5.83606963251507e-02 5.54978662709423e-02 +5.26485581739693e-02 4.98130997180879e-02 4.69918596114949e-02 4.41852452873130e-02 +4.13937002090802e-02 3.86177008117711e-02 3.58577531122925e-02 3.31143890263462e-02 
+3.03881624311219e-02 2.76796450154504e-02 2.49894219607847e-02 2.23180874976819e-02 +1.96662403833058e-02 1.70344793458762e-02 1.44233985419273e-02 1.18335830717331e-02 +9.26560459730251e-03 6.72001710596635e-03 4.19735286077247e-03 1.69811857672018e-03 +-7.77208140711610e-04 -3.22818206118258e-03 -5.65439326892945e-03 -8.05546974918463e-03 +-1.04310795984836e-02 -1.27809328397077e-02 -1.51047828268190e-02 -1.74024272284677e-02 +-1.96737085839635e-02 -2.19185144294375e-02 -2.41367769963651e-02 -2.63284724888891e-02 +-2.84936199505902e-02 -3.06322797353908e-02 -3.27445516011748e-02 -3.48305724483703e-02 +-3.68905137291676e-02 -3.89245785562008e-02 -4.09329985423447e-02 -4.29160304058004e-02 +-4.48739523767722e-02 -4.68070604438231e-02 -4.87156644793583e-02 -5.06000842846737e-02 +-5.24606455955554e-02 -5.42976760895731e-02 -5.61115014359384e-02 -5.79024414281283e-02 +-5.96708062383963e-02 -6.14168928318364e-02 -6.31409815758338e-02 -6.48433330785529e-02 +-6.65241852876119e-02 -6.81837508772866e-02 -6.98222149495077e-02 -7.14397330706127e-02 +-7.30364296622860e-02 -7.46123967614557e-02 -7.61676931600857e-02 -7.77023439319171e-02 +-7.92163403492526e-02 -8.07096401889115e-02 -8.21821684225586e-02 -8.36338182827293e-02 +-8.50644526921250e-02 -8.64739060401249e-02 -8.78619862870243e-02 -8.92284773732851e-02 +-9.05731419080927e-02 -9.18957241088056e-02 -9.31959529604709e-02 -9.44735455624781e-02 +-9.57282106276769e-02 -9.69596520978789e-02 -9.81675728386400e-02 -9.93516783755596e-02 +-1.00511680634065e-01 -1.01647301644761e-01 -1.02758277176906e-01 -1.03844360263452e-01 +-1.04905324582303e-01 -1.05940967660021e-01 -1.06951113866116e-01 -1.07935617168281e-01 +-1.08894363621420e-01 -1.09827273566105e-01 -1.10734303515074e-01 -1.11615447709601e-01 +-1.12470739330921e-01 -1.13300251355407e-01 -1.14104097045784e-01 -1.14882430074341e-01 +-1.15635444277784e-01 -1.16363373047045e-01 -1.17066488358990e-01 -1.17745099460522e-01 +-1.18399551218989e-01 -1.19030222156076e-01 
-1.19637522185450e-01 -1.20221890077297e-01 +-1.20783790675507e-01 -1.21323711895636e-01 -1.21842161533820e-01 -1.22339663918596e-01 +-1.22816756438989e-01 -1.23273985983345e-01 -1.23711905324085e-01 -1.24131069484008e-01 +-1.24532032119716e-01 -1.24915341957484e-01 -1.25281539316165e-01 -1.25631152750719e-01 +-1.25964695848596e-01 -1.26282664209533e-01 -1.26585532637343e-01 -1.26873752570045e-01 +-1.27147749772156e-01 -1.27407922310255e-01 -1.27654638829985e-01 -1.27888237149559e-01 +-1.28109023181604e-01 -1.28317270191810e-01 -1.28513218399469e-01 -1.28697074921493e-01 +-1.28869014058084e-01 -1.29029177914777e-01 -1.29177677352233e-01 -1.29314593251902e-01 +-1.29439978082554e-01 -1.29553857749694e-01 -1.29656233707148e-01 -1.29747085307519e-01 +-1.29826372365921e-01 -1.29894037909369e-01 -1.29950011082434e-01 -1.29994210178324e-01 +-1.30026545763428e-01 -1.30046923862527e-01 -1.30055249171424e-01 -1.30051428263594e-01 +-1.30035372757634e-01 -1.30007002412865e-01 -1.29966248121238e-01 -1.29913054764934e-01 +-1.29847383910465e-01 -1.29769216311897e-01 -1.29678554197827e-01 -1.29575423319037e-01 +-1.29459874736274e-01 -1.29331986330277e-01 -1.29191864019099e-01 -1.29039642670754e-01 +-1.28875486702364e-01 -1.28699590360200e-01 -1.28512177678264e-01 -1.28313502116304e-01 +-1.28103845881449e-01 -1.27883518940807e-01 -1.27652857735517e-01 -1.27412223609705e-01 +-1.27162000970705e-01 -1.26902595199515e-01 -1.26634430333016e-01 -1.26357946541663e-01 +-1.26073597428445e-01 -1.25781847176578e-01 -1.25483167574934e-01 -1.25178034951319e-01 +-1.24866927044607e-01 -1.24550319847265e-01 -1.24228684450051e-01 -1.23902483920541e-01 +-1.23572170246771e-01 -1.23238181376510e-01 -1.22900938381682e-01 -1.22560842776112e-01 +-1.22218274013166e-01 -1.21873587187983e-01 -1.21527110966906e-01 -1.21179145764335e-01 +-1.20829962184750e-01 -1.20479799744901e-01 -1.20128865888334e-01 -1.19777335301455e-01 +-1.19425349537276e-01 -1.19073016949897e-01 -1.18720412939633e-01 
-1.18367580505595e-01 +-1.18014531099433e-01 -1.17661245770974e-01 -1.17307676593537e-01 -1.16953748353974e-01 +-1.16599360489828e-01 -1.16244389253592e-01 -1.15888690081802e-01 -1.15532100144700e-01 +-1.15174441050470e-01 -1.14815521676534e-01 -1.14455141099209e-01 -1.14093091592110e-01 +-1.13729161663053e-01 -1.13363139098937e-01 -1.12994813988029e-01 -1.12623981689450e-01 +-1.12250445720197e-01 -1.11874020531002e-01 -1.11494534143489e-01 -1.11111830622563e-01 +-1.10725772359688e-01 -1.10336242144692e-01 -1.09943145005911e-01 -1.09546409800860e-01 +-1.09145990542209e-01 -1.08741867446517e-01 -1.08334047696017e-01 -1.07922565906674e-01 +-1.07507484298693e-01 -1.07088892568693e-01 -1.06666907465760e-01 -1.06241672076580e-01 +-1.05813354827790e-01 -1.05382148216499e-01 -1.04948267282692e-01 -1.04511947839766e-01 +-1.04073444481884e-01 -1.03633028389051e-01 -1.03190984952790e-01 -1.02747611247106e-01 +-1.02303213370899e-01 -1.01858103689262e-01 -1.01412598002067e-01 -1.00967012668915e-01 +-1.00521661719914e-01 -1.00076853981860e-01 -9.96328902491490e-02 -9.91900605282935e-02 +-9.87486413840772e-02 -9.83088934143419e-02 -9.78710588790384e-02 -9.74353595075788e-02 +-9.70019945066868e-02 -9.65711387888822e-02 -9.61429414394761e-02 -9.57175244375174e-02 +-9.52949816435395e-02 -9.48753780642496e-02 -9.44587494014836e-02 -9.40451018898734e-02 +-9.36344124247499e-02 -9.32266289788736e-02 -9.28216713036685e-02 -9.24194319077707e-02 +-9.20197773029105e-02 -9.16225495044642e-02 -9.12275677714540e-02 -9.08346305683806e-02 +-9.04435177290495e-02 -9.00539928005476e-02 -8.96658055437223e-02 -8.92786945649754e-02 +-8.88923900528796e-02 -8.85066165921019e-02 -8.81210960263678e-02 -8.77355503417349e-02 +-8.73497045412710e-02 -8.69632894823496e-02 -8.65760446481788e-02 -8.61877208258744e-02 +-8.57980826643537e-02 -8.54069110865690e-02 -8.50140055320826e-02 -8.46191860077279e-02 +-8.42222949260463e-02 -8.38231987133485e-02 -8.34217891715877e-02 -8.30179845807201e-02 
+-8.26117305308577e-02 -8.22030004762406e-02 -8.17917960058738e-02 -8.13781468285151e-02 +-8.09621104725904e-02 -8.05437717044700e-02 -8.01232416713760e-02 -7.97006567779530e-02 +-7.92761773082062e-02 -7.88499858070602e-02 -7.84222852382005e-02 -7.79932969370920e-02 +-7.75632583801141e-02 -7.71324207925833e-02 -7.67010466200322e-02 -7.62694068884740e-02 +-7.58377784804674e-02 -7.54064413546289e-02 -7.49756757367770e-02 -7.45457593111612e-02 +-7.41169644401951e-02 -7.36895554408097e-02 -7.32637859449424e-02 -7.28398963708106e-02 +-7.24181115304814e-02 -7.19986383978512e-02 -7.15816640595178e-02 -7.11673538691651e-02 +-7.07558498240132e-02 -7.03472691796355e-02 -6.99417033170267e-02 -6.95392168732557e-02 +-6.91398471443677e-02 -6.87436037664497e-02 -6.83504686779621e-02 -6.79603963635984e-02 +-6.75733143770963e-02 -6.71891241376037e-02 -6.68077019914491e-02 -6.64289005284863e-02 +-6.60525501396183e-02 -6.56784607996731e-02 -6.53064240575385e-02 -6.49362152133656e-02 +-6.45675956607766e-02 -6.42003153703439e-02 -6.38341154891872e-02 -6.34687310303656e-02 +-6.31038936248376e-02 -6.27393343081325e-02 -6.23747863135241e-02 -6.20099878434364e-02 +-6.16446847910238e-02 -6.12786333843759e-02 -6.09116027265682e-02 -6.05433772058341e-02 +-6.01737587514343e-02 -5.98025689123577e-02 -5.94296507377664e-02 -5.90548704400938e-02 +-5.86781188238944e-02 -5.82993124658973e-02 -5.79183946342281e-02 -5.75353359373796e-02 +-5.71501346962469e-02 -5.67628170353212e-02 -5.63734366919759e-02 -5.59820745456163e-02 +-5.55888378712919e-02 -5.51938593251520e-02 -5.47972956718313e-02 -5.43993262664633e-02 +-5.40001513064979e-02 -5.35999898708341e-02 -5.31990777659312e-02 -5.27976652005253e-02 +-5.23960143123196e-02 -5.19943965715283e-02 -5.15930900874151e-02 -5.11923768449695e-02 +-5.07925398995874e-02 -5.03938605580773e-02 -4.99966155744729e-02 -4.96010743890175e-02 +-4.92074964382731e-02 -4.88161285636278e-02 -4.84272025445079e-02 -4.80409327813792e-02 +-4.76575141521397e-02 
-4.72771200637925e-02 -4.68999007193450e-02 -4.65259816177418e-02 +-4.61554623023182e-02 -4.57884153707753e-02 -4.54248857570740e-02 -4.50648902929134e-02 +-4.47084175536685e-02 -4.43554279907995e-02 -4.40058543498751e-02 -4.36596023704733e-02 +-4.33165517613883e-02 -4.29765574417905e-02 -4.26394510363016e-02 -4.23050426093736e-02 +-4.19731226219302e-02 -4.16434640909679e-02 -4.13158249307416e-02 -4.09899504522921e-02 +-4.06655759964459e-02 -4.03424296740253e-02 -4.00202351858790e-02 -3.96987146944927e-02 +-3.93775917183574e-02 -3.90565940199882e-02 -3.87354564584890e-02 -3.84139237778476e-02 +-3.80917533027295e-02 -3.77687175143996e-02 -3.74446064805402e-02 -3.71192301141363e-02 +-3.67924202382515e-02 -3.64640324354035e-02 -3.61339476623544e-02 -3.58020736134278e-02 +-3.54683458179374e-02 -3.51327284599357e-02 -3.47952149112325e-02 -3.44558279714774e-02 +-3.41146198120097e-02 -3.37716716231264e-02 -3.34270929673841e-02 -3.30810208444856e-02 +-3.27336184762032e-02 -3.23850738226017e-02 -3.20355978435398e-02 -3.16854225220108e-02 +-3.13347986683043e-02 -3.09839935262148e-02 -3.06332882045597e-02 -3.02829749590844e-02 +-2.99333543513982e-02 -2.95847323128982e-02 -2.92374171426757e-02 -2.88917164691516e-02 +-2.85479342056520e-02 -2.82063675302972e-02 -2.78673039204473e-02 -2.75310182715083e-02 +-2.71977701291799e-02 -2.68678010632052e-02 -2.65413322093848e-02 -2.62185620050566e-02 +-2.58996641414192e-02 -2.55847857540281e-02 -2.52740458705196e-02 -2.49675341321558e-02 +-2.46653098031453e-02 -2.43674010789139e-02 -2.40738047015988e-02 -2.37844858880485e-02 +-2.34993785725552e-02 -2.32183859634635e-02 -2.29413814097093e-02 -2.26682095702814e-02 +-2.23986878765990e-02 -2.21326082748799e-02 -2.18697392327756e-02 -2.16098279918926e-02 +-2.13526030453337e-02 -2.10977768170992e-02 -2.08450485181116e-02 -2.05941071517915e-02 +-2.03446346405303e-02 -2.00963090430931e-02 -1.98488078319666e-02 -1.96018111989362e-02 +-1.93550053567541e-02 -1.91080858046500e-02 
-1.88607605256291e-02 -1.86127530840162e-02 +-1.83638055925143e-02 -1.81136815191636e-02 -1.78621683059891e-02 -1.76090797728046e-02 +-1.73542582815840e-02 -1.70975766389946e-02 -1.68389397170971e-02 -1.65782857748235e-02 +-1.63155874656324e-02 -1.60508525196738e-02 -1.57841240918550e-02 -1.55154807703492e-02 +-1.52450362433036e-02 -1.49729386247514e-02 -1.46993694439806e-02 -1.44245423058358e-02 +-1.41487012325874e-02 -1.38721187010751e-02 -1.35950933917852e-02 -1.33179476693218e-02 +-1.30410248163648e-02 -1.27646860456317e-02 -1.24893073165685e-02 -1.22152759854481e-02 +-1.19429873192542e-02 -1.16728409051315e-02 -1.14052369883012e-02 -1.11405727721369e-02 +-1.08792387145809e-02 -1.06216148552204e-02 -1.03680672071575e-02 -1.01189442472543e-02 +-9.87457353743422e-03 -9.63525850843002e-03 -9.40127543568604e-03 -9.17287063500230e-03 +-8.95025790290302e-03 -8.73361622356439e-03 -8.52308776033654e-03 -8.31877614534030e-03 +-8.12074507513158e-03 -7.92901721384244e-03 -7.74357339728741e-03 -7.56435212206698e-03 +-7.39124929247549e-03 -7.22411818492114e-03 -7.06276957457087e-03 -6.90697195232163e-03 +-6.75645174259066e-03 -6.61089341500785e-03 -6.46993936777488e-03 -6.33318945005820e-03 +-6.20019998915058e-03 -6.07048220023054e-03 -5.94349988832796e-03 -5.81866641017906e-03 +-5.69534095468700e-03 -5.57282433034674e-03 -5.45035461954095e-03 -5.32710327216418e-03 +-5.20217245755211e-03 -5.07459475895161e-03 -4.94333655378524e-03 -4.80730664016874e-03 +-4.66537180008292e-03 -4.51638097943361e-03 -4.35919955966902e-03 -4.19275474423384e-03 +-4.01609234988955e-03 -3.82844426702395e-03 -3.62930455974657e-03 -3.41851068592990e-03 +-3.19632474901104e-03 -2.96350821463949e-03 -2.72138234120014e-03 -2.47186590773981e-03 +-2.21748189123952e-03 -1.96132572100798e-03 -1.70698971747543e-03 -1.45844129292698e-03 +-1.21985630831405e-03 -9.95413360406959e-04 -7.89059308951393e-04 -6.04260544935935e-04 +-4.43757812673090e-04 -3.09344326186520e-04 -2.01687066429197e-04 
-1.20209286460457e-04 +-6.30483884221516e-05 -2.70977190229184e-05 -8.13394701802203e-06 -1.02421586488349e-06 +-0.00000000000000e+00 + Type L N + 0 1 2 +0.00000000000000e+00 1.01213877748589e-02 2.02445363407371e-02 3.03711620509090e-02 +4.05028927633044e-02 5.06412245400090e-02 6.07874794744706e-02 7.09427650075536e-02 +8.11079350811001e-02 9.12835534622485e-02 1.01469859553564e-01 1.11666736983203e-01 +1.21873685246062e-01 1.32089794641293e-01 1.42313724724015e-01 1.52543686459654e-01 +1.62777428238398e-01 1.73012225875028e-01 1.83244876686155e-01 1.93471697702887e-01 +2.03688528042541e-01 2.13890735428435e-01 2.24073226812331e-01 2.34230463020134e-01 +2.44356477308090e-01 2.54444897684491e-01 2.64488972820822e-01 2.74481601346787e-01 +2.84415364295942e-01 2.94282560442900e-01 3.04075244249598e-01 3.13785266116998e-01 +3.23404314620086e-01 3.32923960388250e-01 3.42335701280190e-01 3.51631008492572e-01 +3.60801373234679e-01 3.69838353597509e-01 3.78733621245019e-01 3.87479007557593e-01 +3.96066548863279e-01 4.04488530400781e-01 4.12737528669635e-01 4.20806451837218e-01 +4.28688577889167e-01 4.36377590229290e-01 4.43867610456892e-01 4.51153228073470e-01 +4.58229526896728e-01 4.65092107987609e-01 4.71737108925232e-01 4.78161219295126e-01 +4.84361692287569e-01 4.90336352335008e-01 4.96083598750088e-01 5.01602405358569e-01 +5.06892316154006e-01 5.11953437033271e-01 5.16786423703511e-01 5.21392465881743e-01 +5.25773267937682e-01 5.29931026158364e-01 5.33868402839419e-01 5.37588497432298e-01 +5.41094814999083e-01 5.44391232246638e-01 5.47481961429529e-01 5.50371512426305e-01 +5.53064653306228e-01 5.55566369713289e-01 5.57881823401277e-01 5.60016310257780e-01 +5.61975218156202e-01 5.63763984973262e-01 5.65388057104999e-01 5.66852848807065e-01 +5.68163702675233e-01 5.69325851569541e-01 5.70344382270605e-01 5.71224201139393e-01 +5.71970002032387e-01 5.72586236702783e-01 5.73077087895257e-01 5.73446445317268e-01 +5.73697884643890e-01 5.73834649686193e-01 
5.73859637825310e-01 5.73775388785885e-01 +5.73584076793799e-01 5.73287506134192e-01 5.72887110097056e-01 5.72383953269360e-01 +5.71778737104981e-01 5.71071808676927e-01 5.70263172490638e-01 5.69352505212777e-01 +5.68339173147068e-01 5.67222252267566e-01 5.66000550600480e-01 5.64672632728368e-01 +5.63236846175425e-01 5.61691349419697e-01 5.60034141267566e-01 5.58263091317717e-01 +5.56375971236203e-01 5.54370486561022e-01 5.52244308754001e-01 5.49995107219558e-01 +5.47620581014143e-01 5.45118489976741e-01 5.42486685019703e-01 5.39723137330165e-01 +5.36825966245438e-01 5.33793465580724e-01 5.30624128204277e-01 5.27316668673458e-01 +5.23870043764885e-01 5.20283470752837e-01 5.16556443312041e-01 5.12688744943749e-01 +5.08680459847370e-01 5.04531981183676e-01 5.00244016699486e-01 4.95817591707560e-01 +4.91254049439025e-01 4.86555048808702e-01 4.81722559656101e-01 4.76758855546377e-01 +4.71666504235960e-01 4.66448355926810e-01 4.61107529451020e-01 4.55647396543803e-01 +4.50071564377460e-01 4.44383856541780e-01 4.38588292667211e-01 4.32689066896191e-01 +4.26690525414926e-01 4.20597143262897e-01 4.14413500640159e-01 4.08144258933332e-01 +4.01794136679882e-01 3.95367885687013e-01 3.88870267516268e-01 3.82306030537776e-01 +3.75679887749227e-01 3.68996495543989e-01 3.62260433600661e-01 3.55476186052743e-01 +3.48648124082208e-01 3.41780490064759e-01 3.34877383377572e-01 3.27942747962535e-01 +3.20980361719633e-01 3.13993827786278e-01 3.06986567739328e-01 2.99961816737390e-01 +2.92922620601977e-01 2.85871834817382e-01 2.78812125410828e-01 2.71745971656890e-01 +2.64675670533317e-01 2.57603342839528e-01 2.50530940874274e-01 2.43460257555384e-01 +2.36392936852282e-01 2.29330485391174e-01 2.22274285083492e-01 2.15225606620539e-01 +2.08185623671204e-01 2.01155427615270e-01 1.94136042642197e-01 1.87128441044299e-01 +1.80133558534043e-01 1.73152309417582e-01 1.66185601460734e-01 1.59234350289265e-01 +1.52299493172436e-01 1.45382002047354e-01 1.38482895651517e-01 1.31603250642004e-01 
+1.24744211591893e-01 1.17906999767583e-01 1.11092920604601e-01 1.04303369813999e-01 +9.75398380665498e-02 9.08039142173592e-02 8.40972870491369e-02 7.74217455280549e-02 +7.07791775816739e-02 6.41715674237288e-02 5.76009914654415e-02 5.10696128673755e-02 +4.45796747994776e-02 3.81334924897805e-02 3.17334441541186e-02 2.53819609100387e-02 +1.90815157877817e-02 1.28346119596563e-02 6.64377031627942e-03 5.11516523940925e-04 +-5.55963229827051e-03 -1.15671811400166e-02 -1.75086707568821e-02 -2.33816912801104e-02 +-2.91838956372568e-02 -3.49130126449781e-02 -4.05668596393850e-02 -4.61433545149543e-02 +-5.16405270498033e-02 -5.70565294031162e-02 -6.23896456796175e-02 -6.76383004660446e-02 +-7.28010662555524e-02 -7.78766696877052e-02 -8.28639965440922e-02 -8.77620954524901e-02 +-9.25701802657832e-02 -9.72876310953618e-02 -1.01913993992373e-01 -1.06448979283795e-01 +-1.10892458583761e-01 -1.15244460513723e-01 -1.19505165177754e-01 -1.23674897451502e-01 +-1.27754119154806e-01 -1.31743420188725e-01 -1.35643508727617e-01 -1.39455200565772e-01 +-1.43179407725972e-01 -1.46817126444032e-01 -1.50369424648960e-01 -1.53837429062617e-01 +-1.57222312045859e-01 -1.60525278319834e-01 -1.63747551691594e-01 -1.66890361912338e-01 +-1.69954931794461e-01 -1.72942464710252e-01 -1.75854132590479e-01 -1.78691064535390e-01 +-1.81454336143856e-01 -1.84144959658521e-01 -1.86763875016100e-01 -1.89311941882346e-01 +-1.91789932740891e-01 -1.94198527094194e-01 -1.96538306823356e-01 -1.98809752741699e-01 +-2.01013242364828e-01 -2.03149048907627e-01 -2.05217341506251e-01 -2.07218186650995e-01 +-2.09151550803823e-01 -2.11017304162702e-01 -2.12815225523589e-01 -2.14545008180262e-01 +-2.16206266792118e-01 -2.17798545140833e-01 -2.19321324688297e-01 -2.20774033840785e-01 +-2.22156057817772e-01 -2.23466749018387e-01 -2.24705437774117e-01 -2.25871443373190e-01 +-2.26964085240001e-01 -2.27982694152118e-01 -2.28926623377682e-01 -2.29795259617560e-01 +-2.30588033639236e-01 -2.31304430493193e-01 
-2.31943999207415e-01 -2.32506361861487e-01 +-2.32991221948621e-01 -2.33398371941643e-01 -2.33727699987536e-01 -2.33979195664328e-01 +-2.34152954744031e-01 -2.34249182915691e-01 -2.34268198433406e-01 -2.34210433665303e-01 +-2.34076435530715e-01 -2.33866864824194e-01 -2.33582494436351e-01 -2.33224206492639e-01 +-2.32792988442192e-01 -2.32289928139344e-01 -2.31716207970544e-01 -2.31073098088927e-01 +-2.30361948827602e-01 -2.29584182370890e-01 -2.28741283769946e-01 -2.27834791395652e-01 +-2.26866286927049e-01 -2.25837384978017e-01 -2.24749722468285e-01 -2.23604947847159e-01 +-2.22404710279555e-01 -2.21150648904029e-01 -2.19844382271531e-01 -2.18487498071495e-01 +-2.17081543248761e-01 -2.15628014610687e-01 -2.14128350018619e-01 -2.12583920251883e-01 +-2.10996021625496e-01 -2.09365869435100e-01 -2.07694592294206e-01 -2.05983227419784e-01 +-2.04232716912649e-01 -2.02443905069115e-01 -2.00617536750007e-01 -1.98754256822588e-01 +-1.96854610680219e-01 -1.94919045833896e-01 -1.92947914559151e-01 -1.90941477571387e-01 +-1.88899908692568e-01 -1.86823300462450e-01 -1.84711670638282e-01 -1.82564969518223e-01 +-1.80383088015723e-01 -1.78165866404831e-01 -1.75913103649933e-01 -1.73624567227831e-01 +-1.71300003345396e-01 -1.68939147452313e-01 -1.66541734945752e-01 -1.64107511962053e-01 +-1.61636246149926e-01 -1.59127737319971e-01 -1.56581827866794e-01 -1.53998412862349e-01 +-1.51377449722604e-01 -1.48718967353901e-01 -1.46023074690691e-01 -1.43289968542365e-01 +-1.40519940673804e-01 -1.37713384051837e-01 -1.34870798198027e-01 -1.31992793596936e-01 +-1.29080095118311e-01 -1.26133544421158e-01 -1.23154101317599e-01 -1.20142844084436e-01 +-1.17100968720459e-01 -1.14029787157652e-01 -1.10930724444436e-01 -1.07805314928829e-01 +-1.04655197478857e-01 -1.01482109786610e-01 -9.82878818108433e-02 -9.50744284210456e-02 +-9.18437413131827e-02 -8.85978802739461e-02 -8.53389638761668e-02 -8.20691596930264e-02 +-7.87906741228299e-02 -7.55057419192877e-02 -7.22166155245121e-02 
-6.89255543032340e-02 +-6.56348137770616e-02 -6.23466349569730e-02 -5.90632338706347e-02 -5.57867913786083e-02 +-5.25194433700762e-02 -4.92632714244115e-02 -4.60202940197988e-02 -4.27924583642268e-02 +-3.95816329175794e-02 -3.63896006663356e-02 -3.32180532045961e-02 -3.00685856669010e-02 +-2.69426925496329e-02 -2.38417644488448e-02 -2.07670857331572e-02 -1.77198331610677e-02 +-1.47010754426694e-02 -1.17117737364985e-02 -8.75278306309481e-03 -5.82485460798584e-03 +-2.92863887823064e-03 -6.46896685322491e-05 2.76653121473679e-03 5.56464853033476e-03 +8.32936754497155e-03 1.10604674131063e-02 1.37577940696903e-02 1.64212527160513e-02 +1.90507999824897e-02 2.16464358537033e-02 2.42081954447852e-02 2.67361407163064e-02 +2.92303522168393e-02 3.16909209402497e-02 3.41179403831703e-02 3.65114988853089e-02 +3.88716723316616e-02 4.11985172913335e-02 4.34920646625864e-02 4.57523138879453e-02 +4.79792277968265e-02 5.01727281261971e-02 5.23326917623733e-02 5.44589477392552e-02 +5.65512750201594e-02 5.86094010820510e-02 6.06330013124659e-02 6.26216992208307e-02 +6.45750674573452e-02 6.64926296241417e-02 6.83738628551987e-02 7.02182011335110e-02 +7.20250393064126e-02 7.37937377527740e-02 7.55236276491118e-02 7.72140167755579e-02 +7.88641957971488e-02 8.04734449511112e-02 8.20410410667468e-02 8.35662648412282e-02 +8.50484082921052e-02 8.64867823056526e-02 8.78807241993203e-02 8.92296052165586e-02 +9.05328378730981e-02 9.17898830754249e-02 9.30002569346450e-02 9.41635372021609e-02 +9.52793692575708e-02 9.63474715838631e-02 9.73676406703169e-02 9.83397552894349e-02 +9.92637801006878e-02 1.00139768540775e-01 1.00967864967409e-01 1.01748306031272e-01 +1.02481421258673e-01 1.03167632835438e-01 1.03807454590721e-01 1.04401490187500e-01 +1.04950430534554e-01 1.05455050442581e-01 1.05916204554734e-01 1.06334822589108e-01 +1.06711903937659e-01 1.07048511672426e-01 1.07345766015851e-01 1.07604837337309e-01 +1.07826938742659e-01 1.08013318327585e-01 1.08165251168829e-01 1.08284031129911e-01 
+1.08370962559670e-01 1.08427351962967e-01 1.08454499722975e-01 1.08453691953916e-01 +1.08426192561602e-01 1.08373235586985e-01 1.08296017904911e-01 1.08195692346644e-01 +1.08073361310319e-01 1.07930070918537e-01 1.07766805776686e-01 1.07584484379527e-01 +1.07383955206974e-01 1.07165993543048e-01 1.06931299044732e-01 1.06680494079858e-01 +1.06414122845520e-01 1.06132651270637e-01 1.05836467698514e-01 1.05525884337470e-01 +1.05201139459980e-01 1.04862400323388e-01 1.04509766778119e-01 1.04143275522569e-01 +1.03762904957544e-01 1.03368580587254e-01 1.02960180908621e-01 1.02537543725932e-01 +1.02100472823874e-01 1.01648744928604e-01 1.01182116883883e-01 1.00700332967402e-01 +1.00203132271313e-01 9.96902560705850e-02 9.91614551032457e-02 9.86164966877031e-02 +9.80551716043021e-02 9.74773006709014e-02 9.68827409456205e-02 9.62713914939288e-02 +9.56431986618977e-02 9.49981608026574e-02 9.43363324088459e-02 9.36578276100506e-02 +9.29628230008445e-02 9.22515597719626e-02 9.15243451243629e-02 9.07815529533035e-02 +9.00236237970688e-02 8.92510640525102e-02 8.84644444670696e-02 8.76643979243259e-02 +8.68516165473022e-02 8.60268481506959e-02 8.51908920797911e-02 8.43445944800090e-02 +8.34888430467902e-02 8.26245613107242e-02 8.17527025174930e-02 8.08742431662267e-02 +7.99901762732578e-02 7.91015044309472e-02 7.82092327332384e-02 7.73143616408363e-02 +7.64178798594192e-02 7.55207573040319e-02 7.46239382218293e-02 7.37283345436104e-02 +7.28348195321410e-02 7.19442217921417e-02 7.10573197030313e-02 7.01748363311146e-02 +6.92974348729337e-02 6.84257146760090e-02 6.75602078772433e-02 6.67013766928995e-02 +6.58496113873751e-02 6.50052289410219e-02 6.41684724301059e-02 6.33395111247049e-02 +6.25184413030099e-02 6.17052877731707e-02 6.09000060866137e-02 6.01024854197064e-02 +5.93125520938355e-02 5.85299736974714e-02 5.77544637676664e-02 5.69856869827548e-02 +5.62232648128315e-02 5.54667815699475e-02 5.47157907959109e-02 5.39698219221726e-02 +5.32283871335286e-02 5.24909883653263e-02 
5.17571243625246e-02 5.10262977283632e-02 +5.02980218905217e-02 4.95718279135177e-02 4.88472710876822e-02 4.81239372273490e-02 +4.74014486138718e-02 4.66794695227229e-02 4.59577112781717e-02 4.52359367838704e-02 +4.45139644830176e-02 4.37916717075911e-02 4.30689973823661e-02 4.23459440560103e-02 +4.16225792384074e-02 4.08990360304209e-02 4.01755130395199e-02 3.94522735819658e-02 +3.87296441795163e-02 3.80080123657975e-02 3.72878238245231e-02 3.65695788885487e-02 +3.58538284352648e-02 3.51411692199888e-02 3.44322386947494e-02 3.37277093651125e-02 +3.30282827424220e-02 3.23346829529666e-02 3.16476500691118e-02 3.09679332302861e-02 +3.02962836238975e-02 2.96334473977132e-02 2.89801585759805e-02 2.83371320515758e-02 +2.77050567257435e-02 2.70845888655434e-02 2.64763457469630e-02 2.58808996488013e-02 +2.52987722589301e-02 2.47304295503955e-02 2.41762771801120e-02 2.36366564576403e-02 +2.31118409258074e-02 2.26020335887595e-02 2.21073648165075e-02 2.16278909482037e-02 +2.11635936093227e-02 2.07143797507117e-02 2.02800824101653e-02 1.98604621898736e-02 +1.94552094358328e-02 1.90639470981899e-02 1.86862342445871e-02 1.83215701919265e-02 +1.79693992156894e-02 1.76291157900583e-02 1.73000703066702e-02 1.69815752149342e-02 +1.66729115225204e-02 1.63733355909206e-02 1.60820861579223e-02 1.57983915164727e-02 +1.55214767777429e-02 1.52505711452748e-02 1.49849151268956e-02 1.47237676116290e-02 +1.44664127401124e-02 1.42121664990356e-02 1.39603829728215e-02 1.37104601891568e-02 +1.34618454990098e-02 1.32140404364118e-02 1.29666050084667e-02 1.27191613717598e-02 +1.24713968574908e-02 1.22230663141938e-02 1.19739937437757e-02 1.17240732137324e-02 +1.14732690357000e-02 1.12216152079263e-02 1.09692141266958e-02 1.07162345791678e-02 +1.04629090373847e-02 1.02095302803322e-02 9.95644737779618e-03 9.70406107629152e-03 +9.45281863348136e-03 9.20320815319515e-03 8.95575247831889e-03 8.71100270344403e-03 +8.46953137314772e-03 8.23192543511824e-03 7.99877901998571e-03 7.77068612164634e-03 
+7.54823325306320e-03 7.33199215296191e-03 7.12251261852425e-03 6.92031553810616e-03 +6.72588619618421e-03 6.53966792018216e-03 6.36205613556996e-03 6.19339289169001e-03 +6.03396191619366e-03 5.88398425080339e-03 5.74361451540317e-03 5.61293784126252e-03 +5.49196750757501e-03 5.38064330849877e-03 5.27883067061274e-03 5.18632053319414e-03 +5.10282999607836e-03 5.02800373213776e-03 4.96141615370605e-03 4.90257431464206e-03 +4.85092152225252e-03 4.80584162605846e-03 4.76666394345732e-03 4.73266877577681e-03 +4.70309346210529e-03 4.67713891268000e-03 4.65397655856461e-03 4.63275564992293e-03 +4.61261083142189e-03 4.59266992023087e-03 4.57206180974308e-03 4.54992442056593e-03 +4.52541261952658e-03 4.49770602741647e-03 4.46601663697486e-03 4.42959616416571e-03 +4.38774305812992e-03 4.33980909827392e-03 4.28520551075804e-03 4.22340854113565e-03 +4.15396442503239e-03 4.07649370448626e-03 3.99069484384742e-03 3.89634710589292e-03 +3.79331265599248e-03 3.68153786968658e-03 3.56105382683889e-03 3.43197598352894e-03 +3.29450302097654e-03 3.14891487895381e-03 2.99556998927259e-03 2.83490173294585e-03 +2.66741415243808e-03 2.49367695795896e-03 2.31431987394868e-03 2.13002637867138e-03 +1.94152689611639e-03 1.74959150513450e-03 1.55502223585442e-03 1.35864502787796e-03 +1.16130142849680e-03 9.63840112168939e-04 7.67108304703752e-04 5.71943197012947e-04 +3.79163433865714e-04 1.89560762835279e-04 3.89192754031464e-06 -1.77129112628181e-04 +-3.52838556828907e-04 -5.22629655188116e-04 -6.85959047634637e-04 -8.42352482464512e-04 +-9.91409853619783e-04 -1.13280950195641e-03 -1.26631173263542e-03 -1.39176150813425e-03 +-1.50909028416651e-03 -1.61831696395506e-03 -1.71954795474067e-03 -1.81297631904940e-03 +-1.89888002200452e-03 -1.97761928477064e-03 -2.04963306296994e-03 -2.11543467753539e-03 +-2.17560663387485e-03 -2.23079467333591e-03 -2.28170110870084e-03 -2.32907750273362e-03 +-2.37371675557080e-03 -2.41644467293070e-03 -2.45811109264799e-03 -2.49958065186868e-03 
+-2.54172328131418e-03 -2.58540451630048e-03 -2.63147571664209e-03 -2.68076428915880e-03 +-2.73406400720653e-03 -2.79212552146801e-03 -2.85564715515074e-03 -2.92526607475138e-03 +-3.00154992465737e-03 -3.08498901006919e-03 -3.17598910804348e-03 -3.27486498085652e-03 +-3.38183465934475e-03 -3.49701455631853e-03 -3.62041546145937e-03 -3.75193945909753e-03 +-3.89137779865568e-03 -4.03840973391682e-03 -4.19260233109035e-03 -4.35341122619711e-03 +-4.52018228873977e-03 -4.69215412002650e-03 -4.86846127998537e-03 -5.04813809516440e-03 +-5.23012285276129e-03 -5.41326213183157e-03 -5.59631496576329e-03 -5.77795647440713e-03 +-5.95678055764330e-03 -6.13130121602202e-03 -6.29995207380332e-03 -6.46108374449740e-03 +-6.61295882095881e-03 -6.75374451413351e-03 -6.88150332691318e-03 -6.99418264488468e-03 +-7.08960475295421e-03 -7.16545952414339e-03 -7.21930282512746e-03 -7.24856446072265e-03 +-7.25057012073488e-03 -7.22258215092926e-03 -7.16186387877019e-03 -7.06577151566998e-03 +-6.93187618689809e-03 -6.75811632088532e-03 -6.54297746559018e-03 -6.28569271798897e-03 +-5.98645262528082e-03 -5.64660906386103e-03 -5.26885377550224e-03 -4.85734957457175e-03 +-4.41779138154467e-03 -3.95737574888983e-03 -3.48466179901099e-03 -3.00931357451862e-03 +-2.54172342716152e-03 -2.09252756126560e-03 -1.67203714012721e-03 -1.28962010635427e-03 +-9.53078560426890e-04 -6.68072733344818e-04 -4.37644087867241e-04 -2.61886161419093e-04 +-1.37802316232855e-04 -5.93751834893968e-05 -1.78545551178222e-05 -2.25062591141454e-06 +0.00000000000000e+00 + Type L N + 0 2 0 +0.00000000000000e+00 -1.56646081370714e-04 -6.26377279478300e-04 -1.40857235260428e-03 +-2.50219555807900e-03 -3.90579616307359e-03 -5.61750780045757e-03 -7.63504770988692e-03 +-9.95571591493540e-03 -1.25763943971201e-02 -1.54935463369797e-02 -1.87032155008228e-02 +-2.22010258592700e-02 -2.59821815301708e-02 -3.00414671437914e-02 -3.43732487322845e-02 +-3.89714752482808e-02 -4.38296808189612e-02 -4.89409878421220e-02 -5.42981110295253e-02 
+-5.98933625002198e-02 -6.57186580225307e-02 -7.17655244980772e-02 -7.80251087745145e-02 +-8.44881878657642e-02 -9.11451806493592e-02 -9.79861611002588e-02 -1.05000873109179e-01 +-1.12178746921227e-01 -1.19508917217551e-01 -1.26980242848904e-01 -1.34581328215660e-01 +-1.42300546273976e-01 -1.50126063132658e-01 -1.58045864190012e-01 -1.66047781744605e-01 +-1.74119523998751e-01 -1.82248705358581e-01 -1.90422877920163e-01 -1.98629564017233e-01 +-2.06856289692983e-01 -2.15090618946047e-01 -2.23320188589565e-01 -2.31532743552000e-01 +-2.39716172439470e-01 -2.47858543171700e-01 -2.55948138497533e-01 -2.63973491191191e-01 +-2.71923418727376e-01 -2.79787057231728e-01 -2.87553894503300e-01 -2.95213801907472e-01 +-3.02757064941193e-01 -3.10174412277575e-01 -3.17457043103613e-01 -3.24596652573179e-01 +-3.31585455207377e-01 -3.38416206085722e-01 -3.45082219684417e-01 -3.51577386232092e-01 +-3.57896185468683e-01 -3.64033697709470e-01 -3.69985612133650e-01 -3.75748232234882e-01 +-3.81318478390103e-01 -3.86693887522135e-01 -3.91872609851289e-01 -3.96853402750966e-01 +-4.01635621742147e-01 -4.06219208681315e-01 -4.10604677215799e-01 -4.14793095599434e-01 +-4.18786066979718e-01 -4.22585707285193e-01 -4.26194620858325e-01 -4.29615873994681e-01 +-4.32852966563520e-01 -4.35909801897867e-01 -4.38790655153727e-01 -4.41500140348085e-01 +-4.44043176293767e-01 -4.46424951655945e-01 -4.48650889360059e-01 -4.50726610584142e-01 +-4.52657898569907e-01 -4.54450662486567e-01 -4.56110901579123e-01 -4.57644669828834e-01 +-4.59058041347854e-01 -4.60357076722532e-01 -4.61547790510801e-01 -4.62636120088452e-01 +-4.63627896026986e-01 -4.64528814172300e-01 -4.65344409578816e-01 -4.66080032437829e-01 +-4.66740826122194e-01 -4.67331707451823e-01 -4.67857349266301e-01 -4.68322165372134e-01 +-4.68730297913076e-01 -4.69085607192660e-01 -4.69391663958783e-01 -4.69651744140960e-01 +-4.69868826012021e-01 -4.70045589727565e-01 -4.70184419178662e-01 -4.70287406076233e-01 +-4.70356356169325e-01 
-4.70392797484366e-01 -4.70397990458419e-01 -4.70372939826724e-01 +-4.70318408113349e-01 -4.70234930563789e-01 -4.70122831349861e-01 -4.69982240870303e-01 +-4.69813113965139e-01 -4.69615248858212e-01 -4.69388306640188e-01 -4.69131831103965e-01 +-4.68845268745628e-01 -4.68527988746916e-01 -4.68179302759548e-01 -4.67798484317650e-01 +-4.67384787711809e-01 -4.66937466166953e-01 -4.66455789176159e-01 -4.65939058853540e-01 +-4.65386625181453e-01 -4.64797900040286e-01 -4.64172369922821e-01 -4.63509607249669e-01 +-4.62809280217136e-01 -4.62071161124258e-01 -4.61295133141209e-01 -4.60481195496930e-01 +-4.59629467079345e-01 -4.58740188456865e-01 -4.57813722344847e-01 -4.56850552555182e-01 +-4.55851281481077e-01 -4.54816626182225e-01 -4.53747413147904e-01 -4.52644571826888e-01 +-4.51509127023377e-01 -4.50342190267360e-01 -4.49144950275842e-01 -4.47918662628080e-01 +-4.46664638783470e-01 -4.45384234574802e-01 -4.44078838312384e-01 -4.42749858635918e-01 +-4.41398712251045e-01 -4.40026811686181e-01 -4.38635553202634e-01 -4.37226304987116e-01 +-4.35800395750665e-01 -4.34359103851730e-01 -4.32903647053885e-01 -4.31435173020276e-01 +-4.29954750637762e-01 -4.28463362253663e-01 -4.26961896897380e-01 -4.25451144547886e-01 +-4.23931791496374e-01 -4.22404416841316e-01 -4.20869490140920e-01 -4.19327370235650e-01 +-4.17778305241137e-01 -4.16222433699684e-01 -4.14659786866669e-01 -4.13090292096680e-01 +-4.11513777283204e-01 -4.09929976295306e-01 -4.08338535345055e-01 -4.06739020210533e-01 +-4.05130924231210e-01 -4.03513676985396e-01 -4.01886653553341e-01 -4.00249184264511e-01 +-3.98600564823584e-01 -3.96940066706864e-01 -3.95266947719069e-01 -3.93580462599860e-01 +-3.91879873570037e-01 -3.90164460708953e-01 -3.88433532057466e-01 -3.86686433344525e-01 +-3.84922557240276e-01 -3.83141352044300e-01 -3.81342329724225e-01 -3.79525073227321e-01 +-3.77689242995841e-01 -3.75834582625611e-01 -3.73960923616667e-01 -3.72068189174443e-01 +-3.70156397030098e-01 -3.68225661258842e-01 
-3.66276193085534e-01 -3.64308300677272e-01 +-3.62322387933014e-01 -3.60318952290447e-01 -3.58298581580163e-01 -3.56261949966688e-01 +-3.54209813024886e-01 -3.52143002008699e-01 -3.50062417376947e-01 -3.47969021647973e-01 +-3.45863831661176e-01 -3.43747910328917e-01 -3.41622357966792e-01 -3.39488303293883e-01 +-3.37346894197241e-01 -3.35199288356516e-01 -3.33046643825331e-01 -3.30890109665685e-01 +-3.28730816730398e-01 -3.26569868686369e-01 -3.24408333368241e-01 -3.22247234548066e-01 +-3.20087544201620e-01 -3.17930175346407e-01 -3.15775975519969e-01 -3.13625720960113e-01 +-3.11480111541053e-01 -3.09339766511360e-01 -3.07205221071139e-01 -3.05076923817018e-01 +-3.02955235074487e-01 -3.00840426127986e-01 -2.98732679349891e-01 -2.96632089220433e-01 +-2.94538664221545e-01 -2.92452329578911e-01 -2.90372930817997e-01 -2.88300238091865e-01 +-2.86233951230989e-01 -2.84173705458342e-01 -2.82119077706639e-01 -2.80069593468957e-01 +-2.78024734109056e-01 -2.75983944553533e-01 -2.73946641284648e-01 -2.71912220550181e-01 +-2.69880066705071e-01 -2.67849560598854e-01 -2.65820087923082e-01 -2.63791047433914e-01 +-2.61761858966960e-01 -2.59731971164163e-01 -2.57700868836009e-01 -2.55668079886632e-01 +-2.53633181734324e-01 -2.51595807165620e-01 -2.49555649567301e-01 -2.47512467487436e-01 +-2.45466088483743e-01 -2.43416412225125e-01 -2.41363412820097e-01 -2.39307140353901e-01 +-2.37247721624286e-01 -2.35185360074222e-01 -2.33120334927986e-01 -2.31052999545176e-01 +-2.28983779015095e-01 -2.26913167021530e-01 -2.24841722015239e-01 -2.22770062738224e-01 +-2.20698863150247e-01 -2.18628846813762e-01 -2.16560780798618e-01 -2.14495469172364e-01 +-2.12433746145792e-01 -2.10376468946379e-01 -2.08324510494605e-01 -2.06278751959614e-01 +-2.04240075271404e-01 -2.02209355666651e-01 -2.00187454344409e-01 -1.98175211306275e-01 +-1.96173438453214e-01 -1.94182913008128e-01 -1.92204371329429e-01 -1.90238503176431e-01 +-1.88285946482331e-01 -1.86347282684955e-01 -1.84423032659372e-01 
-1.82513653290024e-01 +-1.80619534713145e-01 -1.78740998253199e-01 -1.76878295069715e-01 -1.75031605523500e-01 +-1.73201039263745e-01 -1.71386636030070e-01 -1.69588367156262e-01 -1.67806137755251e-01 +-1.66039789558029e-01 -1.64289104372588e-01 -1.62553808122782e-01 -1.60833575421278e-01 +-1.59128034625521e-01 -1.57436773320937e-01 -1.55759344171548e-01 -1.54095271074710e-01 +-1.52444055553908e-01 -1.50805183321490e-01 -1.49178130941829e-01 -1.47562372524790e-01 +-1.45957386379437e-01 -1.44362661558745e-01 -1.42777704227570e-01 -1.41202043788367e-01 +-1.39635238701973e-01 -1.38076881944297e-01 -1.36526606043829e-01 -1.34984087649521e-01 +-1.33449051583709e-01 -1.31921274340313e-01 -1.30400586994514e-01 -1.28886877496339e-01 +-1.27380092327129e-01 -1.25880237504546e-01 -1.24387378928591e-01 -1.22901642067960e-01 +-1.21423210992879e-01 -1.19952326767321e-01 -1.18489285220010e-01 -1.17034434120027e-01 +-1.15588169788777e-01 -1.14150933185835e-01 -1.12723205511387e-01 -1.11305503372816e-01 +-1.09898373567259e-01 -1.08502387535681e-01 -1.07118135547166e-01 -1.05746220674640e-01 +-1.04387252625138e-01 -1.03041841488910e-01 -1.01710591472279e-01 -1.00394094678981e-01 +-9.90929250039973e-02 -9.78076322024253e-02 -9.65387361938646e-02 -9.52867216601304e-02 +-9.40520329908306e-02 -9.28350696275423e-02 -9.16361818530092e-02 -9.04556670670161e-02 +-8.92937665854221e-02 -8.81506629933103e-02 -8.70264780773968e-02 -8.59212713567911e-02 +-8.48350392249887e-02 -8.37677147096529e-02 -8.27191678503899e-02 -8.16892066883877e-02 +-8.06775788555519e-02 -7.96839737446930e-02 -7.87080252364492e-02 -7.77493149530432e-02 +-7.68073760037188e-02 -7.58816971818267e-02 -7.49717275690950e-02 -7.40768814986610e-02 +-7.31965438249904e-02 -7.23300754459205e-02 -7.14768190197311e-02 -7.06361048184381e-02 +-6.98072566573723e-02 -6.89895978406285e-02 -6.81824570620900e-02 -6.73851742024826e-02 +-6.65971059642697e-02 -6.58176312881402e-02 -6.50461564973620e-02 -6.42821201193190e-02 
+-6.35249973371172e-02 -6.27743040281659e-02 -6.20296003510932e-02 -6.12904938471788e-02 +-6.05566420276407e-02 -5.98277544235277e-02 -5.91035940806094e-02 -5.83839784874372e-02 +-5.76687799306364e-02 -5.69579252774031e-02 -5.62513951910664e-02 -5.55492227913756e-02 +-5.48514917768255e-02 -5.41583340317778e-02 -5.34699267463260e-02 -5.27864890817217e-02 +-5.21082784186998e-02 -5.14355862301429e-02 -5.07687336231917e-02 -5.01080665990838e-02 +-4.94539510816766e-02 -4.88067677677353e-02 -4.81669068536426e-02 -4.75347626941902e-02 +-4.69107284495333e-02 -4.62951907762360e-02 -4.56885246176010e-02 -4.50910881471804e-02 +-4.45032179175133e-02 -4.39252242637631e-02 -4.33573870090446e-02 -4.27999515148846e-02 +-4.22531251164777e-02 -4.17170739782243e-02 -4.11919204005172e-02 -4.06777406039207e-02 +-4.01745630118172e-02 -3.96823670473331e-02 -3.92010824549496e-02 -3.87305891517178e-02 +-3.82707176074806e-02 -3.78212497480276e-02 -3.73819203697001e-02 -3.69524190487239e-02 +-3.65323925234775e-02 -3.61214475230989e-02 -3.57191540113174e-02 -3.53250488102261e-02 +-3.49386395649153e-02 -3.45594090065240e-02 -3.41868194683459e-02 -3.38203176071948e-02 +-3.34593392803047e-02 -3.31033145266335e-02 -3.27516726005685e-02 -3.24038470057026e-02 +-3.20592804765666e-02 -3.17174298569505e-02 -3.13777708247354e-02 -3.10398024149456e-02 +-3.07030512950241e-02 -3.03670757490828e-02 -3.00314693310755e-02 -2.96958641504307e-02 +-2.93599337576379e-02 -2.90233956015576e-02 -2.86860130347762e-02 -2.83475968481055e-02 +-2.80080063202797e-02 -2.76671497739851e-02 -2.73249846345035e-02 -2.69815169924228e-02 +-2.66368006769999e-02 -2.62909358518033e-02 -2.59440671491665e-02 -2.55963813646989e-02 +-2.52481047375684e-02 -2.48994998464649e-02 -2.45508621550119e-02 -2.42025162438947e-02 +-2.38548117700657e-02 -2.35081191960582e-02 -2.31628253346460e-02 -2.28193287558207e-02 +-2.24780351042937e-02 -2.21393523764623e-02 -2.18036862060055e-02 -2.14714352069790e-02 +-2.11429864224904e-02 
-2.08187109257411e-02 -2.04989596184488e-02 -2.01840592694345e-02 +-1.98743088334888e-02 -1.95699760875560e-02 -1.92712946178306e-02 -1.89784611875732e-02 +-1.86916335113771e-02 -1.84109284572853e-02 -1.81364206936178e-02 -1.78681417926807e-02 +-1.76060797987136e-02 -1.73501792625820e-02 -1.71003417408379e-02 -1.68564267519500e-02 +-1.66182531777703e-02 -1.63856010937029e-02 -1.61582140066412e-02 -1.59358014755646e-02 +-1.57180420857882e-02 -1.55045867442831e-02 -1.52950622602533e-02 -1.50890751723109e-02 +-1.48862157811608e-02 -1.46860623447109e-02 -1.44881853909812e-02 -1.42921521031189e-02 +-1.40975307302321e-02 -1.39038949776509e-02 -1.37108283306019e-02 -1.35179282661341e-02 +-1.33248103094604e-02 -1.31311118926512e-02 -1.29364959758252e-02 -1.27406543935954e-02 +-1.25433108925257e-02 -1.23442238286909e-02 -1.21431884980904e-02 -1.19400390765851e-02 +-1.17346501501857e-02 -1.15269378208625e-02 -1.13168603775250e-02 -1.11044185264010e-02 +-1.08896551796630e-02 -1.06726548057686e-02 -1.04535423495528e-02 -1.02324817345787e-02 +-1.00096739645803e-02 -9.78535484496632e-03 -9.55979234926634e-03 -9.33328365902737e-03 +-9.10615190900219e-03 -8.87874267245219e-03 -8.65142022400163e-03 -8.42456361969278e-03 +-8.19856263568898e-03 -7.97381360842403e-03 -7.75071521990974e-03 -7.52966427235492e-03 +-7.31105149623851e-03 -7.09525743550489e-03 -6.88264845261788e-03 -6.67357289484338e-03 +-6.46835746132430e-03 -6.26730380830975e-03 -6.07068542730615e-03 -5.87874482797262e-03 +-5.69169105431167e-03 -5.50969755914728e-03 -5.33290045807190e-03 -5.16139718002180e-03 +-4.99524552745195e-03 -4.83446315476610e-03 -4.67902746926610e-03 -4.52887595446032e-03 +-4.38390691115589e-03 -4.24398060740847e-03 -4.10892082415477e-03 -3.97851677925238e-03 +-3.85252540874268e-03 -3.73067398047389e-03 -3.61266301181347e-03 -3.49816946007074e-03 +-3.38685015148475e-03 -3.27834541222042e-03 -3.17228286280071e-03 -3.06828133578828e-03 +-2.96595487534448e-03 -2.86491677653677e-03 
-2.76478362195707e-03 -2.66517927334410e-03 +-2.56573877647712e-03 -2.46611213861510e-03 -2.36596793918812e-03 -2.26499673628447e-03 +-2.16291423370242e-03 -2.05946417592109e-03 -1.95442094127208e-03 -1.84759180681338e-03 +-1.73881886190965e-03 -1.62798055124617e-03 -1.51499283192680e-03 -1.39980993337945e-03 +-1.28242471297529e-03 -1.16286860451103e-03 -1.04121116097540e-03 -9.17559197262856e-04 +-7.92055542670399e-04 -6.64877417082440e-04 -5.36234448652780e-04 -4.06366354513395e-04 +-2.75540309519132e-04 -1.44048031256790e-04 -1.22026124543761e-05 1.19664865484761e-04 +2.51208901879896e-04 3.82073291309061e-04 5.11894944050122e-04 6.40307772153907e-04 +7.66946600270390e-04 8.91451060164945e-04 1.01346942811106e-03 1.13266236501617e-03 +1.24870652023524e-03 1.36129796151693e-03 1.47015539542440e-03 1.57502314482313e-03 +1.67567385263461e-03 1.77191088397356e-03 1.86357040199629e-03 1.95052309624114e-03 +2.03267554592401e-03 2.10997120450066e-03 2.18239099579663e-03 2.24995351609403e-03 +2.31271484069514e-03 2.37076793763470e-03 2.42424169531792e-03 2.47329957489972e-03 +2.51813790213421e-03 2.55898381717594e-03 2.59609290437445e-03 2.62974652742199e-03 +2.66024889826438e-03 2.68792391093612e-03 2.71311177389699e-03 2.73616547651737e-03 +2.75744712704334e-03 2.77732420067433e-03 2.79616573727256e-03 2.81433852870536e-03 +2.83220333587303e-03 2.85011117512020e-03 2.86839971294559e-03 2.88738980674934e-03 +2.90738222777708e-03 2.92865460046943e-03 2.95145859011823e-03 2.97601736809191e-03 +3.00252338094991e-03 3.03113644655199e-03 3.06198219681504e-03 3.09515088311585e-03 +3.13069655651930e-03 3.16863663106708e-03 3.20895183434047e-03 3.25158654543923e-03 +3.29644951646150e-03 3.34341496954476e-03 3.39232405759596e-03 3.44298667303422e-03 +3.49518358522875e-03 3.54866888387787e-03 3.60317270238629e-03 3.65840419236799e-03 +3.71405471779326e-03 3.76980123500513e-03 3.82530982290165e-03 3.88023932602161e-03 +3.93424507210326e-03 3.98698262492117e-03 
4.03811153285123e-03 4.08729903367068e-03 +4.13422367657484e-03 4.17857882326817e-03 4.22007599126606e-03 4.25844800420681e-03 +4.29345191600213e-03 4.32487167803222e-03 4.35252052128802e-03 4.37624302835495e-03 +4.39591687338389e-03 4.41145421168022e-03 4.42280270420896e-03 4.42994616614554e-03 +4.43290483253938e-03 4.43173523817136e-03 4.42652971272966e-03 4.41741549646226e-03 +4.40455348544279e-03 4.38813661947352e-03 4.36838792939862e-03 4.34555826417839e-03 +4.31992372143875e-03 4.29178280833444e-03 4.26145336240007e-03 4.22926926460010e-03 +4.19557697898553e-03 4.16073195520727e-03 4.12509493159909e-03 4.08902817761841e-03 +4.05289171509802e-03 4.01703955802645e-03 3.98181601041483e-03 3.94755206124539e-03 +3.91456191452363e-03 3.88313969108695e-03 3.85355633707147e-03 3.82605677182449e-03 +3.80085730559055e-03 3.77814335452299e-03 3.75806747750968e-03 3.74074775597711e-03 +3.72626653429265e-03 3.71466953465018e-03 3.70596535644670e-03 3.70012536616220e-03 +3.69708397970246e-03 3.69673933508087e-03 3.69895434924592e-03 3.70355814885740e-03 +3.71034786090388e-03 3.71909074528704e-03 3.72952664790949e-03 3.74137074943171e-03 +3.75431658174180e-03 3.76803928134471e-03 3.78219904635388e-03 3.79644476158469e-03 +3.81041775442492e-03 3.82375564271814e-03 3.83609623485240e-03 3.84708144160897e-03 +3.85636115910663e-03 3.86359708237202e-03 3.86846640968053e-03 3.87066539883599e-03 +3.86991273797894e-03 3.86595269532780e-03 3.85855801443262e-03 3.84753252404846e-03 +3.83271343458292e-03 3.81397329621229e-03 3.79122159716067e-03 3.76440598426642e-03 +3.73351309177722e-03 3.69856896828464e-03 3.65963909579195e-03 3.61682799905714e-03 +3.57027844753560e-03 3.52017025640767e-03 3.46671869728788e-03 3.41017253321788e-03 +3.35081169641665e-03 3.28894463095235e-03 3.22490532597340e-03 3.15905006835798e-03 +3.09175394657863e-03 3.02340714020031e-03 2.95441103170671e-03 2.88517417926419e-03 +2.81610819055552e-03 2.74762353893687e-03 2.68012536387336e-03 2.61400929788283e-03 
+2.54965736205416e-03 2.48743397160098e-03 2.42768209185997e-03 2.37071958363686e-03 +2.31683577483737e-03 2.26628829287637e-03 2.21930018942143e-03 2.17605738554557e-03 +2.13670646129551e-03 2.10135280893174e-03 2.07005916355982e-03 2.04284451839651e-03 +2.01968342432539e-03 2.00050566449589e-03 1.98519628431178e-03 1.97359594510576e-03 +1.96550155607636e-03 1.96066712389447e-03 1.95880474333987e-03 1.95958563653062e-03 +1.96264113460689e-03 1.96756348690177e-03 1.97390638245832e-03 1.98118508209455e-03 +1.98887609175279e-03 1.99641636568688e-03 2.00320211674375e-03 2.00858743455168e-03 +2.01188307149666e-03 2.01235594648391e-03 2.00923012615652e-03 2.00169025236148e-03 +1.98888856378087e-03 1.96995677052501e-03 1.94402403849981e-03 1.91024217908704e-03 +1.86781877774961e-03 1.81605840495603e-03 1.75441123008360e-03 1.68252733203111e-03 +1.60031383683141e-03 1.50799082200226e-03 1.40614085682378e-03 1.29574627045705e-03 +1.17820793588405e-03 1.05533968806227e-03 9.29333571940635e-04 8.02692975222181e-04 +6.78133276939918e-04 5.58452759173953e-04 4.46379900955937e-04 3.44406430348353e-04 +2.54618236721998e-04 1.78538031928774e-04 1.16994156045447e-04 7.00289366039619e-05 +3.68574877544804e-05 1.58839320891778e-05 4.77709033014763e-06 6.02219951222750e-07 +-0.00000000000000e+00 + Type L N + 0 2 1 +0.00000000000000e+00 -2.37181002835448e-04 -9.48276207466885e-04 -2.13194306510310e-03 +-3.78594687085924e-03 -5.90716507950551e-03 -8.49159334960620e-03 -1.15343533178962e-02 +-1.50297021059248e-02 -1.89710435609268e-02 -2.33509412325090e-02 -2.81611330860209e-02 +-3.33925479523686e-02 -3.90353237124966e-02 -4.50788272127743e-02 -5.15116759050501e-02 +-5.83217612021684e-02 -6.54962735362649e-02 -7.30217291031570e-02 -8.08839982716407e-02 +-8.90683356314908e-02 -9.75594116484627e-02 -1.06341345888631e-01 -1.15397741768010e-01 +-1.24711722776602e-01 -1.34265970118897e-01 -1.44042761705393e-01 -1.54024012422037e-01 +-1.64191315596606e-01 -1.74525985573090e-01 
-1.85009101297091e-01 -1.95621550807265e-01 +-2.06344076519928e-01 -2.17157321186256e-01 -2.28041874394027e-01 -2.38978319478757e-01 +-2.49947280702411e-01 -2.60929470551640e-01 -2.71905737001917e-01 -2.82857110588962e-01 +-2.93764851124599e-01 -3.04610493890740e-01 -3.15375895142569e-01 -3.26043276750272e-01 +-3.36595269807914e-01 -3.47014957038280e-01 -3.57285913823738e-01 -3.67392247695497e-01 +-3.77318636116979e-01 -3.87050362401500e-01 -3.96573349609943e-01 -4.05874192280717e-01 +-4.14940185851943e-01 -4.23759353644441e-01 -4.32320471283781e-01 -4.40613088450231e-01 +-4.48627547856916e-01 -4.56355001368817e-01 -4.63787423188297e-01 -4.70917620046587e-01 +-4.77739238354986e-01 -4.84246768284402e-01 -4.90435544757074e-01 -4.96301745349897e-01 +-5.01842385124539e-01 -5.07055308415378e-01 -5.11939177622162e-01 -5.16493459069993e-01 +-5.20718406014754e-01 -5.24615038887223e-01 -5.28185122883818e-01 -5.31431143026034e-01 +-5.34356276824131e-01 -5.36964364693282e-01 -5.39259878282273e-01 -5.41247886885727e-01 +-5.42934022120674e-01 -5.44324441057084e-01 -5.45425787999533e-01 -5.46245155123563e-01 +-5.46790042175375e-01 -5.47068315447255e-01 -5.47088166243564e-01 -5.46858069053193e-01 +-5.46386739644038e-01 -5.45683093293402e-01 -5.44756203365151e-01 -5.43615260440076e-01 +-5.42269532200246e-01 -5.40728324261129e-01 -5.39000942137176e-01 -5.37096654517162e-01 +-5.35024658015282e-01 -5.32794043552534e-01 -5.30413764510675e-01 -5.27892606787903e-01 +-5.25239160871556e-01 -5.22461796028699e-01 -5.19568636700477e-01 -5.16567541170758e-01 +-5.13466082563960e-01 -5.10271532211131e-01 -5.06990845407500e-01 -5.03630649568948e-01 +-5.00197234779212e-01 -4.96696546704359e-01 -4.93134181836150e-01 -4.89515385011546e-01 +-4.85845049141807e-01 -4.82127717071617e-01 -4.78367585476365e-01 -4.74568510694372e-01 +-4.70734016380395e-01 -4.66867302857370e-01 -4.62971258035016e-01 -4.59048469756743e-01 +-4.55101239430260e-01 -4.51131596792482e-01 -4.47141315655685e-01 
-4.43131930479492e-01 +-4.39104753612081e-01 -4.35060893044069e-01 -4.31001270519740e-01 -4.26926639852688e-01 +-4.22837605296457e-01 -4.18734639825352e-01 -4.14618103186191e-01 -4.10488259588341e-01 +-4.06345294906817e-01 -4.02189333281468e-01 -3.98020453004253e-01 -3.93838701596226e-01 +-3.89644109985983e-01 -3.85436705711949e-01 -3.81216525081827e-01 -3.76983624233743e-01 +-3.72738089055011e-01 -3.68480043925818e-01 -3.64209659266567e-01 -3.59927157878823e-01 +-3.55632820080845e-01 -3.51326987649364e-01 -3.47010066589591e-01 -3.42682528765205e-01 +-3.38344912429365e-01 -3.33997821706374e-01 -3.29641925081569e-01 -3.25277952964211e-01 +-3.20906694394524e-01 -3.16528992971605e-01 -3.12145742083641e-01 -3.07757879525693e-01 +-3.03366381593194e-01 -2.98972256741387e-01 -2.94576538901993e-01 -2.90180280548638e-01 +-2.85784545601943e-01 -2.81390402263646e-01 -2.76998915866839e-01 -2.72611141826307e-01 +-2.68228118769150e-01 -2.63850861921362e-01 -2.59480356820961e-01 -2.55117553422563e-01 +-2.50763360652190e-01 -2.46418641464467e-01 -2.42084208447514e-01 -2.37760820013578e-01 +-2.33449177206120e-01 -2.29149921146511e-01 -2.24863631135969e-01 -2.20590823420835e-01 +-2.16331950621834e-01 -2.12087401820752e-01 -2.07857503290931e-01 -2.03642519851269e-01 +-1.99442656817090e-01 -1.95258062515316e-01 -1.91088831325930e-01 -1.86935007206792e-01 +-1.82796587654475e-01 -1.78673528050031e-01 -1.74565746335398e-01 -1.70473127963627e-01 +-1.66395531064243e-01 -1.62332791763775e-01 -1.58284729600969e-01 -1.54251152976219e-01 +-1.50231864575505e-01 -1.46226666710404e-01 -1.42235366517727e-01 -1.38257780964745e-01 +-1.34293741609039e-01 -1.30343099065472e-01 -1.26405727136738e-01 -1.22481526568283e-01 +-1.18570428393056e-01 -1.14672396836538e-01 -1.10787431757705e-01 -1.06915570606931e-01 +-1.03056889887365e-01 -9.92115061118646e-02 -9.53795762531144e-02 -9.15612976900645e-02 +-8.77569076592064e-02 -8.39666822244162e-02 -8.01909347840934e-02 -7.64300141390467e-02 
+-7.26843021489873e-02 -6.89542110095521e-02 -6.52401801854299e-02 -6.15426730384115e-02 +-5.78621731919523e-02 -5.41991806761540e-02 -5.05542078988650e-02 -4.69277754899216e-02 +-4.33204080663174e-02 -3.97326299663828e-02 -3.61649610007989e-02 -3.26179122675424e-02 +-2.90919820766150e-02 -2.55876520287229e-02 -2.21053832899088e-02 -1.86456131015920e-02 +-1.52087515625041e-02 -1.17951787157163e-02 -8.40524197033027e-03 -5.03925388353224e-03 +-1.69749032457707e-03 1.61981096200024e-03 4.91245138109751e-03 8.18027209133380e-03 +1.14231541851273e-02 1.46410184453075e-02 1.78338246835029e-02 2.10015706699190e-02 +2.41442906683531e-02 2.72620535943412e-02 3.03549608181310e-02 3.34231436377278e-02 +3.64667604504882e-02 3.94859936546465e-02 4.24810463147007e-02 4.54521386267475e-02 +4.83995042216081e-02 5.13233863449408e-02 5.42240339544373e-02 5.71016977747021e-02 +5.99566263504537e-02 6.27890621383244e-02 6.55992376767348e-02 6.83873718721240e-02 +7.11536664382246e-02 7.38983025231179e-02 7.66214375564925e-02 7.93232023469212e-02 +8.20036984560536e-02 8.46629958734700e-02 8.73011310125629e-02 8.99181050442574e-02 +9.25138825816891e-02 9.50883907251605e-02 9.76415184728462e-02 1.00173116498849e-01 +1.02682997296361e-01 1.05170935679900e-01 1.07636669636928e-01 1.10079901515604e-01 +1.12500299532085e-01 1.14897499577630e-01 1.17271107302880e-01 1.19620700454029e-01 +1.21945831433306e-01 1.24246030054148e-01 1.26520806459780e-01 1.28769654172615e-01 +1.30992053240894e-01 1.33187473448450e-01 1.35355377553218e-01 1.37495224520319e-01 +1.39606472716050e-01 1.41688583030016e-01 1.43741021893859e-01 1.45763264166630e-01 +1.47754795858712e-01 1.49715116668370e-01 1.51643742307432e-01 1.53540206595288e-01 +1.55404063303225e-01 1.57234887734191e-01 1.59032278026236e-01 1.60795856171178e-01 +1.62525268743341e-01 1.64220187336664e-01 1.65880308711756e-01 1.67505354657876e-01 +1.69095071578001e-01 1.70649229808310e-01 1.72167622686384e-01 1.73650065385213e-01 +1.75096393532722e-01 
1.76506461638857e-01 1.77880141354395e-01 1.79217319587465e-01 +1.80517896505250e-01 1.81781783449596e-01 1.83008900796098e-01 1.84199175786783e-01 +1.85352540366741e-01 1.86468929054881e-01 1.87548276878563e-01 1.88590517401024e-01 +1.89595580869411e-01 1.90563392509805e-01 1.91493870993900e-01 1.92386927100003e-01 +1.93242462588803e-01 1.94060369311851e-01 1.94840528568096e-01 1.95582810720911e-01 +1.96287075085154e-01 1.96953170090674e-01 1.97580933725572e-01 1.98170194259302e-01 +1.98720771242545e-01 1.99232476777614e-01 1.99705117050035e-01 2.00138494108991e-01 +2.00532407881399e-01 2.00886658401712e-01 2.01201048236988e-01 2.01475385084461e-01 +2.01709484516789e-01 2.01903172848305e-01 2.02056290094089e-01 2.02168692992419e-01 +2.02240258060221e-01 2.02270884650531e-01 2.02260497980664e-01 2.02209052099827e-01 +2.02116532765264e-01 2.01982960196695e-01 2.01808391679791e-01 2.01592923990733e-01 +2.01336695615481e-01 2.01039888739228e-01 2.00702730983633e-01 2.00325496871769e-01 +1.99908509003260e-01 1.99452138924837e-01 1.98956807684403e-01 1.98422986059716e-01 +1.97851194455912e-01 1.97242002469234e-01 1.96596028117527e-01 1.95913936741265e-01 +1.95196439581989e-01 1.94444292048166e-01 1.93658291681404e-01 1.92839275838872e-01 +1.91988119110398e-01 1.91105730491297e-01 1.90193050334223e-01 1.89251047105422e-01 +1.88280713972591e-01 1.87283065253055e-01 1.86259132752238e-01 1.85209962023379e-01 +1.84136608580046e-01 1.83040134093391e-01 1.81921602606045e-01 1.80782076794321e-01 +1.79622614309730e-01 1.78444264229952e-01 1.77248063648153e-01 1.76035034428119e-01 +1.74806180150865e-01 1.73562483276430e-01 1.72304902542335e-01 1.71034370617769e-01 +1.69751792029969e-01 1.68458041376511e-01 1.67153961834393e-01 1.65840363973788e-01 +1.64518024881381e-01 1.63187687595113e-01 1.61850060849138e-01 1.60505819124753e-01 +1.59155603000158e-01 1.57800019788964e-01 1.56439644454708e-01 1.55075020785961e-01 +1.53706662814253e-01 1.52335056454763e-01 1.50960661347748e-01 
1.49583912876886e-01 +1.48205224339199e-01 1.46824989239977e-01 1.45443583685136e-01 1.44061368842771e-01 +1.42678693445264e-01 1.41295896303220e-01 1.39913308802691e-01 1.38531257357642e-01 +1.37150065790395e-01 1.35770057613815e-01 1.34391558190327e-01 1.33014896744426e-01 +1.31640408207084e-01 1.30268434872516e-01 1.28899327849907e-01 1.27533448295088e-01 +1.26171168409618e-01 1.24812872197366e-01 1.23458955971339e-01 1.22109828606297e-01 +1.20765911535447e-01 1.19427638492299e-01 1.18095455001520e-01 1.16769817625307e-01 +1.15451192974434e-01 1.14140056495595e-01 1.12836891049049e-01 1.11542185292779e-01 +1.10256431891383e-01 1.08980125569735e-01 1.07713761033068e-01 1.06457830776465e-01 +1.05212822807897e-01 1.03979218309744e-01 1.02757489264396e-01 1.01548096069771e-01 +1.00351485170693e-01 9.91680867318000e-02 9.79983123771746e-02 9.68425530211260e-02 +9.57011768135275e-02 9.45745272218680e-02 9.34629212706876e-02 9.23666479573705e-02 +9.12859668613853e-02 9.02211069620006e-02 8.91722656772955e-02 8.81396081349523e-02 +8.71232666828782e-02 8.61233406451972e-02 8.51398963265878e-02 8.41729672653678e-02 +8.32225547331415e-02 8.22886284762912e-02 8.13711276920965e-02 8.04699622298756e-02 +7.95850140052404e-02 7.87161386134003e-02 7.78631671254409e-02 7.70259080496775e-02 +7.62041494385352e-02 7.53976611199877e-02 7.46061970313744e-02 7.38294976324454e-02 +7.30672923737626e-02 7.23193021961037e-02 7.15852420362978e-02 7.08648233149606e-02 +7.01577563818835e-02 6.94637528953777e-02 6.87825281126567e-02 6.81138030693683e-02 +6.74573066276289e-02 6.68127773733699e-02 6.61799653454587e-02 6.55586335808794e-02 +6.49485594622449e-02 6.43495358560261e-02 6.37613720321211e-02 6.31838943576965e-02 +6.26169467606311e-02 6.20603909603030e-02 6.15141064659060e-02 6.09779903449074e-02 +6.04519567666434e-02 5.99359363283832e-02 5.94298751734292e-02 5.89337339129573e-02 +5.84474863653037e-02 5.79711181282569e-02 5.75046250015926e-02 5.70480112785897e-02 +5.66012879265544e-02 
5.61644706774607e-02 5.57375780506689e-02 5.53206293303031e-02 +5.49136425202522e-02 5.45166322998979e-02 5.41296080035664e-02 5.37525716463635e-02 +5.33855160184647e-02 5.30284228691250e-02 5.26812612006424e-02 5.23439856912701e-02 +5.20165352646333e-02 5.16988318215973e-02 5.13907791487519e-02 5.10922620157606e-02 +5.08031454717773e-02 5.05232743489928e-02 5.02524729791473e-02 4.99905451265738e-02 +4.97372741390256e-02 4.94924233152299e-02 4.92557364858134e-02 4.90269388019900e-02 +4.88057377242123e-02 4.85918242008904e-02 4.83848740252823e-02 4.81845493568067e-02 +4.79905003913119e-02 4.78023671632887e-02 4.76197814616548e-02 4.74423688395665e-02 +4.72697506977584e-02 4.71015464201669e-02 4.69373755400756e-02 4.67768599147338e-02 +4.66196258863411e-02 4.64653064074658e-02 4.63135431093700e-02 4.61639882923424e-02 +4.60163068179837e-02 4.58701778844464e-02 4.57252966668779e-02 4.55813758067491e-02 +4.54381467353527e-02 4.52953608185023e-02 4.51527903113473e-02 4.50102291142130e-02 +4.48674933224545e-02 4.47244215654725e-02 4.45808751322299e-02 4.44367378828330e-02 +4.42919159479618e-02 4.41463372201288e-02 4.39999506428964e-02 4.38527253062647e-02 +4.37046493584331e-02 4.35557287460160e-02 4.34059857965460e-02 4.32554576586972e-02 +4.31041946170951e-02 4.29522582998432e-02 4.27997197979521e-02 4.26466577167290e-02 +4.24931561798271e-02 4.23393028070950e-02 4.21851866875738e-02 4.20308963689789e-02 +4.18765178847704e-02 4.17221328394605e-02 4.15678165721382e-02 4.14136364173172e-02 +4.12596500811366e-02 4.11059041496918e-02 4.09524327448307e-02 4.07992563411769e-02 +4.06463807563987e-02 4.04937963249046e-02 4.03414772631865e-02 4.01893812330036e-02 +4.00374491065018e-02 3.98856049352360e-02 3.97337561229124e-02 3.95817937995288e-02 +3.94295933924784e-02 3.92770153881209e-02 3.91239062753396e-02 3.89700996607036e-02 +3.88154175430747e-02 3.86596717338460e-02 3.85026654074938e-02 3.83441947657874e-02 +3.81840507978357e-02 3.80220211171741e-02 3.78578918563241e-02 
3.76914495986829e-02 +3.75224833272467e-02 3.73507863695282e-02 3.71761583181031e-02 3.69984069065068e-02 +3.68173498207109e-02 3.66328164271051e-02 3.64446493988263e-02 3.62527062233637e-02 +3.60568605756396e-02 3.58570035422022e-02 3.56530446837473e-02 3.54449129248923e-02 +3.52325572619618e-02 3.50159472814524e-02 3.47950734838428e-02 3.45699474094589e-02 +3.43406015651755e-02 3.41070891528279e-02 3.38694836022708e-02 3.36278779140668e-02 +3.33823838187660e-02 3.31331307616431e-02 3.28802647235731e-02 3.26239468904184e-02 +3.23643521848661e-02 3.21016676760661e-02 3.18360908836778e-02 3.15678279940030e-02 +3.12970920067775e-02 3.10241008318827e-02 3.07490753557326e-02 3.04722374973728e-02 +3.01938082743987e-02 2.99140058986672e-02 2.96330439214215e-02 2.93511294469031e-02 +2.90684614327667e-02 2.87852290946759e-02 2.85016104313315e-02 2.82177708848914e-02 +2.79338621502938e-02 2.76500211454076e-02 2.73663691522218e-02 2.70830111374718e-02 +2.68000352591965e-02 2.65175125637528e-02 2.62354968758016e-02 2.59540248817407e-02 +2.56731164050211e-02 2.53927748697608e-02 2.51129879470866e-02 2.48337283767161e-02 +2.45549549544485e-02 2.42766136744878e-02 2.39986390139090e-02 2.37209553450713e-02 +2.34434784604626e-02 2.31661171932668e-02 2.28887751159593e-02 2.26113522984164e-02 +2.23337471064103e-02 2.20558580209422e-02 2.17775854586607e-02 2.14988335736009e-02 +2.12195120206939e-02 2.09395376618998e-02 2.06588361964279e-02 2.03773436973177e-02 +2.00950080376406e-02 1.98117901907510e-02 1.95276653903455e-02 1.92426241375689e-02 +1.89566730440160e-02 1.86698355012124e-02 1.83821521689846e-02 1.80936812770376e-02 +1.78044987360293e-02 1.75146980564385e-02 1.72243900755467e-02 1.69337024948831e-02 +1.66427792324753e-02 1.63517795962090e-02 1.60608772864838e-02 1.57702592381628e-02 +1.54801243135086e-02 1.51906818593812e-02 1.49021501434156e-02 1.46147546851802e-02 +1.43287264994437e-02 1.40443002696203e-02 1.37617124702156e-02 1.34811994576591e-02 +1.32029955492623e-02 
1.29273311101965e-02 1.26544306683242e-02 1.23845110764581e-02 +1.21177797411538e-02 1.18544329364725e-02 1.15946542202939e-02 1.13386129697122e-02 +1.10864630508331e-02 1.08383416369100e-02 1.05943681872352e-02 1.03546435975432e-02 +1.01192495309178e-02 9.88824793632603e-03 9.66168075996577e-03 9.43956985260976e-03 +9.22191707410352e-03 9.00870459412250e-03 8.79989538625313e-03 8.59543391045176e-03 +8.39524697696968e-03 8.19924478293687e-03 8.00732211099040e-03 7.81935967763494e-03 +7.63522561744725e-03 7.45477708780515e-03 7.27786197754243e-03 7.10432070182939e-03 +6.93398806464819e-03 6.76669516950078e-03 6.60227135845357e-03 6.44054615928921e-03 +6.28135122041644e-03 6.12452221327596e-03 5.96990068227878e-03 5.81733582281504e-03 +5.66668616857789e-03 5.51782117034965e-03 5.37062264947718e-03 5.22498611053097e-03 +5.08082189905984e-03 4.93805619192695e-03 4.79663180941273e-03 4.65650884009158e-03 +4.51766507139668e-03 4.38009622078351e-03 4.24381596444567e-03 4.10885576262116e-03 +3.97526448262409e-03 3.84310782282669e-03 3.71246754288028e-03 3.58344050747553e-03 +3.45613755288694e-03 3.33068218739873e-03 3.20720913845619e-03 3.08586276100215e-03 +2.96679532293590e-03 2.85016518494444e-03 2.73613489310342e-03 2.62486920359883e-03 +2.51653305968818e-03 2.41128954157614e-03 2.30929781022901e-03 2.21071106628220e-03 +2.11567454510750e-03 2.02432356878921e-03 1.93678167522413e-03 1.85315884378483e-03 +1.77354983599071e-03 1.69803266839175e-03 1.62666723338611e-03 1.55949408195501e-03 +1.49653338027431e-03 1.43778404983396e-03 1.38322309801373e-03 1.33280514297874e-03 +1.28646213319323e-03 1.24410325774623e-03 1.20561503894597e-03 1.17086159322872e-03 +1.13968504032447e-03 1.11190603390964e-03 1.08732437989413e-03 1.06571970147563e-03 +1.04685210393238e-03 1.03046278795499e-03 1.01627455976079e-03 1.00399219137655e-03 +9.93302597809181e-04 9.83874822065162e-04 9.75359856724342e-04 9.67390383950630e-04 +9.59580585009704e-04 9.51526253932439e-04 9.42805543186159e-04 
9.32980763535652e-04 +9.21601742848727e-04 9.08211302475873e-04 8.92353414909149e-04 8.73584541472717e-04 +8.51488494419762e-04 8.25694910471434e-04 7.95901059231052e-04 7.61896251311558e-04 +7.23587586598823e-04 6.81025241236446e-04 6.34424998752067e-04 5.84185365050571e-04 +5.30896451585218e-04 4.75337940937241e-04 4.18463918030367e-04 3.61373177202302e-04 +3.05264773643558e-04 2.51379999099611e-04 2.00933497984825e-04 1.55037732625504e-04 +1.14626264416553e-04 8.03821526920402e-05 5.26780267691607e-05 3.15339579430237e-05 +1.65981250331966e-05 7.15349777321055e-06 2.15151456221187e-06 2.71236814037202e-07 +0.00000000000000e+00 diff --git a/tests/integrate/930_NO_BI2SE2CU2O2_GPU/INPUT b/tests/integrate/930_NO_BI2SE2CU2O2_GPU/INPUT new file mode 100644 index 0000000000..a6d4bf56a3 --- /dev/null +++ b/tests/integrate/930_NO_BI2SE2CU2O2_GPU/INPUT @@ -0,0 +1,26 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 30 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +ks_solver cusolver +device gpu +gamma_only 1 diff --git a/tests/integrate/930_NO_BI2SE2CU2O2_GPU/KPT b/tests/integrate/930_NO_BI2SE2CU2O2_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/930_NO_BI2SE2CU2O2_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/930_NO_BI2SE2CU2O2_GPU/STRU b/tests/integrate/930_NO_BI2SE2CU2O2_GPU/STRU new file mode 100644 index 0000000000..433e0a478f --- /dev/null +++ b/tests/integrate/930_NO_BI2SE2CU2O2_GPU/STRU @@ -0,0 +1,47 @@ +ATOMIC_SPECIES +Bi 83.000 ../../PP_ORB/Bi_ONCV_PBE-1.0.upf +Se 34.000 ../../PP_ORB/Se_ONCV_PBE-1.0.upf +Cu 29.000 ../../PP_ORB/Cu_ONCV_PBE-1.0.upf +O 8.000 ../../PP_ORB/O_ONCV_PBE-1.0.upf + 
+NUMERICAL_ORBITAL +../../PP_ORB/Bi_gga_9au_100Ry_2s2p2d.orb +../../PP_ORB/Se_gga_8au_100Ry_2s2p1d.orb +../../PP_ORB/Cu_gga_9au_100Ry_4s2p2d1f.orb +../../PP_ORB/O_gga_7au_100Ry_2s2p1d.orb + +LATTICE_CONSTANT +1.889716 + +LATTICE_VECTORS + 3.92604 0.00000 0.00000 + 0.00000 3.92604 0.00000 + 0.00000 0.00000 8.95669 + +ATOMIC_POSITIONS +Direct + +Bi +0.0 +2 + 0.2500000 0.2500000 0.1415833 1 1 1 + 0.7500000 0.7500000 0.8584167 1 1 1 + +Se +0.0 +2 + 0.2500000 0.2500000 0.6754258 1 1 1 + 0.7500000 0.7500000 0.3245741 1 1 1 + +Cu +0.0 +2 + 0.2500000 0.7500000 0.5000000 1 1 1 + 0.7500000 0.2500000 0.5000000 1 1 1 + +O +0.0 +2 + 0.2500000 0.7500000 0.0000000 1 1 1 + 0.7500000 0.2500000 0.0000000 1 1 1 + diff --git a/tests/integrate/930_NO_BI2SE2CU2O2_GPU/result.ref b/tests/integrate/930_NO_BI2SE2CU2O2_GPU/result.ref new file mode 100644 index 0000000000..6aeed1c252 --- /dev/null +++ b/tests/integrate/930_NO_BI2SE2CU2O2_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -15010.5138522188426577 +etotperatomref -1876.3142315274 +totalforceref 7.876160 +totalstressref 687.989082 +pointgroupref C_2h +spacegroupref D_4h +nksibzref 1 +totaltimeref 9.49 diff --git a/tests/integrate/931_NO_H20_GPU/INPUT b/tests/integrate/931_NO_H20_GPU/INPUT new file mode 100644 index 0000000000..b02963f8fe --- /dev/null +++ b/tests/integrate/931_NO_H20_GPU/INPUT @@ -0,0 +1,29 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf +nbands 6 +symmetry 0 + +#Parameters (2.Iteration) +ecutwfc 100 +scf_thr 1e-7 +scf_nmax 50 + +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gaussian +smearing_sigma 0.02 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.4 + +#Parameters (6.Deepks) +cal_force 1 +cal_stress 1 +ks_solver cusolver +device gpu +gamma_only 1 diff --git a/tests/integrate/931_NO_H20_GPU/KPT b/tests/integrate/931_NO_H20_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/931_NO_H20_GPU/KPT @@ 
-0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/931_NO_H20_GPU/STRU b/tests/integrate/931_NO_H20_GPU/STRU new file mode 100644 index 0000000000..ef3d1a974b --- /dev/null +++ b/tests/integrate/931_NO_H20_GPU/STRU @@ -0,0 +1,29 @@ +ATOMIC_SPECIES +H 1.008 ../../PP_ORB/H_ONCV_PBE-1.0.upf +O 15.9994 ../../PP_ORB/O_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/H_gga_8au_100Ry_2s1p.orb +../../PP_ORB/O_gga_7au_100Ry_2s2p1d.orb + + +LATTICE_CONSTANT +1 + +LATTICE_VECTORS +14 0 0 +0 14 0 +0 0 14 + +ATOMIC_POSITIONS +Cartesian + +H +0 +2 +-12.046787058887078 18.76558614676448 8.395247471328744 1 1 1 +-14.228868795885418 20.61549300274637 7.611989524516571 1 1 1 +O +0 +1 +-13.486789117423204 19.684192208418636 8.958321352749174 1 1 1 diff --git a/tests/integrate/931_NO_H20_GPU/result.ref b/tests/integrate/931_NO_H20_GPU/result.ref new file mode 100644 index 0000000000..30aff70bc5 --- /dev/null +++ b/tests/integrate/931_NO_H20_GPU/result.ref @@ -0,0 +1,5 @@ +etotref -466.4707274464855118 +etotperatomref -155.4902424822 +totalforceref 6.416124 +totalstressref 14.717452 +totaltimeref 8.75 diff --git a/tests/integrate/932_NO_H2_dzp_GPU/INPUT b/tests/integrate/932_NO_H2_dzp_GPU/INPUT new file mode 100644 index 0000000000..e8947e2b18 --- /dev/null +++ b/tests/integrate/932_NO_H2_dzp_GPU/INPUT @@ -0,0 +1,27 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 100 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu diff --git a/tests/integrate/932_NO_H2_dzp_GPU/KPT b/tests/integrate/932_NO_H2_dzp_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/932_NO_H2_dzp_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 
+Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/932_NO_H2_dzp_GPU/STRU b/tests/integrate/932_NO_H2_dzp_GPU/STRU new file mode 100644 index 0000000000..74a1931d0c --- /dev/null +++ b/tests/integrate/932_NO_H2_dzp_GPU/STRU @@ -0,0 +1,22 @@ +ATOMIC_SPECIES +H 1.008 ../../PP_ORB/H_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/H_gga_8au_100Ry_2s1p.orb + +LATTICE_CONSTANT +1.8897270 + +LATTICE_VECTORS + 5.0000000 0.0000000 0.0000000 + 0.0000000 5.0000000 0.0000000 + 0.0000000 0.0000000 5.0000000 + +ATOMIC_POSITIONS +Direct + +H +0.0 +2 + 0 0 0 1 1 1 + 0.74 0 0 1 1 1 \ No newline at end of file diff --git a/tests/integrate/932_NO_H2_dzp_GPU/result.ref b/tests/integrate/932_NO_H2_dzp_GPU/result.ref new file mode 100644 index 0000000000..66d77b9b62 --- /dev/null +++ b/tests/integrate/932_NO_H2_dzp_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -29.8971789774787098 +etotperatomref -14.9485894887 +totalforceref 8.433242 +totalstressref 94.560210 +pointgroupref C_4v +spacegroupref D_4h +nksibzref 1 +totaltimeref 2.74 diff --git a/tests/integrate/932_NO_H2_dzp_ns2_GPU/INPUT b/tests/integrate/932_NO_H2_dzp_ns2_GPU/INPUT new file mode 100644 index 0000000000..eff7e5af40 --- /dev/null +++ b/tests/integrate/932_NO_H2_dzp_ns2_GPU/INPUT @@ -0,0 +1,28 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu +nspin 2 diff --git a/tests/integrate/932_NO_H2_dzp_ns2_GPU/KPT b/tests/integrate/932_NO_H2_dzp_ns2_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/932_NO_H2_dzp_ns2_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git 
a/tests/integrate/932_NO_H2_dzp_ns2_GPU/STRU b/tests/integrate/932_NO_H2_dzp_ns2_GPU/STRU new file mode 100644 index 0000000000..74a1931d0c --- /dev/null +++ b/tests/integrate/932_NO_H2_dzp_ns2_GPU/STRU @@ -0,0 +1,22 @@ +ATOMIC_SPECIES +H 1.008 ../../PP_ORB/H_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/H_gga_8au_100Ry_2s1p.orb + +LATTICE_CONSTANT +1.8897270 + +LATTICE_VECTORS + 5.0000000 0.0000000 0.0000000 + 0.0000000 5.0000000 0.0000000 + 0.0000000 0.0000000 5.0000000 + +ATOMIC_POSITIONS +Direct + +H +0.0 +2 + 0 0 0 1 1 1 + 0.74 0 0 1 1 1 \ No newline at end of file diff --git a/tests/integrate/932_NO_H2_dzp_ns2_GPU/result.ref b/tests/integrate/932_NO_H2_dzp_ns2_GPU/result.ref new file mode 100644 index 0000000000..0eec60da0a --- /dev/null +++ b/tests/integrate/932_NO_H2_dzp_ns2_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -29.8972879599141912 +etotperatomref -14.9486439800 +totalforceref 8.429740 +totalstressref 94.665688 +pointgroupref C_4v +spacegroupref D_4h +nksibzref 1 +totaltimeref 4.60 diff --git a/tests/integrate/932_NO_H2_sz_GPU/INPUT b/tests/integrate/932_NO_H2_sz_GPU/INPUT new file mode 100644 index 0000000000..b9822a2a24 --- /dev/null +++ b/tests/integrate/932_NO_H2_sz_GPU/INPUT @@ -0,0 +1,27 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu diff --git a/tests/integrate/932_NO_H2_sz_GPU/KPT b/tests/integrate/932_NO_H2_sz_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/932_NO_H2_sz_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/932_NO_H2_sz_GPU/STRU b/tests/integrate/932_NO_H2_sz_GPU/STRU new file 
mode 100644 index 0000000000..74914ffebc --- /dev/null +++ b/tests/integrate/932_NO_H2_sz_GPU/STRU @@ -0,0 +1,22 @@ +ATOMIC_SPECIES +H 1.008 ../../PP_ORB/H_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/H_gga_8au_100Ry_1s.orb + +LATTICE_CONSTANT +1.8897270 + +LATTICE_VECTORS + 5.0000000 0.0000000 0.0000000 + 0.0000000 5.0000000 0.0000000 + 0.0000000 0.0000000 5.0000000 + +ATOMIC_POSITIONS +Direct + +H +0.0 +2 + 0 0 0 1 1 1 + 0.74 0 0 1 1 1 \ No newline at end of file diff --git a/tests/integrate/932_NO_H2_sz_GPU/result.ref b/tests/integrate/932_NO_H2_sz_GPU/result.ref new file mode 100644 index 0000000000..702b51c84f --- /dev/null +++ b/tests/integrate/932_NO_H2_sz_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -29.7733318630435235 +etotperatomref -14.8866659315 +totalforceref 9.449556 +totalstressref 103.659281 +pointgroupref C_4v +spacegroupref D_4h +nksibzref 1 +totaltimeref 1.74 diff --git a/tests/integrate/932_NO_H2_sz_ns2_GPU/INPUT b/tests/integrate/932_NO_H2_sz_ns2_GPU/INPUT new file mode 100644 index 0000000000..e5f00a390c --- /dev/null +++ b/tests/integrate/932_NO_H2_sz_ns2_GPU/INPUT @@ -0,0 +1,28 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu +nspin 2 diff --git a/tests/integrate/932_NO_H2_sz_ns2_GPU/KPT b/tests/integrate/932_NO_H2_sz_ns2_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/932_NO_H2_sz_ns2_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/932_NO_H2_sz_ns2_GPU/STRU b/tests/integrate/932_NO_H2_sz_ns2_GPU/STRU new file mode 100644 index 0000000000..74914ffebc --- /dev/null +++ 
b/tests/integrate/932_NO_H2_sz_ns2_GPU/STRU @@ -0,0 +1,22 @@ +ATOMIC_SPECIES +H 1.008 ../../PP_ORB/H_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/H_gga_8au_100Ry_1s.orb + +LATTICE_CONSTANT +1.8897270 + +LATTICE_VECTORS + 5.0000000 0.0000000 0.0000000 + 0.0000000 5.0000000 0.0000000 + 0.0000000 0.0000000 5.0000000 + +ATOMIC_POSITIONS +Direct + +H +0.0 +2 + 0 0 0 1 1 1 + 0.74 0 0 1 1 1 \ No newline at end of file diff --git a/tests/integrate/932_NO_H2_sz_ns2_GPU/result.ref b/tests/integrate/932_NO_H2_sz_ns2_GPU/result.ref new file mode 100644 index 0000000000..7e7d1d2ac4 --- /dev/null +++ b/tests/integrate/932_NO_H2_sz_ns2_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -29.7733486676083210 +etotperatomref -14.8866743338 +totalforceref 9.449540 +totalstressref 103.663285 +pointgroupref C_4v +spacegroupref D_4h +nksibzref 1 +totaltimeref 2.93 diff --git a/tests/integrate/933_NO_H_dzp_GPU/INPUT b/tests/integrate/933_NO_H_dzp_GPU/INPUT new file mode 100644 index 0000000000..a9c11eca9e --- /dev/null +++ b/tests/integrate/933_NO_H_dzp_GPU/INPUT @@ -0,0 +1,27 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 100 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu diff --git a/tests/integrate/933_NO_H_dzp_GPU/KPT b/tests/integrate/933_NO_H_dzp_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/933_NO_H_dzp_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/933_NO_H_dzp_GPU/STRU b/tests/integrate/933_NO_H_dzp_GPU/STRU new file mode 100644 index 0000000000..735c1ead52 --- /dev/null +++ b/tests/integrate/933_NO_H_dzp_GPU/STRU @@ -0,0 +1,21 @@ +ATOMIC_SPECIES +H 1.008 
../../PP_ORB/H_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/H_gga_8au_100Ry_2s1p.orb + +LATTICE_CONSTANT +1.8897270 + +LATTICE_VECTORS + 5.0000000 0.0000000 0.0000000 + 0.0000000 5.0000000 0.0000000 + 0.0000000 0.0000000 5.0000000 + +ATOMIC_POSITIONS +Direct + +H +0.0 +1 + 0 0 0 1 1 1 \ No newline at end of file diff --git a/tests/integrate/933_NO_H_dzp_GPU/result.ref b/tests/integrate/933_NO_H_dzp_GPU/result.ref new file mode 100644 index 0000000000..423c266f4e --- /dev/null +++ b/tests/integrate/933_NO_H_dzp_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -12.5682523517564988 +etotperatomref -12.5682523518 +totalforceref 0.000000 +totalstressref 12.070221 +pointgroupref O_h +spacegroupref O_h +nksibzref 1 +totaltimeref 2.30 diff --git a/tests/integrate/933_NO_H_dzp_ns2_GPU/INPUT b/tests/integrate/933_NO_H_dzp_ns2_GPU/INPUT new file mode 100644 index 0000000000..a01a8bbb3c --- /dev/null +++ b/tests/integrate/933_NO_H_dzp_ns2_GPU/INPUT @@ -0,0 +1,31 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 100 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.4 +mixing_beta_mag 0.4 +mixing_gg0 0.0 +mixing_gg0_mag 0.0 +gamma_only 1 +ks_solver genelpa +device cpu +nspin 2 diff --git a/tests/integrate/933_NO_H_dzp_ns2_GPU/KPT b/tests/integrate/933_NO_H_dzp_ns2_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/933_NO_H_dzp_ns2_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/933_NO_H_dzp_ns2_GPU/STRU b/tests/integrate/933_NO_H_dzp_ns2_GPU/STRU new file mode 100644 index 0000000000..de4fa11f4c --- /dev/null +++ b/tests/integrate/933_NO_H_dzp_ns2_GPU/STRU @@ -0,0 +1,21 @@ +ATOMIC_SPECIES +H 1.008 ../../PP_ORB/H_ONCV_PBE-1.0.upf + 
+NUMERICAL_ORBITAL +../../PP_ORB/H_gga_8au_100Ry_2s1p.orb + +LATTICE_CONSTANT +1.8897270 + +LATTICE_VECTORS + 5.0000000 0.0000000 0.0000000 + 0.0000000 5.0000000 0.0000000 + 0.0000000 0.0000000 5.0000000 + +ATOMIC_POSITIONS +Direct + +H +1 +1 + 0 0 0 1 1 1 \ No newline at end of file diff --git a/tests/integrate/933_NO_H_dzp_ns2_GPU/result.ref b/tests/integrate/933_NO_H_dzp_ns2_GPU/result.ref new file mode 100644 index 0000000000..18013a7a7f --- /dev/null +++ b/tests/integrate/933_NO_H_dzp_ns2_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -13.6633692527736095 +etotperatomref -13.6633692528 +totalforceref 0.000000 +totalstressref 9.320412 +pointgroupref O_h +spacegroupref O_h +nksibzref 1 +totaltimeref 1.59 diff --git a/tests/integrate/934_NO_Si2_dzp_GPU/INPUT b/tests/integrate/934_NO_Si2_dzp_GPU/INPUT new file mode 100644 index 0000000000..b9822a2a24 --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_GPU/INPUT @@ -0,0 +1,27 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu diff --git a/tests/integrate/934_NO_Si2_dzp_GPU/KPT b/tests/integrate/934_NO_Si2_dzp_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/934_NO_Si2_dzp_GPU/STRU b/tests/integrate/934_NO_Si2_dzp_GPU/STRU new file mode 100644 index 0000000000..3cda19761c --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_GPU/STRU @@ -0,0 +1,21 @@ +ATOMIC_SPECIES +Si 14.000 ../../PP_ORB/Si_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/Si_gga_8au_100Ry_2s2p1d.orb + +LATTICE_CONSTANT +10.2 + +LATTICE_VECTORS 
+0.5 0.5 0.0 #Lattice vector 1 +0.5 0.0 0.5 #Lattice vector 2 +0.0 0.5 0.5 #Lattice vector 3 + +ATOMIC_POSITIONS +Cartesian #Cartesian(Unit is LATTICE_CONSTANT) +Si #Name of element +0.0 #Magnetic for this element. +2 #Number of atoms +0.00 0.00 0.00 0 0 0 #x,y,z, move_x, move_y, move_z +0.25 0.25 0.25 1 1 1 \ No newline at end of file diff --git a/tests/integrate/934_NO_Si2_dzp_GPU/result.ref b/tests/integrate/934_NO_Si2_dzp_GPU/result.ref new file mode 100644 index 0000000000..206c2d5097 --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -196.6221723701324322 +etotperatomref -98.3110861851 +totalforceref 0.000000 +totalstressref 1380.541062 +pointgroupref T_d +spacegroupref O_h +nksibzref 1 +totaltimeref 2.56 diff --git a/tests/integrate/934_NO_Si2_dzp_neq_GPU/INPUT b/tests/integrate/934_NO_Si2_dzp_neq_GPU/INPUT new file mode 100644 index 0000000000..b9822a2a24 --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_neq_GPU/INPUT @@ -0,0 +1,27 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu diff --git a/tests/integrate/934_NO_Si2_dzp_neq_GPU/KPT b/tests/integrate/934_NO_Si2_dzp_neq_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_neq_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/934_NO_Si2_dzp_neq_GPU/STRU b/tests/integrate/934_NO_Si2_dzp_neq_GPU/STRU new file mode 100644 index 0000000000..148f318aca --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_neq_GPU/STRU @@ -0,0 +1,21 @@ +ATOMIC_SPECIES +Si 14.000 ../../PP_ORB/Si_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL 
+../../PP_ORB/Si_gga_8au_100Ry_2s2p1d.orb + +LATTICE_CONSTANT +10.2 + +LATTICE_VECTORS +0.8 0.8 0.0 #Lattice vector 1 +0.5 0.0 0.5 #Lattice vector 2 +0.0 0.2 0.2 #Lattice vector 3 + +ATOMIC_POSITIONS +Cartesian #Cartesian(Unit is LATTICE_CONSTANT) +Si #Name of element +0.0 #Magnetic for this element. +2 #Number of atoms +0.00 0.00 0.00 0 0 0 #x,y,z, move_x, move_y, move_z +0.25 0.25 0.25 1 1 1 \ No newline at end of file diff --git a/tests/integrate/934_NO_Si2_dzp_neq_GPU/result.ref b/tests/integrate/934_NO_Si2_dzp_neq_GPU/result.ref new file mode 100644 index 0000000000..66c8d04b01 --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_neq_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -200.9596900406533280 +etotperatomref -100.4798450203 +totalforceref 143.742152 +totalstressref 18521.013103 +pointgroupref C_1 +spacegroupref S_2 +nksibzref 1 +totaltimeref 4.22 diff --git a/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/INPUT b/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/INPUT new file mode 100644 index 0000000000..eff7e5af40 --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/INPUT @@ -0,0 +1,28 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu +nspin 2 diff --git a/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/KPT b/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/STRU b/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/STRU new file mode 100644 index 0000000000..148f318aca --- /dev/null +++ 
b/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/STRU @@ -0,0 +1,21 @@ +ATOMIC_SPECIES +Si 14.000 ../../PP_ORB/Si_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/Si_gga_8au_100Ry_2s2p1d.orb + +LATTICE_CONSTANT +10.2 + +LATTICE_VECTORS +0.8 0.8 0.0 #Lattice vector 1 +0.5 0.0 0.5 #Lattice vector 2 +0.0 0.2 0.2 #Lattice vector 3 + +ATOMIC_POSITIONS +Cartesian #Cartesian(Unit is LATTICE_CONSTANT) +Si #Name of element +0.0 #Magnetic for this element. +2 #Number of atoms +0.00 0.00 0.00 0 0 0 #x,y,z, move_x, move_y, move_z +0.25 0.25 0.25 1 1 1 \ No newline at end of file diff --git a/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/result.ref b/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/result.ref new file mode 100644 index 0000000000..46619fa8c3 --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_neq_ns2_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -200.9596872554600964 +etotperatomref -100.4798436277 +totalforceref 143.742152 +totalstressref 18521.013099 +pointgroupref C_1 +spacegroupref S_2 +nksibzref 1 +totaltimeref 6.92 diff --git a/tests/integrate/934_NO_Si2_dzp_ns2_GPU/INPUT b/tests/integrate/934_NO_Si2_dzp_ns2_GPU/INPUT new file mode 100644 index 0000000000..eff7e5af40 --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_ns2_GPU/INPUT @@ -0,0 +1,28 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu +nspin 2 diff --git a/tests/integrate/934_NO_Si2_dzp_ns2_GPU/KPT b/tests/integrate/934_NO_Si2_dzp_ns2_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_ns2_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git 
a/tests/integrate/934_NO_Si2_dzp_ns2_GPU/STRU b/tests/integrate/934_NO_Si2_dzp_ns2_GPU/STRU new file mode 100644 index 0000000000..3cda19761c --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_ns2_GPU/STRU @@ -0,0 +1,21 @@ +ATOMIC_SPECIES +Si 14.000 ../../PP_ORB/Si_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/Si_gga_8au_100Ry_2s2p1d.orb + +LATTICE_CONSTANT +10.2 + +LATTICE_VECTORS +0.5 0.5 0.0 #Lattice vector 1 +0.5 0.0 0.5 #Lattice vector 2 +0.0 0.5 0.5 #Lattice vector 3 + +ATOMIC_POSITIONS +Cartesian #Cartesian(Unit is LATTICE_CONSTANT) +Si #Name of element +0.0 #Magnetic for this element. +2 #Number of atoms +0.00 0.00 0.00 0 0 0 #x,y,z, move_x, move_y, move_z +0.25 0.25 0.25 1 1 1 \ No newline at end of file diff --git a/tests/integrate/934_NO_Si2_dzp_ns2_GPU/result.ref b/tests/integrate/934_NO_Si2_dzp_ns2_GPU/result.ref new file mode 100644 index 0000000000..f8a3bf62d6 --- /dev/null +++ b/tests/integrate/934_NO_Si2_dzp_ns2_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -196.6221677973298654 +etotperatomref -98.3110838987 +totalforceref 0.000000 +totalstressref 1380.541365 +pointgroupref T_d +spacegroupref O_h +nksibzref 1 +totaltimeref 4.24 diff --git a/tests/integrate/934_NO_Si2_tzdp_GPU/INPUT b/tests/integrate/934_NO_Si2_tzdp_GPU/INPUT new file mode 100644 index 0000000000..b9822a2a24 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_GPU/INPUT @@ -0,0 +1,27 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu diff --git a/tests/integrate/934_NO_Si2_tzdp_GPU/KPT b/tests/integrate/934_NO_Si2_tzdp_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ 
b/tests/integrate/934_NO_Si2_tzdp_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/934_NO_Si2_tzdp_GPU/STRU b/tests/integrate/934_NO_Si2_tzdp_GPU/STRU new file mode 100644 index 0000000000..c6dc6b49b9 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_GPU/STRU @@ -0,0 +1,21 @@ +ATOMIC_SPECIES +Si 14.000 ../../PP_ORB/Si_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/Si_gga_8au_100Ry_3s3p2d.orb + +LATTICE_CONSTANT +10.2 + +LATTICE_VECTORS +0.5 0.5 0.0 #Lattice vector 1 +0.5 0.0 0.5 #Lattice vector 2 +0.0 0.5 0.5 #Lattice vector 3 + +ATOMIC_POSITIONS +Cartesian #Cartesian(Unit is LATTICE_CONSTANT) +Si #Name of element +0.0 #Magnetic for this element. +2 #Number of atoms +0.00 0.00 0.00 0 0 0 #x,y,z, move_x, move_y, move_z +0.25 0.25 0.25 1 1 1 \ No newline at end of file diff --git a/tests/integrate/934_NO_Si2_tzdp_GPU/result.ref b/tests/integrate/934_NO_Si2_tzdp_GPU/result.ref new file mode 100644 index 0000000000..b517214246 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -197.1470579143303610 +etotperatomref -98.5735289572 +totalforceref 0.000000 +totalstressref 1377.912180 +pointgroupref T_d +spacegroupref O_h +nksibzref 1 +totaltimeref 4.35 diff --git a/tests/integrate/934_NO_Si2_tzdp_neq_GPU/INPUT b/tests/integrate/934_NO_Si2_tzdp_neq_GPU/INPUT new file mode 100644 index 0000000000..b9822a2a24 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_neq_GPU/INPUT @@ -0,0 +1,27 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu diff --git a/tests/integrate/934_NO_Si2_tzdp_neq_GPU/KPT 
b/tests/integrate/934_NO_Si2_tzdp_neq_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_neq_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/934_NO_Si2_tzdp_neq_GPU/STRU b/tests/integrate/934_NO_Si2_tzdp_neq_GPU/STRU new file mode 100644 index 0000000000..a4b3d70057 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_neq_GPU/STRU @@ -0,0 +1,21 @@ +ATOMIC_SPECIES +Si 14.000 ../../PP_ORB/Si_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/Si_gga_8au_100Ry_3s3p2d.orb + +LATTICE_CONSTANT +10.2 + +LATTICE_VECTORS +0.9 0.9 0.0 #Lattice vector 1 +0.5 0.0 0.5 #Lattice vector 2 +0.0 0.3 0.3 #Lattice vector 3 + +ATOMIC_POSITIONS +Cartesian #Cartesian(Unit is LATTICE_CONSTANT) +Si #Name of element +0.0 #Magnetic for this element. +2 #Number of atoms +0.00 0.00 0.00 0 0 0 #x,y,z, move_x, move_y, move_z +0.25 0.25 0.25 1 1 1 \ No newline at end of file diff --git a/tests/integrate/934_NO_Si2_tzdp_neq_GPU/result.ref b/tests/integrate/934_NO_Si2_tzdp_neq_GPU/result.ref new file mode 100644 index 0000000000..cb23324273 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_neq_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -202.0113293338589529 +etotperatomref -101.0056646669 +totalforceref 154.397572 +totalstressref 8606.355128 +pointgroupref C_1 +spacegroupref S_2 +nksibzref 1 +totaltimeref 4.23 diff --git a/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/INPUT b/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/INPUT new file mode 100644 index 0000000000..eff7e5af40 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/INPUT @@ -0,0 +1,28 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 +cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type 
broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu +nspin 2 diff --git a/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/KPT b/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/STRU b/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/STRU new file mode 100644 index 0000000000..a4b3d70057 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/STRU @@ -0,0 +1,21 @@ +ATOMIC_SPECIES +Si 14.000 ../../PP_ORB/Si_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/Si_gga_8au_100Ry_3s3p2d.orb + +LATTICE_CONSTANT +10.2 + +LATTICE_VECTORS +0.9 0.9 0.0 #Lattice vector 1 +0.5 0.0 0.5 #Lattice vector 2 +0.0 0.3 0.3 #Lattice vector 3 + +ATOMIC_POSITIONS +Cartesian #Cartesian(Unit is LATTICE_CONSTANT) +Si #Name of element +0.0 #Magnetic for this element. +2 #Number of atoms +0.00 0.00 0.00 0 0 0 #x,y,z, move_x, move_y, move_z +0.25 0.25 0.25 1 1 1 \ No newline at end of file diff --git a/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/result.ref b/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/result.ref new file mode 100644 index 0000000000..2885572b5e --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_neq_ns2_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -202.0113267424005130 +etotperatomref -101.0056633712 +totalforceref 154.397574 +totalstressref 8606.356131 +pointgroupref C_1 +spacegroupref S_2 +nksibzref 1 +totaltimeref 7.21 diff --git a/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/INPUT b/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/INPUT new file mode 100644 index 0000000000..eff7e5af40 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/INPUT @@ -0,0 +1,28 @@ +INPUT_PARAMETERS +#Parameters (1.General) +suffix autotest +calculation scf + +#nbands 8 +symmetry 1 + +#Parameters (2.Iteration) +ecutwfc 50 +scf_thr 1e-7 +scf_nmax 100 +cal_force 1 
+cal_stress 1 +#Parameters (3.Basis) +basis_type lcao + +#Parameters (4.Smearing) +smearing_method gauss +smearing_sigma 0.002 + +#Parameters (5.Mixing) +mixing_type broyden +mixing_beta 0.3 +gamma_only 1 +ks_solver cusolver +device gpu +nspin 2 diff --git a/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/KPT b/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/KPT new file mode 100644 index 0000000000..c289c0158a --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +1 1 1 0 0 0 diff --git a/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/STRU b/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/STRU new file mode 100644 index 0000000000..c6dc6b49b9 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/STRU @@ -0,0 +1,21 @@ +ATOMIC_SPECIES +Si 14.000 ../../PP_ORB/Si_ONCV_PBE-1.0.upf + +NUMERICAL_ORBITAL +../../PP_ORB/Si_gga_8au_100Ry_3s3p2d.orb + +LATTICE_CONSTANT +10.2 + +LATTICE_VECTORS +0.5 0.5 0.0 #Lattice vector 1 +0.5 0.0 0.5 #Lattice vector 2 +0.0 0.5 0.5 #Lattice vector 3 + +ATOMIC_POSITIONS +Cartesian #Cartesian(Unit is LATTICE_CONSTANT) +Si #Name of element +0.0 #Magnetic for this element. 
+2 #Number of atoms +0.00 0.00 0.00 0 0 0 #x,y,z, move_x, move_y, move_z +0.25 0.25 0.25 1 1 1 \ No newline at end of file diff --git a/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/result.ref b/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/result.ref new file mode 100644 index 0000000000..d0c5e0a4d9 --- /dev/null +++ b/tests/integrate/934_NO_Si2_tzdp_ns2_GPU/result.ref @@ -0,0 +1,8 @@ +etotref -197.1470535849873045 +etotperatomref -98.5735267925 +totalforceref 0.000000 +totalstressref 1377.912414 +pointgroupref T_d +spacegroupref O_h +nksibzref 1 +totaltimeref 6.32 diff --git a/tests/integrate/CASES_GPU.txt b/tests/integrate/CASES_GPU.txt index e69de29bb2..707eebaa0f 100644 --- a/tests/integrate/CASES_GPU.txt +++ b/tests/integrate/CASES_GPU.txt @@ -0,0 +1,16 @@ +930_NO_BI2SE2CU2O2_GPU +931_NO_H20_GPU +932_NO_H2_dzp_GPU +932_NO_H2_dzp_ns2_GPU +932_NO_H2_sz_GPU +932_NO_H2_sz_ns2_GPU +933_NO_H_dzp_GPU +933_NO_H_dzp_ns2_GPU +934_NO_Si2_dzp_GPU +934_NO_Si2_dzp_neq_GPU +934_NO_Si2_dzp_neq_ns2_GPU +934_NO_Si2_dzp_ns2_GPU +934_NO_Si2_tzdp_GPU +934_NO_Si2_tzdp_neq_GPU +934_NO_Si2_tzdp_neq_ns2_GPU +934_NO_Si2_tzdp_ns2_GPU \ No newline at end of file