From 76deb2af4e1a9403587b1638026c4feeadcc4e9e Mon Sep 17 00:00:00 2001 From: toxa81 Date: Thu, 11 Jul 2019 12:56:58 +0200 Subject: [PATCH 01/12] add extra charge --- src/Density/density.hpp | 2 +- src/Density/generate_valence.hpp | 13 ++++++++++--- src/K_point/k_point_set.hpp | 27 +++++++++------------------ src/dft_ground_state.cpp | 2 +- src/input.hpp | 4 ++++ src/simulation_context.hpp | 1 + 6 files changed, 26 insertions(+), 23 deletions(-) diff --git a/src/Density/density.hpp b/src/Density/density.hpp index 2c18e0528..a8fd6110b 100644 --- a/src/Density/density.hpp +++ b/src/Density/density.hpp @@ -139,7 +139,7 @@ class Density : public Field4D int ia{-1}; - /// ae and ps local unified densities+magnetization + /* AE and PS local unified densities + magnetization */ std::vector> ae_density_; std::vector> ps_density_; }; diff --git a/src/Density/generate_valence.hpp b/src/Density/generate_valence.hpp index c47fc22f6..bbc9e32bd 100644 --- a/src/Density/generate_valence.hpp +++ b/src/Density/generate_valence.hpp @@ -45,12 +45,13 @@ inline void Density::generate_valence(K_point_set const& ks__) TERMINATE(s); } - if (std::abs(occ_val - unit_cell_.num_valence_electrons()) > 1e-8 && ctx_.comm().rank() == 0) { + if (std::abs(occ_val - unit_cell_.num_valence_electrons() + ctx_.parameters_input().extra_charge_) > 1e-8 && + ctx_.comm().rank() == 0) { std::stringstream s; s << "wrong band occupancies" << std::endl << " computed : " << occ_val << std::endl - << " required : " << unit_cell_.num_valence_electrons() << std::endl - << " difference : " << std::abs(occ_val - unit_cell_.num_valence_electrons()); + << " required : " << unit_cell_.num_valence_electrons() - ctx_.parameters_input().extra_charge_ << std::endl + << " difference : " << std::abs(occ_val - unit_cell_.num_valence_electrons() + ctx_.parameters_input().extra_charge_); WARNING(s); } @@ -131,6 +132,12 @@ inline void Density::generate_valence(K_point_set const& ks__) if (!ctx_.full_potential()) { augment(); + /* remove extra chanrge */ + if (ctx_.gvec().comm().rank() == 0) { + rho().f_pw_local(0) += ctx_.parameters_input().extra_charge_ / ctx_.unit_cell().omega(); + + } + if (ctx_.control().print_hash_ && ctx_.comm().rank() == 0) { auto h = mdarray(&rho().f_pw_local(0), ctx_.gvec().count()).hash(); utils::print_hash("rho", h); diff --git a/src/K_point/k_point_set.hpp b/src/K_point/k_point_set.hpp index d6e82de3a..6136e65f7 100644 --- a/src/K_point/k_point_set.hpp +++ b/src/K_point/k_point_set.hpp @@ -444,9 +444,12 @@ inline void K_point_set::find_band_occupancies() double ne{0}; + /* target number of electrons */ + double ne_target = unit_cell_.num_valence_electrons() - ctx_.parameters_input().extra_charge_; + int step{0}; /* calculate occupations */ - while (std::abs(ne - unit_cell_.num_valence_electrons()) >= 1e-11) { + while (std::abs(ne - ne_target) >= 1e-11) { /* update Efermi */ ef += de; /* compute total number of electrons */ @@ -454,7 +457,8 @@ inline void K_point_set::find_band_occupancies() for (int ik = 0; ik < num_kpoints(); ik++) { for (int ispn = 0; ispn < ctx_.num_spin_dims(); ispn++) { for (int j = 0; j < ctx_.num_bands(); j++) { - bnd_occ(j, ispn, ik) = smearing::gaussian(kpoints_[ik]->band_energy(j, ispn) - ef, ctx_.smearing_width()) * + bnd_occ(j, ispn, ik) = + smearing::gaussian(kpoints_[ik]->band_energy(j, ispn) - ef, ctx_.smearing_width()) * ctx_.max_occupancy(); ne += bnd_occ(j, ispn, ik) * kpoints_[ik]->weight(); } @@ -462,7 +466,7 @@ inline void K_point_set::find_band_occupancies() } sp = s; - s = (ne > unit_cell_.num_valence_electrons()) ? -1 : 1; + s = (ne > ne_target) ? -1 : 1; /* reduce de step if we change the direction, otherwise increase the step */ de = (s != sp) ? (-de * 0.5) : (de * 1.25); @@ -470,18 +474,6 @@ inline void K_point_set::find_band_occupancies() std::stringstream s; s << "search of band occupancies failed after 10000 steps"; TERMINATE(s); - - //ef = 0; - - //for (int ik = 0; ik < num_kpoints(); ik++) { - // ne = unit_cell_.num_valence_electrons(); - // for (int j = 0; j < ctx_.num_bands(); j++) { - // bnd_occ(j, ik) = std::min(ne, static_cast(ctx_.max_occupancy())); - // ne = std::max(ne - ctx_.max_occupancy(), 0.0); - // } - //} - - //break; } step++; } @@ -498,9 +490,8 @@ inline void K_point_set::find_band_occupancies() band_gap_ = 0.0; - int nve = static_cast(unit_cell_.num_valence_electrons() + 1e-12); - if (ctx_.num_spins() == 2 || - (std::abs(nve - unit_cell_.num_valence_electrons()) < 1e-12 && nve % 2 == 0)) { + int nve = static_cast(ne_target + 1e-12); + if (ctx_.num_spins() == 2 || (std::abs(nve - ne_target) < 1e-12 && nve % 2 == 0)) { /* find band gap */ std::vector> eband; std::pair eminmax; diff --git a/src/dft_ground_state.cpp b/src/dft_ground_state.cpp index 39341b433..6e4415fda 100644 --- a/src/dft_ground_state.cpp +++ b/src/dft_ground_state.cpp @@ -506,7 +506,7 @@ void DFT_ground_state::print_info() if (ctx_.comm().rank() == 0 && ctx_.control().verbosity_ >= 1) { if (ctx_.full_potential()) { - double total_core_leakage = 0.0; + double total_core_leakage{0.0}; printf("\n"); printf("Charges and magnetic moments\n"); for (int i = 0; i < 80; i++) { diff --git a/src/input.hpp b/src/input.hpp index 86d868667..0e509e37d 100644 --- a/src/input.hpp +++ b/src/input.hpp @@ -523,6 +523,9 @@ struct Parameters_input /// Reduction of the auxiliary magnetic field at each SCF step. double reduce_aux_bf_{0.0}; + /// Introduce extra charge to the system. Positive charge means extra holes, negative charge - extra electrons. + double extra_charge_{0.0}; + void read(json const& parser) { if (parser.count("parameters")) { @@ -570,6 +573,7 @@ struct Parameters_input molecule_ = parser["parameters"].value("molecule", molecule_); nn_radius_ = parser["parameters"].value("nn_radius", nn_radius_); reduce_aux_bf_ = parser["parameters"].value("reduce_aux_bf", reduce_aux_bf_); + extra_charge_ = parser["parameters"].value("extra_charge", extra_charge_); if (parser["parameters"].count("spin_orbit")) { so_correction_ = parser["parameters"].value("spin_orbit", so_correction_); diff --git a/src/simulation_context.hpp b/src/simulation_context.hpp index e53197794..be1943f65 100644 --- a/src/simulation_context.hpp +++ b/src/simulation_context.hpp @@ -1657,6 +1657,7 @@ inline void Simulation_context::print_info() const printf("number of core electrons : %f\n", unit_cell().num_core_electrons()); printf("number of valence electrons : %f\n", unit_cell().num_valence_electrons()); printf("total number of electrons : %f\n", unit_cell().num_electrons()); + printf("extra charge : %f\n", parameters_input().extra_charge_); printf("total number of aw basis functions : %i\n", unit_cell().mt_aw_basis_size()); printf("total number of lo basis functions : %i\n", unit_cell().mt_lo_basis_size()); printf("number of first-variational states : %i\n", num_fv_states()); From 9000600a7147f9e98ecc7841bd27ba2c8c415a0f Mon Sep 17 00:00:00 2001 From: toxa81 Date: Mon, 15 Jul 2019 09:57:43 +0200 Subject: [PATCH 02/12] add new api functions --- VERSION | 2 +- doc/doxygen.cfg | 2 +- src/constants.hpp | 4 +-- src/generated.f90 | 51 ++++++++++++++++++++++++++++++++ src/sirius_api.cpp | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 127 insertions(+), 4 deletions(-) diff --git a/VERSION b/VERSION index 024b066c0..798e38995 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -6.2.1 +6.3.0 diff --git a/doc/doxygen.cfg b/doc/doxygen.cfg index 13ccefdeb..115dc32ac 100644 --- a/doc/doxygen.cfg +++ b/doc/doxygen.cfg @@ -38,7 +38,7 @@ PROJECT_NAME = "SIRIUS" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = "6.2.1" +PROJECT_NUMBER = "6.3.0" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/src/constants.hpp b/src/constants.hpp index 44d4e46ee..00518db7a 100644 --- a/src/constants.hpp +++ b/src/constants.hpp @@ -28,8 +28,8 @@ #include const int major_version = 6; -const int minor_version = 2; -const int revision = 1; +const int minor_version = 3; +const int revision = 0; /// NIST value for the inverse fine structure (http://physics.nist.gov/cuu/Constants/index.html) const double speed_of_light = 137.035999139; diff --git a/src/generated.f90 b/src/generated.f90 index 71e8bbba7..e87574a3a 100644 --- a/src/generated.f90 +++ b/src/generated.f90 @@ -2964,3 +2964,54 @@ subroutine sirius_get_fv_eigen_values_aux(handler,ik,fv_eval,num_fv_states)& call sirius_get_fv_eigen_values_aux(handler,ik,fv_eval,num_fv_states) end subroutine sirius_get_fv_eigen_values +!> @brief Set the values of the function on the regular grid. +!> @param [in] handler DFT ground state handler. +!> @param [in] label Label of the function. +!> @param [in] values Values of the function. +!> @param [in] grid_dims Dimensions of the FFT grid. +!> @param [in] transform_to_pw If true, transform function to PW domain. +subroutine sirius_set_rg_values(handler,label,values,grid_dims,transform_to_pw) +implicit none +type(C_PTR), intent(in) :: handler +character(C_CHAR), dimension(*), intent(in) :: label +real(C_DOUBLE), intent(in) :: values +integer(C_INT), intent(in) :: grid_dims +logical(C_BOOL), optional, target, intent(in) :: transform_to_pw +type(C_PTR) :: transform_to_pw_ptr +interface +subroutine sirius_set_rg_values_aux(handler,label,values,grid_dims,transform_to_pw)& +&bind(C, name="sirius_set_rg_values") +use, intrinsic :: ISO_C_BINDING +type(C_PTR), intent(in) :: handler +character(C_CHAR), dimension(*), intent(in) :: label +real(C_DOUBLE), intent(in) :: values +integer(C_INT), intent(in) :: grid_dims +type(C_PTR), value :: transform_to_pw +end subroutine +end interface + +transform_to_pw_ptr = C_NULL_PTR +if (present(transform_to_pw)) transform_to_pw_ptr = C_LOC(transform_to_pw) + +call sirius_set_rg_values_aux(handler,label,values,grid_dims,transform_to_pw_ptr) +end subroutine sirius_set_rg_values + +!> @brief Get the total magnetization of the system. +!> @param [in] handler DFT ground state handler. +!> @param [out] mag 3D magnetization vector (x,y,z components). +subroutine sirius_get_total_magnetization(handler,mag) +implicit none +type(C_PTR), intent(in) :: handler +real(C_DOUBLE), intent(out) :: mag +interface +subroutine sirius_get_total_magnetization_aux(handler,mag)& +&bind(C, name="sirius_get_total_magnetization") +use, intrinsic :: ISO_C_BINDING +type(C_PTR), intent(in) :: handler +real(C_DOUBLE), intent(out) :: mag +end subroutine +end interface + +call sirius_get_total_magnetization_aux(handler,mag) +end subroutine sirius_get_total_magnetization + diff --git a/src/sirius_api.cpp b/src/sirius_api.cpp index 6d9cc1c04..77bd1f755 100644 --- a/src/sirius_api.cpp +++ b/src/sirius_api.cpp @@ -2920,4 +2920,76 @@ void sirius_get_fv_eigen_values(void* const* handler__, } } +/* @fortran begin function void sirius_set_rg_values Set the values of the function on the regular grid. + @fortran argument in required void* handler DFT ground state handler. + @fortran argument in required string label Label of the function. + @fortran argument in required double values Values of the function. + @fortran argument in required int grid_dims Dimensions of the FFT grid. + @fortran argument in optional bool transform_to_pw If true, transform function to PW domain. + @fortran end */ +void sirius_set_rg_values(void* const* handler__, + char const* label__, + double const* values__, + int const* grid_dims__, + bool const* transform_to_pw__) +{ + GET_GS(handler__); + + std::string label(label__); + + for (int x: {0, 1, 2}) { + if (grid_dims__[x] != gs.ctx().fft().size(x)) { + TERMINATE("wrong FFT grid size"); + } + } + + std::map*> func = { + {"rho", &gs.density().rho()}, + {"magz", &gs.density().magnetization(0)}, + {"magx", &gs.density().magnetization(1)}, + {"magy", &gs.density().magnetization(2)}, + {"veff", &gs.potential().effective_potential()}, + {"bz", &gs.potential().effective_magnetic_field(0)}, + {"bx", &gs.potential().effective_magnetic_field(1)}, + {"by", &gs.potential().effective_magnetic_field(2)}, + {"vxc", &gs.potential().xc_potential()}, + }; + + try { + auto& f = func.at(label); + auto offset = gs.ctx().fft().offset_z() * gs.ctx().fft().size(0) * gs.ctx().fft().size(1); + /* copy local part of the buffer */ + std::copy(values__ + offset, values__ + offset + gs.ctx().fft().local_size(), &f->f_rg(0)); + if (transform_to_pw__ && *transform_to_pw__) { + f->fft_transform(-1); + } + } catch(...) { + TERMINATE("wrong label"); + } +} + +/* @fortran begin function void sirius_get_total_magnetization Get the total magnetization of the system. + @fortran argument in required void* handler DFT ground state handler. + @fortran argument out required double mag 3D magnetization vector (x,y,z components). + @fortran end */ +void sirius_get_total_magnetization(void* const* handler__, + double* mag__) +{ + GET_GS(handler__); + + mdarray total_mag(mag__, 3); + total_mag.zero(); + std::vector mt_mag[3]; + double it_mag[3]; + for (int j = 0; j < gs.ctx().num_mag_dims(); j++) { + total_mag[j] = gs.density().magnetization(j).integrate(mt_mag[j], it_mag[j]); + } + if (gs.ctx().num_mag_dims() == 3) { + /* swap z and x and change order from z,x,y to x,z,y */ + std::swap(total_mag[0], total_mag[1]); + /* swap z and y and change order x,z,y to x,y,z */ + std::swap(total_mag[1], total_mag[2]); + } +} + } // extern "C" From 72e9d431816e19b6f390d738b1ff7e17756ee09e Mon Sep 17 00:00:00 2001 From: toxa81 Date: Mon, 15 Jul 2019 10:59:22 +0200 Subject: [PATCH 03/12] return std::tuple --- python_module/magnetization.hpp | 8 +++----- src/Density/density.hpp | 8 ++------ src/dft_ground_state.cpp | 14 ++++++++++---- src/periodic_function.hpp | 8 +++++--- src/sirius_api.cpp | 5 ++--- 5 files changed, 22 insertions(+), 21 deletions(-) diff --git a/python_module/magnetization.hpp b/python_module/magnetization.hpp index 07910721a..e0428fa95 100644 --- a/python_module/magnetization.hpp +++ b/python_module/magnetization.hpp @@ -12,11 +12,9 @@ std::tuple>, std::list> magnetization(Dens std::list lm; for (int j=0; j < ctx.num_mag_dims(); ++j) { - std::vector vec(3); - double m; - density.magnetization(j).integrate(vec, m); - lvec.push_back(vec); - lm.push_back(m); + auto result = density.magnetization(j).integrate(); + lvec.push_back(std::get<2>(result)); + lm.push_back(std::get<1>(result)); } return std::make_tuple(lvec, lm); diff --git a/src/Density/density.hpp b/src/Density/density.hpp index a8fd6110b..9024f279a 100644 --- a/src/Density/density.hpp +++ b/src/Density/density.hpp @@ -385,9 +385,7 @@ class Density : public Field4D inline void normalize() { - std::vector nel_mt; - double nel_it; - double nel = rho().integrate(nel_mt, nel_it); + double nel = std::get<0>(rho().integrate()); double scale = unit_cell_.num_electrons() / nel; /* renormalize interstitial part */ @@ -410,9 +408,7 @@ class Density : public Field4D { double nel{0}; if (ctx_.full_potential()) { - std::vector nel_mt; - double nel_it; - nel = rho().integrate(nel_mt, nel_it); + nel = std::get<0>(rho().integrate()); } else { nel = rho().f_0().real() * unit_cell_.omega(); } diff --git a/src/dft_ground_state.cpp b/src/dft_ground_state.cpp index 6e4415fda..1b75dc9b3 100644 --- a/src/dft_ground_state.cpp +++ b/src/dft_ground_state.cpp @@ -493,15 +493,21 @@ void DFT_ground_state::print_info() one_elec_en -= potential_.PAW_one_elec_energy(); } - std::vector mt_charge; - double it_charge; - double total_charge = density_.rho().integrate(mt_charge, it_charge); + auto result = density_.rho().integrate(); + + auto total_charge = std::get<0>(result); + auto it_charge = std::get<1>(result); + auto mt_charge = std::get<2>(result); double total_mag[3]; std::vector mt_mag[3]; double it_mag[3]; for (int j = 0; j < ctx_.num_mag_dims(); j++) { - total_mag[j] = density_.magnetization(j).integrate(mt_mag[j], it_mag[j]); + auto result = density_.magnetization(j).integrate(); + + total_mag[j] = std::get<0>(result); + it_mag[j] = std::get<1>(result); + mt_mag[j] = std::get<2>(result); } if (ctx_.comm().rank() == 0 && ctx_.control().verbosity_ >= 1) { diff --git a/src/periodic_function.hpp b/src/periodic_function.hpp index 6cc91144a..d3330b763 100644 --- a/src/periodic_function.hpp +++ b/src/periodic_function.hpp @@ -179,11 +179,12 @@ class Periodic_function : public Smooth_periodic_function } } - T integrate(std::vector& mt_val, T& it_val) const + inline std::tuple> + integrate() const { PROFILE("sirius::Periodic_function::integrate"); - it_val = 0; + T it_val = 0; if (!ctx_.full_potential()) { #pragma omp parallel for schedule(static) reduction(+:it_val) @@ -200,6 +201,7 @@ class Periodic_function : public Smooth_periodic_function this->fft_->comm().allreduce(&it_val, 1); T total = it_val; + std::vector mt_val; if (ctx_.full_potential()) { mt_val = std::vector(unit_cell_.num_atoms(), 0); @@ -214,7 +216,7 @@ class Periodic_function : public Smooth_periodic_function } } - return total; + return std::make_tuple(total, it_val, mt_val); } template diff --git a/src/sirius_api.cpp b/src/sirius_api.cpp index 77bd1f755..34e24000f 100644 --- a/src/sirius_api.cpp +++ b/src/sirius_api.cpp @@ -2979,10 +2979,9 @@ void sirius_get_total_magnetization(void* const* handler__, mdarray total_mag(mag__, 3); total_mag.zero(); - std::vector mt_mag[3]; - double it_mag[3]; for (int j = 0; j < gs.ctx().num_mag_dims(); j++) { - total_mag[j] = gs.density().magnetization(j).integrate(mt_mag[j], it_mag[j]); + auto result = gs.density().magnetization(j).integrate(); + total_mag[j] = std::get<0>(result); } if (gs.ctx().num_mag_dims() == 3) { /* swap z and x and change order from z,x,y to x,z,y */ From da303b28d67230971f67ef04bf42a2b0ea77efa8 Mon Sep 17 00:00:00 2001 From: toxa81 Date: Mon, 15 Jul 2019 11:11:33 +0200 Subject: [PATCH 04/12] add comment --- src/SDDK/linalg_base.hpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/SDDK/linalg_base.hpp b/src/SDDK/linalg_base.hpp index fdd88d6df..e85363338 100644 --- a/src/SDDK/linalg_base.hpp +++ b/src/SDDK/linalg_base.hpp @@ -58,14 +58,22 @@ struct linalg_const } }; +/// Type of linear algebra backend. enum class linalg_t { + /// None none, + /// CPU BLAS blas, + /// CPU LAPACK lapack, + /// CPU ScaLAPACK scalapack, + /// GPU BLAS (cuBlas or ROCblas) gpublas, + /// cuBlasXt (cuBlas with CPU pointers and large matrices support) cublasxt, + /// MAGMA with CPU pointers magma }; From 76fb8ca8d52dffe84beaf25d1bd0237e69eacae1 Mon Sep 17 00:00:00 2001 From: toxa81 Date: Mon, 15 Jul 2019 14:59:44 +0200 Subject: [PATCH 05/12] small fixes --- src/CMakeLists.txt | 15 ++++----------- src/Hamiltonian/non_local_operator.hpp | 7 ++++--- src/K_point/generate_atomic_wave_functions.hpp | 4 ++-- src/Unit_cell/atom_type.hpp | 7 +++++-- src/Unit_cell/radial_functions_index.hpp | 3 ++- src/sirius_api.cpp | 18 +++++++++--------- 6 files changed, 26 insertions(+), 28 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2add6d0ae..e73d9f48e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,10 +17,10 @@ endif() set(_SOURSES "dft_ground_state.cpp" "Hubbard/apply_hubbard_potential.cpp" "Hubbard/Hubbard.cpp" -"Hubbard/hubbard_generate_atomic_orbitals.cpp" -"Hubbard/hubbard_occupancies_derivatives.cpp" -"Hubbard/hubbard_occupancy.cpp" -"Hubbard/hubbard_potential_energy.cpp") + "Hubbard/hubbard_generate_atomic_orbitals.cpp" + "Hubbard/hubbard_occupancies_derivatives.cpp" + "Hubbard/hubbard_occupancy.cpp" + "Hubbard/hubbard_potential_energy.cpp") # create library with .cpp, .cu and .f90 sources add_library(sirius STATIC "${_SOURSES};${_CUSOURCES};${_FSOURCES}") @@ -41,13 +41,6 @@ target_link_libraries(sirius PUBLIC OpenMP::OpenMP_CXX) if(CREATE_FORTRAN_BINDINGS) set_target_properties(sirius PROPERTIES Fortran_MODULE_DIRECTORY mod_files) -endif() - -#if(USE_CUDA) -# add_library(sirius STATIC "${_CUSOURCES}") -#endif() - -if(CREATE_FORTRAN_BINDINGS) INSTALL ( CODE "EXECUTE_PROCESS (COMMAND \"${CMAKE_COMMAND}\" -E copy_directory \"${PROJECT_BINARY_DIR}/src/mod_files\" \"${CMAKE_INSTALL_PREFIX}/include/sirius/\")" ) diff --git a/src/Hamiltonian/non_local_operator.hpp b/src/Hamiltonian/non_local_operator.hpp index a9dd21ea3..e59086c9a 100644 --- a/src/Hamiltonian/non_local_operator.hpp +++ b/src/Hamiltonian/non_local_operator.hpp @@ -703,9 +703,10 @@ class Q_operator : public Non_local_operator //}; /// Apply non-local part of the Hamiltonian and S operators. -/** This operations must be combined becuase of the expensive inner product between wave-functions and beta - * projectors, which is computed only once. - * \param [in] spins Range of the spin index +/** This operations must be combined because of the expensive inner product between wave-functions and beta + * projectors, which is computed only once. + * + * \param [in] spins Range of the spin index. * \param [in] N Starting index of the wave-functions. * \param [in] n Number of wave-functions to which D and Q are applied. * \param [in] beta Beta-projectors. diff --git a/src/K_point/generate_atomic_wave_functions.hpp b/src/K_point/generate_atomic_wave_functions.hpp index ee74edf27..66a80838f 100644 --- a/src/K_point/generate_atomic_wave_functions.hpp +++ b/src/K_point/generate_atomic_wave_functions.hpp @@ -40,7 +40,7 @@ inline void K_point::generate_atomic_wave_functions(const basis_functions_index lmax = std::max(lmax, unit_cell_.lmax()); auto& atom_type = unit_cell_.atom(atom).type(); -#pragma omp parallel for schedule(static) + #pragma omp parallel for schedule(static) for (int igk_loc = 0; igk_loc < this->num_gkvec_loc(); igk_loc++) { /* global index of G+k vector */ int igk = this->idxgk(igk_loc); @@ -131,7 +131,7 @@ inline void K_point::compute_gradient_wave_functions(Wave_functions& phi, qalpha[igk_loc] = double_complex(0.0, -G[direction]); } -#pragma omp parallel for schedule(static) + #pragma omp parallel for schedule(static) for (int nphi = 0; nphi < num_wf; nphi++) { for (int ispn = 0; ispn < phi.num_sc(); ispn++) { for (int igk_loc = 0; igk_loc < this->num_gkvec_loc(); igk_loc++) { diff --git a/src/Unit_cell/atom_type.hpp b/src/Unit_cell/atom_type.hpp index fe9a1c4ec..3085305f0 100644 --- a/src/Unit_cell/atom_type.hpp +++ b/src/Unit_cell/atom_type.hpp @@ -82,6 +82,7 @@ class Atom_type std::vector aw_descriptors_; /// List of radial descriptor sets used to construct local orbitals. + /** This list defines all local orbitals for a given atom type */ std::vector lo_descriptors_; /// Maximum number of AW radial functions across angular momentums. @@ -90,9 +91,11 @@ class Atom_type int offset_lo_{-1}; // TODO: better name /// Index of radial basis functions. + /** This index is used in LAPW to combine APW and local-orbital radial functions */ radial_functions_index indexr_; /// Index of atomic basis functions (radial function * spherical harmonic). + /** This index is used in LAPW to combine APW and local-orbital muffin-tin functions */ basis_functions_index indexb_; /// Radial functions of beta-projectors. @@ -103,10 +106,10 @@ class Atom_type Beta-projectors must be loaded before loading the Q radial functions. */ mdarray, 2> q_radial_functions_l_; - /// index for the radial hubbard basis functions + /// Index for the radial hubbard basis functions. radial_functions_index indexr_wfc_; - /// Index of atomic wavefunctions (radial function * spherical harmonic). + /// Index of atomic wavefunctions (radial function * spherical harmonic). basis_functions_index indexb_wfc_; /// Atomic wave-functions used to setup the initial subspace and to apply U-correction. diff --git a/src/Unit_cell/radial_functions_index.hpp b/src/Unit_cell/radial_functions_index.hpp index c36e143f9..8cf9cd4b2 100644 --- a/src/Unit_cell/radial_functions_index.hpp +++ b/src/Unit_cell/radial_functions_index.hpp @@ -154,8 +154,9 @@ class radial_functions_index int order = radial_function_index_descriptors_[i].order; int idxlo = radial_function_index_descriptors_[i].idxlo; index_by_l_order_(l, order) = i; - if (idxlo >= 0) + if (idxlo >= 0) { index_by_idxlo_(idxlo) = i; + } } } diff --git a/src/sirius_api.cpp b/src/sirius_api.cpp index c9816abb9..91d62a56a 100644 --- a/src/sirius_api.cpp +++ b/src/sirius_api.cpp @@ -2944,15 +2944,15 @@ void sirius_set_rg_values(void* const* handler__, } std::map*> func = { - {"rho", &gs.density().rho()}, - {"magz", &gs.density().magnetization(0)}, - {"magx", &gs.density().magnetization(1)}, - {"magy", &gs.density().magnetization(2)}, - {"veff", &gs.potential().effective_potential()}, - {"bz", &gs.potential().effective_magnetic_field(0)}, - {"bx", &gs.potential().effective_magnetic_field(1)}, - {"by", &gs.potential().effective_magnetic_field(2)}, - {"vxc", &gs.potential().xc_potential()}, + {"rho", &gs.density().rho()}, + {"magz", &gs.density().magnetization(0)}, + {"magx", &gs.density().magnetization(1)}, + {"magy", &gs.density().magnetization(2)}, + {"veff", &gs.potential().effective_potential()}, + {"bz", &gs.potential().effective_magnetic_field(0)}, + {"bx", &gs.potential().effective_magnetic_field(1)}, + {"by", &gs.potential().effective_magnetic_field(2)}, + {"vxc", &gs.potential().xc_potential()}, }; try { From a200fb55fde612bf8aef0ae6c55a50c47709b7d4 Mon Sep 17 00:00:00 2001 From: toxa81 Date: Mon, 15 Jul 2019 17:02:48 +0200 Subject: [PATCH 06/12] a fix --- python_module/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python_module/CMakeLists.txt b/python_module/CMakeLists.txt index 527f25059..d1dd356fc 100644 --- a/python_module/CMakeLists.txt +++ b/python_module/CMakeLists.txt @@ -2,6 +2,8 @@ if(CREATE_PYTHON_MODULE) set(CMAKE_CXX_STANDARD 14) find_package(mpi4py REQUIRED) + set(PYBIND11_CPP14 1) + set(pb11_src_dir "${PROJECT_SOURCE_DIR}/python_module/pybind11") check_git_submodule(pybind11 "${pb11_src_dir}") if(NOT pybind11_avail) From 8c6601a710991c13278363f67c11643825137d47 Mon Sep 17 00:00:00 2001 From: toxa81 Date: Tue, 16 Jul 2019 09:17:45 +0200 Subject: [PATCH 07/12] work on Davidson solver profiling --- apps/tests/test_davidson.cpp | 76 ++++++++++++++++++++++-------------- src/SDDK/memory.hpp | 13 ++++++ src/utils/cmd_args.hpp | 18 ++++++++- 3 files changed, 76 insertions(+), 31 deletions(-) diff --git a/apps/tests/test_davidson.cpp b/apps/tests/test_davidson.cpp index 95a03e9b5..3021dcf14 100644 --- a/apps/tests/test_davidson.cpp +++ b/apps/tests/test_davidson.cpp @@ -38,7 +38,7 @@ void init_wf(K_point* kp__, Wave_functions& phi__, int num_bands__, int num_mag_ } } -void test_davidson() +void test_davidson(device_t pu__, double pw_cutoff__, double gk_cutoff__, int N__) { utils::timer t1("test_davidson|setup"); @@ -98,18 +98,17 @@ void test_davidson() } atype.ps_total_charge_density(arho); - int N{2}; /* lattice constant */ double a{5}; /* set lattice vectors */ - ctx.unit_cell().set_lattice_vectors({{a * N, 0, 0}, - {0, a * N, 0}, - {0, 0, a * N}}); + ctx.unit_cell().set_lattice_vectors({{a * N__, 0, 0}, + {0, a * N__, 0}, + {0, 0, a * N__}}); /* add atoms */ - double p = 1.0 / N; - for (int i = 0; i < N; i++) { - for (int j = 0; j < N; j++) { - for (int k = 0; k < N; k++) { + double p = 1.0 / N__; + for (int i = 0; i < N__; i++) { + for (int j = 0; j < N__; j++) { + for (int k = 0; k < N__; k++) { ctx.unit_cell().add_atom("Cu", {i * p, j * p, k * p}); } } @@ -117,12 +116,14 @@ void test_davidson() /* initialize the context */ ctx.set_verbosity(1); - ctx.pw_cutoff(40); - ctx.gk_cutoff(10); + ctx.pw_cutoff(pw_cutoff__); + ctx.gk_cutoff(gk_cutoff__); + ctx.set_processing_unit(pu__); t1.stop(); - ctx.set_verbosity(5); + + ctx.set_verbosity(1); ctx.iterative_solver_tolerance(1e-12); - ctx.set_iterative_solver_type("exact"); + //ctx.set_iterative_solver_type("exact"); ctx.initialize(); Density rho(ctx); @@ -147,10 +148,19 @@ void test_davidson() Hamiltonian H(ctx, pot); H.prepare(); - //Band(ctx).solve_pseudo_potential(kp, H); + Band(ctx).solve_pseudo_potential(kp, H); //for (int i = 0; i < ctx.num_bands(); i++) { // std::cout << "energy[" << i << "]=" << kp.band_energy(i, 0) << "\n"; //} + std::vector ekin(kp.num_gkvec()); + for (int i = 0; i < kp.num_gkvec(); i++) { + ekin[i] = 0.5 * kp.gkvec().gkvec_cart(i).length2(); + } + std::sort(ekin.begin(), ekin.end()); + + for (int i = 0; i < ctx.num_bands(); i++) { + printf("%20.16f %20.16f %20.16e\n", ekin[i], kp.band_energy(i, 0), std::abs(ekin[i] - kp.band_energy(i, 0))); + } //ctx.set_iterative_solver_type("davidson"); //Band(ctx).solve_pseudo_potential(kp, H); @@ -161,22 +171,22 @@ void test_davidson() // K_point kp(ctx, vk, 1.0); // kp.initialize(); - Hamiltonian0 h0(ctx, pot); - auto hk = h0(kp); + //Hamiltonian0 h0(ctx, pot); + //auto hk = h0(kp); - // init_wf(&kp, kp.spinor_wave_functions(), ctx.num_bands(), 0); + //// init_wf(&kp, kp.spinor_wave_functions(), ctx.num_bands(), 0); - auto eval = davidson(hk, kp.spinor_wave_functions(), 0, 4, 40, 1e-12, 0, 1e-7, [](int i, int ispn){return 1.0;}, false); + //auto eval = davidson(hk, kp.spinor_wave_functions(), 0, 4, 40, 1e-12, 0, 1e-7, [](int i, int ispn){return 1.0;}, false); - std::vector ekin(kp.num_gkvec()); - for (int i = 0; i < kp.num_gkvec(); i++) { - ekin[i] = 0.5 * kp.gkvec().gkvec_cart(i).length2(); - } - std::sort(ekin.begin(), ekin.end()); + //std::vector ekin(kp.num_gkvec()); + //for (int i = 0; i < kp.num_gkvec(); i++) { + // ekin[i] = 0.5 * kp.gkvec().gkvec_cart(i).length2(); + //} + //std::sort(ekin.begin(), ekin.end()); - for (int i = 0; i < ctx.num_bands(); i++) { - printf("%20.16f %20.16f %20.16e\n", ekin[i], eval[i], std::abs(ekin[i] - eval[i])); - } + //for (int i = 0; i < ctx.num_bands(); i++) { + // printf("%20.16f %20.16f %20.16e\n", ekin[i], eval[i], std::abs(ekin[i] - eval[i])); + //} //} @@ -216,17 +226,25 @@ void test_davidson() int main(int argn, char** argv) { - cmd_args args; + cmd_args args(argn, argv, {{"device=", "(string) CPU or GPU"}, + {"pw_cutoff=", "(double) plane-wave cutoff for density and potential"}, + {"gk_cutoff=", "(double) plane-wave cutoff for wave-functions"}, + {"N=", "(int) cell multiplicity"} + }); - args.parse_args(argn, argv); if (args.exist("help")) { printf("Usage: %s [options]\n", argv[0]); args.print_help(); return 0; } + auto pu = get_device_t(args.value("device", "CPU")); + auto pw_cutoff = args.value("pw_cutoff", 30); + auto gk_cutoff = args.value("gk_cutoff", 10); + auto N = args.value("N", 1); + sirius::initialize(1); - test_davidson(); + test_davidson(pu, pw_cutoff, gk_cutoff, N); int rank = Communicator::world().rank(); sirius::finalize(); if (!rank) { diff --git a/src/SDDK/memory.hpp b/src/SDDK/memory.hpp index 3a2099788..32f5d6429 100644 --- a/src/SDDK/memory.hpp +++ b/src/SDDK/memory.hpp @@ -128,6 +128,19 @@ inline device_t get_device_t(memory_t mem__) return device_t::CPU; // make compiler happy } +inline device_t get_device_t(std::string name__) +{ + std::transform(name__.begin(), name__.end(), name__.begin(), ::tolower); + if (name__ == "cpu") { + return device_t::CPU; + } else if (name__ == "gpu") { + return device_t::GPU; + } else { + throw std::runtime_error("wrong processing unit"); + } + return device_t::CPU; // make compiler happy +} + /// Allocate n elements in a specified memory. /** Allocate a memory block of the memory_t type. Return a nullptr if this memory is not available, otherwise * return a pointer to an allocated block. */ diff --git a/src/utils/cmd_args.hpp b/src/utils/cmd_args.hpp index 37e451312..041c679ae 100644 --- a/src/utils/cmd_args.hpp +++ b/src/utils/cmd_args.hpp @@ -30,6 +30,7 @@ #include #include #include +#include /// Simple command line arguments handler. class cmd_args @@ -75,6 +76,15 @@ class cmd_args register_key("--help", "print this help and exit"); } + cmd_args(int argn__, char** argv__, std::initializer_list> keys__) + { + register_key("--help", "print this help and exit"); + for (auto key: keys__) { + register_key("--" + key.first, key.second); + } + parse_args(argn__, argv__); + } + void register_key(std::string const key__, std::string const description__) { key_desc_.push_back(std::pair(key__, description__)); @@ -88,7 +98,9 @@ class cmd_args } if (known_keys_.count(key) != 0) { - throw std::runtime_error("key is already added"); + std::stringstream s; + s << "key (" << key << ") is already registered"; + throw std::runtime_error(s.str()); } known_keys_[key] = key_type; @@ -129,7 +141,9 @@ class cmd_args } if (keys_.count(key) != 0) { - throw std::runtime_error("key is already added"); + std::stringstream s; + s << "key (" << key << ") is already added"; + throw std::runtime_error(s.str()); } keys_[key] = val; From b4cd6334ac2b9f3d3caec1008a02b21cdd56ac6a Mon Sep 17 00:00:00 2001 From: toxa81 Date: Tue, 16 Jul 2019 09:46:20 +0200 Subject: [PATCH 08/12] last fixes for device_t enum --- src/eigenproblem.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/eigenproblem.hpp b/src/eigenproblem.hpp index afd3758b3..4d878c88b 100644 --- a/src/eigenproblem.hpp +++ b/src/eigenproblem.hpp @@ -497,10 +497,10 @@ class Eigensolver_elpa : public Eigensolver } /* transform to standard eigen-problem */ /* A * U{-1} -> Z */ - linalg::gemm(0, 0, matrix_size__, matrix_size__, matrix_size__, linalg_const::one(), A__, B__, + linalg::gemm(0, 0, matrix_size__, matrix_size__, matrix_size__, linalg_const::one(), A__, B__, linalg_const::zero(), Z__); /* U^{-H} * Z = U{-H} * A * U^{-1} -> A */ - linalg::gemm(2, 0, matrix_size__, matrix_size__, matrix_size__, linalg_const::one(), B__, Z__, + linalg::gemm(2, 0, matrix_size__, matrix_size__, matrix_size__, linalg_const::one(), B__, Z__, linalg_const::zero(), A__); t1.stop(); @@ -512,7 +512,7 @@ class Eigensolver_elpa : public Eigensolver utils::timer t3("Eigensolver_elpa|bt"); /* back-transform of eigen-vectors */ - linalg::gemm(0, 0, matrix_size__, nev__, matrix_size__, linalg_const::one(), B__, Z__, + linalg::gemm(0, 0, matrix_size__, nev__, matrix_size__, linalg_const::one(), B__, Z__, linalg_const::zero(), A__); A__ >> Z__; t3.stop(); @@ -558,10 +558,10 @@ class Eigensolver_elpa : public Eigensolver } /* transform to standard eigen-problem */ /* A * U{-1} -> Z */ - linalg::gemm(0, 0, matrix_size__, matrix_size__, matrix_size__, linalg_const::one(), A__, + linalg::gemm(0, 0, matrix_size__, matrix_size__, matrix_size__, linalg_const::one(), A__, B__, linalg_const::zero(), Z__); /* U^{-H} * Z = U{-H} * A * U^{-1} -> A */ - linalg::gemm(2, 0, matrix_size__, matrix_size__, matrix_size__, linalg_const::one(), B__, + linalg::gemm(2, 0, matrix_size__, matrix_size__, matrix_size__, linalg_const::one(), B__, Z__, linalg_const::zero(), A__); t1.stop(); @@ -573,7 +573,7 @@ class Eigensolver_elpa : public Eigensolver utils::timer t3("Eigensolver_elpa|bt"); /* back-transform of eigen-vectors */ - linalg::gemm(0, 0, matrix_size__, nev__, matrix_size__, linalg_const::one(), B__, Z__, + linalg::gemm(0, 0, matrix_size__, nev__, matrix_size__, linalg_const::one(), B__, Z__, linalg_const::zero(), A__); A__ >> Z__; t3.stop(); From bae73a0cbe6df8acdd06f297c1e243814c54ed11 Mon Sep 17 00:00:00 2001 From: toxa81 Date: Tue, 16 Jul 2019 10:12:39 +0200 Subject: [PATCH 09/12] fix for PYBIND11_CPP14 --- python_module/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python_module/CMakeLists.txt b/python_module/CMakeLists.txt index d1dd356fc..ebc527717 100644 --- a/python_module/CMakeLists.txt +++ b/python_module/CMakeLists.txt @@ -2,7 +2,7 @@ if(CREATE_PYTHON_MODULE) set(CMAKE_CXX_STANDARD 14) find_package(mpi4py REQUIRED) - set(PYBIND11_CPP14 1) + add_definitions("-DPYBIND11_CPP14") set(pb11_src_dir "${PROJECT_SOURCE_DIR}/python_module/pybind11") check_git_submodule(pybind11 "${pb11_src_dir}") From 5cdebf23f90a00083634ecdd272d45fe657441e7 Mon Sep 17 00:00:00 2001 From: toxa81 Date: Tue, 16 Jul 2019 15:21:32 +0200 Subject: [PATCH 10/12] profiling the davidson solver --- apps/tests/test_davidson.cpp | 44 ++++++++++++++++-------------- src/Band/diag_pseudo_potential.hpp | 10 ++++--- src/SDDK/GPU/cublas.hpp | 20 ++++---------- src/SDDK/linalg.hpp | 35 +++++++++++++----------- src/SDDK/wf_ortho.hpp | 14 +++++++--- src/SDDK/wf_trans.hpp | 10 +++++-- src/sirius.h | 10 +++---- 7 files changed, 75 insertions(+), 68 deletions(-) diff --git a/apps/tests/test_davidson.cpp b/apps/tests/test_davidson.cpp index 3021dcf14..97aa68f37 100644 --- a/apps/tests/test_davidson.cpp +++ b/apps/tests/test_davidson.cpp @@ -135,31 +135,33 @@ void test_davidson(device_t pu__, double pw_cutoff__, double gk_cutoff__, int N_ pot.zero(); - double vk[] = {0.1, 0.1, 0.1}; - K_point kp(ctx, vk, 1.0); - kp.initialize(); - std::cout << "num_gkvec=" << kp.num_gkvec() << "\n"; - for (int i = 0; i < ctx.num_bands(); i++) { - kp.band_occupancy(i, 0, 2); - } - init_wf(&kp, kp.spinor_wave_functions(), ctx.num_bands(), 0); + for (int r = 0; r < 2; r++) { + double vk[] = {0.1, 0.1, 0.1}; + K_point kp(ctx, vk, 1.0); + kp.initialize(); + std::cout << "num_gkvec=" << kp.num_gkvec() << "\n"; + for (int i = 0; i < ctx.num_bands(); i++) { + kp.band_occupancy(i, 0, 2); + } + init_wf(&kp, kp.spinor_wave_functions(), ctx.num_bands(), 0); - Hamiltonian H(ctx, pot); - H.prepare(); - Band(ctx).solve_pseudo_potential(kp, H); - //for (int i = 0; i < ctx.num_bands(); i++) { - // std::cout << "energy[" << i << "]=" << kp.band_energy(i, 0) << "\n"; - //} - std::vector ekin(kp.num_gkvec()); - for (int i = 0; i < kp.num_gkvec(); i++) { - ekin[i] = 0.5 * kp.gkvec().gkvec_cart(i).length2(); - } - std::sort(ekin.begin(), ekin.end()); + Hamiltonian H(ctx, pot); + H.prepare(); + Band(ctx).solve_pseudo_potential(kp, H); + //for (int i = 0; i < ctx.num_bands(); i++) { + // std::cout << "energy[" << i << "]=" << kp.band_energy(i, 0) << "\n"; + //} + std::vector ekin(kp.num_gkvec()); + for (int i = 0; i < kp.num_gkvec(); i++) { + ekin[i] = 0.5 * kp.gkvec().gkvec_cart(i).length2(); + } + std::sort(ekin.begin(), ekin.end()); - for (int i = 0; i < ctx.num_bands(); i++) { - printf("%20.16f %20.16f %20.16e\n", ekin[i], kp.band_energy(i, 0), std::abs(ekin[i] - kp.band_energy(i, 0))); + for (int i = 0; i < ctx.num_bands(); i++) { + printf("%20.16f %20.16f %20.16e\n", ekin[i], kp.band_energy(i, 0), std::abs(ekin[i] - kp.band_energy(i, 0))); + } } //ctx.set_iterative_solver_type("davidson"); diff --git a/src/Band/diag_pseudo_potential.hpp b/src/Band/diag_pseudo_potential.hpp index cc68eaf2b..ee07d4352 100644 --- a/src/Band/diag_pseudo_potential.hpp +++ b/src/Band/diag_pseudo_potential.hpp @@ -456,14 +456,15 @@ inline int Band::diag_pseudo_potential_davidson(K_point* kp__, /* current subspace size */ int N = num_bands; - utils::timer t1("sirius::Band::diag_pseudo_potential_davidson|evp"); + auto p1 = std::unique_ptr(new utils::profiler(__function_name__, __FILE__, __LINE__, + "sirius::Band::diag_pseudo_potential_davidson|evp")); /* solve generalized eigen-value problem with the size N and get lowest num_bands eigen-vectors */ if (std_solver.solve(N, num_bands, hmlt, eval.data(), evec)) { std::stringstream s; s << "error in diagonalziation"; TERMINATE(s); } - t1.stop(); + p1 = nullptr; evp_work_count() += 1; @@ -585,7 +586,8 @@ inline int Band::diag_pseudo_potential_davidson(K_point* kp__, eval_old = eval; - utils::timer t1("sirius::Band::diag_pseudo_potential_davidson|evp"); + auto p1 = std::unique_ptr(new utils::profiler(__function_name__, __FILE__, __LINE__, + "sirius::Band::diag_pseudo_potential_davidson|evp")); if (itso.orthogonalize_) { /* solve standard eigen-value problem with the size N */ if (std_solver.solve(N, num_bands, hmlt, eval.data(), evec)) { @@ -601,7 +603,7 @@ inline int Band::diag_pseudo_potential_davidson(K_point* kp__, TERMINATE(s); } } - t1.stop(); + p1 = nullptr; evp_work_count() += std::pow(static_cast(N) / num_bands, 3); diff --git a/src/SDDK/GPU/cublas.hpp b/src/SDDK/GPU/cublas.hpp index afa9b385c..70b750439 100644 --- a/src/SDDK/GPU/cublas.hpp +++ b/src/SDDK/GPU/cublas.hpp @@ -231,34 +231,26 @@ inline void dgemm(char transa, char transb, int32_t m, int32_t n, int32_t k, } inline void dtrmm(char side__, char uplo__, char transa__, char diag__, int m__, int n__, - double const* alpha__, double const* A__, int lda__, double* B__, int ldb__) + double const* alpha__, double const* A__, int lda__, double* B__, int ldb__, int stream_id) { cublasSideMode_t side = get_cublasSideMode_t(side__); cublasFillMode_t uplo = get_cublasFillMode_t(uplo__); cublasOperation_t transa = get_cublasOperation_t(transa__); cublasDiagType_t diag = get_cublasDiagType_t(diag__); //acc::set_device(); - CALL_CUBLAS(cublasDtrmm, (null_stream_handle(), side, uplo, transa, diag, m__, n__, alpha__, A__, lda__, B__, ldb__, B__, ldb__)); + CALL_CUBLAS(cublasDtrmm, (stream_handle(stream_id), side, uplo, transa, diag, m__, n__, alpha__, A__, lda__, B__, ldb__, B__, ldb__)); } -inline void ztrmm(char side__, - char uplo__, - char transa__, - char diag__, - int m__, - int n__, - cuDoubleComplex const* alpha__, - cuDoubleComplex const* A__, - int lda__, - cuDoubleComplex* B__, - int ldb__) +inline void ztrmm(char side__, char uplo__, char transa__, char diag__, int m__, int n__, + cuDoubleComplex const* alpha__, cuDoubleComplex const* A__, int lda__, cuDoubleComplex* B__, + int ldb__, int stream_id) { cublasSideMode_t side = get_cublasSideMode_t(side__); cublasFillMode_t uplo = get_cublasFillMode_t(uplo__); cublasOperation_t transa = get_cublasOperation_t(transa__); cublasDiagType_t diag = get_cublasDiagType_t(diag__); //acc::set_device(); - CALL_CUBLAS(cublasZtrmm, (null_stream_handle(), side, uplo, transa, diag, m__, n__, alpha__, A__, lda__, B__, ldb__, B__, ldb__)); + CALL_CUBLAS(cublasZtrmm, (stream_handle(stream_id), side, uplo, transa, diag, m__, n__, alpha__, A__, lda__, B__, ldb__, B__, ldb__)); } inline void dger(int m, diff --git a/src/SDDK/linalg.hpp b/src/SDDK/linalg.hpp index 64924c3ba..d0fc078b0 100644 --- a/src/SDDK/linalg.hpp +++ b/src/SDDK/linalg.hpp @@ -66,15 +66,16 @@ class linalg2 stream_id sid = stream_id(-1)) const; template - inline void trmm(char side, char uplo, char transa, ftn_int m, ftn_int n, T const* aplha, T const* A, ftn_int lda, T* B, ftn_int ldb); + inline void trmm(char side, char uplo, char transa, ftn_int m, ftn_int n, T const* aplha, T const* A, ftn_int lda, + T* B, ftn_int ldb, stream_id sid = stream_id(-1)) const; /// Cholesky factorization template - inline int potrf(ftn_int n, T* A, ftn_int lda, ftn_int const* desca = nullptr); + inline int potrf(ftn_int n, T* A, ftn_int lda, ftn_int const* desca = nullptr) const; /// Inversion of a triangular matrix. template - inline int trtri(ftn_int n, T* A, ftn_int lda, ftn_int const* desca = nullptr); + inline int trtri(ftn_int n, T* A, ftn_int lda, ftn_int const* desca = nullptr) const; }; template <> @@ -200,7 +201,7 @@ inline void linalg2::ger(ftn_int m, ftn_int n, ftn_double const* alp template <> inline void linalg2::trmm(char side, char uplo, char transa, ftn_int m, ftn_int n, ftn_double const* alpha, - ftn_double const* A, ftn_int lda, ftn_double* B, ftn_int ldb) + ftn_double const* A, ftn_int lda, ftn_double* B, ftn_int ldb, stream_id sid) const { switch (la_) { case linalg_t::blas: { @@ -209,8 +210,8 @@ inline void linalg2::trmm(char side, char uplo, char transa, ftn_int break; } case linalg_t::gpublas: { -#ifdef __GPU - gpublas::dtrmm(side, uplo, transa, 'N', m, n, alpha, A, lda, B, ldb); +#if defined(__GPU) + gpublas::dtrmm(side, uplo, transa, 'N', m, n, alpha, A, lda, B, ldb, sid()); #else throw std::runtime_error("not compiled with GPU blas support!"); #endif @@ -234,18 +235,20 @@ inline void linalg2::trmm(char side, char uplo, char transa, ftn_int template <> inline void linalg2::trmm(char side, char uplo, char transa, ftn_int m, ftn_int n, ftn_double_complex const* alpha, ftn_double_complex const* A, - ftn_int lda, ftn_double_complex* B, ftn_int ldb) + ftn_int lda, ftn_double_complex* B, ftn_int ldb, stream_id sid) const { switch (la_) { case linalg_t::blas: { FORTRAN(ztrmm)(&side, &uplo, &transa, "N", &m, &n, const_cast(alpha), - const_cast(A), &lda, B, &ldb, (ftn_len)1, (ftn_len)1, (ftn_len)1, (ftn_len)1); + const_cast(A), &lda, B, &ldb, (ftn_len)1, (ftn_len)1, + (ftn_len)1, (ftn_len)1); break; } case linalg_t::gpublas: { -#ifdef __GPU +#if defined(__GPU) gpublas::ztrmm(side, uplo, transa, 'N', m, n, reinterpret_cast(alpha), - reinterpret_cast(A), lda, reinterpret_cast(B), ldb); + reinterpret_cast(A), lda, + reinterpret_cast(B), ldb, sid()); #else throw std::runtime_error("not compiled with GPU blas support!"); #endif @@ -268,7 +271,7 @@ inline void linalg2::trmm(char side, char uplo, char transa, } template<> -inline int linalg2::potrf(ftn_int n, ftn_double* A, ftn_int lda, ftn_int const* desca) +inline int linalg2::potrf(ftn_int n, ftn_double* A, ftn_int lda, ftn_int const* desca) const { switch (la_) { case linalg_t::lapack: { @@ -307,7 +310,7 @@ inline int linalg2::potrf(ftn_int n, ftn_double* A, ftn_int lda, ftn } template<> -inline int linalg2::potrf(ftn_int n, ftn_double_complex* A, ftn_int lda, ftn_int const* desca) +inline int linalg2::potrf(ftn_int n, ftn_double_complex* A, ftn_int lda, ftn_int const* desca) const { switch (la_) { case linalg_t::lapack: { @@ -346,7 +349,7 @@ inline int linalg2::potrf(ftn_int n, ftn_double_complex* A, } template<> -inline int linalg2::trtri(ftn_int n, ftn_double* A, ftn_int lda, ftn_int const* desca) +inline int linalg2::trtri(ftn_int n, ftn_double* A, ftn_int lda, ftn_int const* desca) const { switch (la_) { case linalg_t::lapack: { @@ -385,7 +388,7 @@ inline int linalg2::trtri(ftn_int n, ftn_double* A, ftn_int lda, ftn } template<> -inline int linalg2::trtri(ftn_int n, ftn_double_complex* A, ftn_int lda, ftn_int const* desca) +inline int linalg2::trtri(ftn_int n, ftn_double_complex* A, ftn_int lda, ftn_int const* desca) const { switch (la_) { case linalg_t::lapack: { @@ -1273,7 +1276,7 @@ inline void linalg::trmm(char side, ftn_int ldb) { assert(_local::is_set_device_id()); - gpublas::dtrmm(side, uplo, transa, 'N', m, n, alpha, A, lda, B, ldb); + gpublas::dtrmm(side, uplo, transa, 'N', m, n, alpha, A, lda, B, ldb, -1); } template <> @@ -1290,7 +1293,7 @@ inline void linalg::trmm(char side, { assert(_local::is_set_device_id()); gpublas::ztrmm(side, uplo, transa, 'N', m, n, (acc_complex_double_t*)alpha, (acc_complex_double_t*)A, lda, - (acc_complex_double_t*)B, ldb); + (acc_complex_double_t*)B, ldb, -1); } template <> diff --git a/src/SDDK/wf_ortho.hpp b/src/SDDK/wf_ortho.hpp index 8838f25fa..ac2dfe7d3 100644 --- a/src/SDDK/wf_ortho.hpp +++ b/src/SDDK/wf_ortho.hpp @@ -193,6 +193,7 @@ inline void orthogonalize(memory_t mem__, utils::timer t2("sddk::orthogonalize|transform"); + int sid{0}; for (int s: spins) { /* multiplication by triangular matrix */ for (auto& e: wfs__) { @@ -201,13 +202,15 @@ inline void orthogonalize(memory_t mem__, linalg2(la__).trmm('R', 'U', 'N', e->pw_coeffs(s).num_rows_loc(), n__, &linalg_const::one(), reinterpret_cast(o__.at(mem__)), o__.ld(), - e->pw_coeffs(s).prime().at(e->preferred_memory_t(), 0, N__), e->pw_coeffs(s).prime().ld()); + e->pw_coeffs(s).prime().at(e->preferred_memory_t(), 0, N__), + e->pw_coeffs(s).prime().ld(), stream_id(sid++)); if (e->has_mt()) { linalg2(la__).trmm('R', 'U', 'N', e->mt_coeffs(s).num_rows_loc(), n__, &linalg_const::one(), reinterpret_cast(o__.at(mem__)), o__.ld(), - e->mt_coeffs(s).prime().at(e->preferred_memory_t(), 0, N__), e->mt_coeffs(s).prime().ld()); + e->mt_coeffs(s).prime().at(e->preferred_memory_t(), 0, N__), + e->mt_coeffs(s).prime().ld(), stream_id(sid++)); } } /* wave functions are real (psi(G) = psi^{*}(-G)), transformation matrix is real */ @@ -216,18 +219,21 @@ inline void orthogonalize(memory_t mem__, &linalg_const::one(), reinterpret_cast(o__.at(mem__)), o__.ld(), reinterpret_cast(e->pw_coeffs(s).prime().at(e->preferred_memory_t(), 0, N__)), - 2 * e->pw_coeffs(s).prime().ld()); + 2 * e->pw_coeffs(s).prime().ld(), stream_id(sid++)); if (e->has_mt()) { linalg2(la__).trmm('R', 'U', 'N', 2 * e->mt_coeffs(s).num_rows_loc(), n__, &linalg_const::one(), reinterpret_cast(o__.at(mem__)), o__.ld(), reinterpret_cast(e->mt_coeffs(s).prime().at(e->preferred_memory_t(), 0, N__)), - 2 * e->mt_coeffs(s).prime().ld()); + 2 * e->mt_coeffs(s).prime().ld(), stream_id(sid++)); } } } } + for (int i = 0; i < sid; i++) { + acc::sync_stream(stream_id(i)); + } t2.stop(); } else { /* parallel transformation */ utils::timer t1("sddk::orthogonalize|potrf"); diff --git a/src/SDDK/wf_trans.hpp b/src/SDDK/wf_trans.hpp index 34d2e00e7..60f0cbf41 100644 --- a/src/SDDK/wf_trans.hpp +++ b/src/SDDK/wf_trans.hpp @@ -161,17 +161,21 @@ inline void transform(memory_t mem__, /* trivial case */ if (comm.size() == 1) { if (is_device_memory(mem__)) { + //acc::copyin(mtrx__.at(memory_t::device, irow0__, jcol0__), mtrx__.ld(), + // mtrx__.at(memory_t::host, irow0__, jcol0__), mtrx__.ld(), m__, n__, stream_id(0)); acc::copyin(mtrx__.at(memory_t::device, irow0__, jcol0__), mtrx__.ld(), - mtrx__.at(memory_t::host, irow0__, jcol0__), mtrx__.ld(), m__, n__, stream_id(0)); + mtrx__.at(memory_t::host, irow0__, jcol0__), mtrx__.ld(), m__, n__); } for (int iv = 0; iv < nwf; iv++) { transform_local(la__, ispn__, &alpha, wf_in__[iv], i0__, m__, mtrx__.at(mem__, irow0__, jcol0__), - mtrx__.ld(), wf_out__[iv], j0__, n__, stream_id(0)); + mtrx__.ld(), wf_out__[iv], j0__, n__, stream_id(iv)); } if (is_device_memory(mem__)) { /* wait for the stream to finish zgemm */ - acc::sync_stream(stream_id(0)); + for (int iv = 0; iv < nwf; iv++) { + acc::sync_stream(stream_id(iv)); + } } if (sddk_pp) { time += omp_get_wtime(); diff --git a/src/sirius.h b/src/sirius.h index 3a63f09b6..09a6efb68 100644 --- a/src/sirius.h +++ b/src/sirius.h @@ -87,12 +87,10 @@ inline void initialize(bool call_mpi_init__ = true) if (acc::num_devices() > 0) { int devid = sddk::get_device_id(acc::num_devices()); acc::set_device_id(devid); - // #pragma omp parallel - // { - // #pragma omp critical - // acc::set_device_id(devid); - // } - acc::create_streams(omp_get_max_threads() + 1); + /* create extensive amount of streams */ + /* some parts of the code rely on the number of streams not related to the + number of OMP threads */ + acc::create_streams(omp_get_max_threads() + 100); #if defined(__GPU) gpublas::create_stream_handles(); #endif From 16397c3605dbeeda4f48b3f2b1c0b6e76e227b1c Mon Sep 17 00:00:00 2001 From: toxa81 Date: Tue, 16 Jul 2019 15:37:39 +0200 Subject: [PATCH 11/12] minor changes --- apps/tests/test_davidson.cpp | 8 +++++--- src/simulation_context.hpp | 12 ++++++++++-- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/apps/tests/test_davidson.cpp b/apps/tests/test_davidson.cpp index 3021dcf14..083cf6de8 100644 --- a/apps/tests/test_davidson.cpp +++ b/apps/tests/test_davidson.cpp @@ -229,7 +229,8 @@ int main(int argn, char** argv) cmd_args args(argn, argv, {{"device=", "(string) CPU or GPU"}, {"pw_cutoff=", "(double) plane-wave cutoff for density and potential"}, {"gk_cutoff=", "(double) plane-wave cutoff for wave-functions"}, - {"N=", "(int) cell multiplicity"} + {"N=", "(int) cell multiplicity"}, + {"mpi_grid=", "(int[2]) dimensions of the MPI grid for band diagonalization"} }); if (args.exist("help")) { @@ -238,10 +239,11 @@ int main(int argn, char** argv) return 0; } - auto pu = get_device_t(args.value("device", "CPU")); + auto pu = get_device_t(args.value("device", "CPU")); auto pw_cutoff = args.value("pw_cutoff", 30); auto gk_cutoff = args.value("gk_cutoff", 10); - auto N = args.value("N", 1); + auto N = args.value("N", 1); + auto mpi_grid = args.value>("mpi_grid", {1, 1}); sirius::initialize(1); test_davidson(pu, pw_cutoff, gk_cutoff, N); diff --git a/src/simulation_context.hpp b/src/simulation_context.hpp index be1943f65..10b06f2eb 100644 --- a/src/simulation_context.hpp +++ b/src/simulation_context.hpp @@ -1711,11 +1711,13 @@ inline void Simulation_context::print_info() const printf("LAPACK\n"); break; } -#ifdef __SCALAPACK +#if defined(__SCALAPACK) case ev_solver_t::scalapack: { printf("ScaLAPACK\n"); break; } +#endif +#if defined(__ELPA) case ev_solver_t::elpa1: { printf("ELPA1\n"); break; @@ -1725,6 +1727,7 @@ inline void Simulation_context::print_info() const break; } #endif +#if defined(__MAGMA) case ev_solver_t::magma: { printf("MAGMA\n"); break; @@ -1733,16 +1736,21 @@ inline void Simulation_context::print_info() const printf("MAGMA with GPU pointers\n"); break; } +#endif case ev_solver_t::plasma: { printf("PLASMA\n"); break; } +#if defined(__CUDA) case ev_solver_t::cusolver: { printf("cuSOLVER\n"); break; } +#endif default: { - TERMINATE("wrong eigen-value solver"); + std::stringstream s; + s << "wrong eigen-value solver: " << evsn[i]; + throw std::runtime_error(s.str()); } } } From 80d469931d3f3e52fe640f97c14c429df74fd03b Mon Sep 17 00:00:00 2001 From: toxa81 Date: Tue, 16 Jul 2019 16:10:00 +0200 Subject: [PATCH 12/12] update command line parameter --- verification/run_tests_parallel_gpu.x | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/verification/run_tests_parallel_gpu.x b/verification/run_tests_parallel_gpu.x index a2455b16a..803f15877 100755 --- a/verification/run_tests_parallel_gpu.x +++ b/verification/run_tests_parallel_gpu.x @@ -24,7 +24,7 @@ for f in ./*; do --test_against=output_ref.json \ --std_evp_solver_name=scalapack \ --gen_evp_solver_name=scalapack \ - --processing_unit=gpu --mpi_grid="2 2" + --control.processing_unit=gpu --mpi_grid="2 2" err=$? if [ ${err} == 0 ]; then