From ed966dbfb4c450f017a118a10a3ddaa2950b39de Mon Sep 17 00:00:00 2001 From: Mahias Wagner Date: Mon, 26 Sep 2022 12:39:35 +0200 Subject: [PATCH 001/148] initial structure (completely untested and still a lot of things WIP) --- CMakeLists.txt | 1 + include/enum_quda.h | 1 + include/enum_quda_fortran.h | 1 + include/gauge_field_order.h | 80 +++++ include/quda_openqcd_interface.h | 165 +++++++++ lib/CMakeLists.txt | 7 +- lib/copy_gauge_extended.cu | 19 +- lib/copy_gauge_inc.cu | 19 +- lib/cpu_gauge_field.cpp | 13 +- lib/openqcd_interface.cpp | 552 +++++++++++++++++++++++++++++++ 10 files changed, 848 insertions(+), 10 deletions(-) create mode 100644 include/quda_openqcd_interface.h create mode 100644 lib/openqcd_interface.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 60e18cedc1..ad6f8d1cc3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,6 +171,7 @@ option(QUDA_OPENBLAS "enable OpenBLAS" OFF) # Interface options option(QUDA_INTERFACE_QDP "build qdp interface" ON) option(QUDA_INTERFACE_MILC "build milc interface" ON) +option(QUDA_INTERFACE_OPENQCD "build OpenQCD interface" OFF) option(QUDA_INTERFACE_CPS "build cps interface" OFF) option(QUDA_INTERFACE_QDPJIT "build qdpjit interface" OFF) option(QUDA_INTERFACE_BQCD "build bqcd interface" OFF) diff --git a/include/enum_quda.h b/include/enum_quda.h index cffea9c7c3..369bc7d176 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -48,6 +48,7 @@ typedef enum QudaGaugeFieldOrder_s { QUDA_BQCD_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime+halos, column-row order QUDA_TIFR_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime, column-row order QUDA_TIFR_PADDED_GAUGE_ORDER, // expect *gauge, mu, parity, t, z+halo, y, x/2, column-row order + QUDA_OPENQCD_GAUGE_ORDER, // expect *gauge, spacetime, mu, parity, row-column order -- links attached to odd points only QUDA_INVALID_GAUGE_ORDER = QUDA_INVALID_ENUM } QudaGaugeFieldOrder; diff --git a/include/enum_quda_fortran.h 
b/include/enum_quda_fortran.h index 5e17a9df8f..527d6a5798 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -51,6 +51,7 @@ #define QUDA_BQCD_GAUGE_ORDER 15 // expect *gauge mu even-odd spacetime+halos row-column order #define QUDA_TIFR_GAUGE_ORDER 16 #define QUDA_TIFR_PADDED_GAUGE_ORDER 17 +#define QUDA_OPENQCD_GAUGE_ORDER 18 #define QUDA_INVALID_GAUGE_ORDER QUDA_INVALID_ENUM #define QudaTboundary integer(4) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 714ad7dc07..3538550139 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2308,7 +2308,87 @@ namespace quda { size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } }; + /** + struct to define OpenQCD ordered gauge fields: + [volumecb][dim][parity*][row][col] + */ + template struct OpenQCDOrder : LegacyOrder { + using Accessor = OpenQCDOrder; + using real = typename mapper::type; + using complex = complex; + Float *gauge; + const int volumeCB; + static constexpr int Nc = 3; + const int dim[4]; + OpenQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), + gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + volumeCB(u.VolumeCB()), + dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]} + { + if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); + } + + // fields are only defined for odd points + // The pointer to the + // link variable U(x,mu) at any given *odd* point x is then + // ud+8*(ix-VOLUME/2)+2*mu + // while + // ud+8*(ix-VOLUME/2)+2*mu+1 + // is the pointer to the link variable U(x-mu,mu), where ix denotes the label of + // x. All link variables that constitute the local gauge field can thus be + // accessed in this simple way. 
+ // see https://gitlab.com/rcstar/openQxD/-/blob/master/main/README.global + // typedef struct + // { + // complex c11,c12,c13,c21,c22,c23,c31,c32,c33; + // } su3; + + __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const + { + if (parity == 1) { // odd points can be loaded directly + auto in = &gauge[(8 * x + 2 * dir) * length]; + block_load(v, reinterpret_cast(in)); + } else { + // gauge field for even points needs to be fetched from odd points, some indexing fun + // ud+8*(ix-VOLUME/2)+2*mu+1 + // is the pointer to the link variable U(x-mu,mu), + // so to get U(x,mu) for even x we need to load U(ix,mu) with ix=x+mu + int xmu = linkIndexP1(x, dim, dir); // TODO: What about on boundaries?, do we need to index into them? + auto in = &gauge[(8 * xmu + 2 * dir + 1) * length]; + block_load(v, reinterpret_cast(in)); + } + } + + __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity) const + { + // auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + // complex v_[9]; + // for (int i=0; i(reinterpret_cast(out), v_); + } + + /** + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. 
+ */ + __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const + { + return gauge_wrapper(const_cast(*this), dim, x_cb, parity); + } + + size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } + }; } // namespace gauge template diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h new file mode 100644 index 0000000000..07774cfedd --- /dev/null +++ b/include/quda_openqcd_interface.h @@ -0,0 +1,165 @@ +#pragma once + +#include +#include + +/** + * @file quda_openqcd_interface.h + * + * @section Description + * + * The header file defines the OpenQCD interface to enable easy + * interfacing between QUDA and the OpenQCD software. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Parameters related to problem size and machine topology. + */ +typedef struct { + const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ + const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ + const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ + int device; /** GPU device number */ +} openQCD_QudaLayout_t; + +/** + * Parameters used to create a QUDA context. + */ +typedef struct { + QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */ + openQCD_QudaLayout_t layout; /** Layout for QUDA to use */ +} openQCD_QudaInitArgs_t; // passed to the initialization struct + +/** + * Initialize the QUDA context. + * + * @param input Meta data for the QUDA context + */ +void openQCD_qudaInit(openQCD_QudaInitArgs_t input); + +// /** +// * Set the local dimensions and machine topology for QUDA to use +// * +// * @param layout Struct defining local dimensions and machine topology +// */ +// void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout); + +/** + * Destroy the QUDA context. 
+ */ +void openQCD_qudaFinalize(); + +#if 0 +// leave that here for now + /** + * Allocate pinned memory suitable for CPU-GPU transfers + * @param bytes The size of the requested allocation + * @return Pointer to allocated memory + */ + void* openQCD_qudaAllocatePinned(size_t bytes); + + /** + * Free pinned memory + * @param ptr Pointer to memory to be free + */ + void openQCD_qudaFreePinned(void *ptr); + + /** + * Allocate managed memory to reduce CPU-GPU transfers + * @param bytes The size of the requested allocation + * @return Pointer to allocated memory + */ + void *openQCD_qudaAllocateManaged(size_t bytes); + + /** + * Free managed memory + * @param ptr Pointer to memory to be free + */ + void openQCD_qudaFreeManaged(void *ptr); + +#endif + +/** + * Parameters related to linear solvers. + */ + +typedef struct { + // TODO: work out what we want to expose here + int max_iter; /** Maximum number of iterations */ + QudaParity + evenodd; /** Which parity are we working on ? (options are QUDA_EVEN_PARITY, QUDA_ODD_PARITY, QUDA_INVALID_PARITY */ + int mixed_precision; /** Whether to use mixed precision or not (1 - yes, 0 - no) */ + double boundary_phase[4]; /** Boundary conditions */ + int make_resident_solution; /** Make the solution resident and don't copy back */ + int use_resident_solution; /** Use the resident solution */ + QudaInverterType solver_type; /** Type of solver to use */ + double tadpole; /** Tadpole improvement factor - set to 1.0 for + HISQ fermions since the tadpole factor is + baked into the links during their construction */ + double naik_epsilon; /** Naik epsilon parameter (HISQ fermions only).*/ +} openQCD_QudaInvertArgs_t; + +/** + * Apply the improved staggered operator to a field. All fields + * passed and returned are host (CPU) field in MILC order. 
+ * + * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) + * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) + * @param inv_args Struct setting some solver metadata + * @param milc_fatlink Fat-link field on the host + * @param milc_longlink Long-link field on the host + * @param source Right-hand side source field + * @param solution Solution spinor field + */ +void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, + const void *const milc_fatlink, const void *const milc_longlink, void *source, void *solution, + int *num_iters); + +/** + * Solve Ax=b for an improved staggered operator. All fields + * passed and returned are host (CPU) fields in MILC order. This + * function requires that persistent gauge and clover fields have + * been created prior. This interface is experimental. + * + * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) + * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) + * @param mass Fermion mass parameter + * @param inv_args Struct setting some solver metadata + * @param target_residual Target residual + * @param target_fermilab_residual Target Fermilab residual + * @param milc_fatlink Fat-link field on the host + * @param milc_longlink Long-link field on the host + * @param source Right-hand side source field + * @param solution Solution spinor field + * @param final_resid True residual + * @param final_rel_resid True Fermilab residual + * @param num_iters Number of iterations taken + */ +void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, openQCD_QudaInvertArgs_t inv_args, + double target_residual, double target_fermilab_residual, const void *const milc_fatlink, + const void *const milc_longlink, void *source, void *solution, double *const final_resid, + double *const final_rel_resid, int *num_iters); + +/** + * 
Load the gauge field from the host. + * + * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) + * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) + * @param inv_args Meta data + * @param milc_link Base pointer to host gauge field (regardless of dimensionality) + */ +void openQCD_qudaLoadGaugeField(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, + const void *milc_link); + +/** + Free the gauge field allocated in QUDA. + */ +void openQCD_qudaFreeGaugeField(); + +#ifdef __cplusplus +} +#endif diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 99c5ac00d2..6c488cbd73 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -73,7 +73,8 @@ set (QUDA_OBJS copy_gauge_offset.cu copy_color_spinor_offset.cu copy_clover_offset.cu staggered_oprod.cu clover_trace_quda.cu hisq_paths_force_quda.cu - unitarize_force_quda.cu unitarize_links_quda.cu milc_interface.cpp + unitarize_force_quda.cu unitarize_links_quda.cu + milc_interface.cpp openqcd_interface.cpp tune.cpp device_vector.cu inv_gmresdr_quda.cpp @@ -349,6 +350,10 @@ if(QUDA_INTERFACE_TIFR OR QUDA_INTERFACE_ALL) target_compile_definitions(quda PUBLIC BUILD_TIFR_INTERFACE) endif(QUDA_INTERFACE_TIFR OR QUDA_INTERFACE_ALL) +if(QUDA_INTERFACE_OPENQCD OR QUDA_INTERFACE_ALL) + target_compile_definitions(quda PUBLIC BUILD_OPENQCD_INTERFACE) +endif(QUDA_INTERFACE_OPENQCD OR QUDA_INTERFACE_ALL) + if(QUDA_CONTRACT) target_compile_definitions(quda PUBLIC GPU_CONTRACT) endif(QUDA_CONTRACT) diff --git a/lib/copy_gauge_extended.cu b/lib/copy_gauge_extended.cu index ec0f9d31aa..b4472764b9 100644 --- a/lib/copy_gauge_extended.cu +++ b/lib/copy_gauge_extended.cu @@ -110,10 +110,18 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (out.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + using G = OpenQCDOrder; + CopyGaugeEx(out, in, location, Out, In); +#else + 
errorQuda("OPENQCD interface has not been built\n"); +#endif + } else { errorQuda("Gauge field %d order not supported", out.Order()); } - } template @@ -184,10 +192,17 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + using G = OpenQCDOrder; + copyGaugeEx(out, in, location, Out, In); +#else + errorQuda("OpenQCD interface has not been built\n"); +#endif + } else { errorQuda("Gauge field %d order not supported", in.Order()); } - } template diff --git a/lib/copy_gauge_inc.cu b/lib/copy_gauge_inc.cu index 966d1c49c9..d7eb06bf1f 100644 --- a/lib/copy_gauge_inc.cu +++ b/lib/copy_gauge_inc.cu @@ -131,10 +131,18 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (out.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + copyGauge(OpenQCDOrder(out, Out, outGhost), inOrder, + out, in, location, type); +#else + errorQuda("OPENQCD interface has not been built\n"); +#endif + } else { errorQuda("Gauge field %d order not supported", out.Order()); } - } template @@ -259,6 +267,15 @@ namespace quda { errorQuda("TIFR interface has not been built\n"); #endif + } else if (in.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + copyGauge(OpenQCDOrder(in, In, inGhost), out, in, location, Out, + outGhost, type); +#else + errorQuda("OPENQCD interface has not been built\n"); +#endif + } else { errorQuda("Gauge field order %d not supported", in.Order()); } diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp index f4b27109a8..f3dbc8172d 100644 --- a/lib/cpu_gauge_field.cpp +++ b/lib/cpu_gauge_field.cpp @@ -58,10 +58,10 @@ namespace quda { errorQuda("Unsupported creation type %d", create); } } - - } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || - order == QUDA_BQCD_GAUGE_ORDER || order == QUDA_TIFR_GAUGE_ORDER || - order == QUDA_TIFR_PADDED_GAUGE_ORDER 
|| order == QUDA_MILC_SITE_GAUGE_ORDER) { + + } else if (order == QUDA_CPS_WILSON_GAUGE_ORDER || order == QUDA_MILC_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER + || order == QUDA_TIFR_GAUGE_ORDER || order == QUDA_TIFR_PADDED_GAUGE_ORDER + || order == QUDA_MILC_SITE_GAUGE_ORDER || order == QUDA_OPENQCD_GAUGE_ORDER) { if (order == QUDA_MILC_SITE_GAUGE_ORDER && create != QUDA_REFERENCE_FIELD_CREATE) { errorQuda("MILC site gauge order only supported for reference fields"); @@ -79,7 +79,7 @@ namespace quda { } else { errorQuda("Unsupported gauge order type %d", order); } - + // no need to exchange data if this is a momentum field if (link_type != QUDA_ASQTAD_MOM_LINKS) { // Ghost zone is always 2-dimensional @@ -399,7 +399,8 @@ namespace quda { for (int d = 0; d < 4; d++) { std::memcpy(&dst_buffer[d * dbytes], p[d], dbytes); } } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER - || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { + || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER + || Order() == QUDA_OPENQCD_GAUGE_ORDER) { const void *p = Gauge_p(); int bytes = Bytes(); std::memcpy(buffer, p, bytes); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp new file mode 100644 index 0000000000..f7e781f843 --- /dev/null +++ b/lib/openqcd_interface.cpp @@ -0,0 +1,552 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +// #include + +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) + +// code for NVTX taken from Jiri Kraus' blog post: +// http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ + +#ifdef INTERFACE_NVTX + +#if QUDA_NVTX_VERSION == 3 +#include "nvtx3/nvToolsExt.h" +#else +#include "nvToolsExt.h" +#endif + +static const uint32_t colors[] = {0x0000ff00, 0x000000ff, 0x00ffff00, 0x00ff00ff, 0x0000ffff, 0x00ff0000, 0x00ffffff}; +static const int num_colors = sizeof(colors) / sizeof(uint32_t); + +#define PUSH_RANGE(name, cid) \ + { \ + int color_id = cid; \ + color_id = color_id % num_colors; \ + nvtxEventAttributes_t eventAttrib = {0}; \ + eventAttrib.version = NVTX_VERSION; \ + eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; \ + eventAttrib.colorType = NVTX_COLOR_ARGB; \ + eventAttrib.color = colors[color_id]; \ + eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \ + eventAttrib.message.ascii = name; \ + nvtxRangePushEx(&eventAttrib); \ + } +#define POP_RANGE nvtxRangePop(); +#else +#define PUSH_RANGE(name, cid) +#define POP_RANGE +#endif + +static bool initialized = false; +static int commsGridDim[4]; +static int localDim[4]; + +using namespace quda; + +// #define QUDAMILC_VERBOSE 1 + +// template void inline qudamilc_called(const char *func, QudaVerbosity verb) +// { +// // add NVTX markup if enabled +// if (start) { +// PUSH_RANGE(func, 1); +// } else { +// POP_RANGE; +// } + +// #ifdef QUDAMILC_VERBOSE +// if (verb >= QUDA_VERBOSE) { +// if (start) { +// printfQuda("QUDA_MILC_INTERFACE: %s (called) \n", func); +// } else { +// printfQuda("QUDA_MILC_INTERFACE: %s (return) \n", func); +// } +// } +// #endif +// } + +// template void inline qudamilc_called(const char *func) { qudamilc_called(func, getVerbosity()); } + +// TODO: fix me for openQCD (must be defined ahead of openQCD_qudaSetLayout, which passes it to initCommsGridQuda) +static int rankFromCoords(const int *coords, void *fdata) +{ + int *dims = static_cast<int *>(fdata); + + int rank = coords[3]; + for 
(int i = 2; i >= 0; i--) { rank = dims[i] * rank + coords[i]; } + return rank; +} + +void openQCD_qudaSetLayout(openQCD_QudaLayout_t input) +{ + int local_dim[4]; + for (int dir = 0; dir < 4; ++dir) { local_dim[dir] = input.latsize[dir]; } +#ifdef MULTI_GPU + for (int dir = 0; dir < 4; ++dir) { local_dim[dir] /= input.machsize[dir]; } 
+#endif + for (int dir = 0; dir < 4; ++dir) { + if (local_dim[dir] % 2 != 0) { + printf("Error: Odd lattice dimensions are not supported\n"); + exit(1); + } + } + // TODO: do we need to track this here + for (int dir = 0; dir < 4; ++dir) localDim[dir] = local_dim[dir]; + +#ifdef MULTI_GPU + for (int dir = 0; dir < 4; ++dir) commsGridDim[dir] = input.machsize[dir]; +// TODO: would we ever want to run with QMP COMMS? +#ifdef QMP_COMMS + initCommsGridQuda(4, commsGridDim, nullptr, nullptr); +#else + initCommsGridQuda(4, commsGridDim, rankFromCoords, (void *)(commsGridDim)); +#endif + int device = -1; +#else + int device = input.device; +#endif + + initQuda(device); +} + +void openQCD_qudaInit(openQCD_QudaInitArgs_t input) +{ + if (initialized) return; + setVerbosityQuda(input.verbosity, "", stdout); + // qudamilc_called(__func__); + openQCD_qudaSetLayout(input.layout); + initialized = true; + // qudamilc_called(__func__); +} + +void openQCD_qudaFinalize() { endQuda(); initialized = false; } // reset so that a later openQCD_qudaInit re-initializes QUDA + +// not sure we want to use allocators, but in case we want to +#if 0 +void *qudaAllocatePinned(size_t bytes) { return pool_pinned_malloc(bytes); } + +void qudaFreePinned(void *ptr) { pool_pinned_free(ptr); } + +void *qudaAllocateManaged(size_t bytes) { return managed_malloc(bytes); } + +void qudaFreeManaged(void *ptr) { managed_free(ptr); } +#endif + +static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec, QudaLinkType link_type) +{ + QudaGaugeParam gParam = newQudaGaugeParam(); + for (int dir = 0; dir < 4; ++dir) gParam.X[dir] = dim[dir]; + gParam.cuda_prec_sloppy = gParam.cpu_prec = gParam.cuda_prec = prec; + gParam.type = link_type; + + gParam.reconstruct_sloppy = gParam.reconstruct + = ((link_type == QUDA_SU3_LINKS) ? 
QUDA_RECONSTRUCT_12 : QUDA_RECONSTRUCT_NO); + gParam.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; + gParam.t_boundary = QUDA_PERIODIC_T; + gParam.gauge_fix = QUDA_GAUGE_FIXED_NO; + gParam.scale = 1.0; + gParam.anisotropy = 1.0; + gParam.tadpole_coeff = 1.0; + gParam.scale = 0; + gParam.ga_pad = 0; + gParam.site_ga_pad = 0; + gParam.mom_ga_pad = 0; + gParam.llfat_ga_pad = 0; + return gParam; +} + +// void qudaPlaquettePhased(int precision, double plaq[3], QudaMILCSiteArg_t *arg, int phase_in) +// { +// qudamilc_called(__func__); + +// QudaGaugeParam qudaGaugeParam = createGaugeParamForObservables(precision, arg, phase_in); +// void *gauge = arg->site ? arg->site : arg->link; + +// loadGaugeQuda(gauge, &qudaGaugeParam); + +// QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); +// obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; +// obsParam.remove_staggered_phase = phase_in ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; +// gaugeObservablesQuda(&obsParam); + +// // Let MILC apply its own Nc normalization +// plaq[0] = obsParam.plaquette[0]; +// plaq[1] = obsParam.plaquette[1]; +// plaq[2] = obsParam.plaquette[2]; + +// qudamilc_called(__func__); +// return; +// } + +// static int getLinkPadding(const int dim[4]) +// { +// int padding = MAX(dim[1] * dim[2] * dim[3] / 2, dim[0] * dim[2] * dim[3] / 2); +// padding = MAX(padding, dim[0] * dim[1] * dim[3] / 2); +// padding = MAX(padding, dim[0] * dim[1] * dim[2] / 2); +// return padding; +// } + +// set the params for the single mass solver +static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, + double mass, double target_residual, double target_residual_hq, int maxiter, + double reliable_delta, QudaParity parity, QudaVerbosity verbosity, + QudaInverterType inverter, QudaInvertParam *invertParam) +{ + invertParam->verbosity = verbosity; + invertParam->mass = mass; + invertParam->tol = target_residual; + invertParam->tol_hq = target_residual_hq; + + 
invertParam->residual_type = static_cast(0); + invertParam->residual_type = (target_residual != 0) ? + static_cast(invertParam->residual_type | QUDA_L2_RELATIVE_RESIDUAL) : + invertParam->residual_type; + invertParam->residual_type = (target_residual_hq != 0) ? + static_cast(invertParam->residual_type | QUDA_HEAVY_QUARK_RESIDUAL) : + invertParam->residual_type; + + invertParam->heavy_quark_check = (invertParam->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? 1 : 0); + if (invertParam->heavy_quark_check) { + invertParam->max_hq_res_increase = 5; // this caps the number of consecutive hq residual increases + invertParam->max_hq_res_restart_total = 10; // this caps the number of hq restarts in case of solver stalling + } + + invertParam->use_sloppy_partial_accumulator = 0; + invertParam->num_offset = 0; + + invertParam->inv_type = inverter; + invertParam->maxiter = maxiter; + invertParam->reliable_delta = reliable_delta; + + invertParam->mass_normalization = QUDA_MASS_NORMALIZATION; + invertParam->cpu_prec = cpu_prec; + invertParam->cuda_prec = cuda_prec; + invertParam->cuda_prec_sloppy = invertParam->heavy_quark_check ? cuda_prec : cuda_prec_sloppy; + invertParam->cuda_prec_precondition = cuda_prec_sloppy; + + invertParam->gcrNkrylov = 10; + + invertParam->solution_type = QUDA_MATPC_SOLUTION; + invertParam->solve_type = QUDA_DIRECT_PC_SOLVE; + invertParam->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // not used, but required by the code. 
+ invertParam->dirac_order = QUDA_DIRAC_ORDER; + + invertParam->dslash_type = QUDA_ASQTAD_DSLASH; + invertParam->Ls = 1; + invertParam->gflops = 0.0; + + invertParam->input_location = QUDA_CPU_FIELD_LOCATION; + invertParam->output_location = QUDA_CPU_FIELD_LOCATION; + + if (parity == QUDA_EVEN_PARITY) { // even parity + invertParam->matpc_type = QUDA_MATPC_EVEN_EVEN; + } else if (parity == QUDA_ODD_PARITY) { + invertParam->matpc_type = QUDA_MATPC_ODD_ODD; + } else { + errorQuda("Invalid parity\n"); + } + + invertParam->dagger = QUDA_DAG_NO; + invertParam->use_init_guess = QUDA_USE_INIT_GUESS_YES; + + // for the preconditioner + invertParam->inv_type_precondition = QUDA_CG_INVERTER; + invertParam->tol_precondition = 1e-1; + invertParam->maxiter_precondition = 2; + invertParam->verbosity_precondition = QUDA_SILENT; + + invertParam->compute_action = 0; +} + +static void setColorSpinorParams(const int dim[4], QudaPrecision precision, ColorSpinorParam *param) +{ + param->nColor = 3; + param->nSpin = 1; + param->nDim = 4; + + for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; + param->x[0] /= 2; + + param->setPrecision(precision); + param->pad = 0; + param->siteSubset = QUDA_PARITY_SITE_SUBSET; + param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; + param->fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; + param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. 
+ param->create = QUDA_ZERO_FIELD_CREATE; +} + +#if 0 +void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, openQCD_QudaInvertArgs_t inv_args, + double target_residual, double target_fermilab_residual, const void *const fatlink, + const void *const longlink, void *source, void *solution, double *const final_residual, + double *const final_fermilab_residual, int *num_iters) +{ + static const QudaVerbosity verbosity = getVerbosity(); + // qudamilc_called(__func__, verbosity); + + if (target_fermilab_residual == 0 && target_residual == 0) errorQuda("qudaInvert: requesting zero residual\n"); + + QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + + static bool force_double_queried = false; + static bool do_not_force_double = false; + if (!force_double_queried) { + char *donotusedouble_env = getenv("QUDA_MILC_OVERRIDE_DOUBLE_MULTISHIFT"); // disable forcing outer double precision + if (donotusedouble_env && (!(strcmp(donotusedouble_env, "0") == 0))) { + do_not_force_double = true; + printfQuda("Disabling always using double as fine precision for MILC multishift\n"); + } + force_double_queried = true; + } + + QudaPrecision device_precision = (quda_precision == 2) ? 
QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + + QudaPrecision device_precision_sloppy; + switch (inv_args.mixed_precision) { + case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break; + case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break; + default: device_precision_sloppy = device_precision; + } + + // override fine precision to double, switch to mixed as necessary + if (!do_not_force_double && device_precision == QUDA_SINGLE_PRECISION) { + // force outer double + device_precision = QUDA_DOUBLE_PRECISION; + } + + QudaGaugeParam fat_param = newQudaGaugeParam(); + QudaGaugeParam long_param = newQudaGaugeParam(); + setGaugeParams(fat_param, long_param, longlink, localDim, host_precision, device_precision, device_precision_sloppy, + inv_args.tadpole, inv_args.naik_epsilon); + + QudaInvertParam invertParam = newQudaInvertParam(); + + QudaParity local_parity = inv_args.evenodd; + const double reliable_delta = 1e-1; + + setInvertParams(host_precision, device_precision, device_precision_sloppy, mass, target_residual, + target_fermilab_residual, inv_args.max_iter, reliable_delta, local_parity, verbosity, + QUDA_CG_INVERTER, &invertParam); + + ColorSpinorParam csParam; + setColorSpinorParams(localDim, host_precision, &csParam); + + // dirty hack to invalidate the cached gauge field without breaking interface compatability + if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda(); + + if (invalidate_quda_gauge || !create_quda_gauge) { + loadGaugeQuda(const_cast(fatlink), &fat_param); + if (longlink != nullptr) loadGaugeQuda(const_cast(longlink), &long_param); + invalidate_quda_gauge = false; + } + + if (longlink == nullptr) invertParam.dslash_type = QUDA_STAGGERED_DSLASH; + + int quark_offset = getColorVectorOffset(local_parity, false, localDim) * host_precision; + + invertQuda(static_cast(solution) + quark_offset, static_cast(source) + quark_offset, &invertParam); + + // return the number of iterations taken by the inverter + 
*num_iters = invertParam.iter; + *final_residual = invertParam.true_res; + *final_fermilab_residual = invertParam.true_res_hq; + + if (!create_quda_gauge) invalidateGaugeQuda(); + + qudamilc_called(__func__, verbosity); +} // qudaInvert + +void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, const void *const fatlink, + const void *const longlink, void *src, void *dst, int *num_iters) +{ + static const QudaVerbosity verbosity = getVerbosity(); + qudamilc_called(__func__, verbosity); + + // static const QudaVerbosity verbosity = getVerbosity(); + QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision_sloppy = device_precision; + + QudaGaugeParam fat_param = newQudaGaugeParam(); + QudaGaugeParam long_param = newQudaGaugeParam(); + setGaugeParams(fat_param, long_param, longlink, localDim, host_precision, device_precision, device_precision_sloppy, + inv_args.tadpole, inv_args.naik_epsilon); + + QudaInvertParam invertParam = newQudaInvertParam(); + + QudaParity local_parity = inv_args.evenodd; + QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? 
QUDA_ODD_PARITY : QUDA_EVEN_PARITY; + + setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, + QUDA_CG_INVERTER, &invertParam); + + ColorSpinorParam csParam; + setColorSpinorParams(localDim, host_precision, &csParam); + + // dirty hack to invalidate the cached gauge field without breaking interface compatability + if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda(); + + if (invalidate_quda_gauge || !create_quda_gauge) { + loadGaugeQuda(const_cast(fatlink), &fat_param); + if (longlink != nullptr) loadGaugeQuda(const_cast(longlink), &long_param); + invalidate_quda_gauge = false; + } + + if (longlink == nullptr) invertParam.dslash_type = QUDA_STAGGERED_DSLASH; + + int src_offset = getColorVectorOffset(other_parity, false, localDim); + int dst_offset = getColorVectorOffset(local_parity, false, localDim); + + dslashQuda(static_cast(dst) + dst_offset * host_precision, + static_cast(src) + src_offset * host_precision, &invertParam, local_parity); + + if (!create_quda_gauge) invalidateGaugeQuda(); + + qudamilc_called(__func__, verbosity); +} // qudaDslash +#endif +// void* openQCD_qudaCreateGaugeField(void *gauge, int geometry, int precision) +// { +// qudamilc_called(__func__); +// QudaPrecision qudaPrecision = (precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; +// QudaGaugeParam qudaGaugeParam +// = newMILCGaugeParam(localDim, qudaPrecision, (geometry == 1) ? 
QUDA_GENERAL_LINKS : QUDA_SU3_LINKS); +// qudamilc_called(__func__); +// return createGaugeFieldQuda(gauge, geometry, &qudaGaugeParam); +// } + +// void qudaSaveGaugeField(void *gauge, void *inGauge) +// { +// qudamilc_called(__func__); +// cudaGaugeField *cudaGauge = reinterpret_cast(inGauge); +// QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS); +// saveGaugeFieldQuda(gauge, inGauge, &qudaGaugeParam); +// qudamilc_called(__func__); +// } + +// void qudaDestroyGaugeField(void *gauge) +// { +// qudamilc_called(__func__); +// destroyGaugeFieldQuda(gauge); +// qudamilc_called(__func__); +// } + +// void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_args, int external_precision, +// int quda_precision, double kappa, double reliable_delta); + +void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_QudaInvertArgs_t &inv_args, + int external_precision, int quda_precision) +{ + + const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + const QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision_sloppy; + + switch (inv_args.mixed_precision) { + case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break; + case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break; + default: device_precision_sloppy = device_precision; + } + + for (int dir = 0; dir < 4; ++dir) qudaGaugeParam.X[dir] = dim[dir]; + + qudaGaugeParam.anisotropy = 1.0; + qudaGaugeParam.type = QUDA_WILSON_LINKS; + qudaGaugeParam.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; + + // Check the boundary conditions + // Can't have twisted or anti-periodic boundary conditions in the spatial + // directions with 12 reconstruct at the moment. 
+ bool trivial_phase = true; + for (int dir = 0; dir < 3; ++dir) { + if (inv_args.boundary_phase[dir] != 0) trivial_phase = false; + } + if (inv_args.boundary_phase[3] != 0 && inv_args.boundary_phase[3] != 1) trivial_phase = false; + + if (trivial_phase) { + qudaGaugeParam.t_boundary = (inv_args.boundary_phase[3]) ? QUDA_ANTI_PERIODIC_T : QUDA_PERIODIC_T; + qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12; + qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12; + } else { + qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; + qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; + qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; + } + + qudaGaugeParam.cpu_prec = host_precision; + qudaGaugeParam.cuda_prec = device_precision; + qudaGaugeParam.cuda_prec_sloppy = device_precision_sloppy; + qudaGaugeParam.cuda_prec_precondition = device_precision_sloppy; + qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; + // qudaGaugeParam.ga_pad = getLinkPadding(dim); +} + +void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_args, int external_precision, + int quda_precision, double kappa, double reliable_delta) +{ + + const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + const QudaPrecision device_precision = (quda_precision == 2) ? 
QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision_sloppy; + switch (inv_args.mixed_precision) { + case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break; + case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break; + default: device_precision_sloppy = device_precision; + } + + static const QudaVerbosity verbosity = getVerbosity(); + + invertParam.dslash_type = QUDA_CLOVER_WILSON_DSLASH; + invertParam.kappa = kappa; + invertParam.dagger = QUDA_DAG_NO; + invertParam.mass_normalization = QUDA_KAPPA_NORMALIZATION; + invertParam.gcrNkrylov = 30; + invertParam.reliable_delta = reliable_delta; + invertParam.maxiter = inv_args.max_iter; + + invertParam.cuda_prec_precondition = device_precision_sloppy; + invertParam.verbosity_precondition = verbosity; + invertParam.verbosity = verbosity; + invertParam.cpu_prec = host_precision; + invertParam.cuda_prec = device_precision; + invertParam.cuda_prec_sloppy = device_precision_sloppy; + invertParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + invertParam.dirac_order = QUDA_DIRAC_ORDER; + invertParam.clover_cpu_prec = host_precision; + invertParam.clover_cuda_prec = device_precision; + invertParam.clover_cuda_prec_sloppy = device_precision_sloppy; + invertParam.clover_cuda_prec_precondition = device_precision_sloppy; + invertParam.clover_order = QUDA_PACKED_CLOVER_ORDER; + + invertParam.compute_action = 0; +} + +void openQCD_qudaLoadGaugeField(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, + const void *milc_link) +{ + // qudamilc_called(__func__); + QudaGaugeParam qudaGaugeParam = newQudaGaugeParam(); + setGaugeParams(qudaGaugeParam, localDim, inv_args, external_precision, quda_precision); + + loadGaugeQuda(const_cast(milc_link), &qudaGaugeParam); + // qudamilc_called(__func__); +} // qudaLoadGaugeField + +void openQCD_qudaFreeGaugeField() +{ + // qudamilc_called(__func__); + freeGaugeQuda(); + // qudamilc_called(__func__); +} // qudaFreeGaugeField From 
71c18976a87cb2a1a24917e9763d97f691b25923 Mon Sep 17 00:00:00 2001 From: Mahias Wagner Date: Tue, 27 Sep 2022 16:25:59 +0200 Subject: [PATCH 002/148] compilation fixes and plaquette dummy --- lib/openqcd_interface.cpp | 63 +++++++++++++++++++-------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index f7e781f843..10d1802ea7 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -55,7 +55,6 @@ static int localDim[4]; using namespace quda; // #define QUDAMILC_VERBOSE 1 - // template void inline qudamilc_called(const char *func, QudaVerbosity verb) // { // // add NVTX markup if enabled @@ -77,6 +76,15 @@ using namespace quda; // } // template void inline qudamilc_called(const char *func) { qudamilc_called(func, getVerbosity()); } +// TODO: fix me for openQCD +static int rankFromCoords(const int *coords, void *fdata) +{ + int *dims = static_cast(fdata); + + int rank = coords[3]; + for (int i = 2; i >= 0; i--) { rank = dims[i] * rank + coords[i]; } + return rank; +} void openQCD_qudaSetLayout(openQCD_QudaLayout_t input) { @@ -122,16 +130,6 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t input) void openQCD_qudaFinalize() { endQuda(); } -// TODO: fix me for openQCD -static int rankFromCoords(const int *coords, void *fdata) -{ - int *dims = static_cast(fdata); - - int rank = coords[3]; - for (int i = 2; i >= 0; i--) { rank = dims[i] * rank + coords[i]; } - return rank; -} - // not sure we want to use allocators, but in case we want to #if 0 void *qudaAllocatePinned(size_t bytes) { return pool_pinned_malloc(bytes); } @@ -143,15 +141,14 @@ void *qudaAllocateManaged(size_t bytes) { return managed_malloc(bytes); } void qudaFreeManaged(void *ptr) { managed_free(ptr); } #endif -static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec, QudaLinkType link_type) +static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) { QudaGaugeParam 
gParam = newQudaGaugeParam(); for (int dir = 0; dir < 4; ++dir) gParam.X[dir] = dim[dir]; gParam.cuda_prec_sloppy = gParam.cpu_prec = gParam.cuda_prec = prec; - gParam.type = link_type; + gParam.type = QUDA_SU3_LINKS; - gParam.reconstruct_sloppy = gParam.reconstruct - = ((link_type == QUDA_SU3_LINKS) ? QUDA_RECONSTRUCT_12 : QUDA_RECONSTRUCT_NO); + gParam.reconstruct_sloppy = gParam.reconstruct = QUDA_RECONSTRUCT_NO; gParam.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; gParam.t_boundary = QUDA_PERIODIC_T; gParam.gauge_fix = QUDA_GAUGE_FIXED_NO; @@ -166,28 +163,30 @@ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec, Q return gParam; } -// void qudaPlaquettePhased(int precision, double plaq[3], QudaMILCSiteArg_t *arg, int phase_in) -// { -// qudamilc_called(__func__); +void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) +{ + // qudamilc_called(__func__); -// QudaGaugeParam qudaGaugeParam = createGaugeParamForObservables(precision, arg, phase_in); -// void *gauge = arg->site ? arg->site : arg->link; + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); // fixme + // reateGaugeParamForObservables(precision, arg, phase_in); -// loadGaugeQuda(gauge, &qudaGaugeParam); + loadGaugeQuda(gauge, &qudaGaugeParam); -// QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); -// obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; -// obsParam.remove_staggered_phase = phase_in ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; -// gaugeObservablesQuda(&obsParam); + QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); + obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; + obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; // + // phase_in ? 
QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + gaugeObservablesQuda(&obsParam); -// // Let MILC apply its own Nc normalization -// plaq[0] = obsParam.plaquette[0]; -// plaq[1] = obsParam.plaquette[1]; -// plaq[2] = obsParam.plaquette[2]; + // Let MILC apply its own Nc normalization + plaq[0] = obsParam.plaquette[0]; + plaq[1] = obsParam.plaquette[1]; + plaq[2] = obsParam.plaquette[2]; -// qudamilc_called(__func__); -// return; -// } + // qudamilc_called(__func__); + return; +} // static int getLinkPadding(const int dim[4]) // { From efc24b8404d94ea72ef12433f1c40e6eff87d11c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Antonio=20Fern=C3=A1ndez=20de=20la=20Garza?= Date: Tue, 27 Sep 2022 17:32:24 +0200 Subject: [PATCH 003/148] Updated rank formula The rank formula was updated to the openQxD layout: rank = coords[0]*dims[1]*dims[2]*dims[3] + coords[1]*dims[2]*dims[3] + coords[2]*dims[3] + coords[3] --- lib/openqcd_interface.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 10d1802ea7..f0e115dadd 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -80,9 +80,9 @@ using namespace quda; static int rankFromCoords(const int *coords, void *fdata) { int *dims = static_cast(fdata); - - int rank = coords[3]; - for (int i = 2; i >= 0; i--) { rank = dims[i] * rank + coords[i]; } +// rank = coords[0]*dims[1]*dims[2]*dims[3] + coords[1]*dims[2]*dims[3] + coords[2]*dims[3] + coords[3] + int rank = coords[0]; + for (int i = 0; i < 3; i++) { rank = dims[i] * rank + coords[i]; } return rank; } From 61b8a02b3d62485bf7e2d3c41182daea2aa7b166 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Antonio=20Fern=C3=A1ndez=20de=20la=20Garza?= Date: Tue, 27 Sep 2022 17:33:46 +0200 Subject: [PATCH 004/148] Typo corrected The forloop was corrected (typo in i counter) --- lib/openqcd_interface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openqcd_interface.cpp 
b/lib/openqcd_interface.cpp index f0e115dadd..f2a34533b3 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -82,7 +82,7 @@ static int rankFromCoords(const int *coords, void *fdata) int *dims = static_cast(fdata); // rank = coords[0]*dims[1]*dims[2]*dims[3] + coords[1]*dims[2]*dims[3] + coords[2]*dims[3] + coords[3] int rank = coords[0]; - for (int i = 0; i < 3; i++) { rank = dims[i] * rank + coords[i]; } + for (int i = 1; i < 3; i++) { rank = dims[i] * rank + coords[i]; } return rank; } From a40b0326097806c1de6ddd7bdb435ed9b7fbe3bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Antonio=20Fern=C3=A1ndez=20de=20la=20Garza?= Date: Tue, 27 Sep 2022 17:35:10 +0200 Subject: [PATCH 005/148] Typo corrected corrected to i<=3 --- lib/openqcd_interface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index f2a34533b3..97e14a4a22 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -82,7 +82,7 @@ static int rankFromCoords(const int *coords, void *fdata) int *dims = static_cast(fdata); // rank = coords[0]*dims[1]*dims[2]*dims[3] + coords[1]*dims[2]*dims[3] + coords[2]*dims[3] + coords[3] int rank = coords[0]; - for (int i = 1; i < 3; i++) { rank = dims[i] * rank + coords[i]; } + for (int i = 1; i <= 3; i++) { rank = dims[i] * rank + coords[i]; } return rank; } From 0a69472e336e97c5454105030b5bdb5bf06e113f Mon Sep 17 00:00:00 2001 From: nyholme Date: Wed, 28 Sep 2022 11:55:58 +0200 Subject: [PATCH 006/148] fix rankFromCoords to consider BLK order --- lib/openqcd_interface.cpp | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 97e14a4a22..7b0e06eedd 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -76,13 +76,31 @@ using namespace quda; // } // template void inline qudamilc_called(const char *func) { qudamilc_called(func, 
getVerbosity()); } -// TODO: fix me for openQCD + +// fdata should point to 8 integers in order {BLK_NPROC0, BLK_NPROC1, BLK_NPROC2, BLK_NPROC3, NPROC0, NPROC1, NPROC2, NPROC3] static int rankFromCoords(const int *coords, void *fdata) { - int *dims = static_cast(fdata); -// rank = coords[0]*dims[1]*dims[2]*dims[3] + coords[1]*dims[2]*dims[3] + coords[2]*dims[3] + coords[3] - int rank = coords[0]; - for (int i = 1; i <= 3; i++) { rank = dims[i] * rank + coords[i]; } + int *BLK_NPROC = static_cast(fdata); + int *NPROC = BLK_NPROC + 4; + + int BLK_coords[4]; + int local_coords[4]; + + for (int i = 0; i < 4; i++) { + // coordinate of BLK in the BLK grid + BLK_coords[i] = coords[i] / BLK_NPROC[i]; + // local coordinate inside BLK + local_coords[i] = coords[i] - BLK_coords[i]*BLK_NPROC[i]; + } + + int rank = BLK_coords[0]; + for (int i = 1; i <= 3; i++) { + rank = (NPROC[i] / BLK_NPROC[i]) * rank + BLK_coords[i]; + } + + for (int i = 0; i <= 3; i++) { + rank = BLK_NPROC[i] * rank + local_coords[i]; + } return rank; } From c29aa6699bffa38de4901c8b624c4b452919b2e3 Mon Sep 17 00:00:00 2001 From: Mahias Wagner Date: Wed, 28 Sep 2022 16:28:17 +0200 Subject: [PATCH 007/148] some fixes for openQCD testing --- CMakeLists.txt | 2 +- include/gauge_field_order.h | 4 +++- include/quda_openqcd_interface.h | 4 +++- lib/openqcd_interface.cpp | 15 +++++++++++---- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ad6f8d1cc3..cce96a5a15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -171,7 +171,7 @@ option(QUDA_OPENBLAS "enable OpenBLAS" OFF) # Interface options option(QUDA_INTERFACE_QDP "build qdp interface" ON) option(QUDA_INTERFACE_MILC "build milc interface" ON) -option(QUDA_INTERFACE_OpenQCD "build OpenQCD interface" OFF) +option(QUDA_INTERFACE_OPENQCD "build OpenQCD interface" OFF) option(QUDA_INTERFACE_CPS "build cps interface" OFF) option(QUDA_INTERFACE_QDPJIT "build qdpjit interface" OFF) option(QUDA_INTERFACE_BQCD "build 
bqcd interface" OFF) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 3538550139..041b6b5e3a 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2354,7 +2354,9 @@ namespace quda { // ud+8*(ix-VOLUME/2)+2*mu+1 // is the pointer to the link variable U(x-mu,mu), // so to get U(x,mu) for even x we need to load U(ix,mu) with ix=x+mu - int xmu = linkIndexP1(x, dim, dir); // TODO: What about on boundaries?, do we need to index into them? + int coord[4]; + getCoords(coord, x, dim, 1); + int xmu = linkIndexP1(coord, dim, dir); // TODO: What about on boundaries?, do we need to index into them? auto in = &gauge[(8 * xmu + 2 * dir + 1) * length]; block_load(v, reinterpret_cast(in)); diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 07774cfedd..796688056d 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -51,7 +51,7 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t input); /** * Destroy the QUDA context. */ -void openQCD_qudaFinalize(); +void openQCD_qudaFinalize(void); #if 0 // leave that here for now @@ -155,6 +155,8 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, void openQCD_qudaLoadGaugeField(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, const void *milc_link); +void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge); + /** Free the gauge field allocated in QUDA. 
*/ diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 7b0e06eedd..3db8536448 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -159,6 +159,15 @@ void *qudaAllocateManaged(size_t bytes) { return managed_malloc(bytes); } void qudaFreeManaged(void *ptr) { managed_free(ptr); } #endif + +static int getLinkPadding(const int dim[4]) +{ + int padding = MAX(dim[1]*dim[2]*dim[3]/2, dim[0]*dim[2]*dim[3]/2); + padding = MAX(padding, dim[0]*dim[1]*dim[3]/2); + padding = MAX(padding, dim[0]*dim[1]*dim[2]/2); + return padding; +} + static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) { QudaGaugeParam gParam = newQudaGaugeParam(); @@ -174,10 +183,8 @@ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) gParam.anisotropy = 1.0; gParam.tadpole_coeff = 1.0; gParam.scale = 0; - gParam.ga_pad = 0; - gParam.site_ga_pad = 0; - gParam.mom_ga_pad = 0; - gParam.llfat_ga_pad = 0; + gParam.ga_pad = getLinkPadding(dim); + return gParam; } From d3d402255fed3383cc3cb8d9d66512708115d803 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Fri, 27 Jan 2023 17:39:32 +0100 Subject: [PATCH 008/148] Still not working, trying to manually couple indexing correctly --- include/enum_quda.h | 2 +- include/gauge_field_order.h | 115 ++++++++++++++++++++++--------- include/quda_openqcd_interface.h | 3 + lib/interface_quda.cpp | 2 +- lib/openqcd_interface.cpp | 24 ++++++- 5 files changed, 112 insertions(+), 34 deletions(-) diff --git a/include/enum_quda.h b/include/enum_quda.h index 369bc7d176..940cfd80e9 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -48,7 +48,7 @@ typedef enum QudaGaugeFieldOrder_s { QUDA_BQCD_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime+halos, column-row order QUDA_TIFR_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime, column-row order QUDA_TIFR_PADDED_GAUGE_ORDER, // expect *gauge, mu, parity, t, z+halo, y, x/2, column-row order - QUDA_OPENQCD_GAUGE_ORDER, // 
expect *gauge, spacetime, mu, parity, row-column order -- links attached to odd points only + QUDA_OPENQCD_GAUGE_ORDER, // expect *gauge, spacetime, mu, parity (uplink/downlink), row-column order -- links attached to odd points only QUDA_INVALID_GAUGE_ORDER = QUDA_INVALID_ENUM } QudaGaugeFieldOrder; diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 041b6b5e3a..f716c3db36 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2308,9 +2308,11 @@ namespace quda { size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } }; + + /** struct to define OpenQCD ordered gauge fields: - [volumecb][dim][parity*][row][col] + [volumecb][dim][parity*][row][col] parity*: uplink/downlink (link attached to closest odd site) */ template struct OpenQCDOrder : LegacyOrder { using Accessor = OpenQCDOrder; @@ -2329,38 +2331,89 @@ namespace quda { if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); } - // fields are only defined for odd points - // The pointer to the - // link variable U(x,mu) at any given *odd* point x is then - // ud+8*(ix-VOLUME/2)+2*mu - // while - // ud+8*(ix-VOLUME/2)+2*mu+1 - // is the pointer to the link variable U(x-mu,mu), where ix denotes the label of - // x. All link variables that constitute the local gauge field can thus be - // accessed in this simple way. 
- // see https://gitlab.com/rcstar/openQxD/-/blob/master/main/README.global - // typedef struct + // For reference: https://gitlab.com/rcstar/openQxD/-/blob/master/main/README.global + + // TODO: packet this function + // __device__ __host__ inline int QUDAtoOpenQxD(int x_cb_QUDA, int dir_QUDA, int parity_QUDA) const // { - // complex c11,c12,c13,c21,c22,c23,c31,c32,c33; - // } su3; - __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const - { - if (parity == 1) { // odd points can be loaded directly - auto in = &gauge[(8 * x + 2 * dir) * length]; - block_load(v, reinterpret_cast(in)); - } else { - // gauge field for even points needs to be fetched from odd points, some indexing fun - // ud+8*(ix-VOLUME/2)+2*mu+1 - // is the pointer to the link variable U(x-mu,mu), - // so to get U(x,mu) for even x we need to load U(ix,mu) with ix=x+mu - int coord[4]; - getCoords(coord, x, dim, 1); - int xmu = linkIndexP1(coord, dim, dir); // TODO: What about on boundaries?, do we need to index into them? - - auto in = &gauge[(8 * xmu + 2 * dir + 1) * length]; - block_load(v, reinterpret_cast(in)); - } + // } + + // TODO: Implement ipt and iup functions + + + + __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const // FIXME: What's this "Float = 1.0" for ? + { + // FIX: what's x: ANS: x is the checkerboard 1-D index (cf. 
index_helper.cuh) + + // With ''natural'' order: lexicographical 0123 = txyz , z fastest, links 0123 = txyz in pos directions + + // Indexing fun: + int coord[4]; // declare a 4D vector x0, x1, x2, x3 + getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site + // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; + int iy_OpenQxD = coord[2] + dim[2]*coord[1] + dim[2]*dim[1]*coord[0] + dim[0]*dim[2]*dim[1]*coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) + coord1 in QUDA is x2 in OpenQxD (y) + coord2 in QUDA is x3 in OpenQxD (z) + coord3 in QUDA is x0 in OpenQxD (t) + */ + int dir_OpenQxD = (dir + 1)%4; // rotation of axes QUDA -> OpenQxD + + // Loading as per QUDA style + auto in = &gauge[ (8*(iy_OpenQxD) + 2*dir_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) + block_load(v, reinterpret_cast(in)); + + + + // if (parity == 1) { // odd points can be loaded directly + + // // Indexing fun: + // int coord[4]; // declare a 4D vector x0, x1, x2, x3 + // getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site + // // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; + // int iy_OpenQxD = coord[2] + dim[2]*coord[1] + dim[2]*dim[1]*coord[0] + dim[0]*dim[2]*dim[1]*coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) + // coord1 in QUDA is x2 in OpenQxD (y) + // coord2 in QUDA is x3 in OpenQxD (z) + // coord3 in QUDA is x0 in OpenQxD (t) + // */ + // int ix_OpenQxD = ipt[iy_OpenQxD]; // ipt mapping + // int dir_OpenQxD = (dir + 1)%4; // rotation of axes QUDA -> OpenQxD + + + // // Loading as per QUDA style + // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) + // block_load(v, reinterpret_cast(in)); + + // } else if (parity ==0) { + + // // More indexing 
fun: + // int coord[4]; // declare a 4D vector x0, x1, x2, x3 + // getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site + // // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; + // int iy_OpenQxD = coord[2] + dim[2]*coord[1] + dim[2]*dim[1]*coord[0] + dim[0]*dim[2]*dim[1]*coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) + // coord1 in QUDA is x2 in OpenQxD (y) + // coord2 in QUDA is x3 in OpenQxD (z) + // coord3 in QUDA is x0 in OpenQxD (t) + // In OpenQxD, z runs the fastest, (txyz order) + // In QUDA, I think t runs the fastest (xyzt order) + // Or should it be (zyxt ??) maybe FIXME: + // */ + // int ix_OpenQxD = ipt[iy_OpenQxD]; // ipt mapping + // int dir_OpenQxD = (dir + 1)%4; // rotation of axes QUDA -> OpenQxD + + // int ix_OpenQxD_shifted = iup[ix_OpenQxD][dir_OpenQxD]; // obtain neighboring index + + // // int xmu = linkIndexP1(coord, dim, dir); // Maybe for later FIXME: What about on boundaries?, do we need to index into them? + + // // Loading as per QUDA style + // auto in = &gauge[ (8*(ix_OpenQxD_shifted - volumeCB) + 2*dir_OpenQxD + 1)* length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) + // block_load(v, reinterpret_cast(in)); + + // } else { + // std::cout << "This shouldn't happen!!: Error in parity OpenQxD Order interface" << std::endl; + // } + } __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity) const diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 796688056d..aa9f1ab739 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -157,6 +157,9 @@ void openQCD_qudaLoadGaugeField(int external_precision, int quda_precision, open void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge); + +// int openQCD_ipt(int iy); + /** Free the gauge field allocated in QUDA. 
*/ diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index b09ccec80a..3da20c4d87 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -537,7 +537,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) static_cast(new cpuGaugeField(gauge_param)) : static_cast(new cudaGaugeField(gauge_param)); - if (in->Order() == QUDA_BQCD_GAUGE_ORDER) { + if (in->Order() == QUDA_BQCD_GAUGE_ORDER) { static size_t checksum = SIZE_MAX; size_t in_checksum = in->checksum(true); if (in_checksum == checksum) { diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 3db8536448..b883e8c251 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -193,7 +193,7 @@ void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) // qudamilc_called(__func__); QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); // fixme + = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); // FIXME: // reateGaugeParamForObservables(precision, arg, phase_in); loadGaugeQuda(gauge, &qudaGaugeParam); @@ -213,6 +213,25 @@ void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) return; } +// static int openQCD_index() +// { +// // This function is the helper for ipt in QUDA + + +// return ix; +// } + +// int openQCD_ipt(int iy) +// { +// // This function computes the ipt index from iy (lexicographical index) +// int x0,x1,x2,x3; +// int k,mu,ix,iy,iz,iw; +// int bo[4],bs[4],ifc[8]; + + + +// } + // static int getLinkPadding(const int dim[4]) // { // int padding = MAX(dim[1] * dim[2] * dim[3] / 2, dim[0] * dim[2] * dim[3] / 2); @@ -293,6 +312,9 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud invertParam->compute_action = 0; } + + + static void setColorSpinorParams(const int dim[4], QudaPrecision precision, ColorSpinorParam *param) { param->nColor = 3; From 9e1b7ad688ab8f5318721b82dcf6ceb854dbec71 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Mon, 30 Jan 2023 18:12:33 +0100 Subject: [PATCH 009/148] Now working, todo: add ipt directly in quda Currently, the reordering is done inside OpenQxd --- CMakeLists.txt | 14 +++++++++ include/gauge_field_order.h | 53 +++++++++++++++++++++++++++++--- include/quda_openqcd_interface.h | 1 + 3 files changed, 64 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index cce96a5a15..db4137b491 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -686,3 +686,17 @@ include(CTest) add_subdirectory(lib) add_subdirectory(tests) add_subdirectory(doc) + + + +# ###################################################################################################################### +# OpenQxD +# ###################################################################################################################### +# We might only want to do that if using QUDA_DOWNLOAD_USQCD, but this does not work if not set on the 
initial run +# if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + # set(CMAKE_INSTALL_PREFIX + # ${CMAKE_BINARY_DIR}/usqcd + # CACHE PATH "..." FORCE) +# endif() + +# include_directories(/scratch/jfernande/openQxD-devel/include) \ No newline at end of file diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index f716c3db36..098c154356 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -24,6 +24,9 @@ #include #include +// #include "global.h" + + namespace quda { /** @@ -2320,6 +2323,7 @@ namespace quda { using complex = complex; Float *gauge; const int volumeCB; + // int ipt; static constexpr int Nc = 3; const int dim[4]; OpenQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : @@ -2342,26 +2346,67 @@ namespace quda { // TODO: Implement ipt and iup functions +/******* Previous version: */ + // // fields are only defined for odd points + // // The pointer to the + // // link variable U(x,mu) at any given *odd* point x is then + // // ud+8*(ix-VOLUME/2)+2*mu + // // while + // // ud+8*(ix-VOLUME/2)+2*mu+1 + // // is the pointer to the link variable U(x-mu,mu), where ix denotes the label of + // // x. All link variables that constitute the local gauge field can thus be + // // accessed in this simple way. + // // see https://gitlab.com/rcstar/openQxD/-/blob/master/main/README.global + // // typedef struct + // // { + // // complex c11,c12,c13,c21,c22,c23,c31,c32,c33; + // // } su3; + + // __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const + // { + // if (parity == 1) { // odd points can be loaded directly + // auto in = &gauge[(8 * x + 2 * dir) * length]; // FIXME: what's x? 
+ // block_load(v, reinterpret_cast(in)); + // } else { + // // gauge field for even points needs to be fetched from odd points, some indexing fun + // // ud+8*(ix-VOLUME/2)+2*mu+1 + // // is the pointer to the link variable U(x-mu,mu), + // // Mathias: so to get U(x,mu) for even x we need to load U(ix,mu) with ix=x+mu + // int coord[4]; + // getCoords(coord, x, dim, 1); // From here, x is the checkerboard index + // int xmu = linkIndexP1(coord, dim, dir); // TODO: What about on boundaries?, do we need to index into them? + + // auto in = &gauge[(8 * xmu + 2 * dir + 1) * length]; // TODO: xmu is not ipt[iy] + // block_load(v, reinterpret_cast(in)); + // } + // } +/******* End of Previous version: */ + __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const // FIXME: What's this "Float = 1.0" for ? { - // FIX: what's x: ANS: x is the checkerboard 1-D index (cf. index_helper.cuh) + // FIX: what's x? (cf. index_helper.cuh) // With ''natural'' order: lexicographical 0123 = txyz , z fastest, links 0123 = txyz in pos directions // Indexing fun: - int coord[4]; // declare a 4D vector x0, x1, x2, x3 + int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
+ getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site + // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; int iy_OpenQxD = coord[2] + dim[2]*coord[1] + dim[2]*dim[1]*coord[0] + dim[0]*dim[2]*dim[1]*coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) coord1 in QUDA is x2 in OpenQxD (y) coord2 in QUDA is x3 in OpenQxD (z) coord3 in QUDA is x0 in OpenQxD (t) */ + // int ix_OpenQxD = ipt[iy_OpenQxD]; int dir_OpenQxD = (dir + 1)%4; // rotation of axes QUDA -> OpenQxD + // Loading as per QUDA style - auto in = &gauge[ (8*(iy_OpenQxD) + 2*dir_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) + auto in = &gauge[ (4*(iy_OpenQxD) + dir_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) + // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) block_load(v, reinterpret_cast(in)); @@ -2442,7 +2487,7 @@ namespace quda { return gauge_wrapper(const_cast(*this), dim, x_cb, parity); } - size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } + size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } // Double => Float = 1.0 => 1 byte per float, 18 floats per complex 3x3 matrix }; } // namespace gauge diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index aa9f1ab739..f3f7ace3c2 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -24,6 +24,7 @@ typedef struct { const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ int device; /** GPU device number */ + // const int *ipt; } openQCD_QudaLayout_t; /** From aec93e34c3004aae946ed256bfb512dc214969f8 Mon Sep 17 00:00:00 
2001 From: fernandezdlg Date: Fri, 3 Feb 2023 14:56:14 +0100 Subject: [PATCH 010/148] TODO: comments & planning --- include/gauge_field_order.h | 5 +++-- include/quda_openqcd_interface.h | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 098c154356..e2b662710e 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2382,7 +2382,8 @@ namespace quda { // } /******* End of Previous version: */ - + + // TODO: even if x, dir, parity are the global indices, the conversion from global to local may run into problems __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const // FIXME: What's this "Float = 1.0" for ? { // FIX: what's x? (cf. index_helper.cuh) @@ -2395,6 +2396,7 @@ namespace quda { getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; + // TODO: Determine whether coord[mu] is local or global int iy_OpenQxD = coord[2] + dim[2]*coord[1] + dim[2]*dim[1]*coord[0] + dim[0]*dim[2]*dim[1]*coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) coord1 in QUDA is x2 in OpenQxD (y) coord2 in QUDA is x3 in OpenQxD (z) @@ -2409,7 +2411,6 @@ namespace quda { // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) block_load(v, reinterpret_cast(in)); - // if (parity == 1) { // odd points can be loaded directly diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index f3f7ace3c2..16ecce2d00 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -20,9 +20,9 @@ extern "C" { * Parameters related to problem size and machine topology. 
*/ typedef struct { - const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ - const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ - const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ + const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ // FIXME: + const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ // FIXME: + const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ // FIXME: int device; /** GPU device number */ // const int *ipt; } openQCD_QudaLayout_t; From 1f0e5732af4e4d12147f6a367ffe0014a37ad5f2 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Mon, 20 Feb 2023 15:53:46 +0100 Subject: [PATCH 011/148] hardwire quda rankfromcoords --- include/quda_openqcd_interface.h | 2 +- lib/openqcd_interface.cpp | 50 +++++++++++++++++++------------- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 16ecce2d00..214b51e401 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -23,7 +23,7 @@ typedef struct { const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ // FIXME: const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ // FIXME: const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ // FIXME: - int device; /** GPU device number */ + int device; /** GPU device number */ // const int *ipt; } openQCD_QudaLayout_t; diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index b883e8c251..7ffc99a2ab 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -80,28 +80,37 @@ using namespace quda; // fdata should point to 8 integers in order {BLK_NPROC0, BLK_NPROC1, BLK_NPROC2, BLK_NPROC3, NPROC0, NPROC1, NPROC2, NPROC3] static int rankFromCoords(const int *coords, void *fdata) { - int *BLK_NPROC = static_cast(fdata); - int 
*NPROC = BLK_NPROC + 4; - - int BLK_coords[4]; - int local_coords[4]; - - for (int i = 0; i < 4; i++) { - // coordinate of BLK in the BLK grid - BLK_coords[i] = coords[i] / BLK_NPROC[i]; - // local coordinate inside BLK - local_coords[i] = coords[i] - BLK_coords[i]*BLK_NPROC[i]; - } - - int rank = BLK_coords[0]; - for (int i = 1; i <= 3; i++) { - rank = (NPROC[i] / BLK_NPROC[i]) * rank + BLK_coords[i]; + // For 2 ranks: + if (coords[3]<8) + { + return 0; + } else { + return 1; } + - for (int i = 0; i <= 3; i++) { - rank = BLK_NPROC[i] * rank + local_coords[i]; - } - return rank; + // int *BLK_NPROC = static_cast(fdata); + // int *NPROC = BLK_NPROC + 4; + + // int BLK_coords[4]; + // int local_coords[4]; + + // for (int i = 0; i < 4; i++) { + // // coordinate of BLK in the BLK grid + // BLK_coords[i] = coords[i] / BLK_NPROC[i]; + // // local coordinate inside BLK + // local_coords[i] = coords[i] - BLK_coords[i]*BLK_NPROC[i]; + // } + + // int rank = BLK_coords[0]; + // for (int i = 1; i <= 3; i++) { + // rank = (NPROC[i] / BLK_NPROC[i]) * rank + BLK_coords[i]; + // } + + // for (int i = 0; i <= 3; i++) { + // rank = BLK_NPROC[i] * rank + local_coords[i]; + // } + // return rank; } void openQCD_qudaSetLayout(openQCD_QudaLayout_t input) @@ -128,6 +137,7 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t input) #else initCommsGridQuda(4, commsGridDim, rankFromCoords, (void *)(commsGridDim)); #endif + static int device = -1; #else static int device = input.device; From 6271facd13ede00f5d0bb3e7b286f14cbf45ee31 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Sat, 25 Feb 2023 17:34:24 +0100 Subject: [PATCH 012/148] hardwire test 2 --- lib/openqcd_interface.cpp | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index b883e8c251..0d02df4f13 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -78,8 +78,10 @@ using namespace quda; // template void inline 
qudamilc_called(const char *func) { qudamilc_called(func, getVerbosity()); } // fdata should point to 8 integers in order {BLK_NPROC0, BLK_NPROC1, BLK_NPROC2, BLK_NPROC3, NPROC0, NPROC1, NPROC2, NPROC3] -static int rankFromCoords(const int *coords, void *fdata) +static int rankFromCoords(const int *coords, void *fdata) // TODO: { + + int *BLK_NPROC = static_cast(fdata); int *NPROC = BLK_NPROC + 4; @@ -101,7 +103,18 @@ static int rankFromCoords(const int *coords, void *fdata) for (int i = 0; i <= 3; i++) { rank = BLK_NPROC[i] * rank + local_coords[i]; } - return rank; + warningQuda("rank = %d, Coords = (%d,%d,%d,%d)\n",rank,coords[0],coords[1],coords[2],coords[3]); + + // // For 2 ranks: + // if (coords[3]<8) + // { + // rank = 0; + // } else { + // rank = 1; + // } + return coords[3]; + // return 0; // FIXME: this function needs to be specific + } void openQCD_qudaSetLayout(openQCD_QudaLayout_t input) @@ -318,7 +331,7 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud static void setColorSpinorParams(const int dim[4], QudaPrecision precision, ColorSpinorParam *param) { param->nColor = 3; - param->nSpin = 1; + param->nSpin = 1; // TODO: param->nDim = 4; for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; @@ -326,11 +339,11 @@ static void setColorSpinorParams(const int dim[4], QudaPrecision precision, Colo param->setPrecision(precision); param->pad = 0; - param->siteSubset = QUDA_PARITY_SITE_SUBSET; - param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; + param->siteSubset = QUDA_PARITY_SITE_SUBSET; // TODO: check how to adapt this for openqxd + param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // TODO: check how to adapt this for openqxd param->fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. - param->create = QUDA_ZERO_FIELD_CREATE; + param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. 
// TODO: + param->create = QUDA_ZERO_FIELD_CREATE; // TODO: check how to adapt this for openqxd } #if 0 From 8550ed0585f96c2a79f57989d9b177ba1bf537ad Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Fri, 3 Mar 2023 15:45:04 +0100 Subject: [PATCH 013/148] ignore build folder --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 58f2516546..7665c11baa 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ include/jitify_options.hpp .tags* autom4te.cache/* .vscode +build \ No newline at end of file From c9d0f6c5b045a887d5609861217c282dd227817f Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Wed, 8 Mar 2023 17:53:50 +0100 Subject: [PATCH 014/148] rankFromCoords for OpenQxD gauge fields working. Probably it will continue to work for spinor fields... To be confirmed.. --- lib/openqcd_interface.cpp | 80 ++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 34 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 7313a5ce35..7f07a29a85 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -10,6 +10,11 @@ #include #include #include +// #include "../../openQxD-devel/include/su3.h" +// #include "../../openQxD-devel/include/flags.h" +// #include "../../openQxD-devel/include/utils.h" +// #include "../../openQxD-devel/include/lattice.h" +// #include "../../openQxD-devel/include/global.h" // #include @@ -77,44 +82,50 @@ using namespace quda; // template void inline qudamilc_called(const char *func) { qudamilc_called(func, getVerbosity()); } -// fdata should point to 8 integers in order {BLK_NPROC0, BLK_NPROC1, BLK_NPROC2, BLK_NPROC3, NPROC0, NPROC1, NPROC2, NPROC3] -static int rankFromCoords(const int *coords, void *fdata) // TODO: + +static int safe_mod(int x,int y) { + if (x>=0) + return x%y; + else + return (y-(abs(x)%y))%y; +} - - int *BLK_NPROC = static_cast(fdata); - int *NPROC = BLK_NPROC + 4; - - int BLK_coords[4]; - int local_coords[4]; - - for (int i = 
0; i < 4; i++) { - // coordinate of BLK in the BLK grid - BLK_coords[i] = coords[i] / BLK_NPROC[i]; - // local coordinate inside BLK - local_coords[i] = coords[i] - BLK_coords[i]*BLK_NPROC[i]; - } - int rank = BLK_coords[0]; - for (int i = 1; i <= 3; i++) { - rank = (NPROC[i] / BLK_NPROC[i]) * rank + BLK_coords[i]; - } +// fdata should point to 4 integers in order {NPROC0, NPROC1, NPROC2, NPROC3} +// coords is the 4D cartesian coordinate of a rank. +static int rankFromCoords(const int *coords, void *fdata) // TODO: +{ + int *NPROC = static_cast(fdata); + // int *NPROC = BLK_NPROC + 4; + + int ib; + int n0_OpenQxD; + int n1_OpenQxD; + int n2_OpenQxD; + int n3_OpenQxD; + // int NPROC0_OpenQxD; + int NPROC1_OpenQxD; + int NPROC2_OpenQxD; + int NPROC3_OpenQxD; + + n0_OpenQxD=coords[3]; + n1_OpenQxD=coords[0]; + n2_OpenQxD=coords[1]; + n3_OpenQxD=coords[2]; + + // NPROC0_OpenQxD=NPROC[3]; + NPROC1_OpenQxD=NPROC[0]; + NPROC2_OpenQxD=NPROC[1]; + NPROC3_OpenQxD=NPROC[2]; - for (int i = 0; i <= 3; i++) { - rank = BLK_NPROC[i] * rank + local_coords[i]; - } - warningQuda("rank = %d, Coords = (%d,%d,%d,%d)\n",rank,coords[0],coords[1],coords[2],coords[3]); - - // // For 2 ranks: - // if (coords[3]<8) - // { - // rank = 0; - // } else { - // rank = 1; - // } - return coords[3]; - // return 0; // FIXME: this function needs to be specific + ib=n0_OpenQxD; + ib=ib*NPROC1_OpenQxD+n1_OpenQxD; + ib=ib*NPROC2_OpenQxD+n2_OpenQxD; + ib=ib*NPROC3_OpenQxD+n3_OpenQxD; + printf("Coords are: %d,%d,%d,%d \n Rank is: %d \n\n",coords[0],coords[1],coords[2],coords[3],ib); + return ib; } void openQCD_qudaSetLayout(openQCD_QudaLayout_t input) @@ -158,6 +169,7 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t input) openQCD_qudaSetLayout(input.layout); initialized = true; // qudamilc_called(__func__); + // geometry(); // Establish helper indexes from openQxD } void openQCD_qudaFinalize() { endQuda(); } @@ -333,7 +345,7 @@ static void setColorSpinorParams(const int dim[4], QudaPrecision precision, Colo 
{ param->nColor = 3; param->nSpin = 1; // TODO: - param->nDim = 4; + param->nDim = 4; // TODO: check how to adapt this for openqxd for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; param->x[0] /= 2; From 301378b39025faee6688e47f7e2d283372f4c6ae Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Tue, 14 Mar 2023 15:47:55 +0100 Subject: [PATCH 015/148] OpenQCD spinor interface from TIFR (need adjusting --- include/color_spinor_field_order.h | 51 +++++++++++++++++++++++++++++- include/enum_quda.h | 3 +- include/enum_quda_fortran.h | 1 + include/gauge_field_order.h | 12 ++++--- include/index_helper.cuh | 25 +++++++++++++++ include/quda_openqcd_interface.h | 11 ++++++- lib/copy_color_spinor.cuh | 16 ++++++++++ lib/openqcd_interface.cpp | 1 + 8 files changed, 112 insertions(+), 8 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 233d926869..dc43bc72be 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1493,7 +1493,7 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; - template struct SpaceSpinorColorOrder { + template struct SpaceSpinorColorOrder { // TODO: check how to adapt this for openqxd using Accessor = SpaceSpinorColorOrder; using real = typename mapper::type; using complex = complex; @@ -1720,6 +1720,55 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; + template struct OpenQCDDiracOrder { // TODO: implement this accessor corrrectly + using Accessor = OpenQCDDiracOrder; + using real = typename mapper::type; + using complex = complex; + Float *field; + int volumeCB; + int nParity; + OpenQCDDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) : + field(field_ ? 
field_ : (Float *)a.V()), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()) + { + } + + __device__ __host__ inline void load(complex v[Ns * Nc], int x, int parity = 0) const + { + for (int s = 0; s < Ns; s++) { + for (int c = 0; c < Nc; c++) { + v[s * Nc + c] = complex(field[(((0 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x], + field[(((1 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x]); + } + } + } + + __device__ __host__ inline void save(const complex v[Ns * Nc], int x, int parity = 0) const + { + for (int s = 0; s < Ns; s++) { + for (int c = 0; c < Nc; c++) { + field[(((0 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x] = v[s * Nc + c].real(); + field[(((1 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x] = v[s * Nc + c].imag(); + } + } + } + + /** + @brief This accessor routine returns a colorspinor_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a colorspinor_wrapper that curries in access to + this field at the above coordinates. 
+ */ + __device__ __host__ inline auto operator()(int x_cb, int parity) const + { + return colorspinor_wrapper(*this, x_cb, parity); + } + + size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } + }; + } // namespace colorspinor // Use traits to reduce the template explosion diff --git a/include/enum_quda.h b/include/enum_quda.h index 940cfd80e9..ee798b9e90 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -356,6 +356,7 @@ typedef enum QudaFieldOrder_s { QUDA_QDPJIT_FIELD_ORDER, // QDP field ordering (complex-color-spin-spacetime) QUDA_QOP_DOMAIN_WALL_FIELD_ORDER, // QOP domain-wall ordering QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER, // TIFR RHMC ordering + QUDA_OPENQCD_FIELD_ORDER, // OPENQCD geometry ordering (at the moment lexicographical w/ rotation zyxt = x3x2x1x0 |-> xyzt x0x1x2x3 ) QUDA_INVALID_FIELD_ORDER = QUDA_INVALID_ENUM } QudaFieldOrder; @@ -370,7 +371,7 @@ typedef enum QudaFieldCreate_s { typedef enum QudaGammaBasis_s { QUDA_DEGRAND_ROSSI_GAMMA_BASIS, QUDA_UKQCD_GAMMA_BASIS, - QUDA_CHIRAL_GAMMA_BASIS, + QUDA_CHIRAL_GAMMA_BASIS, // check ? TODO: use this for quda ? 
QUDA_INVALID_GAMMA_BASIS = QUDA_INVALID_ENUM } QudaGammaBasis; diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 527d6a5798..c78f03208a 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -333,6 +333,7 @@ #define QUDA_QDPJIT_FIELD_ORDER 11 // QDP field ordering (complex-color-spin-spacetime) #define QUDA_QOP_DOMAIN_WALL_FIELD_ORDER 12 // QOP domain-wall ordering #define QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER 13 // TIFR RHMC ordering +#define QUDA_OPENQCD_FIELD_ORDER 14 // OPENQCD geometry ordering (at the moment lexicographical w/ rotation zyxt = x3x2x1x0 |-> xyzt x0x1x2x3 ) #define QUDA_INVALID_FIELD_ORDER QUDA_INVALID_ENUM #define QudaFieldCreate integer(4) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index e2b662710e..826e836216 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -24,7 +24,9 @@ #include #include -// #include "global.h" +// TODO: The ipt functions can be incorporated here (so no reordering needed in OpenQXD side) +// OpenQxD helpers: +// #include "../../openQxD-devel/include/lattice.h" namespace quda { @@ -2329,8 +2331,8 @@ namespace quda { OpenQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - volumeCB(u.VolumeCB()), - dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]} + volumeCB(u.VolumeCB()), // NOTE: Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice + dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]} // GLOBAL dimensions { if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); } @@ -2388,7 +2390,7 @@ namespace quda { { // FIX: what's x? (cf. 
index_helper.cuh) - // With ''natural'' order: lexicographical 0123 = txyz , z fastest, links 0123 = txyz in pos directions + // With ''natural'' order: lexicographical 0123 = txyz , t fastest, links 0123 = txyz in pos directions // Indexing fun: int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) @@ -2407,7 +2409,7 @@ namespace quda { // Loading as per QUDA style - auto in = &gauge[ (4*(iy_OpenQxD) + dir_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) + auto in = &gauge[ (4*iy_OpenQxD + dir_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) block_load(v, reinterpret_cast(in)); diff --git a/include/index_helper.cuh b/include/index_helper.cuh index e1a14225c8..550a2f5b99 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -2,6 +2,10 @@ #include +// TODO: The ipt functions can be incorporated here (so no reordering needed in OpenQXD side) +// OpenQxD helpers: +// #include "../../openQxD-devel/include/lattice.h" + namespace quda { /** Compute the checkerboard 1-d index from the 4-d coordinate x[] + dx[] @@ -1058,3 +1062,24 @@ namespace quda { } } // namespace quda + +// namespace OpenQxD_Helpers { +// /** +// Compute the 4-d spatial index from the checkerboarded 1-d index +// at parity parity. Wrapper around getCoordsCB. 
+ +// @param[out] x Computed spatial index +// @param[in] cb_index 1-d checkerboarded index +// @param[in] X Full lattice dimensions +// @param[in] X0h Half of x-dim lattice dimension +// @param[in] parity Site parity +// @return Full linear lattice index +// */ +// template __device__ __host__ inline int getCoords(Coord &x, int cb_index, const I &X, int parity) +// { +// return getCoordsCB(x, cb_index, X, X[0] >> 1, parity); +// } + + +// } + diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 16ecce2d00..813fe1cf34 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -3,6 +3,15 @@ #include #include +// TODO: The ipt and other functions can be incorporated here (so no reordering needed in OpenQXD side) +// OpenQxD helpers: +// #include "../../openQxD-devel/include/su3.h" +// #include "../../openQxD-devel/include/flags.h" +// #include "../../openQxD-devel/include/utils.h" +// #include "../../openQxD-devel/include/lattice.h" +// #include "../../openQxD-devel/include/global.h" + + /** * @file quda_openqcd_interface.h * @@ -23,7 +32,7 @@ typedef struct { const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ // FIXME: const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ // FIXME: const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ // FIXME: - int device; /** GPU device number */ + int device; /** GPU device number */ // const int *ipt; } openQCD_QudaLayout_t; diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index 242b0c5cb4..7f86ef3eed 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -111,6 +111,14 @@ namespace quda { CopyColorSpinor(out, in, param); #else errorQuda("QDPJIT interface has not been built\n"); +#endif + } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + using O = OpenQCDDiracOrder; + CopyColorSpinor(out, in, param); +#else + 
errorQuda("OpenQCD interface has not been built\n"); #endif } else { errorQuda("Order %d not defined (Ns = %d, Nc = %d, precision = %d)", out.FieldOrder(), Ns, Nc, out.Precision()); @@ -152,6 +160,14 @@ namespace quda { genericCopyColorSpinor(param); #else errorQuda("QDPJIT interface has not been built\n"); +#endif + } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { + +#ifdef BUILD_OPENQCD_INTERFACE + using ColorSpinor = OpenQCDDiracOrder; + genericCopyColorSpinor(param); +#else + errorQuda("OpenQCD interface has not been built\n"); #endif } else { errorQuda("Order %d not defined (Ns=%d, Nc=%d, precision = %d)", in.FieldOrder(), Ns, Nc, in.Precision()); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 7f07a29a85..1123d63184 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -489,6 +489,7 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda qudamilc_called(__func__, verbosity); } // qudaDslash #endif + // void* openQCD_qudaCreateGaugeField(void *gauge, int geometry, int precision) // { // qudamilc_called(__func__); From 240d6143ad0d7632596783ba79e15aad1e4b2926 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Mon, 20 Mar 2023 10:51:03 +0100 Subject: [PATCH 016/148] commented out openqcd spinors sections for compile --- include/color_spinor_field_order.h | 72 +++++++++++++++++++++--------- include/quda_openqcd_interface.h | 17 +++---- lib/copy_color_spinor.cuh | 32 ++++++------- lib/openqcd_interface.cpp | 11 +++-- 4 files changed, 82 insertions(+), 50 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index dc43bc72be..2440514b97 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1493,6 +1493,7 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; +// Use this template as openqxd for now TODO: template struct SpaceSpinorColorOrder { // TODO: 
check how to adapt this for openqxd using Accessor = SpaceSpinorColorOrder; using real = typename mapper::type; @@ -1517,13 +1518,14 @@ namespace quda } } - __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const + __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const // TODO: adapt to openqxd { auto in = &field[(parity * volumeCB + x) * length]; block_load(v, reinterpret_cast(in)); } - __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0) const + __device__ __host__ inline void save(const complex v[length / 2], int x, + int parity = 0) const // TODO: adapt to openqxd { auto out = &field[(parity * volumeCB + x) * length]; block_store(reinterpret_cast(out), v); @@ -1720,36 +1722,44 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; - template struct OpenQCDDiracOrder { // TODO: implement this accessor corrrectly - using Accessor = OpenQCDDiracOrder; +#if 0 + // Custom accessor for OpenQCD arrays + template + struct OpenQCDDiracOrder { // TODO: USE: check how to adapt this for openqxd + using Accessor = SpaceSpinorColorOrder; using real = typename mapper::type; using complex = complex; + static const int length = 2 * Ns * Nc; Float *field; + size_t offset; + Float *ghost[8]; int volumeCB; + int faceVolumeCB[4]; int nParity; - OpenQCDDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) : - field(field_ ? field_ : (Float *)a.V()), volumeCB(a.VolumeCB()), nParity(a.SiteSubset()) + SpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : + field(field_ ? field_ : (Float *)a.V()), + offset(a.Bytes() / (2 * sizeof(Float))), + volumeCB(a.VolumeCB()), + nParity(a.SiteSubset()) { + for (int i = 0; i < 4; i++) { + ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0; + ghost[2 * i + 1] = ghost_ ? 
ghost_[2 * i + 1] : 0; + faceVolumeCB[i] = a.SurfaceCB(i) * nFace; + } } - __device__ __host__ inline void load(complex v[Ns * Nc], int x, int parity = 0) const + __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const // TODO: adapt to openqxd { - for (int s = 0; s < Ns; s++) { - for (int c = 0; c < Nc; c++) { - v[s * Nc + c] = complex(field[(((0 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x], - field[(((1 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x]); - } - } + auto in = &field[(parity * volumeCB + x) * length]; + block_load(v, reinterpret_cast(in)); } - __device__ __host__ inline void save(const complex v[Ns * Nc], int x, int parity = 0) const + __device__ __host__ inline void save(const complex v[length / 2], int x, + int parity = 0) const // TODO: adapt to openqxd { - for (int s = 0; s < Ns; s++) { - for (int c = 0; c < Nc; c++) { - field[(((0 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x] = v[s * Nc + c].real(); - field[(((1 * Nc + c) * Ns + s) * 2 + (1 - parity)) * volumeCB + x] = v[s * Nc + c].imag(); - } - } + auto out = &field[(parity * volumeCB + x) * length]; + block_store(reinterpret_cast(out), v); } /** @@ -1766,8 +1776,30 @@ namespace quda return colorspinor_wrapper(*this, x_cb, parity); } + // __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dim, int dir, int parity = 0) const // TODO: do we need this for openqxd? + // { + // for (int s = 0; s < Ns; s++) { + // for (int c = 0; c < Nc; c++) { + // v[s * Nc + c] + // = complex(ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 0], + // ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 1]); + // } + // } + // } + + // __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dim, int dir, int parity = 0) const // TODO: do we need this for openqxd? 
+ // { + // for (int s = 0; s < Ns; s++) { + // for (int c = 0; c < Nc; c++) { + // ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 0] = v[s * Nc + c].real(); + // ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 1] = v[s * Nc + c].imag(); + // } + // } + // } + size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; +#endif //openQCDDiracOrder } // namespace colorspinor diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 813fe1cf34..c7e05d4623 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -3,7 +3,7 @@ #include #include -// TODO: The ipt and other functions can be incorporated here (so no reordering needed in OpenQXD side) +// TODO: (later) The ipt and other functions can be incorporated here (so no reordering needed in OpenQXD side) // OpenQxD helpers: // #include "../../openQxD-devel/include/su3.h" // #include "../../openQxD-devel/include/flags.h" @@ -11,14 +11,13 @@ // #include "../../openQxD-devel/include/lattice.h" // #include "../../openQxD-devel/include/global.h" - /** * @file quda_openqcd_interface.h * * @section Description * * The header file defines the milc interface to enable easy - * interfacing between QUDA and the OpenQCS software. + * interfacing between QUDA and the OpenQCD software. */ #ifdef __cplusplus @@ -29,10 +28,10 @@ extern "C" { * Parameters related to problem size and machine topology. 
*/ typedef struct { - const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ // FIXME: - const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ // FIXME: - const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ // FIXME: - int device; /** GPU device number */ + const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ // FIXME: + const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ // FIXME: + const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ // FIXME: + int device; /** GPU device number */ // const int *ipt; } openQCD_QudaLayout_t; @@ -100,8 +99,7 @@ void openQCD_qudaFinalize(void); typedef struct { // TODO: work out what we want to expose here int max_iter; /** Maximum number of iterations */ - QudaParity - evenodd; /** Which parity are we working on ? (options are QUDA_EVEN_PARITY, QUDA_ODD_PARITY, QUDA_INVALID_PARITY */ + QudaParity evenodd; /** Which parity are we working on ? 
(options are QUDA_EVEN_PARITY, QUDA_ODD_PARITY, QUDA_INVALID_PARITY */ int mixed_precision; /** Whether to use mixed precision or not (1 - yes, 0 - no) */ double boundary_phase[4]; /** Boundary conditions */ int make_resident_solution; /** Make the solution resident and don't copy back */ @@ -167,7 +165,6 @@ void openQCD_qudaLoadGaugeField(int external_precision, int quda_precision, open void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge); - // int openQCD_ipt(int iy); /** diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index 7f86ef3eed..d4967b8703 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -112,14 +112,14 @@ namespace quda { #else errorQuda("QDPJIT interface has not been built\n"); #endif - } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { - -#ifdef BUILD_OPENQCD_INTERFACE - using O = OpenQCDDiracOrder; - CopyColorSpinor(out, in, param); -#else - errorQuda("OpenQCD interface has not been built\n"); -#endif +// } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { + +// #ifdef BUILD_OPENQCD_INTERFACE +// using O = OpenQCDDiracOrder; +// CopyColorSpinor(out, in, param); +// #else +// errorQuda("OpenQCD interface has not been built\n"); +// #endif } else { errorQuda("Order %d not defined (Ns = %d, Nc = %d, precision = %d)", out.FieldOrder(), Ns, Nc, out.Precision()); } @@ -161,14 +161,14 @@ namespace quda { #else errorQuda("QDPJIT interface has not been built\n"); #endif - } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { - -#ifdef BUILD_OPENQCD_INTERFACE - using ColorSpinor = OpenQCDDiracOrder; - genericCopyColorSpinor(param); -#else - errorQuda("OpenQCD interface has not been built\n"); -#endif +// } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { + +// #ifdef BUILD_OPENQCD_INTERFACE +// using ColorSpinor = OpenQCDDiracOrder; +// genericCopyColorSpinor(param); +// #else +// errorQuda("OpenQCD interface has not been built\n"); +// #endif } else { errorQuda("Order %d 
not defined (Ns=%d, Nc=%d, precision = %d)", in.FieldOrder(), Ns, Nc, in.Precision()); } diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 1123d63184..d5d8a4c9a8 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -169,7 +169,7 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t input) openQCD_qudaSetLayout(input.layout); initialized = true; // qudamilc_called(__func__); - // geometry(); // Establish helper indexes from openQxD + // geometry(); // TODO: Establish helper indexes from openQxD?? } void openQCD_qudaFinalize() { endQuda(); } @@ -439,7 +439,8 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, qudamilc_called(__func__, verbosity); } // qudaInvert - +#endif +#if 0 void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, const void *const fatlink, const void *const longlink, void *src, void *dst, int *num_iters) { @@ -495,7 +496,7 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda // qudamilc_called(__func__); // QudaPrecision qudaPrecision = (precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; // QudaGaugeParam qudaGaugeParam -// = newMILCGaugeParam(localDim, qudaPrecision, (geometry == 1) ? QUDA_GENERAL_LINKS : QUDA_SU3_LINKS); +// = newMILCGaugeParam(localDim, qudaPrecision, (geometry == 1) ? 
QUDA_GENERAL_LINKS : QUDA_SU3_LINKS); // TODO: change MILC to openQCD // qudamilc_called(__func__); // return createGaugeFieldQuda(gauge, geometry, &qudaGaugeParam); // } @@ -504,7 +505,7 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda // { // qudamilc_called(__func__); // cudaGaugeField *cudaGauge = reinterpret_cast(inGauge); -// QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS); +// QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS); // TODO: change MILC to openQCD // saveGaugeFieldQuda(gauge, inGauge, &qudaGaugeParam); // qudamilc_called(__func__); // } @@ -623,3 +624,5 @@ void openQCD_qudaFreeGaugeField() freeGaugeQuda(); // qudamilc_called(__func__); } // qudaFreeGaugeField + +// TODO: OpenQCDMultigridPack functions a la MILC (cf. milc_interface.cpp) From 6202ac4a0c423a3bf802847c4977b3c03f6769de Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Mon, 20 Mar 2023 17:22:39 +0100 Subject: [PATCH 017/148] spinors feature OK (still needs correct indexing) --- include/color_spinor_field_order.h | 21 ++- include/quda_openqcd_interface.h | 10 +- lib/copy_color_spinor.cuh | 30 ++-- lib/openqcd_interface.cpp | 229 +++++++++++++++-------------- 4 files changed, 144 insertions(+), 146 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 2440514b97..f56107f8e1 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1493,8 +1493,9 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; -// Use this template as openqxd for now TODO: - template struct SpaceSpinorColorOrder { // TODO: check how to adapt this for openqxd + // Use this template as openqxd for now TODO: + template + struct SpaceSpinorColorOrder { // TODO: check how to adapt this for openqxd using Accessor = SpaceSpinorColorOrder; using real 
= typename mapper::type; using complex = complex; @@ -1722,11 +1723,10 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; -#if 0 - // Custom accessor for OpenQCD arrays + // Use this template as openqxd for now TODO: template - struct OpenQCDDiracOrder { // TODO: USE: check how to adapt this for openqxd - using Accessor = SpaceSpinorColorOrder; + struct OpenQCDDiracOrder { // TODO: check how to adapt this for openqxd + using Accessor = OpenQCDDiracOrder; using real = typename mapper::type; using complex = complex; static const int length = 2 * Ns * Nc; @@ -1736,7 +1736,7 @@ namespace quda int volumeCB; int faceVolumeCB[4]; int nParity; - SpaceSpinorColorOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : + OpenQCDDiracOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : field(field_ ? field_ : (Float *)a.V()), offset(a.Bytes() / (2 * sizeof(Float))), volumeCB(a.VolumeCB()), @@ -1776,7 +1776,7 @@ namespace quda return colorspinor_wrapper(*this, x_cb, parity); } - // __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dim, int dir, int parity = 0) const // TODO: do we need this for openqxd? + // __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dim, int dir, int parity = 0) const // { // for (int s = 0; s < Ns; s++) { // for (int c = 0; c < Nc; c++) { @@ -1787,7 +1787,7 @@ namespace quda // } // } - // __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dim, int dir, int parity = 0) const // TODO: do we need this for openqxd? 
+ // __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dim, int dir, int parity = 0) const // { // for (int s = 0; s < Ns; s++) { // for (int c = 0; c < Nc; c++) { @@ -1798,8 +1798,7 @@ namespace quda // } size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } - }; -#endif //openQCDDiracOrder + }; // openQCDDiracOrder } // namespace colorspinor diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index c7e05d4623..6877e28476 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -99,7 +99,8 @@ void openQCD_qudaFinalize(void); typedef struct { // TODO: work out what we want to expose here int max_iter; /** Maximum number of iterations */ - QudaParity evenodd; /** Which parity are we working on ? (options are QUDA_EVEN_PARITY, QUDA_ODD_PARITY, QUDA_INVALID_PARITY */ + QudaParity + evenodd; /** Which parity are we working on ? (options are QUDA_EVEN_PARITY, QUDA_ODD_PARITY, QUDA_INVALID_PARITY */ int mixed_precision; /** Whether to use mixed precision or not (1 - yes, 0 - no) */ double boundary_phase[4]; /** Boundary conditions */ int make_resident_solution; /** Make the solution resident and don't copy back */ @@ -118,14 +119,11 @@ typedef struct { * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) * @param inv_args Struct setting some solver metadata - * @param milc_fatlink Fat-link field on the host - * @param milc_longlink Long-link field on the host * @param source Right-hand side source field * @param solution Solution spinor field */ -void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, - const void *const milc_fatlink, const void *const milc_longlink, void *source, void *solution, - int *num_iters); +void openQCD_qudaDslash(int external_precision, int quda_precision, 
openQCD_QudaInvertArgs_t inv_args, void *source, + void *solution, void *gauge); /** * Solve Ax=b for an improved staggered operator. All fields are fields diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index d4967b8703..b5f4665f9b 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -112,14 +112,13 @@ namespace quda { #else errorQuda("QDPJIT interface has not been built\n"); #endif -// } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { - -// #ifdef BUILD_OPENQCD_INTERFACE -// using O = OpenQCDDiracOrder; -// CopyColorSpinor(out, in, param); -// #else -// errorQuda("OpenQCD interface has not been built\n"); -// #endif + } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + using O = OpenQCDDiracOrder; // TODO: Seems OK + CopyColorSpinor(out, in, param); // TODO: Seems OK +#else + errorQuda("OpenQCD interface has not been built\n"); +#endif } else { errorQuda("Order %d not defined (Ns = %d, Nc = %d, precision = %d)", out.FieldOrder(), Ns, Nc, out.Precision()); } @@ -161,14 +160,13 @@ namespace quda { #else errorQuda("QDPJIT interface has not been built\n"); #endif -// } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { - -// #ifdef BUILD_OPENQCD_INTERFACE -// using ColorSpinor = OpenQCDDiracOrder; -// genericCopyColorSpinor(param); -// #else -// errorQuda("OpenQCD interface has not been built\n"); -// #endif + } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + using ColorSpinor = OpenQCDDiracOrder; // TODO: Seems OK + genericCopyColorSpinor(param); // TODO: Seems OK +#else + errorQuda("OpenQCD interface has not been built\n"); +#endif } else { errorQuda("Order %d not defined (Ns=%d, Nc=%d, precision = %d)", in.FieldOrder(), Ns, Nc, in.Precision()); } diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index d5d8a4c9a8..30d2e2d659 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -82,23 +82,21 @@ 
using namespace quda; // template void inline qudamilc_called(const char *func) { qudamilc_called(func, getVerbosity()); } - -static int safe_mod(int x,int y) +static int safe_mod(int x, int y) { - if (x>=0) - return x%y; - else - return (y-(abs(x)%y))%y; + if (x >= 0) + return x % y; + else + return (y - (abs(x) % y)) % y; } - // fdata should point to 4 integers in order {NPROC0, NPROC1, NPROC2, NPROC3} // coords is the 4D cartesian coordinate of a rank. static int rankFromCoords(const int *coords, void *fdata) // TODO: -{ +{ int *NPROC = static_cast(fdata); // int *NPROC = BLK_NPROC + 4; - + int ib; int n0_OpenQxD; int n1_OpenQxD; @@ -109,22 +107,21 @@ static int rankFromCoords(const int *coords, void *fdata) // TODO: int NPROC2_OpenQxD; int NPROC3_OpenQxD; - n0_OpenQxD=coords[3]; - n1_OpenQxD=coords[0]; - n2_OpenQxD=coords[1]; - n3_OpenQxD=coords[2]; + n0_OpenQxD = coords[3]; + n1_OpenQxD = coords[0]; + n2_OpenQxD = coords[1]; + n3_OpenQxD = coords[2]; // NPROC0_OpenQxD=NPROC[3]; - NPROC1_OpenQxD=NPROC[0]; - NPROC2_OpenQxD=NPROC[1]; - NPROC3_OpenQxD=NPROC[2]; - - - ib=n0_OpenQxD; - ib=ib*NPROC1_OpenQxD+n1_OpenQxD; - ib=ib*NPROC2_OpenQxD+n2_OpenQxD; - ib=ib*NPROC3_OpenQxD+n3_OpenQxD; - printf("Coords are: %d,%d,%d,%d \n Rank is: %d \n\n",coords[0],coords[1],coords[2],coords[3],ib); + NPROC1_OpenQxD = NPROC[0]; + NPROC2_OpenQxD = NPROC[1]; + NPROC3_OpenQxD = NPROC[2]; + + ib = n0_OpenQxD; + ib = ib * NPROC1_OpenQxD + n1_OpenQxD; + ib = ib * NPROC2_OpenQxD + n2_OpenQxD; + ib = ib * NPROC3_OpenQxD + n3_OpenQxD; + printf("Coords are: %d,%d,%d,%d \n Rank is: %d \n\n", coords[0], coords[1], coords[2], coords[3], ib); return ib; } @@ -169,7 +166,7 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t input) openQCD_qudaSetLayout(input.layout); initialized = true; // qudamilc_called(__func__); - // geometry(); // TODO: Establish helper indexes from openQxD?? + // geometry(); // TODO: Establish helper indexes from openQxD?? 
} void openQCD_qudaFinalize() { endQuda(); } @@ -185,12 +182,11 @@ void *qudaAllocateManaged(size_t bytes) { return managed_malloc(bytes); } void qudaFreeManaged(void *ptr) { managed_free(ptr); } #endif - static int getLinkPadding(const int dim[4]) { - int padding = MAX(dim[1]*dim[2]*dim[3]/2, dim[0]*dim[2]*dim[3]/2); - padding = MAX(padding, dim[0]*dim[1]*dim[3]/2); - padding = MAX(padding, dim[0]*dim[1]*dim[2]/2); + int padding = MAX(dim[1] * dim[2] * dim[3] / 2, dim[0] * dim[2] * dim[3] / 2); + padding = MAX(padding, dim[0] * dim[1] * dim[3] / 2); + padding = MAX(padding, dim[0] * dim[1] * dim[2] / 2); return padding; } @@ -243,7 +239,6 @@ void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) // { // // This function is the helper for ipt in QUDA - // return ix; // } @@ -254,8 +249,6 @@ void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) // int k,mu,ix,iy,iz,iw; // int bo[4],bs[4],ifc[8]; - - // } // static int getLinkPadding(const int dim[4]) @@ -338,25 +331,69 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud invertParam->compute_action = 0; } - - - static void setColorSpinorParams(const int dim[4], QudaPrecision precision, ColorSpinorParam *param) { param->nColor = 3; param->nSpin = 1; // TODO: - param->nDim = 4; // TODO: check how to adapt this for openqxd + param->nDim = 4; // TODO: check how to adapt this for openqxd for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; param->x[0] /= 2; param->setPrecision(precision); param->pad = 0; - param->siteSubset = QUDA_PARITY_SITE_SUBSET; // TODO: check how to adapt this for openqxd - param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // TODO: check how to adapt this for openqxd + param->siteSubset = QUDA_PARITY_SITE_SUBSET; // TODO: check how to adapt this for openqxd + param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // TODO: check how to adapt this for openqxd param->fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; param->gammaBasis = 
QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. // TODO: - param->create = QUDA_ZERO_FIELD_CREATE; // TODO: check how to adapt this for openqxd + param->create = QUDA_ZERO_FIELD_CREATE; // TODO: check how to adapt this for openqxd +} + +void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_QudaInvertArgs_t &inv_args, + int external_precision, int quda_precision) +{ + + const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + const QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision_sloppy; + + switch (inv_args.mixed_precision) { + case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break; + case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break; + default: device_precision_sloppy = device_precision; + } + + for (int dir = 0; dir < 4; ++dir) qudaGaugeParam.X[dir] = dim[dir]; + + qudaGaugeParam.anisotropy = 1.0; + qudaGaugeParam.type = QUDA_WILSON_LINKS; + qudaGaugeParam.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; + + // Check the boundary conditions + // Can't have twisted or anti-periodic boundary conditions in the spatial + // directions with 12 reconstruct at the moment. + bool trivial_phase = true; + for (int dir = 0; dir < 3; ++dir) { + if (inv_args.boundary_phase[dir] != 0) trivial_phase = false; + } + if (inv_args.boundary_phase[3] != 0 && inv_args.boundary_phase[3] != 1) trivial_phase = false; + + if (trivial_phase) { + qudaGaugeParam.t_boundary = (inv_args.boundary_phase[3]) ? 
QUDA_ANTI_PERIODIC_T : QUDA_PERIODIC_T; + qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12; + qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12; + } else { + qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; + qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; + qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; + } + + qudaGaugeParam.cpu_prec = host_precision; + qudaGaugeParam.cuda_prec = device_precision; + qudaGaugeParam.cuda_prec_sloppy = device_precision_sloppy; + qudaGaugeParam.cuda_prec_precondition = device_precision_sloppy; + qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; + // qudaGaugeParam.ga_pad = getLinkPadding(dim); } #if 0 @@ -440,23 +477,38 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, qudamilc_called(__func__, verbosity); } // qudaInvert #endif -#if 0 -void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, const void *const fatlink, - const void *const longlink, void *src, void *dst, int *num_iters) +// #if 0 +void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, + void *dst, void *gauge) { static const QudaVerbosity verbosity = getVerbosity(); - qudamilc_called(__func__, verbosity); - // static const QudaVerbosity verbosity = getVerbosity(); + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (quda_precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + + loadGaugeQuda(gauge, &qudaGaugeParam); + + + // QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); + // obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; + // obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; // + // phase_in ? 
QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; + // gaugeObservablesQuda(&obsParam); + + // Let MILC apply its own Nc normalization + // plaq[0] = obsParam.plaquette[0]; + // plaq[1] = obsParam.plaquette[1]; + // plaq[2] = obsParam.plaquette[2]; + + // qudamilc_called(__func__); + // return; + + + QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; QudaPrecision device_precision_sloppy = device_precision; - QudaGaugeParam fat_param = newQudaGaugeParam(); - QudaGaugeParam long_param = newQudaGaugeParam(); - setGaugeParams(fat_param, long_param, longlink, localDim, host_precision, device_precision, device_precision_sloppy, - inv_args.tadpole, inv_args.naik_epsilon); - QudaInvertParam invertParam = newQudaInvertParam(); QudaParity local_parity = inv_args.evenodd; @@ -469,34 +521,33 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda setColorSpinorParams(localDim, host_precision, &csParam); // dirty hack to invalidate the cached gauge field without breaking interface compatability - if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda(); + // if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda(); - if (invalidate_quda_gauge || !create_quda_gauge) { - loadGaugeQuda(const_cast(fatlink), &fat_param); - if (longlink != nullptr) loadGaugeQuda(const_cast(longlink), &long_param); - invalidate_quda_gauge = false; - } + // if (invalidate_quda_gauge || !create_quda_gauge) { + // loadGaugeQuda(gauge, &qudaGaugeParam); + // invalidate_quda_gauge = false; + // } - if (longlink == nullptr) invertParam.dslash_type = QUDA_STAGGERED_DSLASH; + invertParam.dslash_type = QUDA_WILSON_DSLASH; - int src_offset = getColorVectorOffset(other_parity, false, localDim); - int dst_offset = getColorVectorOffset(local_parity, false, localDim); + // int src_offset 
= getColorVectorOffset(other_parity, false, localDim); + // int dst_offset = getColorVectorOffset(local_parity, false, localDim); - dslashQuda(static_cast(dst) + dst_offset * host_precision, - static_cast(src) + src_offset * host_precision, &invertParam, local_parity); + dslashQuda(static_cast(dst), static_cast(src), &invertParam, local_parity); - if (!create_quda_gauge) invalidateGaugeQuda(); + // if (!create_quda_gauge) invalidateGaugeQuda(); - qudamilc_called(__func__, verbosity); + // qudamilc_called(__func__, verbosity);// TODO: remove? (This is from MILC) } // qudaDslash -#endif +// #endif // void* openQCD_qudaCreateGaugeField(void *gauge, int geometry, int precision) // { // qudamilc_called(__func__); // QudaPrecision qudaPrecision = (precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; // QudaGaugeParam qudaGaugeParam -// = newMILCGaugeParam(localDim, qudaPrecision, (geometry == 1) ? QUDA_GENERAL_LINKS : QUDA_SU3_LINKS); // TODO: change MILC to openQCD +// = newMILCGaugeParam(localDim, qudaPrecision, (geometry == 1) ? 
QUDA_GENERAL_LINKS : QUDA_SU3_LINKS); // TODO: +// change MILC to openQCD // qudamilc_called(__func__); // return createGaugeFieldQuda(gauge, geometry, &qudaGaugeParam); // } @@ -505,9 +556,8 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda // { // qudamilc_called(__func__); // cudaGaugeField *cudaGauge = reinterpret_cast(inGauge); -// QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS); // TODO: change MILC to openQCD -// saveGaugeFieldQuda(gauge, inGauge, &qudaGaugeParam); -// qudamilc_called(__func__); +// QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS); // TODO: +// change MILC to openQCD saveGaugeFieldQuda(gauge, inGauge, &qudaGaugeParam); qudamilc_called(__func__); // } // void qudaDestroyGaugeField(void *gauge) @@ -520,53 +570,6 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda // void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_args, int external_precision, // int quda_precision, double kappa, double reliable_delta); -void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_QudaInvertArgs_t &inv_args, - int external_precision, int quda_precision) -{ - - const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - const QudaPrecision device_precision = (quda_precision == 2) ? 
QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision_sloppy; - - switch (inv_args.mixed_precision) { - case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break; - case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break; - default: device_precision_sloppy = device_precision; - } - - for (int dir = 0; dir < 4; ++dir) qudaGaugeParam.X[dir] = dim[dir]; - - qudaGaugeParam.anisotropy = 1.0; - qudaGaugeParam.type = QUDA_WILSON_LINKS; - qudaGaugeParam.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; - - // Check the boundary conditions - // Can't have twisted or anti-periodic boundary conditions in the spatial - // directions with 12 reconstruct at the moment. - bool trivial_phase = true; - for (int dir = 0; dir < 3; ++dir) { - if (inv_args.boundary_phase[dir] != 0) trivial_phase = false; - } - if (inv_args.boundary_phase[3] != 0 && inv_args.boundary_phase[3] != 1) trivial_phase = false; - - if (trivial_phase) { - qudaGaugeParam.t_boundary = (inv_args.boundary_phase[3]) ? 
QUDA_ANTI_PERIODIC_T : QUDA_PERIODIC_T; - qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12; - qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12; - } else { - qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; - qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; - qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; - } - - qudaGaugeParam.cpu_prec = host_precision; - qudaGaugeParam.cuda_prec = device_precision; - qudaGaugeParam.cuda_prec_sloppy = device_precision_sloppy; - qudaGaugeParam.cuda_prec_precondition = device_precision_sloppy; - qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; - // qudaGaugeParam.ga_pad = getLinkPadding(dim); -} - void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_args, int external_precision, int quda_precision, double kappa, double reliable_delta) { From cf3a04aee4b8e2d85702416661dbbe3f57aaa447 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Tue, 21 Mar 2023 18:23:43 +0100 Subject: [PATCH 018/148] openqcd dslash correction + load color spinor --- include/color_spinor_field_order.h | 30 +++++++++++++++++++++------ lib/openqcd_interface.cpp | 33 +++--------------------------- 2 files changed, 27 insertions(+), 36 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index f56107f8e1..c12a86746f 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1493,7 +1493,6 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; - // Use this template as openqxd for now TODO: template struct SpaceSpinorColorOrder { // TODO: check how to adapt this for openqxd using Accessor = SpaceSpinorColorOrder; @@ -1724,8 +1723,7 @@ namespace quda }; // Use this template as openqxd for now TODO: - template - struct OpenQCDDiracOrder { // TODO: check how to adapt this for openqxd + template struct OpenQCDDiracOrder { // TODO: check how to adapt this for openqxd using Accessor = OpenQCDDiracOrder; using 
real = typename mapper::type; using complex = complex; @@ -1736,12 +1734,15 @@ namespace quda int volumeCB; int faceVolumeCB[4]; int nParity; + const int dim[4]; + OpenQCDDiracOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : field(field_ ? field_ : (Float *)a.V()), offset(a.Bytes() / (2 * sizeof(Float))), volumeCB(a.VolumeCB()), - nParity(a.SiteSubset()) - { + nParity(a.SiteSubset()), + dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions + { // TODO: IS THIS NEEDED?? for (int i = 0; i < 4; i++) { ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0; ghost[2 * i + 1] = ghost_ ? ghost_[2 * i + 1] : 0; @@ -1751,8 +1752,25 @@ namespace quda __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const // TODO: adapt to openqxd { - auto in = &field[(parity * volumeCB + x) * length]; + + /* INDEXING */ + + int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) + getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site + + /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) + coord1 in QUDA is x2 in OpenQxD (y) + coord2 in QUDA is x3 in OpenQxD (z) + coord3 in QUDA is x0 in OpenQxD (t) + */ + int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; + // int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD + + // Loading as per QUDA style + auto in = &field[(4 * iy_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 24 doubles + // = 12 complex doubles = 4 spinor x 3 colors) block_load(v, reinterpret_cast(in)); + /* END OF INDEXING */ } __device__ __host__ inline void save(const complex v[length / 2], int x, diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 30d2e2d659..e3088fec71 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -489,22 +489,6 @@ void 
openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda loadGaugeQuda(gauge, &qudaGaugeParam); - // QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); - // obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; - // obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; // - // phase_in ? QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - // gaugeObservablesQuda(&obsParam); - - // Let MILC apply its own Nc normalization - // plaq[0] = obsParam.plaquette[0]; - // plaq[1] = obsParam.plaquette[1]; - // plaq[2] = obsParam.plaquette[2]; - - // qudamilc_called(__func__); - // return; - - - QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; QudaPrecision device_precision_sloppy = device_precision; @@ -520,24 +504,13 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda ColorSpinorParam csParam; setColorSpinorParams(localDim, host_precision, &csParam); - // dirty hack to invalidate the cached gauge field without breaking interface compatability - // if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda(); - - // if (invalidate_quda_gauge || !create_quda_gauge) { - // loadGaugeQuda(gauge, &qudaGaugeParam); - // invalidate_quda_gauge = false; - // } - invertParam.dslash_type = QUDA_WILSON_DSLASH; - // int src_offset = getColorVectorOffset(other_parity, false, localDim); - // int dst_offset = getColorVectorOffset(local_parity, false, localDim); - - dslashQuda(static_cast(dst), static_cast(src), &invertParam, local_parity); + dslashQuda(dst,src, &invertParam, local_parity); - // if (!create_quda_gauge) invalidateGaugeQuda(); +// TODO: need save?? - // qudamilc_called(__func__, verbosity);// TODO: remove? 
(This is from MILC) + return; } // qudaDslash // #endif From 16b7df06dafc1b0b3e3b1b2705f4aac307b10e9b Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Wed, 22 Mar 2023 17:51:00 +0100 Subject: [PATCH 019/148] load-save in spinors+gauge + format --- include/color_spinor_field_order.h | 12 +- include/gauge_field_order.h | 2466 ++++++++++++------------ include/quda.h | 2828 ++++++++++++++-------------- include/quda_openqcd_interface.h | 2 + lib/interface_quda.cpp | 1204 ++++++------ lib/openqcd_interface.cpp | 28 +- 6 files changed, 3232 insertions(+), 3308 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index c12a86746f..73681706e4 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1767,7 +1767,7 @@ namespace quda // int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD // Loading as per QUDA style - auto in = &field[(4 * iy_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 24 doubles + auto in = &field[iy_OpenQxD * length]; // This is how they're accessed within OpenQxd (length = 24 doubles // = 12 complex doubles = 4 spinor x 3 colors) block_load(v, reinterpret_cast(in)); /* END OF INDEXING */ @@ -1776,7 +1776,15 @@ namespace quda __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0) const // TODO: adapt to openqxd { - auto out = &field[(parity * volumeCB + x) * length]; + /* INDEXING */ + int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
+ getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site + + int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; + // int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD + + // Loading as per QUDA style + auto out = &field[iy_OpenQxD * length]; block_store(reinterpret_cast(out), v); } diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 826e836216..1683937777 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -28,8 +28,8 @@ // OpenQxD helpers: // #include "../../openQxD-devel/include/lattice.h" - -namespace quda { +namespace quda +{ /** @brief gauge_wrapper is an internal class that is used to wrap @@ -42,43 +42,43 @@ namespace quda { temporaries with explicit calls to the load/save methods in the gauge-field accessors. */ - template - struct gauge_wrapper { - const int dim; - const int x_cb; - const int parity; - const Float phase; - T &gauge; + template struct gauge_wrapper { + const int dim; + const int x_cb; + const int parity; + const Float phase; + T &gauge; - /** - @brief gauge_wrapper constructor - @param[in] gauge Gauge field accessor we are wrapping - @param[in] dim Dimension we are accessing - @param[in] x_cb Checkerboarded space-time index we are accessing - @param[in] parity Parity we are accessing - */ - __device__ __host__ inline gauge_wrapper(T &gauge, int dim, int x_cb, int parity, Float phase = 1.0) : - dim(dim), x_cb(x_cb), parity(parity), phase(phase), gauge(gauge) - { - } + /** + @brief gauge_wrapper constructor + @param[in] gauge Gauge field accessor we are wrapping + @param[in] dim Dimension we are accessing + @param[in] x_cb Checkerboarded space-time index we are accessing + @param[in] parity Parity we are accessing + */ + __device__ __host__ inline gauge_wrapper(T &gauge, int dim, int x_cb, int parity, Float phase = 1.0) : + dim(dim), x_cb(x_cb), parity(parity), phase(phase), 
gauge(gauge) + { + } - /** - @brief Assignment operator with Matrix instance as input - @param[in] M Matrix we want to store in this accessor - */ - template __device__ __host__ inline void operator=(const M &a) const - { - gauge.save(a.data, x_cb, dim, parity); - } - }; + /** + @brief Assignment operator with Matrix instance as input + @param[in] M Matrix we want to store in this accessor + */ + template __device__ __host__ inline void operator=(const M &a) const + { + gauge.save(a.data, x_cb, dim, parity); + } + }; /** @brief Copy constructor for the Matrix class with a gauge_wrapper input. @param[in] a Input gauge_wrapper that we use to fill in this matrix instance */ template - template - __device__ __host__ inline void Matrix::operator=(const gauge_wrapper::type,S> &a) { + template + __device__ __host__ inline void Matrix::operator=(const gauge_wrapper::type, S> &a) + { a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase); } @@ -87,8 +87,9 @@ namespace quda { @param[in] a Input gauge_wrapper that we use to fill in this matrix instance */ template - template - __device__ __host__ inline Matrix::Matrix(const gauge_wrapper::type,S> &a) { + template + __device__ __host__ inline Matrix::Matrix(const gauge_wrapper::type, S> &a) + { a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase); } @@ -103,44 +104,44 @@ namespace quda { having to declare temporaries with explicit calls to the load/save methods in the gauge-field accessors. 
*/ - template - struct gauge_ghost_wrapper { - const int dim; - const int ghost_idx; - const int parity; - const Float phase; - T &gauge; + template struct gauge_ghost_wrapper { + const int dim; + const int ghost_idx; + const int parity; + const Float phase; + T &gauge; - /** - @brief gauge_wrapper constructor - @param[in] gauge Gauge field accessor we are wrapping - @param[in] dim Dimension we are accessing - @param[in] ghost_idx Ghost index we are accessing - @param[in] parity Parity we are accessing - */ - __device__ __host__ inline gauge_ghost_wrapper(T &gauge, int dim, int ghost_idx, int parity, - Float phase = 1.0) : - dim(dim), ghost_idx(ghost_idx), parity(parity), phase(phase), gauge(gauge) - { - } + /** + @brief gauge_wrapper constructor + @param[in] gauge Gauge field accessor we are wrapping + @param[in] dim Dimension we are accessing + @param[in] ghost_idx Ghost index we are accessing + @param[in] parity Parity we are accessing + */ + __device__ __host__ inline gauge_ghost_wrapper(T &gauge, int dim, int ghost_idx, int parity, + Float phase = 1.0) : + dim(dim), ghost_idx(ghost_idx), parity(parity), phase(phase), gauge(gauge) + { + } - /** - @brief Assignment operator with Matrix instance as input - @param[in] M Matrix we want to store in this accessot - */ - template __device__ __host__ inline void operator=(const M &a) const - { - gauge.saveGhost(a.data, ghost_idx, dim, parity); - } - }; + /** + @brief Assignment operator with Matrix instance as input + @param[in] M Matrix we want to store in this accessot + */ + template __device__ __host__ inline void operator=(const M &a) const + { + gauge.saveGhost(a.data, ghost_idx, dim, parity); + } + }; /** @brief Copy constructor for the Matrix class with a gauge_ghost_wrapper input. 
@param[in] a Input gauge_wrapper that we use to fill in this matrix instance */ template - template - __device__ __host__ inline void Matrix::operator=(const gauge_ghost_wrapper::type,S> &a) { + template + __device__ __host__ inline void Matrix::operator=(const gauge_ghost_wrapper::type, S> &a) + { a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase); } @@ -149,21 +150,26 @@ namespace quda { @param[in] a Input gauge_wrapper that we use to fill in this matrix instance */ template - template - __device__ __host__ inline Matrix::Matrix(const gauge_ghost_wrapper::type,S> &a) { + template + __device__ __host__ inline Matrix::Matrix(const gauge_ghost_wrapper::type, S> &a) + { a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase); } - namespace gauge { + namespace gauge + { - template __host__ __device__ inline constexpr bool fixed_point() { return false; } + template __host__ __device__ inline constexpr bool fixed_point() + { + return false; + } template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } - template<> __host__ __device__ inline constexpr bool fixed_point() { return true; } - template<> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } template __host__ __device__ inline constexpr bool match() { return false; } - template<> __host__ __device__ inline constexpr bool match() { return true; } - template<> __host__ __device__ inline constexpr bool match() { return true; } + template <> __host__ __device__ inline constexpr bool match() { return true; } + template <> __host__ __device__ inline constexpr bool match() { return true; } /** @brief fieldorder_wrapper is an internal class that is used to @@ -172,8 +178,7 @@ namespace quda { for fixed-point accessors providing the necessary conversion and scaling when writing to a 
fixed-point field. */ - template - struct fieldorder_wrapper { + template struct fieldorder_wrapper { using value_type = Float; using store_type = storeFloat; complex *v; @@ -280,10 +285,10 @@ namespace quda { v[idx] -= fixed ? complex(round(scale * a.x), round(scale * a.y)) : complex(a.x, a.y); } } - }; + }; - template - __device__ __host__ inline complex operator*(const Float &a, const fieldorder_wrapper &b) + template + __device__ __host__ inline complex operator*(const Float &a, const fieldorder_wrapper &b) { return a * complex(b); } @@ -345,13 +350,13 @@ namespace quda { struct Accessor { using wrapper = fieldorder_wrapper; static constexpr bool is_mma_compatible = false; - complex *u[QUDA_MAX_GEOMETRY]; + complex *u[QUDA_MAX_GEOMETRY]; const unsigned int volumeCB; const int geometry; const unsigned int cb_offset; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); Accessor(const GaugeField &U, void *gauge_ = 0, void ** = 0) : volumeCB(U.VolumeCB()), @@ -360,10 +365,10 @@ namespace quda { scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - for (int d=0; d**>(gauge_)[d] : - static_cast**>(const_cast(U.Gauge_p()))[d]; - resetScale(U.Scale()); + for (int d = 0; d < U.Geometry(); d++) + u[d] = gauge_ ? static_cast **>(gauge_)[d] : + static_cast **>(const_cast(U.Gauge_p()))[d]; + resetScale(U.Scale()); } void resetScale(Float max) @@ -384,11 +389,11 @@ namespace quda { const complex &val) const { using vec2 = array; - vec2 *u2 = reinterpret_cast(u[dim] + parity*cb_offset + (x_cb*nColor + row)*nColor + col); + vec2 *u2 = reinterpret_cast(u[dim] + parity * cb_offset + (x_cb * nColor + row) * nColor + col); vec2 val_ = (fixed && !match()) ? 
- vec2{static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : - vec2{static_cast(val.real()), static_cast(val.imag())}; + vec2 {static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : + vec2 {static_cast(val.real()), static_cast(val.imag())}; atomic_fetch_add(u2, val_); } @@ -426,23 +431,23 @@ namespace quda { unsigned int ghostOffset[8]; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) : scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - for (int d=0; d<4; d++) { - ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : - static_cast*>(const_cast(U.Ghost()[d])); - ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - - ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : - ghost_ ? static_cast*>(ghost_[d+4]) : - static_cast*>(const_cast(U.Ghost()[d+4])); - ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - } + for (int d = 0; d < 4; d++) { + ghost[d] = ghost_ ? static_cast *>(ghost_[d]) : + static_cast *>(const_cast(U.Ghost()[d])); + ghostOffset[d] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); + + ghost[d + 4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : + ghost_ ? static_cast *>(ghost_[d + 4]) : + static_cast *>(const_cast(U.Ghost()[d + 4])); + ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); + } - resetScale(U.Scale()); + resetScale(U.Scale()); } void resetScale(Float max) @@ -468,7 +473,7 @@ namespace quda { const int geometry; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) : u(gauge_ ? 
static_cast *>(gauge_) : @@ -503,11 +508,12 @@ namespace quda { const complex &val) const { using vec2 = array; - vec2 *u2 = reinterpret_cast(u + (((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col); + vec2 *u2 + = reinterpret_cast(u + (((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col); vec2 val_ = (fixed && !match()) ? - vec2{static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : - vec2{static_cast(val.real()), static_cast(val.imag())}; + vec2 {static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : + vec2 {static_cast(val.real()), static_cast(val.imag())}; atomic_fetch_add(u2, val_); } @@ -548,23 +554,23 @@ namespace quda { unsigned int ghostOffset[8]; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) : scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - for (int d=0; d<4; d++) { - ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : - static_cast*>(const_cast(U.Ghost()[d])); - ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - - ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : - ghost_ ? static_cast*>(ghost_[d+4]) : - static_cast*>(const_cast(U.Ghost()[d+4])); - ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - } + for (int d = 0; d < 4; d++) { + ghost[d] = ghost_ ? static_cast *>(ghost_[d]) : + static_cast *>(const_cast(U.Ghost()[d])); + ghostOffset[d] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); + + ghost[d + 4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : + ghost_ ? 
static_cast *>(ghost_[d + 4]) : + static_cast *>(const_cast(U.Ghost()[d + 4])); + ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); + } - resetScale(U.Scale()); + resetScale(U.Scale()); } void resetScale(Float max) @@ -585,13 +591,14 @@ namespace quda { } }; - template - __device__ __host__ inline int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb) { - constexpr int M = (2*nColor*nColor) / N; - int j = ((row*nColor+col)*2) / N; // factor of two for complexity - int i = ((row*nColor+col)*2) % N; - int index = ((x_cb + dim*stride*M + j*stride)*2+i) / 2; // back to a complex offset - index += parity*offset_cb; + template + __device__ __host__ inline int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb) + { + constexpr int M = (2 * nColor * nColor) / N; + int j = ((row * nColor + col) * 2) / N; // factor of two for complexity + int i = ((row * nColor + col) * 2) % N; + int index = ((x_cb + dim * stride * M + j * stride) * 2 + i) / 2; // back to a complex offset + index += parity * offset_cb; return index; }; @@ -606,7 +613,7 @@ namespace quda { const int geometry; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) : u(gauge_ ? 
static_cast *>(gauge_) : @@ -618,21 +625,21 @@ namespace quda { scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - resetScale(U.Scale()); + resetScale(U.Scale()); } void resetScale(Float max) { if (fixed) { scale = static_cast(std::numeric_limits::max()) / max; - scale_inv = max / static_cast(std::numeric_limits::max()); + scale_inv = max / static_cast(std::numeric_limits::max()); } } __device__ __host__ inline wrapper operator()(int dim, int parity, int x_cb, int row, int col) const { auto index = parity * offset_cb + dim * stride * nColor * nColor + (row * nColor + col) * stride + x_cb; - return fieldorder_wrapper(u, index, scale, scale_inv); + return fieldorder_wrapper(u, index, scale, scale_inv); } template @@ -640,11 +647,12 @@ namespace quda { const complex &val) const { using vec2 = array; - vec2 *u2 = reinterpret_cast(u + parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb); + vec2 *u2 = reinterpret_cast(u + parity * offset_cb + dim * stride * nColor * nColor + + (row * nColor + col) * stride + x_cb); vec2 val_ = (fixed && !match()) ? - vec2{static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : - vec2{static_cast(val.real()), static_cast(val.imag())}; + vec2 {static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : + vec2 {static_cast(val.real()), static_cast(val.imag())}; atomic_fetch_add(u2, val_); } @@ -681,7 +689,7 @@ namespace quda { unsigned int ghostVolumeCB[8]; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); Accessor accessor; GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_ = 0) : @@ -692,10 +700,12 @@ namespace quda { { if constexpr (!native_ghost) assert(ghost_ != nullptr); for (int d = 0; d < 4; d++) { - ghost[d] = !native_ghost ? 
static_cast*>(ghost_[d]) : nullptr; - ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d); - ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast*>(ghost_[d+4]) : nullptr; - ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d); + ghost[d] = !native_ghost ? static_cast *>(ghost_[d]) : nullptr; + ghostVolumeCB[d] = U.Nface() * U.SurfaceCB(d); + ghost[d + 4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY ? + static_cast *>(ghost_[d + 4]) : + nullptr; + ghostVolumeCB[d + 4] = U.Nface() * U.SurfaceCB(d); } resetScale(U.Scale()); } @@ -767,778 +777,781 @@ namespace quda { ghostAccessor(U, (void *)gauge_, (void **)ghost_) { if (U.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("GaugeField ordering not supported with reconstruction"); - } - - void resetScale(double max) { - accessor.resetScale(max); - ghostAccessor.resetScale(max); - } - - static constexpr bool fixedPoint() { return fixed_point(); } - - /** - * accessor function - * @param d dimension index - * @param parity Parity index - * @param x 1-d site index - * @param row row index - * @param c column index - */ - __device__ __host__ inline auto operator()(int d, int parity, int x, int row, int col) const - { - return accessor(d, parity, x, row, col); - } + } - __device__ __host__ inline auto Ghost(int d, int parity, int x) const { return ghostAccessor(d, parity, x); } - - /** - * accessor function for the ghost zone - * @param d dimension index - * @param parity Parity index - * @param x 1-d site index - * @param row row index - * @param c column index - */ - __device__ __host__ auto Ghost(int d, int parity, int x, int row, int col) const - { - return ghostAccessor(d, parity, x, row, col); - } - /** - * @brief This and the following method (eventually) creates a fieldorder_wrapper object whose pointer points to - * the start of the memory chunk corresponds to the matrix at d, parity, x. Only available for the - * QUDA_MILC_GAUGE_ORDER order. 
- - * @param d dimension index - * @param parity Parity index - * @param x 1-d site index - */ - __device__ __host__ auto wrap_ghost(int d, int parity, int x) const - { - return ghostAccessor(d, parity, x, 0, 0); - } + void resetScale(double max) + { + accessor.resetScale(max); + ghostAccessor.resetScale(max); + } - /** - * Specialized complex-member accessor function (for coarse gauge field) - * @param d dimension index - * @param parity Parity index - * @param x 1-d site index - * @param s_row row spin index - * @param c_row row color index - * @param s_col col spin index - * @param c_col col color index - */ - __device__ __host__ inline auto operator()(int d, int parity, int x, int s_row, int s_col, int c_row, - int c_col) const - { - return (*this)(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col); - } + static constexpr bool fixedPoint() { return fixed_point(); } - /** - * Specialized complex-member accessor function (for coarse gauge field ghost zone) - * @param d dimension index - * @param parity Parity index - * @param x 1-d site index - * @param s_row row spin index - * @param c_row row color index - * @param s_col col spin index - * @param c_col col color index - */ - __device__ __host__ inline auto Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const - { - return Ghost(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col); - } + /** + * accessor function + * @param d dimension index + * @param parity Parity index + * @param x 1-d site index + * @param row row index + * @param c column index + */ + __device__ __host__ inline auto operator()(int d, int parity, int x, int row, int col) const + { + return accessor(d, parity, x, row, col); + } - template - __device__ __host__ inline void atomicAdd(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col, - const complex &val) const - { - accessor.atomic_add(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col, 
val); - } + __device__ __host__ inline auto Ghost(int d, int parity, int x) const { return ghostAccessor(d, parity, x); } + + /** + * accessor function for the ghost zone + * @param d dimension index + * @param parity Parity index + * @param x 1-d site index + * @param row row index + * @param c column index + */ + __device__ __host__ auto Ghost(int d, int parity, int x, int row, int col) const + { + return ghostAccessor(d, parity, x, row, col); + } + /** + * @brief This and the following method (eventually) creates a fieldorder_wrapper object whose pointer points to + * the start of the memory chunk corresponds to the matrix at d, parity, x. Only available for the + * QUDA_MILC_GAUGE_ORDER order. - /** Returns the number of field colors */ - __device__ __host__ inline int Ncolor() const { return nColor; } + * @param d dimension index + * @param parity Parity index + * @param x 1-d site index + */ + __device__ __host__ auto wrap_ghost(int d, int parity, int x) const { return ghostAccessor(d, parity, x, 0, 0); } + + /** + * Specialized complex-member accessor function (for coarse gauge field) + * @param d dimension index + * @param parity Parity index + * @param x 1-d site index + * @param s_row row spin index + * @param c_row row color index + * @param s_col col spin index + * @param c_col col color index + */ + __device__ __host__ inline auto operator()(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const + { + return (*this)(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col); + } - /** Returns the field volume */ - __device__ __host__ inline int Volume() const { return 2*volumeCB; } + /** + * Specialized complex-member accessor function (for coarse gauge field ghost zone) + * @param d dimension index + * @param parity Parity index + * @param x 1-d site index + * @param s_row row spin index + * @param c_row row color index + * @param s_col col spin index + * @param c_col col color index + */ + __device__ __host__ 
inline auto Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const + { + return Ghost(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col); + } - /** Returns the field volume */ - __device__ __host__ inline int VolumeCB() const { return volumeCB; } + template + __device__ __host__ inline void atomicAdd(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col, + const complex &val) const + { + accessor.atomic_add(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col, val); + } - /** Returns the field geometric dimension */ - __device__ __host__ inline int Ndim() const { return nDim; } + /** Returns the number of field colors */ + __device__ __host__ inline int Ncolor() const { return nColor; } - /** Returns the field geometry */ - __device__ __host__ inline int Geometry() const { return geometry; } + /** Returns the field volume */ + __device__ __host__ inline int Volume() const { return 2 * volumeCB; } - /** Returns the number of coarse gauge field spins */ - __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; } + /** Returns the field volume */ + __device__ __host__ inline int VolumeCB() const { return volumeCB; } - /** Returns the number of coarse gauge field colors */ - __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; } + /** Returns the field geometric dimension */ + __device__ __host__ inline int Ndim() const { return nDim; } - /** - * @brief Returns the L1 norm of the field in a given dimension - * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) - * @return L1 norm - */ - __host__ double norm1(int dim=-1, bool global=true) const { - commGlobalReductionPush(global); - double nrm1 = accessor.template transform_reduce>(location, dim, - abs_(accessor.scale_inv)); - commGlobalReductionPop(); - return nrm1; - } + /** Returns the field geometry */ + __device__ __host__ inline int Geometry() const { return 
geometry; } - /** - * @brief Returns the L2 norm squared of the field in a given dimension - * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) - * @return L2 norm squared - */ - __host__ double norm2(int dim = -1, bool global = true) const - { - commGlobalReductionPush(global); - double nrm2 = accessor.template transform_reduce>( - location, dim, square_(accessor.scale_inv)); - commGlobalReductionPop(); - return nrm2; - } + /** Returns the number of coarse gauge field spins */ + __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; } - /** - * @brief Returns the Linfinity norm of the field in a given dimension - * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) - * @return Linfinity norm - */ - __host__ double abs_max(int dim = -1, bool global = true) const - { - commGlobalReductionPush(global); - double absmax = accessor.template transform_reduce>( - location, dim, abs_max_(accessor.scale_inv)); - commGlobalReductionPop(); - return absmax; - } + /** Returns the number of coarse gauge field colors */ + __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; } - /** - * @brief Returns the minimum absolute value of the field - * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) - * @return Minimum norm - */ - __host__ double abs_min(int dim = -1, bool global = true) const - { - commGlobalReductionPush(global); - double absmin = accessor.template transform_reduce>( - location, dim, abs_min_(accessor.scale_inv)); - commGlobalReductionPop(); - return absmin; - } + /** + * @brief Returns the L1 norm of the field in a given dimension + * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) + * @return L1 norm + */ + __host__ double norm1(int dim = -1, bool global = true) const + { + commGlobalReductionPush(global); + double nrm1 = accessor.template transform_reduce>(location, dim, + 
abs_(accessor.scale_inv)); + commGlobalReductionPop(); + return nrm1; + } - /** Return the size of the allocation (geometry and parity left out and added as needed in Tunable::bytes) */ - size_t Bytes() const { return static_cast(volumeCB) * nColor * nColor * 2ll * sizeof(storeFloat); } - }; + /** + * @brief Returns the L2 norm squared of the field in a given dimension + * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) + * @return L2 norm squared + */ + __host__ double norm2(int dim = -1, bool global = true) const + { + commGlobalReductionPush(global); + double nrm2 = accessor.template transform_reduce>(location, dim, + square_(accessor.scale_inv)); + commGlobalReductionPop(); + return nrm2; + } /** - @brief Generic reconstruction helper with no reconstruction - @tparam N number of real numbers in each packed gauge matrix - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange type - to avoid the run-time overhead (dummy for trivial reconstruct - type) - */ - template - struct Reconstruct { - using real = typename mapper::type; - using complex = complex; - real scale; - real scale_inv; - Reconstruct(const GaugeField &u) : - scale(isFixed::value ? u.LinkMax() : 1.0), - scale_inv(isFixed::value ? 
1.0 / scale : 1.0) - { - } + * @brief Returns the Linfinity norm of the field in a given dimension + * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) + * @return Linfinity norm + */ + __host__ double abs_max(int dim = -1, bool global = true) const + { + commGlobalReductionPush(global); + double absmax = accessor.template transform_reduce>( + location, dim, abs_max_(accessor.scale_inv)); + commGlobalReductionPop(); + return absmax; + } - __device__ __host__ inline void Pack(real out[N], const complex in[N / 2]) const - { - if constexpr (isFixed::value) { + /** + * @brief Returns the minimum absolute value of the field + * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) + * @return Minimum norm + */ + __host__ double abs_min(int dim = -1, bool global = true) const + { + commGlobalReductionPush(global); + double absmin = accessor.template transform_reduce>( + location, dim, abs_min_(accessor.scale_inv)); + commGlobalReductionPop(); + return absmin; + } + + /** Return the size of the allocation (geometry and parity left out and added as needed in Tunable::bytes) */ + size_t Bytes() const { return static_cast(volumeCB) * nColor * nColor * 2ll * sizeof(storeFloat); } + }; + + /** + @brief Generic reconstruction helper with no reconstruction + @tparam N number of real numbers in each packed gauge matrix + @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange type + to avoid the run-time overhead (dummy for trivial reconstruct + type) + */ + template + struct Reconstruct { + using real = typename mapper::type; + using complex = complex; + real scale; + real scale_inv; + Reconstruct(const GaugeField &u) : + scale(isFixed::value ? u.LinkMax() : 1.0), scale_inv(isFixed::value ? 
1.0 / scale : 1.0) + { + } + + __device__ __host__ inline void Pack(real out[N], const complex in[N / 2]) const + { + if constexpr (isFixed::value) { #pragma unroll - for (int i = 0; i < N / 2; i++) { - out[2 * i + 0] = scale_inv * in[i].real(); - out[2 * i + 1] = scale_inv * in[i].imag(); - } - } else { + for (int i = 0; i < N / 2; i++) { + out[2 * i + 0] = scale_inv * in[i].real(); + out[2 * i + 1] = scale_inv * in[i].imag(); + } + } else { #pragma unroll - for (int i = 0; i < N / 2; i++) { - out[2 * i + 0] = in[i].real(); - out[2 * i + 1] = in[i].imag(); - } + for (int i = 0; i < N / 2; i++) { + out[2 * i + 0] = in[i].real(); + out[2 * i + 1] = in[i].imag(); } } + } - template - __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int, int, real, const I *, - const int *) const - { - if constexpr (isFixed::value) { + template + __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int, int, real, const I *, + const int *) const + { + if constexpr (isFixed::value) { #pragma unroll - for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); } - } else { + for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); } + } else { #pragma unroll - for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); } - } + for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); } } - __device__ __host__ inline real getPhase(const complex[]) const { return 0; } - }; + } + __device__ __host__ inline real getPhase(const complex[]) const { return 0; } + }; - /** - @brief timeBoundary Compute boundary condition correction - @tparam ghostExhange_ Optional template the ghostExchange type to avoid the run-time overhead - @param idx extended field linear index - @param X the gauge field dimensions - @param R the radii dimenions of the extended region - @param tBoundary the boundary condition - @param isFirstTimeSlice if we're on 
the first time slice of nodes - @param isLastTimeSlide if we're on the last time slice of nodes - @param ghostExchange if the field is extended or not (determines indexing type) - */ - template - __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], - T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice, - bool isLastTimeSlice, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO) - { - - // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale - - if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD - || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) { - if (idx >= firstTimeSliceBound) { // halo region on the first time slice - return isFirstTimeSlice ? tBoundary : scale; - } else if (idx >= lastTimeSliceBound) { // last link on the last time slice - return isLastTimeSlice ? tBoundary : scale; - } else { - return scale; - } - } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED - || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) { - if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) { - // the boundary condition is on the R[3]-1 time slice - return isFirstTimeSlice ? tBoundary : scale; - } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) { - // the boundary condition lies on the X[3]-R[3]-1 time slice - return isLastTimeSlice ? 
tBoundary : scale; - } else { - return scale; - } + /** + @brief timeBoundary Compute boundary condition correction + @tparam ghostExhange_ Optional template the ghostExchange type to avoid the run-time overhead + @param idx extended field linear index + @param X the gauge field dimensions + @param R the radii dimenions of the extended region + @param tBoundary the boundary condition + @param isFirstTimeSlice if we're on the first time slice of nodes + @param isLastTimeSlide if we're on the last time slice of nodes + @param ghostExchange if the field is extended or not (determines indexing type) + */ + template + __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], T tBoundary, + T scale, int firstTimeSliceBound, int lastTimeSliceBound, + bool isFirstTimeSlice, bool isLastTimeSlice, + QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO) + { + + // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale + + if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD + || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) { + if (idx >= firstTimeSliceBound) { // halo region on the first time slice + return isFirstTimeSlice ? tBoundary : scale; + } else if (idx >= lastTimeSliceBound) { // last link on the last time slice + return isLastTimeSlice ? tBoundary : scale; + } else { + return scale; + } + } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED + || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) { + if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) { + // the boundary condition is on the R[3]-1 time slice + return isFirstTimeSlice ? tBoundary : scale; + } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) { + // the boundary condition lies on the X[3]-R[3]-1 time slice + return isLastTimeSlice ? 
tBoundary : scale; + } else { + return scale; } - return scale; } + return scale; + } - // not actually used - here for reference - template - __device__ __host__ inline Float milcStaggeredPhase(int dim, const int x[], const I R[]) { - // could consider non-extended variant too? - Float sign = static_cast(1.0); - switch (dim) { - case 0: if ( ((x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; - case 1: if ( ((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; - case 2: if ( ((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; - } - return sign; - } + // not actually used - here for reference + template + __device__ __host__ inline Float milcStaggeredPhase(int dim, const int x[], const I R[]) + { + // could consider non-extended variant too? + Float sign = static_cast(1.0); + switch (dim) { + case 0: + if (((x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); + break; + case 1: + if (((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); + break; + case 2: + if (((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); + break; + } + return sign; + } - /** - @brief Gauge reconstruct 12 helper where we reconstruct the - third row from the cross product of the first two rows - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange - type to avoid the run-time overhead - */ - template struct Reconstruct<12, Float, ghostExchange_> { - using real = typename mapper::type; - using complex = complex; - const real anisotropy; - const real tBoundary; - const int firstTimeSliceBound; - const int lastTimeSliceBound; - const bool isFirstTimeSlice; - const bool isLastTimeSlice; - QudaGhostExchange ghostExchange; - - Reconstruct(const GaugeField &u) : - anisotropy(u.Anisotropy()), - tBoundary(static_cast(u.TBoundary())), - firstTimeSliceBound(u.VolumeCB()), - lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 
2), - isFirstTimeSlice(comm_coord(3) == 0 ? true : false), - isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false), - ghostExchange(u.GhostExchange()) - { - } + /** + @brief Gauge reconstruct 12 helper where we reconstruct the + third row from the cross product of the first two rows + @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange + type to avoid the run-time overhead + */ + template struct Reconstruct<12, Float, ghostExchange_> { + using real = typename mapper::type; + using complex = complex; + const real anisotropy; + const real tBoundary; + const int firstTimeSliceBound; + const int lastTimeSliceBound; + const bool isFirstTimeSlice; + const bool isLastTimeSlice; + QudaGhostExchange ghostExchange; - __device__ __host__ inline void Pack(real out[12], const complex in[9]) const - { + Reconstruct(const GaugeField &u) : + anisotropy(u.Anisotropy()), + tBoundary(static_cast(u.TBoundary())), + firstTimeSliceBound(u.VolumeCB()), + lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2), + isFirstTimeSlice(comm_coord(3) == 0 ? true : false), + isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false), + ghostExchange(u.GhostExchange()) + { + } + + __device__ __host__ inline void Pack(real out[12], const complex in[9]) const + { #pragma unroll - for (int i = 0; i < 6; i++) { - out[2 * i + 0] = in[i].real(); - out[2 * i + 1] = in[i].imag(); - } + for (int i = 0; i < 6; i++) { + out[2 * i + 0] = in[i].real(); + out[2 * i + 1] = in[i].imag(); } + } - template - __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real, const I *X, - const int *R) const - { + template + __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real, const I *X, + const int *R) const + { #pragma unroll - for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]); - - const real u0 = dir < 3 ? 
- anisotropy : - timeBoundary(idx, X, R, tBoundary, static_cast(1.0), firstTimeSliceBound, - lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange); - - // out[6] = u0*conj(out[1]*out[5] - out[2]*out[4]); - out[6] = cmul(out[2], out[4]); - out[6] = cmac(out[1], out[5], -out[6]); - out[6] = u0 * conj(out[6]); - - // out[7] = u0*conj(out[2]*out[3] - out[0]*out[5]); - out[7] = cmul(out[0], out[5]); - out[7] = cmac(out[2], out[3], -out[7]); - out[7] = u0 * conj(out[7]); - - // out[8] = u0*conj(out[0]*out[4] - out[1]*out[3]); - out[8] = cmul(out[1], out[3]); - out[8] = cmac(out[0], out[4], -out[8]); - out[8] = u0 * conj(out[8]); - } + for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]); - __device__ __host__ inline real getPhase(const complex[]) const { return 0; } - }; + const real u0 = dir < 3 ? + anisotropy : + timeBoundary(idx, X, R, tBoundary, static_cast(1.0), firstTimeSliceBound, + lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange); - /** - @brief Gauge reconstruct helper for Momentum field with 10 - packed elements (really 9 from the Lie algebra, with zero for - last element). We label this as 11 to avoid collisions with - simple load/store of momentum field where we do not seek to - unpack/pack. 
- @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange - type to avoid the run-time overhead - */ - template struct Reconstruct<11, Float, ghostExchange_> { - using real = typename mapper::type; - using complex = complex; + // out[6] = u0*conj(out[1]*out[5] - out[2]*out[4]); + out[6] = cmul(out[2], out[4]); + out[6] = cmac(out[1], out[5], -out[6]); + out[6] = u0 * conj(out[6]); - Reconstruct(const GaugeField &) { ; } + // out[7] = u0*conj(out[2]*out[3] - out[0]*out[5]); + out[7] = cmul(out[0], out[5]); + out[7] = cmac(out[2], out[3], -out[7]); + out[7] = u0 * conj(out[7]); - __device__ __host__ inline void Pack(real out[10], const complex in[9]) const - { + // out[8] = u0*conj(out[0]*out[4] - out[1]*out[3]); + out[8] = cmul(out[1], out[3]); + out[8] = cmac(out[0], out[4], -out[8]); + out[8] = u0 * conj(out[8]); + } + + __device__ __host__ inline real getPhase(const complex[]) const { return 0; } + }; + + /** + @brief Gauge reconstruct helper for Momentum field with 10 + packed elements (really 9 from the Lie algebra, with zero for + last element). We label this as 11 to avoid collisions with + simple load/store of momentum field where we do not seek to + unpack/pack. 
+ @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange + type to avoid the run-time overhead + */ + template struct Reconstruct<11, Float, ghostExchange_> { + using real = typename mapper::type; + using complex = complex; + + Reconstruct(const GaugeField &) { ; } + + __device__ __host__ inline void Pack(real out[10], const complex in[9]) const + { #pragma unroll - for (int i = 0; i < 2; i++) { - out[2 * i + 0] = in[i + 1].real(); - out[2 * i + 1] = in[i + 1].imag(); - } - out[4] = in[5].real(); - out[5] = in[5].imag(); - out[6] = in[0].imag(); - out[7] = in[4].imag(); - out[8] = in[8].imag(); - out[9] = 0.0; + for (int i = 0; i < 2; i++) { + out[2 * i + 0] = in[i + 1].real(); + out[2 * i + 1] = in[i + 1].imag(); } + out[4] = in[5].real(); + out[5] = in[5].imag(); + out[6] = in[0].imag(); + out[7] = in[4].imag(); + out[8] = in[8].imag(); + out[9] = 0.0; + } - template - __device__ __host__ inline void Unpack(complex out[9], const real in[10], int, int, real, const I *, - const int *) const - { - out[0] = complex(0.0, in[6]); - out[1] = complex(in[0], in[1]); - out[2] = complex(in[2], in[3]); - out[3] = complex(-out[1].real(), out[1].imag()); - out[4] = complex(0.0, in[7]); - out[5] = complex(in[4], in[5]); - out[6] = complex(-out[2].real(), out[2].imag()); - out[7] = complex(-out[5].real(), out[5].imag()); - out[8] = complex(0.0, in[8]); - } + template + __device__ __host__ inline void Unpack(complex out[9], const real in[10], int, int, real, const I *, const int *) const + { + out[0] = complex(0.0, in[6]); + out[1] = complex(in[0], in[1]); + out[2] = complex(in[2], in[3]); + out[3] = complex(-out[1].real(), out[1].imag()); + out[4] = complex(0.0, in[7]); + out[5] = complex(in[4], in[5]); + out[6] = complex(-out[2].real(), out[2].imag()); + out[7] = complex(-out[5].real(), out[5].imag()); + out[8] = complex(0.0, in[8]); + } - __device__ __host__ inline real getPhase(const complex[]) const { return 0; 
} - }; + __device__ __host__ inline real getPhase(const complex[]) const { return 0; } + }; - /** - @brief Gauge reconstruct 13 helper where we reconstruct the - third row from the cross product of the first two rows, and - include a non-trivial phase factor - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange - type to avoid the run-time overhead - */ - template - struct Reconstruct<13, Float, ghostExchange_, stag_phase> { - using real = typename mapper::type; - using complex = complex; - const Reconstruct<12, Float, ghostExchange_> reconstruct_12; - const real scale; - const real scale_inv; + /** + @brief Gauge reconstruct 13 helper where we reconstruct the + third row from the cross product of the first two rows, and + include a non-trivial phase factor + @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange + type to avoid the run-time overhead + */ + template + struct Reconstruct<13, Float, ghostExchange_, stag_phase> { + using real = typename mapper::type; + using complex = complex; + const Reconstruct<12, Float, ghostExchange_> reconstruct_12; + const real scale; + const real scale_inv; - Reconstruct(const GaugeField &u) : reconstruct_12(u), scale(u.Scale()), scale_inv(1.0 / scale) {} + Reconstruct(const GaugeField &u) : reconstruct_12(u), scale(u.Scale()), scale_inv(1.0 / scale) { } - __device__ __host__ inline void Pack(real out[12], const complex in[9]) const { reconstruct_12.Pack(out, in); } + __device__ __host__ inline void Pack(real out[12], const complex in[9]) const { reconstruct_12.Pack(out, in); } - template - __device__ __host__ inline void Unpack(complex out[9], const real in[12], int, int, real phase, const I *, - const int *) const - { + template + __device__ __host__ inline void Unpack(complex out[9], const real in[12], int, int, real phase, const I *, + const int *) const + { #pragma unroll - for (int i = 0; i < 
6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]); - - out[6] = cmul(out[2], out[4]); - out[6] = cmac(out[1], out[5], -out[6]); - out[6] = scale_inv * conj(out[6]); - - out[7] = cmul(out[0], out[5]); - out[7] = cmac(out[2], out[3], -out[7]); - out[7] = scale_inv * conj(out[7]); - - out[8] = cmul(out[1], out[3]); - out[8] = cmac(out[0], out[4], -out[8]); - out[8] = scale_inv * conj(out[8]); - - if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing - // Multiply the third row by exp(I*3*phase), since the cross product will end up in a scale factor of exp(-I*2*phase) - real cos_sin[2]; - sincospi(static_cast(3.0) * phase, &cos_sin[1], &cos_sin[0]); - complex A(cos_sin[0], cos_sin[1]); - out[6] = cmul(A, out[6]); - out[7] = cmul(A, out[7]); - out[8] = cmul(A, out[8]); - } else { // phase is +/- 1 so real multiply is sufficient - out[6] *= phase; - out[7] *= phase; - out[8] *= phase; - } + for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]); + + out[6] = cmul(out[2], out[4]); + out[6] = cmac(out[1], out[5], -out[6]); + out[6] = scale_inv * conj(out[6]); + + out[7] = cmul(out[0], out[5]); + out[7] = cmac(out[2], out[3], -out[7]); + out[7] = scale_inv * conj(out[7]); + + out[8] = cmul(out[1], out[3]); + out[8] = cmac(out[0], out[4], -out[8]); + out[8] = scale_inv * conj(out[8]); + + if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing + // Multiply the third row by exp(I*3*phase), since the cross product will end up in a scale factor of exp(-I*2*phase) + real cos_sin[2]; + sincospi(static_cast(3.0) * phase, &cos_sin[1], &cos_sin[0]); + complex A(cos_sin[0], cos_sin[1]); + out[6] = cmul(A, out[6]); + out[7] = cmul(A, out[7]); + out[8] = cmul(A, out[8]); + } else { // phase is +/- 1 so real multiply is sufficient + out[6] *= phase; + out[7] *= phase; + out[8] *= phase; } + } - __device__ __host__ inline real getPhase(const complex in[9]) const - { + __device__ __host__ inline real getPhase(const complex 
in[9]) const + { #if 1 // phase from cross product - // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])* - complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv; - complex expI3Phase = in[8] / denom; // numerator = U[2][2] - - // dynamic phasing - if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) return arg(expI3Phase) / static_cast(3.0 * M_PI); - // static phasing - return expI3Phase.real() > 0 ? 1 : -1; + // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])* + complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv; + complex expI3Phase = in[8] / denom; // numerator = U[2][2] + + // dynamic phasing + if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) return arg(expI3Phase) / static_cast(3.0 * M_PI); + // static phasing + return expI3Phase.real() > 0 ? 1 : -1; #else // phase from determinant - Matrix a; + Matrix a; #pragma unroll - for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i]; - const complex det = getDeterminant(a); - return phase = arg(det) / static_cast(3.0 * M_PI); + for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i]; + const complex det = getDeterminant(a); + return phase = arg(det) / static_cast(3.0 * M_PI); #endif - } - }; + } + }; - /** - @brief Gauge reconstruct 8 helper where we reconstruct the gauge - matrix from 8 packed elements (maximal compression) - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange type - to avoid the run-time overhead - */ - template struct Reconstruct<8, Float, ghostExchange_> { - using real = typename mapper::type; - using complex = complex; - const complex anisotropy; // imaginary value stores inverse - const complex tBoundary; // imaginary value stores inverse - const int firstTimeSliceBound; - const int lastTimeSliceBound; - const bool isFirstTimeSlice; - const bool isLastTimeSlice; - QudaGhostExchange ghostExchange; - - // scale factor is set when using recon-9 - Reconstruct(const GaugeField &u, real scale = 1.0) : - 
anisotropy(u.Anisotropy() * scale, 1.0 / (u.Anisotropy() * scale)), - tBoundary(static_cast(u.TBoundary()) * scale, 1.0 / (static_cast(u.TBoundary()) * scale)), - firstTimeSliceBound(u.VolumeCB()), - lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2), - isFirstTimeSlice(comm_coord(3) == 0 ? true : false), - isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false), - ghostExchange(u.GhostExchange()) - { - } + /** + @brief Gauge reconstruct 8 helper where we reconstruct the gauge + matrix from 8 packed elements (maximal compression) + @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange type + to avoid the run-time overhead + */ + template struct Reconstruct<8, Float, ghostExchange_> { + using real = typename mapper::type; + using complex = complex; + const complex anisotropy; // imaginary value stores inverse + const complex tBoundary; // imaginary value stores inverse + const int firstTimeSliceBound; + const int lastTimeSliceBound; + const bool isFirstTimeSlice; + const bool isLastTimeSlice; + QudaGhostExchange ghostExchange; - // Pack and unpack are described in https://arxiv.org/pdf/0911.3191.pdf - // Method was modified to avoid the singularity at unit gauge by - // compressing the matrix {{b1,b2,b3},{a1,a2,a3},{-c1,-c2,-c3}} - // instead of {{a1,a2,a3},{b1,b2,b3},{c1,c2,c3}} + // scale factor is set when using recon-9 + Reconstruct(const GaugeField &u, real scale = 1.0) : + anisotropy(u.Anisotropy() * scale, 1.0 / (u.Anisotropy() * scale)), + tBoundary(static_cast(u.TBoundary()) * scale, 1.0 / (static_cast(u.TBoundary()) * scale)), + firstTimeSliceBound(u.VolumeCB()), + lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2), + isFirstTimeSlice(comm_coord(3) == 0 ? true : false), + isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? 
true : false), + ghostExchange(u.GhostExchange()) + { + } - __device__ __host__ inline void Pack(real out[8], const complex in[9]) const - { - out[0] = atan2(in[3].imag(), in[3].real()) / static_cast(M_PI); // a1 -> b1 - out[1] = atan2(-in[6].imag(), -in[6].real()) / static_cast(M_PI); // c1 -> -c1 - - out[2] = in[4].real(); - out[3] = in[4].imag(); // a2 -> b2 - out[4] = in[5].real(); - out[5] = in[5].imag(); // a3 -> b3 - out[6] = in[0].real(); - out[7] = in[0].imag(); // b1 -> a1 - } + // Pack and unpack are described in https://arxiv.org/pdf/0911.3191.pdf + // Method was modified to avoid the singularity at unit gauge by + // compressing the matrix {{b1,b2,b3},{a1,a2,a3},{-c1,-c2,-c3}} + // instead of {{a1,a2,a3},{b1,b2,b3},{c1,c2,c3}} - template - __device__ __host__ inline void Unpack(complex out[9], const real in[8], int, int, real, const I *, const int *, - const complex, const complex u) const - { - real u0 = u.real(); - real u0_inv = u.imag(); + __device__ __host__ inline void Pack(real out[8], const complex in[9]) const + { + out[0] = atan2(in[3].imag(), in[3].real()) / static_cast(M_PI); // a1 -> b1 + out[1] = atan2(-in[6].imag(), -in[6].real()) / static_cast(M_PI); // c1 -> -c1 + + out[2] = in[4].real(); + out[3] = in[4].imag(); // a2 -> b2 + out[4] = in[5].real(); + out[5] = in[5].imag(); // a3 -> b3 + out[6] = in[0].real(); + out[7] = in[0].imag(); // b1 -> a1 + } + + template + __device__ __host__ inline void Unpack(complex out[9], const real in[8], int, int, real, const I *, const int *, + const complex, const complex u) const + { + real u0 = u.real(); + real u0_inv = u.imag(); #pragma unroll - for (int i = 1; i <= 3; i++) - out[i] = complex(in[2 * i + 0], in[2 * i + 1]); // these elements are copied directly + for (int i = 1; i <= 3; i++) + out[i] = complex(in[2 * i + 0], in[2 * i + 1]); // these elements are copied directly - real tmp[2]; - quda::sincospi(in[0], &tmp[1], &tmp[0]); - out[0] = complex(tmp[0], tmp[1]); + real tmp[2]; + 
quda::sincospi(in[0], &tmp[1], &tmp[0]); + out[0] = complex(tmp[0], tmp[1]); - quda::sincospi(in[1], &tmp[1], &tmp[0]); - out[6] = complex(tmp[0], tmp[1]); + quda::sincospi(in[1], &tmp[1], &tmp[0]); + out[6] = complex(tmp[0], tmp[1]); - // First, reconstruct first row - real row_sum = out[1].real() * out[1].real(); - row_sum += out[1].imag() * out[1].imag(); - row_sum += out[2].real() * out[2].real(); - row_sum += out[2].imag() * out[2].imag(); - real row_sum_inv = static_cast(1.0) / row_sum; + // First, reconstruct first row + real row_sum = out[1].real() * out[1].real(); + row_sum += out[1].imag() * out[1].imag(); + row_sum += out[2].real() * out[2].real(); + row_sum += out[2].imag() * out[2].imag(); + real row_sum_inv = static_cast(1.0) / row_sum; - real diff = u0_inv * u0_inv - row_sum; - real U00_mag = diff > 0.0 ? diff * quda::rsqrt(diff) : static_cast(0.0); + real diff = u0_inv * u0_inv - row_sum; + real U00_mag = diff > 0.0 ? diff * quda::rsqrt(diff) : static_cast(0.0); - out[0] *= U00_mag; + out[0] *= U00_mag; - // Second, reconstruct first column - real column_sum = out[0].real() * out[0].real(); - column_sum += out[0].imag() * out[0].imag(); - column_sum += out[3].real() * out[3].real(); - column_sum += out[3].imag() * out[3].imag(); + // Second, reconstruct first column + real column_sum = out[0].real() * out[0].real(); + column_sum += out[0].imag() * out[0].imag(); + column_sum += out[3].real() * out[3].real(); + column_sum += out[3].imag() * out[3].imag(); - diff = u0_inv * u0_inv - column_sum; - real U20_mag = diff > 0.0 ? diff * quda::rsqrt(diff) : static_cast(0.0); + diff = u0_inv * u0_inv - column_sum; + real U20_mag = diff > 0.0 ? 
diff * quda::rsqrt(diff) : static_cast(0.0); - out[6] *= U20_mag; + out[6] *= U20_mag; - // Finally, reconstruct last elements from SU(2) rotation - real r_inv2 = u0_inv * row_sum_inv; - { - complex A = cmul(conj(out[0]), out[3]); + // Finally, reconstruct last elements from SU(2) rotation + real r_inv2 = u0_inv * row_sum_inv; + { + complex A = cmul(conj(out[0]), out[3]); - // out[4] = -(conj(out[6])*conj(out[2]) + u0*A*out[1])*r_inv2; // U11 - out[4] = cmul(conj(out[6]), conj(out[2])); - out[4] = cmac(u0 * A, out[1], out[4]); - out[4] = -r_inv2 * out[4]; + // out[4] = -(conj(out[6])*conj(out[2]) + u0*A*out[1])*r_inv2; // U11 + out[4] = cmul(conj(out[6]), conj(out[2])); + out[4] = cmac(u0 * A, out[1], out[4]); + out[4] = -r_inv2 * out[4]; - // out[5] = (conj(out[6])*conj(out[1]) - u0*A*out[2])*r_inv2; // U12 - out[5] = cmul(conj(out[6]), conj(out[1])); - out[5] = cmac(-u0 * A, out[2], out[5]); - out[5] = r_inv2 * out[5]; - } + // out[5] = (conj(out[6])*conj(out[1]) - u0*A*out[2])*r_inv2; // U12 + out[5] = cmul(conj(out[6]), conj(out[1])); + out[5] = cmac(-u0 * A, out[2], out[5]); + out[5] = r_inv2 * out[5]; + } - { - complex A = cmul(conj(out[0]), out[6]); + { + complex A = cmul(conj(out[0]), out[6]); - // out[7] = (conj(out[3])*conj(out[2]) - u0*A*out[1])*r_inv2; // U21 - out[7] = cmul(conj(out[3]), conj(out[2])); - out[7] = cmac(-u0 * A, out[1], out[7]); - out[7] = r_inv2 * out[7]; + // out[7] = (conj(out[3])*conj(out[2]) - u0*A*out[1])*r_inv2; // U21 + out[7] = cmul(conj(out[3]), conj(out[2])); + out[7] = cmac(-u0 * A, out[1], out[7]); + out[7] = r_inv2 * out[7]; - // out[8] = -(conj(out[3])*conj(out[1]) + u0*A*out[2])*r_inv2; // U12 - out[8] = cmul(conj(out[3]), conj(out[1])); - out[8] = cmac(u0 * A, out[2], out[8]); - out[8] = -r_inv2 * out[8]; - } + // out[8] = -(conj(out[3])*conj(out[1]) + u0*A*out[2])*r_inv2; // U12 + out[8] = cmul(conj(out[3]), conj(out[1])); + out[8] = cmac(u0 * A, out[2], out[8]); + out[8] = -r_inv2 * out[8]; + } - // Rearrange 
{{b1,b2,b3},{a1,a2,a3},{-c1,-c2,-c3}} back - // to {{a1,a2,a3},{b1,b2,b3},{c1,c2,c3}} + // Rearrange {{b1,b2,b3},{a1,a2,a3},{-c1,-c2,-c3}} back + // to {{a1,a2,a3},{b1,b2,b3},{c1,c2,c3}} #pragma unroll - for (int i = 0; i < 3; i++) { - const auto tmp = out[i]; - out[i] = out[i + 3]; - out[i + 3] = tmp; - out[i + 6] = -out[i + 6]; - } + for (int i = 0; i < 3; i++) { + const auto tmp = out[i]; + out[i] = out[i + 3]; + out[i + 3] = tmp; + out[i + 6] = -out[i + 6]; } + } - template - __device__ __host__ inline void - Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, - const complex scale = complex(static_cast(1.0), static_cast(1.0))) const - { - complex u = dir < 3 ? - anisotropy : - timeBoundary(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound, - isFirstTimeSlice, isLastTimeSlice, ghostExchange); + template + __device__ __host__ inline void + Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, + const complex scale = complex(static_cast(1.0), static_cast(1.0))) const + { + complex u = dir < 3 ? 
+ anisotropy : + timeBoundary(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound, + isFirstTimeSlice, isLastTimeSlice, ghostExchange); - Unpack(out, in, idx, dir, phase, X, R, scale, u); - } + Unpack(out, in, idx, dir, phase, X, R, scale, u); + } - __device__ __host__ inline real getPhase(const complex[]) const { return 0; } - }; + __device__ __host__ inline real getPhase(const complex[]) const { return 0; } + }; - /** - @brief Gauge reconstruct 9 helper where we reconstruct the gauge - matrix from 8 packed elements (maximal compression) and include a - non-trivial phase factor - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange type - to avoid the run-time overhead - */ - template - struct Reconstruct<9, Float, ghostExchange_, stag_phase> { - using real = typename mapper::type; - using complex = complex; - const Reconstruct<8, Float, ghostExchange_> reconstruct_8; - const real scale; - const real scale_inv; + /** + @brief Gauge reconstruct 9 helper where we reconstruct the gauge + matrix from 8 packed elements (maximal compression) and include a + non-trivial phase factor + @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange type + to avoid the run-time overhead + */ + template + struct Reconstruct<9, Float, ghostExchange_, stag_phase> { + using real = typename mapper::type; + using complex = complex; + const Reconstruct<8, Float, ghostExchange_> reconstruct_8; + const real scale; + const real scale_inv; - Reconstruct(const GaugeField &u) : reconstruct_8(u), scale(u.Scale()), scale_inv(1.0 / scale) {} + Reconstruct(const GaugeField &u) : reconstruct_8(u), scale(u.Scale()), scale_inv(1.0 / scale) { } - __device__ __host__ inline real getPhase(const complex in[9]) const - { + __device__ __host__ inline real getPhase(const complex in[9]) const + { #if 1 // phase from cross product - // denominator = (U[0][0]*U[1][1] 
- U[0][1]*U[1][0])* - complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv; - complex expI3Phase = in[8] / denom; // numerator = U[2][2] - // dynamic phasing - if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) return arg(expI3Phase) / static_cast(3.0 * M_PI); - // static phasing - return expI3Phase.real() > 0 ? 1 : -1; + // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])* + complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv; + complex expI3Phase = in[8] / denom; // numerator = U[2][2] + // dynamic phasing + if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) return arg(expI3Phase) / static_cast(3.0 * M_PI); + // static phasing + return expI3Phase.real() > 0 ? 1 : -1; #else // phase from determinant - Matrix a; + Matrix a; #pragma unroll - for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i]; - const complex det = getDeterminant(a); - real phase = arg(det) / static_cast(3.0 * M_PI); - return phase; + for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i]; + const complex det = getDeterminant(a); + real phase = arg(det) / static_cast(3.0 * M_PI); + return phase; #endif - } + } - // Rescale the U3 input matrix by exp(-I*phase) to obtain an SU3 matrix multiplied by a real scale factor, - __device__ __host__ inline void Pack(real out[8], const complex in[9]) const - { - real phase = getPhase(in); - complex su3[9]; - - if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { - real cos_sin[2]; - sincospi(static_cast(-phase), &cos_sin[1], &cos_sin[0]); - complex z(cos_sin[0], cos_sin[1]); - z *= scale_inv; + // Rescale the U3 input matrix by exp(-I*phase) to obtain an SU3 matrix multiplied by a real scale factor, + __device__ __host__ inline void Pack(real out[8], const complex in[9]) const + { + real phase = getPhase(in); + complex su3[9]; + + if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { + real cos_sin[2]; + sincospi(static_cast(-phase), &cos_sin[1], &cos_sin[0]); + complex z(cos_sin[0], cos_sin[1]); + z *= scale_inv; #pragma unroll - 
for (int i = 0; i < 9; i++) su3[i] = cmul(z, in[i]); - } else { + for (int i = 0; i < 9; i++) su3[i] = cmul(z, in[i]); + } else { #pragma unroll - for (int i = 0; i < 9; i++) { su3[i] = phase * in[i]; } - } - reconstruct_8.Pack(out, su3); + for (int i = 0; i < 9; i++) { su3[i] = phase * in[i]; } } + reconstruct_8.Pack(out, su3); + } - template - __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, - const I *X, const int *R) const - { - reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, complex(static_cast(1.0), static_cast(1.0)), - complex(static_cast(1.0), static_cast(1.0))); - - if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phase - real cos_sin[2]; - sincospi(static_cast(phase), &cos_sin[1], &cos_sin[0]); - complex z(cos_sin[0], cos_sin[1]); - z *= scale; + template + __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, + const int *R) const + { + reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, complex(static_cast(1.0), static_cast(1.0)), + complex(static_cast(1.0), static_cast(1.0))); + + if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phase + real cos_sin[2]; + sincospi(static_cast(phase), &cos_sin[1], &cos_sin[0]); + complex z(cos_sin[0], cos_sin[1]); + z *= scale; #pragma unroll - for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]); - } else { // stagic phase + for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]); + } else { // stagic phase #pragma unroll - for (int i = 0; i < 18; i++) { out[i] *= phase; } - } + for (int i = 0; i < 18; i++) { out[i] *= phase; } } - }; - - __host__ __device__ constexpr int ct_sqrt(int n, int i = 1) - { - return n == i ? n : (i * i < n ? 
ct_sqrt(n, i + 1) : i); } + }; - /** - @brief Return the number of colors of the accessor based on the length of the field - @param[in] length Number of real numbers per link - @return Number of colors (=sqrt(length/2)) - */ - __host__ __device__ constexpr int Ncolor(int length) { return ct_sqrt(length / 2); } + __host__ __device__ constexpr int ct_sqrt(int n, int i = 1) + { + return n == i ? n : (i * i < n ? ct_sqrt(n, i + 1) : i); + } - // we default to huge allocations for gauge field (for now) - constexpr bool default_huge_alloc = true; + /** + @brief Return the number of colors of the accessor based on the length of the field + @param[in] length Number of real numbers per link + @return Number of colors (=sqrt(length/2)) + */ + __host__ __device__ constexpr int Ncolor(int length) { return ct_sqrt(length / 2); } - template constexpr bool static_phase() - { - switch (phase) { - case QUDA_STAGGERED_PHASE_MILC: - case QUDA_STAGGERED_PHASE_CPS: - case QUDA_STAGGERED_PHASE_TIFR: return true; - default: return false; - } + // we default to huge allocations for gauge field (for now) + constexpr bool default_huge_alloc = true; + + template constexpr bool static_phase() + { + switch (phase) { + case QUDA_STAGGERED_PHASE_MILC: + case QUDA_STAGGERED_PHASE_CPS: + case QUDA_STAGGERED_PHASE_TIFR: return true; + default: return false; } + } - template - struct FloatNOrder { - using Accessor - = FloatNOrder; - - using store_t = Float; - static constexpr int length = length_; - using real = typename mapper::type; - using complex = complex; - typedef typename VectorType::type Vector; - typedef typename AllocType::type AllocInt; - Reconstruct reconstruct; - static constexpr int reconLen = (reconLenParam == 11) ? 10 : reconLenParam; - static constexpr int hasPhase = (reconLen == 9 || reconLen == 13) ? 
1 : 0; - Float *gauge; - const AllocInt offset; - Float *ghost[4]; - QudaGhostExchange ghostExchange; - int coords[QUDA_MAX_DIM]; - int_fastdiv X[QUDA_MAX_DIM]; - int R[QUDA_MAX_DIM]; - const int volumeCB; - int faceVolumeCB[4]; - const int stride; - const int geometry; - const AllocInt phaseOffset; - size_t bytes; - - FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - reconstruct(u), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - offset(u.Bytes() / (2 * sizeof(Float) * N)), - ghostExchange(u.GhostExchange()), - volumeCB(u.VolumeCB()), - stride(u.Stride()), - geometry(u.Geometry()), - phaseOffset(u.PhaseOffset() / sizeof(Float)), - bytes(u.Bytes()) - { - if (geometry == QUDA_COARSE_GEOMETRY) - errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); - - // static_assert( !(stag_phase!=QUDA_STAGGERED_PHASE_NO && reconLenParam != 18 && reconLenParam != 12), - // "staggered phase only presently supported for 18 and 12 reconstruct"); - for (int i = 0; i < 4; i++) { - X[i] = u.X()[i]; - R[i] = u.R()[i]; - ghost[i] = ghost_ ? ghost_[i] : 0; - faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth - } + template + struct FloatNOrder { + using Accessor = FloatNOrder; + + using store_t = Float; + static constexpr int length = length_; + using real = typename mapper::type; + using complex = complex; + typedef typename VectorType::type Vector; + typedef typename AllocType::type AllocInt; + Reconstruct reconstruct; + static constexpr int reconLen = (reconLenParam == 11) ? 10 : reconLenParam; + static constexpr int hasPhase = (reconLen == 9 || reconLen == 13) ? 
1 : 0; + Float *gauge; + const AllocInt offset; + Float *ghost[4]; + QudaGhostExchange ghostExchange; + int coords[QUDA_MAX_DIM]; + int_fastdiv X[QUDA_MAX_DIM]; + int R[QUDA_MAX_DIM]; + const int volumeCB; + int faceVolumeCB[4]; + const int stride; + const int geometry; + const AllocInt phaseOffset; + size_t bytes; + + FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + reconstruct(u), + gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + offset(u.Bytes() / (2 * sizeof(Float) * N)), + ghostExchange(u.GhostExchange()), + volumeCB(u.VolumeCB()), + stride(u.Stride()), + geometry(u.Geometry()), + phaseOffset(u.PhaseOffset() / sizeof(Float)), + bytes(u.Bytes()) + { + if (geometry == QUDA_COARSE_GEOMETRY) + errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); + + // static_assert( !(stag_phase!=QUDA_STAGGERED_PHASE_NO && reconLenParam != 18 && reconLenParam != 12), + // "staggered phase only presently supported for 18 and 12 reconstruct"); + for (int i = 0; i < 4; i++) { + X[i] = u.X()[i]; + R[i] = u.R()[i]; + ghost[i] = ghost_ ? 
ghost_[i] : 0; + faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth } + } __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real phase = 1.0) const { @@ -1546,7 +1559,7 @@ namespace quda { real tmp[reconLen]; #pragma unroll - for (int i=0; i(gauge, parity * offset + (dir * M + i) * stride + x); // second do copy converting into register type @@ -1570,12 +1583,12 @@ namespace quda { reconstruct.Pack(tmp, v); #pragma unroll - for (int i=0; i(&vecTmp)[j], tmp[i*N+j]); - // second do vectorized copy into memory + for (int j = 0; j < N; j++) copy(reinterpret_cast(&vecTmp)[j], tmp[i * N + j]); + // second do vectorized copy into memory vector_store(gauge, parity * offset + x + (dir * M + i) * stride, vecTmp); } if constexpr (hasPhase) { @@ -1585,14 +1598,14 @@ namespace quda { } /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. 
*/ __device__ __host__ inline auto operator()(int dim, int x_cb, int parity, real phase = 1.0) const { @@ -1601,7 +1614,7 @@ namespace quda { __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const { - if (!ghost[dir]) { // load from main field not separate array + if (!ghost[dir]) { // load from main field not separate array load(v, volumeCB + x, dir, parity, inphase); // an offset of size volumeCB puts us at the padded region // This also works perfectly when phases are stored. No need to change this. } else { @@ -1609,10 +1622,10 @@ namespace quda { real tmp[reconLen]; #pragma unroll - for (int i=0; i( - ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), i * faceVolumeCB[dir] + x); + for (int i = 0; i < M; i++) { + // first do vectorized copy from memory into registers + Vector vecTmp = vector_load(ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), + i * faceVolumeCB[dir] + x); // second do copy converting into register type #pragma unroll for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast(&vecTmp)[j]); @@ -1634,7 +1647,7 @@ namespace quda { __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity) const { - if (!ghost[dir]) { // store in main field not separate array + if (!ghost[dir]) { // store in main field not separate array save(v, volumeCB + x, dir, parity); // an offset of size volumeCB puts us at the padded region } else { const int M = reconLen / N; @@ -1642,13 +1655,13 @@ namespace quda { reconstruct.Pack(tmp, v); #pragma unroll - for (int i=0; i(&vecTmp)[j], tmp[i*N+j]); - // second do vectorized copy into memory - vector_store(ghost[dir]+parity*faceVolumeCB[dir]*(M*N + hasPhase), i*faceVolumeCB[dir]+x, vecTmp); + for (int j = 0; j < N; j++) copy(reinterpret_cast(&vecTmp)[j], tmp[i * N + j]); + // second do vectorized copy into memory + vector_store(ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), i 
* faceVolumeCB[dir] + x, vecTmp); } if constexpr (hasPhase) { @@ -1698,14 +1711,15 @@ namespace quda { real tmp[reconLen]; #pragma unroll - for (int i=0; i(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase), - +i*R[dim]*faceVolumeCB[dim]+buff_idx); - // second do copy converting into register type + for (int i = 0; i < M; i++) { + // first do vectorized copy from memory + Vector vecTmp = vector_load( + ghost[dim] + ((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + hasPhase), + +i * R[dim] * faceVolumeCB[dim] + buff_idx); + // second do copy converting into register type #pragma unroll - for (int j=0; j(&vecTmp)[j]); - } + for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast(&vecTmp)[j]); + } real phase = 0.; if constexpr (hasPhase) copy(phase, @@ -1724,119 +1738,118 @@ namespace quda { reconstruct.Pack(tmp, v); #pragma unroll - for (int i=0; i(&vecTmp)[j], tmp[i*N+j]); - // second do vectorized copy to memory - vector_store(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase), - i*R[dim]*faceVolumeCB[dim]+buff_idx, vecTmp); - } - if constexpr (hasPhase) { - real phase = reconstruct.getPhase(v); - copy(ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1) - + R[dim] * faceVolumeCB[dim] * M * N + buff_idx], - static_cast(0.5) * phase); - } + for (int j = 0; j < N; j++) copy(reinterpret_cast(&vecTmp)[j], tmp[i * N + j]); + // second do vectorized copy to memory + vector_store(ghost[dim] + ((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + hasPhase), + i * R[dim] * faceVolumeCB[dim] + buff_idx, vecTmp); + } + if constexpr (hasPhase) { + real phase = reconstruct.getPhase(v); + copy(ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1) + + R[dim] * faceVolumeCB[dim] * M * N + buff_idx], + static_cast(0.5) * phase); + } } size_t Bytes() const { return reconLen * sizeof(Float); 
} - }; + }; - /** - @brief The LegacyOrder defines the ghost zone storage and ordering for - all cpuGaugeFields, which use the same ghost zone storage. - */ - template struct LegacyOrder { - static constexpr int length = length_; - using Accessor = LegacyOrder; - using store_t = Float; - using real = typename mapper::type; - using complex = complex; - Float *ghost[QUDA_MAX_DIM]; - int faceVolumeCB[QUDA_MAX_DIM]; - const int volumeCB; - const int stride; - const int geometry; - const int hasPhase; - - LegacyOrder(const GaugeField &u, Float **ghost_) : - volumeCB(u.VolumeCB()), - stride(u.Stride()), - geometry(u.Geometry()), - hasPhase(0) - { - if (geometry == QUDA_COARSE_GEOMETRY) - errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); + /** + @brief The LegacyOrder defines the ghost zone storage and ordering for + all cpuGaugeFields, which use the same ghost zone storage. + */ + template struct LegacyOrder { + static constexpr int length = length_; + using Accessor = LegacyOrder; + using store_t = Float; + using real = typename mapper::type; + using complex = complex; + Float *ghost[QUDA_MAX_DIM]; + int faceVolumeCB[QUDA_MAX_DIM]; + const int volumeCB; + const int stride; + const int geometry; + const int hasPhase; - for (int i = 0; i < 4; i++) { - ghost[i] = (ghost_) ? 
ghost_[i] : (Float *)(u.Ghost()[i]); - faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth - } - } + LegacyOrder(const GaugeField &u, Float **ghost_) : + volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry()), hasPhase(0) + { + if (geometry == QUDA_COARSE_GEOMETRY) + errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); - __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - auto in = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length]; - block_load(v, reinterpret_cast(in)); + for (int i = 0; i < 4; i++) { + ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i]); + faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth } + } - __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity) - { - auto out = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length]; - block_store(reinterpret_cast(out), v); - } + __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + auto in = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length]; + block_load(v, reinterpret_cast(in)); + } - /** - @brief This accessor routine returns a const gauge_ghost_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] ghost_idx Ghost index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_ghost_wrapper that curries in access to - this field at the above coordinates. 
- */ - __device__ __host__ inline const gauge_ghost_wrapper Ghost(int dim, int ghost_idx, int parity, - real phase = 1.0) const - { - return gauge_ghost_wrapper(const_cast(*this), dim, ghost_idx, parity, phase); - } + __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity) + { + auto out = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length]; + block_store(reinterpret_cast(out), v); + } - __device__ __host__ inline void loadGhostEx(complex v[length / 2], int x, int, int dir, int dim, int g, - int parity, const int R[]) const - { - auto in = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length]; - block_load(v, reinterpret_cast(in)); - } + /** + @brief This accessor routine returns a const gauge_ghost_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] ghost_idx Ghost index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_ghost_wrapper that curries in access to + this field at the above coordinates. 
+ */ + __device__ __host__ inline const gauge_ghost_wrapper Ghost(int dim, int ghost_idx, int parity, + real phase = 1.0) const + { + return gauge_ghost_wrapper(const_cast(*this), dim, ghost_idx, parity, phase); + } - __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int x, int, int dir, int dim, int g, - int parity, const int R[]) const - { - auto out = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length]; - block_store(reinterpret_cast(out), v); - } - }; + __device__ __host__ inline void loadGhostEx(complex v[length / 2], int x, int, int dir, int dim, int g, + int parity, const int R[]) const + { + auto in = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length]; + block_load(v, reinterpret_cast(in)); + } + + __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int x, int, int dir, int dim, int g, + int parity, const int R[]) const + { + auto out = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length]; + block_store(reinterpret_cast(out), v); + } + }; /** struct to define QDP ordered gauge fields: [[dim]] [[parity][volumecb][row][col]] */ - template struct QDPOrder : public LegacyOrder { + template struct QDPOrder : public LegacyOrder { using Accessor = QDPOrder; using real = typename mapper::type; using complex = complex; Float *gauge[QUDA_MAX_DIM]; const int volumeCB; - QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) - : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) - { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; } + QDPOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) + { + for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? 
((Float **)gauge_)[i] : ((Float **)u.Gauge_p())[i]; + } - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - auto in = &gauge[dir][(parity * volumeCB + x) * length]; - block_load(v, reinterpret_cast(in)); + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + auto in = &gauge[dir][(parity * volumeCB + x) * length]; + block_load(v, reinterpret_cast(in)); } __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const @@ -1846,14 +1859,14 @@ namespace quda { } /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. 
*/ __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const { @@ -1867,22 +1880,24 @@ namespace quda { struct to define QDPJIT ordered gauge fields: [[dim]] [[parity][complex][row][col][volumecb]] */ - template struct QDPJITOrder : public LegacyOrder { + template struct QDPJITOrder : public LegacyOrder { using Accessor = QDPJITOrder; using real = typename mapper::type; using complex = complex; Float *gauge[QUDA_MAX_DIM]; const int volumeCB; - QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) - : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) - { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; } + QDPJITOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) + { + for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : ((Float **)u.Gauge_p())[i]; + } - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - for (int i = 0; i < length / 2; i++) { - v[i].real((real)gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x]); - v[i].imag((real)gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x]); - } + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + for (int i = 0; i < length / 2; i++) { + v[i].real((real)gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x]); + v[i].imag((real)gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x]); + } } __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const @@ -1894,14 +1909,14 @@ namespace quda { } /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. 
- @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. */ __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const { @@ -1911,186 +1926,190 @@ namespace quda { size_t Bytes() const { return length * sizeof(Float); } }; - /** - struct to define MILC ordered gauge fields: - [parity][dim][volumecb][row][col] - */ - template struct MILCOrder : public LegacyOrder { - using Accessor = MILCOrder; - using real = typename mapper::type; - using complex = complex; - Float *gauge; - const int volumeCB; - const int geometry; - MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : - LegacyOrder(u, ghost_), gauge(gauge_ ? 
gauge_ : (Float*)u.Gauge_p()), - volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; } - - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - block_load(v, reinterpret_cast(in)); - } - - __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const - { - auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - block_store(reinterpret_cast(out), v); - } - /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. + struct to define MILC ordered gauge fields: + [parity][dim][volumecb][row][col] */ - __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const - { - return gauge_wrapper(const_cast(*this), dim, x_cb, parity); - } + template struct MILCOrder : public LegacyOrder { + using Accessor = MILCOrder; + using real = typename mapper::type; + using complex = complex; + Float *gauge; + const int volumeCB; + const int geometry; + MILCOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), + gauge(gauge_ ? 
gauge_ : (Float *)u.Gauge_p()), + volumeCB(u.VolumeCB()), + geometry(u.Geometry()) + { + ; + } - size_t Bytes() const { return length * sizeof(Float); } - }; + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + block_load(v, reinterpret_cast(in)); + } - /** - @brief struct to define gauge fields packed into an opaque MILC site struct: - - struct { - char padding[offset]; - Float [dim][row][col]; - } site; - - site lattice [parity][volumecb]; - - We are just passed the size of the struct and the offset to the - required matrix elements. Typically, it is expected that this - accessor will be used with zero-copy memory to the original - allocation in MILC. - */ - template struct MILCSiteOrder : public LegacyOrder { - using Accessor = MILCSiteOrder; - using real = typename mapper::type; - using complex = complex; - Float *gauge; - const int volumeCB; - const int geometry; - const size_t offset; - const size_t size; - MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), - gauge(gauge_ ? 
gauge_ : (Float *)u.Gauge_p()), - volumeCB(u.VolumeCB()), - geometry(u.Geometry()), - offset(u.SiteOffset()), - size(u.SiteSize()) - { - if ((uintptr_t)((char *)gauge + offset) % 16 != 0) { errorQuda("MILC structure has misaligned offset"); } - } + __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const + { + auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + block_store(reinterpret_cast(out), v); + } - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - // get base pointer - auto in = reinterpret_cast(reinterpret_cast(gauge) + (parity * volumeCB + x) * size - + offset + dir * length * sizeof(Float)); - block_load(v, reinterpret_cast(in)); - } + /** + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. + */ + __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const + { + return gauge_wrapper(const_cast(*this), dim, x_cb, parity); + } - __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const - { - // get base pointer - auto out = reinterpret_cast(reinterpret_cast(gauge) + (parity * volumeCB + x) * size + offset - + dir * length * sizeof(Float)); - block_store(reinterpret_cast(out), v); - } + size_t Bytes() const { return length * sizeof(Float); } + }; /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. 
- @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. - */ - __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const - { - return gauge_wrapper(const_cast(*this), dim, x_cb, parity); - } + @brief struct to define gauge fields packed into an opaque MILC site struct: - size_t Bytes() const { return length * sizeof(Float); } - }; + struct { + char padding[offset]; + Float [dim][row][col]; + } site; + site lattice [parity][volumecb]; - /** - struct to define CPS ordered gauge fields: - [parity][dim][volumecb][col][row] - */ - template struct CPSOrder : LegacyOrder { - using Accessor = CPSOrder; - using real = typename mapper::type; - using complex = complex; - Float *gauge; - const int volumeCB; - const real anisotropy; - const real anisotropy_inv; - static constexpr int Nc = 3; - const int geometry; - CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - volumeCB(u.VolumeCB()), - anisotropy(u.Anisotropy()), - anisotropy_inv(1.0 / anisotropy), - geometry(u.Geometry()) - { - if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); - } + We are just passed the size of the struct and the offset to the + required matrix elements. Typically, it is expected that this + accessor will be used with zero-copy memory to the original + allocation in MILC. + */ + template struct MILCSiteOrder : public LegacyOrder { + using Accessor = MILCSiteOrder; + using real = typename mapper::type; + using complex = complex; + Float *gauge; + const int volumeCB; + const int geometry; + const size_t offset; + const size_t size; + MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), + gauge(gauge_ ? 
gauge_ : (Float *)u.Gauge_p()), + volumeCB(u.VolumeCB()), + geometry(u.Geometry()), + offset(u.SiteOffset()), + size(u.SiteSize()) + { + if ((uintptr_t)((char *)gauge + offset) % 16 != 0) { errorQuda("MILC structure has misaligned offset"); } + } - // we need to transpose and scale for CPS ordering - __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const - { - auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - complex v_[9]; - block_load(v_, reinterpret_cast(in)); + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + // get base pointer + auto in = reinterpret_cast(reinterpret_cast(gauge) + (parity * volumeCB + x) * size + + offset + dir * length * sizeof(Float)); + block_load(v, reinterpret_cast(in)); + } - for (int i=0; i(reinterpret_cast(gauge) + (parity * volumeCB + x) * size + offset + + dir * length * sizeof(Float)); + block_store(reinterpret_cast(out), v); } - } - __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity) const - { - auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - complex v_[9]; - for (int i=0; i(const_cast(*this), dim, x_cb, parity); } - block_store(reinterpret_cast(out), v_); - } + size_t Bytes() const { return length * sizeof(Float); } + }; /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. 
+ struct to define CPS ordered gauge fields: + [parity][dim][volumecb][col][row] */ - __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const - { - return gauge_wrapper(const_cast(*this), dim, x_cb, parity); - } + template struct CPSOrder : LegacyOrder { + using Accessor = CPSOrder; + using real = typename mapper::type; + using complex = complex; + Float *gauge; + const int volumeCB; + const real anisotropy; + const real anisotropy_inv; + static constexpr int Nc = 3; + const int geometry; + CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), + gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + volumeCB(u.VolumeCB()), + anisotropy(u.Anisotropy()), + anisotropy_inv(1.0 / anisotropy), + geometry(u.Geometry()) + { + if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); + } - size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } - }; + // we need to transpose and scale for CPS ordering + __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const + { + auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + complex v_[9]; + block_load(v_, reinterpret_cast(in)); + + for (int i = 0; i < Nc; i++) { + for (int j = 0; j < Nc; j++) { v[i * Nc + j] = v_[j * Nc + i] * anisotropy_inv; } + } + } + + __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity) const + { + auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + complex v_[9]; + for (int i = 0; i < Nc; i++) { + for (int j = 0; j < Nc; j++) { v_[i * Nc + j] = v[j * Nc + i] * anisotropy; } + } + + block_store(reinterpret_cast(out), v_); + } + + /** + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. 
+ @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. + */ + __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const + { + return gauge_wrapper(const_cast(*this), dim, x_cb, parity); + } + + size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } + }; /** @brief struct to define BQCD ordered gauge fields: @@ -2099,7 +2118,7 @@ namespace quda { variables in and extended fields with inline halos [mu][parity][volumecb+halos][col][row] */ - template struct BQCDOrder : LegacyOrder { + template struct BQCDOrder : LegacyOrder { using Accessor = BQCDOrder; using real = typename mapper::type; using complex = complex; @@ -2108,14 +2127,12 @@ namespace quda { int exVolumeCB; // extended checkerboard volume static constexpr int Nc = 3; BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - volumeCB(u.VolumeCB()) + LegacyOrder(u, ghost_), gauge(gauge_ ? 
gauge_ : (Float *)u.Gauge_p()), volumeCB(u.VolumeCB()) { if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); // compute volumeCB + halo region - exVolumeCB = u.X()[0]/2 + 2; - for (int i=1; i<4; i++) exVolumeCB *= u.X()[i] + 2; + exVolumeCB = u.X()[0] / 2 + 2; + for (int i = 1; i < 4; i++) exVolumeCB *= u.X()[i] + 2; } // we need to transpose for BQCD ordering @@ -2163,7 +2180,7 @@ namespace quda { @brief struct to define TIFR ordered gauge fields: [mu][parity][volumecb][col][row] */ - template struct TIFROrder : LegacyOrder { + template struct TIFROrder : LegacyOrder { using Accessor = TIFROrder; using real = typename mapper::type; using complex = complex; @@ -2227,7 +2244,7 @@ namespace quda { struct to define TIFR ordered gauge fields (with inlined z halo of depth two): [mu][parity][t][z+4][y][x/2][col][row] */ - template struct TIFRPaddedOrder : LegacyOrder { + template struct TIFRPaddedOrder : LegacyOrder { using Accessor = TIFRPaddedOrder; using real = typename mapper::type; using complex = complex; @@ -2252,22 +2269,23 @@ namespace quda { if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); // exVolumeCB is the padded checkboard volume - for (int i=0; i<4; i++) exVolumeCB *= exDim[i]; - exVolumeCB /= 2; + for (int i = 0; i < 4; i++) exVolumeCB *= exDim[i]; + exVolumeCB /= 2; } /** - @brief Compute the index into the padded field. Assumes that - parity doesn't change from unpadded to padded. + @brief Compute the index into the padded field. Assumes that + parity doesn't change from unpadded to padded. 
*/ - __device__ __host__ inline int getPaddedIndex(int x_cb, int parity) const { - // find coordinates - int coord[4]; - getCoords(coord, x_cb, dim, parity); + __device__ __host__ inline int getPaddedIndex(int x_cb, int parity) const + { + // find coordinates + int coord[4]; + getCoords(coord, x_cb, dim, parity); - // get z-extended index - coord[2] += 2; // offset for halo - return linkIndex(coord, exDim); + // get z-extended index + coord[2] += 2; // offset for halo + return linkIndex(coord, exDim); } // we need to transpose for TIFR ordering @@ -2314,7 +2332,6 @@ namespace quda { size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } }; - /** struct to define OpenQCD ordered gauge fields: [volumecb][dim][parity*][row][col] parity*: uplink/downlink (link attached to closest odd site) @@ -2331,148 +2348,79 @@ namespace quda { OpenQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - volumeCB(u.VolumeCB()), // NOTE: Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice + volumeCB( + u.VolumeCB()), // NOTE: Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]} // GLOBAL dimensions { if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); } - // For reference: https://gitlab.com/rcstar/openQxD/-/blob/master/main/README.global - // TODO: packet this function + // TODO: make this function // __device__ __host__ inline int QUDAtoOpenQxD(int x_cb_QUDA, int dir_QUDA, int parity_QUDA) const - // { - - // } - // TODO: Implement ipt and iup functions - - -/******* Previous version: */ - // // fields are only defined for odd points - // // The pointer to the - // // link variable U(x,mu) at any given *odd* point x is then - // // ud+8*(ix-VOLUME/2)+2*mu - // // while - // // ud+8*(ix-VOLUME/2)+2*mu+1 - // // is the pointer to the link variable U(x-mu,mu), 
where ix denotes the label of - // // x. All link variables that constitute the local gauge field can thus be - // // accessed in this simple way. - // // see https://gitlab.com/rcstar/openQxD/-/blob/master/main/README.global - // // typedef struct - // // { - // // complex c11,c12,c13,c21,c22,c23,c31,c32,c33; - // // } su3; - - // __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const // { - // if (parity == 1) { // odd points can be loaded directly - // auto in = &gauge[(8 * x + 2 * dir) * length]; // FIXME: what's x? - // block_load(v, reinterpret_cast(in)); - // } else { - // // gauge field for even points needs to be fetched from odd points, some indexing fun - // // ud+8*(ix-VOLUME/2)+2*mu+1 - // // is the pointer to the link variable U(x-mu,mu), - // // Mathias: so to get U(x,mu) for even x we need to load U(ix,mu) with ix=x+mu - // int coord[4]; - // getCoords(coord, x, dim, 1); // From here, x is the checkerboard index - // int xmu = linkIndexP1(coord, dim, dir); // TODO: What about on boundaries?, do we need to index into them? - - // auto in = &gauge[(8 * xmu + 2 * dir + 1) * length]; // TODO: xmu is not ipt[iy] - // block_load(v, reinterpret_cast(in)); - // } + // } -/******* End of Previous version: */ - // TODO: even if x, dir, parity are the global indices, the conversion from global to local may run into problems - __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const // FIXME: What's this "Float = 1.0" for ? + __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, + Float = 1.0) const { - // FIX: what's x? (cf. index_helper.cuh) - // With ''natural'' order: lexicographical 0123 = txyz , t fastest, links 0123 = txyz in pos directions - + // Indexing fun: - int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
+ int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) - getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site + getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site - // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; + // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; // TODO: Determine whether coord[mu] is local or global - int iy_OpenQxD = coord[2] + dim[2]*coord[1] + dim[2]*dim[1]*coord[0] + dim[0]*dim[2]*dim[1]*coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - coord1 in QUDA is x2 in OpenQxD (y) - coord2 in QUDA is x3 in OpenQxD (z) - coord3 in QUDA is x0 in OpenQxD (t) - */ + int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; + /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) + coord1 in QUDA is x2 in OpenQxD (y) + coord2 in QUDA is x3 in OpenQxD (z) + coord3 in QUDA is x0 in OpenQxD (t) + */ // int ix_OpenQxD = ipt[iy_OpenQxD]; - int dir_OpenQxD = (dir + 1)%4; // rotation of axes QUDA -> OpenQxD - + int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD // Loading as per QUDA style - auto in = &gauge[ (4*iy_OpenQxD + dir_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) - // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) + auto in + = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 18 + // doubles = 9 complex doubles = 1 su3dble struct) + // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed + // within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) block_load(v, reinterpret_cast(in)); 
- - // if (parity == 1) { // odd points can be loaded directly - - // // Indexing fun: - // int coord[4]; // declare a 4D vector x0, x1, x2, x3 - // getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site - // // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; - // int iy_OpenQxD = coord[2] + dim[2]*coord[1] + dim[2]*dim[1]*coord[0] + dim[0]*dim[2]*dim[1]*coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - // coord1 in QUDA is x2 in OpenQxD (y) - // coord2 in QUDA is x3 in OpenQxD (z) - // coord3 in QUDA is x0 in OpenQxD (t) - // */ - // int ix_OpenQxD = ipt[iy_OpenQxD]; // ipt mapping - // int dir_OpenQxD = (dir + 1)%4; // rotation of axes QUDA -> OpenQxD - - - // // Loading as per QUDA style - // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) - // block_load(v, reinterpret_cast(in)); - - // } else if (parity ==0) { - - // // More indexing fun: - // int coord[4]; // declare a 4D vector x0, x1, x2, x3 - // getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site - // // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; - // int iy_OpenQxD = coord[2] + dim[2]*coord[1] + dim[2]*dim[1]*coord[0] + dim[0]*dim[2]*dim[1]*coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - // coord1 in QUDA is x2 in OpenQxD (y) - // coord2 in QUDA is x3 in OpenQxD (z) - // coord3 in QUDA is x0 in OpenQxD (t) - // In OpenQxD, z runs the fastest, (txyz order) - // In QUDA, I think t runs the fastest (xyzt order) - // Or should it be (zyxt ??) 
maybe FIXME: - // */ - // int ix_OpenQxD = ipt[iy_OpenQxD]; // ipt mapping - // int dir_OpenQxD = (dir + 1)%4; // rotation of axes QUDA -> OpenQxD - - // int ix_OpenQxD_shifted = iup[ix_OpenQxD][dir_OpenQxD]; // obtain neighboring index - - // // int xmu = linkIndexP1(coord, dim, dir); // Maybe for later FIXME: What about on boundaries?, do we need to index into them? - - // // Loading as per QUDA style - // auto in = &gauge[ (8*(ix_OpenQxD_shifted - volumeCB) + 2*dir_OpenQxD + 1)* length]; // This is how they're accessed within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) - // block_load(v, reinterpret_cast(in)); - - // } else { - // std::cout << "This shouldn't happen!!: Error in parity OpenQxD Order interface" << std::endl; - // } } __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity) const { - // auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - // complex v_[9]; - // for (int i=0; i(reinterpret_cast(out), v_); + // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; + // TODO: Determine whether coord[mu] is local or global + /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) + coord1 in QUDA is x2 in OpenQxD (y) + coord2 in QUDA is x3 in OpenQxD (z) + coord3 in QUDA is x0 in OpenQxD (t) + */ + int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; + + // int ix_OpenQxD = ipt[iy_OpenQxD]; + int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD + + // Loading as per QUDA style + // This is how they're accessed within OpenQxd (length = 18 + // doubles = 9 complex doubles = 1 su3dble struct) + auto out = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; + // within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) + block_store(reinterpret_cast(out), v); } /** @@ -2490,7 +2438,10 @@ namespace quda { return gauge_wrapper(const_cast(*this), dim, x_cb, parity); } - size_t Bytes() const 
{ return Nc * Nc * 2 * sizeof(Float); } // Double => Float = 1.0 => 1 byte per float, 18 floats per complex 3x3 matrix + size_t Bytes() const + { + return Nc * Nc * 2 * sizeof(Float); + } // Double => Float = 1.0 => 1 byte per float, 18 floats per complex 3x3 matrix }; } // namespace gauge @@ -2633,16 +2584,31 @@ namespace quda { typedef gauge::QDPOrder type; }; - template struct gauge_order_mapper { }; - template struct gauge_order_mapper { typedef gauge::QDPOrder type; }; - template struct gauge_order_mapper { typedef gauge::QDPJITOrder type; }; - template struct gauge_order_mapper { typedef gauge::MILCOrder type; }; + template struct gauge_order_mapper { + }; + template struct gauge_order_mapper { + typedef gauge::QDPOrder type; + }; + template struct gauge_order_mapper { + typedef gauge::QDPJITOrder type; + }; + template struct gauge_order_mapper { + typedef gauge::MILCOrder type; + }; template struct gauge_order_mapper { typedef gauge::CPSOrder type; }; - template struct gauge_order_mapper { typedef gauge::BQCDOrder type; }; - template struct gauge_order_mapper { typedef gauge::TIFROrder type; }; - template struct gauge_order_mapper { typedef gauge::TIFRPaddedOrder type; }; - template struct gauge_order_mapper { typedef gauge::FloatNOrder type; }; + template struct gauge_order_mapper { + typedef gauge::BQCDOrder type; + }; + template struct gauge_order_mapper { + typedef gauge::TIFROrder type; + }; + template struct gauge_order_mapper { + typedef gauge::TIFRPaddedOrder type; + }; + template struct gauge_order_mapper { + typedef gauge::FloatNOrder type; + }; } // namespace quda diff --git a/include/quda.h b/include/quda.h index a2d5353a1a..e574de7293 100644 --- a/include/quda.h +++ b/include/quda.h @@ -24,1695 +24,1682 @@ extern "C" { #endif - /** - * Parameters having to do with the gauge field or the - * interpretation of the gauge field by various Dirac operators - */ - typedef struct QudaGaugeParam_s { - size_t struct_size; /**< Size of this struct in 
bytes. Used to ensure that the host application and QUDA see the same struct size */ - QudaFieldLocation location; /**< The location of the gauge field */ - - int X[4]; /**< The local space-time dimensions (without checkboarding) */ - - double anisotropy; /**< Used for Wilson and Wilson-clover */ - double tadpole_coeff; /**< Used for staggered only */ - double scale; /**< Used by staggered long links */ - - QudaLinkType type; /**< The link type of the gauge field (e.g., Wilson, fat, long, etc.) */ - QudaGaugeFieldOrder gauge_order; /**< The ordering on the input gauge field */ - - QudaTboundary t_boundary; /**< The temporal boundary condition that will be used for fermion fields */ - - QudaPrecision cpu_prec; /**< The precision used by the caller */ - - QudaPrecision cuda_prec; /**< The precision of the cuda gauge field */ - QudaReconstructType reconstruct; /**< The reconstruction type of the cuda gauge field */ +/** + * Parameters having to do with the gauge field or the + * interpretation of the gauge field by various Dirac operators + */ +typedef struct QudaGaugeParam_s { + size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and QUDA see the same struct size */ + QudaFieldLocation location; /**< The location of the gauge field */ - QudaPrecision cuda_prec_sloppy; /**< The precision of the sloppy gauge field */ - QudaReconstructType reconstruct_sloppy; /**< The recontruction type of the sloppy gauge field */ + int X[4]; /**< The local space-time dimensions (without checkboarding) */ - QudaPrecision cuda_prec_refinement_sloppy; /**< The precision of the sloppy gauge field for the refinement step in multishift */ - QudaReconstructType reconstruct_refinement_sloppy; /**< The recontruction type of the sloppy gauge field for the refinement step in multishift*/ + double anisotropy; /**< Used for Wilson and Wilson-clover */ + double tadpole_coeff; /**< Used for staggered only */ + double scale; /**< Used by staggered long links */ - QudaPrecision cuda_prec_precondition; /**< The precision of the preconditioner gauge field */ - QudaReconstructType reconstruct_precondition; /**< The recontruction type of the preconditioner gauge field */ + QudaLinkType type; /**< The link type of the gauge field (e.g., Wilson, fat, long, etc.) 
*/ + QudaGaugeFieldOrder gauge_order; /**< The ordering on the input gauge field */ - QudaPrecision cuda_prec_eigensolver; /**< The precision of the eigensolver gauge field */ - QudaReconstructType reconstruct_eigensolver; /**< The recontruction type of the eigensolver gauge field */ + QudaTboundary t_boundary; /**< The temporal boundary condition that will be used for fermion fields */ - QudaGaugeFixed gauge_fix; /**< Whether the input gauge field is in the axial gauge or not */ + QudaPrecision cpu_prec; /**< The precision used by the caller */ - int ga_pad; /**< The pad size that the cudaGaugeField will use (default=0) */ + QudaPrecision cuda_prec; /**< The precision of the cuda gauge field */ + QudaReconstructType reconstruct; /**< The reconstruction type of the cuda gauge field */ - int site_ga_pad; /**< Used by link fattening and the gauge and fermion forces */ + QudaPrecision cuda_prec_sloppy; /**< The precision of the sloppy gauge field */ + QudaReconstructType reconstruct_sloppy; /**< The recontruction type of the sloppy gauge field */ - int staple_pad; /**< Used by link fattening */ - int llfat_ga_pad; /**< Used by link fattening */ - int mom_ga_pad; /**< Used by the gauge and fermion forces */ + QudaPrecision + cuda_prec_refinement_sloppy; /**< The precision of the sloppy gauge field for the refinement step in multishift */ + QudaReconstructType reconstruct_refinement_sloppy; /**< The recontruction type of the sloppy gauge field for the + refinement step in multishift*/ - QudaStaggeredPhase staggered_phase_type; /**< Set the staggered phase type of the links */ - int staggered_phase_applied; /**< Whether the staggered phase has already been applied to the links */ + QudaPrecision cuda_prec_precondition; /**< The precision of the preconditioner gauge field */ + QudaReconstructType reconstruct_precondition; /**< The recontruction type of the preconditioner gauge field */ - double i_mu; /**< Imaginary chemical potential */ + QudaPrecision 
cuda_prec_eigensolver; /**< The precision of the eigensolver gauge field */ + QudaReconstructType reconstruct_eigensolver; /**< The recontruction type of the eigensolver gauge field */ - int overlap; /**< Width of overlapping domains */ + QudaGaugeFixed gauge_fix; /**< Whether the input gauge field is in the axial gauge or not */ - int overwrite_gauge; /**< When computing gauge, should we overwrite it or accumulate to it */ - int overwrite_mom; /**< When computing momentum, should we overwrite it or accumulate to it */ + int ga_pad; /**< The pad size that the cudaGaugeField will use (default=0) */ - int use_resident_gauge; /**< Use the resident gauge field as input */ - int use_resident_mom; /**< Use the resident momentum field as input*/ - int make_resident_gauge; /**< Make the result gauge field resident */ - int make_resident_mom; /**< Make the result momentum field resident */ - int return_result_gauge; /**< Return the result gauge field */ - int return_result_mom; /**< Return the result momentum field */ + int site_ga_pad; /**< Used by link fattening and the gauge and fermion forces */ - size_t gauge_offset; /**< Offset into MILC site struct to the gauge field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ - size_t mom_offset; /**< Offset into MILC site struct to the momentum field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ - size_t site_size; /**< Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ - } QudaGaugeParam; + int staple_pad; /**< Used by link fattening */ + int llfat_ga_pad; /**< Used by link fattening */ + int mom_ga_pad; /**< Used by the gauge and fermion forces */ + QudaStaggeredPhase staggered_phase_type; /**< Set the staggered phase type of the links */ + int staggered_phase_applied; /**< Whether the staggered phase has already been applied to the links */ - /** - * Parameters relating to the solver and the choice of Dirac operator. 
- */ - typedef struct QudaInvertParam_s { + double i_mu; /**< Imaginary chemical potential */ - /** Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ - size_t struct_size; + int overlap; /**< Width of overlapping domains */ - QudaFieldLocation input_location; /**< The location of the input field */ - QudaFieldLocation output_location; /**< The location of the output field */ + int overwrite_gauge; /**< When computing gauge, should we overwrite it or accumulate to it */ + int overwrite_mom; /**< When computing momentum, should we overwrite it or accumulate to it */ - QudaDslashType dslash_type; /**< The Dirac Dslash type that is being used */ - QudaInverterType inv_type; /**< Which linear solver to use */ + int use_resident_gauge; /**< Use the resident gauge field as input */ + int use_resident_mom; /**< Use the resident momentum field as input*/ + int make_resident_gauge; /**< Make the result gauge field resident */ + int make_resident_mom; /**< Make the result momentum field resident */ + int return_result_gauge; /**< Return the result gauge field */ + int return_result_mom; /**< Return the result momentum field */ - double mass; /**< Used for staggered only */ - double kappa; /**< Used for Wilson and Wilson-clover */ + size_t gauge_offset; /**< Offset into MILC site struct to the gauge field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ + size_t mom_offset; /**< Offset into MILC site struct to the momentum field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ + size_t site_size; /**< Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ +} QudaGaugeParam; - double m5; /**< Domain wall height */ - int Ls; /**< Extent of the 5th dimension (for domain wall) */ +/** + * Parameters relating to the solver and the choice of Dirac operator. 
+ */ +typedef struct QudaInvertParam_s { - double_complex b_5[QUDA_MAX_DWF_LS]; /**< Mobius coefficients - only real part used if regular Mobius */ - double_complex c_5[QUDA_MAX_DWF_LS]; /**< Mobius coefficients - only real part used if regular Mobius */ + /** Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ + size_t struct_size; - /**< - * The following specifies the EOFA parameters. Notation follows arXiv:1706.05843 - * eofa_shift: the "\beta" in the paper - * eofa_pm: plus or minus for the EOFA operator - * mq1, mq2, mq3 are the three masses corresponds to Hasenbusch mass spliting. - * As far as I know mq1 is always the same as "mass" but it's here just for consistence. - * */ - double eofa_shift; - int eofa_pm; - double mq1; - double mq2; - double mq3; + QudaFieldLocation input_location; /**< The location of the input field */ + QudaFieldLocation output_location; /**< The location of the output field */ - double mu; /**< Twisted mass parameter */ - double tm_rho; /**< Hasenbusch mass shift applied like twisted mass to diagonal (but not inverse) */ - double epsilon; /**< Twisted mass parameter */ + QudaDslashType dslash_type; /**< The Dirac Dslash type that is being used */ + QudaInverterType inv_type; /**< Which linear solver to use */ - QudaTwistFlavorType twist_flavor; /**< Twisted mass flavor */ + double mass; /**< Used for staggered only */ + double kappa; /**< Used for Wilson and Wilson-clover */ - int laplace3D; /**< omit this direction from laplace operator: x,y,z,t -> 0,1,2,3 (-1 is full 4D) */ + double m5; /**< Domain wall height */ + int Ls; /**< Extent of the 5th dimension (for domain wall) */ - double tol; /**< Solver tolerance in the L2 residual norm */ - double tol_restart; /**< Solver tolerance in the L2 residual norm (used to restart InitCG) */ - double tol_hq; /**< Solver tolerance in the heavy quark residual norm */ + double_complex b_5[QUDA_MAX_DWF_LS]; /**< Mobius coefficients - only 
real part used if regular Mobius */ + double_complex c_5[QUDA_MAX_DWF_LS]; /**< Mobius coefficients - only real part used if regular Mobius */ - int compute_true_res; /** Whether to compute the true residual post solve */ - double true_res; /**< Actual L2 residual norm achieved in solver */ - double true_res_hq; /**< Actual heavy quark residual norm achieved in solver */ - int maxiter; /**< Maximum number of iterations in the linear solver */ - double reliable_delta; /**< Reliable update tolerance */ - double reliable_delta_refinement; /**< Reliable update tolerance used in post multi-shift solver refinement */ - int use_alternative_reliable; /**< Whether to use alternative reliable updates */ - int use_sloppy_partial_accumulator; /**< Whether to keep the partial solution accumuator in sloppy precision */ + /**< + * The following specifies the EOFA parameters. Notation follows arXiv:1706.05843 + * eofa_shift: the "\beta" in the paper + * eofa_pm: plus or minus for the EOFA operator + * mq1, mq2, mq3 are the three masses corresponds to Hasenbusch mass spliting. + * As far as I know mq1 is always the same as "mass" but it's here just for consistence. + * */ + double eofa_shift; + int eofa_pm; + double mq1; + double mq2; + double mq3; - /**< This parameter determines how often we accumulate into the - solution vector from the direction vectors in the solver. - E.g., running with solution_accumulator_pipeline = 4, means we - will update the solution vector every four iterations using the - direction vectors from the prior four iterations. This - increases performance of mixed-precision solvers since it means - less high-precision vector round-trip memory travel, but - requires more low-precision memory allocation. 
*/ - int solution_accumulator_pipeline; + double mu; /**< Twisted mass parameter */ + double tm_rho; /**< Hasenbusch mass shift applied like twisted mass to diagonal (but not inverse) */ + double epsilon; /**< Twisted mass parameter */ - /**< This parameter determines how many consecutive reliable update - residual increases we tolerate before terminating the solver, - i.e., how long do we want to keep trying to converge */ - int max_res_increase; + QudaTwistFlavorType twist_flavor; /**< Twisted mass flavor */ - /**< This parameter determines how many total reliable update - residual increases we tolerate before terminating the solver, - i.e., how long do we want to keep trying to converge */ - int max_res_increase_total; + int laplace3D; /**< omit this direction from laplace operator: x,y,z,t -> 0,1,2,3 (-1 is full 4D) */ - /**< This parameter determines how many consecutive heavy-quark - residual increases we tolerate before terminating the solver, - i.e., how long do we want to keep trying to converge */ - int max_hq_res_increase; + double tol; /**< Solver tolerance in the L2 residual norm */ + double tol_restart; /**< Solver tolerance in the L2 residual norm (used to restart InitCG) */ + double tol_hq; /**< Solver tolerance in the heavy quark residual norm */ - /**< This parameter determines how many total heavy-quark residual - restarts we tolerate before terminating the solver, i.e., how long - do we want to keep trying to converge */ - int max_hq_res_restart_total; + int compute_true_res; /** Whether to compute the true residual post solve */ + double true_res; /**< Actual L2 residual norm achieved in solver */ + double true_res_hq; /**< Actual heavy quark residual norm achieved in solver */ + int maxiter; /**< Maximum number of iterations in the linear solver */ + double reliable_delta; /**< Reliable update tolerance */ + double reliable_delta_refinement; /**< Reliable update tolerance used in post multi-shift solver refinement */ + int 
use_alternative_reliable; /**< Whether to use alternative reliable updates */ + int use_sloppy_partial_accumulator; /**< Whether to keep the partial solution accumuator in sloppy precision */ - /**< After how many iterations shall the heavy quark residual be updated */ - int heavy_quark_check; + /**< This parameter determines how often we accumulate into the + solution vector from the direction vectors in the solver. + E.g., running with solution_accumulator_pipeline = 4, means we + will update the solution vector every four iterations using the + direction vectors from the prior four iterations. This + increases performance of mixed-precision solvers since it means + less high-precision vector round-trip memory travel, but + requires more low-precision memory allocation. */ + int solution_accumulator_pipeline; - int pipeline; /**< Whether to use a pipelined solver with less global sums */ + /**< This parameter determines how many consecutive reliable update + residual increases we tolerate before terminating the solver, + i.e., how long do we want to keep trying to converge */ + int max_res_increase; - int num_offset; /**< Number of offsets in the multi-shift solver */ + /**< This parameter determines how many total reliable update + residual increases we tolerate before terminating the solver, + i.e., how long do we want to keep trying to converge */ + int max_res_increase_total; - int num_src; /**< Number of sources in the multiple source solver */ + /**< This parameter determines how many consecutive heavy-quark + residual increases we tolerate before terminating the solver, + i.e., how long do we want to keep trying to converge */ + int max_hq_res_increase; - int num_src_per_sub_partition; /**< Number of sources in the multiple source solver, but per sub-partition */ + /**< This parameter determines how many total heavy-quark residual + restarts we tolerate before terminating the solver, i.e., how long + do we want to keep trying to converge */ + int 
max_hq_res_restart_total; - /**< The grid of sub-partition according to which the processor grid will be partitioned. - Should have: - split_grid[0] * split_grid[1] * split_grid[2] * split_grid[3] * num_src_per_sub_partition == num_src. **/ - int split_grid[QUDA_MAX_DIM]; + /**< After how many iterations shall the heavy quark residual be updated */ + int heavy_quark_check; - int overlap; /**< Width of domain overlaps */ + int pipeline; /**< Whether to use a pipelined solver with less global sums */ - /** Offsets for multi-shift solver */ - double offset[QUDA_MAX_MULTI_SHIFT]; + int num_offset; /**< Number of offsets in the multi-shift solver */ - /** Solver tolerance for each offset */ - double tol_offset[QUDA_MAX_MULTI_SHIFT]; + int num_src; /**< Number of sources in the multiple source solver */ - /** Solver tolerance for each shift when refinement is applied using the heavy-quark residual */ - double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]; + int num_src_per_sub_partition; /**< Number of sources in the multiple source solver, but per sub-partition */ - /** Actual L2 residual norm achieved in solver for each offset */ - double true_res_offset[QUDA_MAX_MULTI_SHIFT]; + /**< The grid of sub-partition according to which the processor grid will be partitioned. + Should have: + split_grid[0] * split_grid[1] * split_grid[2] * split_grid[3] * num_src_per_sub_partition == num_src. 
**/ + int split_grid[QUDA_MAX_DIM]; - /** Iterated L2 residual norm achieved in multi shift solver for each offset */ - double iter_res_offset[QUDA_MAX_MULTI_SHIFT]; + int overlap; /**< Width of domain overlaps */ - /** Actual heavy quark residual norm achieved in solver for each offset */ - double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]; + /** Offsets for multi-shift solver */ + double offset[QUDA_MAX_MULTI_SHIFT]; - /** Residuals in the partial faction expansion */ - double residue[QUDA_MAX_MULTI_SHIFT]; + /** Solver tolerance for each offset */ + double tol_offset[QUDA_MAX_MULTI_SHIFT]; - /** Whether we should evaluate the action after the linear solver*/ - int compute_action; + /** Solver tolerance for each shift when refinement is applied using the heavy-quark residual */ + double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]; - /** Computed value of the bilinear action (complex-valued) - invert: \phi^\dagger A^{-1} \phi - multishift: \phi^\dagger r(x) \phi = \phi^\dagger (sum_k residue[k] * (A + offset[k])^{-1} ) \phi */ - double action[2]; + /** Actual L2 residual norm achieved in solver for each offset */ + double true_res_offset[QUDA_MAX_MULTI_SHIFT]; - QudaSolutionType solution_type; /**< Type of system to solve */ - QudaSolveType solve_type; /**< How to solve it */ - QudaMatPCType matpc_type; /**< The preconditioned matrix type */ - QudaDagType dagger; /**< Whether we are using the Hermitian conjugate system or not */ - QudaMassNormalization mass_normalization; /**< The mass normalization is being used by the caller */ - QudaSolverNormalization solver_normalization; /**< The normalization desired in the solver */ + /** Iterated L2 residual norm achieved in multi shift solver for each offset */ + double iter_res_offset[QUDA_MAX_MULTI_SHIFT]; - QudaPreserveSource preserve_source; /**< Preserve the source or not in the linear solver (deprecated) */ + /** Actual heavy quark residual norm achieved in solver for each offset */ + double 
true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]; - QudaPrecision cpu_prec; /**< The precision used by the input fermion fields */ - QudaPrecision cuda_prec; /**< The precision used by the QUDA solver */ - QudaPrecision cuda_prec_sloppy; /**< The precision used by the QUDA sloppy operator */ - QudaPrecision cuda_prec_refinement_sloppy; /**< The precision of the sloppy gauge field for the refinement step in multishift */ - QudaPrecision cuda_prec_precondition; /**< The precision used by the QUDA preconditioner */ - QudaPrecision cuda_prec_eigensolver; /**< The precision used by the QUDA eigensolver */ + /** Residuals in the partial faction expansion */ + double residue[QUDA_MAX_MULTI_SHIFT]; - QudaDiracFieldOrder dirac_order; /**< The order of the input and output fermion fields */ + /** Whether we should evaluate the action after the linear solver*/ + int compute_action; - QudaGammaBasis gamma_basis; /**< Gamma basis of the input and output host fields */ + /** Computed value of the bilinear action (complex-valued) + invert: \phi^\dagger A^{-1} \phi + multishift: \phi^\dagger r(x) \phi = \phi^\dagger (sum_k residue[k] * (A + offset[k])^{-1} ) \phi */ + double action[2]; - QudaFieldLocation clover_location; /**< The location of the clover field */ - QudaPrecision clover_cpu_prec; /**< The precision used for the input clover field */ - QudaPrecision clover_cuda_prec; /**< The precision used for the clover field in the QUDA solver */ - QudaPrecision clover_cuda_prec_sloppy; /**< The precision used for the clover field in the QUDA sloppy operator */ - QudaPrecision clover_cuda_prec_refinement_sloppy; /**< The precision of the sloppy clover field for the refinement step in multishift */ - QudaPrecision clover_cuda_prec_precondition; /**< The precision used for the clover field in the QUDA preconditioner */ - QudaPrecision clover_cuda_prec_eigensolver; /**< The precision used for the clover field in the QUDA eigensolver */ + QudaSolutionType solution_type; /**< Type of system to 
solve */ + QudaSolveType solve_type; /**< How to solve it */ + QudaMatPCType matpc_type; /**< The preconditioned matrix type */ + QudaDagType dagger; /**< Whether we are using the Hermitian conjugate system or not */ + QudaMassNormalization mass_normalization; /**< The mass normalization is being used by the caller */ + QudaSolverNormalization solver_normalization; /**< The normalization desired in the solver */ - QudaCloverFieldOrder clover_order; /**< The order of the input clover field */ - QudaUseInitGuess use_init_guess; /**< Whether to use an initial guess in the solver or not */ + QudaPreserveSource preserve_source; /**< Preserve the source or not in the linear solver (deprecated) */ - double clover_csw; /**< Csw coefficient of the clover term */ - double clover_coeff; /**< Coefficient of the clover term */ - double clover_rho; /**< Real number added to the clover diagonal (not to inverse) */ + QudaPrecision cpu_prec; /**< The precision used by the input fermion fields */ + QudaPrecision cuda_prec; /**< The precision used by the QUDA solver */ + QudaPrecision cuda_prec_sloppy; /**< The precision used by the QUDA sloppy operator */ + QudaPrecision + cuda_prec_refinement_sloppy; /**< The precision of the sloppy gauge field for the refinement step in multishift */ + QudaPrecision cuda_prec_precondition; /**< The precision used by the QUDA preconditioner */ + QudaPrecision cuda_prec_eigensolver; /**< The precision used by the QUDA eigensolver */ - int compute_clover_trlog; /**< Whether to compute the trace log of the clover term */ - double trlogA[2]; /**< The trace log of the clover term (even/odd computed separately) */ + QudaDiracFieldOrder dirac_order; /**< The order of the input and output fermion fields */ - int compute_clover; /**< Whether to compute the clover field */ - int compute_clover_inverse; /**< Whether to compute the clover inverse field */ - int return_clover; /**< Whether to copy back the clover matrix field */ - int return_clover_inverse; 
/**< Whether to copy back the inverted clover matrix field */ + QudaGammaBasis gamma_basis; /**< Gamma basis of the input and output host fields */ - QudaVerbosity verbosity; /**< The verbosity setting to use in the solver */ + QudaFieldLocation clover_location; /**< The location of the clover field */ + QudaPrecision clover_cpu_prec; /**< The precision used for the input clover field */ + QudaPrecision clover_cuda_prec; /**< The precision used for the clover field in the QUDA solver */ + QudaPrecision clover_cuda_prec_sloppy; /**< The precision used for the clover field in the QUDA sloppy operator */ + QudaPrecision clover_cuda_prec_refinement_sloppy; /**< The precision of the sloppy clover field for the refinement step in multishift */ + QudaPrecision clover_cuda_prec_precondition; /**< The precision used for the clover field in the QUDA preconditioner */ + QudaPrecision clover_cuda_prec_eigensolver; /**< The precision used for the clover field in the QUDA eigensolver */ - int iter; /**< The number of iterations performed by the solver */ - double gflops; /**< The Gflops rate of the solver */ - double secs; /**< The time taken by the solver */ + QudaCloverFieldOrder clover_order; /**< The order of the input clover field */ + QudaUseInitGuess use_init_guess; /**< Whether to use an initial guess in the solver or not */ - QudaTune tune; /**< Enable auto-tuning? 
(default = QUDA_TUNE_YES) */ + double clover_csw; /**< Csw coefficient of the clover term */ + double clover_coeff; /**< Coefficient of the clover term */ + double clover_rho; /**< Real number added to the clover diagonal (not to inverse) */ - /** Number of steps in s-step algorithms */ - int Nsteps; + int compute_clover_trlog; /**< Whether to compute the trace log of the clover term */ + double trlogA[2]; /**< The trace log of the clover term (even/odd computed separately) */ - /** Maximum size of Krylov space used by solver */ - int gcrNkrylov; + int compute_clover; /**< Whether to compute the clover field */ + int compute_clover_inverse; /**< Whether to compute the clover inverse field */ + int return_clover; /**< Whether to copy back the clover matrix field */ + int return_clover_inverse; /**< Whether to copy back the inverted clover matrix field */ - /* - * The following parameters are related to the solver - * preconditioner, if enabled. - */ + QudaVerbosity verbosity; /**< The verbosity setting to use in the solver */ - /** - * The inner Krylov solver used in the preconditioner. Set to - * QUDA_INVALID_INVERTER to disable the preconditioner entirely. - */ - QudaInverterType inv_type_precondition; + int iter; /**< The number of iterations performed by the solver */ + double gflops; /**< The Gflops rate of the solver */ + double secs; /**< The time taken by the solver */ - /** Preconditioner instance, e.g., multigrid */ - void *preconditioner; + QudaTune tune; /**< Enable auto-tuning? (default = QUDA_TUNE_YES) */ - /** Deflation instance */ - void *deflation_op; + /** Number of steps in s-step algorithms */ + int Nsteps; - /** defines deflation */ - void *eig_param; + /** Maximum size of Krylov space used by solver */ + int gcrNkrylov; - /** If true, deflate the initial guess */ - QudaBoolean deflate; + /* + * The following parameters are related to the solver + * preconditioner, if enabled. 
+ */ - /** Dirac Dslash used in preconditioner */ - QudaDslashType dslash_type_precondition; - /** Verbosity of the inner Krylov solver */ - QudaVerbosity verbosity_precondition; + /** + * The inner Krylov solver used in the preconditioner. Set to + * QUDA_INVALID_INVERTER to disable the preconditioner entirely. + */ + QudaInverterType inv_type_precondition; - /** Tolerance in the inner solver */ - double tol_precondition; + /** Preconditioner instance, e.g., multigrid */ + void *preconditioner; - /** Maximum number of iterations allowed in the inner solver */ - int maxiter_precondition; + /** Deflation instance */ + void *deflation_op; - /** Relaxation parameter used in GCR-DD (default = 1.0) */ - double omega; + /** defines deflation */ + void *eig_param; - /** Basis for CA algorithms */ - QudaCABasis ca_basis; + /** If true, deflate the initial guess */ + QudaBoolean deflate; - /** Minimum eigenvalue for Chebyshev CA basis */ - double ca_lambda_min; + /** Dirac Dslash used in preconditioner */ + QudaDslashType dslash_type_precondition; + /** Verbosity of the inner Krylov solver */ + QudaVerbosity verbosity_precondition; - /** Maximum eigenvalue for Chebyshev CA basis */ - double ca_lambda_max; + /** Tolerance in the inner solver */ + double tol_precondition; - /** Basis for CA algorithms in a preconditioned solver */ - QudaCABasis ca_basis_precondition; + /** Maximum number of iterations allowed in the inner solver */ + int maxiter_precondition; - /** Minimum eigenvalue for Chebyshev CA basis in a preconditioner solver */ - double ca_lambda_min_precondition; + /** Relaxation parameter used in GCR-DD (default = 1.0) */ + double omega; - /** Maximum eigenvalue for Chebyshev CA basis in a preconditioner solver */ - double ca_lambda_max_precondition; + /** Basis for CA algorithms */ + QudaCABasis ca_basis; - /** Number of preconditioner cycles to perform per iteration */ - int precondition_cycle; + /** Minimum eigenvalue for Chebyshev CA basis */ + double 
ca_lambda_min; - /** Whether to use additive or multiplicative Schwarz preconditioning */ - QudaSchwarzType schwarz_type; + /** Maximum eigenvalue for Chebyshev CA basis */ + double ca_lambda_max; - /** The type of accelerator type to use for preconditioner */ - QudaAcceleratorType accelerator_type_precondition; + /** Basis for CA algorithms in a preconditioned solver */ + QudaCABasis ca_basis_precondition; - /** - * The following parameters are the ones used to perform the adaptive MADWF in MSPCG - * See section 3.3 of [arXiv:2104.05615] - */ + /** Minimum eigenvalue for Chebyshev CA basis in a preconditioner solver */ + double ca_lambda_min_precondition; - /** The diagonal constant to suppress the low modes when performing 5D transfer */ - double madwf_diagonal_suppressor; + /** Maximum eigenvalue for Chebyshev CA basis in a preconditioner solver */ + double ca_lambda_max_precondition; - /** The target MADWF Ls to be used in the accelerator */ - int madwf_ls; + /** Number of preconditioner cycles to perform per iteration */ + int precondition_cycle; - /** The minimum number of iterations after which to generate the null vectors for MADWF */ - int madwf_null_miniter; + /** Whether to use additive or multiplicative Schwarz preconditioning */ + QudaSchwarzType schwarz_type; - /** The maximum tolerance after which to generate the null vectors for MADWF */ - double madwf_null_tol; + /** The type of accelerator type to use for preconditioner */ + QudaAcceleratorType accelerator_type_precondition; - /** The maximum number of iterations for the training iterations */ - int madwf_train_maxiter; + /** + * The following parameters are the ones used to perform the adaptive MADWF in MSPCG + * See section 3.3 of [arXiv:2104.05615] + */ - /** Whether to load the MADWF parameters from the file system */ - QudaBoolean madwf_param_load; + /** The diagonal constant to suppress the low modes when performing 5D transfer */ + double madwf_diagonal_suppressor; - /** Whether to save the 
MADWF parameters to the file system */ - QudaBoolean madwf_param_save; + /** The target MADWF Ls to be used in the accelerator */ + int madwf_ls; - /** Path to load from the file system */ - char madwf_param_infile[256]; + /** The minimum number of iterations after which to generate the null vectors for MADWF */ + int madwf_null_miniter; - /** Path to save to the file system */ - char madwf_param_outfile[256]; + /** The maximum tolerance after which to generate the null vectors for MADWF */ + double madwf_null_tol; - /** - * Whether to use the L2 relative residual, Fermilab heavy-quark - * residual, or both to determine convergence. To require that both - * stopping conditions are satisfied, use a bitwise OR as follows: - * - * p.residual_type = (QudaResidualType) (QUDA_L2_RELATIVE_RESIDUAL - * | QUDA_HEAVY_QUARK_RESIDUAL); - */ - QudaResidualType residual_type; + /** The maximum number of iterations for the training iterations */ + int madwf_train_maxiter; - /**Parameters for deflated solvers*/ - /** The precision of the Ritz vectors */ - QudaPrecision cuda_prec_ritz; - /** How many vectors to compute after one solve - * for eigCG recommended values 8 or 16 - */ - int n_ev; - /** EeigCG : Search space dimension - * gmresdr : Krylov subspace dimension - */ - int max_search_dim; - /** For systems with many RHS: current RHS index */ - int rhs_idx; - /** Specifies deflation space volume: total number of eigenvectors is n_ev*deflation_grid */ - int deflation_grid; - /** eigCG: selection criterion for the reduced eigenvector set */ - double eigenval_tol; - /** mixed precision eigCG tuning parameter: minimum search vector space restarts */ - int eigcg_max_restarts; - /** initCG tuning parameter: maximum restarts */ - int max_restart_num; - /** initCG tuning parameter: tolerance for cg refinement corrections in the deflation stage */ - double inc_tol; + /** Whether to load the MADWF parameters from the file system */ + QudaBoolean madwf_param_load; - /** Whether to make 
the solution vector(s) after the solve */ - int make_resident_solution; + /** Whether to save the MADWF parameters to the file system */ + QudaBoolean madwf_param_save; - /** Whether to use the resident solution vector(s) */ - int use_resident_solution; - - /** Whether to use the solution vector to augment the chronological basis */ - int chrono_make_resident; - - /** Whether the solution should replace the last entry in the chronology */ - int chrono_replace_last; - - /** Whether to use the resident chronological basis */ - int chrono_use_resident; - - /** The maximum length of the chronological history to store */ - int chrono_max_dim; - - /** The index to indicate which chrono history we are augmenting */ - int chrono_index; - - /** Precision to store the chronological basis in */ - QudaPrecision chrono_precision; - - /** Which external library to use in the linear solvers (Eigen) */ - QudaExtLibType extlib_type; - - /** Whether to use the platform native or generic BLAS / LAPACK */ - QudaBoolean native_blas_lapack; - - /** Whether to use fused kernels for mobius */ - QudaBoolean use_mobius_fused_kernel; - - } QudaInvertParam; - - // Parameter set for solving eigenvalue problems. - typedef struct QudaEigParam_s { - /** Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ - size_t struct_size; - - // EIGENSOLVER PARAMS - //------------------------------------------------- - /** Used to store information pertinent to the operator **/ - QudaInvertParam *invert_param; - - /** Type of eigensolver algorithm to employ **/ - QudaEigType eig_type; - - /** Use Polynomial Acceleration **/ - QudaBoolean use_poly_acc; - - /** Degree of the Chebysev polynomial **/ - int poly_deg; - - /** Range used in polynomial acceleration **/ - double a_min; - double a_max; - - /** Whether to preserve the deflation space between solves. 
If - true, the space will be stored in an instance of the - deflation_space struct, pointed to by preserve_deflation_space */ - QudaBoolean preserve_deflation; - - /** This is where we store the deflation space. This will point - to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ - void *preserve_deflation_space; - - /** If we restore the deflation space, this boolean indicates - whether we are also preserving the evalues or recomputing - them. For example if a different mass shift is being used - than the one used to generate the space, then this should be - false, but preserve_deflation would be true */ - QudaBoolean preserve_evals; - - /** What type of Dirac operator we are using **/ - /** If !(use_norm_op) && !(use_dagger) use M. **/ - /** If use_dagger, use Mdag **/ - /** If use_norm_op, use MdagM **/ - /** If use_norm_op && use_dagger use MMdag. **/ - /** If use_pc for any, then use the even-odd pc version **/ - QudaBoolean use_dagger; - QudaBoolean use_norm_op; - QudaBoolean use_pc; - - /** Use Eigen routines to eigensolve the upper Hessenberg via QR **/ - QudaBoolean use_eigen_qr; - - /** Performs an MdagM solve, then constructs the left and right SVD. **/ - QudaBoolean compute_svd; + /** Path to load from the file system */ + char madwf_param_infile[256]; - /** Performs the \gamma_5 OP solve by Post multipling the eignvectors with - \gamma_5 before computing the eigenvalues */ - QudaBoolean compute_gamma5; + /** Path to save to the file system */ + char madwf_param_outfile[256]; - /** If true, the solver will error out if the convergence criteria are not met **/ - QudaBoolean require_convergence; + /** + * Whether to use the L2 relative residual, Fermilab heavy-quark + * residual, or both to determine convergence. 
To require that both + * stopping conditions are satisfied, use a bitwise OR as follows: + * + * p.residual_type = (QudaResidualType) (QUDA_L2_RELATIVE_RESIDUAL + * | QUDA_HEAVY_QUARK_RESIDUAL); + */ + QudaResidualType residual_type; - /** Which part of the spectrum to solve **/ - QudaEigSpectrumType spectrum; + /**Parameters for deflated solvers*/ + /** The precision of the Ritz vectors */ + QudaPrecision cuda_prec_ritz; + /** How many vectors to compute after one solve + * for eigCG recommended values 8 or 16 + */ + int n_ev; + /** EeigCG : Search space dimension + * gmresdr : Krylov subspace dimension + */ + int max_search_dim; + /** For systems with many RHS: current RHS index */ + int rhs_idx; + /** Specifies deflation space volume: total number of eigenvectors is n_ev*deflation_grid */ + int deflation_grid; + /** eigCG: selection criterion for the reduced eigenvector set */ + double eigenval_tol; + /** mixed precision eigCG tuning parameter: minimum search vector space restarts */ + int eigcg_max_restarts; + /** initCG tuning parameter: maximum restarts */ + int max_restart_num; + /** initCG tuning parameter: tolerance for cg refinement corrections in the deflation stage */ + double inc_tol; - /** Size of the eigenvector search space **/ - int n_ev; - /** Total size of Krylov space **/ - int n_kr; - /** Max number of locked eigenpairs (deduced at runtime) **/ - int nLockedMax; - /** Number of requested converged eigenvectors **/ - int n_conv; - /** Number of requested converged eigenvectors to use in deflation **/ - int n_ev_deflate; - /** Tolerance on the least well known eigenvalue's residual **/ - double tol; - /** Tolerance on the QR iteration **/ - double qr_tol; - /** For IRLM/IRAM, check every nth restart **/ - int check_interval; - /** For IRLM/IRAM, quit after n restarts **/ - int max_restarts; - /** For the Ritz rotation, the maximal number of extra vectors the solver may allocate **/ - int batched_rotate; - /** For block method solvers, the block 
size **/ - int block_size; + /** Whether to make the solution vector(s) after the solve */ + int make_resident_solution; + + /** Whether to use the resident solution vector(s) */ + int use_resident_solution; + + /** Whether to use the solution vector to augment the chronological basis */ + int chrono_make_resident; + + /** Whether the solution should replace the last entry in the chronology */ + int chrono_replace_last; + + /** Whether to use the resident chronological basis */ + int chrono_use_resident; + + /** The maximum length of the chronological history to store */ + int chrono_max_dim; + + /** The index to indicate which chrono history we are augmenting */ + int chrono_index; + + /** Precision to store the chronological basis in */ + QudaPrecision chrono_precision; + + /** Which external library to use in the linear solvers (Eigen) */ + QudaExtLibType extlib_type; + + /** Whether to use the platform native or generic BLAS / LAPACK */ + QudaBoolean native_blas_lapack; + + /** Whether to use fused kernels for mobius */ + QudaBoolean use_mobius_fused_kernel; + +} QudaInvertParam; + +// Parameter set for solving eigenvalue problems. +typedef struct QudaEigParam_s { + /** Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ + size_t struct_size; + + // EIGENSOLVER PARAMS + //------------------------------------------------- + /** Used to store information pertinent to the operator **/ + QudaInvertParam *invert_param; + + /** Type of eigensolver algorithm to employ **/ + QudaEigType eig_type; + + /** Use Polynomial Acceleration **/ + QudaBoolean use_poly_acc; + + /** Degree of the Chebysev polynomial **/ + int poly_deg; + + /** Range used in polynomial acceleration **/ + double a_min; + double a_max; + + /** Whether to preserve the deflation space between solves. 
If + true, the space will be stored in an instance of the + deflation_space struct, pointed to by preserve_deflation_space */ + QudaBoolean preserve_deflation; + + /** This is where we store the deflation space. This will point + to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ + void *preserve_deflation_space; + + /** If we restore the deflation space, this boolean indicates + whether we are also preserving the evalues or recomputing + them. For example if a different mass shift is being used + than the one used to generate the space, then this should be + false, but preserve_deflation would be true */ + QudaBoolean preserve_evals; + + /** What type of Dirac operator we are using **/ + /** If !(use_norm_op) && !(use_dagger) use M. **/ + /** If use_dagger, use Mdag **/ + /** If use_norm_op, use MdagM **/ + /** If use_norm_op && use_dagger use MMdag. **/ + /** If use_pc for any, then use the even-odd pc version **/ + QudaBoolean use_dagger; + QudaBoolean use_norm_op; + QudaBoolean use_pc; + + /** Use Eigen routines to eigensolve the upper Hessenberg via QR **/ + QudaBoolean use_eigen_qr; + + /** Performs an MdagM solve, then constructs the left and right SVD. 
**/ + QudaBoolean compute_svd; - /** In the test function, cross check the device result against ARPACK **/ - QudaBoolean arpack_check; - /** For Arpack cross check, name of the Arpack logfile **/ - char arpack_logfile[512]; + /** Performs the \gamma_5 OP solve by Post multipling the eignvectors with + \gamma_5 before computing the eigenvalues */ + QudaBoolean compute_gamma5; - /** Name of the QUDA logfile (residua, upper Hessenberg/tridiag matrix updates) **/ - char QUDA_logfile[512]; + /** If true, the solver will error out if the convergence criteria are not met **/ + QudaBoolean require_convergence; - //------------------------------------------------- + /** Which part of the spectrum to solve **/ + QudaEigSpectrumType spectrum; - // EIG-CG PARAMS - //------------------------------------------------- - int nk; - int np; + /** Size of the eigenvector search space **/ + int n_ev; + /** Total size of Krylov space **/ + int n_kr; + /** Max number of locked eigenpairs (deduced at runtime) **/ + int nLockedMax; + /** Number of requested converged eigenvectors **/ + int n_conv; + /** Number of requested converged eigenvectors to use in deflation **/ + int n_ev_deflate; + /** Tolerance on the least well known eigenvalue's residual **/ + double tol; + /** Tolerance on the QR iteration **/ + double qr_tol; + /** For IRLM/IRAM, check every nth restart **/ + int check_interval; + /** For IRLM/IRAM, quit after n restarts **/ + int max_restarts; + /** For the Ritz rotation, the maximal number of extra vectors the solver may allocate **/ + int batched_rotate; + /** For block method solvers, the block size **/ + int block_size; - /** Whether to load eigenvectors */ - QudaBoolean import_vectors; + /** In the test function, cross check the device result against ARPACK **/ + QudaBoolean arpack_check; + /** For Arpack cross check, name of the Arpack logfile **/ + char arpack_logfile[512]; - /** The precision of the Ritz vectors */ - QudaPrecision cuda_prec_ritz; + /** Name of the 
QUDA logfile (residua, upper Hessenberg/tridiag matrix updates) **/ + char QUDA_logfile[512]; - /** The memory type used to keep the Ritz vectors */ - QudaMemoryType mem_type_ritz; + //------------------------------------------------- - /** Location where deflation should be done */ - QudaFieldLocation location; + // EIG-CG PARAMS + //------------------------------------------------- + int nk; + int np; - /** Whether to run the verification checks once set up is complete */ - QudaBoolean run_verify; + /** Whether to load eigenvectors */ + QudaBoolean import_vectors; - /** Filename prefix where to load the null-space vectors */ - char vec_infile[256]; + /** The precision of the Ritz vectors */ + QudaPrecision cuda_prec_ritz; - /** Filename prefix for where to save the null-space vectors */ - char vec_outfile[256]; + /** The memory type used to keep the Ritz vectors */ + QudaMemoryType mem_type_ritz; - /** The precision with which to save the vectors */ - QudaPrecision save_prec; + /** Location where deflation should be done */ + QudaFieldLocation location; - /** Whether to inflate single-parity eigen-vector I/O to a full - field (e.g., enabling this is required for compatability with - MILC I/O) */ - QudaBoolean io_parity_inflate; + /** Whether to run the verification checks once set up is complete */ + QudaBoolean run_verify; - /** The Gflops rate of the eigensolver setup */ - double gflops; + /** Filename prefix where to load the null-space vectors */ + char vec_infile[256]; - /**< The time taken by the eigensolver setup */ - double secs; + /** Filename prefix for where to save the null-space vectors */ + char vec_outfile[256]; - /** Which external library to use in the deflation operations (Eigen) */ - QudaExtLibType extlib_type; - //------------------------------------------------- - } QudaEigParam; + /** The precision with which to save the vectors */ + QudaPrecision save_prec; - typedef struct QudaMultigridParam_s { + /** Whether to inflate single-parity 
eigen-vector I/O to a full + field (e.g., enabling this is required for compatability with + MILC I/O) */ + QudaBoolean io_parity_inflate; - /** Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ - size_t struct_size; + /** The Gflops rate of the eigensolver setup */ + double gflops; - QudaInvertParam *invert_param; + /**< The time taken by the eigensolver setup */ + double secs; - QudaEigParam *eig_param[QUDA_MAX_MG_LEVEL]; + /** Which external library to use in the deflation operations (Eigen) */ + QudaExtLibType extlib_type; + //------------------------------------------------- +} QudaEigParam; - /** Number of multigrid levels */ - int n_level; +typedef struct QudaMultigridParam_s { - /** Geometric block sizes to use on each level */ - int geo_block_size[QUDA_MAX_MG_LEVEL][QUDA_MAX_DIM]; + /** Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ + size_t struct_size; - /** Spin block sizes to use on each level */ - int spin_block_size[QUDA_MAX_MG_LEVEL]; + QudaInvertParam *invert_param; - /** Number of null-space vectors to use on each level */ - int n_vec[QUDA_MAX_MG_LEVEL]; + QudaEigParam *eig_param[QUDA_MAX_MG_LEVEL]; - /** Precision to store the null-space vectors in (post block orthogonalization) */ - QudaPrecision precision_null[QUDA_MAX_MG_LEVEL]; + /** Number of multigrid levels */ + int n_level; - /** Number of times to repeat Gram-Schmidt in block orthogonalization */ - int n_block_ortho[QUDA_MAX_MG_LEVEL]; + /** Geometric block sizes to use on each level */ + int geo_block_size[QUDA_MAX_MG_LEVEL][QUDA_MAX_DIM]; - /** Whether to do passes at block orthogonalize in fixed point for improved accuracy */ - QudaBoolean block_ortho_two_pass[QUDA_MAX_MG_LEVEL]; + /** Spin block sizes to use on each level */ + int spin_block_size[QUDA_MAX_MG_LEVEL]; - /** Verbosity on each level of the multigrid */ - QudaVerbosity verbosity[QUDA_MAX_MG_LEVEL]; + /** 
Number of null-space vectors to use on each level */ + int n_vec[QUDA_MAX_MG_LEVEL]; - /** Inverter to use in the setup phase */ - QudaInverterType setup_inv_type[QUDA_MAX_MG_LEVEL]; + /** Precision to store the null-space vectors in (post block orthogonalization) */ + QudaPrecision precision_null[QUDA_MAX_MG_LEVEL]; - /** Number of setup iterations */ - int num_setup_iter[QUDA_MAX_MG_LEVEL]; + /** Number of times to repeat Gram-Schmidt in block orthogonalization */ + int n_block_ortho[QUDA_MAX_MG_LEVEL]; - /** Tolerance to use in the setup phase */ - double setup_tol[QUDA_MAX_MG_LEVEL]; + /** Whether to do passes at block orthogonalize in fixed point for improved accuracy */ + QudaBoolean block_ortho_two_pass[QUDA_MAX_MG_LEVEL]; - /** Maximum number of iterations for each setup solver */ - int setup_maxiter[QUDA_MAX_MG_LEVEL]; + /** Verbosity on each level of the multigrid */ + QudaVerbosity verbosity[QUDA_MAX_MG_LEVEL]; - /** Maximum number of iterations for refreshing the null-space vectors */ - int setup_maxiter_refresh[QUDA_MAX_MG_LEVEL]; + /** Inverter to use in the setup phase */ + QudaInverterType setup_inv_type[QUDA_MAX_MG_LEVEL]; - /** Basis to use for CA solver setup */ - QudaCABasis setup_ca_basis[QUDA_MAX_MG_LEVEL]; + /** Number of setup iterations */ + int num_setup_iter[QUDA_MAX_MG_LEVEL]; - /** Basis size for CA solver setup */ - int setup_ca_basis_size[QUDA_MAX_MG_LEVEL]; + /** Tolerance to use in the setup phase */ + double setup_tol[QUDA_MAX_MG_LEVEL]; - /** Minimum eigenvalue for Chebyshev CA basis */ - double setup_ca_lambda_min[QUDA_MAX_MG_LEVEL]; + /** Maximum number of iterations for each setup solver */ + int setup_maxiter[QUDA_MAX_MG_LEVEL]; - /** Maximum eigenvalue for Chebyshev CA basis */ - double setup_ca_lambda_max[QUDA_MAX_MG_LEVEL]; + /** Maximum number of iterations for refreshing the null-space vectors */ + int setup_maxiter_refresh[QUDA_MAX_MG_LEVEL]; - /** Null-space type to use in the setup phase */ - QudaSetupType setup_type; 
+ /** Basis to use for CA solver setup */ + QudaCABasis setup_ca_basis[QUDA_MAX_MG_LEVEL]; - /** Pre orthonormalize vectors in the setup phase */ - QudaBoolean pre_orthonormalize; + /** Basis size for CA solver setup */ + int setup_ca_basis_size[QUDA_MAX_MG_LEVEL]; - /** Post orthonormalize vectors in the setup phase */ - QudaBoolean post_orthonormalize; + /** Minimum eigenvalue for Chebyshev CA basis */ + double setup_ca_lambda_min[QUDA_MAX_MG_LEVEL]; - /** The solver that wraps around the coarse grid correction and smoother */ - QudaInverterType coarse_solver[QUDA_MAX_MG_LEVEL]; + /** Maximum eigenvalue for Chebyshev CA basis */ + double setup_ca_lambda_max[QUDA_MAX_MG_LEVEL]; - /** Tolerance for the solver that wraps around the coarse grid correction and smoother */ - double coarse_solver_tol[QUDA_MAX_MG_LEVEL]; + /** Null-space type to use in the setup phase */ + QudaSetupType setup_type; - /** Maximum number of iterations for the solver that wraps around the coarse grid correction and smoother */ - int coarse_solver_maxiter[QUDA_MAX_MG_LEVEL]; + /** Pre orthonormalize vectors in the setup phase */ + QudaBoolean pre_orthonormalize; - /** Basis to use for CA coarse solvers */ - QudaCABasis coarse_solver_ca_basis[QUDA_MAX_MG_LEVEL]; + /** Post orthonormalize vectors in the setup phase */ + QudaBoolean post_orthonormalize; - /** Basis size for CA coarse solvers */ - int coarse_solver_ca_basis_size[QUDA_MAX_MG_LEVEL]; + /** The solver that wraps around the coarse grid correction and smoother */ + QudaInverterType coarse_solver[QUDA_MAX_MG_LEVEL]; - /** Minimum eigenvalue for Chebyshev CA basis */ - double coarse_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]; + /** Tolerance for the solver that wraps around the coarse grid correction and smoother */ + double coarse_solver_tol[QUDA_MAX_MG_LEVEL]; - /** Maximum eigenvalue for Chebyshev CA basis */ - double coarse_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL]; + /** Maximum number of iterations for the solver that wraps around the 
coarse grid correction and smoother */ + int coarse_solver_maxiter[QUDA_MAX_MG_LEVEL]; - /** Smoother to use on each level */ - QudaInverterType smoother[QUDA_MAX_MG_LEVEL]; + /** Basis to use for CA coarse solvers */ + QudaCABasis coarse_solver_ca_basis[QUDA_MAX_MG_LEVEL]; - /** Tolerance to use for the smoother / solver on each level */ - double smoother_tol[QUDA_MAX_MG_LEVEL]; + /** Basis size for CA coarse solvers */ + int coarse_solver_ca_basis_size[QUDA_MAX_MG_LEVEL]; - /** Number of pre-smoother applications on each level */ - int nu_pre[QUDA_MAX_MG_LEVEL]; + /** Minimum eigenvalue for Chebyshev CA basis */ + double coarse_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]; - /** Number of post-smoother applications on each level */ - int nu_post[QUDA_MAX_MG_LEVEL]; + /** Maximum eigenvalue for Chebyshev CA basis */ + double coarse_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL]; - /** Basis to use for CA smoother solvers */ - QudaCABasis smoother_solver_ca_basis[QUDA_MAX_MG_LEVEL]; + /** Smoother to use on each level */ + QudaInverterType smoother[QUDA_MAX_MG_LEVEL]; - /** Minimum eigenvalue for Chebyshev CA smoother basis */ - double smoother_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]; + /** Tolerance to use for the smoother / solver on each level */ + double smoother_tol[QUDA_MAX_MG_LEVEL]; - /** Maximum eigenvalue for Chebyshev CA smoother basis */ - double smoother_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL]; + /** Number of pre-smoother applications on each level */ + int nu_pre[QUDA_MAX_MG_LEVEL]; - /** Over/under relaxation factor for the smoother at each level */ - double omega[QUDA_MAX_MG_LEVEL]; + /** Number of post-smoother applications on each level */ + int nu_post[QUDA_MAX_MG_LEVEL]; - /** Precision to use for halo communication in the smoother */ - QudaPrecision smoother_halo_precision[QUDA_MAX_MG_LEVEL]; + /** Basis to use for CA smoother solvers */ + QudaCABasis smoother_solver_ca_basis[QUDA_MAX_MG_LEVEL]; - /** Whether to use additive or multiplicative Schwarz 
preconditioning in the smoother */ - QudaSchwarzType smoother_schwarz_type[QUDA_MAX_MG_LEVEL]; + /** Minimum eigenvalue for Chebyshev CA smoother basis */ + double smoother_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]; - /** Number of Schwarz cycles to apply */ - int smoother_schwarz_cycle[QUDA_MAX_MG_LEVEL]; + /** Maximum eigenvalue for Chebyshev CA smoother basis */ + double smoother_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL]; - /** The type of residual to send to the next coarse grid, and thus the - type of solution to receive back from this coarse grid */ - QudaSolutionType coarse_grid_solution_type[QUDA_MAX_MG_LEVEL]; + /** Over/under relaxation factor for the smoother at each level */ + double omega[QUDA_MAX_MG_LEVEL]; - /** The type of smoother solve to do on each grid (e/o preconditioning or not)*/ - QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]; + /** Precision to use for halo communication in the smoother */ + QudaPrecision smoother_halo_precision[QUDA_MAX_MG_LEVEL]; - /** The type of multigrid cycle to perform at each level */ - QudaMultigridCycleType cycle_type[QUDA_MAX_MG_LEVEL]; + /** Whether to use additive or multiplicative Schwarz preconditioning in the smoother */ + QudaSchwarzType smoother_schwarz_type[QUDA_MAX_MG_LEVEL]; - /** Whether to use global reductions or not for the smoother / solver at each level */ - QudaBoolean global_reduction[QUDA_MAX_MG_LEVEL]; + /** Number of Schwarz cycles to apply */ + int smoother_schwarz_cycle[QUDA_MAX_MG_LEVEL]; - /** Location where each level should be done */ - QudaFieldLocation location[QUDA_MAX_MG_LEVEL]; + /** The type of residual to send to the next coarse grid, and thus the + type of solution to receive back from this coarse grid */ + QudaSolutionType coarse_grid_solution_type[QUDA_MAX_MG_LEVEL]; - /** Location where the coarse-operator construction will be computedn */ - QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]; + /** The type of smoother solve to do on each grid (e/o preconditioning or 
not)*/ + QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]; - /** Whether to use eigenvectors for the nullspace or, if the coarsest instance deflate*/ - QudaBoolean use_eig_solver[QUDA_MAX_MG_LEVEL]; + /** The type of multigrid cycle to perform at each level */ + QudaMultigridCycleType cycle_type[QUDA_MAX_MG_LEVEL]; - /** Minimize device memory allocations during the adaptive setup, - placing temporary fields in mapped memory instad of device - memory */ - QudaBoolean setup_minimize_memory; + /** Whether to use global reductions or not for the smoother / solver at each level */ + QudaBoolean global_reduction[QUDA_MAX_MG_LEVEL]; - /** Whether to compute the null vectors or reload them */ - QudaComputeNullVector compute_null_vector; + /** Location where each level should be done */ + QudaFieldLocation location[QUDA_MAX_MG_LEVEL]; - /** Whether to generate on all levels or just on level 0 */ - QudaBoolean generate_all_levels; + /** Location where the coarse-operator construction will be computedn */ + QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]; - /** Whether to run the verification checks once set up is complete */ - QudaBoolean run_verify; + /** Whether to use eigenvectors for the nullspace or, if the coarsest instance deflate*/ + QudaBoolean use_eig_solver[QUDA_MAX_MG_LEVEL]; - /** Whether to run null Vs eigen vector overlap checks once set up is complete */ - QudaBoolean run_low_mode_check; + /** Minimize device memory allocations during the adaptive setup, + placing temporary fields in mapped memory instad of device + memory */ + QudaBoolean setup_minimize_memory; - /** Whether to run null vector oblique checks once set up is complete */ - QudaBoolean run_oblique_proj_check; + /** Whether to compute the null vectors or reload them */ + QudaComputeNullVector compute_null_vector; - /** Whether to load the null-space vectors to disk (requires QIO) */ - QudaBoolean vec_load[QUDA_MAX_MG_LEVEL]; + /** Whether to generate on all levels or just on level 0 */ + 
QudaBoolean generate_all_levels; - /** Filename prefix where to load the null-space vectors */ - char vec_infile[QUDA_MAX_MG_LEVEL][256]; + /** Whether to run the verification checks once set up is complete */ + QudaBoolean run_verify; - /** Whether to store the null-space vectors to disk (requires QIO) */ - QudaBoolean vec_store[QUDA_MAX_MG_LEVEL]; + /** Whether to run null Vs eigen vector overlap checks once set up is complete */ + QudaBoolean run_low_mode_check; - /** Filename prefix for where to save the null-space vectors */ - char vec_outfile[QUDA_MAX_MG_LEVEL][256]; + /** Whether to run null vector oblique checks once set up is complete */ + QudaBoolean run_oblique_proj_check; - /** Whether to use and initial guess during coarse grid deflation */ - QudaBoolean coarse_guess; + /** Whether to load the null-space vectors to disk (requires QIO) */ + QudaBoolean vec_load[QUDA_MAX_MG_LEVEL]; - /** Whether to preserve the deflation space during MG update */ - QudaBoolean preserve_deflation; + /** Filename prefix where to load the null-space vectors */ + char vec_infile[QUDA_MAX_MG_LEVEL][256]; - /** The Gflops rate of the multigrid solver setup */ - double gflops; + /** Whether to store the null-space vectors to disk (requires QIO) */ + QudaBoolean vec_store[QUDA_MAX_MG_LEVEL]; - /**< The time taken by the multigrid solver setup */ - double secs; + /** Filename prefix for where to save the null-space vectors */ + char vec_outfile[QUDA_MAX_MG_LEVEL][256]; - /** Multiplicative factor for the mu parameter */ - double mu_factor[QUDA_MAX_MG_LEVEL]; + /** Whether to use and initial guess during coarse grid deflation */ + QudaBoolean coarse_guess; - /** Boolean for aggregation type, implies staggered or not */ - QudaTransferType transfer_type[QUDA_MAX_MG_LEVEL]; - - /** Whether or not to let MG coarsening drop improvements, for ex dropping long links in small aggregation dimensions */ - QudaBoolean allow_truncation; - - /** Whether or not to use the dagger approximation 
for the KD preconditioned operator */ - QudaBoolean staggered_kd_dagger_approximation; - - /** Whether to use tensor cores (if available) */ - QudaBoolean use_mma; - - /** Whether to do a full (false) or thin (true) update in the context of updateMultigridQuda */ - QudaBoolean thin_update_only; - } QudaMultigridParam; - - typedef struct QudaGaugeObservableParam_s { - size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct*/ - QudaBoolean su_project; /**< Whether to project onto the manifold prior to measurement */ - QudaBoolean compute_plaquette; /**< Whether to compute the plaquette */ - double plaquette[3]; /**< Total, spatial and temporal field energies, respectively */ - QudaBoolean compute_polyakov_loop; /**< Whether to compute the temporal Polyakov loop */ - double ploop[2]; /**< Real and imaginary part of temporal Polyakov loop */ - QudaBoolean compute_gauge_loop_trace; /**< Whether to compute gauge loop traces */ - double_complex *traces; /**< Individual complex traces of each loop */ - int **input_path_buff; /**< Array of paths */ - int *path_length; /**< Length of each path */ - double *loop_coeff; /**< Multiplicative factor for each loop */ - int num_paths; /**< Total number of paths */ - int max_length; /**< Maximum length of any path */ - double factor; /**< Global multiplicative factor to apply to each loop trace */ - QudaBoolean compute_qcharge; /**< Whether to compute the topological charge and field energy */ - double qcharge; /**< Computed topological charge */ - double energy[3]; /**< Total, spatial and temporal field energies, respectively */ - QudaBoolean compute_qcharge_density; /**< Whether to compute the topological charge density */ - void *qcharge_density; /**< Pointer to host array of length volume where the q-charge density will be copied */ - QudaBoolean - remove_staggered_phase; /**< Whether or not the resident gauge field has staggered phases applied and if they 
should - be removed; this was needed for the Polyakov loop calculation when called through MILC, - with the underlying issue documented https://github.com/lattice/quda/issues/1315 */ - } QudaGaugeObservableParam; - - typedef struct QudaGaugeSmearParam_s { - size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct*/ - unsigned int n_steps; /**< The total number of smearing steps to perform. */ - double epsilon; /**< Serves as one of the coefficients in Over Improved Stout smearing, or as the step size in - Wilson/Symanzik flow */ - double alpha; /**< The single coefficient used in APE smearing */ - double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */ - unsigned int meas_interval; /**< Perform the requested measurements on the gauge field at this interval */ - QudaGaugeSmearType smear_type; /**< The smearing type to perform */ - } QudaGaugeSmearParam; - - typedef struct QudaBLASParam_s { - size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct*/ - - QudaBLASType blas_type; /**< Type of BLAS computation to perfrom */ - - // GEMM params - QudaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ - QudaBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ - int m; /**< number of rows of matrix op(A) and C. */ - int n; /**< number of columns of matrix op(B) and C. */ - int k; /**< number of columns of op(A) and rows of op(B). */ - int lda; /**< leading dimension of two-dimensional array used to store the matrix A. */ - int ldb; /**< leading dimension of two-dimensional array used to store matrix B. */ - int ldc; /**< leading dimension of two-dimensional array used to store matrix C. */ - int a_offset; /**< position of the A array from which begin read/write. 
*/ - int b_offset; /**< position of the B array from which begin read/write. */ - int c_offset; /**< position of the C array from which begin read/write. */ - int a_stride; /**< stride of the A array in strided(batched) mode */ - int b_stride; /**< stride of the B array in strided(batched) mode */ - int c_stride; /**< stride of the C array in strided(batched) mode */ - double_complex alpha; /**< scalar used for multiplication. */ - double_complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ - - // LU inversion params - int inv_mat_size; /**< The rank of the square matrix in the LU inversion */ - - // Common params - int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ - QudaBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ - QudaBLASDataOrder data_order; /**< Specifies if using Row or Column major */ - - } QudaBLASParam; + /** Whether to preserve the deflation space during MG update */ + QudaBoolean preserve_deflation; - /* - * Interface functions, found in interface_quda.cpp - */ + /** The Gflops rate of the multigrid solver setup */ + double gflops; - /** - * Set parameters related to status reporting. - * - * In typical usage, this function will be called once (or not at - * all) just before the call to initQuda(), but it's valid to call - * it any number of times at any point during execution. Prior to - * the first time it's called, the parameters take default values - * as indicated below. - * - * @param verbosity Default verbosity, ranging from QUDA_SILENT to - * QUDA_DEBUG_VERBOSE. Within a solver, this - * parameter is overridden by the "verbosity" - * member of QudaInvertParam. The default value - * is QUDA_SUMMARIZE. - * - * @param prefix String to prepend to all messages from QUDA. This - * defaults to the empty string (""), but you may - * wish to specify something like "QUDA: " to - * distinguish QUDA's output from that of your - * application. 
- * - * @param outfile File pointer (such as stdout, stderr, or a handle - * returned by fopen()) where messages should be - * printed. The default is stdout. - */ - void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], - FILE *outfile); - - /** - * initCommsGridQuda() takes an optional "rank_from_coords" argument that - * should be a pointer to a user-defined function with this prototype. - * - * @param coords Node coordinates - * @param fdata Any auxiliary data needed by the function - * @return MPI rank or QMP node ID cooresponding to the node coordinates - * - * @see initCommsGridQuda - */ - typedef int (*QudaCommsMap)(const int *coords, void *fdata); + /** The time taken by the multigrid solver setup */ + double secs; + + /** Multiplicative factor for the mu parameter */ + double mu_factor[QUDA_MAX_MG_LEVEL]; + + /** Boolean for aggregation type, implies staggered or not */ + QudaTransferType transfer_type[QUDA_MAX_MG_LEVEL]; + + /** Whether or not to let MG coarsening drop improvements, for ex dropping long links in small aggregation dimensions */ + QudaBoolean allow_truncation; + + /** Whether or not to use the dagger approximation for the KD preconditioned operator */ + QudaBoolean staggered_kd_dagger_approximation; + + /** Whether to use tensor cores (if available) */ + QudaBoolean use_mma; + + /** Whether to do a full (false) or thin (true) update in the context of updateMultigridQuda */ + QudaBoolean thin_update_only; +} QudaMultigridParam; + +typedef struct QudaGaugeObservableParam_s { + size_t struct_size; /**< Size of this struct in bytes.
Used to ensure that the host application and QUDA see the same struct*/ + QudaBoolean su_project; /**< Whether to project onto the manifold prior to measurement */ + QudaBoolean compute_plaquette; /**< Whether to compute the plaquette */ + double plaquette[3]; /**< Total, spatial and temporal field energies, respectively */ + QudaBoolean compute_polyakov_loop; /**< Whether to compute the temporal Polyakov loop */ + double ploop[2]; /**< Real and imaginary part of temporal Polyakov loop */ + QudaBoolean compute_gauge_loop_trace; /**< Whether to compute gauge loop traces */ + double_complex *traces; /**< Individual complex traces of each loop */ + int **input_path_buff; /**< Array of paths */ + int *path_length; /**< Length of each path */ + double *loop_coeff; /**< Multiplicative factor for each loop */ + int num_paths; /**< Total number of paths */ + int max_length; /**< Maximum length of any path */ + double factor; /**< Global multiplicative factor to apply to each loop trace */ + QudaBoolean compute_qcharge; /**< Whether to compute the topological charge and field energy */ + double qcharge; /**< Computed topological charge */ + double energy[3]; /**< Total, spatial and temporal field energies, respectively */ + QudaBoolean compute_qcharge_density; /**< Whether to compute the topological charge density */ + void *qcharge_density; /**< Pointer to host array of length volume where the q-charge density will be copied */ + QudaBoolean + remove_staggered_phase; /**< Whether or not the resident gauge field has staggered phases applied and if they should + be removed; this was needed for the Polyakov loop calculation when called through MILC, + with the underlying issue documented https://github.com/lattice/quda/issues/1315 */ +} QudaGaugeObservableParam; + +typedef struct QudaGaugeSmearParam_s { + size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and QUDA see the same struct*/ + unsigned int n_steps; /**< The total number of smearing steps to perform. */ + double epsilon; /**< Serves as one of the coefficients in Over Improved Stout smearing, or as the step size in + Wilson/Symanzik flow */ + double alpha; /**< The single coefficient used in APE smearing */ + double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */ + unsigned int meas_interval; /**< Perform the requested measurements on the gauge field at this interval */ + QudaGaugeSmearType smear_type; /**< The smearing type to perform */ +} QudaGaugeSmearParam; + +typedef struct QudaBLASParam_s { + size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct*/ + + QudaBLASType blas_type; /**< Type of BLAS computation to perform */ + + // GEMM params + QudaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ + QudaBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ + int m; /**< number of rows of matrix op(A) and C. */ + int n; /**< number of columns of matrix op(B) and C. */ + int k; /**< number of columns of op(A) and rows of op(B). */ + int lda; /**< leading dimension of two-dimensional array used to store the matrix A. */ + int ldb; /**< leading dimension of two-dimensional array used to store matrix B. */ + int ldc; /**< leading dimension of two-dimensional array used to store matrix C. */ + int a_offset; /**< position of the A array from which begin read/write. */ + int b_offset; /**< position of the B array from which begin read/write. */ + int c_offset; /**< position of the C array from which begin read/write.
*/ + int a_stride; /**< stride of the A array in strided(batched) mode */ + int b_stride; /**< stride of the B array in strided(batched) mode */ + int c_stride; /**< stride of the C array in strided(batched) mode */ + double_complex alpha; /**< scalar used for multiplication. */ + double_complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ + + // LU inversion params + int inv_mat_size; /**< The rank of the square matrix in the LU inversion */ + + // Common params + int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ + QudaBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ + QudaBLASDataOrder data_order; /**< Specifies if using Row or Column major */ + +} QudaBLASParam; + +/* + * Interface functions, found in interface_quda.cpp + */ - /** - * @param mycomm User provided MPI communicator in place of MPI_COMM_WORLD - */ +/** + * Set parameters related to status reporting. + * + * In typical usage, this function will be called once (or not at + * all) just before the call to initQuda(), but it's valid to call + * it any number of times at any point during execution. Prior to + * the first time it's called, the parameters take default values + * as indicated below. + * + * @param verbosity Default verbosity, ranging from QUDA_SILENT to + * QUDA_DEBUG_VERBOSE. Within a solver, this + * parameter is overridden by the "verbosity" + * member of QudaInvertParam. The default value + * is QUDA_SUMMARIZE. + * + * @param prefix String to prepend to all messages from QUDA. This + * defaults to the empty string (""), but you may + * wish to specify something like "QUDA: " to + * distinguish QUDA's output from that of your + * application. + * + * @param outfile File pointer (such as stdout, stderr, or a handle + * returned by fopen()) where messages should be + * printed. The default is stdout. 
+ */ +void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile); - void qudaSetCommHandle(void *mycomm); +/** + * initCommsGridQuda() takes an optional "rank_from_coords" argument that + * should be a pointer to a user-defined function with this prototype. + * + * @param coords Node coordinates + * @param fdata Any auxiliary data needed by the function + * @return MPI rank or QMP node ID corresponding to the node coordinates + * + * @see initCommsGridQuda + */ +typedef int (*QudaCommsMap)(const int *coords, void *fdata); - /** - * Declare the grid mapping ("logical topology" in QMP parlance) - * used for communications in a multi-GPU grid. This function - * should be called prior to initQuda(). The only case in which - * it's optional is when QMP is used for communication and the - * logical topology has already been declared by the application. - * - * @param nDim Number of grid dimensions. "4" is the only supported - * value currently. - * - * @param dims Array of grid dimensions. dims[0]*dims[1]*dims[2]*dims[3] - * must equal the total number of MPI ranks or QMP nodes. - * - * @param func Pointer to a user-supplied function that maps coordinates - * in the communication grid to MPI ranks (or QMP node IDs). - * If the pointer is NULL, the default mapping depends on - * whether QMP or MPI is being used for communication. With - * QMP, the existing logical topology is used if it's been - * declared. With MPI or as a fallback with QMP, the default - * ordering is lexicographical with the fourth ("t") index - * varying fastest. - * - * @param fdata Pointer to any data required by "func" (may be NULL) - * - * @see QudaCommsMap - */ +/** + * @param mycomm User provided MPI communicator in place of MPI_COMM_WORLD + */ - void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata); +void qudaSetCommHandle(void *mycomm); - /** - * Initialize the library. This is a low-level interface that is - * called by initQuda.
Calling initQudaDevice requires that the - * user also call initQudaMemory before using QUDA. - * - * @param device CUDA device number to use. In a multi-GPU build, - * this parameter may either be set explicitly on a - * per-process basis or set to -1 to enable a default - * allocation of devices to processes. - */ - void initQudaDevice(int device); +/** + * Declare the grid mapping ("logical topology" in QMP parlance) + * used for communications in a multi-GPU grid. This function + * should be called prior to initQuda(). The only case in which + * it's optional is when QMP is used for communication and the + * logical topology has already been declared by the application. + * + * @param nDim Number of grid dimensions. "4" is the only supported + * value currently. + * + * @param dims Array of grid dimensions. dims[0]*dims[1]*dims[2]*dims[3] + * must equal the total number of MPI ranks or QMP nodes. + * + * @param func Pointer to a user-supplied function that maps coordinates + * in the communication grid to MPI ranks (or QMP node IDs). + * If the pointer is NULL, the default mapping depends on + * whether QMP or MPI is being used for communication. With + * QMP, the existing logical topology is used if it's been + * declared. With MPI or as a fallback with QMP, the default + * ordering is lexicographical with the fourth ("t") index + * varying fastest. + * + * @param fdata Pointer to any data required by "func" (may be NULL) + * + * @see QudaCommsMap + */ - /** - * Initialize the library persistant memory allocations (both host - * and device). This is a low-level interface that is called by - * initQuda. Calling initQudaMemory requires that the user has - * previously called initQudaDevice. - */ - void initQudaMemory(); +void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata); - /** - * Initialize the library. This function is actually a wrapper - * around calls to initQudaDevice() and initQudaMemory(). 
- * - * @param device CUDA device number to use. In a multi-GPU build, - * this parameter may either be set explicitly on a - * per-process basis or set to -1 to enable a default - * allocation of devices to processes. - */ - void initQuda(int device); +/** + * Initialize the library. This is a low-level interface that is + * called by initQuda. Calling initQudaDevice requires that the + * user also call initQudaMemory before using QUDA. + * + * @param device CUDA device number to use. In a multi-GPU build, + * this parameter may either be set explicitly on a + * per-process basis or set to -1 to enable a default + * allocation of devices to processes. + */ +void initQudaDevice(int device); - /** - * Finalize the library. - */ - void endQuda(void); +/** + * Initialize the library persistent memory allocations (both host + * and device). This is a low-level interface that is called by + * initQuda. Calling initQudaMemory requires that the user has + * previously called initQudaDevice. + */ +void initQudaMemory(); - /** - * @brief update the radius for halos. - * @details This should only be needed for automated testing when - * different partitioning is applied within a single run. - */ - void updateR(); +/** + * Initialize the library. This function is actually a wrapper + * around calls to initQudaDevice() and initQudaMemory(). + * + * @param device CUDA device number to use. In a multi-GPU build, + * this parameter may either be set explicitly on a + * per-process basis or set to -1 to enable a default + * allocation of devices to processes. + */ +void initQuda(int device); - /** - * A new QudaGaugeParam should always be initialized immediately - * after it's defined (and prior to explicitly setting its members) - * using this function. Typical usage is as follows: - * - * QudaGaugeParam gauge_param = newQudaGaugeParam(); - */ - QudaGaugeParam newQudaGaugeParam(void); +/** + * Finalize the library.
+ */ +void endQuda(void); - /** - * A new QudaInvertParam should always be initialized immediately - * after it's defined (and prior to explicitly setting its members) - * using this function. Typical usage is as follows: - * - * QudaInvertParam invert_param = newQudaInvertParam(); - */ - QudaInvertParam newQudaInvertParam(void); +/** + * @brief update the radius for halos. + * @details This should only be needed for automated testing when + * different partitioning is applied within a single run. + */ +void updateR(); - /** - * A new QudaMultigridParam should always be initialized immediately - * after it's defined (and prior to explicitly setting its members) - * using this function. Typical usage is as follows: - * - * QudaMultigridParam mg_param = newQudaMultigridParam(); - */ - QudaMultigridParam newQudaMultigridParam(void); +/** + * A new QudaGaugeParam should always be initialized immediately + * after it's defined (and prior to explicitly setting its members) + * using this function. Typical usage is as follows: + * + * QudaGaugeParam gauge_param = newQudaGaugeParam(); + */ +QudaGaugeParam newQudaGaugeParam(void); - /** - * A new QudaEigParam should always be initialized immediately - * after it's defined (and prior to explicitly setting its members) - * using this function. Typical usage is as follows: - * - * QudaEigParam eig_param = newQudaEigParam(); - */ - QudaEigParam newQudaEigParam(void); +/** + * A new QudaInvertParam should always be initialized immediately + * after it's defined (and prior to explicitly setting its members) + * using this function. Typical usage is as follows: + * + * QudaInvertParam invert_param = newQudaInvertParam(); + */ +QudaInvertParam newQudaInvertParam(void); - /** - * A new QudaGaugeObservableParam should always be initialized - * immediately after it's defined (and prior to explicitly setting - * its members) using this function. 
Typical usage is as follows: - * - * QudaGaugeObservalbeParam obs_param = newQudaGaugeObservableParam(); - */ - QudaGaugeObservableParam newQudaGaugeObservableParam(void); +/** + * A new QudaMultigridParam should always be initialized immediately + * after it's defined (and prior to explicitly setting its members) + * using this function. Typical usage is as follows: + * + * QudaMultigridParam mg_param = newQudaMultigridParam(); + */ +QudaMultigridParam newQudaMultigridParam(void); - /** - * A new QudaGaugeSmearParam should always be initialized - * immediately after it's defined (and prior to explicitly setting - * its members) using this function. Typical usage is as follows: - * - * QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam(); - */ - QudaGaugeSmearParam newQudaGaugeSmearParam(void); +/** + * A new QudaEigParam should always be initialized immediately + * after it's defined (and prior to explicitly setting its members) + * using this function. Typical usage is as follows: + * + * QudaEigParam eig_param = newQudaEigParam(); + */ +QudaEigParam newQudaEigParam(void); - /** - * A new QudaBLASParam should always be initialized immediately - * after it's defined (and prior to explicitly setting its members) - * using this function. Typical usage is as follows: - * - * QudaBLASParam blas_param = newQudaBLASParam(); - */ - QudaBLASParam newQudaBLASParam(void); +/** + * A new QudaGaugeObservableParam should always be initialized + * immediately after it's defined (and prior to explicitly setting + * its members) using this function. Typical usage is as follows: + * + * QudaGaugeObservableParam obs_param = newQudaGaugeObservableParam(); + */ +QudaGaugeObservableParam newQudaGaugeObservableParam(void); - /** - * Print the members of QudaGaugeParam. - * @param param The QudaGaugeParam whose elements we are to print.
- */ - void printQudaGaugeParam(QudaGaugeParam *param); +/** + * A new QudaGaugeSmearParam should always be initialized + * immediately after it's defined (and prior to explicitly setting + * its members) using this function. Typical usage is as follows: + * + * QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam(); + */ +QudaGaugeSmearParam newQudaGaugeSmearParam(void); - /** - * Print the members of QudaInvertParam. - * @param param The QudaInvertParam whose elements we are to print. - */ - void printQudaInvertParam(QudaInvertParam *param); +/** + * A new QudaBLASParam should always be initialized immediately + * after it's defined (and prior to explicitly setting its members) + * using this function. Typical usage is as follows: + * + * QudaBLASParam blas_param = newQudaBLASParam(); + */ +QudaBLASParam newQudaBLASParam(void); - /** - * Print the members of QudaMultigridParam. - * @param param The QudaMultigridParam whose elements we are to print. - */ - void printQudaMultigridParam(QudaMultigridParam *param); +/** + * Print the members of QudaGaugeParam. + * @param param The QudaGaugeParam whose elements we are to print. + */ +void printQudaGaugeParam(QudaGaugeParam *param); - /** - * Print the members of QudaEigParam. - * @param param The QudaEigParam whose elements we are to print. - */ - void printQudaEigParam(QudaEigParam *param); +/** + * Print the members of QudaInvertParam. + * @param param The QudaInvertParam whose elements we are to print. + */ +void printQudaInvertParam(QudaInvertParam *param); - /** - * Print the members of QudaGaugeObservableParam. - * @param param The QudaGaugeObservableParam whose elements we are to print. - */ - void printQudaGaugeObservableParam(QudaGaugeObservableParam *param); +/** + * Print the members of QudaMultigridParam. + * @param param The QudaMultigridParam whose elements we are to print. + */ +void printQudaMultigridParam(QudaMultigridParam *param); - /** - * Print the members of QudaBLASParam. 
- * @param param The QudaBLASParam whose elements we are to print. - */ - void printQudaBLASParam(QudaBLASParam *param); +/** + * Print the members of QudaEigParam. + * @param param The QudaEigParam whose elements we are to print. + */ +void printQudaEigParam(QudaEigParam *param); - /** - * Load the gauge field from the host. - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param param Contains all metadata regarding host and device storage - */ - void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param); +/** + * Print the members of QudaGaugeObservableParam. + * @param param The QudaGaugeObservableParam whose elements we are to print. + */ +void printQudaGaugeObservableParam(QudaGaugeObservableParam *param); - /** - * Free QUDA's internal copy of the gauge field. - */ - void freeGaugeQuda(void); +/** + * Print the members of QudaBLASParam. + * @param param The QudaBLASParam whose elements we are to print. + */ +void printQudaBLASParam(QudaBLASParam *param); - /** - * Save the gauge field to the host. - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param param Contains all metadata regarding host and device storage - */ - void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param); +/** + * Load the gauge field from the host. + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param param Contains all metadata regarding host and device storage + */ +void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param); - /** - * Load the clover term and/or the clover inverse from the host. - * Either h_clover or h_clovinv may be set to NULL. 
- * @param h_clover Base pointer to host clover field - * @param h_cloverinv Base pointer to host clover inverse field - * @param inv_param Contains all metadata regarding host and device storage - */ - void loadCloverQuda(void *h_clover, void *h_clovinv, - QudaInvertParam *inv_param); +/** + * Free QUDA's internal copy of the gauge field. + */ +void freeGaugeQuda(void); - /** - * Free QUDA's internal copy of the clover term and/or clover inverse. - */ - void freeCloverQuda(void); +/** + * Save the gauge field to the host. + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param param Contains all metadata regarding host and device storage + */ +void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param); - /** - * Perform the solve, according to the parameters set in param. It - * is assumed that the gauge field has already been loaded via - * loadGaugeQuda(). - * @param h_x Solution spinor field - * @param h_b Source spinor field - * @param param Contains all metadata regarding host and device - * storage and solver parameters - */ - void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V, void *hp_alpha, void *hp_beta, - QudaEigParam *eig_param); +/** + * Load the clover term and/or the clover inverse from the host. + * Either h_clover or h_clovinv may be set to NULL. + * @param h_clover Base pointer to host clover field + * @param h_clovinv Base pointer to host clover inverse field + * @param inv_param Contains all metadata regarding host and device storage + */ +void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param); - /** - * Perform the eigensolve. The problem matrix is defined by the invert param, the - * mode of solution is specified by the eig param. It is assumed that the gauge - * field has already been loaded via loadGaugeQuda().
- * @param h_evecs Array of pointers to application eigenvectors - * @param h_evals Host side eigenvalues - * @param param Contains all metadata regarding the type of solve. - */ - void eigensolveQuda(void **h_evecs, double_complex *h_evals, QudaEigParam *param); +/** + * Free QUDA's internal copy of the clover term and/or clover inverse. + */ +void freeCloverQuda(void); - /** - * Perform the solve, according to the parameters set in param. It - * is assumed that the gauge field has already been loaded via - * loadGaugeQuda(). - * @param h_x Solution spinor field - * @param h_b Source spinor field - * @param param Contains all metadata regarding host and device - * storage and solver parameters - */ - void invertQuda(void *h_x, void *h_b, QudaInvertParam *param); +/** + * Perform the solve, according to the parameters set in param. It + * is assumed that the gauge field has already been loaded via + * loadGaugeQuda(). + * @param h_x Solution spinor field + * @param h_b Source spinor field + * @param param Contains all metadata regarding host and device + * storage and solver parameters + */ +void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V, void *hp_alpha, void *hp_beta, + QudaEigParam *eig_param); - /** - * @brief Perform the solve like @invertQuda but for multiple rhs by spliting the comm grid into - * sub-partitions: each sub-partition invert one or more rhs'. - * The QudaInvertParam object specifies how the solve should be performed on each sub-partition. - * Unlike @invertQuda, the interface also takes the host side gauge as input. The gauge pointer and - * gauge_param are used if for inv_param split_grid[0] * split_grid[1] * split_grid[2] * split_grid[3] - * is larger than 1, in which case gauge field is not required to be loaded beforehand; otherwise - * this interface would just work as @invertQuda, which requires gauge field to be loaded beforehand, - * and the gauge field pointer and gauge_param are not used. 
- * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - */ - void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param); +/** + * Perform the eigensolve. The problem matrix is defined by the invert param, the + * mode of solution is specified by the eig param. It is assumed that the gauge + * field has already been loaded via loadGaugeQuda(). + * @param h_evecs Array of pointers to application eigenvectors + * @param h_evals Host side eigenvalues + * @param param Contains all metadata regarding the type of solve. + */ +void eigensolveQuda(void **h_evecs, double_complex *h_evals, QudaEigParam *param); - /** - * @brief Really the same with @invertMultiSrcQuda but for staggered-style fermions, by accepting pointers - * to fat links and long links. - * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param milc_fatlinks Base pointer to host **fat** gauge field (regardless of dimensionality) - * @param milc_longlinks Base pointer to host **long** gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - */ - void invertMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *milc_fatlinks, - void *milc_longlinks, QudaGaugeParam *gauge_param); +/** + * Perform the solve, according to the parameters set in param. It + * is assumed that the gauge field has already been loaded via + * loadGaugeQuda(). 
+ * @param h_x Solution spinor field + * @param h_b Source spinor field + * @param param Contains all metadata regarding host and device + * storage and solver parameters + */ +void invertQuda(void *h_x, void *h_b, QudaInvertParam *param); - /** - * @brief Really the same with @invertMultiSrcQuda but for clover-style fermions, by accepting pointers - * to direct and inverse clover field pointers. - * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - * @param h_clover Base pointer to the direct clover field - * @param h_clovinv Base pointer to the inverse clover field - */ - void invertMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, - QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv); +/** + * @brief Perform the solve like @invertQuda but for multiple rhs by splitting the comm grid into + * sub-partitions: each sub-partition inverts one or more rhs'. + * The QudaInvertParam object specifies how the solve should be performed on each sub-partition. + * Unlike @invertQuda, the interface also takes the host side gauge as input. The gauge pointer and + * gauge_param are used if for inv_param split_grid[0] * split_grid[1] * split_grid[2] * split_grid[3] + * is larger than 1, in which case gauge field is not required to be loaded beforehand; otherwise + * this interface would just work as @invertQuda, which requires gauge field to be loaded beforehand, + * and the gauge field pointer and gauge_param are not used.
+ * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + */ +void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param); - /** - * Solve for multiple shifts (e.g., masses). - * @param _hp_x Array of solution spinor fields - * @param _hp_b Source spinor fields - * @param param Contains all metadata regarding host and device - * storage and solver parameters - */ - void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param); +/** + * @brief Really the same with @invertMultiSrcQuda but for staggered-style fermions, by accepting pointers + * to fat links and long links. + * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param milc_fatlinks Base pointer to host **fat** gauge field (regardless of dimensionality) + * @param milc_longlinks Base pointer to host **long** gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + */ +void invertMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *milc_fatlinks, + void *milc_longlinks, QudaGaugeParam *gauge_param); - /** - * Setup the multigrid solver, according to the parameters set in param. It - * is assumed that the gauge field has already been loaded via - * loadGaugeQuda(). 
- * @param param Contains all metadata regarding host and device - * storage and solver parameters - */ - void* newMultigridQuda(QudaMultigridParam *param); +/** + * @brief Really the same with @invertMultiSrcQuda but for clover-style fermions, by accepting pointers + * to direct and inverse clover field pointers. + * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + * @param h_clover Base pointer to the direct clover field + * @param h_clovinv Base pointer to the inverse clover field + */ +void invertMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, + QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv); - /** - * @brief Free resources allocated by the multigrid solver - * @param mg_instance Pointer to instance of multigrid_solver - * @param param Contains all metadata regarding host and device - * storage and solver parameters - */ - void destroyMultigridQuda(void *mg_instance); +/** + * Solve for multiple shifts (e.g., masses). + * @param _hp_x Array of solution spinor fields + * @param _hp_b Source spinor fields + * @param param Contains all metadata regarding host and device + * storage and solver parameters + */ +void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param); - /** - * @brief Updates the multigrid preconditioner for the new gauge / clover field - * @param mg_instance Pointer to instance of multigrid_solver - * @param param Contains all metadata regarding host and device - * storage and solver parameters, of note contains a flag specifying whether - * to do a full update or a thin update. 
- */ - void updateMultigridQuda(void *mg_instance, QudaMultigridParam *param); +/** + * Setup the multigrid solver, according to the parameters set in param. It + * is assumed that the gauge field has already been loaded via + * loadGaugeQuda(). + * @param param Contains all metadata regarding host and device + * storage and solver parameters + */ +void *newMultigridQuda(QudaMultigridParam *param); - /** - * @brief Dump the null-space vectors to disk - * @param[in] mg_instance Pointer to the instance of multigrid_solver - * @param[in] param Contains all metadata regarding host and device - * storage and solver parameters (QudaMultigridParam::vec_outfile - * sets the output filename prefix). - */ - void dumpMultigridQuda(void *mg_instance, QudaMultigridParam *param); +/** + * @brief Free resources allocated by the multigrid solver + * @param mg_instance Pointer to instance of multigrid_solver + * @param param Contains all metadata regarding host and device + * storage and solver parameters + */ +void destroyMultigridQuda(void *mg_instance); - /** - * Apply the Dslash operator (D_{eo} or D_{oe}). - * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage - * @param parity The destination parity of the field - */ - void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); +/** + * @brief Updates the multigrid preconditioner for the new gauge / clover field + * @param mg_instance Pointer to instance of multigrid_solver + * @param param Contains all metadata regarding host and device + * storage and solver parameters, of note contains a flag specifying whether + * to do a full update or a thin update. + */ +void updateMultigridQuda(void *mg_instance, QudaMultigridParam *param); - /** - * @brief Perform the solve like @dslashQuda but for multiple rhs by spliting the comm grid into - * sub-partitions: each sub-partition does one or more rhs'. 
- * The QudaInvertParam object specifies how the solve should be performed on each sub-partition. - * Unlike @invertQuda, the interface also takes the host side gauge as - * input - gauge field is not required to be loaded beforehand. - * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param parity Parity to apply dslash on - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - */ - void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, - QudaGaugeParam *gauge_param); - /** - * @brief Really the same with @dslashMultiSrcQuda but for staggered-style fermions, by accepting pointers - * to fat links and long links. - * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param parity Parity to apply dslash on - * @param milc_fatlinks Base pointer to host **fat** gauge field (regardless of dimensionality) - * @param milc_longlinks Base pointer to host **long** gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - */ +/** + * @brief Dump the null-space vectors to disk + * @param[in] mg_instance Pointer to the instance of multigrid_solver + * @param[in] param Contains all metadata regarding host and device + * storage and solver parameters (QudaMultigridParam::vec_outfile + * sets the output filename prefix). 
+ */ +void dumpMultigridQuda(void *mg_instance, QudaMultigridParam *param); - void dslashMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, - void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param); +/** + * Apply the Dslash operator (D_{eo} or D_{oe}). + * @param h_out Result spinor field + * @param h_in Input spinor field + * @param inv_param Contains all metadata regarding host and device + * storage + * @param parity The destination parity of the field + */ +void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); - /** - * @brief Really the same with @dslashMultiSrcQuda but for clover-style fermions, by accepting pointers - * to direct and inverse clover field pointers. - * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param parity Parity to apply dslash on - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - * @param h_clover Base pointer to the direct clover field - * @param h_clovinv Base pointer to the inverse clover field - */ - void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, - QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv); +/** + * @brief Perform the solve like @dslashQuda but for multiple rhs by splitting the comm grid into + * sub-partitions: each sub-partition does one or more rhs'. + * The QudaInvertParam object specifies how the solve should be performed on each sub-partition. + * Unlike @invertQuda, the interface also takes the host side gauge as + * input - gauge field is not required to be loaded beforehand.
+ * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param parity Parity to apply dslash on + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + */ +void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, + QudaGaugeParam *gauge_param); +/** + * @brief Really the same with @dslashMultiSrcQuda but for staggered-style fermions, by accepting pointers + * to fat links and long links. + * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param parity Parity to apply dslash on + * @param milc_fatlinks Base pointer to host **fat** gauge field (regardless of dimensionality) + * @param milc_longlinks Base pointer to host **long** gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + */ - /** - * Apply the clover operator or its inverse. - * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage - * @param parity The source and destination parity of the field - * @param inverse Whether to apply the inverse of the clover term - */ - void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse); +void dslashMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, + void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param); - /** - * Apply the full Dslash matrix, possibly even/odd preconditioned. 
- * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage - */ - void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param); +/** + * @brief Really the same with @dslashMultiSrcQuda but for clover-style fermions, by accepting pointers + * to direct and inverse clover field pointers. + * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param parity Parity to apply dslash on + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + * @param h_clover Base pointer to the direct clover field + * @param h_clovinv Base pointer to the inverse clover field + */ +void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, + QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv); - /** - * Apply M^{\dag}M, possibly even/odd preconditioned. - * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage - */ - void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param); +/** + * Apply the clover operator or its inverse. + * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage + * @param parity The source and destination parity of the field + * @param inverse Whether to apply the inverse of the clover term + */ +void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse); +/** + * Apply the full Dslash matrix, possibly even/odd preconditioned. 
+ * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage + */ +void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param); - /* - * The following routines are temporary additions used by the HISQ - * link-fattening code. - */ +/** + * Apply M^{\dag}M, possibly even/odd preconditioned. + * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage + */ +void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param); - void set_dim(int *); - void pack_ghost(void **cpuLink, void **cpuGhost, int nFace, - QudaPrecision precision); +/* + * The following routines are temporary additions used by the HISQ + * link-fattening code. + */ - void computeKSLinkQuda(void* fatlink, void* longlink, void* ulink, void* inlink, - double *path_coeff, QudaGaugeParam *param); +void set_dim(int *); +void pack_ghost(void **cpuLink, void **cpuGhost, int nFace, QudaPrecision precision); - /** - * Either downloads and sets the resident momentum field, or uploads - * and returns the resident momentum field - * - * @param[in,out] mom The external momentum field - * @param[in] param The parameters of the external field - */ - void momResidentQuda(void *mom, QudaGaugeParam *param); +void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, + QudaGaugeParam *param); - /** - * Compute the gauge force and update the momentum field - * - * @param[in,out] mom The momentum field to be updated - * @param[in] sitelink The gauge field from which we compute the force - * @param[in] input_path_buf[dim][num_paths][path_length] - * @param[in] path_length One less that the number of links in a loop (e.g., 3 for a staple) - * @param[in] loop_coeff Coefficients of the different loops in the Symanzik action - * @param[in] num_paths How many contributions from path_length different 
"staples" - * @param[in] max_length The maximum number of non-zero of links in any path in the action - * @param[in] dt The integration step size (for MILC this is dt*beta/3) - * @param[in] param The parameters of the external fields and the computation settings - */ - int computeGaugeForceQuda(void *mom, void *sitelink, int ***input_path_buf, int *path_length, double *loop_coeff, - int num_paths, int max_length, double dt, QudaGaugeParam *qudaGaugeParam); +/** + * Either downloads and sets the resident momentum field, or uploads + * and returns the resident momentum field + * + * @param[in,out] mom The external momentum field + * @param[in] param The parameters of the external field + */ +void momResidentQuda(void *mom, QudaGaugeParam *param); - /** - * Compute the product of gauge links along a path and add to/overwrite the output field - * - * @param[in,out] out The output field to be updated - * @param[in] sitelink The gauge field from which we compute the products of gauge links - * @param[in] input_path_buf[dim][num_paths][path_length] - * @param[in] path_length One less that the number of links in a loop (e.g., 3 for a staple) - * @param[in] loop_coeff Coefficients of the different loops in the Symanzik action - * @param[in] num_paths How many contributions from path_length different "staples" - * @param[in] max_length The maximum number of non-zero of links in any path in the action - * @param[in] dt The integration step size (for MILC this is dt*beta/3) - * @param[in] param The parameters of the external fields and the computation settings - */ - int computeGaugePathQuda(void *out, void *sitelink, int ***input_path_buf, int *path_length, double *loop_coeff, - int num_paths, int max_length, double dt, QudaGaugeParam *qudaGaugeParam); +/** + * Compute the gauge force and update the momentum field + * + * @param[in,out] mom The momentum field to be updated + * @param[in] sitelink The gauge field from which we compute the force + * @param[in] 
input_path_buf[dim][num_paths][path_length] + * @param[in] path_length One less that the number of links in a loop (e.g., 3 for a staple) + * @param[in] loop_coeff Coefficients of the different loops in the Symanzik action + * @param[in] num_paths How many contributions from path_length different "staples" + * @param[in] max_length The maximum number of non-zero of links in any path in the action + * @param[in] dt The integration step size (for MILC this is dt*beta/3) + * @param[in] param The parameters of the external fields and the computation settings + */ +int computeGaugeForceQuda(void *mom, void *sitelink, int ***input_path_buf, int *path_length, double *loop_coeff, + int num_paths, int max_length, double dt, QudaGaugeParam *qudaGaugeParam); - /** - * Compute the traces of products of gauge links along paths using the resident field - * - * @param[in,out] traces The computed traces - * @param[in] sitelink The gauge field from which we compute the products of gauge links - * @param[in] path_length The number of links in each loop - * @param[in] loop_coeff Multiplicative coefficients for each loop - * @param[in] num_paths Total number of loops - * @param[in] max_length The maximum number of non-zero of links in any path in the action - * @param[in] factor An overall normalization factor - */ - void computeGaugeLoopTraceQuda(double_complex *traces, int **input_path_buf, int *path_length, double *loop_coeff, - int num_paths, int max_length, double factor); +/** + * Compute the product of gauge links along a path and add to/overwrite the output field + * + * @param[in,out] out The output field to be updated + * @param[in] sitelink The gauge field from which we compute the products of gauge links + * @param[in] input_path_buf[dim][num_paths][path_length] + * @param[in] path_length One less that the number of links in a loop (e.g., 3 for a staple) + * @param[in] loop_coeff Coefficients of the different loops in the Symanzik action + * @param[in] num_paths How many 
contributions from path_length different "staples" + * @param[in] max_length The maximum number of non-zero of links in any path in the action + * @param[in] dt The integration step size (for MILC this is dt*beta/3) + * @param[in] param The parameters of the external fields and the computation settings + */ +int computeGaugePathQuda(void *out, void *sitelink, int ***input_path_buf, int *path_length, double *loop_coeff, + int num_paths, int max_length, double dt, QudaGaugeParam *qudaGaugeParam); - /** - * Evolve the gauge field by step size dt, using the momentum field - * I.e., Evalulate U(t+dt) = e(dt pi) U(t) - * - * @param gauge The gauge field to be updated - * @param momentum The momentum field - * @param dt The integration step size step - * @param conj_mom Whether to conjugate the momentum matrix - * @param exact Whether to use an exact exponential or Taylor expand - * @param param The parameters of the external fields and the computation settings - */ - void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, - int conj_mom, int exact, QudaGaugeParam* param); +/** + * Compute the traces of products of gauge links along paths using the resident field + * + * @param[in,out] traces The computed traces + * @param[in] sitelink The gauge field from which we compute the products of gauge links + * @param[in] path_length The number of links in each loop + * @param[in] loop_coeff Multiplicative coefficients for each loop + * @param[in] num_paths Total number of loops + * @param[in] max_length The maximum number of non-zero of links in any path in the action + * @param[in] factor An overall normalization factor + */ +void computeGaugeLoopTraceQuda(double_complex *traces, int **input_path_buf, int *path_length, double *loop_coeff, + int num_paths, int max_length, double factor); - /** - * Apply the staggered phase factors to the gauge field. 
If the - * imaginary chemical potential is non-zero then the phase factor - * exp(imu/T) will be applied to the links in the temporal - * direction. - * - * @param gauge_h The gauge field - * @param param The parameters of the gauge field - */ - void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param); +/** + * Evolve the gauge field by step size dt, using the momentum field + * I.e., evaluate U(t+dt) = e(dt pi) U(t) + * + * @param gauge The gauge field to be updated + * @param momentum The momentum field + * @param dt The integration step size + * @param conj_mom Whether to conjugate the momentum matrix + * @param exact Whether to use an exact exponential or Taylor expand + * @param param The parameters of the external fields and the computation settings + */ +void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param); - /** - * Project the input field on the SU(3) group. If the target - * tolerance is not met, this routine will give a runtime error. - * - * @param gauge_h The gauge field to be updated - * @param tol The tolerance to which we iterate - * @param param The parameters of the gauge field - */ - void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param); +/** + * Apply the staggered phase factors to the gauge field. If the + * imaginary chemical potential is non-zero then the phase factor + * exp(imu/T) will be applied to the links in the temporal + * direction. + * + * @param gauge_h The gauge field + * @param param The parameters of the gauge field + */ +void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param); - /** - * Evaluate the momentum contribution to the Hybrid Monte Carlo - * action. - * - * @param momentum The momentum field - * @param param The parameters of the external fields and the computation settings - * @return momentum action - */ - double momActionQuda(void* momentum, QudaGaugeParam* param); +/** + * Project the input field on the SU(3) group.
If the target + * tolerance is not met, this routine will give a runtime error. + * + * @param gauge_h The gauge field to be updated + * @param tol The tolerance to which we iterate + * @param param The parameters of the gauge field + */ +void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param); - /** - * Allocate a gauge (matrix) field on the device and optionally download a host gauge field. - * - * @param gauge The host gauge field (optional - if set to 0 then the gauge field zeroed) - * @param geometry The geometry of the matrix field to create (1 - scalar, 4 - vector, 6 - tensor) - * @param param The parameters of the external field and the field to be created - * @return Pointer to the gauge field (cast as a void*) - */ - void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param); +/** + * Evaluate the momentum contribution to the Hybrid Monte Carlo + * action. + * + * @param momentum The momentum field + * @param param The parameters of the external fields and the computation settings + * @return momentum action + */ +double momActionQuda(void *momentum, QudaGaugeParam *param); - /** - * Copy the QUDA gauge (matrix) field on the device to the CPU - * - * @param outGauge Pointer to the host gauge field - * @param inGauge Pointer to the device gauge field (QUDA device field) - * @param param The parameters of the host and device fields - */ - void saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param); +/** + * Allocate a gauge (matrix) field on the device and optionally download a host gauge field. 
+ * + * @param gauge The host gauge field (optional - if set to 0 then the gauge field zeroed) + * @param geometry The geometry of the matrix field to create (1 - scalar, 4 - vector, 6 - tensor) + * @param param The parameters of the external field and the field to be created + * @return Pointer to the gauge field (cast as a void*) + */ +void *createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param); - /** - * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. - * - * @param gauge Gauge field to be freed - */ - void destroyGaugeFieldQuda(void* gauge); +/** + * Copy the QUDA gauge (matrix) field on the device to the CPU + * + * @param outGauge Pointer to the host gauge field + * @param inGauge Pointer to the device gauge field (QUDA device field) + * @param param The parameters of the host and device fields + */ +void saveGaugeFieldQuda(void *outGauge, void *inGauge, QudaGaugeParam *param); - /** - * Compute the clover field and its inverse from the resident gauge field. - * - * @param param The parameters of the clover field to create - */ - void createCloverQuda(QudaInvertParam* param); +/** + * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. + * + * @param gauge Gauge field to be freed + */ +void destroyGaugeFieldQuda(void *gauge); - /** - * Compute the clover force contributions in each dimension mu given - * the array of solution fields, and compute the resulting momentum - * field. 
- * - * @param mom Force matrix - * @param dt Integrating step size - * @param x Array of solution vectors - * @param p Array of intermediate vectors - * @param coeff Array of residues for each contribution (multiplied by stepsize) - * @param kappa2 -kappa*kappa parameter - * @param ck -clover_coefficient * kappa / 8 - * @param nvec Number of vectors - * @param multiplicity Number fermions this bilinear reresents - * @param gauge Gauge Field - * @param gauge_param Gauge field meta data - * @param inv_param Dirac and solver meta data - */ - void computeCloverForceQuda(void *mom, double dt, void **x, void **p, double *coeff, double kappa2, double ck, - int nvector, double multiplicity, void *gauge, - QudaGaugeParam *gauge_param, QudaInvertParam *inv_param); +/** + * Compute the clover field and its inverse from the resident gauge field. + * + * @param param The parameters of the clover field to create + */ +void createCloverQuda(QudaInvertParam *param); - /** - * Compute the naive staggered force. All fields must be in the same precision. - * - * @param mom Momentum field - * @param dt Integrating step size - * @param delta Additional scale factor when updating momentum (mom += delta * [force]_TA - * @param gauge Gauge field (at present only supports resident gauge field) - * @param x Array of single-parity solution vectors (at present only supports resident solutions) - * @param gauge_param Gauge field meta data - * @param invert_param Dirac and solver meta data - */ - void computeStaggeredForceQuda(void *mom, double dt, double delta, void *gauge, void **x, QudaGaugeParam *gauge_param, - QudaInvertParam *invert_param); +/** + * Compute the clover force contributions in each dimension mu given + * the array of solution fields, and compute the resulting momentum + * field. 
+ * + * @param mom Force matrix + * @param dt Integrating step size + * @param x Array of solution vectors + * @param p Array of intermediate vectors + * @param coeff Array of residues for each contribution (multiplied by stepsize) + * @param kappa2 -kappa*kappa parameter + * @param ck -clover_coefficient * kappa / 8 + * @param nvector Number of vectors + * @param multiplicity Number of fermions this bilinear represents + * @param gauge Gauge Field + * @param gauge_param Gauge field meta data + * @param inv_param Dirac and solver meta data + */ +void computeCloverForceQuda(void *mom, double dt, void **x, void **p, double *coeff, double kappa2, double ck, + int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, + QudaInvertParam *inv_param); - /** - * Compute the fermion force for the HISQ quark action and integrate the momentum. - * @param momentum The momentum field we are integrating - * @param dt The stepsize used to integrate the momentum - * @param level2_coeff The coefficients for the second level of smearing in the quark action. - * @param fat7_coeff The coefficients for the first level of smearing (fat7) in the quark action. - * @param w_link Unitarized link variables obtained by applying fat7 smearing and unitarization to the original links. - * @param v_link Fat7 link variables. - * @param u_link SU(3) think link variables. - * @param quark The input fermion field. - * @param num The number of quark fields - * @param num_naik The number of naik contributions - * @param coeff The coefficient multiplying the fermion fields in the outer product - * @param param. The field parameters. - */ - void computeHISQForceQuda(void* momentum, - double dt, - const double level2_coeff[6], - const double fat7_coeff[6], - const void* const w_link, - const void* const v_link, - const void* const u_link, - void** quark, - int num, - int num_naik, - double** coeff, - QudaGaugeParam* param); +/** + * Compute the naive staggered force.
All fields must be in the same precision. + * + * @param mom Momentum field + * @param dt Integrating step size + * @param delta Additional scale factor when updating momentum (mom += delta * [force]_TA + * @param gauge Gauge field (at present only supports resident gauge field) + * @param x Array of single-parity solution vectors (at present only supports resident solutions) + * @param gauge_param Gauge field meta data + * @param invert_param Dirac and solver meta data + */ +void computeStaggeredForceQuda(void *mom, double dt, double delta, void *gauge, void **x, QudaGaugeParam *gauge_param, + QudaInvertParam *invert_param); - /** - @brief Generate Gaussian distributed fields and store in the - resident gauge field. We create a Gaussian-distributed su(n) - field and exponentiate it, e.g., U = exp(sigma * H), where H is - the distributed su(n) field and sigma is the width of the - distribution (sigma = 0 results in a free field, and sigma = 1 has - maximum disorder). - - @param seed The seed used for the RNG - @param sigma Width of Gaussian distrubution - */ - void gaussGaugeQuda(unsigned long long seed, double sigma); +/** + * Compute the fermion force for the HISQ quark action and integrate the momentum. + * @param momentum The momentum field we are integrating + * @param dt The stepsize used to integrate the momentum + * @param level2_coeff The coefficients for the second level of smearing in the quark action. + * @param fat7_coeff The coefficients for the first level of smearing (fat7) in the quark action. + * @param w_link Unitarized link variables obtained by applying fat7 smearing and unitarization to the original links. + * @param v_link Fat7 link variables. + * @param u_link SU(3) think link variables. + * @param quark The input fermion field. + * @param num The number of quark fields + * @param num_naik The number of naik contributions + * @param coeff The coefficient multiplying the fermion fields in the outer product + * @param param. 
The field parameters. + */ +void computeHISQForceQuda(void *momentum, double dt, const double level2_coeff[6], const double fat7_coeff[6], + const void *const w_link, const void *const v_link, const void *const u_link, void **quark, + int num, int num_naik, double **coeff, QudaGaugeParam *param); - /** - * @brief Generate Gaussian distributed fields and store in the - * resident momentum field. We create a Gaussian-distributed su(n) - * field, e.g., sigma * H, where H is the distributed su(n) field - * and sigma is the width of the distribution (sigma = 0 results - * in a free field, and sigma = 1 has maximum disorder). - * - * @param seed The seed used for the RNG - * @param sigma Width of Gaussian distrubution - */ - void gaussMomQuda(unsigned long long seed, double sigma); +/** + @brief Generate Gaussian distributed fields and store in the + resident gauge field. We create a Gaussian-distributed su(n) + field and exponentiate it, e.g., U = exp(sigma * H), where H is + the distributed su(n) field and sigma is the width of the + distribution (sigma = 0 results in a free field, and sigma = 1 has + maximum disorder). + + @param seed The seed used for the RNG + @param sigma Width of Gaussian distrubution +*/ +void gaussGaugeQuda(unsigned long long seed, double sigma); - /** - * Computes the total, spatial and temporal plaquette averages of the loaded gauge configuration. - * @param[out] Array for storing the averages (total, spatial, temporal) - */ - void plaqQuda(double plaq[3]); +/** + * @brief Generate Gaussian distributed fields and store in the + * resident momentum field. We create a Gaussian-distributed su(n) + * field, e.g., sigma * H, where H is the distributed su(n) field + * and sigma is the width of the distribution (sigma = 0 results + * in a free field, and sigma = 1 has maximum disorder). 
+ * + * @param seed The seed used for the RNG + * @param sigma Width of Gaussian distrubution + */ +void gaussMomQuda(unsigned long long seed, double sigma); - /** - @brief Computes the trace of the Polyakov loop of the current resident field - in a given direction. +/** + * Computes the total, spatial and temporal plaquette averages of the loaded gauge configuration. + * @param[out] Array for storing the averages (total, spatial, temporal) + */ +void plaqQuda(double plaq[3]); - @param[out] ploop Trace of the Polyakov loop in direction dir - @param[in] dir Direction of Polyakov loop - */ - void polyakovLoopQuda(double ploop[2], int dir); +/** + @brief Computes the trace of the Polyakov loop of the current resident field + in a given direction. - /** - * Performs a deep copy from the internal extendedGaugeResident field. - * @param Pointer to externally allocated GaugeField - */ - void copyExtendedResidentGaugeQuda(void *resident_gauge); + @param[out] ploop Trace of the Polyakov loop in direction dir + @param[in] dir Direction of Polyakov loop +*/ +void polyakovLoopQuda(double ploop[2], int dir); - /** - * Performs Wuppertal smearing on a given spinor using the gauge field - * gaugeSmeared, if it exist, or gaugePrecise if no smeared field is present. - * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage and operator which will be applied to the spinor - * @param n_steps Number of steps to apply. - * @param alpha Alpha coefficient for Wuppertal smearing. - */ - void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *param, unsigned int n_steps, double alpha); +/** + * Performs a deep copy from the internal extendedGaugeResident field. 
+ * @param Pointer to externally allocated GaugeField + */ +void copyExtendedResidentGaugeQuda(void *resident_gauge); - /** - * Performs APE, Stout, or Over Imroved STOUT smearing on gaugePrecise and stores it in gaugeSmeared - * @param[in] smear_param Parameter struct that defines the computation parameters - * @param[in,out] obs_param Parameter struct that defines which - * observables we are making and the resulting observables. - */ - void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param); +/** + * Performs Wuppertal smearing on a given spinor using the gauge field + * gaugeSmeared, if it exist, or gaugePrecise if no smeared field is present. + * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage and operator which will be applied to the spinor + * @param n_steps Number of steps to apply. + * @param alpha Alpha coefficient for Wuppertal smearing. + */ +void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *param, unsigned int n_steps, double alpha); - /** - * Performs Wilson Flow on gaugePrecise and stores it in gaugeSmeared - * @param[in] smear_param Parameter struct that defines the computation parameters - * @param[in,out] obs_param Parameter struct that defines which - * observables we are making and the resulting observables. - */ - void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param); +/** + * Performs APE, Stout, or Over Imroved STOUT smearing on gaugePrecise and stores it in gaugeSmeared + * @param[in] smear_param Parameter struct that defines the computation parameters + * @param[in,out] obs_param Parameter struct that defines which + * observables we are making and the resulting observables. + */ +void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param); - /** - * @brief Calculates a variety of gauge-field observables. 
If a - * smeared gauge field is presently loaded (in gaugeSmeared) the - * observables are computed on this, else the resident gauge field - * will be used. - * @param[in,out] param Parameter struct that defines which - * observables we are making and the resulting observables. - */ - void gaugeObservablesQuda(QudaGaugeObservableParam *param); +/** + * Performs Wilson Flow on gaugePrecise and stores it in gaugeSmeared + * @param[in] smear_param Parameter struct that defines the computation parameters + * @param[in,out] obs_param Parameter struct that defines which + * observables we are making and the resulting observables. + */ +void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param); - /** - * Public function to perform color contractions of the host spinors x and y. - * @param[in] x pointer to host data - * @param[in] y pointer to host data - * @param[out] result pointer to the 16 spin projections per lattice site - * @param[in] cType Which type of contraction (open, degrand-rossi, etc) - * @param[in] param meta data for construction of ColorSpinorFields. - * @param[in] X spacetime data for construction of ColorSpinorFields. - */ - void contractQuda(const void *x, const void *y, void *result, const QudaContractType cType, QudaInvertParam *param, - const int *X); +/** + * @brief Calculates a variety of gauge-field observables. If a + * smeared gauge field is presently loaded (in gaugeSmeared) the + * observables are computed on this, else the resident gauge field + * will be used. + * @param[in,out] param Parameter struct that defines which + * observables we are making and the resulting observables. + */ +void gaugeObservablesQuda(QudaGaugeObservableParam *param); - /** - * @brief Gauge fixing with overrelaxation with support for single and multi GPU. 
- * @param[in,out] gauge, gauge field to be fixed - * @param[in] gauge_dir, 3 for Coulomb gauge fixing, other for Landau gauge fixing - * @param[in] Nsteps, maximum number of steps to perform gauge fixing - * @param[in] verbose_interval, print gauge fixing info when iteration count is a multiple of this - * @param[in] relax_boost, gauge fixing parameter of the overrelaxation method, most common value is 1.5 or 1.7. - * @param[in] tolerance, torelance value to stop the method, if this value is zero then the method stops when - * iteration reachs the maximum number of steps defined by Nsteps - * @param[in] reunit_interval, reunitarize gauge field when iteration count is a multiple of this - * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value - * @param[in] param The parameters of the external fields and the computation settings - * @param[out] timeinfo - */ - int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, - const unsigned int verbose_interval, const double relax_boost, const double tolerance, - const unsigned int reunit_interval, const unsigned int stopWtheta, - QudaGaugeParam *param, double *timeinfo); +/** + * Public function to perform color contractions of the host spinors x and y. + * @param[in] x pointer to host data + * @param[in] y pointer to host data + * @param[out] result pointer to the 16 spin projections per lattice site + * @param[in] cType Which type of contraction (open, degrand-rossi, etc) + * @param[in] param meta data for construction of ColorSpinorFields. + * @param[in] X spacetime data for construction of ColorSpinorFields. + */ +void contractQuda(const void *x, const void *y, void *result, const QudaContractType cType, QudaInvertParam *param, + const int *X); - /** - * @brief Gauge fixing with Steepest descent method with FFTs with support for single GPU only. 
- * @param[in,out] gauge, gauge field to be fixed - * @param[in] gauge_dir, 3 for Coulomb gauge fixing, other for Landau gauge fixing - * @param[in] Nsteps, maximum number of steps to perform gauge fixing - * @param[in] verbose_interval, print gauge fixing info when iteration count is a multiple of this - * @param[in] alpha, gauge fixing parameter of the method, most common value is 0.08 - * @param[in] autotune, 1 to autotune the method, i.e., if the Fg inverts its tendency we decrease the alpha value - * @param[in] tolerance, torelance value to stop the method, if this value is zero then the method stops when - * iteration reachs the maximum number of steps defined by Nsteps - * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value - * @param[in] param The parameters of the external fields and the computation settings - * @param[out] timeinfo - */ - int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, - const unsigned int verbose_interval, const double alpha, const unsigned int autotune, - const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, - double *timeinfo); +/** + * @brief Gauge fixing with overrelaxation with support for single and multi GPU. + * @param[in,out] gauge, gauge field to be fixed + * @param[in] gauge_dir, 3 for Coulomb gauge fixing, other for Landau gauge fixing + * @param[in] Nsteps, maximum number of steps to perform gauge fixing + * @param[in] verbose_interval, print gauge fixing info when iteration count is a multiple of this + * @param[in] relax_boost, gauge fixing parameter of the overrelaxation method, most common value is 1.5 or 1.7. 
+ * @param[in] tolerance, torelance value to stop the method, if this value is zero then the method stops when + * iteration reachs the maximum number of steps defined by Nsteps + * @param[in] reunit_interval, reunitarize gauge field when iteration count is a multiple of this + * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value + * @param[in] param The parameters of the external fields and the computation settings + * @param[out] timeinfo + */ +int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, + const unsigned int verbose_interval, const double relax_boost, const double tolerance, + const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, + double *timeinfo); - /** - * @brief Strided Batched GEMM - * @param[in] arrayA The array containing the A matrix data - * @param[in] arrayB The array containing the B matrix data - * @param[in] arrayC The array containing the C matrix data - * @param[in] native Boolean to use either the native or generic version - * @param[in] param The data defining the problem execution. - */ - void blasGEMMQuda(void *arrayA, void *arrayB, void *arrayC, QudaBoolean native, QudaBLASParam *param); +/** + * @brief Gauge fixing with Steepest descent method with FFTs with support for single GPU only. 
+ * @param[in,out] gauge, gauge field to be fixed + * @param[in] gauge_dir, 3 for Coulomb gauge fixing, other for Landau gauge fixing + * @param[in] Nsteps, maximum number of steps to perform gauge fixing + * @param[in] verbose_interval, print gauge fixing info when iteration count is a multiple of this + * @param[in] alpha, gauge fixing parameter of the method, most common value is 0.08 + * @param[in] autotune, 1 to autotune the method, i.e., if the Fg inverts its tendency we decrease the alpha value + * @param[in] tolerance, torelance value to stop the method, if this value is zero then the method stops when + * iteration reachs the maximum number of steps defined by Nsteps + * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value + * @param[in] param The parameters of the external fields and the computation settings + * @param[out] timeinfo + */ +int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, + const unsigned int verbose_interval, const double alpha, const unsigned int autotune, + const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, + double *timeinfo); - /** - * @brief Strided Batched in-place matrix inversion via LU - * @param[in] Ainv The array containing the A inverse matrix data - * @param[in] A The array containing the A matrix data - * @param[in] use_native Boolean to use either the native or generic version - * @param[in] param The data defining the problem execution. - */ - void blasLUInvQuda(void *Ainv, void *A, QudaBoolean use_native, QudaBLASParam *param); +/** + * @brief Strided Batched GEMM + * @param[in] arrayA The array containing the A matrix data + * @param[in] arrayB The array containing the B matrix data + * @param[in] arrayC The array containing the C matrix data + * @param[in] native Boolean to use either the native or generic version + * @param[in] param The data defining the problem execution. 
+ */ +void blasGEMMQuda(void *arrayA, void *arrayB, void *arrayC, QudaBoolean native, QudaBLASParam *param); - /** - * @brief Flush the chronological history for the given index - * @param[in] index Index for which we are flushing - */ - void flushChronoQuda(int index); +/** + * @brief Strided Batched in-place matrix inversion via LU + * @param[in] Ainv The array containing the A inverse matrix data + * @param[in] A The array containing the A matrix data + * @param[in] use_native Boolean to use either the native or generic version + * @param[in] param The data defining the problem execution. + */ +void blasLUInvQuda(void *Ainv, void *A, QudaBoolean use_native, QudaBLASParam *param); +/** + * @brief Flush the chronological history for the given index + * @param[in] index Index for which we are flushing + */ +void flushChronoQuda(int index); - /** - * Create deflation solver resources. - * - **/ +/** + * Create deflation solver resources. + * + **/ - void* newDeflationQuda(QudaEigParam *param); +void *newDeflationQuda(QudaEigParam *param); - /** - * Free resources allocated by the deflated solver - */ - void destroyDeflationQuda(void *df_instance); +/** + * Free resources allocated by the deflated solver + */ +void destroyDeflationQuda(void *df_instance); - void setMPICommHandleQuda(void *mycomm); +void setMPICommHandleQuda(void *mycomm); #ifdef __cplusplus } @@ -1722,4 +1709,3 @@ extern "C" { #undef double_complex /* #include */ - diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 6877e28476..7e6db7a9f5 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -163,6 +163,8 @@ void openQCD_qudaLoadGaugeField(int external_precision, int quda_precision, open void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge); +void openQCD_gaugeloadsave(int precision, void *gauge); + // int openQCD_ipt(int iy); /** diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 
3da20c4d87..ae33a44602 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -39,8 +39,8 @@ #include #include -#define MAX(a,b) ((a)>(b)? (a):(b)) -#define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define TDIFF(a, b) (b.tv_sec - a.tv_sec + 0.000001 * (b.tv_usec - a.tv_usec)) // define newQudaGaugeParam() and newQudaInvertParam() #define INIT_PARAM @@ -70,7 +70,6 @@ static bool redundant_comms = false; #include - cudaGaugeField *gaugePrecise = nullptr; cudaGaugeField *gaugeSloppy = nullptr; cudaGaugeField *gaugePrecondition = nullptr; @@ -149,28 +148,28 @@ static TimeProfile profileGaugeForce("computeGaugeForceQuda"); //!< Profiler for computeGaugePathQuda static TimeProfile profileGaugePath("computeGaugePathQuda"); -//!> target) { - target_list.push_back(target); - if (target_stream.peek() == ',') target_stream.ignore(); - } - - if (target_list.size() > 0) { - std::sort(target_list.begin(), target_list.end()); - target_list.erase( unique( target_list.begin(), target_list.end() ), target_list.end() ); - warningQuda("Targeted profiling enabled for %lu functions\n", target_list.size()); - enable = true; - } - } - - char* donotprofile_env = getenv("QUDA_DO_NOT_PROFILE"); // disable profiling of QUDA parts - if (donotprofile_env && (!(strcmp(donotprofile_env, "0") == 0))) { - do_not_profile_quda=true; + while (target_stream >> target) { + target_list.push_back(target); + if (target_stream.peek() == ',') target_stream.ignore(); + } + + if (target_list.size() > 0) { + std::sort(target_list.begin(), target_list.end()); + target_list.erase(unique(target_list.begin(), target_list.end()), target_list.end()); + warningQuda("Targeted profiling enabled for %lu functions\n", target_list.size()); + enable = true; + } + } + + char *donotprofile_env = getenv("QUDA_DO_NOT_PROFILE"); // disable profiling of QUDA parts + if (donotprofile_env && (!(strcmp(donotprofile_env, "0") == 0))) { + 
do_not_profile_quda = true; printfQuda("Disabling profiling in QUDA\n"); } init = true; @@ -249,7 +248,7 @@ static void profilerStart(const char *f) static int target_count = 0; static unsigned int i = 0; - if (do_not_profile_quda){ + if (do_not_profile_quda) { device::profile::stop(); printfQuda("Stopping profiling in QUDA\n"); } else { @@ -259,12 +258,13 @@ static void profilerStart(const char *f) printfQuda("Starting profiling for %s\n", f); device::profile::start(); i++; // advance to next target + } } } } -} -static void profilerStop(const char *f) { +static void profilerStop(const char *f) +{ if (do_not_profile_quda) { device::profile::start(); } else { @@ -277,8 +277,8 @@ static void profilerStop(const char *f) { } } - -namespace quda { +namespace quda +{ void printLaunchTimer(); } @@ -289,7 +289,6 @@ void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfil setOutputFile(outfile); } - typedef struct { int ndim; int dims[QUDA_MAX_DIM]; @@ -303,9 +302,7 @@ static int lex_rank_from_coords(const int *coords, void *fdata) auto *md = static_cast(fdata); int rank = coords[0]; - for (int i = 1; i < md->ndim; i++) { - rank = md->dims[i] * rank + coords[i]; - } + for (int i = 1; i < md->ndim; i++) { rank = md->dims[i] * rank + coords[i]; } return rank; } @@ -340,21 +337,17 @@ void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata { if (comms_initialized) return; - if (nDim != 4) { - errorQuda("Number of communication grid dimensions must be 4"); - } + if (nDim != 4) { errorQuda("Number of communication grid dimensions must be 4"); } LexMapData map_data; if (!func) { #if QMP_COMMS if (QMP_logical_topology_is_declared()) { - if (QMP_get_logical_number_of_dimensions() != 4) { - errorQuda("QMP logical topology must have 4 dimensions"); - } - for (int i=0; i= QUDA_SUMMARIZE) { #ifdef GITVERSION - printfQuda("QUDA %s (git %s)\n",quda_version.c_str(),gitversion); + printfQuda("QUDA %s (git %s)\n", quda_version.c_str(), 
gitversion); #else - printfQuda("QUDA %s\n",quda_version.c_str()); + printfQuda("QUDA %s\n", quda_version.c_str()); #endif } @@ -452,7 +441,7 @@ void initQudaDevice(int dev) { // determine if we will do CPU or GPU data reordering (default is GPU) char *reorder_str = getenv("QUDA_REORDER_LOCATION"); - if (!reorder_str || (strcmp(reorder_str,"CPU") && strcmp(reorder_str,"cpu")) ) { + if (!reorder_str || (strcmp(reorder_str, "CPU") && strcmp(reorder_str, "cpu"))) { warningQuda("Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)"); reorder_location_set(QUDA_CUDA_FIELD_LOCATION); } else { @@ -491,7 +480,7 @@ void initQudaMemory() num_failures_h = static_cast(mapped_malloc(sizeof(int))); num_failures_d = static_cast(get_mapped_device_pointer(num_failures_h)); - for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); + for (int d = 0; d < 4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); profileInit.TPSTOP(QUDA_PROFILE_INIT); profileInit.TPSTOP(QUDA_PROFILE_TOTAL); @@ -499,7 +488,7 @@ void initQudaMemory() void updateR() { - for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); + for (int d = 0; d < 4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); } void initQuda(int dev) @@ -534,10 +523,10 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; GaugeField *in = (param->location == QUDA_CPU_FIELD_LOCATION) ? 
- static_cast(new cpuGaugeField(gauge_param)) : - static_cast(new cudaGaugeField(gauge_param)); + static_cast(new cpuGaugeField(gauge_param)) : + static_cast(new cudaGaugeField(gauge_param)); - if (in->Order() == QUDA_BQCD_GAUGE_ORDER) { + if (in->Order() == QUDA_BQCD_GAUGE_ORDER) { static size_t checksum = SIZE_MAX; size_t in_checksum = in->checksum(true); if (in_checksum == checksum) { @@ -555,63 +544,62 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) // free any current gauge field before new allocations to reduce memory overhead switch (param->type) { - case QUDA_WILSON_LINKS: - if (gaugeRefinement != gaugeSloppy && gaugeRefinement != gaugeEigensolver && gaugeRefinement) - delete gaugeRefinement; + case QUDA_WILSON_LINKS: + if (gaugeRefinement != gaugeSloppy && gaugeRefinement != gaugeEigensolver && gaugeRefinement) + delete gaugeRefinement; - if (gaugePrecondition != gaugeSloppy && gaugePrecondition != gaugeEigensolver && gaugePrecondition != gaugePrecise - && gaugePrecondition) - delete gaugePrecondition; + if (gaugePrecondition != gaugeSloppy && gaugePrecondition != gaugeEigensolver && gaugePrecondition != gaugePrecise + && gaugePrecondition) + delete gaugePrecondition; - if (gaugeEigensolver != gaugeSloppy && gaugeEigensolver != gaugePrecise && gaugeEigensolver != gaugePrecondition - && gaugeEigensolver) - delete gaugeEigensolver; + if (gaugeEigensolver != gaugeSloppy && gaugeEigensolver != gaugePrecise && gaugeEigensolver != gaugePrecondition + && gaugeEigensolver) + delete gaugeEigensolver; - if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy; + if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy; - if (gaugePrecise && !param->use_resident_gauge) delete gaugePrecise; + if (gaugePrecise && !param->use_resident_gauge) delete gaugePrecise; - break; - case QUDA_ASQTAD_FAT_LINKS: - if (gaugeFatRefinement != gaugeFatSloppy && gaugeFatRefinement != gaugeFatEigensolver && gaugeFatRefinement) - delete gaugeFatRefinement; 
+ break; + case QUDA_ASQTAD_FAT_LINKS: + if (gaugeFatRefinement != gaugeFatSloppy && gaugeFatRefinement != gaugeFatEigensolver && gaugeFatRefinement) + delete gaugeFatRefinement; - if (gaugeFatPrecondition != gaugeFatSloppy && gaugeFatPrecondition != gaugeFatEigensolver - && gaugeFatPrecondition != gaugeFatPrecise && gaugeFatPrecondition) - delete gaugeFatPrecondition; + if (gaugeFatPrecondition != gaugeFatSloppy && gaugeFatPrecondition != gaugeFatEigensolver + && gaugeFatPrecondition != gaugeFatPrecise && gaugeFatPrecondition) + delete gaugeFatPrecondition; - if (gaugeFatEigensolver != gaugeFatSloppy && gaugeFatEigensolver != gaugeFatPrecise - && gaugeFatEigensolver != gaugeFatPrecondition && gaugeFatEigensolver) - delete gaugeFatEigensolver; + if (gaugeFatEigensolver != gaugeFatSloppy && gaugeFatEigensolver != gaugeFatPrecise + && gaugeFatEigensolver != gaugeFatPrecondition && gaugeFatEigensolver) + delete gaugeFatEigensolver; - if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy; + if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy; - if (gaugeFatPrecise && !param->use_resident_gauge) delete gaugeFatPrecise; + if (gaugeFatPrecise && !param->use_resident_gauge) delete gaugeFatPrecise; - break; - case QUDA_ASQTAD_LONG_LINKS: + break; + case QUDA_ASQTAD_LONG_LINKS: - if (gaugeLongRefinement != gaugeLongSloppy && gaugeLongRefinement != gaugeLongEigensolver && gaugeLongRefinement) - delete gaugeLongRefinement; + if (gaugeLongRefinement != gaugeLongSloppy && gaugeLongRefinement != gaugeLongEigensolver && gaugeLongRefinement) + delete gaugeLongRefinement; - if (gaugeLongPrecondition != gaugeLongSloppy && gaugeLongPrecondition != gaugeLongEigensolver - && gaugeLongPrecondition != gaugeLongPrecise && gaugeLongPrecondition) - delete gaugeLongPrecondition; + if (gaugeLongPrecondition != gaugeLongSloppy && gaugeLongPrecondition != gaugeLongEigensolver + && gaugeLongPrecondition != gaugeLongPrecise && 
gaugeLongPrecondition) + delete gaugeLongPrecondition; - if (gaugeLongEigensolver != gaugeLongSloppy && gaugeLongEigensolver != gaugeLongPrecise - && gaugeLongEigensolver != gaugeLongPrecondition && gaugeLongEigensolver) - delete gaugeLongEigensolver; + if (gaugeLongEigensolver != gaugeLongSloppy && gaugeLongEigensolver != gaugeLongPrecise + && gaugeLongEigensolver != gaugeLongPrecondition && gaugeLongEigensolver) + delete gaugeLongEigensolver; - if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy; + if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy; - if (gaugeLongPrecise) delete gaugeLongPrecise; + if (gaugeLongPrecise) delete gaugeLongPrecise; - break; - case QUDA_SMEARED_LINKS: - if (gaugeSmeared) delete gaugeSmeared; - break; - default: - errorQuda("Invalid gauge type %d", param->type); + break; + case QUDA_SMEARED_LINKS: + if (gaugeSmeared) delete gaugeSmeared; + break; + default: errorQuda("Invalid gauge type %d", param->type); } // if not preserving then copy the gauge field passed in @@ -628,7 +616,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) precise = new cudaGaugeField(gauge_param); if (param->use_resident_gauge) { - if(gaugePrecise == nullptr) errorQuda("No resident gauge field"); + if (gaugePrecise == nullptr) errorQuda("No resident gauge field"); // copy rather than point at to ensure that the padded region is filled in precise->copy(*gaugePrecise); precise->exchangeGhost(); @@ -715,49 +703,48 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) profileGauge.TPSTOP(QUDA_PROFILE_COMPUTE); // create an extended preconditioning field - cudaGaugeField* extended = nullptr; - if (param->overlap){ + cudaGaugeField *extended = nullptr; + if (param->overlap) { lat_dim_t R; // domain-overlap widths in different directions - for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i); + for (int i = 0; i < 4; ++i) R[i] = param->overlap * commDimPartitioned(i); 
extended = createExtendedGauge(*precondition, R, profileGauge); } switch (param->type) { - case QUDA_WILSON_LINKS: - gaugePrecise = precise; - gaugeSloppy = sloppy; - gaugePrecondition = precondition; - gaugeRefinement = refinement; - gaugeEigensolver = eigensolver; - - if(param->overlap) gaugeExtended = extended; - break; - case QUDA_ASQTAD_FAT_LINKS: - gaugeFatPrecise = precise; - gaugeFatSloppy = sloppy; - gaugeFatPrecondition = precondition; - gaugeFatRefinement = refinement; - gaugeFatEigensolver = eigensolver; - - if(param->overlap){ - if(gaugeFatExtended) errorQuda("Extended gauge fat field already allocated"); - gaugeFatExtended = extended; - } - break; - case QUDA_ASQTAD_LONG_LINKS: - gaugeLongPrecise = precise; - gaugeLongSloppy = sloppy; - gaugeLongPrecondition = precondition; - gaugeLongRefinement = refinement; - gaugeLongEigensolver = eigensolver; - - if(param->overlap){ - if(gaugeLongExtended) errorQuda("Extended gauge long field already allocated"); - gaugeLongExtended = extended; - } - break; - default: - errorQuda("Invalid gauge type %d", param->type); + case QUDA_WILSON_LINKS: + gaugePrecise = precise; + gaugeSloppy = sloppy; + gaugePrecondition = precondition; + gaugeRefinement = refinement; + gaugeEigensolver = eigensolver; + + if (param->overlap) gaugeExtended = extended; + break; + case QUDA_ASQTAD_FAT_LINKS: + gaugeFatPrecise = precise; + gaugeFatSloppy = sloppy; + gaugeFatPrecondition = precondition; + gaugeFatRefinement = refinement; + gaugeFatEigensolver = eigensolver; + + if (param->overlap) { + if (gaugeFatExtended) errorQuda("Extended gauge fat field already allocated"); + gaugeFatExtended = extended; + } + break; + case QUDA_ASQTAD_LONG_LINKS: + gaugeLongPrecise = precise; + gaugeLongSloppy = sloppy; + gaugeLongPrecondition = precondition; + gaugeLongRefinement = refinement; + gaugeLongEigensolver = eigensolver; + + if (param->overlap) { + if (gaugeLongExtended) errorQuda("Extended gauge long field already allocated"); + 
gaugeLongExtended = extended; + } + break; + default: errorQuda("Invalid gauge type %d", param->type); } profileGauge.TPSTART(QUDA_PROFILE_FREE); @@ -1016,7 +1003,6 @@ void loadSloppyCloverQuda(const QudaPrecision *prec) cloverEigensolver->copy(*cloverPrecise); } } - } // just free the sloppy fields used in mixed-precision solvers @@ -1365,8 +1351,7 @@ void freeCloverQuda(void) void flushChronoQuda(int i) { - if (i >= QUDA_MAX_CHRONO) - errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO); + if (i >= QUDA_MAX_CHRONO) errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO); chronoResident[i].clear(); } @@ -1384,7 +1369,7 @@ void endQuda(void) solutionResident.clear(); - if(momResident) delete momResident; + if (momResident) delete momResident; LatticeField::freeGhostBuffer(); ColorSpinorField::freeGhostBuffer(); @@ -1461,23 +1446,17 @@ void endQuda(void) device::destroy(); } - -namespace quda { +namespace quda +{ void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) { double kappa = inv_param->kappa; - if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { - kappa *= gaugePrecise->Anisotropy(); - } + if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { kappa *= gaugePrecise->Anisotropy(); } switch (inv_param->dslash_type) { - case QUDA_WILSON_DSLASH: - diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC; - break; - case QUDA_CLOVER_WILSON_DSLASH: - diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC; - break; + case QUDA_WILSON_DSLASH: diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC; break; + case QUDA_CLOVER_WILSON_DSLASH: diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC; break; case QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH: diracParam.type = pc ? 
QUDA_CLOVER_HASENBUSCH_TWISTPC_DIRAC : QUDA_CLOVER_HASENBUSCH_TWIST_DIRAC; break; @@ -1508,7 +1487,7 @@ namespace quda { break; case QUDA_MOBIUS_DWF_DSLASH: if (inv_param->Ls > QUDA_MAX_DWF_LS) - errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS); + errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS); diracParam.type = pc ? QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC : QUDA_MOBIUS_DOMAIN_WALL_DIRAC; diracParam.Ls = inv_param->Ls; if (sizeof(Complex) != sizeof(double _Complex)) { @@ -1520,48 +1499,41 @@ namespace quda { printfQuda("Printing b_5 and c_5 values\n"); for (int i = 0; i < diracParam.Ls; i++) { printfQuda("fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.b_5[i].real(), - diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag()); + diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag()); // printfQuda("fromQUDA inv_param: b5[%d] = %f %f c5[%d] = %f %f\n", i, inv_param->b_5[i], i, // inv_param->c_5[i] ); printfQuda("fromQUDA creal: b5[%d] = %f %f c5[%d] = %f %f \n", i, // creal(inv_param->b_5[i]), cimag(inv_param->b_5[i]), i, creal(inv_param->c_5[i]), cimag(inv_param->c_5[i]) ); } } break; - case QUDA_STAGGERED_DSLASH: - diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC; - break; - case QUDA_ASQTAD_DSLASH: - diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC; - break; + case QUDA_STAGGERED_DSLASH: diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC; break; + case QUDA_ASQTAD_DSLASH: diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC; break; case QUDA_TWISTED_MASS_DSLASH: diracParam.type = pc ? 
QUDA_TWISTED_MASSPC_DIRAC : QUDA_TWISTED_MASS_DIRAC; if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) { - diracParam.Ls = 1; - diracParam.epsilon = 0.0; + diracParam.Ls = 1; + diracParam.epsilon = 0.0; } else { - diracParam.Ls = 2; - diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; + diracParam.Ls = 2; + diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; } break; case QUDA_TWISTED_CLOVER_DSLASH: diracParam.type = pc ? QUDA_TWISTED_CLOVERPC_DIRAC : QUDA_TWISTED_CLOVER_DIRAC; - if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) { - diracParam.Ls = 1; - diracParam.epsilon = 0.0; + if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) { + diracParam.Ls = 1; + diracParam.epsilon = 0.0; } else { - diracParam.Ls = 2; - diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; + diracParam.Ls = 2; + diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; } break; case QUDA_LAPLACE_DSLASH: diracParam.type = pc ? 
QUDA_GAUGE_LAPLACEPC_DIRAC : QUDA_GAUGE_LAPLACE_DIRAC; diracParam.laplace3D = inv_param->laplace3D; break; - case QUDA_COVDEV_DSLASH: - diracParam.type = QUDA_GAUGE_COVDEV_DIRAC; - break; - default: - errorQuda("Unsupported dslash_type %d", inv_param->dslash_type); + case QUDA_COVDEV_DSLASH: diracParam.type = QUDA_GAUGE_COVDEV_DIRAC; break; + default: errorQuda("Unsupported dslash_type %d", inv_param->dslash_type); } diracParam.matpcType = inv_param->matpc_type; @@ -1576,7 +1548,7 @@ namespace quda { diracParam.mu = inv_param->mu; diracParam.tm_rho = inv_param->tm_rho; - for (int i=0; i<4; i++) diracParam.commDim[i] = 1; // comms are always on + for (int i = 0; i < 4; i++) diracParam.commDim[i] = 1; // comms are always on if (diracParam.gauge->Precision() != inv_param->cuda_prec) errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(), @@ -1585,7 +1557,6 @@ namespace quda { diracParam.use_mobius_fused_kernel = inv_param->use_mobius_fused_kernel; } - void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) { setDiracParam(diracParam, inv_param, pc); @@ -1595,8 +1566,8 @@ namespace quda { diracParam.longGauge = gaugeLongSloppy; diracParam.clover = cloverSloppy; - for (int i=0; i<4; i++) { - diracParam.commDim[i] = 1; // comms are always on + for (int i = 0; i < 4; i++) { + diracParam.commDim[i] = 1; // comms are always on } if (diracParam.gauge->Precision() != inv_param->cuda_prec_sloppy) @@ -1613,8 +1584,8 @@ namespace quda { diracParam.longGauge = gaugeLongRefinement; diracParam.clover = cloverRefinement; - for (int i=0; i<4; i++) { - diracParam.commDim[i] = 1; // comms are always on + for (int i = 0; i < 4; i++) { + diracParam.commDim[i] = 1; // comms are always on } if (diracParam.gauge->Precision() != inv_param->cuda_prec_refinement_sloppy) @@ -1638,15 +1609,13 @@ namespace quda { } diracParam.clover = cloverPrecondition; - for (int i=0; i<4; i++) { - diracParam.commDim[i] = comms ? 
1 : 0; - } + for (int i = 0; i < 4; i++) { diracParam.commDim[i] = comms ? 1 : 0; } // In the preconditioned staggered CG allow a different dslash type in the preconditioning - if(inv_param->inv_type == QUDA_PCG_INVERTER && inv_param->dslash_type == QUDA_ASQTAD_DSLASH - && inv_param->dslash_type_precondition == QUDA_STAGGERED_DSLASH) { - diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC; - diracParam.gauge = gaugeFatPrecondition; + if (inv_param->inv_type == QUDA_PCG_INVERTER && inv_param->dslash_type == QUDA_ASQTAD_DSLASH + && inv_param->dslash_type_precondition == QUDA_STAGGERED_DSLASH) { + diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC; + diracParam.gauge = gaugeFatPrecondition; } if (diracParam.gauge->Precision() != inv_param->cuda_prec_precondition) @@ -1746,7 +1715,7 @@ namespace quda { void massRescale(ColorSpinorField &b, QudaInvertParam ¶m, bool for_multishift) { - double kappa5 = (0.5/(5.0 + param.m5)); + double kappa5 = (0.5 / (5.0 + param.m5)); double kappa = (param.dslash_type == QUDA_DOMAIN_WALL_DSLASH || param.dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || param.dslash_type == QUDA_MOBIUS_DWF_DSLASH || param.dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) ? 
kappa5 : @@ -1759,16 +1728,15 @@ namespace quda { // staggered dslash uses mass normalization internally if (param.dslash_type == QUDA_ASQTAD_DSLASH || param.dslash_type == QUDA_STAGGERED_DSLASH) { switch (param.solution_type) { - case QUDA_MAT_SOLUTION: - case QUDA_MATPC_SOLUTION: - if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(2.0*param.mass, b); - break; - case QUDA_MATDAG_MAT_SOLUTION: - case QUDA_MATPCDAG_MATPC_SOLUTION: - if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(4.0*param.mass*param.mass, b); - break; - default: - errorQuda("Not implemented"); + case QUDA_MAT_SOLUTION: + case QUDA_MATPC_SOLUTION: + if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(2.0 * param.mass, b); + break; + case QUDA_MATDAG_MAT_SOLUTION: + case QUDA_MATPCDAG_MATPC_SOLUTION: + if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(4.0 * param.mass * param.mass, b); + break; + default: errorQuda("Not implemented"); } return; } @@ -1776,51 +1744,50 @@ namespace quda { // multiply the source to compensate for normalization of the Dirac operator, if necessary // you are responsible for restoring what's in param.offset switch (param.solution_type) { - case QUDA_MAT_SOLUTION: - if (param.mass_normalization == QUDA_MASS_NORMALIZATION || - param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(2.0*kappa, b); - if (for_multishift) - for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa; - } - break; - case QUDA_MATDAG_MAT_SOLUTION: - if (param.mass_normalization == QUDA_MASS_NORMALIZATION || - param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(4.0*kappa*kappa, b); - if (for_multishift) - for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; - } - break; - case QUDA_MATPC_SOLUTION: - if (param.mass_normalization == QUDA_MASS_NORMALIZATION) { - blas::ax(4.0*kappa*kappa, b); - if (for_multishift) - for (int i = 0; i < 
param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; - } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(2.0*kappa, b); - if (for_multishift) - for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa; - } - break; - case QUDA_MATPCDAG_MATPC_SOLUTION: - if (param.mass_normalization == QUDA_MASS_NORMALIZATION) { - blas::ax(16.0*std::pow(kappa,4), b); - if (for_multishift) - for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 16.0 * std::pow(kappa, 4); - } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(4.0*kappa*kappa, b); - if (for_multishift) - for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; - } - break; - default: - errorQuda("Solution type %d not supported", param.solution_type); + case QUDA_MAT_SOLUTION: + if (param.mass_normalization == QUDA_MASS_NORMALIZATION + || param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(2.0 * kappa, b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa; + } + break; + case QUDA_MATDAG_MAT_SOLUTION: + if (param.mass_normalization == QUDA_MASS_NORMALIZATION + || param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(4.0 * kappa * kappa, b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; + } + break; + case QUDA_MATPC_SOLUTION: + if (param.mass_normalization == QUDA_MASS_NORMALIZATION) { + blas::ax(4.0 * kappa * kappa, b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; + } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(2.0 * kappa, b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa; + } + break; + case QUDA_MATPCDAG_MATPC_SOLUTION: + if (param.mass_normalization == QUDA_MASS_NORMALIZATION) 
{ + blas::ax(16.0 * std::pow(kappa, 4), b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 16.0 * std::pow(kappa, 4); + } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(4.0 * kappa * kappa, b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; + } + break; + default: errorQuda("Solution type %d not supported", param.solution_type); } logQuda(QUDA_DEBUG_VERBOSE, "Mass rescale: norm of source out = %g\n", blas::norm2(b)); } -} +} // namespace quda void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) { @@ -1832,7 +1799,8 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH) || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) errorQuda("Gauge field not allocated"); - if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) + if (cloverPrecise == nullptr + && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); pushVerbosity(inv_param->verbosity); @@ -1863,10 +1831,9 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); - if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION && - (inv_param->dslash_type == QUDA_STAGGERED_DSLASH || - inv_param->dslash_type == QUDA_ASQTAD_DSLASH) ) - blas::ax(1.0/(2.0*inv_param->mass), in); + if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION + && (inv_param->dslash_type == QUDA_STAGGERED_DSLASH || inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) + blas::ax(1.0 / (2.0 * inv_param->mass), 
in); if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { if (parity == QUDA_EVEN_PARITY) { @@ -1881,8 +1848,8 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) { cudaParam.create = QUDA_NULL_FIELD_CREATE; ColorSpinorField tmp1(cudaParam); - ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist - dirac->Dslash(out, tmp1, parity); // apply the operator + ((DiracTwistedCloverPC *)dirac)->TwistCloverInv(tmp1, in, (parity + 1) % 2); // apply the clover-twist + dirac->Dslash(out, tmp1, parity); // apply the operator } else if (inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) { dirac->Dslash4(out, in, parity); @@ -1915,12 +1882,12 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH) || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) errorQuda("Gauge field not allocated"); - if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) + if (cloverPrecise == nullptr + && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); - bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || - inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location); ColorSpinorField in_h(cpuParam); @@ -1939,20 +1906,20 @@ void 
MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) setDiracParam(diracParam, inv_param, pc); Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator - dirac->M(out, in); // apply the operator - delete dirac; // clean up + dirac->M(out, in); // apply the operator + delete dirac; // clean up double kappa = inv_param->kappa; if (pc) { if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) { - blas::ax(0.25/(kappa*kappa), out); + blas::ax(0.25 / (kappa * kappa), out); } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(0.5/kappa, out); + blas::ax(0.5 / kappa, out); } } else { - if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION || - inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(0.5/kappa, out); + if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION + || inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(0.5 / kappa, out); } } @@ -1966,7 +1933,6 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) popVerbosity(); } - void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) { pushVerbosity(inv_param->verbosity); @@ -1976,12 +1942,12 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH) || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) errorQuda("Gauge field not allocated"); - if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) + if (cloverPrecise == nullptr + && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); - bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || - 
inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location); ColorSpinorField in_h(cpuParam); @@ -2002,20 +1968,20 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) setDiracParam(diracParam, inv_param, pc); Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator - dirac->MdagM(out, in); // apply the operator - delete dirac; // clean up + dirac->MdagM(out, in); // apply the operator + delete dirac; // clean up double kappa = inv_param->kappa; if (pc) { if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) { - blas::ax(1.0/std::pow(2.0*kappa,4), out); + blas::ax(1.0 / std::pow(2.0 * kappa, 4), out); } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(0.25/(kappa*kappa), out); + blas::ax(0.25 / (kappa * kappa), out); } } else { - if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION || - inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(0.25/(kappa*kappa), out); + if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION + || inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(0.25 / (kappa * kappa), out); } } @@ -2044,11 +2010,10 @@ namespace quda } // namespace quda -void checkClover(QudaInvertParam *param) { +void checkClover(QudaInvertParam *param) +{ - if (param->dslash_type != QUDA_CLOVER_WILSON_DSLASH && param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) { - return; - } + if (param->dslash_type != QUDA_CLOVER_WILSON_DSLASH && param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) { return; } if (param->cuda_prec != cloverPrecise->Precision()) { errorQuda("Solve precision %d doesn't match clover precision %d", param->cuda_prec, cloverPrecise->Precision()); @@ -2188,10 +2153,12 @@ void cloverQuda(void 
*h_out, void *h_in, QudaInvertParam *inv_param, QudaParity DiracParam diracParam; setDiracParam(diracParam, inv_param, pc); - //FIXME: Do we need this for twisted clover??? + // FIXME: Do we need this for twisted clover??? DiracCloverPC dirac(diracParam); // create the Dirac operator - if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator - else dirac.CloverInv(out, in, parity); + if (!inverse) + dirac.Clover(out, in, parity); // apply the clover operator + else + dirac.CloverInv(out, in, parity); cpuParam.v = h_out; cpuParam.location = inv_param->output_location; @@ -2383,8 +2350,8 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam profileEigensolve.TPSTOP(QUDA_PROFILE_TOTAL); } -multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile) - : profile(profile) { +multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile) : profile(profile) +{ profile.TPSTART(QUDA_PROFILE_INIT); QudaInvertParam *param = mg_param.invert_param; // set whether we are going use native or generic blas @@ -2396,22 +2363,20 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr // check MG params (needs to go somewhere else) if (mg_param.n_level > QUDA_MAX_MG_LEVEL) errorQuda("Requested MG levels %d greater than allowed maximum %d", mg_param.n_level, QUDA_MAX_MG_LEVEL); - for (int i=0; isolve_type != QUDA_DIRECT_SOLVE) - errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present"); + if (param->solve_type != QUDA_DIRECT_SOLVE) errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaMultigridParam(&mg_param); mg_param.secs = 0; mg_param.gflops = 0; - bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || - (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + bool pc_solution + = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == 
QUDA_MATPCDAG_MATPC_SOLUTION); - bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || - (param->solve_type == QUDA_NORMOP_PC_SOLVE); + bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE); // create the dirac operators for the fine grid @@ -2423,8 +2388,8 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr // this is the Dirac operator we use for smoothing DiracParam diracSmoothParam; - bool fine_grid_pc_solve = (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) || - (mg_param.smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE); + bool fine_grid_pc_solve = (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) + || (mg_param.smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE); setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve); diracSmoothParam.halo_precision = mg_param.smoother_halo_precision[0]; dSmooth = Dirac::create(diracSmoothParam); @@ -2433,7 +2398,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this) DiracParam diracSmoothSloppyParam; setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, - mg_param.smoother_schwarz_type[0] == QUDA_INVALID_SCHWARZ ? true : false); + mg_param.smoother_schwarz_type[0] == QUDA_INVALID_SCHWARZ ? true : false); diracSmoothSloppyParam.halo_precision = mg_param.smoother_halo_precision[0]; dSmoothSloppy = Dirac::create(diracSmoothSloppyParam); @@ -2442,7 +2407,9 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr ColorSpinorParam csParam(nullptr, *param, cudaGauge->X(), pc_solution, mg_param.setup_location[0]); csParam.create = QUDA_NULL_FIELD_CREATE; QudaPrecision Bprec = mg_param.precision_null[0]; - Bprec = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < QUDA_SINGLE_PRECISION ? 
QUDA_SINGLE_PRECISION : Bprec); + Bprec + = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : + Bprec); csParam.setPrecision(Bprec, Bprec, true); if (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION) csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.mem_type = mg_param.setup_minimize_memory == QUDA_BOOLEAN_TRUE ? QUDA_MEMORY_MAPPED : QUDA_MEMORY_DEVICE; @@ -2467,7 +2434,8 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr profile.TPSTOP(QUDA_PROFILE_INIT); } -void* newMultigridQuda(QudaMultigridParam *mg_param) { +void *newMultigridQuda(QudaMultigridParam *mg_param) +{ profilerStart(__func__); pushVerbosity(mg_param->invert_param->verbosity); @@ -2481,12 +2449,10 @@ void* newMultigridQuda(QudaMultigridParam *mg_param) { popVerbosity(); profilerStop(__func__); - return static_cast(mg); + return static_cast(mg); } -void destroyMultigridQuda(void *mg) { - delete static_cast(mg); -} +void destroyMultigridQuda(void *mg) { delete static_cast(mg); } void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) { @@ -2497,7 +2463,7 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) profileInvert.TPSTART(QUDA_PROFILE_TOTAL); profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE); - auto *mg = static_cast(mg_); + auto *mg = static_cast(mg_); checkMultigridParam(mg_param); QudaInvertParam *param = mg_param->invert_param; @@ -2507,7 +2473,7 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) // for reporting level 1 is the fine level but internally use level 0 for indexing // sprintf(mg->prefix,"MG level 1 (%s): ", param.location == QUDA_CUDA_FIELD_LOCATION ? 
"GPU" : "CPU" ); // setOutputPrefix(prefix); - setOutputPrefix("MG level 1 (GPU): "); //fix me + setOutputPrefix("MG level 1 (GPU): "); // fix me // Check if we're doing a thin update only if (mg_param->thin_update_only) { @@ -2609,7 +2575,7 @@ void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param) pushVerbosity(mg_param->invert_param->verbosity); profileInvert.TPSTART(QUDA_PROFILE_TOTAL); - auto *mg = static_cast(mg_); + auto *mg = static_cast(mg_); checkMultigridParam(mg_param); checkGauge(mg_param->invert_param); @@ -2620,8 +2586,9 @@ void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param) profilerStop(__func__); } -deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) - : d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) { +deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) : + d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) +{ QudaInvertParam *param = eig_param.invert_param; @@ -2630,49 +2597,52 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) profile.TPSTART(QUDA_PROFILE_INIT); cudaGaugeField *cudaGauge = checkGauge(param); - eig_param.secs = 0; + eig_param.secs = 0; eig_param.gflops = 0; DiracParam diracParam; - if(eig_param.cuda_prec_ritz == param->cuda_prec) - { - setDiracParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE)); + if (eig_param.cuda_prec_ritz == param->cuda_prec) { + setDiracParam(diracParam, param, + (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE)); } else { - setDiracSloppyParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE)); + setDiracSloppyParam(diracParam, param, + (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE)); } const bool pc_solve = 
(param->solve_type == QUDA_NORMOP_PC_SOLVE); d = Dirac::create(diracParam); - m = pc_solve ? static_cast( new DiracMdagM(*d) ) : static_cast( new DiracM(*d)); + m = pc_solve ? static_cast(new DiracMdagM(*d)) : static_cast(new DiracM(*d)); ColorSpinorParam ritzParam(nullptr, *param, cudaGauge->X(), pc_solve, eig_param.location); - ritzParam.create = QUDA_ZERO_FIELD_CREATE; - ritzParam.is_composite = true; - ritzParam.is_component = false; + ritzParam.create = QUDA_ZERO_FIELD_CREATE; + ritzParam.is_composite = true; + ritzParam.is_component = false; ritzParam.composite_dim = param->n_ev * param->deflation_grid; ritzParam.setPrecision(param->cuda_prec_ritz); - if (ritzParam.location==QUDA_CUDA_FIELD_LOCATION) { + if (ritzParam.location == QUDA_CUDA_FIELD_LOCATION) { ritzParam.setPrecision(param->cuda_prec_ritz, param->cuda_prec_ritz, true); // set native field order if (ritzParam.nSpin != 1) ritzParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; - //select memory location here, by default ritz vectors will be allocated on the device - //but if not sufficient device memory, then the user may choose mapped type of memory + // select memory location here, by default ritz vectors will be allocated on the device + // but if not sufficient device memory, then the user may choose mapped type of memory ritzParam.mem_type = eig_param.mem_type_ritz; - } else { //host location + } else { // host location ritzParam.mem_type = QUDA_MEMORY_PINNED; } int ritzVolume = 1; - for(int d = 0; d < ritzParam.nDim; d++) ritzVolume *= ritzParam.x[d]; + for (int d = 0; d < ritzParam.nDim; d++) ritzVolume *= ritzParam.x[d]; if (getVerbosity() == QUDA_DEBUG_VERBOSE) { - size_t byte_estimate = (size_t)ritzParam.composite_dim*(size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.Precision()); + size_t byte_estimate = (size_t)ritzParam.composite_dim * (size_t)ritzVolume + * (ritzParam.nColor * ritzParam.nSpin * ritzParam.Precision()); printfQuda("allocating bytes: %lu (lattice volume %d, prec 
%d)", byte_estimate, ritzVolume, ritzParam.Precision()); - if(ritzParam.mem_type == QUDA_MEMORY_DEVICE) printfQuda("Using device memory type.\n"); + if (ritzParam.mem_type == QUDA_MEMORY_DEVICE) + printfQuda("Using device memory type.\n"); else if (ritzParam.mem_type == QUDA_MEMORY_MAPPED) printfQuda("Using mapped memory type.\n"); } @@ -2686,7 +2656,8 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) profile.TPSTOP(QUDA_PROFILE_INIT); } -void* newDeflationQuda(QudaEigParam *eig_param) { +void *newDeflationQuda(QudaEigParam *eig_param) +{ profileInvert.TPSTART(QUDA_PROFILE_TOTAL); auto *defl = new deflated_solver(*eig_param, profileInvert); @@ -2694,12 +2665,10 @@ void* newDeflationQuda(QudaEigParam *eig_param) { saveProfile(__func__); flushProfile(); - return static_cast(defl); + return static_cast(defl); } -void destroyDeflationQuda(void *df) { - delete static_cast(df); -} +void destroyDeflationQuda(void *df) { delete static_cast(df); } void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) { @@ -2721,16 +2690,13 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it // for now, though, so here we factorize everything for convenience. 
- bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || - (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); - bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || - (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE); - bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || - (param->solution_type == QUDA_MATPC_SOLUTION); - bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || - (param->solve_type == QUDA_DIRECT_PC_SOLVE); - bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) || - (param->solve_type == QUDA_NORMERR_PC_SOLVE); + bool pc_solution + = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE) + || (param->solve_type == QUDA_NORMERR_PC_SOLVE); + bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION); + bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE); + bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE); param->secs = 0; param->gflops = 0; @@ -2793,13 +2759,13 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES && !param->chrono_use_resident) { // download initial guess // initial guess only supported for single-pass solvers - if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) && - (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) { + if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) + && (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) { errorQuda("Initial 
guess not supported for two-pass solver"); } x = h_x; // solution - } else { // zero initial guess + } else { // zero initial guess blas::zero(x); } @@ -2816,7 +2782,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE); double nb = blas::norm2(b); - if (nb==0.0) errorQuda("Source has zero norm"); + if (nb == 0.0) errorQuda("Source has zero norm"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Source: CPU = %g, CUDA copy = %g\n", blas::norm2(h_b), nb); @@ -2871,23 +2837,19 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) // taken care of by Dirac::prepare() and Dirac::reconstruct(), // respectively. - if (pc_solution && !pc_solve) { - errorQuda("Preconditioned (PC) solution_type requires a PC solve_type"); - } + if (pc_solution && !pc_solve) { errorQuda("Preconditioned (PC) solution_type requires a PC solve_type"); } if (!mat_solution && !pc_solution && pc_solve) { errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type"); } - if (!mat_solution && norm_error_solve) { - errorQuda("Normal-error solve requires Mat solution"); - } + if (!mat_solution && norm_error_solve) { errorQuda("Normal-error solve requires Mat solution"); } if (param->inv_type_precondition == QUDA_MG_INVERTER && (!direct_solve || !mat_solution)) { errorQuda("Multigrid preconditioning only supported for direct solves"); } - if (param->chrono_use_resident && ( norm_error_solve) ){ + if (param->chrono_use_resident && (norm_error_solve)) { errorQuda("Chronological forcasting only presently supported for M^dagger M solver"); } @@ -2993,8 +2955,8 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) ColorSpinorField tmp(*out); SolverParam solverParam(*param); Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig, profileInvert); - (*solve)(tmp, *in); // y = (M M^\dag) b - dirac.Mdag(*out, tmp); // x = M^dag y + (*solve)(tmp, *in); // y = (M M^\dag) b + 
dirac.Mdag(*out, tmp); // x = M^dag y delete solve; solverParam.updateInvertParam(*param); } @@ -3003,21 +2965,21 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE); if (param->chrono_make_resident) { - if(param->chrono_max_dim < 1){ + if (param->chrono_max_dim < 1) { errorQuda("Cannot chrono_make_resident with chrono_max_dim %i", param->chrono_max_dim); } const int i = param->chrono_index; - if (i >= QUDA_MAX_CHRONO) - errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO); + if (i >= QUDA_MAX_CHRONO) errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO); auto &basis = chronoResident[i]; if (param->chrono_max_dim < (int)basis.size()) { - errorQuda("Requested chrono_max_dim %i is smaller than already existing chronology %lu", param->chrono_max_dim, basis.size()); + errorQuda("Requested chrono_max_dim %i is smaller than already existing chronology %lu", param->chrono_max_dim, + basis.size()); } - if(not param->chrono_replace_last){ + if (not param->chrono_replace_last) { // if we have not filled the space yet just augment if ((int)basis.size() < param->chrono_max_dim) { ColorSpinorParam cs_param(*out); @@ -3502,13 +3464,13 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) pushVerbosity(param->verbosity); - bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + bool pc_solution + = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE); - bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION); + bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION); bool direct_solve = (param->solve_type 
== QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE); - if (param->dslash_type == QUDA_ASQTAD_DSLASH || - param->dslash_type == QUDA_STAGGERED_DSLASH) { + if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { if (param->solution_type != QUDA_MATPC_SOLUTION) { errorQuda("For Staggered-type fermions, multi-shift solver only suports MATPC solution type"); @@ -3530,7 +3492,8 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) errorQuda("For Wilson-type fermions, preconditioned (PC) solution_type requires a PC solve_type"); } if (!pc_solution & pc_solve) { - errorQuda("For Wilson-type fermions, in multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type"); + errorQuda("For Wilson-type fermions, in multi-shift solver, a preconditioned (PC) solve_type requires a PC " + "solution_type"); } } @@ -3539,10 +3502,9 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) param->gflops = 0; param->iter = 0; - for (int i=0; inum_offset-1; i++) { - for (int j=i+1; jnum_offset; j++) { - if (param->offset[i] > param->offset[j]) - errorQuda("Offsets must be ordered from smallest to largest"); + for (int i = 0; i < param->num_offset - 1; i++) { + for (int j = i + 1; j < param->num_offset; j++) { + if (param->offset[i] > param->offset[j]) errorQuda("Offsets must be ordered from smallest to largest"); } } @@ -3553,9 +3515,8 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) // Balint: Isn't there a nice construction pattern we could use here? This is // expedient but yucky. 
// DiracParam diracParam; - if (param->dslash_type == QUDA_ASQTAD_DSLASH || - param->dslash_type == QUDA_STAGGERED_DSLASH){ - param->mass = sqrt(param->offset[0]/4); + if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { + param->mass = sqrt(param->offset[0] / 4); } Dirac *d = nullptr; @@ -3587,7 +3548,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) h_x.resize(param->num_offset); cpuParam.location = param->output_location; - for(int i=0; i < param->num_offset; i++) { + for (int i = 0; i < param->num_offset; i++) { cpuParam.v = hp_x[i]; h_x[i] = std::make_unique(cpuParam); } @@ -3629,7 +3590,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) // Check source norms double nb = blas::norm2(b); - if (nb==0.0) errorQuda("Source has zero norm"); + if (nb == 0.0) errorQuda("Source has zero norm"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Source: CPU = %g, CUDA copy = %g\n", blas::norm2(h_b), nb); @@ -3650,8 +3611,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) DiracMatrix *m, *mSloppy; - if (param->dslash_type == QUDA_ASQTAD_DSLASH || - param->dslash_type == QUDA_STAGGERED_DSLASH) { + if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { m = new DiracM(dirac); mSloppy = new DiracM(diracSloppy); } else { @@ -3683,42 +3643,38 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) #define REFINE_INCREASING_MASS #ifdef REFINE_INCREASING_MASS - for(int i=0; i < param->num_offset; i++) { + for (int i = 0; i < param->num_offset; i++) { #else - for(int i=param->num_offset-1; i >= 0; i--) { + for (int i = param->num_offset - 1; i >= 0; i--) { #endif - double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? - param->true_res_hq_offset[i] : 0; - double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? 
- param->tol_hq_offset[i] : 0; + double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? param->true_res_hq_offset[i] : 0; + double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? param->tol_hq_offset[i] : 0; /* - In the case where the shifted systems have zero tolerance - specified, we refine these systems until either the limit of - precision is reached (prec_tol) or until the tolerance reaches - the iterated residual tolerance of the previous multi-shift - solver (iter_res_offset[i]), which ever is greater. + In the case where the shifted systems have zero tolerance + specified, we refine these systems until either the limit of + precision is reached (prec_tol) or until the tolerance reaches + the iterated residual tolerance of the previous multi-shift + solver (iter_res_offset[i]), which ever is greater. */ - const double prec_tol = std::pow(10.,(-2*(int)param->cuda_prec+4)); // implicit refinment limit of 1e-12 - const double iter_tol = (param->iter_res_offset[i] < prec_tol ? prec_tol : (param->iter_res_offset[i] *1.1)); + const double prec_tol = std::pow(10., (-2 * (int)param->cuda_prec + 4)); // implicit refinment limit of 1e-12 + const double iter_tol = (param->iter_res_offset[i] < prec_tol ? prec_tol : (param->iter_res_offset[i] * 1.1)); const double refine_tol = (param->tol_offset[i] == 0.0 ? 
iter_tol : param->tol_offset[i]); // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0 if (param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq) { - if (getVerbosity() >= QUDA_SUMMARIZE) - printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n", - i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq); + if (getVerbosity() >= QUDA_SUMMARIZE) + printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n", i, + param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq); // for staggered the shift is just a change in mass term (FIXME: for twisted mass also) - if (param->dslash_type == QUDA_ASQTAD_DSLASH || - param->dslash_type == QUDA_STAGGERED_DSLASH) { - dirac.setMass(sqrt(param->offset[i]/4)); - diracSloppy.setMass(sqrt(param->offset[i]/4)); + if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { + dirac.setMass(sqrt(param->offset[i] / 4)); + diracSloppy.setMass(sqrt(param->offset[i] / 4)); } DiracMatrix *m, *mSloppy; - if (param->dslash_type == QUDA_ASQTAD_DSLASH || - param->dslash_type == QUDA_STAGGERED_DSLASH) { + if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { m = new DiracM(dirac); mSloppy = new DiracM(diracSloppy); } else { @@ -3735,9 +3691,9 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) if (false) { // experimenting with Minimum residual extrapolation // only perform MRE using current and previously refined solutions #ifdef REFINE_INCREASING_MASS - const int nRefine = i+1; + const int nRefine = i + 1; #else - const int nRefine = param->num_offset - i + 1; + const int nRefine = param->num_offset - i + 1; #endif cudaParam.create = QUDA_NULL_FIELD_CREATE; @@ -3754,7 +3710,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) bool orthogonal = false; bool apply_mat = 
true; bool hermitian = true; - MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti); + MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti); mre(x[i], b, z, q); } @@ -3767,7 +3723,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) { CG cg(*m, *mSloppy, *mSloppy, *mSloppy, solverParam, profileMulti); - if (i==0) + if (i == 0) cg(x[i], b, &p[i], r2_old[i]); else cg(x[i], b); @@ -3775,12 +3731,11 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) solverParam.true_res_offset[i] = solverParam.true_res; solverParam.true_res_hq_offset[i] = solverParam.true_res_hq; - solverParam.updateInvertParam(*param,i); + solverParam.updateInvertParam(*param, i); - if (param->dslash_type == QUDA_ASQTAD_DSLASH || - param->dslash_type == QUDA_STAGGERED_DSLASH) { - dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case - diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case + if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { + dirac.setMass(sqrt(param->offset[0] / 4)); // restore just in case + diracSloppy.setMass(sqrt(param->offset[0] / 4)); // restore just in case } delete m; @@ -3801,7 +3756,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) param->action[1] = action.imag(); } - for(int i=0; i < param->num_offset; i++) { + for (int i = 0; i < param->num_offset; i++) { if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution blas::ax(sqrt(nb), x[i]); } @@ -3844,14 +3799,14 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, GaugeFieldParam gParam(*param, fatlink, QUDA_GENERAL_LINKS); gParam.location = QUDA_CPU_FIELD_LOCATION; - cpuGaugeField cpuFatLink(gParam); // create the host fatlink + cpuGaugeField cpuFatLink(gParam); // create the host fatlink gParam.gauge = longlink; - cpuGaugeField cpuLongLink(gParam); // create the host longlink + 
cpuGaugeField cpuLongLink(gParam); // create the host longlink gParam.gauge = ulink; cpuGaugeField cpuUnitarizedLink(gParam); gParam.link_type = param->type; gParam.gauge = inlink; - cpuGaugeField cpuInLink(gParam); // create the host sitelink + cpuGaugeField cpuInLink(gParam); // create the host sitelink // create the device fields gParam.location = QUDA_CUDA_FIELD_LOCATION; @@ -3937,8 +3892,8 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, profileFatLink.TPSTOP(QUDA_PROFILE_TOTAL); } -int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length, - double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam) +int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, + int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam) { profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL); profileGaugeForce.TPSTART(QUDA_PROFILE_INIT); @@ -3951,7 +3906,7 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int gParam.site_size = qudaGaugeParam->site_size; cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr; - cudaGaugeField* cudaSiteLink = nullptr; + cudaGaugeField *cudaSiteLink = nullptr; if (qudaGaugeParam->use_resident_gauge) { if (!gaugePrecise) errorQuda("No resident gauge field to use"); @@ -3981,9 +3936,9 @@ int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int gParamMom.site_offset = qudaGaugeParam->mom_offset; gParamMom.site_size = qudaGaugeParam->site_size; - cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : nullptr; + cpuGaugeField *cpuMom = (!qudaGaugeParam->use_resident_mom) ? 
new cpuGaugeField(gParamMom) : nullptr; - cudaGaugeField* cudaMom = nullptr; + cudaGaugeField *cudaMom = nullptr; if (qudaGaugeParam->use_resident_mom) { if (!momResident) errorQuda("No resident momentum field to use"); cudaMom = momResident; @@ -4226,16 +4181,18 @@ void momResidentQuda(void *mom, QudaGaugeParam *param) profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL); } -void createCloverQuda(QudaInvertParam* invertParam) +void createCloverQuda(QudaInvertParam *invertParam) { profileClover.TPSTART(QUDA_PROFILE_TOTAL); if (!cloverPrecise) errorQuda("Clover field not allocated"); - QudaReconstructType recon = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct(); + QudaReconstructType recon + = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct(); // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general) lat_dim_t R; - for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d)); - cudaGaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon); + for (int d = 0; d < 4; d++) R[d] = (d == 0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d)); + cudaGaugeField *gauge = extendedGaugeResident ? 
extendedGaugeResident : + createExtendedGauge(*gaugePrecise, R, profileClover, false, recon); profileClover.TPSTART(QUDA_PROFILE_INIT); @@ -4268,7 +4225,7 @@ void createCloverQuda(QudaInvertParam* invertParam) extendedGaugeResident = gauge; } -void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) +void *createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param) { GaugeFieldParam gParam(*param, gauge, QUDA_GENERAL_LINKS); gParam.geometry = static_cast(geometry); @@ -4280,7 +4237,7 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.create = QUDA_ZERO_FIELD_CREATE; - auto* cudaGauge = new cudaGaugeField(gParam); + auto *cudaGauge = new cudaGaugeField(gParam); if (gauge) { cudaGauge->loadCPUField(*cpuGauge); @@ -4292,7 +4249,7 @@ void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param) { - auto* cudaGauge = reinterpret_cast(inGauge); + auto *cudaGauge = reinterpret_cast(inGauge); GaugeFieldParam gParam(*param, gauge, QUDA_GENERAL_LINKS); gParam.geometry = cudaGauge->Geometry(); @@ -4303,7 +4260,7 @@ void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param) void destroyGaugeFieldQuda(void *gauge) { - auto* g = reinterpret_cast(gauge); + auto *g = reinterpret_cast(gauge); delete g; } @@ -4346,7 +4303,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi qParam.pc_type = QUDA_4D_PC; qParam.setPrecision(gParam.Precision()); qParam.pad = 0; - for(int dir=0; dir<4; ++dir) qParam.x[dir] = gParam.x[dir]; + for (int dir = 0; dir < 4; ++dir) qParam.x[dir] = gParam.x[dir]; qParam.x[4] = 1; qParam.create = QUDA_NULL_FIELD_CREATE; qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; @@ -4364,8 +4321,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi } // resident gauge field is required - if 
(!gauge_param->use_resident_gauge || !gaugePrecise) - errorQuda("Resident gauge field is required"); + if (!gauge_param->use_resident_gauge || !gaugePrecise) errorQuda("Resident gauge field is required"); if (!gaugePrecise->StaggeredPhaseApplied()) { errorQuda("Gauge field requires the staggered phase factors to be applied"); @@ -4373,41 +4329,39 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi // check if staggered phase is the desired one if (gauge_param->staggered_phase_type != gaugePrecise->StaggeredPhase()) { - errorQuda("Requested staggered phase %d, but found %d\n", - gauge_param->staggered_phase_type, gaugePrecise->StaggeredPhase()); + errorQuda("Requested staggered phase %d, but found %d\n", gauge_param->staggered_phase_type, + gaugePrecise->StaggeredPhase()); } profileStaggeredForce.TPSTOP(QUDA_PROFILE_H2D); profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT); const int nvector = inv_param->num_offset; - std::vector X(nvector); - for ( int i=0; i X(nvector); + for (int i = 0; i < nvector; i++) X[i] = ColorSpinorField::Create(qParam); if (inv_param->use_resident_solution) { if (solutionResident.size() < (unsigned int)nvector) - errorQuda("solutionResident.size() %lu does not match number of shifts %d", - solutionResident.size(), nvector); + errorQuda("solutionResident.size() %lu does not match number of shifts %d", solutionResident.size(), nvector); } // create the staggered operator DiracParam diracParam; - bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || - (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); - if (!pc_solve) - errorQuda("Preconditioned solve type required not %d\n", inv_param->solve_type); + bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); + if (!pc_solve) errorQuda("Preconditioned solve type required not %d\n", inv_param->solve_type); setDiracParam(diracParam, inv_param, pc_solve); Dirac *dirac = Dirac::create(diracParam); 
profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT); profileStaggeredForce.TPSTART(QUDA_PROFILE_PREAMBLE); - for (int i=0; iuse_resident_solution) x.Even() = solutionResident[i]; - else errorQuda("%s requires resident solution", __func__); + else + errorQuda("%s requires resident solution", __func__); // set the odd solution component dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY); @@ -4425,7 +4379,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi profileStaggeredForce.TPSTART(QUDA_PROFILE_COMPUTE); // compute quark-field outer product - for (int i=0; iresidue[i], 0.0}; @@ -4457,24 +4411,15 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi profileStaggeredForce.TPSTOP(QUDA_PROFILE_D2H); profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE); - for (int i=0; iuse_resident_mom) ? new cpuGaugeField(param) : nullptr; + param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; + cpuGaugeField *cpuMom = (!gParam->use_resident_mom) ? new cpuGaugeField(param) : nullptr; param.link_type = QUDA_GENERAL_LINKS; param.reconstruct = QUDA_RECONSTRUCT_NO; - param.gauge = (void*)w_link; + param.gauge = (void *)w_link; cpuGaugeField cpuWLink(param); - param.gauge = (void*)v_link; + param.gauge = (void *)v_link; cpuGaugeField cpuVLink(param); - param.gauge = (void*)u_link; + param.gauge = (void *)u_link; cpuGaugeField cpuULink(param); param.create = QUDA_ZERO_FIELD_CREATE; - param.order = QUDA_FLOAT2_GAUGE_ORDER; + param.order = QUDA_FLOAT2_GAUGE_ORDER; param.link_type = QUDA_ASQTAD_MOM_LINKS; param.reconstruct = QUDA_RECONSTRUCT_10; GaugeFieldParam momParam(param); @@ -4541,8 +4487,8 @@ void computeHISQForceQuda(void* const milc_momentum, lat_dim_t R = {2 * comm_dim_partitioned(0), 2 * comm_dim_partitioned(1), 2 * comm_dim_partitioned(2), 2 * comm_dim_partitioned(3)}; - for (int dir=0; dir<4; ++dir) { - param.x[dir] += 2*R[dir]; + for (int dir = 0; dir < 4; ++dir) { + param.x[dir] += 2 * R[dir]; param.r[dir] = R[dir]; } @@ -4563,7 
+4509,7 @@ void computeHISQForceQuda(void* const milc_momentum, qParam.pc_type = QUDA_4D_PC; qParam.setPrecision(oParam.Precision()); qParam.pad = 0; - for (int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir]; + for (int dir = 0; dir < 4; ++dir) qParam.x[dir] = oParam.x[dir]; // create the device quark field qParam.create = QUDA_NULL_FIELD_CREATE; @@ -4581,7 +4527,7 @@ void computeHISQForceQuda(void* const milc_momentum, GaugeField *oprod[2] = {stapleOprod, naikOprod}; // loop over different quark fields - for(int i=0; iloadCPUField(cpuWLink, profileHISQForce); - cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true); - cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true); - cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true); + cudaInForce->exchangeExtendedGhost(R, profileHISQForce, true); + cudaGauge->exchangeExtendedGhost(R, profileHISQForce, true); + cudaOutForce->exchangeExtendedGhost(R, profileHISQForce, true); profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff); @@ -4652,7 +4598,7 @@ void computeHISQForceQuda(void* const milc_momentum, // Load naik outer product copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION); - cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true); + cudaInForce->exchangeExtendedGhost(R, profileHISQForce, true); delete naikOprod; // Compute Naik three-link term @@ -4661,17 +4607,18 @@ void computeHISQForceQuda(void* const milc_momentum, qudaDeviceSynchronize(); profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE); - cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true); + cudaOutForce->exchangeExtendedGhost(R, profileHISQForce, true); // load v-link cudaGauge->loadCPUField(cpuVLink, profileHISQForce); - cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true); + cudaGauge->exchangeExtendedGhost(R, profileHISQForce, true); profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); *num_failures_h = 0; unitarizeForce(*cudaInForce, 
*cudaOutForce, *cudaGauge, num_failures_d); - if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h); + if (*num_failures_h > 0) + errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h); cudaOutForce->zero(); qudaDeviceSynchronize(); @@ -4689,7 +4636,7 @@ void computeHISQForceQuda(void* const milc_momentum, delete cudaInForce; momParam.location = QUDA_CUDA_FIELD_LOCATION; - cudaGaugeField* cudaMom = new cudaGaugeField(momParam); + cudaGaugeField *cudaMom = new cudaGaugeField(momParam); profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); hisqCompleteForce(*cudaOutForce, *cudaGauge); @@ -4764,15 +4711,15 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double qParam.nDim = 4; qParam.setPrecision(fParam.Precision()); qParam.pad = 0; - for(int dir=0; dir<4; ++dir) qParam.x[dir] = fParam.x[dir]; + for (int dir = 0; dir < 4; ++dir) qParam.x[dir] = fParam.x[dir]; // create the device quark field qParam.create = QUDA_NULL_FIELD_CREATE; qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; qParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; - std::vector quarkX, quarkP; - for (int i=0; i quarkX, quarkP; + for (int i = 0; i < nvector; i++) { quarkX.push_back(ColorSpinorField::Create(qParam)); quarkP.push_back(ColorSpinorField::Create(qParam)); } @@ -4787,8 +4734,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double qParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; qParam.gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // need expose this to interface - bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || - (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); + bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); DiracParam diracParam; setDiracParam(diracParam, inv_param, pc_solve); diracParam.tmp1 = &tmp; // use as temporary for dirac->M 
@@ -4796,8 +4742,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double if (inv_param->use_resident_solution) { if (solutionResident.size() < (unsigned int)nvector) - errorQuda("solutionResident.size() %lu does not match number of shifts %d", - solutionResident.size(), nvector); + errorQuda("solutionResident.size() %lu does not match number of shifts %d", solutionResident.size(), nvector); } cudaGaugeField &gaugeEx = *extendedGaugeResident; @@ -4811,7 +4756,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double std::vector force_coeff(nvector); // loop over different quark fields - for(int i=0; i copy(gaugeEx); + u->copy(gaugeEx); } - computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0*ck*multiplicity*dt); + computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0 * ck * multiplicity * dt); /* Now the U dA/dU terms */ - std::vector< std::vector > ferm_epsilon(nvector); + std::vector> ferm_epsilon(nvector); for (int shift = 0; shift < nvector; shift++) { ferm_epsilon[shift].reserve(2); - ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt; - ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt; + ferm_epsilon[shift][0] = 2.0 * ck * coeff[shift] * dt; + ferm_epsilon[shift][1] = -kappa2 * 2.0 * ck * coeff[shift] * dt; } computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon); @@ -4891,7 +4836,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double profileCloverForce.TPSTART(QUDA_PROFILE_FREE); - for (int i=0; imom_offset; gParamMom.site_size = param->site_size; @@ -4953,7 +4894,7 @@ void updateGaugeFieldQuda(void* gauge, profileGaugeUpdate.TPSTART(QUDA_PROFILE_H2D); - if (!param->use_resident_gauge) { // load fields onto the device + if (!param->use_resident_gauge) { // load fields onto the device cudaInGauge->loadCPUField(*cpuGauge); } else { // or use resident fields already present if (!gaugePrecise) errorQuda("No resident gauge field allocated"); @@ -4973,8 +4914,7 @@ void 
updateGaugeFieldQuda(void* gauge, // perform the update profileGaugeUpdate.TPSTART(QUDA_PROFILE_COMPUTE); - updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom, - (bool)conj_mom, (bool)exact); + updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom, (bool)conj_mom, (bool)exact); profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE); if (param->return_result_gauge) { @@ -5007,127 +4947,130 @@ void updateGaugeFieldQuda(void* gauge, profileGaugeUpdate.TPSTOP(QUDA_PROFILE_TOTAL); } - void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) { - profileProject.TPSTART(QUDA_PROFILE_TOTAL); - - profileProject.TPSTART(QUDA_PROFILE_INIT); - checkGaugeParam(param); - - // create the gauge field - GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); - gParam.location = QUDA_CPU_FIELD_LOCATION; - gParam.site_offset = param->gauge_offset; - gParam.site_size = param->site_size; - bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; - cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr; - - // create the device fields - gParam.location = QUDA_CUDA_FIELD_LOCATION; - gParam.create = QUDA_NULL_FIELD_CREATE; - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; - gParam.reconstruct = param->reconstruct; - cudaGaugeField *cudaGauge = !param->use_resident_gauge ? 
new cudaGaugeField(gParam) : nullptr; - profileProject.TPSTOP(QUDA_PROFILE_INIT); - - if (param->use_resident_gauge) { - if (!gaugePrecise) errorQuda("No resident gauge field to use"); - cudaGauge = gaugePrecise; - gaugePrecise = nullptr; - } else { - profileProject.TPSTART(QUDA_PROFILE_H2D); - cudaGauge->loadCPUField(*cpuGauge); - profileProject.TPSTOP(QUDA_PROFILE_H2D); - } - - profileProject.TPSTART(QUDA_PROFILE_COMPUTE); - *num_failures_h = 0; - - // project onto SU(3) - if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase(); - projectSU3(*cudaGauge, tol, num_failures_d); - if (!cudaGauge->StaggeredPhaseApplied() && param->staggered_phase_applied) cudaGauge->applyStaggeredPhase(); - - profileProject.TPSTOP(QUDA_PROFILE_COMPUTE); - - if(*num_failures_h>0) - errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); - - profileProject.TPSTART(QUDA_PROFILE_D2H); - if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge); - profileProject.TPSTOP(QUDA_PROFILE_D2H); - - if (param->make_resident_gauge) { - if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise; - gaugePrecise = cudaGauge; - } else { - delete cudaGauge; - } - - profileProject.TPSTART(QUDA_PROFILE_FREE); - if (cpuGauge) delete cpuGauge; - profileProject.TPSTOP(QUDA_PROFILE_FREE); - - profileProject.TPSTOP(QUDA_PROFILE_TOTAL); - } - - void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) { - profilePhase.TPSTART(QUDA_PROFILE_TOTAL); - - profilePhase.TPSTART(QUDA_PROFILE_INIT); - checkGaugeParam(param); - - // create the gauge field - GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); - bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; - gParam.location = QUDA_CPU_FIELD_LOCATION; - cpuGaugeField *cpuGauge = need_cpu ? 
new cpuGaugeField(gParam) : nullptr; - - // create the device fields - gParam.location = QUDA_CUDA_FIELD_LOCATION; - gParam.create = QUDA_NULL_FIELD_CREATE; - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; - gParam.reconstruct = param->reconstruct; - cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr; - profilePhase.TPSTOP(QUDA_PROFILE_INIT); - - if (param->use_resident_gauge) { - if (!gaugePrecise) errorQuda("No resident gauge field to use"); - cudaGauge = gaugePrecise; - } else { - profilePhase.TPSTART(QUDA_PROFILE_H2D); - cudaGauge->loadCPUField(*cpuGauge); - profilePhase.TPSTOP(QUDA_PROFILE_H2D); - } - - profilePhase.TPSTART(QUDA_PROFILE_COMPUTE); - *num_failures_h = 0; - - // apply / remove phase as appropriate - if (!cudaGauge->StaggeredPhaseApplied()) cudaGauge->applyStaggeredPhase(); - else cudaGauge->removeStaggeredPhase(); - - profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE); - - profilePhase.TPSTART(QUDA_PROFILE_D2H); - if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge); - profilePhase.TPSTOP(QUDA_PROFILE_D2H); - - if (param->make_resident_gauge) { - if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise; - gaugePrecise = cudaGauge; - } else { - delete cudaGauge; - } - - profilePhase.TPSTART(QUDA_PROFILE_FREE); - if (cpuGauge) delete cpuGauge; - profilePhase.TPSTOP(QUDA_PROFILE_FREE); - - profilePhase.TPSTOP(QUDA_PROFILE_TOTAL); - } +void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) +{ + profileProject.TPSTART(QUDA_PROFILE_TOTAL); + + profileProject.TPSTART(QUDA_PROFILE_INIT); + checkGaugeParam(param); + + // create the gauge field + GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); + gParam.location = QUDA_CPU_FIELD_LOCATION; + gParam.site_offset = param->gauge_offset; + gParam.site_size = param->site_size; + bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; + cpuGaugeField *cpuGauge = need_cpu ? 
new cpuGaugeField(gParam) : nullptr; + + // create the device fields + gParam.location = QUDA_CUDA_FIELD_LOCATION; + gParam.create = QUDA_NULL_FIELD_CREATE; + gParam.order = QUDA_FLOAT2_GAUGE_ORDER; + gParam.reconstruct = param->reconstruct; + cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr; + profileProject.TPSTOP(QUDA_PROFILE_INIT); + + if (param->use_resident_gauge) { + if (!gaugePrecise) errorQuda("No resident gauge field to use"); + cudaGauge = gaugePrecise; + gaugePrecise = nullptr; + } else { + profileProject.TPSTART(QUDA_PROFILE_H2D); + cudaGauge->loadCPUField(*cpuGauge); + profileProject.TPSTOP(QUDA_PROFILE_H2D); + } + + profileProject.TPSTART(QUDA_PROFILE_COMPUTE); + *num_failures_h = 0; + + // project onto SU(3) + if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase(); + projectSU3(*cudaGauge, tol, num_failures_d); + if (!cudaGauge->StaggeredPhaseApplied() && param->staggered_phase_applied) cudaGauge->applyStaggeredPhase(); + + profileProject.TPSTOP(QUDA_PROFILE_COMPUTE); + + if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); + + profileProject.TPSTART(QUDA_PROFILE_D2H); + if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge); + profileProject.TPSTOP(QUDA_PROFILE_D2H); + + if (param->make_resident_gauge) { + if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise; + gaugePrecise = cudaGauge; + } else { + delete cudaGauge; + } + + profileProject.TPSTART(QUDA_PROFILE_FREE); + if (cpuGauge) delete cpuGauge; + profileProject.TPSTOP(QUDA_PROFILE_FREE); + + profileProject.TPSTOP(QUDA_PROFILE_TOTAL); +} + +void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) +{ + profilePhase.TPSTART(QUDA_PROFILE_TOTAL); + + profilePhase.TPSTART(QUDA_PROFILE_INIT); + checkGaugeParam(param); + + // create the gauge field + GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); + bool need_cpu = 
!param->use_resident_gauge || param->return_result_gauge; + gParam.location = QUDA_CPU_FIELD_LOCATION; + cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr; + + // create the device fields + gParam.location = QUDA_CUDA_FIELD_LOCATION; + gParam.create = QUDA_NULL_FIELD_CREATE; + gParam.order = QUDA_FLOAT2_GAUGE_ORDER; + gParam.reconstruct = param->reconstruct; + cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr; + profilePhase.TPSTOP(QUDA_PROFILE_INIT); + + if (param->use_resident_gauge) { + if (!gaugePrecise) errorQuda("No resident gauge field to use"); + cudaGauge = gaugePrecise; + } else { + profilePhase.TPSTART(QUDA_PROFILE_H2D); + cudaGauge->loadCPUField(*cpuGauge); + profilePhase.TPSTOP(QUDA_PROFILE_H2D); + } + + profilePhase.TPSTART(QUDA_PROFILE_COMPUTE); + *num_failures_h = 0; + + // apply / remove phase as appropriate + if (!cudaGauge->StaggeredPhaseApplied()) + cudaGauge->applyStaggeredPhase(); + else + cudaGauge->removeStaggeredPhase(); + + profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE); + + profilePhase.TPSTART(QUDA_PROFILE_D2H); + if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge); + profilePhase.TPSTOP(QUDA_PROFILE_D2H); + + if (param->make_resident_gauge) { + if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise; + gaugePrecise = cudaGauge; + } else { + delete cudaGauge; + } + + profilePhase.TPSTART(QUDA_PROFILE_FREE); + if (cpuGauge) delete cpuGauge; + profilePhase.TPSTOP(QUDA_PROFILE_FREE); + + profilePhase.TPSTOP(QUDA_PROFILE_TOTAL); +} // evaluate the momentum action -double momActionQuda(void* momentum, QudaGaugeParam* param) +double momActionQuda(void *momentum, QudaGaugeParam *param) { profileMomAction.TPSTART(QUDA_PROFILE_TOTAL); @@ -5138,7 +5081,8 @@ double momActionQuda(void* momentum, QudaGaugeParam* param) GaugeFieldParam gParam(*param, momentum, QUDA_ASQTAD_MOM_LINKS); gParam.location = QUDA_CPU_FIELD_LOCATION; gParam.reconstruct = 
(gParam.order == QUDA_TIFR_GAUGE_ORDER || gParam.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ? - QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10; + QUDA_RECONSTRUCT_NO : + QUDA_RECONSTRUCT_10; gParam.site_offset = param->mom_offset; gParam.site_size = param->site_size; @@ -5176,9 +5120,7 @@ double momActionQuda(void* momentum, QudaGaugeParam* param) delete cudaMom; momResident = nullptr; } - if (cpuMom) { - delete cpuMom; - } + if (cpuMom) { delete cpuMom; } profileMomAction.TPSTOP(QUDA_PROFILE_FREE); profileMomAction.TPSTOP(QUDA_PROFILE_TOTAL); @@ -5230,7 +5172,8 @@ void plaqQuda(double plaq[3]) if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field"); - cudaGaugeField *data = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq); + cudaGaugeField *data + = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq); extendedGaugeResident = data; profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE); @@ -5311,8 +5254,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, copyExtendedGauge(*precise, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION); precise->exchangeGhost(); } else { - if (getVerbosity() >= QUDA_VERBOSE) - printfQuda("Wuppertal smearing done with gaugePrecise\n"); + if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugePrecise\n"); precise = gaugePrecise; } @@ -5357,8 +5299,7 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, printfQuda("Out CPU %e CUDA %e\n", cpu, gpu); } - if (gaugeSmeared != nullptr) - delete precise; + if (gaugeSmeared != nullptr) delete precise; popVerbosity(); @@ -5538,7 +5479,7 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u delete cpuGauge; - if(timeinfo){ + if (timeinfo) { timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D); timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE); timeinfo[2] = 
GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H); @@ -5547,9 +5488,10 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u return 0; } -int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \ - const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \ - const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo) +int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, + const unsigned int verbose_interval, const double alpha, const unsigned int autotune, + const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, + double *timeinfo) { GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_TOTAL); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index e3088fec71..0328c91aa5 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -3,6 +3,9 @@ #include #include #include +#include +#include +#include #include #include @@ -10,6 +13,7 @@ #include #include #include + // #include "../../openQxD-devel/include/su3.h" // #include "../../openQxD-devel/include/flags.h" // #include "../../openQxD-devel/include/utils.h" @@ -206,6 +210,7 @@ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) gParam.tadpole_coeff = 1.0; gParam.scale = 0; gParam.ga_pad = getLinkPadding(dim); + gParam.return_result_gauge = 1; return gParam; } @@ -231,7 +236,21 @@ void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) plaq[1] = obsParam.plaquette[1]; plaq[2] = obsParam.plaquette[2]; - // qudamilc_called(__func__); + saveGaugeQuda(gauge, &qudaGaugeParam); + + return; +} + +void openQCD_gaugeloadsave(int precision, void *gauge) +{ + + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); // FIXME: + + loadGaugeQuda(gauge, &qudaGaugeParam); + + saveGaugeQuda(gauge, &qudaGaugeParam); + return; } @@ -488,7 +507,6 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda loadGaugeQuda(gauge, &qudaGaugeParam); - QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; QudaPrecision device_precision_sloppy = device_precision; @@ -506,9 +524,11 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda invertParam.dslash_type = QUDA_WILSON_DSLASH; - dslashQuda(dst,src, &invertParam, local_parity); + dslashQuda(dst, src, &invertParam, local_parity); + + // TODO: need save?? -// TODO: need save?? + // saveGaugeQuda(gauge, &qudaGaugeParam); return; } // qudaDslash From 12f22f448b542a537391c280356bab797317f8bc Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Thu, 23 Mar 2023 11:14:44 +0100 Subject: [PATCH 020/148] small correction on load fct --- include/gauge_field_order.h | 45 +++++++++++++++++++++++-------------- lib/openqcd_interface.cpp | 3 ++- 2 files changed, 30 insertions(+), 18 deletions(-) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 1683937777..67991b4da9 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2355,7 +2355,6 @@ namespace quda if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); } - // TODO: make this function // __device__ __host__ inline int QUDAtoOpenQxD(int x_cb_QUDA, int dir_QUDA, int parity_QUDA) const // TODO: Implement ipt and iup functions @@ -2363,40 +2362,52 @@ namespace quda // } + /* ORIGINAL */ + +#if 0 + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + 
block_load(v, reinterpret_cast(in)); + } - __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, - Float = 1.0) const + __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const + { + auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + block_store(reinterpret_cast(out), v); + } +#endif + + /*****************/ + + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, Float = 1.0) const { // With ''natural'' order: lexicographical 0123 = txyz , t fastest, links 0123 = txyz in pos directions // Indexing fun: int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) - getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; - // TODO: Determine whether coord[mu] is local or global - int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; + int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - coord1 in QUDA is x2 in OpenQxD (y) - coord2 in QUDA is x3 in OpenQxD (z) - coord3 in QUDA is x0 in OpenQxD (t) - */ + coord1 in QUDA is x2 in OpenQxD (y) + coord2 in QUDA is x3 in OpenQxD (z) + coord3 in QUDA is x0 in OpenQxD (t) */ // int ix_OpenQxD = ipt[iy_OpenQxD]; - int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD + + int dir_OpenQxD = (dir) % 4; // rotation of axes QUDA -> OpenQxD // Loading as per QUDA style - auto in - = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 18 - // doubles = 9 complex doubles = 1 su3dble struct) + auto in = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; + // This is how they're accessed within OpenQxd (length = 18 + // doubles = 9 complex 
doubles = 1 su3dble struct) // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed // within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) block_load(v, reinterpret_cast(in)); - - } - __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity) const + __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const { // Indexing fun: int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 0328c91aa5..d07e579570 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -210,7 +210,8 @@ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) gParam.tadpole_coeff = 1.0; gParam.scale = 0; gParam.ga_pad = getLinkPadding(dim); - gParam.return_result_gauge = 1; + + // gParam.return_result_gauge = 1; //? return gParam; } From d535512c658a530db4ded658b69f198f66c8b6ca Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Thu, 23 Mar 2023 11:55:05 +0100 Subject: [PATCH 021/148] ordering done, lexicographical io w openqxd --- include/color_spinor_field_order.h | 25 +++++++++++-------------- include/gauge_field_order.h | 2 +- lib/openqcd_interface.cpp | 2 +- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 73681706e4..0c1d8fe4cc 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1494,7 +1494,7 @@ namespace quda }; template - struct SpaceSpinorColorOrder { // TODO: check how to adapt this for openqxd + struct SpaceSpinorColorOrder { using Accessor = SpaceSpinorColorOrder; using real = typename mapper::type; using complex = complex; @@ -1722,8 +1722,9 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; - // Use this 
template as openqxd for now TODO: - template struct OpenQCDDiracOrder { // TODO: check how to adapt this for openqxd + +// Based on ''SpaceSpinorColorOrder'' TODO: + template struct OpenQCDDiracOrder { using Accessor = OpenQCDDiracOrder; using real = typename mapper::type; using complex = complex; @@ -1750,38 +1751,34 @@ namespace quda } } - __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const // TODO: adapt to openqxd +/* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) + coord1 in QUDA is x2 in OpenQxD (y) + coord2 in QUDA is x3 in OpenQxD (z) + coord3 in QUDA is x0 in OpenQxD (t) + */ + __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const { /* INDEXING */ - int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site - /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - coord1 in QUDA is x2 in OpenQxD (y) - coord2 in QUDA is x3 in OpenQxD (z) - coord3 in QUDA is x0 in OpenQxD (t) - */ int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; - // int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD // Loading as per QUDA style auto in = &field[iy_OpenQxD * length]; // This is how they're accessed within OpenQxd (length = 24 doubles // = 12 complex doubles = 4 spinor x 3 colors) block_load(v, reinterpret_cast(in)); - /* END OF INDEXING */ } __device__ __host__ inline void save(const complex v[length / 2], int x, - int parity = 0) const // TODO: adapt to openqxd + int parity = 0) const { /* INDEXING */ int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; - // int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD // Loading as per QUDA style auto out = &field[iy_OpenQxD * length]; diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 67991b4da9..3bf2b6439a 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2396,7 +2396,7 @@ namespace quda coord3 in QUDA is x0 in OpenQxD (t) */ // int ix_OpenQxD = ipt[iy_OpenQxD]; - int dir_OpenQxD = (dir) % 4; // rotation of axes QUDA -> OpenQxD + int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD // Loading as per QUDA style auto in = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index d07e579570..8b480d31f9 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -211,7 +211,7 @@ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) gParam.scale = 0; gParam.ga_pad = getLinkPadding(dim); - // gParam.return_result_gauge = 1; //? + // gParam.return_result_gauge = 1; // I think this is not needed ? 
return gParam; } From f538e53476f361f7c77aaa44c71ff80c6b811926 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Thu, 23 Mar 2023 13:43:48 +0100 Subject: [PATCH 022/148] added openqcd dirac order --- include/color_spinor_field.h | 4 ++ include/enum_quda.h | 1 + include/enum_quda_fortran.h | 1 + include/quda.h | 12 +++++ include/quda_openqcd_interface.h | 16 ++++++ lib/interface_quda.cpp | 86 ++++++++++++++++++++++++++++++++ lib/openqcd_interface.cpp | 45 +++++++++++++++-- 7 files changed, 162 insertions(+), 3 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index eea9eed8e8..04daf931b3 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -1,5 +1,6 @@ #pragma once +#include "enum_quda.h" #include #include #include @@ -229,6 +230,9 @@ namespace quda } else if (inv_param.dirac_order == QUDA_TIFR_PADDED_DIRAC_ORDER) { fieldOrder = QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER; siteOrder = QUDA_EVEN_ODD_SITE_ORDER; + } else if (inv_param.dirac_order == QUDA_OPENQCD_DIRAC_ORDER) { + fieldOrder = QUDA_OPENQCD_FIELD_ORDER; + siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL, OR IN QUDA_OPENQCD_FIELD_ORDER THIS WORKS OUT ALREADY ??? 
} else { errorQuda("Dirac order %d not supported", inv_param.dirac_order); } diff --git a/include/enum_quda.h b/include/enum_quda.h index ee798b9e90..836cb1f1f7 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -250,6 +250,7 @@ typedef enum QudaDiracFieldOrder_s { QUDA_CPS_WILSON_DIRAC_ORDER, // odd-even, color inside spin QUDA_LEX_DIRAC_ORDER, // lexicographical order, color inside spin QUDA_TIFR_PADDED_DIRAC_ORDER, // padded z dimension for TIFR RHMC code + QUDA_OPENQCD_DIRAC_ORDER, // openqcd QUDA_INVALID_DIRAC_ORDER = QUDA_INVALID_ENUM } QudaDiracFieldOrder; diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index c78f03208a..d6b954d0bf 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -233,6 +233,7 @@ #define QUDA_CPS_WILSON_DIRAC_ORDER 4 // odd-even color inside spin #define QUDA_LEX_DIRAC_ORDER 5 // lexicographical order color inside spin #define QUDA_TIFR_PADDED_DIRAC_ORDER 6 +#define QUDA_OPENQCD_DIRAC_ORDER 7 // openqcd #define QUDA_INVALID_DIRAC_ORDER QUDA_INVALID_ENUM #define QudaCloverFieldOrder integer(4) diff --git a/include/quda.h b/include/quda.h index e574de7293..13a6489d5c 100644 --- a/include/quda.h +++ b/include/quda.h @@ -1256,6 +1256,18 @@ void dumpMultigridQuda(void *mg_instance, QudaMultigridParam *param); */ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); +#if 0 +/** + * Apply the Dslash operator (D_{eo} or D_{oe}). + * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage + * @param parity The destination parity of the field + */ +void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); +#endif + /** * @brief Perform the solve like @dslashQuda but for multiple rhs by spliting the comm grid into * sub-partitions: each sub-partition does one or more rhs'. 
diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 7e6db7a9f5..e669314b15 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -125,6 +125,22 @@ typedef struct { void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *source, void *solution, void *gauge); +#if 0 +/** + * Apply the improved staggered operator to a field. All fields + * passed and returned are host (CPU) field in MILC order. + * + * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) + * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) + * @param inv_args Struct setting some solver metadata + * @param source Right-hand side source field + * @param solution Solution spinor field + */ +void openQCD_qudaDslashTest(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *source, + void *solution, void *gauge); +#endif + + /** * Solve Ax=b for an improved staggered operator. All fields are fields * passed and returned are host (CPU) field in MILC order. This diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index ae33a44602..3a01ed63eb 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1873,6 +1873,92 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity profileDslash.TPSTOP(QUDA_PROFILE_TOTAL); } +#if 0 +void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) +{ + profileDslash.TPSTART(QUDA_PROFILE_TOTAL); + profileDslash.TPSTART(QUDA_PROFILE_INIT); + + const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? 
*gaugePrecise : *gaugeFatPrecise; + + if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH) + || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) + errorQuda("Gauge field not allocated"); + if (cloverPrecise == nullptr + && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) + errorQuda("Clover field not allocated"); + + pushVerbosity(inv_param->verbosity); + if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); + + ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location); + ColorSpinorField in_h(cpuParam); + ColorSpinorParam cudaParam(cpuParam, *inv_param, QUDA_CUDA_FIELD_LOCATION); + + cpuParam.v = h_out; + cpuParam.location = inv_param->output_location; + ColorSpinorField out_h(cpuParam); + + ColorSpinorField in(cudaParam); + ColorSpinorField out(cudaParam); + + bool pc = true; + DiracParam diracParam; + setDiracParam(diracParam, inv_param, pc); + + profileDslash.TPSTOP(QUDA_PROFILE_INIT); + + profileDslash.TPSTART(QUDA_PROFILE_H2D); + in = in_h; + profileDslash.TPSTOP(QUDA_PROFILE_H2D); + + profileDslash.TPSTART(QUDA_PROFILE_COMPUTE); + + if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); + + if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION + && (inv_param->dslash_type == QUDA_STAGGERED_DSLASH || inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) + blas::ax(1.0 / (2.0 * inv_param->mass), in); + + if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { + if (parity == QUDA_EVEN_PARITY) { + parity = QUDA_ODD_PARITY; + } else { + parity = QUDA_EVEN_PARITY; + } + blas::ax(gauge.Anisotropy(), in); + } + + Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator + if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) { + cudaParam.create = QUDA_NULL_FIELD_CREATE; + ColorSpinorField 
tmp1(cudaParam); + ((DiracTwistedCloverPC *)dirac)->TwistCloverInv(tmp1, in, (parity + 1) % 2); // apply the clover-twist + dirac->Dslash(out, tmp1, parity); // apply the operator + } else if (inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH + || inv_param->dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) { + dirac->Dslash4(out, in, parity); + } else { + dirac->Dslash(out, in, parity); // apply the operator + } + profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE); + + profileDslash.TPSTART(QUDA_PROFILE_D2H); + out_h = out; + profileDslash.TPSTOP(QUDA_PROFILE_D2H); + + if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); + + profileDslash.TPSTART(QUDA_PROFILE_FREE); + delete dirac; // clean up + + profileDslash.TPSTOP(QUDA_PROFILE_FREE); + + popVerbosity(); + profileDslash.TPSTOP(QUDA_PROFILE_TOTAL); +} +#endif + void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) { pushVerbosity(inv_param->verbosity); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 8b480d31f9..86d841028a 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -322,9 +322,9 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud invertParam->solution_type = QUDA_MATPC_SOLUTION; invertParam->solve_type = QUDA_DIRECT_PC_SOLVE; invertParam->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // not used, but required by the code. 
- invertParam->dirac_order = QUDA_DIRAC_ORDER; + invertParam->dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - invertParam->dslash_type = QUDA_ASQTAD_DSLASH; + invertParam->dslash_type = QUDA_WILSON_DSLASH; // FIXME: OR THIS; QUDA_ASQTAD_DSLASH; invertParam->Ls = 1; invertParam->gflops = 0.0; @@ -533,6 +533,45 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda return; } // qudaDslash + +#if 0 +void openQCD_qudaDslashTest(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, + void *dst, void *gauge) +{ + static const QudaVerbosity verbosity = getVerbosity(); + + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (quda_precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + + loadGaugeQuda(gauge, &qudaGaugeParam); + + QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision_sloppy = device_precision; + + QudaInvertParam invertParam = newQudaInvertParam(); + + QudaParity local_parity = inv_args.evenodd; + QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? QUDA_ODD_PARITY : QUDA_EVEN_PARITY; + + setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, + QUDA_CG_INVERTER, &invertParam); + + ColorSpinorParam csParam; + setColorSpinorParams(localDim, host_precision, &csParam); + + invertParam.dslash_type = QUDA_WILSON_DSLASH; + + dslashQudaTest(dst, src, &invertParam, local_parity); + + // TODO: need save?? 
+ + // saveGaugeQuda(gauge, &qudaGaugeParam); + + return; +} // qudaDslashTest +#endif + // #endif // void* openQCD_qudaCreateGaugeField(void *gauge, int geometry, int precision) @@ -594,7 +633,7 @@ void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_ invertParam.cuda_prec = device_precision; invertParam.cuda_prec_sloppy = device_precision_sloppy; invertParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; - invertParam.dirac_order = QUDA_DIRAC_ORDER; + invertParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; invertParam.clover_cpu_prec = host_precision; invertParam.clover_cuda_prec = device_precision; invertParam.clover_cuda_prec_sloppy = device_precision_sloppy; From 568d1cda0de42bd041c31caddfad74c3373bbd89 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Thu, 23 Mar 2023 17:51:33 +0100 Subject: [PATCH 023/148] dslash quda := NO dslash operator applied --- include/quda.h | 2 -- lib/interface_quda.cpp | 12 ++++++------ lib/openqcd_interface.cpp | 40 ++------------------------------------- 3 files changed, 8 insertions(+), 46 deletions(-) diff --git a/include/quda.h b/include/quda.h index 13a6489d5c..90fc9cf165 100644 --- a/include/quda.h +++ b/include/quda.h @@ -1256,7 +1256,6 @@ void dumpMultigridQuda(void *mg_instance, QudaMultigridParam *param); */ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); -#if 0 /** * Apply the Dslash operator (D_{eo} or D_{oe}). 
* @param h_out Result spinor field @@ -1266,7 +1265,6 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity * @param parity The destination parity of the field */ void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); -#endif /** * @brief Perform the solve like @dslashQuda but for multiple rhs by spliting the comm grid into diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 3a01ed63eb..01dd8d477a 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1873,7 +1873,7 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity profileDslash.TPSTOP(QUDA_PROFILE_TOTAL); } -#if 0 +// #if 0 void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) { profileDslash.TPSTART(QUDA_PROFILE_TOTAL); @@ -1933,13 +1933,13 @@ void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaPar if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) { cudaParam.create = QUDA_NULL_FIELD_CREATE; ColorSpinorField tmp1(cudaParam); - ((DiracTwistedCloverPC *)dirac)->TwistCloverInv(tmp1, in, (parity + 1) % 2); // apply the clover-twist - dirac->Dslash(out, tmp1, parity); // apply the operator + // ((DiracTwistedCloverPC *)dirac)->TwistCloverInv(tmp1, in, (parity + 1) % 2); // // DO NOT APPLY the clover-twist + // dirac->Dslash(out, tmp1, parity); // DO NOT APPLY OPERATOR } else if (inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) { - dirac->Dslash4(out, in, parity); + // dirac->Dslash4(out, in, parity); // DO NOT APPLY OPERATOR } else { - dirac->Dslash(out, in, parity); // apply the operator + // dirac->Dslash(out, in, parity); // DO NOT APPLY OPERATOR } profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE); @@ -1957,7 +1957,7 @@ void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, 
QudaPar popVerbosity(); profileDslash.TPSTOP(QUDA_PROFILE_TOTAL); } -#endif +// #endif void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) { diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 86d841028a..1fbc1edabd 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -525,7 +525,8 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda invertParam.dslash_type = QUDA_WILSON_DSLASH; - dslashQuda(dst, src, &invertParam, local_parity); + // dslashQuda(dst, src, &invertParam, local_parity); + dslashQudaTest(dst, src, &invertParam, local_parity); // TODO: need save?? @@ -534,43 +535,6 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda return; } // qudaDslash -#if 0 -void openQCD_qudaDslashTest(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, - void *dst, void *gauge) -{ - static const QudaVerbosity verbosity = getVerbosity(); - - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (quda_precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - - loadGaugeQuda(gauge, &qudaGaugeParam); - - QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision_sloppy = device_precision; - - QudaInvertParam invertParam = newQudaInvertParam(); - - QudaParity local_parity = inv_args.evenodd; - QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? 
QUDA_ODD_PARITY : QUDA_EVEN_PARITY; - - setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, - QUDA_CG_INVERTER, &invertParam); - - ColorSpinorParam csParam; - setColorSpinorParams(localDim, host_precision, &csParam); - - invertParam.dslash_type = QUDA_WILSON_DSLASH; - - dslashQudaTest(dst, src, &invertParam, local_parity); - - // TODO: need save?? - - // saveGaugeQuda(gauge, &qudaGaugeParam); - - return; -} // qudaDslashTest -#endif // #endif From dc8c99fafa8a3213c6a6f50b06f2f60ca67c8261 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Thu, 23 Mar 2023 19:28:06 +0100 Subject: [PATCH 024/148] strange bad indexing behaviour (check3 log) --- include/color_spinor_field_order.h | 33 ++++++++++++++---------------- lib/openqcd_interface.cpp | 14 ++++++------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 0c1d8fe4cc..f56b650964 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -11,6 +11,7 @@ * also. 
*/ +#include "enum_quda.h" #include #include #include @@ -1493,8 +1494,7 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; - template - struct SpaceSpinorColorOrder { + template struct SpaceSpinorColorOrder { using Accessor = SpaceSpinorColorOrder; using real = typename mapper::type; using complex = complex; @@ -1518,14 +1518,13 @@ namespace quda } } - __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const // TODO: adapt to openqxd + __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const { auto in = &field[(parity * volumeCB + x) * length]; block_load(v, reinterpret_cast(in)); } - __device__ __host__ inline void save(const complex v[length / 2], int x, - int parity = 0) const // TODO: adapt to openqxd + __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0) const { auto out = &field[(parity * volumeCB + x) * length]; block_store(reinterpret_cast(out), v); @@ -1722,9 +1721,8 @@ namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; - -// Based on ''SpaceSpinorColorOrder'' TODO: - template struct OpenQCDDiracOrder { + // Based on ''SpaceSpinorColorOrder'' TODO: + template struct OpenQCDDiracOrder { using Accessor = OpenQCDDiracOrder; using real = typename mapper::type; using complex = complex; @@ -1741,9 +1739,9 @@ namespace quda field(field_ ? field_ : (Float *)a.V()), offset(a.Bytes() / (2 * sizeof(Float))), volumeCB(a.VolumeCB()), - nParity(a.SiteSubset()), + nParity(QUDA_FULL_SITE_SUBSET), dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions - { // TODO: IS THIS NEEDED?? + { // TODO: IS THIS NEEDED?? for (int i = 0; i < 4; i++) { ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0; ghost[2 * i + 1] = ghost_ ? 
ghost_[2 * i + 1] : 0; @@ -1751,11 +1749,11 @@ namespace quda } } -/* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - coord1 in QUDA is x2 in OpenQxD (y) - coord2 in QUDA is x3 in OpenQxD (z) - coord3 in QUDA is x0 in OpenQxD (t) - */ + /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) + coord1 in QUDA is x2 in OpenQxD (y) + coord2 in QUDA is x3 in OpenQxD (z) + coord3 in QUDA is x0 in OpenQxD (t) + */ __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const { @@ -1767,12 +1765,11 @@ namespace quda // Loading as per QUDA style auto in = &field[iy_OpenQxD * length]; // This is how they're accessed within OpenQxd (length = 24 doubles - // = 12 complex doubles = 4 spinor x 3 colors) + // = 12 complex doubles = 4 spinor x 3 colors) block_load(v, reinterpret_cast(in)); } - __device__ __host__ inline void save(const complex v[length / 2], int x, - int parity = 0) const + __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0) const { /* INDEXING */ int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 1fbc1edabd..2b6b18c1ea 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -354,19 +354,19 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud static void setColorSpinorParams(const int dim[4], QudaPrecision precision, ColorSpinorParam *param) { param->nColor = 3; - param->nSpin = 1; // TODO: + param->nSpin = 4; // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor param->nDim = 4; // TODO: check how to adapt this for openqxd for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; - param->x[0] /= 2; + param->x[0] /= 2; // FIXME: param->setPrecision(precision); param->pad = 0; - param->siteSubset = QUDA_PARITY_SITE_SUBSET; // TODO: check how to adapt this for openqxd - param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // TODO: check how to adapt this for openqxd - param->fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; - param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. // TODO: - param->create = QUDA_ZERO_FIELD_CREATE; // TODO: check how to adapt this for openqxd + param->siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME: check how to adapt this for openqxd + param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd + param->fieldOrder = QUDA_OPENQCD_FIELD_ORDER; // FIXME: + param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. // // FIXME:: + param->create = QUDA_ZERO_FIELD_CREATE; // // FIXME:: check how to adapt this for openqxd ?? 
created -0 in weird places } void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_QudaInvertArgs_t &inv_args, From ecc21779ab47b76144b2c4a70c5cbf8b10ef63f2 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Fri, 24 Mar 2023 01:29:17 +0100 Subject: [PATCH 025/148] small corrections + comments In check3, only half the sites are loaded onto quda (odd ones on whole lattice), then in the save function, only the odd ones in the first half get reloaded! --- include/color_spinor_field_order.h | 9 +++++---- include/gauge_field_order.h | 2 +- include/quda_openqcd_interface.h | 16 +--------------- lib/openqcd_interface.cpp | 10 ++++++---- 4 files changed, 13 insertions(+), 24 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index f56b650964..96555ecbad 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1726,7 +1726,7 @@ namespace quda using Accessor = OpenQCDDiracOrder; using real = typename mapper::type; using complex = complex; - static const int length = 2 * Ns * Nc; + static const int length = 2 * Ns * Nc; // 12 complex (2 floats) numbers per spinor color field Float *field; size_t offset; Float *ghost[8]; @@ -1739,14 +1739,15 @@ namespace quda field(field_ ? field_ : (Float *)a.V()), offset(a.Bytes() / (2 * sizeof(Float))), volumeCB(a.VolumeCB()), - nParity(QUDA_FULL_SITE_SUBSET), + nParity(QUDA_FULL_SITE_SUBSET), // => nParity = 2 dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions - { // TODO: IS THIS NEEDED?? + { // TODO: ARE GHOSTS NEEDED?? for (int i = 0; i < 4; i++) { ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0; ghost[2 * i + 1] = ghost_ ? 
ghost_[2 * i + 1] : 0; faceVolumeCB[i] = a.SurfaceCB(i) * nFace; } + if constexpr (length != 24) errorQuda("Spinor field length %d not supported", length); } /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) @@ -1817,7 +1818,7 @@ namespace quda // } // } - size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } + size_t Bytes() const { return Nc * Ns * 2 * sizeof(Float); } // FIXME: ?? }; // openQCDDiracOrder } // namespace colorspinor diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 3bf2b6439a..7c019f780e 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2438,7 +2438,7 @@ namespace quda @brief This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operators for manipulating at the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting + @param[in] dim Which dimension are we requesting // FIXME: dim is a typo for dir! @param[in] x_cb Checkerboarded space-time index we are requesting @param[in] parity Parity we are requesting @return Instance of a gauge_wrapper that curries in access to diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index e669314b15..f7c819ba65 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -110,6 +110,7 @@ typedef struct { HISQ fermions since the tadpole factor is baked into the links during their construction */ double naik_epsilon; /** Naik epsilon parameter (HISQ fermions only).*/ + QudaDslashType dslash_type; } openQCD_QudaInvertArgs_t; /** @@ -125,21 +126,6 @@ typedef struct { void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *source, void *solution, void *gauge); -#if 0 -/** - * Apply the improved staggered operator to a field. All fields - * passed and returned are host (CPU) field in MILC order. 
- * - * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) - * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) - * @param inv_args Struct setting some solver metadata - * @param source Right-hand side source field - * @param solution Solution spinor field - */ -void openQCD_qudaDslashTest(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *source, - void *solution, void *gauge); -#endif - /** * Solve Ax=b for an improved staggered operator. All fields are fields diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 2b6b18c1ea..2c22ef6bf5 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -358,7 +358,7 @@ static void setColorSpinorParams(const int dim[4], QudaPrecision precision, Colo param->nDim = 4; // TODO: check how to adapt this for openqxd for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; - param->x[0] /= 2; // FIXME: + // param->x[0] /= 2; // FIXME: ?? param->setPrecision(precision); param->pad = 0; @@ -517,14 +517,16 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda QudaParity local_parity = inv_args.evenodd; QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? 
QUDA_ODD_PARITY : QUDA_EVEN_PARITY; + /* setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, + double mass, double target_residual, double target_residual_hq, int maxiter, + double reliable_delta, QudaParity parity, QudaVerbosity verbosity, + QudaInverterType inverter, QudaInvertParam *invertParam) */ setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, QUDA_CG_INVERTER, &invertParam); ColorSpinorParam csParam; setColorSpinorParams(localDim, host_precision, &csParam); - invertParam.dslash_type = QUDA_WILSON_DSLASH; - // dslashQuda(dst, src, &invertParam, local_parity); dslashQudaTest(dst, src, &invertParam, local_parity); @@ -582,7 +584,7 @@ void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_ static const QudaVerbosity verbosity = getVerbosity(); - invertParam.dslash_type = QUDA_CLOVER_WILSON_DSLASH; + invertParam.dslash_type = QUDA_WILSON_DSLASH; invertParam.kappa = kappa; invertParam.dagger = QUDA_DAG_NO; invertParam.mass_normalization = QUDA_KAPPA_NORMALIZATION; From 8b1830db5ea749cfd129063ef07d0dad067ed70b Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Fri, 24 Mar 2023 13:37:41 +0100 Subject: [PATCH 026/148] cleanup and test oddeven --- include/color_spinor_field.h | 2 +- include/color_spinor_field_order.h | 4 +- lib/interface_quda.cpp | 4 +- lib/openqcd_interface.cpp | 95 +++++++++++++++--------------- 4 files changed, 52 insertions(+), 53 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 04daf931b3..0010a7a362 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -232,7 +232,7 @@ namespace quda siteOrder = QUDA_EVEN_ODD_SITE_ORDER; } else if (inv_param.dirac_order == QUDA_OPENQCD_DIRAC_ORDER) { fieldOrder = QUDA_OPENQCD_FIELD_ORDER; - siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL, OR IN QUDA_OPENQCD_FIELD_ORDER THIS 
WORKS OUT ALREADY ??? + siteOrder = QUDA_ODD_EVEN_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL, OR IN QUDA_OPENQCD_FIELD_ORDER THIS WORKS OUT ALREADY ??? } else { errorQuda("Dirac order %d not supported", inv_param.dirac_order); } diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 96555ecbad..c97a2cf1e2 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1737,7 +1737,7 @@ namespace quda OpenQCDDiracOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : field(field_ ? field_ : (Float *)a.V()), - offset(a.Bytes() / (2 * sizeof(Float))), + // offset(a.Bytes() / (2 * sizeof(Float))), volumeCB(a.VolumeCB()), nParity(QUDA_FULL_SITE_SUBSET), // => nParity = 2 dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions @@ -1818,7 +1818,7 @@ namespace quda // } // } - size_t Bytes() const { return Nc * Ns * 2 * sizeof(Float); } // FIXME: ?? + size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } // FIXME: ?? 
}; // openQCDDiracOrder } // namespace colorspinor diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 01dd8d477a..b3bb200746 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1931,8 +1931,8 @@ void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaPar Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) { - cudaParam.create = QUDA_NULL_FIELD_CREATE; - ColorSpinorField tmp1(cudaParam); + // cudaParam.create = QUDA_NULL_FIELD_CREATE; + // ColorSpinorField tmp1(cudaParam); // ((DiracTwistedCloverPC *)dirac)->TwistCloverInv(tmp1, in, (parity + 1) % 2); // // DO NOT APPLY the clover-twist // dirac->Dslash(out, tmp1, parity); // DO NOT APPLY OPERATOR } else if (inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 2c22ef6bf5..dc900ae92d 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -210,7 +210,7 @@ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) gParam.tadpole_coeff = 1.0; gParam.scale = 0; gParam.ga_pad = getLinkPadding(dim); - + // gParam.return_result_gauge = 1; // I think this is not needed ? return gParam; @@ -358,15 +358,15 @@ static void setColorSpinorParams(const int dim[4], QudaPrecision precision, Colo param->nDim = 4; // TODO: check how to adapt this for openqxd for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; - // param->x[0] /= 2; // FIXME: ?? + // param->x[0] /= 2; // for staggered sites only? 
param->setPrecision(precision); param->pad = 0; - param->siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME: check how to adapt this for openqxd - param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd - param->fieldOrder = QUDA_OPENQCD_FIELD_ORDER; // FIXME: + param->siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME: check how to adapt this for openqxd + param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd // EVEN-ODD is only about inner ordering in quda + param->fieldOrder = QUDA_OPENQCD_FIELD_ORDER; // FIXME: param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. // // FIXME:: - param->create = QUDA_ZERO_FIELD_CREATE; // // FIXME:: check how to adapt this for openqxd ?? created -0 in weird places + param->create = QUDA_ZERO_FIELD_CREATE; // // FIXME:: check how to adapt this for openqxd ?? created -0 in weird places } void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_QudaInvertArgs_t &inv_args, @@ -416,6 +416,47 @@ void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_Qu // qudaGaugeParam.ga_pad = getLinkPadding(dim); } +// #if 0 +void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, + void *dst, void *gauge) +{ + static const QudaVerbosity verbosity = getVerbosity(); + + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (quda_precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + + loadGaugeQuda(gauge, &qudaGaugeParam); + + QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision = (quda_precision == 2) ? 
QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision_sloppy = device_precision; + + QudaInvertParam invertParam = newQudaInvertParam(); + + QudaParity local_parity = inv_args.evenodd; + QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? QUDA_ODD_PARITY : QUDA_EVEN_PARITY; + + /* setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, + double mass, double target_residual, double target_residual_hq, int maxiter, + double reliable_delta, QudaParity parity, QudaVerbosity verbosity, + QudaInverterType inverter, QudaInvertParam *invertParam) */ + setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, + QUDA_CG_INVERTER, &invertParam); + + ColorSpinorParam csParam; + setColorSpinorParams(localDim, host_precision, &csParam); + + // dslashQuda(dst, src, &invertParam, local_parity); + dslashQudaTest(dst, src, &invertParam, local_parity); + + // TODO: need save?? + + // saveGaugeQuda(gauge, &qudaGaugeParam); + + return; +} // qudaDslash +// #endif + #if 0 void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, openQCD_QudaInvertArgs_t inv_args, double target_residual, double target_fermilab_residual, const void *const fatlink, @@ -497,48 +538,6 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, qudamilc_called(__func__, verbosity); } // qudaInvert #endif -// #if 0 -void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, - void *dst, void *gauge) -{ - static const QudaVerbosity verbosity = getVerbosity(); - - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (quda_precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - - loadGaugeQuda(gauge, &qudaGaugeParam); - - QudaPrecision host_precision = (external_precision == 2) ? 
QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision_sloppy = device_precision; - - QudaInvertParam invertParam = newQudaInvertParam(); - - QudaParity local_parity = inv_args.evenodd; - QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? QUDA_ODD_PARITY : QUDA_EVEN_PARITY; - - /* setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, - double mass, double target_residual, double target_residual_hq, int maxiter, - double reliable_delta, QudaParity parity, QudaVerbosity verbosity, - QudaInverterType inverter, QudaInvertParam *invertParam) */ - setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, - QUDA_CG_INVERTER, &invertParam); - - ColorSpinorParam csParam; - setColorSpinorParams(localDim, host_precision, &csParam); - - // dslashQuda(dst, src, &invertParam, local_parity); - dslashQudaTest(dst, src, &invertParam, local_parity); - - // TODO: need save?? 
- - // saveGaugeQuda(gauge, &qudaGaugeParam); - - return; -} // qudaDslash - - -// #endif // void* openQCD_qudaCreateGaugeField(void *gauge, int geometry, int precision) // { From f7385106f72cb2a84915cbaf97a7f4ff4e623300 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Fri, 24 Mar 2023 18:58:01 +0100 Subject: [PATCH 027/148] latest test --- include/color_spinor_field.h | 2 +- include/color_spinor_field_order.h | 64 +++++++++++++++++++++++------- 2 files changed, 51 insertions(+), 15 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 0010a7a362..04daf931b3 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -232,7 +232,7 @@ namespace quda siteOrder = QUDA_EVEN_ODD_SITE_ORDER; } else if (inv_param.dirac_order == QUDA_OPENQCD_DIRAC_ORDER) { fieldOrder = QUDA_OPENQCD_FIELD_ORDER; - siteOrder = QUDA_ODD_EVEN_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL, OR IN QUDA_OPENQCD_FIELD_ORDER THIS WORKS OUT ALREADY ??? + siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL, OR IN QUDA_OPENQCD_FIELD_ORDER THIS WORKS OUT ALREADY ??? } else { errorQuda("Dirac order %d not supported", inv_param.dirac_order); } diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index c97a2cf1e2..214a843e72 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1737,19 +1737,51 @@ namespace quda OpenQCDDiracOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : field(field_ ? 
field_ : (Float *)a.V()), - // offset(a.Bytes() / (2 * sizeof(Float))), + offset(a.Bytes() / (2 * sizeof(Float))), volumeCB(a.VolumeCB()), - nParity(QUDA_FULL_SITE_SUBSET), // => nParity = 2 - dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions + nParity(a.SiteSubset()), + dim {a.X(0), a.X(1), a.X(2), a.X(3)} + // dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions { // TODO: ARE GHOSTS NEEDED?? - for (int i = 0; i < 4; i++) { - ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0; - ghost[2 * i + 1] = ghost_ ? ghost_[2 * i + 1] : 0; - faceVolumeCB[i] = a.SurfaceCB(i) * nFace; - } + // for (int i = 0; i < 4; i++) { + // ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0; + // ghost[2 * i + 1] = ghost_ ? ghost_[2 * i + 1] : 0; + // faceVolumeCB[i] = a.SurfaceCB(i) * nFace; + // } if constexpr (length != 24) errorQuda("Spinor field length %d not supported", length); } + + /** + @brief Convert from 1-dimensional index to the n-dimensional + spatial index. With full fields, we assume that the field is + even-odd ordered. The lattice coordinates that are computed + here are full-field coordinates. 
+ */ + // __device__ __host__ inline void LatticeIndexOpenQCD(int y[4], int i) const + // { + // // if (siteSubset == QUDA_FULL_SITE_SUBSET) + // x[0] /= 2; + + // for (int d = 0; d < 4; d++) { + // y[d] = i % x[d]; + // i /= x[d]; + // } + // int parity = i; // parity is the slowest running dimension + + // // convert into the full-field lattice coordinate + // // if (siteSubset == QUDA_FULL_SITE_SUBSET) { + // for (int d = 1; d < nDim; d++) parity += y[d]; + // parity = parity & 1; + // x[0] *= 2; // restore x[0] + // // } + + // y[0] = 2 * y[0] + parity; // compute the full x coordinate + // } + + + + /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) coord1 in QUDA is x2 in OpenQxD (y) coord2 in QUDA is x3 in OpenQxD (z) @@ -1767,20 +1799,24 @@ namespace quda // Loading as per QUDA style auto in = &field[iy_OpenQxD * length]; // This is how they're accessed within OpenQxd (length = 24 doubles // = 12 complex doubles = 4 spinor x 3 colors) + // + printf("Loading site iy: %d with field value %.10e \n",iy_OpenQxD,field[iy_OpenQxD * length]); block_load(v, reinterpret_cast(in)); } __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0) const { /* INDEXING */ - int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) - getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site + // int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
+ // getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site - int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; + // int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; - // Loading as per QUDA style - auto out = &field[iy_OpenQxD * length]; - block_store(reinterpret_cast(out), v); + // // Loading as per QUDA style + // auto out = &field[iy_OpenQxD * length]; + // printf("Saving site iy: %d with field value %.10e \n",iy_OpenQxD,field[iy_OpenQxD * length]); + + // block_store(reinterpret_cast(out), v); } /** From ad8f689f39d51900d4abd4287d537af8a5c62046 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Sat, 25 Mar 2023 19:12:47 +0100 Subject: [PATCH 028/148] formatting --- include/color_spinor_field.h | 3 +- include/color_spinor_field_order.h | 18 ++--- include/enum_quda.h | 5 +- include/enum_quda_fortran.h | 125 +++++++++++++++-------------- include/gauge_field_order.h | 6 +- include/quda_openqcd_interface.h | 3 +- lib/openqcd_interface.cpp | 5 +- 7 files changed, 81 insertions(+), 84 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 04daf931b3..e4375351ca 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -232,7 +232,8 @@ namespace quda siteOrder = QUDA_EVEN_ODD_SITE_ORDER; } else if (inv_param.dirac_order == QUDA_OPENQCD_DIRAC_ORDER) { fieldOrder = QUDA_OPENQCD_FIELD_ORDER; - siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL, OR IN QUDA_OPENQCD_FIELD_ORDER THIS WORKS OUT ALREADY ??? + siteOrder + = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL, OR IN QUDA_OPENQCD_FIELD_ORDER THIS WORKS OUT ALREADY ??? 
} else { errorQuda("Dirac order %d not supported", inv_param.dirac_order); } diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 214a843e72..146e96084f 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1741,8 +1741,8 @@ namespace quda volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), dim {a.X(0), a.X(1), a.X(2), a.X(3)} - // dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions - { // TODO: ARE GHOSTS NEEDED?? + // dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions + { // TODO: ARE GHOSTS NEEDED?? // for (int i = 0; i < 4; i++) { // ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0; // ghost[2 * i + 1] = ghost_ ? ghost_[2 * i + 1] : 0; @@ -1751,7 +1751,6 @@ namespace quda if constexpr (length != 24) errorQuda("Spinor field length %d not supported", length); } - /** @brief Convert from 1-dimensional index to the n-dimensional spatial index. With full fields, we assume that the field is @@ -1760,7 +1759,7 @@ namespace quda */ // __device__ __host__ inline void LatticeIndexOpenQCD(int y[4], int i) const // { - // // if (siteSubset == QUDA_FULL_SITE_SUBSET) + // // if (siteSubset == QUDA_FULL_SITE_SUBSET) // x[0] /= 2; // for (int d = 0; d < 4; d++) { @@ -1775,13 +1774,10 @@ namespace quda // parity = parity & 1; // x[0] *= 2; // restore x[0] // // } - + // y[0] = 2 * y[0] + parity; // compute the full x coordinate // } - - - /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) coord1 in QUDA is x2 in OpenQxD (y) coord2 in QUDA is x3 in OpenQxD (z) @@ -1799,8 +1795,8 @@ namespace quda // Loading as per QUDA style auto in = &field[iy_OpenQxD * length]; // This is how they're accessed within OpenQxd (length = 24 doubles // = 12 complex doubles = 4 spinor x 3 colors) - // - printf("Loading site iy: %d with field value %.10e \n",iy_OpenQxD,field[iy_OpenQxD * length]); + // + printf("Loading site iy: %d with field value %.10e \n", iy_OpenQxD, field[iy_OpenQxD * 
length]); block_load(v, reinterpret_cast(in)); } @@ -1855,7 +1851,7 @@ namespace quda // } size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } // FIXME: ?? - }; // openQCDDiracOrder + }; // openQCDDiracOrder } // namespace colorspinor diff --git a/include/enum_quda.h b/include/enum_quda.h index 836cb1f1f7..cc228446da 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -48,7 +48,7 @@ typedef enum QudaGaugeFieldOrder_s { QUDA_BQCD_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime+halos, column-row order QUDA_TIFR_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime, column-row order QUDA_TIFR_PADDED_GAUGE_ORDER, // expect *gauge, mu, parity, t, z+halo, y, x/2, column-row order - QUDA_OPENQCD_GAUGE_ORDER, // expect *gauge, spacetime, mu, parity (uplink/downlink), row-column order -- links attached to odd points only + QUDA_OPENQCD_GAUGE_ORDER, // expect *gauge, spacetime, mu, parity (uplink/downlink), row-column order -- links attached to odd points only QUDA_INVALID_GAUGE_ORDER = QUDA_INVALID_ENUM } QudaGaugeFieldOrder; @@ -357,7 +357,7 @@ typedef enum QudaFieldOrder_s { QUDA_QDPJIT_FIELD_ORDER, // QDP field ordering (complex-color-spin-spacetime) QUDA_QOP_DOMAIN_WALL_FIELD_ORDER, // QOP domain-wall ordering QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER, // TIFR RHMC ordering - QUDA_OPENQCD_FIELD_ORDER, // OPENQCD geometry ordering (at the moment lexicographical w/ rotation zyxt = x3x2x1x0 |-> xyzt x0x1x2x3 ) + QUDA_OPENQCD_FIELD_ORDER, // OPENQCD geometry ordering (at the moment lexicographical w/ rotation zyxt = x3x2x1x0 |-> xyzt x0x1x2x3 ) QUDA_INVALID_FIELD_ORDER = QUDA_INVALID_ENUM } QudaFieldOrder; @@ -584,4 +584,3 @@ typedef enum QudaExtLibType_s { #ifdef __cplusplus } #endif - diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index d6b954d0bf..ccd17b74e2 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -9,7 +9,7 @@ # gfortran). 
#*/ -#define QUDA_INVALID_ENUM (-Z'7fffffff' - 1) +#define QUDA_INVALID_ENUM (-Z '7fffffff' - 1) #define QudaLinkType integer(4) @@ -22,27 +22,27 @@ #define QUDA_MEMORY_MAPPED 2 #define QUDA_MEMORY_INVALID QUDA_INVALID_ENUM -#define QUDA_SU3_LINKS 0 -#define QUDA_GENERAL_LINKS 1 -#define QUDA_THREE_LINKS 2 +#define QUDA_SU3_LINKS 0 +#define QUDA_GENERAL_LINKS 1 +#define QUDA_THREE_LINKS 2 #define QUDA_MOMENTUM_LINKS 3 -#define QUDA_COARSE_LINKS 4 -#define QUDA_SMEARED_LINKS 5 +#define QUDA_COARSE_LINKS 4 +#define QUDA_SMEARED_LINKS 5 -#define QUDA_WILSON_LINKS QUDA_SU3_LINKS -#define QUDA_ASQTAD_FAT_LINKS QUDA_GENERAL_LINKS -#define QUDA_ASQTAD_LONG_LINKS QUDA_THREE_LINKS -#define QUDA_ASQTAD_MOM_LINKS QUDA_MOMENTUM_LINKS +#define QUDA_WILSON_LINKS QUDA_SU3_LINKS +#define QUDA_ASQTAD_FAT_LINKS QUDA_GENERAL_LINKS +#define QUDA_ASQTAD_LONG_LINKS QUDA_THREE_LINKS +#define QUDA_ASQTAD_MOM_LINKS QUDA_MOMENTUM_LINKS #define QUDA_ASQTAD_GENERAL_LINKS QUDA_GENERAL_LINKS -#define QUDA_INVALID_LINKS QUDA_INVALID_ENUM +#define QUDA_INVALID_LINKS QUDA_INVALID_ENUM #define QudaGaugeFieldOrder integer(4) #define QUDA_FLOAT_GAUGE_ORDER 1 -#define QUDA_FLOAT2_GAUGE_ORDER 2 //no reconstruct and double precision -#define QUDA_FLOAT4_GAUGE_ORDER 4 // 8 reconstruct single, and 12 reconstruct single, half, quarter -#define QUDA_FLOAT8_GAUGE_ORDER 8 // 8 reconstruct half and quarter -#define QUDA_NATIVE_GAUGE_ORDER 9 // used to denote one of the above types in a trait, not used directly -#define QUDA_QDP_GAUGE_ORDER 10 // expect *gauge[4] even-odd spacetime row-column color +#define QUDA_FLOAT2_GAUGE_ORDER 2 // no reconstruct and double precision +#define QUDA_FLOAT4_GAUGE_ORDER 4 // 8 reconstruct single, and 12 reconstruct single, half, quarter +#define QUDA_FLOAT8_GAUGE_ORDER 8 // 8 reconstruct half and quarter +#define QUDA_NATIVE_GAUGE_ORDER 9 // used to denote one of the above types in a trait, not used directly +#define QUDA_QDP_GAUGE_ORDER 10 // expect *gauge[4] even-odd 
spacetime row-column color #define QUDA_QDPJIT_GAUGE_ORDER 11 // expect *gauge[4] even-odd spacetime row-column color #define QUDA_CPS_WILSON_GAUGE_ORDER 12 // expect *gauge even-odd spacetime column-row color #define QUDA_MILC_GAUGE_ORDER 13 // expect *gauge even-odd mu spacetime row-column order @@ -69,21 +69,21 @@ #define QudaReconstructType integer(4) #define QUDA_RECONSTRUCT_NO 18 #define QUDA_RECONSTRUCT_12 12 -#define QUDA_RECONSTRUCT_8 8 -#define QUDA_RECONSTRUCT_9 9 +#define QUDA_RECONSTRUCT_8 8 +#define QUDA_RECONSTRUCT_9 9 #define QUDA_RECONSTRUCT_13 13 #define QUDA_RECONSTRUCT_10 10 #define QUDA_RECONSTRUCT_INVALID QUDA_INVALID_ENUM #define QudaGaugeFixed integer(4) -#define QUDA_GAUGE_FIXED_NO 0 +#define QUDA_GAUGE_FIXED_NO 0 #define QUDA_GAUGE_FIXED_YES 1 // gauge field stored in temporal gauge #define QUDA_GAUGE_FIXED_INVALID QUDA_INVALID_ENUM ! Types used in QudaInvertParam #define QudaDslashType integer(4) -#define QUDA_WILSON_DSLASH 0 +#define QUDA_WILSON_DSLASH 0 #define QUDA_CLOVER_WILSON_DSLASH 1 #define QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH 2 #define QUDA_DOMAIN_WALL_DSLASH 3 @@ -125,9 +125,9 @@ #define QUDA_INVALID_INVERTER QUDA_INVALID_ENUM #define QudaEigType integer(4) -#define QUDA_EIG_TR_LANCZOS 0 // Thick Restarted Lanczos Solver +#define QUDA_EIG_TR_LANCZOS 0 // Thick Restarted Lanczos Solver #define QUDA_EIG_BLK_IR_LANCZOS 1 // Block Thick Restarted Lanczos Solver -#define QUDA_EIG_IR_ARNOLDI 2 // Implicitly restarted Arnoldi solver +#define QUDA_EIG_IR_ARNOLDI 2 // Implicitly restarted Arnoldi solver #define QUDA_EIG_BLK_IR_ARNOLDI 3 // Block Implicitly restarted Arnoldi solver (not yet implemented) #define QUDA_EIG_INVALID QUDA_INVALID_ENUM @@ -141,7 +141,7 @@ #define QUDA_SPECTRUM_INVALID QUDA_INVALID_ENUM #define QudaSolutionType integer(4) -#define QUDA_MAT_SOLUTION 0 +#define QUDA_MAT_SOLUTION 0 #define QUDA_MATDAG_MAT_SOLUTION 1 #define QUDA_MATPC_SOLUTION 2 #define QUDA_MATPC_DAG_SOLUTION 3 @@ -156,7 +156,7 @@ #define 
QUDA_NORMOP_PC_SOLVE 3 #define QUDA_NORMERR_SOLVE 4 #define QUDA_NORMERR_PC_SOLVE 5 -#define QUDA_NORMEQ_SOLVE QUDA_NORMOP_SOLVE // deprecated +#define QUDA_NORMEQ_SOLVE QUDA_NORMOP_SOLVE // deprecated #define QUDA_NORMEQ_PC_SOLVE QUDA_NORMOP_PC_SOLVE // deprecated #define QUDA_INVALID_SOLVE QUDA_INVALID_ENUM @@ -168,7 +168,7 @@ #define QUDA_MG_CYCLE_INVALID QUDA_INVALID_ENUM #define QudaSchwarzType integer(4) -#define QUDA_ADDITIVE_SCHWARZ 0 +#define QUDA_ADDITIVE_SCHWARZ 0 #define QUDA_MULTIPLICATIVE_SCHWARZ 1 #define QUDA_INVALID_SCHWARZ QUDA_INVALID_ENUM @@ -206,44 +206,44 @@ #define QUDA_MATPC_INVALID QUDA_INVALID_ENUM #define QudaDagType integer(4) -#define QUDA_DAG_NO 0 +#define QUDA_DAG_NO 0 #define QUDA_DAG_YES 1 #define QUDA_DAG_INVALID QUDA_INVALID_ENUM - + #define QudaMassNormalization integer(4) -#define QUDA_KAPPA_NORMALIZATION 0 +#define QUDA_KAPPA_NORMALIZATION 0 #define QUDA_MASS_NORMALIZATION 1 #define QUDA_ASYMMETRIC_MASS_NORMALIZATION 2 #define QUDA_INVALID_NORMALIZATION QUDA_INVALID_ENUM #define QudaSolverNormalization integer(4) #define QUDA_DEFAULT_NORMALIZATION 0 // leave source and solution untouched -#define QUDA_SOURCE_NORMALIZATION 1 // normalize such that || src || = 1 +#define QUDA_SOURCE_NORMALIZATION 1 // normalize such that || src || = 1 #define QudaPreserveSource integer(4) -#define QUDA_PRESERVE_SOURCE_NO 0 // use the source for the residual +#define QUDA_PRESERVE_SOURCE_NO 0 // use the source for the residual #define QUDA_PRESERVE_SOURCE_YES 1 #define QUDA_PRESERVE_SOURCE_INVALID QUDA_INVALID_ENUM #define QudaDiracFieldOrder integer(4) -#define QUDA_INTERNAL_DIRAC_ORDER 0 // internal dirac order used by QUDA varies depending on precision and dslash type +#define QUDA_INTERNAL_DIRAC_ORDER 0 // internal dirac order used by QUDA varies depending on precision and dslash type #define QUDA_DIRAC_ORDER 1 -#define QUDA_QDP_DIRAC_ORDER 2 // even-odd spin inside color -#define QUDA_QDPJIT_DIRAC_ORDER 3 // even-odd, 
complex-color-spin-spacetime -#define QUDA_CPS_WILSON_DIRAC_ORDER 4 // odd-even color inside spin -#define QUDA_LEX_DIRAC_ORDER 5 // lexicographical order color inside spin +#define QUDA_QDP_DIRAC_ORDER 2 // even-odd spin inside color +#define QUDA_QDPJIT_DIRAC_ORDER 3 // even-odd, complex-color-spin-spacetime +#define QUDA_CPS_WILSON_DIRAC_ORDER 4 // odd-even color inside spin +#define QUDA_LEX_DIRAC_ORDER 5 // lexicographical order color inside spin #define QUDA_TIFR_PADDED_DIRAC_ORDER 6 -#define QUDA_OPENQCD_DIRAC_ORDER 7 // openqcd +#define QUDA_OPENQCD_DIRAC_ORDER 7 // openqcd #define QUDA_INVALID_DIRAC_ORDER QUDA_INVALID_ENUM #define QudaCloverFieldOrder integer(4) -#define QUDA_FLOAT_CLOVER_ORDER 1 // even-odd float ordering -#define QUDA_FLOAT2_CLOVER_ORDER 2 // even-odd float2 ordering -#define QUDA_FLOAT4_CLOVER_ORDER 4 // even-odd float4 ordering -#define QUDA_FLOAT8_CLOVER_ORDER 8 // even-odd float8 ordering -#define QUDA_PACKED_CLOVER_ORDER 9 // even-odd packed -#define QUDA_QDPJIT_CLOVER_ORDER 10 // lexicographical order packed -#define QUDA_BQCD_CLOVER_ORDER 11 // BQCD order which is a packed super-diagonal form +#define QUDA_FLOAT_CLOVER_ORDER 1 // even-odd float ordering +#define QUDA_FLOAT2_CLOVER_ORDER 2 // even-odd float2 ordering +#define QUDA_FLOAT4_CLOVER_ORDER 4 // even-odd float4 ordering +#define QUDA_FLOAT8_CLOVER_ORDER 8 // even-odd float8 ordering +#define QUDA_PACKED_CLOVER_ORDER 9 // even-odd packed +#define QUDA_QDPJIT_CLOVER_ORDER 10 // lexicographical order packed +#define QUDA_BQCD_CLOVER_ORDER 11 // BQCD order which is a packed super-diagonal form #define QUDA_INVALID_CLOVER_ORDER QUDA_INVALID_ENUM #define QudaVerbosity integer(4) @@ -319,28 +319,29 @@ ! 
Site ordering (always t-z-y-x with rightmost varying fastest) #define QudaSiteOrder integer(4) #define QUDA_LEXICOGRAPHIC_SITE_ORDER 0 // lexicographic ordering -#define QUDA_EVEN_ODD_SITE_ORDER 1 // QUDA and QDP use this -#define QUDA_ODD_EVEN_SITE_ORDER 2 // CPS uses this +#define QUDA_EVEN_ODD_SITE_ORDER 1 // QUDA and QDP use this +#define QUDA_ODD_EVEN_SITE_ORDER 2 // CPS uses this #define QUDA_INVALID_SITE_ORDER QUDA_INVALID_ENUM ! Degree of freedom ordering #define QudaFieldOrder integer(4) -#define QUDA_FLOAT_FIELD_ORDER 1 // spin-color-complex-space -#define QUDA_FLOAT2_FIELD_ORDER 2 // (spin-color-complex)/2-space-(spin-color-complex)%2 -#define QUDA_FLOAT4_FIELD_ORDER 4 // (spin-color-complex)/4-space-(spin-color-complex)%4 -#define QUDA_FLOAT8_FIELD_ORDER 8 // (spin-color-complex)/8-space-(spin-color-complex)%8 +#define QUDA_FLOAT_FIELD_ORDER 1 // spin-color-complex-space +#define QUDA_FLOAT2_FIELD_ORDER 2 // (spin-color-complex)/2-space-(spin-color-complex)%2 +#define QUDA_FLOAT4_FIELD_ORDER 4 // (spin-color-complex)/4-space-(spin-color-complex)%4 +#define QUDA_FLOAT8_FIELD_ORDER 8 // (spin-color-complex)/8-space-(spin-color-complex)%8 #define QUDA_SPACE_SPIN_COLOR_FIELD_ORDER 9 // CPS/QDP++ ordering #define QUDA_SPACE_COLOR_SPIN_FIELD_ORDER 10 // QLA ordering (spin inside color) #define QUDA_QDPJIT_FIELD_ORDER 11 // QDP field ordering (complex-color-spin-spacetime) #define QUDA_QOP_DOMAIN_WALL_FIELD_ORDER 12 // QOP domain-wall ordering #define QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER 13 // TIFR RHMC ordering -#define QUDA_OPENQCD_FIELD_ORDER 14 // OPENQCD geometry ordering (at the moment lexicographical w/ rotation zyxt = x3x2x1x0 |-> xyzt x0x1x2x3 ) +#define QUDA_OPENQCD_FIELD_ORDER \ + 14 // OPENQCD geometry ordering (at the moment lexicographical w/ rotation zyxt = x3x2x1x0 |-> xyzt x0x1x2x3 ) #define QUDA_INVALID_FIELD_ORDER QUDA_INVALID_ENUM - + #define QudaFieldCreate integer(4) -#define QUDA_NULL_FIELD_CREATE 0 // create new field -#define 
QUDA_ZERO_FIELD_CREATE 1 // create new field and zero it -#define QUDA_COPY_FIELD_CREATE 2 // create copy to field +#define QUDA_NULL_FIELD_CREATE 0 // create new field +#define QUDA_ZERO_FIELD_CREATE 1 // create new field and zero it +#define QUDA_COPY_FIELD_CREATE 2 // create copy to field #define QUDA_REFERENCE_FIELD_CREATE 3 // create reference to field #define QUDA_INVALID_FIELD_CREATE QUDA_INVALID_ENUM @@ -383,7 +384,7 @@ #define QudaTwistFlavorType integer(4) #define QUDA_TWIST_SINGLET 1 #define QUDA_TWIST_NONDEG_DOUBLET +2 -#define QUDA_TWIST_NO 0 +#define QUDA_TWIST_NO 0 #define QUDA_TWIST_INVALID QUDA_INVALID_ENUM #define QudaTwistDslashType integer(4) @@ -405,12 +406,12 @@ #define QUDA_TWIST_GAMMA5_INVALID QUDA_INVALID_ENUM #define QudaUseInitGuess integer(4) -#define QUDA_USE_INIT_GUESS_NO 0 +#define QUDA_USE_INIT_GUESS_NO 0 #define QUDA_USE_INIT_GUESS_YES 1 #define QUDA_USE_INIT_GUESS_INVALID QUDA_INVALID_ENUM #define QudaComputeNullVector integer(4) -#define QUDA_COMPUTE_NULL_VECTOR_NO 0 +#define QUDA_COMPUTE_NULL_VECTOR_NO 0 #define QUDA_COMPUTE_NULL_VECTOR_YES 1 #define QUDA_COMPUTE_NULL_VECTOR_INVALID QUDA_INVALID_ENUM @@ -453,7 +454,7 @@ #define QudaDirection integer(4) #define QUDA_BACKWARDS -1 -#define QUDA_FORWARDS +1 +#define QUDA_FORWARDS +1 #define QUDA_BOTH_DIRS 2 #define QudaLinkDirection integer(4) @@ -470,15 +471,15 @@ #define QUDA_INVALID_GEOMETRY QUDA_INVALID_ENUM #define QudaGhostExchange integer(4) -#define QUDA_GHOST_EXCHANGE_NO 0 -#define QUDA_GHOST_EXCHANGE_PAD 1 +#define QUDA_GHOST_EXCHANGE_NO 0 +#define QUDA_GHOST_EXCHANGE_PAD 1 #define QUDA_GHOST_EXCHANGE_EXTENDED 2 #define QUDA_GHOST_EXCHANGE_INVALID QUDA_INVALID_ENUM #define QudaStaggeredPhase integer(4) -#define QUDA_STAGGERED_PHASE_NO 0 +#define QUDA_STAGGERED_PHASE_NO 0 #define QUDA_STAGGERED_PHASE_MILC 1 -#define QUDA_STAGGERED_PHASE_CPS 2 +#define QUDA_STAGGERED_PHASE_CPS 2 #define QUDA_STAGGERED_PHASE_TIFR 3 #define QUDA_STAGGERED_PHASE_INVALID QUDA_INVALID_ENUM diff 
--git a/include/gauge_field_order.h b/include/gauge_field_order.h index 7c019f780e..9c7ea4baee 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2399,9 +2399,9 @@ namespace quda int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD // Loading as per QUDA style - auto in = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; - // This is how they're accessed within OpenQxd (length = 18 - // doubles = 9 complex doubles = 1 su3dble struct) + auto in = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; + // This is how they're accessed within OpenQxd (length = 18 + // doubles = 9 complex doubles = 1 su3dble struct) // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed // within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) block_load(v, reinterpret_cast(in)); diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index f7c819ba65..34e8eb63d7 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -110,7 +110,7 @@ typedef struct { HISQ fermions since the tadpole factor is baked into the links during their construction */ double naik_epsilon; /** Naik epsilon parameter (HISQ fermions only).*/ - QudaDslashType dslash_type; + QudaDslashType dslash_type; } openQCD_QudaInvertArgs_t; /** @@ -126,7 +126,6 @@ typedef struct { void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *source, void *solution, void *gauge); - /** * Solve Ax=b for an improved staggered operator. All fields are fields * passed and returned are host (CPU) field in MILC order. 
This diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index dc900ae92d..49c7abde24 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -362,8 +362,9 @@ static void setColorSpinorParams(const int dim[4], QudaPrecision precision, Colo param->setPrecision(precision); param->pad = 0; - param->siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME: check how to adapt this for openqxd - param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd // EVEN-ODD is only about inner ordering in quda + param->siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME: check how to adapt this for openqxd + param->siteOrder + = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd // EVEN-ODD is only about inner ordering in quda param->fieldOrder = QUDA_OPENQCD_FIELD_ORDER; // FIXME: param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. // // FIXME:: param->create = QUDA_ZERO_FIELD_CREATE; // // FIXME:: check how to adapt this for openqxd ?? created -0 in weird places From 189896e40b735a5abdb67c957fd483274e924412 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Sun, 26 Mar 2023 22:02:22 +0200 Subject: [PATCH 029/148] corrected over-include enum + cmmnts --- include/color_spinor_field.h | 1 - include/color_spinor_field_order.h | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index e4375351ca..03063bd5c5 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -1,6 +1,5 @@ #pragma once -#include "enum_quda.h" #include #include #include diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 146e96084f..8a4bd87b9d 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -11,7 +11,6 @@ * also. 
*/ -#include "enum_quda.h" #include #include #include @@ -1737,11 +1736,11 @@ namespace quda OpenQCDDiracOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : field(field_ ? field_ : (Float *)a.V()), - offset(a.Bytes() / (2 * sizeof(Float))), + offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for?? volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), dim {a.X(0), a.X(1), a.X(2), a.X(3)} - // dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions + // dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions?? { // TODO: ARE GHOSTS NEEDED?? // for (int i = 0; i < 4; i++) { // ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0; From 37e602374eddf4e8e1b30b716d6413ff842b50c1 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Sun, 26 Mar 2023 22:03:07 +0200 Subject: [PATCH 030/148] return to previous formatting --- include/enum_quda_fortran.h | 125 +- include/gauge_field_order.h | 2350 ++++++++++++++--------------- include/quda.h | 2846 ++++++++++++++++++----------------- lib/interface_quda.cpp | 1206 ++++++++------- 4 files changed, 3279 insertions(+), 3248 deletions(-) diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index ccd17b74e2..d6b954d0bf 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -9,7 +9,7 @@ # gfortran). 
#*/ -#define QUDA_INVALID_ENUM (-Z '7fffffff' - 1) +#define QUDA_INVALID_ENUM (-Z'7fffffff' - 1) #define QudaLinkType integer(4) @@ -22,27 +22,27 @@ #define QUDA_MEMORY_MAPPED 2 #define QUDA_MEMORY_INVALID QUDA_INVALID_ENUM -#define QUDA_SU3_LINKS 0 -#define QUDA_GENERAL_LINKS 1 -#define QUDA_THREE_LINKS 2 +#define QUDA_SU3_LINKS 0 +#define QUDA_GENERAL_LINKS 1 +#define QUDA_THREE_LINKS 2 #define QUDA_MOMENTUM_LINKS 3 -#define QUDA_COARSE_LINKS 4 -#define QUDA_SMEARED_LINKS 5 +#define QUDA_COARSE_LINKS 4 +#define QUDA_SMEARED_LINKS 5 -#define QUDA_WILSON_LINKS QUDA_SU3_LINKS -#define QUDA_ASQTAD_FAT_LINKS QUDA_GENERAL_LINKS -#define QUDA_ASQTAD_LONG_LINKS QUDA_THREE_LINKS -#define QUDA_ASQTAD_MOM_LINKS QUDA_MOMENTUM_LINKS +#define QUDA_WILSON_LINKS QUDA_SU3_LINKS +#define QUDA_ASQTAD_FAT_LINKS QUDA_GENERAL_LINKS +#define QUDA_ASQTAD_LONG_LINKS QUDA_THREE_LINKS +#define QUDA_ASQTAD_MOM_LINKS QUDA_MOMENTUM_LINKS #define QUDA_ASQTAD_GENERAL_LINKS QUDA_GENERAL_LINKS -#define QUDA_INVALID_LINKS QUDA_INVALID_ENUM +#define QUDA_INVALID_LINKS QUDA_INVALID_ENUM #define QudaGaugeFieldOrder integer(4) #define QUDA_FLOAT_GAUGE_ORDER 1 -#define QUDA_FLOAT2_GAUGE_ORDER 2 // no reconstruct and double precision -#define QUDA_FLOAT4_GAUGE_ORDER 4 // 8 reconstruct single, and 12 reconstruct single, half, quarter -#define QUDA_FLOAT8_GAUGE_ORDER 8 // 8 reconstruct half and quarter -#define QUDA_NATIVE_GAUGE_ORDER 9 // used to denote one of the above types in a trait, not used directly -#define QUDA_QDP_GAUGE_ORDER 10 // expect *gauge[4] even-odd spacetime row-column color +#define QUDA_FLOAT2_GAUGE_ORDER 2 //no reconstruct and double precision +#define QUDA_FLOAT4_GAUGE_ORDER 4 // 8 reconstruct single, and 12 reconstruct single, half, quarter +#define QUDA_FLOAT8_GAUGE_ORDER 8 // 8 reconstruct half and quarter +#define QUDA_NATIVE_GAUGE_ORDER 9 // used to denote one of the above types in a trait, not used directly +#define QUDA_QDP_GAUGE_ORDER 10 // expect *gauge[4] even-odd 
spacetime row-column color #define QUDA_QDPJIT_GAUGE_ORDER 11 // expect *gauge[4] even-odd spacetime row-column color #define QUDA_CPS_WILSON_GAUGE_ORDER 12 // expect *gauge even-odd spacetime column-row color #define QUDA_MILC_GAUGE_ORDER 13 // expect *gauge even-odd mu spacetime row-column order @@ -69,21 +69,21 @@ #define QudaReconstructType integer(4) #define QUDA_RECONSTRUCT_NO 18 #define QUDA_RECONSTRUCT_12 12 -#define QUDA_RECONSTRUCT_8 8 -#define QUDA_RECONSTRUCT_9 9 +#define QUDA_RECONSTRUCT_8 8 +#define QUDA_RECONSTRUCT_9 9 #define QUDA_RECONSTRUCT_13 13 #define QUDA_RECONSTRUCT_10 10 #define QUDA_RECONSTRUCT_INVALID QUDA_INVALID_ENUM #define QudaGaugeFixed integer(4) -#define QUDA_GAUGE_FIXED_NO 0 +#define QUDA_GAUGE_FIXED_NO 0 #define QUDA_GAUGE_FIXED_YES 1 // gauge field stored in temporal gauge #define QUDA_GAUGE_FIXED_INVALID QUDA_INVALID_ENUM ! Types used in QudaInvertParam #define QudaDslashType integer(4) -#define QUDA_WILSON_DSLASH 0 +#define QUDA_WILSON_DSLASH 0 #define QUDA_CLOVER_WILSON_DSLASH 1 #define QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH 2 #define QUDA_DOMAIN_WALL_DSLASH 3 @@ -125,9 +125,9 @@ #define QUDA_INVALID_INVERTER QUDA_INVALID_ENUM #define QudaEigType integer(4) -#define QUDA_EIG_TR_LANCZOS 0 // Thick Restarted Lanczos Solver +#define QUDA_EIG_TR_LANCZOS 0 // Thick Restarted Lanczos Solver #define QUDA_EIG_BLK_IR_LANCZOS 1 // Block Thick Restarted Lanczos Solver -#define QUDA_EIG_IR_ARNOLDI 2 // Implicitly restarted Arnoldi solver +#define QUDA_EIG_IR_ARNOLDI 2 // Implicitly restarted Arnoldi solver #define QUDA_EIG_BLK_IR_ARNOLDI 3 // Block Implicitly restarted Arnoldi solver (not yet implemented) #define QUDA_EIG_INVALID QUDA_INVALID_ENUM @@ -141,7 +141,7 @@ #define QUDA_SPECTRUM_INVALID QUDA_INVALID_ENUM #define QudaSolutionType integer(4) -#define QUDA_MAT_SOLUTION 0 +#define QUDA_MAT_SOLUTION 0 #define QUDA_MATDAG_MAT_SOLUTION 1 #define QUDA_MATPC_SOLUTION 2 #define QUDA_MATPC_DAG_SOLUTION 3 @@ -156,7 +156,7 @@ #define 
QUDA_NORMOP_PC_SOLVE 3 #define QUDA_NORMERR_SOLVE 4 #define QUDA_NORMERR_PC_SOLVE 5 -#define QUDA_NORMEQ_SOLVE QUDA_NORMOP_SOLVE // deprecated +#define QUDA_NORMEQ_SOLVE QUDA_NORMOP_SOLVE // deprecated #define QUDA_NORMEQ_PC_SOLVE QUDA_NORMOP_PC_SOLVE // deprecated #define QUDA_INVALID_SOLVE QUDA_INVALID_ENUM @@ -168,7 +168,7 @@ #define QUDA_MG_CYCLE_INVALID QUDA_INVALID_ENUM #define QudaSchwarzType integer(4) -#define QUDA_ADDITIVE_SCHWARZ 0 +#define QUDA_ADDITIVE_SCHWARZ 0 #define QUDA_MULTIPLICATIVE_SCHWARZ 1 #define QUDA_INVALID_SCHWARZ QUDA_INVALID_ENUM @@ -206,44 +206,44 @@ #define QUDA_MATPC_INVALID QUDA_INVALID_ENUM #define QudaDagType integer(4) -#define QUDA_DAG_NO 0 +#define QUDA_DAG_NO 0 #define QUDA_DAG_YES 1 #define QUDA_DAG_INVALID QUDA_INVALID_ENUM - + #define QudaMassNormalization integer(4) -#define QUDA_KAPPA_NORMALIZATION 0 +#define QUDA_KAPPA_NORMALIZATION 0 #define QUDA_MASS_NORMALIZATION 1 #define QUDA_ASYMMETRIC_MASS_NORMALIZATION 2 #define QUDA_INVALID_NORMALIZATION QUDA_INVALID_ENUM #define QudaSolverNormalization integer(4) #define QUDA_DEFAULT_NORMALIZATION 0 // leave source and solution untouched -#define QUDA_SOURCE_NORMALIZATION 1 // normalize such that || src || = 1 +#define QUDA_SOURCE_NORMALIZATION 1 // normalize such that || src || = 1 #define QudaPreserveSource integer(4) -#define QUDA_PRESERVE_SOURCE_NO 0 // use the source for the residual +#define QUDA_PRESERVE_SOURCE_NO 0 // use the source for the residual #define QUDA_PRESERVE_SOURCE_YES 1 #define QUDA_PRESERVE_SOURCE_INVALID QUDA_INVALID_ENUM #define QudaDiracFieldOrder integer(4) -#define QUDA_INTERNAL_DIRAC_ORDER 0 // internal dirac order used by QUDA varies depending on precision and dslash type +#define QUDA_INTERNAL_DIRAC_ORDER 0 // internal dirac order used by QUDA varies depending on precision and dslash type #define QUDA_DIRAC_ORDER 1 -#define QUDA_QDP_DIRAC_ORDER 2 // even-odd spin inside color -#define QUDA_QDPJIT_DIRAC_ORDER 3 // even-odd, 
complex-color-spin-spacetime -#define QUDA_CPS_WILSON_DIRAC_ORDER 4 // odd-even color inside spin -#define QUDA_LEX_DIRAC_ORDER 5 // lexicographical order color inside spin +#define QUDA_QDP_DIRAC_ORDER 2 // even-odd spin inside color +#define QUDA_QDPJIT_DIRAC_ORDER 3 // even-odd, complex-color-spin-spacetime +#define QUDA_CPS_WILSON_DIRAC_ORDER 4 // odd-even color inside spin +#define QUDA_LEX_DIRAC_ORDER 5 // lexicographical order color inside spin #define QUDA_TIFR_PADDED_DIRAC_ORDER 6 -#define QUDA_OPENQCD_DIRAC_ORDER 7 // openqcd +#define QUDA_OPENQCD_DIRAC_ORDER 7 // openqcd #define QUDA_INVALID_DIRAC_ORDER QUDA_INVALID_ENUM #define QudaCloverFieldOrder integer(4) -#define QUDA_FLOAT_CLOVER_ORDER 1 // even-odd float ordering -#define QUDA_FLOAT2_CLOVER_ORDER 2 // even-odd float2 ordering -#define QUDA_FLOAT4_CLOVER_ORDER 4 // even-odd float4 ordering -#define QUDA_FLOAT8_CLOVER_ORDER 8 // even-odd float8 ordering -#define QUDA_PACKED_CLOVER_ORDER 9 // even-odd packed -#define QUDA_QDPJIT_CLOVER_ORDER 10 // lexicographical order packed -#define QUDA_BQCD_CLOVER_ORDER 11 // BQCD order which is a packed super-diagonal form +#define QUDA_FLOAT_CLOVER_ORDER 1 // even-odd float ordering +#define QUDA_FLOAT2_CLOVER_ORDER 2 // even-odd float2 ordering +#define QUDA_FLOAT4_CLOVER_ORDER 4 // even-odd float4 ordering +#define QUDA_FLOAT8_CLOVER_ORDER 8 // even-odd float8 ordering +#define QUDA_PACKED_CLOVER_ORDER 9 // even-odd packed +#define QUDA_QDPJIT_CLOVER_ORDER 10 // lexicographical order packed +#define QUDA_BQCD_CLOVER_ORDER 11 // BQCD order which is a packed super-diagonal form #define QUDA_INVALID_CLOVER_ORDER QUDA_INVALID_ENUM #define QudaVerbosity integer(4) @@ -319,29 +319,28 @@ ! 
Site ordering (always t-z-y-x with rightmost varying fastest) #define QudaSiteOrder integer(4) #define QUDA_LEXICOGRAPHIC_SITE_ORDER 0 // lexicographic ordering -#define QUDA_EVEN_ODD_SITE_ORDER 1 // QUDA and QDP use this -#define QUDA_ODD_EVEN_SITE_ORDER 2 // CPS uses this +#define QUDA_EVEN_ODD_SITE_ORDER 1 // QUDA and QDP use this +#define QUDA_ODD_EVEN_SITE_ORDER 2 // CPS uses this #define QUDA_INVALID_SITE_ORDER QUDA_INVALID_ENUM ! Degree of freedom ordering #define QudaFieldOrder integer(4) -#define QUDA_FLOAT_FIELD_ORDER 1 // spin-color-complex-space -#define QUDA_FLOAT2_FIELD_ORDER 2 // (spin-color-complex)/2-space-(spin-color-complex)%2 -#define QUDA_FLOAT4_FIELD_ORDER 4 // (spin-color-complex)/4-space-(spin-color-complex)%4 -#define QUDA_FLOAT8_FIELD_ORDER 8 // (spin-color-complex)/8-space-(spin-color-complex)%8 +#define QUDA_FLOAT_FIELD_ORDER 1 // spin-color-complex-space +#define QUDA_FLOAT2_FIELD_ORDER 2 // (spin-color-complex)/2-space-(spin-color-complex)%2 +#define QUDA_FLOAT4_FIELD_ORDER 4 // (spin-color-complex)/4-space-(spin-color-complex)%4 +#define QUDA_FLOAT8_FIELD_ORDER 8 // (spin-color-complex)/8-space-(spin-color-complex)%8 #define QUDA_SPACE_SPIN_COLOR_FIELD_ORDER 9 // CPS/QDP++ ordering #define QUDA_SPACE_COLOR_SPIN_FIELD_ORDER 10 // QLA ordering (spin inside color) #define QUDA_QDPJIT_FIELD_ORDER 11 // QDP field ordering (complex-color-spin-spacetime) #define QUDA_QOP_DOMAIN_WALL_FIELD_ORDER 12 // QOP domain-wall ordering #define QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER 13 // TIFR RHMC ordering -#define QUDA_OPENQCD_FIELD_ORDER \ - 14 // OPENQCD geometry ordering (at the moment lexicographical w/ rotation zyxt = x3x2x1x0 |-> xyzt x0x1x2x3 ) +#define QUDA_OPENQCD_FIELD_ORDER 14 // OPENQCD geometry ordering (at the moment lexicographical w/ rotation zyxt = x3x2x1x0 |-> xyzt x0x1x2x3 ) #define QUDA_INVALID_FIELD_ORDER QUDA_INVALID_ENUM - + #define QudaFieldCreate integer(4) -#define QUDA_NULL_FIELD_CREATE 0 // create new field -#define 
QUDA_ZERO_FIELD_CREATE 1 // create new field and zero it -#define QUDA_COPY_FIELD_CREATE 2 // create copy to field +#define QUDA_NULL_FIELD_CREATE 0 // create new field +#define QUDA_ZERO_FIELD_CREATE 1 // create new field and zero it +#define QUDA_COPY_FIELD_CREATE 2 // create copy to field #define QUDA_REFERENCE_FIELD_CREATE 3 // create reference to field #define QUDA_INVALID_FIELD_CREATE QUDA_INVALID_ENUM @@ -384,7 +383,7 @@ #define QudaTwistFlavorType integer(4) #define QUDA_TWIST_SINGLET 1 #define QUDA_TWIST_NONDEG_DOUBLET +2 -#define QUDA_TWIST_NO 0 +#define QUDA_TWIST_NO 0 #define QUDA_TWIST_INVALID QUDA_INVALID_ENUM #define QudaTwistDslashType integer(4) @@ -406,12 +405,12 @@ #define QUDA_TWIST_GAMMA5_INVALID QUDA_INVALID_ENUM #define QudaUseInitGuess integer(4) -#define QUDA_USE_INIT_GUESS_NO 0 +#define QUDA_USE_INIT_GUESS_NO 0 #define QUDA_USE_INIT_GUESS_YES 1 #define QUDA_USE_INIT_GUESS_INVALID QUDA_INVALID_ENUM #define QudaComputeNullVector integer(4) -#define QUDA_COMPUTE_NULL_VECTOR_NO 0 +#define QUDA_COMPUTE_NULL_VECTOR_NO 0 #define QUDA_COMPUTE_NULL_VECTOR_YES 1 #define QUDA_COMPUTE_NULL_VECTOR_INVALID QUDA_INVALID_ENUM @@ -454,7 +453,7 @@ #define QudaDirection integer(4) #define QUDA_BACKWARDS -1 -#define QUDA_FORWARDS +1 +#define QUDA_FORWARDS +1 #define QUDA_BOTH_DIRS 2 #define QudaLinkDirection integer(4) @@ -471,15 +470,15 @@ #define QUDA_INVALID_GEOMETRY QUDA_INVALID_ENUM #define QudaGhostExchange integer(4) -#define QUDA_GHOST_EXCHANGE_NO 0 -#define QUDA_GHOST_EXCHANGE_PAD 1 +#define QUDA_GHOST_EXCHANGE_NO 0 +#define QUDA_GHOST_EXCHANGE_PAD 1 #define QUDA_GHOST_EXCHANGE_EXTENDED 2 #define QUDA_GHOST_EXCHANGE_INVALID QUDA_INVALID_ENUM #define QudaStaggeredPhase integer(4) -#define QUDA_STAGGERED_PHASE_NO 0 +#define QUDA_STAGGERED_PHASE_NO 0 #define QUDA_STAGGERED_PHASE_MILC 1 -#define QUDA_STAGGERED_PHASE_CPS 2 +#define QUDA_STAGGERED_PHASE_CPS 2 #define QUDA_STAGGERED_PHASE_TIFR 3 #define QUDA_STAGGERED_PHASE_INVALID QUDA_INVALID_ENUM diff 
--git a/include/gauge_field_order.h b/include/gauge_field_order.h index 9c7ea4baee..7e766bb55b 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -28,8 +28,8 @@ // OpenQxD helpers: // #include "../../openQxD-devel/include/lattice.h" -namespace quda -{ + +namespace quda { /** @brief gauge_wrapper is an internal class that is used to wrap @@ -42,43 +42,43 @@ namespace quda temporaries with explicit calls to the load/save methods in the gauge-field accessors. */ - template struct gauge_wrapper { - const int dim; - const int x_cb; - const int parity; - const Float phase; - T &gauge; + template + struct gauge_wrapper { + const int dim; + const int x_cb; + const int parity; + const Float phase; + T &gauge; - /** - @brief gauge_wrapper constructor - @param[in] gauge Gauge field accessor we are wrapping - @param[in] dim Dimension we are accessing - @param[in] x_cb Checkerboarded space-time index we are accessing - @param[in] parity Parity we are accessing - */ - __device__ __host__ inline gauge_wrapper(T &gauge, int dim, int x_cb, int parity, Float phase = 1.0) : - dim(dim), x_cb(x_cb), parity(parity), phase(phase), gauge(gauge) - { - } + /** + @brief gauge_wrapper constructor + @param[in] gauge Gauge field accessor we are wrapping + @param[in] dim Dimension we are accessing + @param[in] x_cb Checkerboarded space-time index we are accessing + @param[in] parity Parity we are accessing + */ + __device__ __host__ inline gauge_wrapper(T &gauge, int dim, int x_cb, int parity, Float phase = 1.0) : + dim(dim), x_cb(x_cb), parity(parity), phase(phase), gauge(gauge) + { + } - /** - @brief Assignment operator with Matrix instance as input - @param[in] M Matrix we want to store in this accessor - */ - template __device__ __host__ inline void operator=(const M &a) const - { - gauge.save(a.data, x_cb, dim, parity); - } - }; + /** + @brief Assignment operator with Matrix instance as input + @param[in] M Matrix we want to store in this accessor + */ + template 
__device__ __host__ inline void operator=(const M &a) const + { + gauge.save(a.data, x_cb, dim, parity); + } + }; /** @brief Copy constructor for the Matrix class with a gauge_wrapper input. @param[in] a Input gauge_wrapper that we use to fill in this matrix instance */ template - template - __device__ __host__ inline void Matrix::operator=(const gauge_wrapper::type, S> &a) - { + template + __device__ __host__ inline void Matrix::operator=(const gauge_wrapper::type,S> &a) { a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase); } @@ -87,9 +87,8 @@ namespace quda @param[in] a Input gauge_wrapper that we use to fill in this matrix instance */ template - template - __device__ __host__ inline Matrix::Matrix(const gauge_wrapper::type, S> &a) - { + template + __device__ __host__ inline Matrix::Matrix(const gauge_wrapper::type,S> &a) { a.gauge.load(data, a.x_cb, a.dim, a.parity, a.phase); } @@ -104,44 +103,44 @@ namespace quda having to declare temporaries with explicit calls to the load/save methods in the gauge-field accessors. 
*/ - template struct gauge_ghost_wrapper { - const int dim; - const int ghost_idx; - const int parity; - const Float phase; - T &gauge; + template + struct gauge_ghost_wrapper { + const int dim; + const int ghost_idx; + const int parity; + const Float phase; + T &gauge; - /** - @brief gauge_wrapper constructor - @param[in] gauge Gauge field accessor we are wrapping - @param[in] dim Dimension we are accessing - @param[in] ghost_idx Ghost index we are accessing - @param[in] parity Parity we are accessing - */ - __device__ __host__ inline gauge_ghost_wrapper(T &gauge, int dim, int ghost_idx, int parity, - Float phase = 1.0) : - dim(dim), ghost_idx(ghost_idx), parity(parity), phase(phase), gauge(gauge) - { - } + /** + @brief gauge_wrapper constructor + @param[in] gauge Gauge field accessor we are wrapping + @param[in] dim Dimension we are accessing + @param[in] ghost_idx Ghost index we are accessing + @param[in] parity Parity we are accessing + */ + __device__ __host__ inline gauge_ghost_wrapper(T &gauge, int dim, int ghost_idx, int parity, + Float phase = 1.0) : + dim(dim), ghost_idx(ghost_idx), parity(parity), phase(phase), gauge(gauge) + { + } - /** - @brief Assignment operator with Matrix instance as input - @param[in] M Matrix we want to store in this accessot - */ - template __device__ __host__ inline void operator=(const M &a) const - { - gauge.saveGhost(a.data, ghost_idx, dim, parity); - } - }; + /** + @brief Assignment operator with Matrix instance as input + @param[in] M Matrix we want to store in this accessot + */ + template __device__ __host__ inline void operator=(const M &a) const + { + gauge.saveGhost(a.data, ghost_idx, dim, parity); + } + }; /** @brief Copy constructor for the Matrix class with a gauge_ghost_wrapper input. 
@param[in] a Input gauge_wrapper that we use to fill in this matrix instance */ template - template - __device__ __host__ inline void Matrix::operator=(const gauge_ghost_wrapper::type, S> &a) - { + template + __device__ __host__ inline void Matrix::operator=(const gauge_ghost_wrapper::type,S> &a) { a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase); } @@ -150,26 +149,21 @@ namespace quda @param[in] a Input gauge_wrapper that we use to fill in this matrix instance */ template - template - __device__ __host__ inline Matrix::Matrix(const gauge_ghost_wrapper::type, S> &a) - { + template + __device__ __host__ inline Matrix::Matrix(const gauge_ghost_wrapper::type,S> &a) { a.gauge.loadGhost(data, a.ghost_idx, a.dim, a.parity, a.phase); } - namespace gauge - { + namespace gauge { - template __host__ __device__ inline constexpr bool fixed_point() - { - return false; - } + template __host__ __device__ inline constexpr bool fixed_point() { return false; } template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } - template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } - template <> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template<> __host__ __device__ inline constexpr bool fixed_point() { return true; } + template<> __host__ __device__ inline constexpr bool fixed_point() { return true; } template __host__ __device__ inline constexpr bool match() { return false; } - template <> __host__ __device__ inline constexpr bool match() { return true; } - template <> __host__ __device__ inline constexpr bool match() { return true; } + template<> __host__ __device__ inline constexpr bool match() { return true; } + template<> __host__ __device__ inline constexpr bool match() { return true; } /** @brief fieldorder_wrapper is an internal class that is used to @@ -178,7 +172,8 @@ namespace quda for fixed-point accessors providing the necessary conversion and scaling when writing to a 
fixed-point field. */ - template struct fieldorder_wrapper { + template + struct fieldorder_wrapper { using value_type = Float; using store_type = storeFloat; complex *v; @@ -285,10 +280,10 @@ namespace quda v[idx] -= fixed ? complex(round(scale * a.x), round(scale * a.y)) : complex(a.x, a.y); } } - }; + }; - template - __device__ __host__ inline complex operator*(const Float &a, const fieldorder_wrapper &b) + template + __device__ __host__ inline complex operator*(const Float &a, const fieldorder_wrapper &b) { return a * complex(b); } @@ -350,13 +345,13 @@ namespace quda struct Accessor { using wrapper = fieldorder_wrapper; static constexpr bool is_mma_compatible = false; - complex *u[QUDA_MAX_GEOMETRY]; + complex *u[QUDA_MAX_GEOMETRY]; const unsigned int volumeCB; const int geometry; const unsigned int cb_offset; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); Accessor(const GaugeField &U, void *gauge_ = 0, void ** = 0) : volumeCB(U.VolumeCB()), @@ -365,10 +360,10 @@ namespace quda scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - for (int d = 0; d < U.Geometry(); d++) - u[d] = gauge_ ? static_cast **>(gauge_)[d] : - static_cast **>(const_cast(U.Gauge_p()))[d]; - resetScale(U.Scale()); + for (int d=0; d**>(gauge_)[d] : + static_cast**>(const_cast(U.Gauge_p()))[d]; + resetScale(U.Scale()); } void resetScale(Float max) @@ -389,11 +384,11 @@ namespace quda const complex &val) const { using vec2 = array; - vec2 *u2 = reinterpret_cast(u[dim] + parity * cb_offset + (x_cb * nColor + row) * nColor + col); + vec2 *u2 = reinterpret_cast(u[dim] + parity*cb_offset + (x_cb*nColor + row)*nColor + col); vec2 val_ = (fixed && !match()) ? 
- vec2 {static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : - vec2 {static_cast(val.real()), static_cast(val.imag())}; + vec2{static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : + vec2{static_cast(val.real()), static_cast(val.imag())}; atomic_fetch_add(u2, val_); } @@ -431,23 +426,23 @@ namespace quda unsigned int ghostOffset[8]; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) : scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - for (int d = 0; d < 4; d++) { - ghost[d] = ghost_ ? static_cast *>(ghost_[d]) : - static_cast *>(const_cast(U.Ghost()[d])); - ghostOffset[d] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); - - ghost[d + 4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : - ghost_ ? static_cast *>(ghost_[d + 4]) : - static_cast *>(const_cast(U.Ghost()[d + 4])); - ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); - } + for (int d=0; d<4; d++) { + ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : + static_cast*>(const_cast(U.Ghost()[d])); + ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - resetScale(U.Scale()); + ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : + ghost_ ? static_cast*>(ghost_[d+4]) : + static_cast*>(const_cast(U.Ghost()[d+4])); + ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); + } + + resetScale(U.Scale()); } void resetScale(Float max) @@ -473,7 +468,7 @@ namespace quda const int geometry; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) : u(gauge_ ? 
static_cast *>(gauge_) : @@ -508,12 +503,11 @@ namespace quda const complex &val) const { using vec2 = array; - vec2 *u2 - = reinterpret_cast(u + (((parity * volumeCB + x_cb) * geometry + dim) * nColor + row) * nColor + col); + vec2 *u2 = reinterpret_cast(u + (((parity*volumeCB+x_cb)*geometry + dim)*nColor + row)*nColor + col); vec2 val_ = (fixed && !match()) ? - vec2 {static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : - vec2 {static_cast(val.real()), static_cast(val.imag())}; + vec2{static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : + vec2{static_cast(val.real()), static_cast(val.imag())}; atomic_fetch_add(u2, val_); } @@ -554,23 +548,23 @@ namespace quda unsigned int ghostOffset[8]; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); GhostAccessor(const GaugeField &U, void * = nullptr, void **ghost_ = nullptr) : scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - for (int d = 0; d < 4; d++) { - ghost[d] = ghost_ ? static_cast *>(ghost_[d]) : - static_cast *>(const_cast(U.Ghost()[d])); - ghostOffset[d] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); - - ghost[d + 4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : - ghost_ ? static_cast *>(ghost_[d + 4]) : - static_cast *>(const_cast(U.Ghost()[d + 4])); - ghostOffset[d + 4] = U.Nface() * U.SurfaceCB(d) * U.Ncolor() * U.Ncolor(); - } + for (int d=0; d<4; d++) { + ghost[d] = ghost_ ? static_cast*>(ghost_[d]) : + static_cast*>(const_cast(U.Ghost()[d])); + ghostOffset[d] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); - resetScale(U.Scale()); + ghost[d+4] = (U.Geometry() != QUDA_COARSE_GEOMETRY) ? nullptr : + ghost_ ? 
static_cast*>(ghost_[d+4]) : + static_cast*>(const_cast(U.Ghost()[d+4])); + ghostOffset[d+4] = U.Nface()*U.SurfaceCB(d)*U.Ncolor()*U.Ncolor(); + } + + resetScale(U.Scale()); } void resetScale(Float max) @@ -591,14 +585,13 @@ namespace quda } }; - template - __device__ __host__ inline int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb) - { - constexpr int M = (2 * nColor * nColor) / N; - int j = ((row * nColor + col) * 2) / N; // factor of two for complexity - int i = ((row * nColor + col) * 2) % N; - int index = ((x_cb + dim * stride * M + j * stride) * 2 + i) / 2; // back to a complex offset - index += parity * offset_cb; + template + __device__ __host__ inline int indexFloatN(int dim, int parity, int x_cb, int row, int col, int stride, int offset_cb) { + constexpr int M = (2*nColor*nColor) / N; + int j = ((row*nColor+col)*2) / N; // factor of two for complexity + int i = ((row*nColor+col)*2) % N; + int index = ((x_cb + dim*stride*M + j*stride)*2+i) / 2; // back to a complex offset + index += parity*offset_cb; return index; }; @@ -613,7 +606,7 @@ namespace quda const int geometry; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); Accessor(const GaugeField &U, void *gauge_ = nullptr, void ** = nullptr) : u(gauge_ ? 
static_cast *>(gauge_) : @@ -625,21 +618,21 @@ namespace quda scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - resetScale(U.Scale()); + resetScale(U.Scale()); } void resetScale(Float max) { if (fixed) { scale = static_cast(std::numeric_limits::max()) / max; - scale_inv = max / static_cast(std::numeric_limits::max()); + scale_inv = max / static_cast(std::numeric_limits::max()); } } __device__ __host__ inline wrapper operator()(int dim, int parity, int x_cb, int row, int col) const { auto index = parity * offset_cb + dim * stride * nColor * nColor + (row * nColor + col) * stride + x_cb; - return fieldorder_wrapper(u, index, scale, scale_inv); + return fieldorder_wrapper(u, index, scale, scale_inv); } template @@ -647,12 +640,11 @@ namespace quda const complex &val) const { using vec2 = array; - vec2 *u2 = reinterpret_cast(u + parity * offset_cb + dim * stride * nColor * nColor - + (row * nColor + col) * stride + x_cb); + vec2 *u2 = reinterpret_cast(u + parity*offset_cb + dim*stride*nColor*nColor + (row*nColor+col)*stride + x_cb); vec2 val_ = (fixed && !match()) ? - vec2 {static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : - vec2 {static_cast(val.real()), static_cast(val.imag())}; + vec2{static_cast(round(scale * val.real())), static_cast(round(scale * val.imag()))} : + vec2{static_cast(val.real()), static_cast(val.imag())}; atomic_fetch_add(u2, val_); } @@ -689,7 +681,7 @@ namespace quda unsigned int ghostVolumeCB[8]; Float scale; Float scale_inv; - static constexpr bool fixed = fixed_point(); + static constexpr bool fixed = fixed_point(); Accessor accessor; GhostAccessor(const GaugeField &U, void *gauge_, void **ghost_ = 0) : @@ -700,12 +692,10 @@ namespace quda { if constexpr (!native_ghost) assert(ghost_ != nullptr); for (int d = 0; d < 4; d++) { - ghost[d] = !native_ghost ? 
static_cast *>(ghost_[d]) : nullptr; - ghostVolumeCB[d] = U.Nface() * U.SurfaceCB(d); - ghost[d + 4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY ? - static_cast *>(ghost_[d + 4]) : - nullptr; - ghostVolumeCB[d + 4] = U.Nface() * U.SurfaceCB(d); + ghost[d] = !native_ghost ? static_cast*>(ghost_[d]) : nullptr; + ghostVolumeCB[d] = U.Nface()*U.SurfaceCB(d); + ghost[d+4] = !native_ghost && U.Geometry() == QUDA_COARSE_GEOMETRY? static_cast*>(ghost_[d+4]) : nullptr; + ghostVolumeCB[d+4] = U.Nface()*U.SurfaceCB(d); } resetScale(U.Scale()); } @@ -777,789 +767,786 @@ namespace quda ghostAccessor(U, (void *)gauge_, (void **)ghost_) { if (U.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("GaugeField ordering not supported with reconstruction"); - } - - void resetScale(double max) - { - accessor.resetScale(max); - ghostAccessor.resetScale(max); - } - - static constexpr bool fixedPoint() { return fixed_point(); } - - /** - * accessor function - * @param d dimension index - * @param parity Parity index - * @param x 1-d site index - * @param row row index - * @param c column index - */ - __device__ __host__ inline auto operator()(int d, int parity, int x, int row, int col) const - { - return accessor(d, parity, x, row, col); - } + } - __device__ __host__ inline auto Ghost(int d, int parity, int x) const { return ghostAccessor(d, parity, x); } + void resetScale(double max) { + accessor.resetScale(max); + ghostAccessor.resetScale(max); + } - /** - * accessor function for the ghost zone - * @param d dimension index - * @param parity Parity index - * @param x 1-d site index - * @param row row index - * @param c column index - */ - __device__ __host__ auto Ghost(int d, int parity, int x, int row, int col) const - { - return ghostAccessor(d, parity, x, row, col); - } - /** - * @brief This and the following method (eventually) creates a fieldorder_wrapper object whose pointer points to - * the start of the memory chunk corresponds to the matrix at d, parity, x. 
Only available for the - * QUDA_MILC_GAUGE_ORDER order. + static constexpr bool fixedPoint() { return fixed_point(); } + + /** + * accessor function + * @param d dimension index + * @param parity Parity index + * @param x 1-d site index + * @param row row index + * @param c column index + */ + __device__ __host__ inline auto operator()(int d, int parity, int x, int row, int col) const + { + return accessor(d, parity, x, row, col); + } - * @param d dimension index - * @param parity Parity index - * @param x 1-d site index - */ - __device__ __host__ auto wrap_ghost(int d, int parity, int x) const { return ghostAccessor(d, parity, x, 0, 0); } + __device__ __host__ inline auto Ghost(int d, int parity, int x) const { return ghostAccessor(d, parity, x); } + + /** + * accessor function for the ghost zone + * @param d dimension index + * @param parity Parity index + * @param x 1-d site index + * @param row row index + * @param c column index + */ + __device__ __host__ auto Ghost(int d, int parity, int x, int row, int col) const + { + return ghostAccessor(d, parity, x, row, col); + } + /** + * @brief This and the following method (eventually) creates a fieldorder_wrapper object whose pointer points to + * the start of the memory chunk corresponds to the matrix at d, parity, x. Only available for the + * QUDA_MILC_GAUGE_ORDER order. 
+ + * @param d dimension index + * @param parity Parity index + * @param x 1-d site index + */ + __device__ __host__ auto wrap_ghost(int d, int parity, int x) const + { + return ghostAccessor(d, parity, x, 0, 0); + } - /** - * Specialized complex-member accessor function (for coarse gauge field) - * @param d dimension index - * @param parity Parity index - * @param x 1-d site index - * @param s_row row spin index - * @param c_row row color index - * @param s_col col spin index - * @param c_col col color index - */ - __device__ __host__ inline auto operator()(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const - { - return (*this)(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col); - } + /** + * Specialized complex-member accessor function (for coarse gauge field) + * @param d dimension index + * @param parity Parity index + * @param x 1-d site index + * @param s_row row spin index + * @param c_row row color index + * @param s_col col spin index + * @param c_col col color index + */ + __device__ __host__ inline auto operator()(int d, int parity, int x, int s_row, int s_col, int c_row, + int c_col) const + { + return (*this)(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col); + } - /** - * Specialized complex-member accessor function (for coarse gauge field ghost zone) - * @param d dimension index - * @param parity Parity index - * @param x 1-d site index - * @param s_row row spin index - * @param c_row row color index - * @param s_col col spin index - * @param c_col col color index - */ - __device__ __host__ inline auto Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const - { - return Ghost(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col); - } + /** + * Specialized complex-member accessor function (for coarse gauge field ghost zone) + * @param d dimension index + * @param parity Parity index + * @param x 1-d site index + * @param s_row row spin 
index + * @param c_row row color index + * @param s_col col spin index + * @param c_col col color index + */ + __device__ __host__ inline auto Ghost(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col) const + { + return Ghost(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col); + } - template - __device__ __host__ inline void atomicAdd(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col, - const complex &val) const - { - accessor.atomic_add(d, parity, x, s_row * nColorCoarse + c_row, s_col * nColorCoarse + c_col, val); - } + template + __device__ __host__ inline void atomicAdd(int d, int parity, int x, int s_row, int s_col, int c_row, int c_col, + const complex &val) const + { + accessor.atomic_add(d, parity, x, s_row*nColorCoarse + c_row, s_col*nColorCoarse + c_col, val); + } - /** Returns the number of field colors */ - __device__ __host__ inline int Ncolor() const { return nColor; } + /** Returns the number of field colors */ + __device__ __host__ inline int Ncolor() const { return nColor; } - /** Returns the field volume */ - __device__ __host__ inline int Volume() const { return 2 * volumeCB; } + /** Returns the field volume */ + __device__ __host__ inline int Volume() const { return 2*volumeCB; } - /** Returns the field volume */ - __device__ __host__ inline int VolumeCB() const { return volumeCB; } + /** Returns the field volume */ + __device__ __host__ inline int VolumeCB() const { return volumeCB; } - /** Returns the field geometric dimension */ - __device__ __host__ inline int Ndim() const { return nDim; } + /** Returns the field geometric dimension */ + __device__ __host__ inline int Ndim() const { return nDim; } - /** Returns the field geometry */ - __device__ __host__ inline int Geometry() const { return geometry; } + /** Returns the field geometry */ + __device__ __host__ inline int Geometry() const { return geometry; } - /** Returns the number of coarse gauge field spins */ - __device__ 
__host__ inline int NspinCoarse() const { return nSpinCoarse; } + /** Returns the number of coarse gauge field spins */ + __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; } - /** Returns the number of coarse gauge field colors */ - __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; } + /** Returns the number of coarse gauge field colors */ + __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; } - /** - * @brief Returns the L1 norm of the field in a given dimension - * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) - * @return L1 norm - */ - __host__ double norm1(int dim = -1, bool global = true) const - { - commGlobalReductionPush(global); - double nrm1 = accessor.template transform_reduce>(location, dim, - abs_(accessor.scale_inv)); - commGlobalReductionPop(); - return nrm1; - } + /** + * @brief Returns the L1 norm of the field in a given dimension + * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) + * @return L1 norm + */ + __host__ double norm1(int dim=-1, bool global=true) const { + commGlobalReductionPush(global); + double nrm1 = accessor.template transform_reduce>(location, dim, + abs_(accessor.scale_inv)); + commGlobalReductionPop(); + return nrm1; + } - /** - * @brief Returns the L2 norm squared of the field in a given dimension - * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) - * @return L2 norm squared - */ - __host__ double norm2(int dim = -1, bool global = true) const - { - commGlobalReductionPush(global); - double nrm2 = accessor.template transform_reduce>(location, dim, - square_(accessor.scale_inv)); - commGlobalReductionPop(); - return nrm2; - } + /** + * @brief Returns the L2 norm squared of the field in a given dimension + * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) + * @return L2 norm squared + */ + __host__ double 
norm2(int dim = -1, bool global = true) const + { + commGlobalReductionPush(global); + double nrm2 = accessor.template transform_reduce>( + location, dim, square_(accessor.scale_inv)); + commGlobalReductionPop(); + return nrm2; + } - /** - * @brief Returns the Linfinity norm of the field in a given dimension - * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) - * @return Linfinity norm - */ - __host__ double abs_max(int dim = -1, bool global = true) const - { - commGlobalReductionPush(global); - double absmax = accessor.template transform_reduce>( - location, dim, abs_max_(accessor.scale_inv)); - commGlobalReductionPop(); - return absmax; - } + /** + * @brief Returns the Linfinity norm of the field in a given dimension + * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) + * @return Linfinity norm + */ + __host__ double abs_max(int dim = -1, bool global = true) const + { + commGlobalReductionPush(global); + double absmax = accessor.template transform_reduce>( + location, dim, abs_max_(accessor.scale_inv)); + commGlobalReductionPop(); + return absmax; + } - /** - * @brief Returns the minimum absolute value of the field - * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) - * @return Minimum norm - */ - __host__ double abs_min(int dim = -1, bool global = true) const - { - commGlobalReductionPush(global); - double absmin = accessor.template transform_reduce>( - location, dim, abs_min_(accessor.scale_inv)); - commGlobalReductionPop(); - return absmin; - } + /** + * @brief Returns the minimum absolute value of the field + * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) + * @return Minimum norm + */ + __host__ double abs_min(int dim = -1, bool global = true) const + { + commGlobalReductionPush(global); + double absmin = accessor.template transform_reduce>( + location, dim, abs_min_(accessor.scale_inv)); + 
commGlobalReductionPop(); + return absmin; + } - /** Return the size of the allocation (geometry and parity left out and added as needed in Tunable::bytes) */ - size_t Bytes() const { return static_cast(volumeCB) * nColor * nColor * 2ll * sizeof(storeFloat); } + /** Return the size of the allocation (geometry and parity left out and added as needed in Tunable::bytes) */ + size_t Bytes() const { return static_cast(volumeCB) * nColor * nColor * 2ll * sizeof(storeFloat); } }; - /** - @brief Generic reconstruction helper with no reconstruction - @tparam N number of real numbers in each packed gauge matrix - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange type - to avoid the run-time overhead (dummy for trivial reconstruct - type) - */ - template - struct Reconstruct { - using real = typename mapper::type; - using complex = complex; - real scale; - real scale_inv; - Reconstruct(const GaugeField &u) : - scale(isFixed::value ? u.LinkMax() : 1.0), scale_inv(isFixed::value ? 1.0 / scale : 1.0) - { - } + /** + @brief Generic reconstruction helper with no reconstruction + @tparam N number of real numbers in each packed gauge matrix + @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange type + to avoid the run-time overhead (dummy for trivial reconstruct + type) + */ + template + struct Reconstruct { + using real = typename mapper::type; + using complex = complex; + real scale; + real scale_inv; + Reconstruct(const GaugeField &u) : + scale(isFixed::value ? u.LinkMax() : 1.0), + scale_inv(isFixed::value ? 
1.0 / scale : 1.0) + { + } - __device__ __host__ inline void Pack(real out[N], const complex in[N / 2]) const - { - if constexpr (isFixed::value) { + __device__ __host__ inline void Pack(real out[N], const complex in[N / 2]) const + { + if constexpr (isFixed::value) { #pragma unroll - for (int i = 0; i < N / 2; i++) { - out[2 * i + 0] = scale_inv * in[i].real(); - out[2 * i + 1] = scale_inv * in[i].imag(); - } - } else { + for (int i = 0; i < N / 2; i++) { + out[2 * i + 0] = scale_inv * in[i].real(); + out[2 * i + 1] = scale_inv * in[i].imag(); + } + } else { #pragma unroll - for (int i = 0; i < N / 2; i++) { - out[2 * i + 0] = in[i].real(); - out[2 * i + 1] = in[i].imag(); + for (int i = 0; i < N / 2; i++) { + out[2 * i + 0] = in[i].real(); + out[2 * i + 1] = in[i].imag(); + } } } - } - template - __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int, int, real, const I *, - const int *) const - { - if constexpr (isFixed::value) { + template + __device__ __host__ inline void Unpack(complex out[N / 2], const real in[N], int, int, real, const I *, + const int *) const + { + if constexpr (isFixed::value) { #pragma unroll - for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); } - } else { + for (int i = 0; i < N / 2; i++) { out[i] = scale * complex(in[2 * i + 0], in[2 * i + 1]); } + } else { #pragma unroll - for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); } + for (int i = 0; i < N / 2; i++) { out[i] = complex(in[2 * i + 0], in[2 * i + 1]); } + } } - } - __device__ __host__ inline real getPhase(const complex[]) const { return 0; } - }; - - /** - @brief timeBoundary Compute boundary condition correction - @tparam ghostExhange_ Optional template the ghostExchange type to avoid the run-time overhead - @param idx extended field linear index - @param X the gauge field dimensions - @param R the radii dimenions of the extended region - @param tBoundary the boundary condition - 
@param isFirstTimeSlice if we're on the first time slice of nodes - @param isLastTimeSlide if we're on the last time slice of nodes - @param ghostExchange if the field is extended or not (determines indexing type) - */ - template - __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], T tBoundary, - T scale, int firstTimeSliceBound, int lastTimeSliceBound, - bool isFirstTimeSlice, bool isLastTimeSlice, - QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO) - { + __device__ __host__ inline real getPhase(const complex[]) const { return 0; } + }; - // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale - - if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD - || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) { - if (idx >= firstTimeSliceBound) { // halo region on the first time slice - return isFirstTimeSlice ? tBoundary : scale; - } else if (idx >= lastTimeSliceBound) { // last link on the last time slice - return isLastTimeSlice ? tBoundary : scale; - } else { - return scale; - } - } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED - || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) { - if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) { - // the boundary condition is on the R[3]-1 time slice - return isFirstTimeSlice ? tBoundary : scale; - } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) { - // the boundary condition lies on the X[3]-R[3]-1 time slice - return isLastTimeSlice ? 
tBoundary : scale; - } else { - return scale; + /** + @brief timeBoundary Compute boundary condition correction + @tparam ghostExhange_ Optional template the ghostExchange type to avoid the run-time overhead + @param idx extended field linear index + @param X the gauge field dimensions + @param R the radii dimenions of the extended region + @param tBoundary the boundary condition + @param isFirstTimeSlice if we're on the first time slice of nodes + @param isLastTimeSlide if we're on the last time slice of nodes + @param ghostExchange if the field is extended or not (determines indexing type) + */ + template + __device__ __host__ inline T timeBoundary(int idx, const I X[QUDA_MAX_DIM], const int R[QUDA_MAX_DIM], + T tBoundary, T scale, int firstTimeSliceBound, int lastTimeSliceBound, bool isFirstTimeSlice, + bool isLastTimeSlice, QudaGhostExchange ghostExchange = QUDA_GHOST_EXCHANGE_NO) + { + + // MWTODO: should this return tBoundary : scale or tBoundary*scale : scale + + if (ghostExchange_ == QUDA_GHOST_EXCHANGE_PAD + || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED)) { + if (idx >= firstTimeSliceBound) { // halo region on the first time slice + return isFirstTimeSlice ? tBoundary : scale; + } else if (idx >= lastTimeSliceBound) { // last link on the last time slice + return isLastTimeSlice ? tBoundary : scale; + } else { + return scale; + } + } else if (ghostExchange_ == QUDA_GHOST_EXCHANGE_EXTENDED + || (ghostExchange_ == QUDA_GHOST_EXCHANGE_INVALID && ghostExchange == QUDA_GHOST_EXCHANGE_EXTENDED)) { + if (idx >= (R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < R[3] * X[0] * X[1] * X[2] / 2) { + // the boundary condition is on the R[3]-1 time slice + return isFirstTimeSlice ? tBoundary : scale; + } else if (idx >= (X[3] - R[3] - 1) * X[0] * X[1] * X[2] / 2 && idx < (X[3] - R[3]) * X[0] * X[1] * X[2] / 2) { + // the boundary condition lies on the X[3]-R[3]-1 time slice + return isLastTimeSlice ? 
tBoundary : scale; + } else { + return scale; + } } + return scale; + } + + // not actually used - here for reference + template + __device__ __host__ inline Float milcStaggeredPhase(int dim, const int x[], const I R[]) { + // could consider non-extended variant too? + Float sign = static_cast(1.0); + switch (dim) { + case 0: if ( ((x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; + case 1: if ( ((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; + case 2: if ( ((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; + } + return sign; } - return scale; - } - - // not actually used - here for reference - template - __device__ __host__ inline Float milcStaggeredPhase(int dim, const int x[], const I R[]) - { - // could consider non-extended variant too? - Float sign = static_cast(1.0); - switch (dim) { - case 0: - if (((x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); - break; - case 1: - if (((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); - break; - case 2: - if (((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); - break; - } - return sign; - } - - /** - @brief Gauge reconstruct 12 helper where we reconstruct the - third row from the cross product of the first two rows - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange - type to avoid the run-time overhead - */ - template struct Reconstruct<12, Float, ghostExchange_> { - using real = typename mapper::type; - using complex = complex; - const real anisotropy; - const real tBoundary; - const int firstTimeSliceBound; - const int lastTimeSliceBound; - const bool isFirstTimeSlice; - const bool isLastTimeSlice; - QudaGhostExchange ghostExchange; - Reconstruct(const GaugeField &u) : - anisotropy(u.Anisotropy()), - tBoundary(static_cast(u.TBoundary())), - firstTimeSliceBound(u.VolumeCB()), - lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * 
u.X()[2] / 2), - isFirstTimeSlice(comm_coord(3) == 0 ? true : false), - isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false), - ghostExchange(u.GhostExchange()) - { - } + /** + @brief Gauge reconstruct 12 helper where we reconstruct the + third row from the cross product of the first two rows + @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange + type to avoid the run-time overhead + */ + template struct Reconstruct<12, Float, ghostExchange_> { + using real = typename mapper::type; + using complex = complex; + const real anisotropy; + const real tBoundary; + const int firstTimeSliceBound; + const int lastTimeSliceBound; + const bool isFirstTimeSlice; + const bool isLastTimeSlice; + QudaGhostExchange ghostExchange; + + Reconstruct(const GaugeField &u) : + anisotropy(u.Anisotropy()), + tBoundary(static_cast(u.TBoundary())), + firstTimeSliceBound(u.VolumeCB()), + lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2), + isFirstTimeSlice(comm_coord(3) == 0 ? true : false), + isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? 
true : false), + ghostExchange(u.GhostExchange()) + { + } - __device__ __host__ inline void Pack(real out[12], const complex in[9]) const - { + __device__ __host__ inline void Pack(real out[12], const complex in[9]) const + { #pragma unroll - for (int i = 0; i < 6; i++) { - out[2 * i + 0] = in[i].real(); - out[2 * i + 1] = in[i].imag(); + for (int i = 0; i < 6; i++) { + out[2 * i + 0] = in[i].real(); + out[2 * i + 1] = in[i].imag(); + } } - } - template - __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real, const I *X, - const int *R) const - { + template + __device__ __host__ inline void Unpack(complex out[9], const real in[12], int idx, int dir, real, const I *X, + const int *R) const + { #pragma unroll - for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]); - - const real u0 = dir < 3 ? - anisotropy : - timeBoundary(idx, X, R, tBoundary, static_cast(1.0), firstTimeSliceBound, - lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange); - - // out[6] = u0*conj(out[1]*out[5] - out[2]*out[4]); - out[6] = cmul(out[2], out[4]); - out[6] = cmac(out[1], out[5], -out[6]); - out[6] = u0 * conj(out[6]); - - // out[7] = u0*conj(out[2]*out[3] - out[0]*out[5]); - out[7] = cmul(out[0], out[5]); - out[7] = cmac(out[2], out[3], -out[7]); - out[7] = u0 * conj(out[7]); - - // out[8] = u0*conj(out[0]*out[4] - out[1]*out[3]); - out[8] = cmul(out[1], out[3]); - out[8] = cmac(out[0], out[4], -out[8]); - out[8] = u0 * conj(out[8]); - } + for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]); + + const real u0 = dir < 3 ? 
+ anisotropy : + timeBoundary(idx, X, R, tBoundary, static_cast(1.0), firstTimeSliceBound, + lastTimeSliceBound, isFirstTimeSlice, isLastTimeSlice, ghostExchange); + + // out[6] = u0*conj(out[1]*out[5] - out[2]*out[4]); + out[6] = cmul(out[2], out[4]); + out[6] = cmac(out[1], out[5], -out[6]); + out[6] = u0 * conj(out[6]); + + // out[7] = u0*conj(out[2]*out[3] - out[0]*out[5]); + out[7] = cmul(out[0], out[5]); + out[7] = cmac(out[2], out[3], -out[7]); + out[7] = u0 * conj(out[7]); + + // out[8] = u0*conj(out[0]*out[4] - out[1]*out[3]); + out[8] = cmul(out[1], out[3]); + out[8] = cmac(out[0], out[4], -out[8]); + out[8] = u0 * conj(out[8]); + } - __device__ __host__ inline real getPhase(const complex[]) const { return 0; } - }; + __device__ __host__ inline real getPhase(const complex[]) const { return 0; } + }; - /** - @brief Gauge reconstruct helper for Momentum field with 10 - packed elements (really 9 from the Lie algebra, with zero for - last element). We label this as 11 to avoid collisions with - simple load/store of momentum field where we do not seek to - unpack/pack. - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange - type to avoid the run-time overhead - */ - template struct Reconstruct<11, Float, ghostExchange_> { - using real = typename mapper::type; - using complex = complex; + /** + @brief Gauge reconstruct helper for Momentum field with 10 + packed elements (really 9 from the Lie algebra, with zero for + last element). We label this as 11 to avoid collisions with + simple load/store of momentum field where we do not seek to + unpack/pack. 
+ @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange + type to avoid the run-time overhead + */ + template struct Reconstruct<11, Float, ghostExchange_> { + using real = typename mapper::type; + using complex = complex; - Reconstruct(const GaugeField &) { ; } + Reconstruct(const GaugeField &) { ; } - __device__ __host__ inline void Pack(real out[10], const complex in[9]) const - { + __device__ __host__ inline void Pack(real out[10], const complex in[9]) const + { #pragma unroll - for (int i = 0; i < 2; i++) { - out[2 * i + 0] = in[i + 1].real(); - out[2 * i + 1] = in[i + 1].imag(); + for (int i = 0; i < 2; i++) { + out[2 * i + 0] = in[i + 1].real(); + out[2 * i + 1] = in[i + 1].imag(); + } + out[4] = in[5].real(); + out[5] = in[5].imag(); + out[6] = in[0].imag(); + out[7] = in[4].imag(); + out[8] = in[8].imag(); + out[9] = 0.0; } - out[4] = in[5].real(); - out[5] = in[5].imag(); - out[6] = in[0].imag(); - out[7] = in[4].imag(); - out[8] = in[8].imag(); - out[9] = 0.0; - } - template - __device__ __host__ inline void Unpack(complex out[9], const real in[10], int, int, real, const I *, const int *) const - { - out[0] = complex(0.0, in[6]); - out[1] = complex(in[0], in[1]); - out[2] = complex(in[2], in[3]); - out[3] = complex(-out[1].real(), out[1].imag()); - out[4] = complex(0.0, in[7]); - out[5] = complex(in[4], in[5]); - out[6] = complex(-out[2].real(), out[2].imag()); - out[7] = complex(-out[5].real(), out[5].imag()); - out[8] = complex(0.0, in[8]); - } + template + __device__ __host__ inline void Unpack(complex out[9], const real in[10], int, int, real, const I *, + const int *) const + { + out[0] = complex(0.0, in[6]); + out[1] = complex(in[0], in[1]); + out[2] = complex(in[2], in[3]); + out[3] = complex(-out[1].real(), out[1].imag()); + out[4] = complex(0.0, in[7]); + out[5] = complex(in[4], in[5]); + out[6] = complex(-out[2].real(), out[2].imag()); + out[7] = complex(-out[5].real(), 
out[5].imag()); + out[8] = complex(0.0, in[8]); + } - __device__ __host__ inline real getPhase(const complex[]) const { return 0; } - }; + __device__ __host__ inline real getPhase(const complex[]) const { return 0; } + }; - /** - @brief Gauge reconstruct 13 helper where we reconstruct the - third row from the cross product of the first two rows, and - include a non-trivial phase factor - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange - type to avoid the run-time overhead - */ - template - struct Reconstruct<13, Float, ghostExchange_, stag_phase> { - using real = typename mapper::type; - using complex = complex; - const Reconstruct<12, Float, ghostExchange_> reconstruct_12; - const real scale; - const real scale_inv; + /** + @brief Gauge reconstruct 13 helper where we reconstruct the + third row from the cross product of the first two rows, and + include a non-trivial phase factor + @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange + type to avoid the run-time overhead + */ + template + struct Reconstruct<13, Float, ghostExchange_, stag_phase> { + using real = typename mapper::type; + using complex = complex; + const Reconstruct<12, Float, ghostExchange_> reconstruct_12; + const real scale; + const real scale_inv; - Reconstruct(const GaugeField &u) : reconstruct_12(u), scale(u.Scale()), scale_inv(1.0 / scale) { } + Reconstruct(const GaugeField &u) : reconstruct_12(u), scale(u.Scale()), scale_inv(1.0 / scale) {} - __device__ __host__ inline void Pack(real out[12], const complex in[9]) const { reconstruct_12.Pack(out, in); } + __device__ __host__ inline void Pack(real out[12], const complex in[9]) const { reconstruct_12.Pack(out, in); } - template - __device__ __host__ inline void Unpack(complex out[9], const real in[12], int, int, real phase, const I *, - const int *) const - { + template + __device__ __host__ inline void 
Unpack(complex out[9], const real in[12], int, int, real phase, const I *, + const int *) const + { #pragma unroll - for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]); - - out[6] = cmul(out[2], out[4]); - out[6] = cmac(out[1], out[5], -out[6]); - out[6] = scale_inv * conj(out[6]); - - out[7] = cmul(out[0], out[5]); - out[7] = cmac(out[2], out[3], -out[7]); - out[7] = scale_inv * conj(out[7]); - - out[8] = cmul(out[1], out[3]); - out[8] = cmac(out[0], out[4], -out[8]); - out[8] = scale_inv * conj(out[8]); - - if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing - // Multiply the third row by exp(I*3*phase), since the cross product will end up in a scale factor of exp(-I*2*phase) - real cos_sin[2]; - sincospi(static_cast(3.0) * phase, &cos_sin[1], &cos_sin[0]); - complex A(cos_sin[0], cos_sin[1]); - out[6] = cmul(A, out[6]); - out[7] = cmul(A, out[7]); - out[8] = cmul(A, out[8]); - } else { // phase is +/- 1 so real multiply is sufficient - out[6] *= phase; - out[7] *= phase; - out[8] *= phase; + for (int i = 0; i < 6; i++) out[i] = complex(in[2 * i + 0], in[2 * i + 1]); + + out[6] = cmul(out[2], out[4]); + out[6] = cmac(out[1], out[5], -out[6]); + out[6] = scale_inv * conj(out[6]); + + out[7] = cmul(out[0], out[5]); + out[7] = cmac(out[2], out[3], -out[7]); + out[7] = scale_inv * conj(out[7]); + + out[8] = cmul(out[1], out[3]); + out[8] = cmac(out[0], out[4], -out[8]); + out[8] = scale_inv * conj(out[8]); + + if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phasing + // Multiply the third row by exp(I*3*phase), since the cross product will end up in a scale factor of exp(-I*2*phase) + real cos_sin[2]; + sincospi(static_cast(3.0) * phase, &cos_sin[1], &cos_sin[0]); + complex A(cos_sin[0], cos_sin[1]); + out[6] = cmul(A, out[6]); + out[7] = cmul(A, out[7]); + out[8] = cmul(A, out[8]); + } else { // phase is +/- 1 so real multiply is sufficient + out[6] *= phase; + out[7] *= phase; + out[8] *= phase; + } } 
- } - __device__ __host__ inline real getPhase(const complex in[9]) const - { + __device__ __host__ inline real getPhase(const complex in[9]) const + { #if 1 // phase from cross product - // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])* - complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv; - complex expI3Phase = in[8] / denom; // numerator = U[2][2] - - // dynamic phasing - if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) return arg(expI3Phase) / static_cast(3.0 * M_PI); - // static phasing - return expI3Phase.real() > 0 ? 1 : -1; + // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])* + complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv; + complex expI3Phase = in[8] / denom; // numerator = U[2][2] + + // dynamic phasing + if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) return arg(expI3Phase) / static_cast(3.0 * M_PI); + // static phasing + return expI3Phase.real() > 0 ? 1 : -1; #else // phase from determinant - Matrix a; + Matrix a; #pragma unroll - for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i]; - const complex det = getDeterminant(a); - return phase = arg(det) / static_cast(3.0 * M_PI); + for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i]; + const complex det = getDeterminant(a); + return phase = arg(det) / static_cast(3.0 * M_PI); #endif - } - }; - - /** - @brief Gauge reconstruct 8 helper where we reconstruct the gauge - matrix from 8 packed elements (maximal compression) - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange type - to avoid the run-time overhead - */ - template struct Reconstruct<8, Float, ghostExchange_> { - using real = typename mapper::type; - using complex = complex; - const complex anisotropy; // imaginary value stores inverse - const complex tBoundary; // imaginary value stores inverse - const int firstTimeSliceBound; - const int lastTimeSliceBound; - const bool isFirstTimeSlice; - const bool isLastTimeSlice; - 
QudaGhostExchange ghostExchange; - - // scale factor is set when using recon-9 - Reconstruct(const GaugeField &u, real scale = 1.0) : - anisotropy(u.Anisotropy() * scale, 1.0 / (u.Anisotropy() * scale)), - tBoundary(static_cast(u.TBoundary()) * scale, 1.0 / (static_cast(u.TBoundary()) * scale)), - firstTimeSliceBound(u.VolumeCB()), - lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2), - isFirstTimeSlice(comm_coord(3) == 0 ? true : false), - isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false), - ghostExchange(u.GhostExchange()) - { - } + } + }; - // Pack and unpack are described in https://arxiv.org/pdf/0911.3191.pdf - // Method was modified to avoid the singularity at unit gauge by - // compressing the matrix {{b1,b2,b3},{a1,a2,a3},{-c1,-c2,-c3}} - // instead of {{a1,a2,a3},{b1,b2,b3},{c1,c2,c3}} + /** + @brief Gauge reconstruct 8 helper where we reconstruct the gauge + matrix from 8 packed elements (maximal compression) + @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange type + to avoid the run-time overhead + */ + template struct Reconstruct<8, Float, ghostExchange_> { + using real = typename mapper::type; + using complex = complex; + const complex anisotropy; // imaginary value stores inverse + const complex tBoundary; // imaginary value stores inverse + const int firstTimeSliceBound; + const int lastTimeSliceBound; + const bool isFirstTimeSlice; + const bool isLastTimeSlice; + QudaGhostExchange ghostExchange; + + // scale factor is set when using recon-9 + Reconstruct(const GaugeField &u, real scale = 1.0) : + anisotropy(u.Anisotropy() * scale, 1.0 / (u.Anisotropy() * scale)), + tBoundary(static_cast(u.TBoundary()) * scale, 1.0 / (static_cast(u.TBoundary()) * scale)), + firstTimeSliceBound(u.VolumeCB()), + lastTimeSliceBound((u.X()[3] - 1) * u.X()[0] * u.X()[1] * u.X()[2] / 2), + isFirstTimeSlice(comm_coord(3) == 0 ? 
true : false), + isLastTimeSlice(comm_coord(3) == comm_dim(3) - 1 ? true : false), + ghostExchange(u.GhostExchange()) + { + } - __device__ __host__ inline void Pack(real out[8], const complex in[9]) const - { - out[0] = atan2(in[3].imag(), in[3].real()) / static_cast(M_PI); // a1 -> b1 - out[1] = atan2(-in[6].imag(), -in[6].real()) / static_cast(M_PI); // c1 -> -c1 + // Pack and unpack are described in https://arxiv.org/pdf/0911.3191.pdf + // Method was modified to avoid the singularity at unit gauge by + // compressing the matrix {{b1,b2,b3},{a1,a2,a3},{-c1,-c2,-c3}} + // instead of {{a1,a2,a3},{b1,b2,b3},{c1,c2,c3}} - out[2] = in[4].real(); - out[3] = in[4].imag(); // a2 -> b2 - out[4] = in[5].real(); - out[5] = in[5].imag(); // a3 -> b3 - out[6] = in[0].real(); - out[7] = in[0].imag(); // b1 -> a1 - } + __device__ __host__ inline void Pack(real out[8], const complex in[9]) const + { + out[0] = atan2(in[3].imag(), in[3].real()) / static_cast(M_PI); // a1 -> b1 + out[1] = atan2(-in[6].imag(), -in[6].real()) / static_cast(M_PI); // c1 -> -c1 + + out[2] = in[4].real(); + out[3] = in[4].imag(); // a2 -> b2 + out[4] = in[5].real(); + out[5] = in[5].imag(); // a3 -> b3 + out[6] = in[0].real(); + out[7] = in[0].imag(); // b1 -> a1 + } - template - __device__ __host__ inline void Unpack(complex out[9], const real in[8], int, int, real, const I *, const int *, - const complex, const complex u) const - { - real u0 = u.real(); - real u0_inv = u.imag(); + template + __device__ __host__ inline void Unpack(complex out[9], const real in[8], int, int, real, const I *, const int *, + const complex, const complex u) const + { + real u0 = u.real(); + real u0_inv = u.imag(); #pragma unroll - for (int i = 1; i <= 3; i++) - out[i] = complex(in[2 * i + 0], in[2 * i + 1]); // these elements are copied directly + for (int i = 1; i <= 3; i++) + out[i] = complex(in[2 * i + 0], in[2 * i + 1]); // these elements are copied directly - real tmp[2]; - quda::sincospi(in[0], &tmp[1], &tmp[0]); - 
out[0] = complex(tmp[0], tmp[1]); + real tmp[2]; + quda::sincospi(in[0], &tmp[1], &tmp[0]); + out[0] = complex(tmp[0], tmp[1]); - quda::sincospi(in[1], &tmp[1], &tmp[0]); - out[6] = complex(tmp[0], tmp[1]); + quda::sincospi(in[1], &tmp[1], &tmp[0]); + out[6] = complex(tmp[0], tmp[1]); - // First, reconstruct first row - real row_sum = out[1].real() * out[1].real(); - row_sum += out[1].imag() * out[1].imag(); - row_sum += out[2].real() * out[2].real(); - row_sum += out[2].imag() * out[2].imag(); - real row_sum_inv = static_cast(1.0) / row_sum; + // First, reconstruct first row + real row_sum = out[1].real() * out[1].real(); + row_sum += out[1].imag() * out[1].imag(); + row_sum += out[2].real() * out[2].real(); + row_sum += out[2].imag() * out[2].imag(); + real row_sum_inv = static_cast(1.0) / row_sum; - real diff = u0_inv * u0_inv - row_sum; - real U00_mag = diff > 0.0 ? diff * quda::rsqrt(diff) : static_cast(0.0); + real diff = u0_inv * u0_inv - row_sum; + real U00_mag = diff > 0.0 ? diff * quda::rsqrt(diff) : static_cast(0.0); - out[0] *= U00_mag; + out[0] *= U00_mag; - // Second, reconstruct first column - real column_sum = out[0].real() * out[0].real(); - column_sum += out[0].imag() * out[0].imag(); - column_sum += out[3].real() * out[3].real(); - column_sum += out[3].imag() * out[3].imag(); + // Second, reconstruct first column + real column_sum = out[0].real() * out[0].real(); + column_sum += out[0].imag() * out[0].imag(); + column_sum += out[3].real() * out[3].real(); + column_sum += out[3].imag() * out[3].imag(); - diff = u0_inv * u0_inv - column_sum; - real U20_mag = diff > 0.0 ? diff * quda::rsqrt(diff) : static_cast(0.0); + diff = u0_inv * u0_inv - column_sum; + real U20_mag = diff > 0.0 ? 
diff * quda::rsqrt(diff) : static_cast(0.0); - out[6] *= U20_mag; + out[6] *= U20_mag; - // Finally, reconstruct last elements from SU(2) rotation - real r_inv2 = u0_inv * row_sum_inv; - { - complex A = cmul(conj(out[0]), out[3]); + // Finally, reconstruct last elements from SU(2) rotation + real r_inv2 = u0_inv * row_sum_inv; + { + complex A = cmul(conj(out[0]), out[3]); - // out[4] = -(conj(out[6])*conj(out[2]) + u0*A*out[1])*r_inv2; // U11 - out[4] = cmul(conj(out[6]), conj(out[2])); - out[4] = cmac(u0 * A, out[1], out[4]); - out[4] = -r_inv2 * out[4]; + // out[4] = -(conj(out[6])*conj(out[2]) + u0*A*out[1])*r_inv2; // U11 + out[4] = cmul(conj(out[6]), conj(out[2])); + out[4] = cmac(u0 * A, out[1], out[4]); + out[4] = -r_inv2 * out[4]; - // out[5] = (conj(out[6])*conj(out[1]) - u0*A*out[2])*r_inv2; // U12 - out[5] = cmul(conj(out[6]), conj(out[1])); - out[5] = cmac(-u0 * A, out[2], out[5]); - out[5] = r_inv2 * out[5]; - } + // out[5] = (conj(out[6])*conj(out[1]) - u0*A*out[2])*r_inv2; // U12 + out[5] = cmul(conj(out[6]), conj(out[1])); + out[5] = cmac(-u0 * A, out[2], out[5]); + out[5] = r_inv2 * out[5]; + } - { - complex A = cmul(conj(out[0]), out[6]); + { + complex A = cmul(conj(out[0]), out[6]); - // out[7] = (conj(out[3])*conj(out[2]) - u0*A*out[1])*r_inv2; // U21 - out[7] = cmul(conj(out[3]), conj(out[2])); - out[7] = cmac(-u0 * A, out[1], out[7]); - out[7] = r_inv2 * out[7]; + // out[7] = (conj(out[3])*conj(out[2]) - u0*A*out[1])*r_inv2; // U21 + out[7] = cmul(conj(out[3]), conj(out[2])); + out[7] = cmac(-u0 * A, out[1], out[7]); + out[7] = r_inv2 * out[7]; - // out[8] = -(conj(out[3])*conj(out[1]) + u0*A*out[2])*r_inv2; // U12 - out[8] = cmul(conj(out[3]), conj(out[1])); - out[8] = cmac(u0 * A, out[2], out[8]); - out[8] = -r_inv2 * out[8]; - } + // out[8] = -(conj(out[3])*conj(out[1]) + u0*A*out[2])*r_inv2; // U12 + out[8] = cmul(conj(out[3]), conj(out[1])); + out[8] = cmac(u0 * A, out[2], out[8]); + out[8] = -r_inv2 * out[8]; + } - // Rearrange 
{{b1,b2,b3},{a1,a2,a3},{-c1,-c2,-c3}} back - // to {{a1,a2,a3},{b1,b2,b3},{c1,c2,c3}} + // Rearrange {{b1,b2,b3},{a1,a2,a3},{-c1,-c2,-c3}} back + // to {{a1,a2,a3},{b1,b2,b3},{c1,c2,c3}} #pragma unroll - for (int i = 0; i < 3; i++) { - const auto tmp = out[i]; - out[i] = out[i + 3]; - out[i + 3] = tmp; - out[i + 6] = -out[i + 6]; + for (int i = 0; i < 3; i++) { + const auto tmp = out[i]; + out[i] = out[i + 3]; + out[i + 3] = tmp; + out[i + 6] = -out[i + 6]; + } } - } - template - __device__ __host__ inline void - Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, - const complex scale = complex(static_cast(1.0), static_cast(1.0))) const - { - complex u = dir < 3 ? - anisotropy : - timeBoundary(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound, - isFirstTimeSlice, isLastTimeSlice, ghostExchange); + template + __device__ __host__ inline void + Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, const int *R, + const complex scale = complex(static_cast(1.0), static_cast(1.0))) const + { + complex u = dir < 3 ? 
+ anisotropy : + timeBoundary(idx, X, R, tBoundary, scale, firstTimeSliceBound, lastTimeSliceBound, + isFirstTimeSlice, isLastTimeSlice, ghostExchange); - Unpack(out, in, idx, dir, phase, X, R, scale, u); - } + Unpack(out, in, idx, dir, phase, X, R, scale, u); + } - __device__ __host__ inline real getPhase(const complex[]) const { return 0; } - }; + __device__ __host__ inline real getPhase(const complex[]) const { return 0; } + }; - /** - @brief Gauge reconstruct 9 helper where we reconstruct the gauge - matrix from 8 packed elements (maximal compression) and include a - non-trivial phase factor - @tparam Float Storage format (e.g., double, float, short) - @tparam ghostExchange_ optional template the ghostExchange type - to avoid the run-time overhead - */ - template - struct Reconstruct<9, Float, ghostExchange_, stag_phase> { - using real = typename mapper::type; - using complex = complex; - const Reconstruct<8, Float, ghostExchange_> reconstruct_8; - const real scale; - const real scale_inv; + /** + @brief Gauge reconstruct 9 helper where we reconstruct the gauge + matrix from 8 packed elements (maximal compression) and include a + non-trivial phase factor + @tparam Float Storage format (e.g., double, float, short) + @tparam ghostExchange_ optional template the ghostExchange type + to avoid the run-time overhead + */ + template + struct Reconstruct<9, Float, ghostExchange_, stag_phase> { + using real = typename mapper::type; + using complex = complex; + const Reconstruct<8, Float, ghostExchange_> reconstruct_8; + const real scale; + const real scale_inv; - Reconstruct(const GaugeField &u) : reconstruct_8(u), scale(u.Scale()), scale_inv(1.0 / scale) { } + Reconstruct(const GaugeField &u) : reconstruct_8(u), scale(u.Scale()), scale_inv(1.0 / scale) {} - __device__ __host__ inline real getPhase(const complex in[9]) const - { + __device__ __host__ inline real getPhase(const complex in[9]) const + { #if 1 // phase from cross product - // denominator = (U[0][0]*U[1][1] 
- U[0][1]*U[1][0])* - complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv; - complex expI3Phase = in[8] / denom; // numerator = U[2][2] - // dynamic phasing - if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) return arg(expI3Phase) / static_cast(3.0 * M_PI); - // static phasing - return expI3Phase.real() > 0 ? 1 : -1; + // denominator = (U[0][0]*U[1][1] - U[0][1]*U[1][0])* + complex denom = conj(in[0] * in[4] - in[1] * in[3]) * scale_inv; + complex expI3Phase = in[8] / denom; // numerator = U[2][2] + // dynamic phasing + if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) return arg(expI3Phase) / static_cast(3.0 * M_PI); + // static phasing + return expI3Phase.real() > 0 ? 1 : -1; #else // phase from determinant - Matrix a; + Matrix a; #pragma unroll - for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i]; - const complex det = getDeterminant(a); - real phase = arg(det) / static_cast(3.0 * M_PI); - return phase; + for (int i = 0; i < 9; i++) a(i) = scale_inv * in[i]; + const complex det = getDeterminant(a); + real phase = arg(det) / static_cast(3.0 * M_PI); + return phase; #endif - } - - // Rescale the U3 input matrix by exp(-I*phase) to obtain an SU3 matrix multiplied by a real scale factor, - __device__ __host__ inline void Pack(real out[8], const complex in[9]) const - { - real phase = getPhase(in); - complex su3[9]; + } - if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { - real cos_sin[2]; - sincospi(static_cast(-phase), &cos_sin[1], &cos_sin[0]); - complex z(cos_sin[0], cos_sin[1]); - z *= scale_inv; + // Rescale the U3 input matrix by exp(-I*phase) to obtain an SU3 matrix multiplied by a real scale factor, + __device__ __host__ inline void Pack(real out[8], const complex in[9]) const + { + real phase = getPhase(in); + complex su3[9]; + + if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { + real cos_sin[2]; + sincospi(static_cast(-phase), &cos_sin[1], &cos_sin[0]); + complex z(cos_sin[0], cos_sin[1]); + z *= scale_inv; #pragma unroll - 
for (int i = 0; i < 9; i++) su3[i] = cmul(z, in[i]); - } else { + for (int i = 0; i < 9; i++) su3[i] = cmul(z, in[i]); + } else { #pragma unroll - for (int i = 0; i < 9; i++) { su3[i] = phase * in[i]; } + for (int i = 0; i < 9; i++) { su3[i] = phase * in[i]; } + } + reconstruct_8.Pack(out, su3); } - reconstruct_8.Pack(out, su3); - } - template - __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, const I *X, - const int *R) const - { - reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, complex(static_cast(1.0), static_cast(1.0)), - complex(static_cast(1.0), static_cast(1.0))); - - if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phase - real cos_sin[2]; - sincospi(static_cast(phase), &cos_sin[1], &cos_sin[0]); - complex z(cos_sin[0], cos_sin[1]); - z *= scale; + template + __device__ __host__ inline void Unpack(complex out[9], const real in[8], int idx, int dir, real phase, + const I *X, const int *R) const + { + reconstruct_8.Unpack(out, in, idx, dir, phase, X, R, complex(static_cast(1.0), static_cast(1.0)), + complex(static_cast(1.0), static_cast(1.0))); + + if constexpr (stag_phase == QUDA_STAGGERED_PHASE_NO) { // dynamic phase + real cos_sin[2]; + sincospi(static_cast(phase), &cos_sin[1], &cos_sin[0]); + complex z(cos_sin[0], cos_sin[1]); + z *= scale; #pragma unroll - for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]); - } else { // stagic phase + for (int i = 0; i < 9; i++) out[i] = cmul(z, out[i]); + } else { // stagic phase #pragma unroll - for (int i = 0; i < 18; i++) { out[i] *= phase; } + for (int i = 0; i < 18; i++) { out[i] *= phase; } + } } - } - }; + }; - __host__ __device__ constexpr int ct_sqrt(int n, int i = 1) - { - return n == i ? n : (i * i < n ? 
ct_sqrt(n, i + 1) : i); - } - - /** - @brief Return the number of colors of the accessor based on the length of the field - @param[in] length Number of real numbers per link - @return Number of colors (=sqrt(length/2)) - */ - __host__ __device__ constexpr int Ncolor(int length) { return ct_sqrt(length / 2); } - - // we default to huge allocations for gauge field (for now) - constexpr bool default_huge_alloc = true; - - template constexpr bool static_phase() - { - switch (phase) { - case QUDA_STAGGERED_PHASE_MILC: - case QUDA_STAGGERED_PHASE_CPS: - case QUDA_STAGGERED_PHASE_TIFR: return true; - default: return false; + __host__ __device__ constexpr int ct_sqrt(int n, int i = 1) + { + return n == i ? n : (i * i < n ? ct_sqrt(n, i + 1) : i); } - } - template - struct FloatNOrder { - using Accessor = FloatNOrder; + /** + @brief Return the number of colors of the accessor based on the length of the field + @param[in] length Number of real numbers per link + @return Number of colors (=sqrt(length/2)) + */ + __host__ __device__ constexpr int Ncolor(int length) { return ct_sqrt(length / 2); } - using store_t = Float; - static constexpr int length = length_; - using real = typename mapper::type; - using complex = complex; - typedef typename VectorType::type Vector; - typedef typename AllocType::type AllocInt; - Reconstruct reconstruct; - static constexpr int reconLen = (reconLenParam == 11) ? 10 : reconLenParam; - static constexpr int hasPhase = (reconLen == 9 || reconLen == 13) ? 
1 : 0; - Float *gauge; - const AllocInt offset; - Float *ghost[4]; - QudaGhostExchange ghostExchange; - int coords[QUDA_MAX_DIM]; - int_fastdiv X[QUDA_MAX_DIM]; - int R[QUDA_MAX_DIM]; - const int volumeCB; - int faceVolumeCB[4]; - const int stride; - const int geometry; - const AllocInt phaseOffset; - size_t bytes; + // we default to huge allocations for gauge field (for now) + constexpr bool default_huge_alloc = true; - FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - reconstruct(u), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - offset(u.Bytes() / (2 * sizeof(Float) * N)), - ghostExchange(u.GhostExchange()), - volumeCB(u.VolumeCB()), - stride(u.Stride()), - geometry(u.Geometry()), - phaseOffset(u.PhaseOffset() / sizeof(Float)), - bytes(u.Bytes()) - { - if (geometry == QUDA_COARSE_GEOMETRY) - errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); - - // static_assert( !(stag_phase!=QUDA_STAGGERED_PHASE_NO && reconLenParam != 18 && reconLenParam != 12), - // "staggered phase only presently supported for 18 and 12 reconstruct"); - for (int i = 0; i < 4; i++) { - X[i] = u.X()[i]; - R[i] = u.R()[i]; - ghost[i] = ghost_ ? ghost_[i] : 0; - faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth + template constexpr bool static_phase() + { + switch (phase) { + case QUDA_STAGGERED_PHASE_MILC: + case QUDA_STAGGERED_PHASE_CPS: + case QUDA_STAGGERED_PHASE_TIFR: return true; + default: return false; } } + template + struct FloatNOrder { + using Accessor + = FloatNOrder; + + using store_t = Float; + static constexpr int length = length_; + using real = typename mapper::type; + using complex = complex; + typedef typename VectorType::type Vector; + typedef typename AllocType::type AllocInt; + Reconstruct reconstruct; + static constexpr int reconLen = (reconLenParam == 11) ? 10 : reconLenParam; + static constexpr int hasPhase = (reconLen == 9 || reconLen == 13) ? 
1 : 0; + Float *gauge; + const AllocInt offset; + Float *ghost[4]; + QudaGhostExchange ghostExchange; + int coords[QUDA_MAX_DIM]; + int_fastdiv X[QUDA_MAX_DIM]; + int R[QUDA_MAX_DIM]; + const int volumeCB; + int faceVolumeCB[4]; + const int stride; + const int geometry; + const AllocInt phaseOffset; + size_t bytes; + + FloatNOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + reconstruct(u), + gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + offset(u.Bytes() / (2 * sizeof(Float) * N)), + ghostExchange(u.GhostExchange()), + volumeCB(u.VolumeCB()), + stride(u.Stride()), + geometry(u.Geometry()), + phaseOffset(u.PhaseOffset() / sizeof(Float)), + bytes(u.Bytes()) + { + if (geometry == QUDA_COARSE_GEOMETRY) + errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); + + // static_assert( !(stag_phase!=QUDA_STAGGERED_PHASE_NO && reconLenParam != 18 && reconLenParam != 12), + // "staggered phase only presently supported for 18 and 12 reconstruct"); + for (int i = 0; i < 4; i++) { + X[i] = u.X()[i]; + R[i] = u.R()[i]; + ghost[i] = ghost_ ? 
ghost_[i] : 0; + faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth + } + } + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real phase = 1.0) const { const int M = reconLen / N; real tmp[reconLen]; #pragma unroll - for (int i = 0; i < M; i++) { + for (int i=0; i(gauge, parity * offset + (dir * M + i) * stride + x); // second do copy converting into register type @@ -1583,12 +1570,12 @@ namespace quda reconstruct.Pack(tmp, v); #pragma unroll - for (int i = 0; i < M; i++) { - Vector vecTmp; - // first do copy converting into storage type + for (int i=0; i(&vecTmp)[j], tmp[i * N + j]); - // second do vectorized copy into memory + for (int j=0; j(&vecTmp)[j], tmp[i*N+j]); + // second do vectorized copy into memory vector_store(gauge, parity * offset + x + (dir * M + i) * stride, vecTmp); } if constexpr (hasPhase) { @@ -1598,14 +1585,14 @@ namespace quda } /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. 
*/ __device__ __host__ inline auto operator()(int dim, int x_cb, int parity, real phase = 1.0) const { @@ -1614,7 +1601,7 @@ namespace quda __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real inphase = 1.0) const { - if (!ghost[dir]) { // load from main field not separate array + if (!ghost[dir]) { // load from main field not separate array load(v, volumeCB + x, dir, parity, inphase); // an offset of size volumeCB puts us at the padded region // This also works perfectly when phases are stored. No need to change this. } else { @@ -1622,10 +1609,10 @@ namespace quda real tmp[reconLen]; #pragma unroll - for (int i = 0; i < M; i++) { - // first do vectorized copy from memory into registers - Vector vecTmp = vector_load(ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), - i * faceVolumeCB[dir] + x); + for (int i=0; i( + ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), i * faceVolumeCB[dir] + x); // second do copy converting into register type #pragma unroll for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast(&vecTmp)[j]); @@ -1647,7 +1634,7 @@ namespace quda __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity) const { - if (!ghost[dir]) { // store in main field not separate array + if (!ghost[dir]) { // store in main field not separate array save(v, volumeCB + x, dir, parity); // an offset of size volumeCB puts us at the padded region } else { const int M = reconLen / N; @@ -1655,13 +1642,13 @@ namespace quda reconstruct.Pack(tmp, v); #pragma unroll - for (int i = 0; i < M; i++) { - Vector vecTmp; - // first do copy converting into storage type + for (int i=0; i(&vecTmp)[j], tmp[i * N + j]); - // second do vectorized copy into memory - vector_store(ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), i * faceVolumeCB[dir] + x, vecTmp); + for (int j=0; j(&vecTmp)[j], tmp[i*N+j]); + // second do vectorized copy into memory + 
vector_store(ghost[dir]+parity*faceVolumeCB[dir]*(M*N + hasPhase), i*faceVolumeCB[dir]+x, vecTmp); } if constexpr (hasPhase) { @@ -1711,15 +1698,14 @@ namespace quda real tmp[reconLen]; #pragma unroll - for (int i = 0; i < M; i++) { - // first do vectorized copy from memory - Vector vecTmp = vector_load( - ghost[dim] + ((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + hasPhase), - +i * R[dim] * faceVolumeCB[dim] + buff_idx); - // second do copy converting into register type + for (int i=0; i(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase), + +i*R[dim]*faceVolumeCB[dim]+buff_idx); + // second do copy converting into register type #pragma unroll - for (int j = 0; j < N; j++) copy(tmp[i * N + j], reinterpret_cast(&vecTmp)[j]); - } + for (int j=0; j(&vecTmp)[j]); + } real phase = 0.; if constexpr (hasPhase) copy(phase, @@ -1738,118 +1724,119 @@ namespace quda reconstruct.Pack(tmp, v); #pragma unroll - for (int i = 0; i < M; i++) { - Vector vecTmp; - // first do copy converting into storage type + for (int i=0; i(&vecTmp)[j], tmp[i * N + j]); - // second do vectorized copy to memory - vector_store(ghost[dim] + ((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + hasPhase), - i * R[dim] * faceVolumeCB[dim] + buff_idx, vecTmp); - } - if constexpr (hasPhase) { - real phase = reconstruct.getPhase(v); - copy(ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1) - + R[dim] * faceVolumeCB[dim] * M * N + buff_idx], - static_cast(0.5) * phase); - } + for (int j=0; j(&vecTmp)[j], tmp[i*N+j]); + // second do vectorized copy to memory + vector_store(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase), + i*R[dim]*faceVolumeCB[dim]+buff_idx, vecTmp); + } + if constexpr (hasPhase) { + real phase = reconstruct.getPhase(v); + copy(ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1) + + R[dim] * 
faceVolumeCB[dim] * M * N + buff_idx], + static_cast(0.5) * phase); + } } size_t Bytes() const { return reconLen * sizeof(Float); } - }; - - /** - @brief The LegacyOrder defines the ghost zone storage and ordering for - all cpuGaugeFields, which use the same ghost zone storage. - */ - template struct LegacyOrder { - static constexpr int length = length_; - using Accessor = LegacyOrder; - using store_t = Float; - using real = typename mapper::type; - using complex = complex; - Float *ghost[QUDA_MAX_DIM]; - int faceVolumeCB[QUDA_MAX_DIM]; - const int volumeCB; - const int stride; - const int geometry; - const int hasPhase; + }; - LegacyOrder(const GaugeField &u, Float **ghost_) : - volumeCB(u.VolumeCB()), stride(u.Stride()), geometry(u.Geometry()), hasPhase(0) - { - if (geometry == QUDA_COARSE_GEOMETRY) - errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); + /** + @brief The LegacyOrder defines the ghost zone storage and ordering for + all cpuGaugeFields, which use the same ghost zone storage. + */ + template struct LegacyOrder { + static constexpr int length = length_; + using Accessor = LegacyOrder; + using store_t = Float; + using real = typename mapper::type; + using complex = complex; + Float *ghost[QUDA_MAX_DIM]; + int faceVolumeCB[QUDA_MAX_DIM]; + const int volumeCB; + const int stride; + const int geometry; + const int hasPhase; + + LegacyOrder(const GaugeField &u, Float **ghost_) : + volumeCB(u.VolumeCB()), + stride(u.Stride()), + geometry(u.Geometry()), + hasPhase(0) + { + if (geometry == QUDA_COARSE_GEOMETRY) + errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); - for (int i = 0; i < 4; i++) { - ghost[i] = (ghost_) ? ghost_[i] : (Float *)(u.Ghost()[i]); - faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth + for (int i = 0; i < 4; i++) { + ghost[i] = (ghost_) ? 
ghost_[i] : (Float *)(u.Ghost()[i]); + faceVolumeCB[i] = u.SurfaceCB(i) * u.Nface(); // face volume equals surface * depth + } } - } - __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - auto in = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length]; - block_load(v, reinterpret_cast(in)); - } + __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + auto in = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length]; + block_load(v, reinterpret_cast(in)); + } - __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity) - { - auto out = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length]; - block_store(reinterpret_cast(out), v); - } + __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dir, int parity) + { + auto out = &ghost[dir][(parity * faceVolumeCB[dir] + x) * length]; + block_store(reinterpret_cast(out), v); + } - /** - @brief This accessor routine returns a const gauge_ghost_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] ghost_idx Ghost index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_ghost_wrapper that curries in access to - this field at the above coordinates. - */ - __device__ __host__ inline const gauge_ghost_wrapper Ghost(int dim, int ghost_idx, int parity, - real phase = 1.0) const - { - return gauge_ghost_wrapper(const_cast(*this), dim, ghost_idx, parity, phase); - } + /** + @brief This accessor routine returns a const gauge_ghost_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. 
+ @param[in] dir Which dimension are we requesting + @param[in] ghost_idx Ghost index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_ghost_wrapper that curries in access to + this field at the above coordinates. + */ + __device__ __host__ inline const gauge_ghost_wrapper Ghost(int dim, int ghost_idx, int parity, + real phase = 1.0) const + { + return gauge_ghost_wrapper(const_cast(*this), dim, ghost_idx, parity, phase); + } - __device__ __host__ inline void loadGhostEx(complex v[length / 2], int x, int, int dir, int dim, int g, - int parity, const int R[]) const - { - auto in = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length]; - block_load(v, reinterpret_cast(in)); - } + __device__ __host__ inline void loadGhostEx(complex v[length / 2], int x, int, int dir, int dim, int g, + int parity, const int R[]) const + { + auto in = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length]; + block_load(v, reinterpret_cast(in)); + } - __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int x, int, int dir, int dim, int g, - int parity, const int R[]) const - { - auto out = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length]; - block_store(reinterpret_cast(out), v); - } - }; + __device__ __host__ inline void saveGhostEx(const complex v[length / 2], int x, int, int dir, int dim, int g, + int parity, const int R[]) const + { + auto out = &ghost[dim][(((dir * 2 + parity) * R[dim] * faceVolumeCB[dim] + x) * geometry + g) * length]; + block_store(reinterpret_cast(out), v); + } + }; /** struct to define QDP ordered gauge fields: [[dim]] [[parity][volumecb][row][col]] */ - template struct QDPOrder : public LegacyOrder { + template struct QDPOrder : public LegacyOrder { using Accessor = QDPOrder; using real = typename mapper::type; using complex = complex; Float *gauge[QUDA_MAX_DIM]; const int 
volumeCB; - QDPOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) - { - for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : ((Float **)u.Gauge_p())[i]; - } + QDPOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) + : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) + { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; } - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - auto in = &gauge[dir][(parity * volumeCB + x) * length]; - block_load(v, reinterpret_cast(in)); + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + auto in = &gauge[dir][(parity * volumeCB + x) * length]; + block_load(v, reinterpret_cast(in)); } __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const @@ -1859,14 +1846,14 @@ namespace quda } /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. 
*/ __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const { @@ -1880,24 +1867,22 @@ namespace quda struct to define QDPJIT ordered gauge fields: [[dim]] [[parity][complex][row][col][volumecb]] */ - template struct QDPJITOrder : public LegacyOrder { + template struct QDPJITOrder : public LegacyOrder { using Accessor = QDPJITOrder; using real = typename mapper::type; using complex = complex; Float *gauge[QUDA_MAX_DIM]; const int volumeCB; - QDPJITOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) - { - for (int i = 0; i < 4; i++) gauge[i] = gauge_ ? ((Float **)gauge_)[i] : ((Float **)u.Gauge_p())[i]; - } + QDPJITOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) + : LegacyOrder(u, ghost_), volumeCB(u.VolumeCB()) + { for (int i=0; i<4; i++) gauge[i] = gauge_ ? ((Float**)gauge_)[i] : ((Float**)u.Gauge_p())[i]; } - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - for (int i = 0; i < length / 2; i++) { - v[i].real((real)gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x]); - v[i].imag((real)gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x]); - } + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + for (int i = 0; i < length / 2; i++) { + v[i].real((real)gauge[dir][((0 * (length / 2) + i) * 2 + parity) * volumeCB + x]); + v[i].imag((real)gauge[dir][((1 * (length / 2) + i) * 2 + parity) * volumeCB + x]); + } } __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const @@ -1909,14 +1894,14 @@ namespace quda } /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. 
- @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. */ __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const { @@ -1926,190 +1911,186 @@ namespace quda size_t Bytes() const { return length * sizeof(Float); } }; + /** + struct to define MILC ordered gauge fields: + [parity][dim][volumecb][row][col] + */ + template struct MILCOrder : public LegacyOrder { + using Accessor = MILCOrder; + using real = typename mapper::type; + using complex = complex; + Float *gauge; + const int volumeCB; + const int geometry; + MILCOrder(const GaugeField &u, Float *gauge_=0, Float **ghost_=0) : + LegacyOrder(u, ghost_), gauge(gauge_ ? 
gauge_ : (Float*)u.Gauge_p()), + volumeCB(u.VolumeCB()), geometry(u.Geometry()) { ; } + + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + block_load(v, reinterpret_cast(in)); + } + + __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const + { + auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + block_store(reinterpret_cast(out), v); + } + /** - struct to define MILC ordered gauge fields: - [parity][dim][volumecb][row][col] + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. */ - template struct MILCOrder : public LegacyOrder { - using Accessor = MILCOrder; - using real = typename mapper::type; - using complex = complex; - Float *gauge; - const int volumeCB; - const int geometry; - MILCOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), - gauge(gauge_ ? 
gauge_ : (Float *)u.Gauge_p()), - volumeCB(u.VolumeCB()), - geometry(u.Geometry()) - { - ; - } + __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const + { + return gauge_wrapper(const_cast(*this), dim, x_cb, parity); + } - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - block_load(v, reinterpret_cast(in)); - } + size_t Bytes() const { return length * sizeof(Float); } + }; - __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const - { - auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - block_store(reinterpret_cast(out), v); - } + /** + @brief struct to define gauge fields packed into an opaque MILC site struct: + + struct { + char padding[offset]; + Float [dim][row][col]; + } site; + + site lattice [parity][volumecb]; + + We are just passed the size of the struct and the offset to the + required matrix elements. Typically, it is expected that this + accessor will be used with zero-copy memory to the original + allocation in MILC. + */ + template struct MILCSiteOrder : public LegacyOrder { + using Accessor = MILCSiteOrder; + using real = typename mapper::type; + using complex = complex; + Float *gauge; + const int volumeCB; + const int geometry; + const size_t offset; + const size_t size; + MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), + gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), + volumeCB(u.VolumeCB()), + geometry(u.Geometry()), + offset(u.SiteOffset()), + size(u.SiteSize()) + { + if ((uintptr_t)((char *)gauge + offset) % 16 != 0) { errorQuda("MILC structure has misaligned offset"); } + } - /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. 
- @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. - */ - __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const - { - return gauge_wrapper(const_cast(*this), dim, x_cb, parity); - } + __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const + { + // get base pointer + auto in = reinterpret_cast(reinterpret_cast(gauge) + (parity * volumeCB + x) * size + + offset + dir * length * sizeof(Float)); + block_load(v, reinterpret_cast(in)); + } - size_t Bytes() const { return length * sizeof(Float); } - }; + __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const + { + // get base pointer + auto out = reinterpret_cast(reinterpret_cast(gauge) + (parity * volumeCB + x) * size + offset + + dir * length * sizeof(Float)); + block_store(reinterpret_cast(out), v); + } /** - @brief struct to define gauge fields packed into an opaque MILC site struct: + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. 
+ */ + __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const + { + return gauge_wrapper(const_cast(*this), dim, x_cb, parity); + } - struct { - char padding[offset]; - Float [dim][row][col]; - } site; + size_t Bytes() const { return length * sizeof(Float); } + }; - site lattice [parity][volumecb]; - We are just passed the size of the struct and the offset to the - required matrix elements. Typically, it is expected that this - accessor will be used with zero-copy memory to the original - allocation in MILC. - */ - template struct MILCSiteOrder : public LegacyOrder { - using Accessor = MILCSiteOrder; - using real = typename mapper::type; - using complex = complex; - Float *gauge; - const int volumeCB; - const int geometry; - const size_t offset; - const size_t size; - MILCSiteOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - volumeCB(u.VolumeCB()), - geometry(u.Geometry()), - offset(u.SiteOffset()), - size(u.SiteSize()) - { - if ((uintptr_t)((char *)gauge + offset) % 16 != 0) { errorQuda("MILC structure has misaligned offset"); } - } + /** + struct to define CPS ordered gauge fields: + [parity][dim][volumecb][col][row] + */ + template struct CPSOrder : LegacyOrder { + using Accessor = CPSOrder; + using real = typename mapper::type; + using complex = complex; + Float *gauge; + const int volumeCB; + const real anisotropy; + const real anisotropy_inv; + static constexpr int Nc = 3; + const int geometry; + CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : + LegacyOrder(u, ghost_), + gauge(gauge_ ? 
gauge_ : (Float *)u.Gauge_p()), + volumeCB(u.VolumeCB()), + anisotropy(u.Anisotropy()), + anisotropy_inv(1.0 / anisotropy), + geometry(u.Geometry()) + { + if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); + } - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - // get base pointer - auto in = reinterpret_cast(reinterpret_cast(gauge) + (parity * volumeCB + x) * size - + offset + dir * length * sizeof(Float)); - block_load(v, reinterpret_cast(in)); - } + // we need to transpose and scale for CPS ordering + __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const + { + auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + complex v_[9]; + block_load(v_, reinterpret_cast(in)); - __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const - { - // get base pointer - auto out = reinterpret_cast(reinterpret_cast(gauge) + (parity * volumeCB + x) * size + offset - + dir * length * sizeof(Float)); - block_store(reinterpret_cast(out), v); + for (int i=0; i(const_cast(*this), dim, x_cb, parity); + __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity) const + { + auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; + complex v_[9]; + for (int i=0; i(reinterpret_cast(out), v_); + } /** - struct to define CPS ordered gauge fields: - [parity][dim][volumecb][col][row] + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. 
*/ - template struct CPSOrder : LegacyOrder { - using Accessor = CPSOrder; - using real = typename mapper::type; - using complex = complex; - Float *gauge; - const int volumeCB; - const real anisotropy; - const real anisotropy_inv; - static constexpr int Nc = 3; - const int geometry; - CPSOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - volumeCB(u.VolumeCB()), - anisotropy(u.Anisotropy()), - anisotropy_inv(1.0 / anisotropy), - geometry(u.Geometry()) - { - if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); - } - - // we need to transpose and scale for CPS ordering - __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, Float = 1.0) const - { - auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - complex v_[9]; - block_load(v_, reinterpret_cast(in)); - - for (int i = 0; i < Nc; i++) { - for (int j = 0; j < Nc; j++) { v[i * Nc + j] = v_[j * Nc + i] * anisotropy_inv; } - } - } - - __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity) const - { - auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - complex v_[9]; - for (int i = 0; i < Nc; i++) { - for (int j = 0; j < Nc; j++) { v_[i * Nc + j] = v[j * Nc + i] * anisotropy; } - } - - block_store(reinterpret_cast(out), v_); - } - - /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. 
- */ - __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const - { - return gauge_wrapper(const_cast(*this), dim, x_cb, parity); - } + __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const + { + return gauge_wrapper(const_cast(*this), dim, x_cb, parity); + } - size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } - }; + size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } + }; /** @brief struct to define BQCD ordered gauge fields: @@ -2118,7 +2099,7 @@ namespace quda variables in and extended fields with inline halos [mu][parity][volumecb+halos][col][row] */ - template struct BQCDOrder : LegacyOrder { + template struct BQCDOrder : LegacyOrder { using Accessor = BQCDOrder; using real = typename mapper::type; using complex = complex; @@ -2127,12 +2108,14 @@ namespace quda int exVolumeCB; // extended checkerboard volume static constexpr int Nc = 3; BQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : - LegacyOrder(u, ghost_), gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), volumeCB(u.VolumeCB()) + LegacyOrder(u, ghost_), + gauge(gauge_ ? 
gauge_ : (Float *)u.Gauge_p()), + volumeCB(u.VolumeCB()) { if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); // compute volumeCB + halo region - exVolumeCB = u.X()[0] / 2 + 2; - for (int i = 1; i < 4; i++) exVolumeCB *= u.X()[i] + 2; + exVolumeCB = u.X()[0]/2 + 2; + for (int i=1; i<4; i++) exVolumeCB *= u.X()[i] + 2; } // we need to transpose for BQCD ordering @@ -2180,7 +2163,7 @@ namespace quda @brief struct to define TIFR ordered gauge fields: [mu][parity][volumecb][col][row] */ - template struct TIFROrder : LegacyOrder { + template struct TIFROrder : LegacyOrder { using Accessor = TIFROrder; using real = typename mapper::type; using complex = complex; @@ -2244,7 +2227,7 @@ namespace quda struct to define TIFR ordered gauge fields (with inlined z halo of depth two): [mu][parity][t][z+4][y][x/2][col][row] */ - template struct TIFRPaddedOrder : LegacyOrder { + template struct TIFRPaddedOrder : LegacyOrder { using Accessor = TIFRPaddedOrder; using real = typename mapper::type; using complex = complex; @@ -2269,23 +2252,22 @@ namespace quda if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); // exVolumeCB is the padded checkboard volume - for (int i = 0; i < 4; i++) exVolumeCB *= exDim[i]; - exVolumeCB /= 2; + for (int i=0; i<4; i++) exVolumeCB *= exDim[i]; + exVolumeCB /= 2; } /** - @brief Compute the index into the padded field. Assumes that - parity doesn't change from unpadded to padded. + @brief Compute the index into the padded field. Assumes that + parity doesn't change from unpadded to padded. 
*/ - __device__ __host__ inline int getPaddedIndex(int x_cb, int parity) const - { - // find coordinates - int coord[4]; - getCoords(coord, x_cb, dim, parity); + __device__ __host__ inline int getPaddedIndex(int x_cb, int parity) const { + // find coordinates + int coord[4]; + getCoords(coord, x_cb, dim, parity); - // get z-extended index - coord[2] += 2; // offset for halo - return linkIndex(coord, exDim); + // get z-extended index + coord[2] += 2; // offset for halo + return linkIndex(coord, exDim); } // we need to transpose for TIFR ordering @@ -2332,7 +2314,8 @@ namespace quda size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } }; - /** + +/** struct to define OpenQCD ordered gauge fields: [volumecb][dim][parity*][row][col] parity*: uplink/downlink (link attached to closest odd site) */ @@ -2355,6 +2338,7 @@ namespace quda if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); } + // TODO: make this function // __device__ __host__ inline int QUDAtoOpenQxD(int x_cb_QUDA, int dir_QUDA, int parity_QUDA) const // TODO: Implement ipt and iup functions @@ -2362,52 +2346,40 @@ namespace quda // } - /* ORIGINAL */ - -#if 0 - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, real = 1.0) const - { - auto in = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - block_load(v, reinterpret_cast(in)); - } - __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const - { - auto out = &gauge[((parity * volumeCB + x) * geometry + dir) * length]; - block_store(reinterpret_cast(out), v); - } -#endif - - /*****************/ - - __device__ __host__ inline void load(complex v[length / 2], int x, int dir, int parity, Float = 1.0) const + __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, + Float = 1.0) const { // With ''natural'' order: lexicographical 0123 = txyz , t fastest, links 0123 = txyz in pos directions // Indexing fun: int 
coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) + getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; - int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; + // TODO: Determine whether coord[mu] is local or global + int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - coord1 in QUDA is x2 in OpenQxD (y) - coord2 in QUDA is x3 in OpenQxD (z) - coord3 in QUDA is x0 in OpenQxD (t) */ + coord1 in QUDA is x2 in OpenQxD (y) + coord2 in QUDA is x3 in OpenQxD (z) + coord3 in QUDA is x0 in OpenQxD (t) + */ // int ix_OpenQxD = ipt[iy_OpenQxD]; - int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD // Loading as per QUDA style - auto in = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; - // This is how they're accessed within OpenQxd (length = 18 - // doubles = 9 complex doubles = 1 su3dble struct) + auto in + = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 18 + // doubles = 9 complex doubles = 1 su3dble struct) // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed // within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) block_load(v, reinterpret_cast(in)); + + } - __device__ __host__ inline void save(const complex v[length / 2], int x, int dir, int parity) const + __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity) const { // Indexing fun: int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
@@ -2438,7 +2410,7 @@ namespace quda @brief This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operators for manipulating at the site level interms of matrix operations. - @param[in] dim Which dimension are we requesting // FIXME: dim is a typo for dir! + @param[in] dir Which dimension are we requesting @param[in] x_cb Checkerboarded space-time index we are requesting @param[in] parity Parity we are requesting @return Instance of a gauge_wrapper that curries in access to @@ -2456,6 +2428,7 @@ namespace quda }; } // namespace gauge + template __device__ __host__ inline auto operator*(const gauge::fieldorder_wrapper &a, @@ -2595,31 +2568,16 @@ namespace quda typedef gauge::QDPOrder type; }; - template struct gauge_order_mapper { - }; - template struct gauge_order_mapper { - typedef gauge::QDPOrder type; - }; - template struct gauge_order_mapper { - typedef gauge::QDPJITOrder type; - }; - template struct gauge_order_mapper { - typedef gauge::MILCOrder type; - }; + template struct gauge_order_mapper { }; + template struct gauge_order_mapper { typedef gauge::QDPOrder type; }; + template struct gauge_order_mapper { typedef gauge::QDPJITOrder type; }; + template struct gauge_order_mapper { typedef gauge::MILCOrder type; }; template struct gauge_order_mapper { typedef gauge::CPSOrder type; }; - template struct gauge_order_mapper { - typedef gauge::BQCDOrder type; - }; - template struct gauge_order_mapper { - typedef gauge::TIFROrder type; - }; - template struct gauge_order_mapper { - typedef gauge::TIFRPaddedOrder type; - }; - template struct gauge_order_mapper { - typedef gauge::FloatNOrder type; - }; + template struct gauge_order_mapper { typedef gauge::BQCDOrder type; }; + template struct gauge_order_mapper { typedef gauge::TIFROrder type; }; + template struct gauge_order_mapper { typedef gauge::TIFRPaddedOrder type; }; + template struct gauge_order_mapper { typedef gauge::FloatNOrder type; }; } // namespace quda diff --git 
a/include/quda.h b/include/quda.h index 90fc9cf165..70a14bc804 100644 --- a/include/quda.h +++ b/include/quda.h @@ -24,1692 +24,1705 @@ extern "C" { #endif -/** - * Parameters having to do with the gauge field or the - * interpretation of the gauge field by various Dirac operators - */ -typedef struct QudaGaugeParam_s { - size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ - QudaFieldLocation location; /**< The location of the gauge field */ + /** + * Parameters having to do with the gauge field or the + * interpretation of the gauge field by various Dirac operators + */ + typedef struct QudaGaugeParam_s { + size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ + QudaFieldLocation location; /**< The location of the gauge field */ - int X[4]; /**< The local space-time dimensions (without checkboarding) */ + int X[4]; /**< The local space-time dimensions (without checkboarding) */ - double anisotropy; /**< Used for Wilson and Wilson-clover */ - double tadpole_coeff; /**< Used for staggered only */ - double scale; /**< Used by staggered long links */ + double anisotropy; /**< Used for Wilson and Wilson-clover */ + double tadpole_coeff; /**< Used for staggered only */ + double scale; /**< Used by staggered long links */ - QudaLinkType type; /**< The link type of the gauge field (e.g., Wilson, fat, long, etc.) */ - QudaGaugeFieldOrder gauge_order; /**< The ordering on the input gauge field */ + QudaLinkType type; /**< The link type of the gauge field (e.g., Wilson, fat, long, etc.) 
*/ + QudaGaugeFieldOrder gauge_order; /**< The ordering on the input gauge field */ - QudaTboundary t_boundary; /**< The temporal boundary condition that will be used for fermion fields */ + QudaTboundary t_boundary; /**< The temporal boundary condition that will be used for fermion fields */ - QudaPrecision cpu_prec; /**< The precision used by the caller */ + QudaPrecision cpu_prec; /**< The precision used by the caller */ - QudaPrecision cuda_prec; /**< The precision of the cuda gauge field */ - QudaReconstructType reconstruct; /**< The reconstruction type of the cuda gauge field */ + QudaPrecision cuda_prec; /**< The precision of the cuda gauge field */ + QudaReconstructType reconstruct; /**< The reconstruction type of the cuda gauge field */ - QudaPrecision cuda_prec_sloppy; /**< The precision of the sloppy gauge field */ - QudaReconstructType reconstruct_sloppy; /**< The recontruction type of the sloppy gauge field */ + QudaPrecision cuda_prec_sloppy; /**< The precision of the sloppy gauge field */ + QudaReconstructType reconstruct_sloppy; /**< The recontruction type of the sloppy gauge field */ - QudaPrecision - cuda_prec_refinement_sloppy; /**< The precision of the sloppy gauge field for the refinement step in multishift */ - QudaReconstructType reconstruct_refinement_sloppy; /**< The recontruction type of the sloppy gauge field for the - refinement step in multishift*/ + QudaPrecision cuda_prec_refinement_sloppy; /**< The precision of the sloppy gauge field for the refinement step in multishift */ + QudaReconstructType reconstruct_refinement_sloppy; /**< The recontruction type of the sloppy gauge field for the refinement step in multishift*/ - QudaPrecision cuda_prec_precondition; /**< The precision of the preconditioner gauge field */ - QudaReconstructType reconstruct_precondition; /**< The recontruction type of the preconditioner gauge field */ + QudaPrecision cuda_prec_precondition; /**< The precision of the preconditioner gauge field */ + 
QudaReconstructType reconstruct_precondition; /**< The recontruction type of the preconditioner gauge field */ - QudaPrecision cuda_prec_eigensolver; /**< The precision of the eigensolver gauge field */ - QudaReconstructType reconstruct_eigensolver; /**< The recontruction type of the eigensolver gauge field */ + QudaPrecision cuda_prec_eigensolver; /**< The precision of the eigensolver gauge field */ + QudaReconstructType reconstruct_eigensolver; /**< The recontruction type of the eigensolver gauge field */ - QudaGaugeFixed gauge_fix; /**< Whether the input gauge field is in the axial gauge or not */ + QudaGaugeFixed gauge_fix; /**< Whether the input gauge field is in the axial gauge or not */ - int ga_pad; /**< The pad size that the cudaGaugeField will use (default=0) */ + int ga_pad; /**< The pad size that the cudaGaugeField will use (default=0) */ - int site_ga_pad; /**< Used by link fattening and the gauge and fermion forces */ + int site_ga_pad; /**< Used by link fattening and the gauge and fermion forces */ - int staple_pad; /**< Used by link fattening */ - int llfat_ga_pad; /**< Used by link fattening */ - int mom_ga_pad; /**< Used by the gauge and fermion forces */ + int staple_pad; /**< Used by link fattening */ + int llfat_ga_pad; /**< Used by link fattening */ + int mom_ga_pad; /**< Used by the gauge and fermion forces */ - QudaStaggeredPhase staggered_phase_type; /**< Set the staggered phase type of the links */ - int staggered_phase_applied; /**< Whether the staggered phase has already been applied to the links */ + QudaStaggeredPhase staggered_phase_type; /**< Set the staggered phase type of the links */ + int staggered_phase_applied; /**< Whether the staggered phase has already been applied to the links */ - double i_mu; /**< Imaginary chemical potential */ + double i_mu; /**< Imaginary chemical potential */ - int overlap; /**< Width of overlapping domains */ + int overlap; /**< Width of overlapping domains */ - int overwrite_gauge; /**< When 
computing gauge, should we overwrite it or accumulate to it */ - int overwrite_mom; /**< When computing momentum, should we overwrite it or accumulate to it */ + int overwrite_gauge; /**< When computing gauge, should we overwrite it or accumulate to it */ + int overwrite_mom; /**< When computing momentum, should we overwrite it or accumulate to it */ - int use_resident_gauge; /**< Use the resident gauge field as input */ - int use_resident_mom; /**< Use the resident momentum field as input*/ - int make_resident_gauge; /**< Make the result gauge field resident */ - int make_resident_mom; /**< Make the result momentum field resident */ - int return_result_gauge; /**< Return the result gauge field */ - int return_result_mom; /**< Return the result momentum field */ + int use_resident_gauge; /**< Use the resident gauge field as input */ + int use_resident_mom; /**< Use the resident momentum field as input*/ + int make_resident_gauge; /**< Make the result gauge field resident */ + int make_resident_mom; /**< Make the result momentum field resident */ + int return_result_gauge; /**< Return the result gauge field */ + int return_result_mom; /**< Return the result momentum field */ - size_t gauge_offset; /**< Offset into MILC site struct to the gauge field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ - size_t mom_offset; /**< Offset into MILC site struct to the momentum field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ - size_t site_size; /**< Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ -} QudaGaugeParam; + size_t gauge_offset; /**< Offset into MILC site struct to the gauge field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ + size_t mom_offset; /**< Offset into MILC site struct to the momentum field (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ + size_t site_size; /**< Size of MILC site struct (only if gauge_order=MILC_SITE_GAUGE_ORDER) */ + } QudaGaugeParam; -/** - * Parameters relating to the solver and the choice of Dirac operator. 
- */ -typedef struct QudaInvertParam_s { - /** Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ - size_t struct_size; + /** + * Parameters relating to the solver and the choice of Dirac operator. + */ + typedef struct QudaInvertParam_s { - QudaFieldLocation input_location; /**< The location of the input field */ - QudaFieldLocation output_location; /**< The location of the output field */ + /** Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ + size_t struct_size; - QudaDslashType dslash_type; /**< The Dirac Dslash type that is being used */ - QudaInverterType inv_type; /**< Which linear solver to use */ + QudaFieldLocation input_location; /**< The location of the input field */ + QudaFieldLocation output_location; /**< The location of the output field */ - double mass; /**< Used for staggered only */ - double kappa; /**< Used for Wilson and Wilson-clover */ + QudaDslashType dslash_type; /**< The Dirac Dslash type that is being used */ + QudaInverterType inv_type; /**< Which linear solver to use */ - double m5; /**< Domain wall height */ - int Ls; /**< Extent of the 5th dimension (for domain wall) */ + double mass; /**< Used for staggered only */ + double kappa; /**< Used for Wilson and Wilson-clover */ - double_complex b_5[QUDA_MAX_DWF_LS]; /**< Mobius coefficients - only real part used if regular Mobius */ - double_complex c_5[QUDA_MAX_DWF_LS]; /**< Mobius coefficients - only real part used if regular Mobius */ + double m5; /**< Domain wall height */ + int Ls; /**< Extent of the 5th dimension (for domain wall) */ - /**< - * The following specifies the EOFA parameters. Notation follows arXiv:1706.05843 - * eofa_shift: the "\beta" in the paper - * eofa_pm: plus or minus for the EOFA operator - * mq1, mq2, mq3 are the three masses corresponds to Hasenbusch mass spliting. 
- * As far as I know mq1 is always the same as "mass" but it's here just for consistence. - * */ - double eofa_shift; - int eofa_pm; - double mq1; - double mq2; - double mq3; + double_complex b_5[QUDA_MAX_DWF_LS]; /**< Mobius coefficients - only real part used if regular Mobius */ + double_complex c_5[QUDA_MAX_DWF_LS]; /**< Mobius coefficients - only real part used if regular Mobius */ - double mu; /**< Twisted mass parameter */ - double tm_rho; /**< Hasenbusch mass shift applied like twisted mass to diagonal (but not inverse) */ - double epsilon; /**< Twisted mass parameter */ + /**< + * The following specifies the EOFA parameters. Notation follows arXiv:1706.05843 + * eofa_shift: the "\beta" in the paper + * eofa_pm: plus or minus for the EOFA operator + * mq1, mq2, mq3 are the three masses corresponds to Hasenbusch mass spliting. + * As far as I know mq1 is always the same as "mass" but it's here just for consistence. + * */ + double eofa_shift; + int eofa_pm; + double mq1; + double mq2; + double mq3; - QudaTwistFlavorType twist_flavor; /**< Twisted mass flavor */ + double mu; /**< Twisted mass parameter */ + double tm_rho; /**< Hasenbusch mass shift applied like twisted mass to diagonal (but not inverse) */ + double epsilon; /**< Twisted mass parameter */ - int laplace3D; /**< omit this direction from laplace operator: x,y,z,t -> 0,1,2,3 (-1 is full 4D) */ + QudaTwistFlavorType twist_flavor; /**< Twisted mass flavor */ - double tol; /**< Solver tolerance in the L2 residual norm */ - double tol_restart; /**< Solver tolerance in the L2 residual norm (used to restart InitCG) */ - double tol_hq; /**< Solver tolerance in the heavy quark residual norm */ + int laplace3D; /**< omit this direction from laplace operator: x,y,z,t -> 0,1,2,3 (-1 is full 4D) */ - int compute_true_res; /** Whether to compute the true residual post solve */ - double true_res; /**< Actual L2 residual norm achieved in solver */ - double true_res_hq; /**< Actual heavy quark residual norm 
achieved in solver */ - int maxiter; /**< Maximum number of iterations in the linear solver */ - double reliable_delta; /**< Reliable update tolerance */ - double reliable_delta_refinement; /**< Reliable update tolerance used in post multi-shift solver refinement */ - int use_alternative_reliable; /**< Whether to use alternative reliable updates */ - int use_sloppy_partial_accumulator; /**< Whether to keep the partial solution accumuator in sloppy precision */ + double tol; /**< Solver tolerance in the L2 residual norm */ + double tol_restart; /**< Solver tolerance in the L2 residual norm (used to restart InitCG) */ + double tol_hq; /**< Solver tolerance in the heavy quark residual norm */ - /**< This parameter determines how often we accumulate into the - solution vector from the direction vectors in the solver. - E.g., running with solution_accumulator_pipeline = 4, means we - will update the solution vector every four iterations using the - direction vectors from the prior four iterations. This - increases performance of mixed-precision solvers since it means - less high-precision vector round-trip memory travel, but - requires more low-precision memory allocation. 
*/ - int solution_accumulator_pipeline; + int compute_true_res; /** Whether to compute the true residual post solve */ + double true_res; /**< Actual L2 residual norm achieved in solver */ + double true_res_hq; /**< Actual heavy quark residual norm achieved in solver */ + int maxiter; /**< Maximum number of iterations in the linear solver */ + double reliable_delta; /**< Reliable update tolerance */ + double reliable_delta_refinement; /**< Reliable update tolerance used in post multi-shift solver refinement */ + int use_alternative_reliable; /**< Whether to use alternative reliable updates */ + int use_sloppy_partial_accumulator; /**< Whether to keep the partial solution accumuator in sloppy precision */ - /**< This parameter determines how many consecutive reliable update - residual increases we tolerate before terminating the solver, - i.e., how long do we want to keep trying to converge */ - int max_res_increase; + /**< This parameter determines how often we accumulate into the + solution vector from the direction vectors in the solver. + E.g., running with solution_accumulator_pipeline = 4, means we + will update the solution vector every four iterations using the + direction vectors from the prior four iterations. This + increases performance of mixed-precision solvers since it means + less high-precision vector round-trip memory travel, but + requires more low-precision memory allocation. 
*/ + int solution_accumulator_pipeline; - /**< This parameter determines how many total reliable update - residual increases we tolerate before terminating the solver, - i.e., how long do we want to keep trying to converge */ - int max_res_increase_total; + /**< This parameter determines how many consecutive reliable update + residual increases we tolerate before terminating the solver, + i.e., how long do we want to keep trying to converge */ + int max_res_increase; - /**< This parameter determines how many consecutive heavy-quark - residual increases we tolerate before terminating the solver, - i.e., how long do we want to keep trying to converge */ - int max_hq_res_increase; + /**< This parameter determines how many total reliable update + residual increases we tolerate before terminating the solver, + i.e., how long do we want to keep trying to converge */ + int max_res_increase_total; - /**< This parameter determines how many total heavy-quark residual - restarts we tolerate before terminating the solver, i.e., how long - do we want to keep trying to converge */ - int max_hq_res_restart_total; + /**< This parameter determines how many consecutive heavy-quark + residual increases we tolerate before terminating the solver, + i.e., how long do we want to keep trying to converge */ + int max_hq_res_increase; - /**< After how many iterations shall the heavy quark residual be updated */ - int heavy_quark_check; + /**< This parameter determines how many total heavy-quark residual + restarts we tolerate before terminating the solver, i.e., how long + do we want to keep trying to converge */ + int max_hq_res_restart_total; - int pipeline; /**< Whether to use a pipelined solver with less global sums */ + /**< After how many iterations shall the heavy quark residual be updated */ + int heavy_quark_check; - int num_offset; /**< Number of offsets in the multi-shift solver */ + int pipeline; /**< Whether to use a pipelined solver with less global sums */ - int num_src; /**< 
Number of sources in the multiple source solver */ + int num_offset; /**< Number of offsets in the multi-shift solver */ - int num_src_per_sub_partition; /**< Number of sources in the multiple source solver, but per sub-partition */ + int num_src; /**< Number of sources in the multiple source solver */ - /**< The grid of sub-partition according to which the processor grid will be partitioned. - Should have: - split_grid[0] * split_grid[1] * split_grid[2] * split_grid[3] * num_src_per_sub_partition == num_src. **/ - int split_grid[QUDA_MAX_DIM]; + int num_src_per_sub_partition; /**< Number of sources in the multiple source solver, but per sub-partition */ - int overlap; /**< Width of domain overlaps */ + /**< The grid of sub-partition according to which the processor grid will be partitioned. + Should have: + split_grid[0] * split_grid[1] * split_grid[2] * split_grid[3] * num_src_per_sub_partition == num_src. **/ + int split_grid[QUDA_MAX_DIM]; - /** Offsets for multi-shift solver */ - double offset[QUDA_MAX_MULTI_SHIFT]; + int overlap; /**< Width of domain overlaps */ - /** Solver tolerance for each offset */ - double tol_offset[QUDA_MAX_MULTI_SHIFT]; + /** Offsets for multi-shift solver */ + double offset[QUDA_MAX_MULTI_SHIFT]; - /** Solver tolerance for each shift when refinement is applied using the heavy-quark residual */ - double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]; + /** Solver tolerance for each offset */ + double tol_offset[QUDA_MAX_MULTI_SHIFT]; - /** Actual L2 residual norm achieved in solver for each offset */ - double true_res_offset[QUDA_MAX_MULTI_SHIFT]; + /** Solver tolerance for each shift when refinement is applied using the heavy-quark residual */ + double tol_hq_offset[QUDA_MAX_MULTI_SHIFT]; - /** Iterated L2 residual norm achieved in multi shift solver for each offset */ - double iter_res_offset[QUDA_MAX_MULTI_SHIFT]; + /** Actual L2 residual norm achieved in solver for each offset */ + double true_res_offset[QUDA_MAX_MULTI_SHIFT]; - /** Actual 
heavy quark residual norm achieved in solver for each offset */ - double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]; + /** Iterated L2 residual norm achieved in multi shift solver for each offset */ + double iter_res_offset[QUDA_MAX_MULTI_SHIFT]; - /** Residuals in the partial faction expansion */ - double residue[QUDA_MAX_MULTI_SHIFT]; + /** Actual heavy quark residual norm achieved in solver for each offset */ + double true_res_hq_offset[QUDA_MAX_MULTI_SHIFT]; - /** Whether we should evaluate the action after the linear solver*/ - int compute_action; + /** Residuals in the partial faction expansion */ + double residue[QUDA_MAX_MULTI_SHIFT]; - /** Computed value of the bilinear action (complex-valued) - invert: \phi^\dagger A^{-1} \phi - multishift: \phi^\dagger r(x) \phi = \phi^\dagger (sum_k residue[k] * (A + offset[k])^{-1} ) \phi */ - double action[2]; + /** Whether we should evaluate the action after the linear solver*/ + int compute_action; - QudaSolutionType solution_type; /**< Type of system to solve */ - QudaSolveType solve_type; /**< How to solve it */ - QudaMatPCType matpc_type; /**< The preconditioned matrix type */ - QudaDagType dagger; /**< Whether we are using the Hermitian conjugate system or not */ - QudaMassNormalization mass_normalization; /**< The mass normalization is being used by the caller */ - QudaSolverNormalization solver_normalization; /**< The normalization desired in the solver */ + /** Computed value of the bilinear action (complex-valued) + invert: \phi^\dagger A^{-1} \phi + multishift: \phi^\dagger r(x) \phi = \phi^\dagger (sum_k residue[k] * (A + offset[k])^{-1} ) \phi */ + double action[2]; - QudaPreserveSource preserve_source; /**< Preserve the source or not in the linear solver (deprecated) */ + QudaSolutionType solution_type; /**< Type of system to solve */ + QudaSolveType solve_type; /**< How to solve it */ + QudaMatPCType matpc_type; /**< The preconditioned matrix type */ + QudaDagType dagger; /**< Whether we are using the 
Hermitian conjugate system or not */ + QudaMassNormalization mass_normalization; /**< The mass normalization is being used by the caller */ + QudaSolverNormalization solver_normalization; /**< The normalization desired in the solver */ - QudaPrecision cpu_prec; /**< The precision used by the input fermion fields */ - QudaPrecision cuda_prec; /**< The precision used by the QUDA solver */ - QudaPrecision cuda_prec_sloppy; /**< The precision used by the QUDA sloppy operator */ - QudaPrecision - cuda_prec_refinement_sloppy; /**< The precision of the sloppy gauge field for the refinement step in multishift */ - QudaPrecision cuda_prec_precondition; /**< The precision used by the QUDA preconditioner */ - QudaPrecision cuda_prec_eigensolver; /**< The precision used by the QUDA eigensolver */ + QudaPreserveSource preserve_source; /**< Preserve the source or not in the linear solver (deprecated) */ - QudaDiracFieldOrder dirac_order; /**< The order of the input and output fermion fields */ + QudaPrecision cpu_prec; /**< The precision used by the input fermion fields */ + QudaPrecision cuda_prec; /**< The precision used by the QUDA solver */ + QudaPrecision cuda_prec_sloppy; /**< The precision used by the QUDA sloppy operator */ + QudaPrecision cuda_prec_refinement_sloppy; /**< The precision of the sloppy gauge field for the refinement step in multishift */ + QudaPrecision cuda_prec_precondition; /**< The precision used by the QUDA preconditioner */ + QudaPrecision cuda_prec_eigensolver; /**< The precision used by the QUDA eigensolver */ - QudaGammaBasis gamma_basis; /**< Gamma basis of the input and output host fields */ + QudaDiracFieldOrder dirac_order; /**< The order of the input and output fermion fields */ - QudaFieldLocation clover_location; /**< The location of the clover field */ - QudaPrecision clover_cpu_prec; /**< The precision used for the input clover field */ - QudaPrecision clover_cuda_prec; /**< The precision used for the clover field in the QUDA solver */ - 
QudaPrecision clover_cuda_prec_sloppy; /**< The precision used for the clover field in the QUDA sloppy operator */ - QudaPrecision clover_cuda_prec_refinement_sloppy; /**< The precision of the sloppy clover field for the refinement step in multishift */ - QudaPrecision clover_cuda_prec_precondition; /**< The precision used for the clover field in the QUDA preconditioner */ - QudaPrecision clover_cuda_prec_eigensolver; /**< The precision used for the clover field in the QUDA eigensolver */ + QudaGammaBasis gamma_basis; /**< Gamma basis of the input and output host fields */ - QudaCloverFieldOrder clover_order; /**< The order of the input clover field */ - QudaUseInitGuess use_init_guess; /**< Whether to use an initial guess in the solver or not */ + QudaFieldLocation clover_location; /**< The location of the clover field */ + QudaPrecision clover_cpu_prec; /**< The precision used for the input clover field */ + QudaPrecision clover_cuda_prec; /**< The precision used for the clover field in the QUDA solver */ + QudaPrecision clover_cuda_prec_sloppy; /**< The precision used for the clover field in the QUDA sloppy operator */ + QudaPrecision clover_cuda_prec_refinement_sloppy; /**< The precision of the sloppy clover field for the refinement step in multishift */ + QudaPrecision clover_cuda_prec_precondition; /**< The precision used for the clover field in the QUDA preconditioner */ + QudaPrecision clover_cuda_prec_eigensolver; /**< The precision used for the clover field in the QUDA eigensolver */ - double clover_csw; /**< Csw coefficient of the clover term */ - double clover_coeff; /**< Coefficient of the clover term */ - double clover_rho; /**< Real number added to the clover diagonal (not to inverse) */ + QudaCloverFieldOrder clover_order; /**< The order of the input clover field */ + QudaUseInitGuess use_init_guess; /**< Whether to use an initial guess in the solver or not */ - int compute_clover_trlog; /**< Whether to compute the trace log of the clover term */ - 
double trlogA[2]; /**< The trace log of the clover term (even/odd computed separately) */ + double clover_csw; /**< Csw coefficient of the clover term */ + double clover_coeff; /**< Coefficient of the clover term */ + double clover_rho; /**< Real number added to the clover diagonal (not to inverse) */ - int compute_clover; /**< Whether to compute the clover field */ - int compute_clover_inverse; /**< Whether to compute the clover inverse field */ - int return_clover; /**< Whether to copy back the clover matrix field */ - int return_clover_inverse; /**< Whether to copy back the inverted clover matrix field */ + int compute_clover_trlog; /**< Whether to compute the trace log of the clover term */ + double trlogA[2]; /**< The trace log of the clover term (even/odd computed separately) */ - QudaVerbosity verbosity; /**< The verbosity setting to use in the solver */ + int compute_clover; /**< Whether to compute the clover field */ + int compute_clover_inverse; /**< Whether to compute the clover inverse field */ + int return_clover; /**< Whether to copy back the clover matrix field */ + int return_clover_inverse; /**< Whether to copy back the inverted clover matrix field */ - int iter; /**< The number of iterations performed by the solver */ - double gflops; /**< The Gflops rate of the solver */ - double secs; /**< The time taken by the solver */ + QudaVerbosity verbosity; /**< The verbosity setting to use in the solver */ - QudaTune tune; /**< Enable auto-tuning? (default = QUDA_TUNE_YES) */ + int iter; /**< The number of iterations performed by the solver */ + double gflops; /**< The Gflops rate of the solver */ + double secs; /**< The time taken by the solver */ - /** Number of steps in s-step algorithms */ - int Nsteps; + QudaTune tune; /**< Enable auto-tuning? 
(default = QUDA_TUNE_YES) */ - /** Maximum size of Krylov space used by solver */ - int gcrNkrylov; + /** Number of steps in s-step algorithms */ + int Nsteps; - /* - * The following parameters are related to the solver - * preconditioner, if enabled. - */ + /** Maximum size of Krylov space used by solver */ + int gcrNkrylov; - /** - * The inner Krylov solver used in the preconditioner. Set to - * QUDA_INVALID_INVERTER to disable the preconditioner entirely. - */ - QudaInverterType inv_type_precondition; + /* + * The following parameters are related to the solver + * preconditioner, if enabled. + */ - /** Preconditioner instance, e.g., multigrid */ - void *preconditioner; + /** + * The inner Krylov solver used in the preconditioner. Set to + * QUDA_INVALID_INVERTER to disable the preconditioner entirely. + */ + QudaInverterType inv_type_precondition; - /** Deflation instance */ - void *deflation_op; + /** Preconditioner instance, e.g., multigrid */ + void *preconditioner; - /** defines deflation */ - void *eig_param; + /** Deflation instance */ + void *deflation_op; - /** If true, deflate the initial guess */ - QudaBoolean deflate; + /** defines deflation */ + void *eig_param; - /** Dirac Dslash used in preconditioner */ - QudaDslashType dslash_type_precondition; - /** Verbosity of the inner Krylov solver */ - QudaVerbosity verbosity_precondition; + /** If true, deflate the initial guess */ + QudaBoolean deflate; - /** Tolerance in the inner solver */ - double tol_precondition; + /** Dirac Dslash used in preconditioner */ + QudaDslashType dslash_type_precondition; + /** Verbosity of the inner Krylov solver */ + QudaVerbosity verbosity_precondition; - /** Maximum number of iterations allowed in the inner solver */ - int maxiter_precondition; + /** Tolerance in the inner solver */ + double tol_precondition; - /** Relaxation parameter used in GCR-DD (default = 1.0) */ - double omega; + /** Maximum number of iterations allowed in the inner solver */ + int 
maxiter_precondition; - /** Basis for CA algorithms */ - QudaCABasis ca_basis; + /** Relaxation parameter used in GCR-DD (default = 1.0) */ + double omega; - /** Minimum eigenvalue for Chebyshev CA basis */ - double ca_lambda_min; + /** Basis for CA algorithms */ + QudaCABasis ca_basis; - /** Maximum eigenvalue for Chebyshev CA basis */ - double ca_lambda_max; + /** Minimum eigenvalue for Chebyshev CA basis */ + double ca_lambda_min; - /** Basis for CA algorithms in a preconditioned solver */ - QudaCABasis ca_basis_precondition; + /** Maximum eigenvalue for Chebyshev CA basis */ + double ca_lambda_max; - /** Minimum eigenvalue for Chebyshev CA basis in a preconditioner solver */ - double ca_lambda_min_precondition; + /** Basis for CA algorithms in a preconditioned solver */ + QudaCABasis ca_basis_precondition; - /** Maximum eigenvalue for Chebyshev CA basis in a preconditioner solver */ - double ca_lambda_max_precondition; + /** Minimum eigenvalue for Chebyshev CA basis in a preconditioner solver */ + double ca_lambda_min_precondition; - /** Number of preconditioner cycles to perform per iteration */ - int precondition_cycle; + /** Maximum eigenvalue for Chebyshev CA basis in a preconditioner solver */ + double ca_lambda_max_precondition; - /** Whether to use additive or multiplicative Schwarz preconditioning */ - QudaSchwarzType schwarz_type; + /** Number of preconditioner cycles to perform per iteration */ + int precondition_cycle; - /** The type of accelerator type to use for preconditioner */ - QudaAcceleratorType accelerator_type_precondition; + /** Whether to use additive or multiplicative Schwarz preconditioning */ + QudaSchwarzType schwarz_type; - /** - * The following parameters are the ones used to perform the adaptive MADWF in MSPCG - * See section 3.3 of [arXiv:2104.05615] - */ + /** The type of accelerator type to use for preconditioner */ + QudaAcceleratorType accelerator_type_precondition; - /** The diagonal constant to suppress the low modes when 
performing 5D transfer */ - double madwf_diagonal_suppressor; + /** + * The following parameters are the ones used to perform the adaptive MADWF in MSPCG + * See section 3.3 of [arXiv:2104.05615] + */ - /** The target MADWF Ls to be used in the accelerator */ - int madwf_ls; + /** The diagonal constant to suppress the low modes when performing 5D transfer */ + double madwf_diagonal_suppressor; - /** The minimum number of iterations after which to generate the null vectors for MADWF */ - int madwf_null_miniter; + /** The target MADWF Ls to be used in the accelerator */ + int madwf_ls; - /** The maximum tolerance after which to generate the null vectors for MADWF */ - double madwf_null_tol; + /** The minimum number of iterations after which to generate the null vectors for MADWF */ + int madwf_null_miniter; - /** The maximum number of iterations for the training iterations */ - int madwf_train_maxiter; + /** The maximum tolerance after which to generate the null vectors for MADWF */ + double madwf_null_tol; - /** Whether to load the MADWF parameters from the file system */ - QudaBoolean madwf_param_load; + /** The maximum number of iterations for the training iterations */ + int madwf_train_maxiter; - /** Whether to save the MADWF parameters to the file system */ - QudaBoolean madwf_param_save; + /** Whether to load the MADWF parameters from the file system */ + QudaBoolean madwf_param_load; - /** Path to load from the file system */ - char madwf_param_infile[256]; + /** Whether to save the MADWF parameters to the file system */ + QudaBoolean madwf_param_save; - /** Path to save to the file system */ - char madwf_param_outfile[256]; + /** Path to load from the file system */ + char madwf_param_infile[256]; - /** - * Whether to use the L2 relative residual, Fermilab heavy-quark - * residual, or both to determine convergence. 
To require that both - * stopping conditions are satisfied, use a bitwise OR as follows: - * - * p.residual_type = (QudaResidualType) (QUDA_L2_RELATIVE_RESIDUAL - * | QUDA_HEAVY_QUARK_RESIDUAL); - */ - QudaResidualType residual_type; + /** Path to save to the file system */ + char madwf_param_outfile[256]; - /**Parameters for deflated solvers*/ - /** The precision of the Ritz vectors */ - QudaPrecision cuda_prec_ritz; - /** How many vectors to compute after one solve - * for eigCG recommended values 8 or 16 - */ - int n_ev; - /** EeigCG : Search space dimension - * gmresdr : Krylov subspace dimension - */ - int max_search_dim; - /** For systems with many RHS: current RHS index */ - int rhs_idx; - /** Specifies deflation space volume: total number of eigenvectors is n_ev*deflation_grid */ - int deflation_grid; - /** eigCG: selection criterion for the reduced eigenvector set */ - double eigenval_tol; - /** mixed precision eigCG tuning parameter: minimum search vector space restarts */ - int eigcg_max_restarts; - /** initCG tuning parameter: maximum restarts */ - int max_restart_num; - /** initCG tuning parameter: tolerance for cg refinement corrections in the deflation stage */ - double inc_tol; + /** + * Whether to use the L2 relative residual, Fermilab heavy-quark + * residual, or both to determine convergence. 
To require that both + * stopping conditions are satisfied, use a bitwise OR as follows: + * + * p.residual_type = (QudaResidualType) (QUDA_L2_RELATIVE_RESIDUAL + * | QUDA_HEAVY_QUARK_RESIDUAL); + */ + QudaResidualType residual_type; - /** Whether to make the solution vector(s) after the solve */ - int make_resident_solution; - - /** Whether to use the resident solution vector(s) */ - int use_resident_solution; - - /** Whether to use the solution vector to augment the chronological basis */ - int chrono_make_resident; - - /** Whether the solution should replace the last entry in the chronology */ - int chrono_replace_last; - - /** Whether to use the resident chronological basis */ - int chrono_use_resident; - - /** The maximum length of the chronological history to store */ - int chrono_max_dim; - - /** The index to indicate which chrono history we are augmenting */ - int chrono_index; - - /** Precision to store the chronological basis in */ - QudaPrecision chrono_precision; - - /** Which external library to use in the linear solvers (Eigen) */ - QudaExtLibType extlib_type; - - /** Whether to use the platform native or generic BLAS / LAPACK */ - QudaBoolean native_blas_lapack; - - /** Whether to use fused kernels for mobius */ - QudaBoolean use_mobius_fused_kernel; - -} QudaInvertParam; - -// Parameter set for solving eigenvalue problems. -typedef struct QudaEigParam_s { - /** Size of this struct in bytes. 
Used to ensure that the host application and QUDA see the same struct size */ - size_t struct_size; - - // EIGENSOLVER PARAMS - //------------------------------------------------- - /** Used to store information pertinent to the operator **/ - QudaInvertParam *invert_param; - - /** Type of eigensolver algorithm to employ **/ - QudaEigType eig_type; - - /** Use Polynomial Acceleration **/ - QudaBoolean use_poly_acc; - - /** Degree of the Chebysev polynomial **/ - int poly_deg; - - /** Range used in polynomial acceleration **/ - double a_min; - double a_max; - - /** Whether to preserve the deflation space between solves. If - true, the space will be stored in an instance of the - deflation_space struct, pointed to by preserve_deflation_space */ - QudaBoolean preserve_deflation; - - /** This is where we store the deflation space. This will point - to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ - void *preserve_deflation_space; - - /** If we restore the deflation space, this boolean indicates - whether we are also preserving the evalues or recomputing - them. For example if a different mass shift is being used - than the one used to generate the space, then this should be - false, but preserve_deflation would be true */ - QudaBoolean preserve_evals; - - /** What type of Dirac operator we are using **/ - /** If !(use_norm_op) && !(use_dagger) use M. **/ - /** If use_dagger, use Mdag **/ - /** If use_norm_op, use MdagM **/ - /** If use_norm_op && use_dagger use MMdag. **/ - /** If use_pc for any, then use the even-odd pc version **/ - QudaBoolean use_dagger; - QudaBoolean use_norm_op; - QudaBoolean use_pc; - - /** Use Eigen routines to eigensolve the upper Hessenberg via QR **/ - QudaBoolean use_eigen_qr; - - /** Performs an MdagM solve, then constructs the left and right SVD. 
**/ - QudaBoolean compute_svd; + /**Parameters for deflated solvers*/ + /** The precision of the Ritz vectors */ + QudaPrecision cuda_prec_ritz; + /** How many vectors to compute after one solve + * for eigCG recommended values 8 or 16 + */ + int n_ev; + /** EeigCG : Search space dimension + * gmresdr : Krylov subspace dimension + */ + int max_search_dim; + /** For systems with many RHS: current RHS index */ + int rhs_idx; + /** Specifies deflation space volume: total number of eigenvectors is n_ev*deflation_grid */ + int deflation_grid; + /** eigCG: selection criterion for the reduced eigenvector set */ + double eigenval_tol; + /** mixed precision eigCG tuning parameter: minimum search vector space restarts */ + int eigcg_max_restarts; + /** initCG tuning parameter: maximum restarts */ + int max_restart_num; + /** initCG tuning parameter: tolerance for cg refinement corrections in the deflation stage */ + double inc_tol; - /** Performs the \gamma_5 OP solve by Post multipling the eignvectors with - \gamma_5 before computing the eigenvalues */ - QudaBoolean compute_gamma5; + /** Whether to make the solution vector(s) after the solve */ + int make_resident_solution; - /** If true, the solver will error out if the convergence criteria are not met **/ - QudaBoolean require_convergence; + /** Whether to use the resident solution vector(s) */ + int use_resident_solution; + + /** Whether to use the solution vector to augment the chronological basis */ + int chrono_make_resident; + + /** Whether the solution should replace the last entry in the chronology */ + int chrono_replace_last; + + /** Whether to use the resident chronological basis */ + int chrono_use_resident; + + /** The maximum length of the chronological history to store */ + int chrono_max_dim; + + /** The index to indicate which chrono history we are augmenting */ + int chrono_index; + + /** Precision to store the chronological basis in */ + QudaPrecision chrono_precision; + + /** Which external library to 
use in the linear solvers (Eigen) */ + QudaExtLibType extlib_type; + + /** Whether to use the platform native or generic BLAS / LAPACK */ + QudaBoolean native_blas_lapack; + + /** Whether to use fused kernels for mobius */ + QudaBoolean use_mobius_fused_kernel; + + } QudaInvertParam; + + // Parameter set for solving eigenvalue problems. + typedef struct QudaEigParam_s { + /** Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ + size_t struct_size; + + // EIGENSOLVER PARAMS + //------------------------------------------------- + /** Used to store information pertinent to the operator **/ + QudaInvertParam *invert_param; + + /** Type of eigensolver algorithm to employ **/ + QudaEigType eig_type; + + /** Use Polynomial Acceleration **/ + QudaBoolean use_poly_acc; + + /** Degree of the Chebysev polynomial **/ + int poly_deg; + + /** Range used in polynomial acceleration **/ + double a_min; + double a_max; + + /** Whether to preserve the deflation space between solves. If + true, the space will be stored in an instance of the + deflation_space struct, pointed to by preserve_deflation_space */ + QudaBoolean preserve_deflation; + + /** This is where we store the deflation space. This will point + to an instance of deflation_space. When a deflated solver is enabled, the deflation space will be obtained from this. */ + void *preserve_deflation_space; + + /** If we restore the deflation space, this boolean indicates + whether we are also preserving the evalues or recomputing + them. For example if a different mass shift is being used + than the one used to generate the space, then this should be + false, but preserve_deflation would be true */ + QudaBoolean preserve_evals; + + /** What type of Dirac operator we are using **/ + /** If !(use_norm_op) && !(use_dagger) use M. **/ + /** If use_dagger, use Mdag **/ + /** If use_norm_op, use MdagM **/ + /** If use_norm_op && use_dagger use MMdag. 
**/ + /** If use_pc for any, then use the even-odd pc version **/ + QudaBoolean use_dagger; + QudaBoolean use_norm_op; + QudaBoolean use_pc; + + /** Use Eigen routines to eigensolve the upper Hessenberg via QR **/ + QudaBoolean use_eigen_qr; + + /** Performs an MdagM solve, then constructs the left and right SVD. **/ + QudaBoolean compute_svd; - /** Which part of the spectrum to solve **/ - QudaEigSpectrumType spectrum; + /** Performs the \gamma_5 OP solve by Post multipling the eignvectors with + \gamma_5 before computing the eigenvalues */ + QudaBoolean compute_gamma5; - /** Size of the eigenvector search space **/ - int n_ev; - /** Total size of Krylov space **/ - int n_kr; - /** Max number of locked eigenpairs (deduced at runtime) **/ - int nLockedMax; - /** Number of requested converged eigenvectors **/ - int n_conv; - /** Number of requested converged eigenvectors to use in deflation **/ - int n_ev_deflate; - /** Tolerance on the least well known eigenvalue's residual **/ - double tol; - /** Tolerance on the QR iteration **/ - double qr_tol; - /** For IRLM/IRAM, check every nth restart **/ - int check_interval; - /** For IRLM/IRAM, quit after n restarts **/ - int max_restarts; - /** For the Ritz rotation, the maximal number of extra vectors the solver may allocate **/ - int batched_rotate; - /** For block method solvers, the block size **/ - int block_size; + /** If true, the solver will error out if the convergence criteria are not met **/ + QudaBoolean require_convergence; - /** In the test function, cross check the device result against ARPACK **/ - QudaBoolean arpack_check; - /** For Arpack cross check, name of the Arpack logfile **/ - char arpack_logfile[512]; + /** Which part of the spectrum to solve **/ + QudaEigSpectrumType spectrum; - /** Name of the QUDA logfile (residua, upper Hessenberg/tridiag matrix updates) **/ - char QUDA_logfile[512]; + /** Size of the eigenvector search space **/ + int n_ev; + /** Total size of Krylov space **/ + int n_kr; + 
/** Max number of locked eigenpairs (deduced at runtime) **/ + int nLockedMax; + /** Number of requested converged eigenvectors **/ + int n_conv; + /** Number of requested converged eigenvectors to use in deflation **/ + int n_ev_deflate; + /** Tolerance on the least well known eigenvalue's residual **/ + double tol; + /** Tolerance on the QR iteration **/ + double qr_tol; + /** For IRLM/IRAM, check every nth restart **/ + int check_interval; + /** For IRLM/IRAM, quit after n restarts **/ + int max_restarts; + /** For the Ritz rotation, the maximal number of extra vectors the solver may allocate **/ + int batched_rotate; + /** For block method solvers, the block size **/ + int block_size; - //------------------------------------------------- + /** In the test function, cross check the device result against ARPACK **/ + QudaBoolean arpack_check; + /** For Arpack cross check, name of the Arpack logfile **/ + char arpack_logfile[512]; - // EIG-CG PARAMS - //------------------------------------------------- - int nk; - int np; + /** Name of the QUDA logfile (residua, upper Hessenberg/tridiag matrix updates) **/ + char QUDA_logfile[512]; - /** Whether to load eigenvectors */ - QudaBoolean import_vectors; + //------------------------------------------------- - /** The precision of the Ritz vectors */ - QudaPrecision cuda_prec_ritz; + // EIG-CG PARAMS + //------------------------------------------------- + int nk; + int np; - /** The memory type used to keep the Ritz vectors */ - QudaMemoryType mem_type_ritz; + /** Whether to load eigenvectors */ + QudaBoolean import_vectors; - /** Location where deflation should be done */ - QudaFieldLocation location; + /** The precision of the Ritz vectors */ + QudaPrecision cuda_prec_ritz; - /** Whether to run the verification checks once set up is complete */ - QudaBoolean run_verify; + /** The memory type used to keep the Ritz vectors */ + QudaMemoryType mem_type_ritz; - /** Filename prefix where to load the null-space vectors */ - 
char vec_infile[256]; + /** Location where deflation should be done */ + QudaFieldLocation location; - /** Filename prefix for where to save the null-space vectors */ - char vec_outfile[256]; + /** Whether to run the verification checks once set up is complete */ + QudaBoolean run_verify; - /** The precision with which to save the vectors */ - QudaPrecision save_prec; + /** Filename prefix where to load the null-space vectors */ + char vec_infile[256]; - /** Whether to inflate single-parity eigen-vector I/O to a full - field (e.g., enabling this is required for compatability with - MILC I/O) */ - QudaBoolean io_parity_inflate; + /** Filename prefix for where to save the null-space vectors */ + char vec_outfile[256]; - /** The Gflops rate of the eigensolver setup */ - double gflops; + /** The precision with which to save the vectors */ + QudaPrecision save_prec; - /**< The time taken by the eigensolver setup */ - double secs; + /** Whether to inflate single-parity eigen-vector I/O to a full + field (e.g., enabling this is required for compatability with + MILC I/O) */ + QudaBoolean io_parity_inflate; - /** Which external library to use in the deflation operations (Eigen) */ - QudaExtLibType extlib_type; - //------------------------------------------------- -} QudaEigParam; + /** The Gflops rate of the eigensolver setup */ + double gflops; -typedef struct QudaMultigridParam_s { + /**< The time taken by the eigensolver setup */ + double secs; - /** Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct size */ - size_t struct_size; + /** Which external library to use in the deflation operations (Eigen) */ + QudaExtLibType extlib_type; + //------------------------------------------------- + } QudaEigParam; - QudaInvertParam *invert_param; + typedef struct QudaMultigridParam_s { - QudaEigParam *eig_param[QUDA_MAX_MG_LEVEL]; + /** Size of this struct in bytes. 
Used to ensure that the host application and QUDA see the same struct size */ + size_t struct_size; - /** Number of multigrid levels */ - int n_level; + QudaInvertParam *invert_param; - /** Geometric block sizes to use on each level */ - int geo_block_size[QUDA_MAX_MG_LEVEL][QUDA_MAX_DIM]; + QudaEigParam *eig_param[QUDA_MAX_MG_LEVEL]; - /** Spin block sizes to use on each level */ - int spin_block_size[QUDA_MAX_MG_LEVEL]; + /** Number of multigrid levels */ + int n_level; - /** Number of null-space vectors to use on each level */ - int n_vec[QUDA_MAX_MG_LEVEL]; + /** Geometric block sizes to use on each level */ + int geo_block_size[QUDA_MAX_MG_LEVEL][QUDA_MAX_DIM]; - /** Precision to store the null-space vectors in (post block orthogonalization) */ - QudaPrecision precision_null[QUDA_MAX_MG_LEVEL]; + /** Spin block sizes to use on each level */ + int spin_block_size[QUDA_MAX_MG_LEVEL]; - /** Number of times to repeat Gram-Schmidt in block orthogonalization */ - int n_block_ortho[QUDA_MAX_MG_LEVEL]; + /** Number of null-space vectors to use on each level */ + int n_vec[QUDA_MAX_MG_LEVEL]; - /** Whether to do passes at block orthogonalize in fixed point for improved accuracy */ - QudaBoolean block_ortho_two_pass[QUDA_MAX_MG_LEVEL]; + /** Precision to store the null-space vectors in (post block orthogonalization) */ + QudaPrecision precision_null[QUDA_MAX_MG_LEVEL]; - /** Verbosity on each level of the multigrid */ - QudaVerbosity verbosity[QUDA_MAX_MG_LEVEL]; + /** Number of times to repeat Gram-Schmidt in block orthogonalization */ + int n_block_ortho[QUDA_MAX_MG_LEVEL]; - /** Inverter to use in the setup phase */ - QudaInverterType setup_inv_type[QUDA_MAX_MG_LEVEL]; + /** Whether to do passes at block orthogonalize in fixed point for improved accuracy */ + QudaBoolean block_ortho_two_pass[QUDA_MAX_MG_LEVEL]; - /** Number of setup iterations */ - int num_setup_iter[QUDA_MAX_MG_LEVEL]; + /** Verbosity on each level of the multigrid */ + QudaVerbosity 
verbosity[QUDA_MAX_MG_LEVEL]; - /** Tolerance to use in the setup phase */ - double setup_tol[QUDA_MAX_MG_LEVEL]; + /** Inverter to use in the setup phase */ + QudaInverterType setup_inv_type[QUDA_MAX_MG_LEVEL]; - /** Maximum number of iterations for each setup solver */ - int setup_maxiter[QUDA_MAX_MG_LEVEL]; + /** Number of setup iterations */ + int num_setup_iter[QUDA_MAX_MG_LEVEL]; - /** Maximum number of iterations for refreshing the null-space vectors */ - int setup_maxiter_refresh[QUDA_MAX_MG_LEVEL]; + /** Tolerance to use in the setup phase */ + double setup_tol[QUDA_MAX_MG_LEVEL]; - /** Basis to use for CA solver setup */ - QudaCABasis setup_ca_basis[QUDA_MAX_MG_LEVEL]; + /** Maximum number of iterations for each setup solver */ + int setup_maxiter[QUDA_MAX_MG_LEVEL]; - /** Basis size for CA solver setup */ - int setup_ca_basis_size[QUDA_MAX_MG_LEVEL]; + /** Maximum number of iterations for refreshing the null-space vectors */ + int setup_maxiter_refresh[QUDA_MAX_MG_LEVEL]; - /** Minimum eigenvalue for Chebyshev CA basis */ - double setup_ca_lambda_min[QUDA_MAX_MG_LEVEL]; + /** Basis to use for CA solver setup */ + QudaCABasis setup_ca_basis[QUDA_MAX_MG_LEVEL]; - /** Maximum eigenvalue for Chebyshev CA basis */ - double setup_ca_lambda_max[QUDA_MAX_MG_LEVEL]; + /** Basis size for CA solver setup */ + int setup_ca_basis_size[QUDA_MAX_MG_LEVEL]; - /** Null-space type to use in the setup phase */ - QudaSetupType setup_type; + /** Minimum eigenvalue for Chebyshev CA basis */ + double setup_ca_lambda_min[QUDA_MAX_MG_LEVEL]; - /** Pre orthonormalize vectors in the setup phase */ - QudaBoolean pre_orthonormalize; + /** Maximum eigenvalue for Chebyshev CA basis */ + double setup_ca_lambda_max[QUDA_MAX_MG_LEVEL]; - /** Post orthonormalize vectors in the setup phase */ - QudaBoolean post_orthonormalize; + /** Null-space type to use in the setup phase */ + QudaSetupType setup_type; - /** The solver that wraps around the coarse grid correction and smoother */ - 
QudaInverterType coarse_solver[QUDA_MAX_MG_LEVEL]; + /** Pre orthonormalize vectors in the setup phase */ + QudaBoolean pre_orthonormalize; - /** Tolerance for the solver that wraps around the coarse grid correction and smoother */ - double coarse_solver_tol[QUDA_MAX_MG_LEVEL]; + /** Post orthonormalize vectors in the setup phase */ + QudaBoolean post_orthonormalize; - /** Maximum number of iterations for the solver that wraps around the coarse grid correction and smoother */ - int coarse_solver_maxiter[QUDA_MAX_MG_LEVEL]; + /** The solver that wraps around the coarse grid correction and smoother */ + QudaInverterType coarse_solver[QUDA_MAX_MG_LEVEL]; - /** Basis to use for CA coarse solvers */ - QudaCABasis coarse_solver_ca_basis[QUDA_MAX_MG_LEVEL]; + /** Tolerance for the solver that wraps around the coarse grid correction and smoother */ + double coarse_solver_tol[QUDA_MAX_MG_LEVEL]; - /** Basis size for CA coarse solvers */ - int coarse_solver_ca_basis_size[QUDA_MAX_MG_LEVEL]; + /** Maximum number of iterations for the solver that wraps around the coarse grid correction and smoother */ + int coarse_solver_maxiter[QUDA_MAX_MG_LEVEL]; - /** Minimum eigenvalue for Chebyshev CA basis */ - double coarse_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]; + /** Basis to use for CA coarse solvers */ + QudaCABasis coarse_solver_ca_basis[QUDA_MAX_MG_LEVEL]; - /** Maximum eigenvalue for Chebyshev CA basis */ - double coarse_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL]; + /** Basis size for CA coarse solvers */ + int coarse_solver_ca_basis_size[QUDA_MAX_MG_LEVEL]; - /** Smoother to use on each level */ - QudaInverterType smoother[QUDA_MAX_MG_LEVEL]; + /** Minimum eigenvalue for Chebyshev CA basis */ + double coarse_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]; - /** Tolerance to use for the smoother / solver on each level */ - double smoother_tol[QUDA_MAX_MG_LEVEL]; + /** Maximum eigenvalue for Chebyshev CA basis */ + double coarse_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL]; - /** Number of 
pre-smoother applications on each level */ - int nu_pre[QUDA_MAX_MG_LEVEL]; + /** Smoother to use on each level */ + QudaInverterType smoother[QUDA_MAX_MG_LEVEL]; - /** Number of post-smoother applications on each level */ - int nu_post[QUDA_MAX_MG_LEVEL]; + /** Tolerance to use for the smoother / solver on each level */ + double smoother_tol[QUDA_MAX_MG_LEVEL]; - /** Basis to use for CA smoother solvers */ - QudaCABasis smoother_solver_ca_basis[QUDA_MAX_MG_LEVEL]; + /** Number of pre-smoother applications on each level */ + int nu_pre[QUDA_MAX_MG_LEVEL]; - /** Minimum eigenvalue for Chebyshev CA smoother basis */ - double smoother_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]; + /** Number of post-smoother applications on each level */ + int nu_post[QUDA_MAX_MG_LEVEL]; - /** Maximum eigenvalue for Chebyshev CA smoother basis */ - double smoother_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL]; + /** Basis to use for CA smoother solvers */ + QudaCABasis smoother_solver_ca_basis[QUDA_MAX_MG_LEVEL]; - /** Over/under relaxation factor for the smoother at each level */ - double omega[QUDA_MAX_MG_LEVEL]; + /** Minimum eigenvalue for Chebyshev CA smoother basis */ + double smoother_solver_ca_lambda_min[QUDA_MAX_MG_LEVEL]; - /** Precision to use for halo communication in the smoother */ - QudaPrecision smoother_halo_precision[QUDA_MAX_MG_LEVEL]; + /** Maximum eigenvalue for Chebyshev CA smoother basis */ + double smoother_solver_ca_lambda_max[QUDA_MAX_MG_LEVEL]; - /** Whether to use additive or multiplicative Schwarz preconditioning in the smoother */ - QudaSchwarzType smoother_schwarz_type[QUDA_MAX_MG_LEVEL]; + /** Over/under relaxation factor for the smoother at each level */ + double omega[QUDA_MAX_MG_LEVEL]; - /** Number of Schwarz cycles to apply */ - int smoother_schwarz_cycle[QUDA_MAX_MG_LEVEL]; + /** Precision to use for halo communication in the smoother */ + QudaPrecision smoother_halo_precision[QUDA_MAX_MG_LEVEL]; - /** The type of residual to send to the next coarse grid, 
and thus the - type of solution to receive back from this coarse grid */ - QudaSolutionType coarse_grid_solution_type[QUDA_MAX_MG_LEVEL]; + /** Whether to use additive or multiplicative Schwarz preconditioning in the smoother */ + QudaSchwarzType smoother_schwarz_type[QUDA_MAX_MG_LEVEL]; - /** The type of smoother solve to do on each grid (e/o preconditioning or not)*/ - QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]; + /** Number of Schwarz cycles to apply */ + int smoother_schwarz_cycle[QUDA_MAX_MG_LEVEL]; - /** The type of multigrid cycle to perform at each level */ - QudaMultigridCycleType cycle_type[QUDA_MAX_MG_LEVEL]; + /** The type of residual to send to the next coarse grid, and thus the + type of solution to receive back from this coarse grid */ + QudaSolutionType coarse_grid_solution_type[QUDA_MAX_MG_LEVEL]; - /** Whether to use global reductions or not for the smoother / solver at each level */ - QudaBoolean global_reduction[QUDA_MAX_MG_LEVEL]; + /** The type of smoother solve to do on each grid (e/o preconditioning or not)*/ + QudaSolveType smoother_solve_type[QUDA_MAX_MG_LEVEL]; - /** Location where each level should be done */ - QudaFieldLocation location[QUDA_MAX_MG_LEVEL]; + /** The type of multigrid cycle to perform at each level */ + QudaMultigridCycleType cycle_type[QUDA_MAX_MG_LEVEL]; - /** Location where the coarse-operator construction will be computedn */ - QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]; + /** Whether to use global reductions or not for the smoother / solver at each level */ + QudaBoolean global_reduction[QUDA_MAX_MG_LEVEL]; - /** Whether to use eigenvectors for the nullspace or, if the coarsest instance deflate*/ - QudaBoolean use_eig_solver[QUDA_MAX_MG_LEVEL]; + /** Location where each level should be done */ + QudaFieldLocation location[QUDA_MAX_MG_LEVEL]; - /** Minimize device memory allocations during the adaptive setup, - placing temporary fields in mapped memory instad of device - memory */ - QudaBoolean 
setup_minimize_memory; + /** Location where the coarse-operator construction will be computedn */ + QudaFieldLocation setup_location[QUDA_MAX_MG_LEVEL]; - /** Whether to compute the null vectors or reload them */ - QudaComputeNullVector compute_null_vector; + /** Whether to use eigenvectors for the nullspace or, if the coarsest instance deflate*/ + QudaBoolean use_eig_solver[QUDA_MAX_MG_LEVEL]; - /** Whether to generate on all levels or just on level 0 */ - QudaBoolean generate_all_levels; + /** Minimize device memory allocations during the adaptive setup, + placing temporary fields in mapped memory instad of device + memory */ + QudaBoolean setup_minimize_memory; - /** Whether to run the verification checks once set up is complete */ - QudaBoolean run_verify; + /** Whether to compute the null vectors or reload them */ + QudaComputeNullVector compute_null_vector; - /** Whether to run null Vs eigen vector overlap checks once set up is complete */ - QudaBoolean run_low_mode_check; + /** Whether to generate on all levels or just on level 0 */ + QudaBoolean generate_all_levels; - /** Whether to run null vector oblique checks once set up is complete */ - QudaBoolean run_oblique_proj_check; + /** Whether to run the verification checks once set up is complete */ + QudaBoolean run_verify; - /** Whether to load the null-space vectors to disk (requires QIO) */ - QudaBoolean vec_load[QUDA_MAX_MG_LEVEL]; + /** Whether to run null Vs eigen vector overlap checks once set up is complete */ + QudaBoolean run_low_mode_check; - /** Filename prefix where to load the null-space vectors */ - char vec_infile[QUDA_MAX_MG_LEVEL][256]; + /** Whether to run null vector oblique checks once set up is complete */ + QudaBoolean run_oblique_proj_check; - /** Whether to store the null-space vectors to disk (requires QIO) */ - QudaBoolean vec_store[QUDA_MAX_MG_LEVEL]; + /** Whether to load the null-space vectors to disk (requires QIO) */ + QudaBoolean vec_load[QUDA_MAX_MG_LEVEL]; - /** Filename 
prefix for where to save the null-space vectors */ - char vec_outfile[QUDA_MAX_MG_LEVEL][256]; + /** Filename prefix where to load the null-space vectors */ + char vec_infile[QUDA_MAX_MG_LEVEL][256]; - /** Whether to use and initial guess during coarse grid deflation */ - QudaBoolean coarse_guess; + /** Whether to store the null-space vectors to disk (requires QIO) */ + QudaBoolean vec_store[QUDA_MAX_MG_LEVEL]; - /** Whether to preserve the deflation space during MG update */ - QudaBoolean preserve_deflation; + /** Filename prefix for where to save the null-space vectors */ + char vec_outfile[QUDA_MAX_MG_LEVEL][256]; - /** The Gflops rate of the multigrid solver setup */ - double gflops; + /** Whether to use and initial guess during coarse grid deflation */ + QudaBoolean coarse_guess; - /**< The time taken by the multigrid solver setup */ - double secs; - - /** Multiplicative factor for the mu parameter */ - double mu_factor[QUDA_MAX_MG_LEVEL]; - - /** Boolean for aggregation type, implies staggered or not */ - QudaTransferType transfer_type[QUDA_MAX_MG_LEVEL]; - - /** Whether or not to let MG coarsening drop improvements, for ex dropping long links in small aggregation dimensions */ - QudaBoolean allow_truncation; - - /** Whether or not to use the dagger approximation for the KD preconditioned operator */ - QudaBoolean staggered_kd_dagger_approximation; - - /** Whether to use tensor cores (if available) */ - QudaBoolean use_mma; - - /** Whether to do a full (false) or thin (true) update in the context of updateMultigridQuda */ - QudaBoolean thin_update_only; -} QudaMultigridParam; - -typedef struct QudaGaugeObservableParam_s { - size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and QUDA see the same struct*/ - QudaBoolean su_project; /**< Whether to project onto the manifold prior to measurement */ - QudaBoolean compute_plaquette; /**< Whether to compute the plaquette */ - double plaquette[3]; /**< Total, spatial and temporal field energies, respectively */ - QudaBoolean compute_polyakov_loop; /**< Whether to compute the temporal Polyakov loop */ - double ploop[2]; /**< Real and imaginary part of temporal Polyakov loop */ - QudaBoolean compute_gauge_loop_trace; /**< Whether to compute gauge loop traces */ - double_complex *traces; /**< Individual complex traces of each loop */ - int **input_path_buff; /**< Array of paths */ - int *path_length; /**< Length of each path */ - double *loop_coeff; /**< Multiplicative factor for each loop */ - int num_paths; /**< Total number of paths */ - int max_length; /**< Maximum length of any path */ - double factor; /**< Global multiplicative factor to apply to each loop trace */ - QudaBoolean compute_qcharge; /**< Whether to compute the topological charge and field energy */ - double qcharge; /**< Computed topological charge */ - double energy[3]; /**< Total, spatial and temporal field energies, respectively */ - QudaBoolean compute_qcharge_density; /**< Whether to compute the topological charge density */ - void *qcharge_density; /**< Pointer to host array of length volume where the q-charge density will be copied */ - QudaBoolean - remove_staggered_phase; /**< Whether or not the resident gauge field has staggered phases applied and if they should - be removed; this was needed for the Polyakov loop calculation when called through MILC, - with the underlying issue documented https://github.com/lattice/quda/issues/1315 */ -} QudaGaugeObservableParam; - -typedef struct QudaGaugeSmearParam_s { - size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and QUDA see the same struct*/ - unsigned int n_steps; /**< The total number of smearing steps to perform. */ - double epsilon; /**< Serves as one of the coefficients in Over Improved Stout smearing, or as the step size in - Wilson/Symanzik flow */ - double alpha; /**< The single coefficient used in APE smearing */ - double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */ - unsigned int meas_interval; /**< Perform the requested measurements on the gauge field at this interval */ - QudaGaugeSmearType smear_type; /**< The smearing type to perform */ -} QudaGaugeSmearParam; - -typedef struct QudaBLASParam_s { - size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct*/ - - QudaBLASType blas_type; /**< Type of BLAS computation to perfrom */ - - // GEMM params - QudaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ - QudaBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ - int m; /**< number of rows of matrix op(A) and C. */ - int n; /**< number of columns of matrix op(B) and C. */ - int k; /**< number of columns of op(A) and rows of op(B). */ - int lda; /**< leading dimension of two-dimensional array used to store the matrix A. */ - int ldb; /**< leading dimension of two-dimensional array used to store matrix B. */ - int ldc; /**< leading dimension of two-dimensional array used to store matrix C. */ - int a_offset; /**< position of the A array from which begin read/write. */ - int b_offset; /**< position of the B array from which begin read/write. */ - int c_offset; /**< position of the C array from which begin read/write. 
*/ - int a_stride; /**< stride of the A array in strided(batched) mode */ - int b_stride; /**< stride of the B array in strided(batched) mode */ - int c_stride; /**< stride of the C array in strided(batched) mode */ - double_complex alpha; /**< scalar used for multiplication. */ - double_complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ - - // LU inversion params - int inv_mat_size; /**< The rank of the square matrix in the LU inversion */ - - // Common params - int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ - QudaBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ - QudaBLASDataOrder data_order; /**< Specifies if using Row or Column major */ - -} QudaBLASParam; - -/* - * Interface functions, found in interface_quda.cpp - */ + /** Whether to preserve the deflation space during MG update */ + QudaBoolean preserve_deflation; -/** - * Set parameters related to status reporting. - * - * In typical usage, this function will be called once (or not at - * all) just before the call to initQuda(), but it's valid to call - * it any number of times at any point during execution. Prior to - * the first time it's called, the parameters take default values - * as indicated below. - * - * @param verbosity Default verbosity, ranging from QUDA_SILENT to - * QUDA_DEBUG_VERBOSE. Within a solver, this - * parameter is overridden by the "verbosity" - * member of QudaInvertParam. The default value - * is QUDA_SUMMARIZE. - * - * @param prefix String to prepend to all messages from QUDA. This - * defaults to the empty string (""), but you may - * wish to specify something like "QUDA: " to - * distinguish QUDA's output from that of your - * application. - * - * @param outfile File pointer (such as stdout, stderr, or a handle - * returned by fopen()) where messages should be - * printed. The default is stdout. 
- */ -void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfile); + /** The Gflops rate of the multigrid solver setup */ + double gflops; -/** - * initCommsGridQuda() takes an optional "rank_from_coords" argument that - * should be a pointer to a user-defined function with this prototype. - * - * @param coords Node coordinates - * @param fdata Any auxiliary data needed by the function - * @return MPI rank or QMP node ID cooresponding to the node coordinates - * - * @see initCommsGridQuda - */ -typedef int (*QudaCommsMap)(const int *coords, void *fdata); + /**< The time taken by the multigrid solver setup */ + double secs; -/** - * @param mycomm User provided MPI communicator in place of MPI_COMM_WORLD - */ + /** Multiplicative factor for the mu parameter */ + double mu_factor[QUDA_MAX_MG_LEVEL]; -void qudaSetCommHandle(void *mycomm); + /** Boolean for aggregation type, implies staggered or not */ + QudaTransferType transfer_type[QUDA_MAX_MG_LEVEL]; + + /** Whether or not to let MG coarsening drop improvements, for ex dropping long links in small aggregation dimensions */ + QudaBoolean allow_truncation; + + /** Whether or not to use the dagger approximation for the KD preconditioned operator */ + QudaBoolean staggered_kd_dagger_approximation; + + /** Whether to use tensor cores (if available) */ + QudaBoolean use_mma; + + /** Whether to do a full (false) or thin (true) update in the context of updateMultigridQuda */ + QudaBoolean thin_update_only; + } QudaMultigridParam; + + typedef struct QudaGaugeObservableParam_s { + size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and QUDA see the same struct*/ + QudaBoolean su_project; /**< Whether to project onto the manifold prior to measurement */ + QudaBoolean compute_plaquette; /**< Whether to compute the plaquette */ + double plaquette[3]; /**< Total, spatial and temporal field energies, respectively */ + QudaBoolean compute_polyakov_loop; /**< Whether to compute the temporal Polyakov loop */ + double ploop[2]; /**< Real and imaginary part of temporal Polyakov loop */ + QudaBoolean compute_gauge_loop_trace; /**< Whether to compute gauge loop traces */ + double_complex *traces; /**< Individual complex traces of each loop */ + int **input_path_buff; /**< Array of paths */ + int *path_length; /**< Length of each path */ + double *loop_coeff; /**< Multiplicative factor for each loop */ + int num_paths; /**< Total number of paths */ + int max_length; /**< Maximum length of any path */ + double factor; /**< Global multiplicative factor to apply to each loop trace */ + QudaBoolean compute_qcharge; /**< Whether to compute the topological charge and field energy */ + double qcharge; /**< Computed topological charge */ + double energy[3]; /**< Total, spatial and temporal field energies, respectively */ + QudaBoolean compute_qcharge_density; /**< Whether to compute the topological charge density */ + void *qcharge_density; /**< Pointer to host array of length volume where the q-charge density will be copied */ + QudaBoolean + remove_staggered_phase; /**< Whether or not the resident gauge field has staggered phases applied and if they should + be removed; this was needed for the Polyakov loop calculation when called through MILC, + with the underlying issue documented https://github.com/lattice/quda/issues/1315 */ + } QudaGaugeObservableParam; + + typedef struct QudaGaugeSmearParam_s { + size_t struct_size; /**< Size of this struct in bytes. 
Used to ensure that the host application and QUDA see the same struct*/ + unsigned int n_steps; /**< The total number of smearing steps to perform. */ + double epsilon; /**< Serves as one of the coefficients in Over Improved Stout smearing, or as the step size in + Wilson/Symanzik flow */ + double alpha; /**< The single coefficient used in APE smearing */ + double rho; /**< Serves as one of the coefficients used in Over Improved Stout smearing, or as the single coefficient used in Stout */ + unsigned int meas_interval; /**< Perform the requested measurements on the gauge field at this interval */ + QudaGaugeSmearType smear_type; /**< The smearing type to perform */ + } QudaGaugeSmearParam; + + typedef struct QudaBLASParam_s { + size_t struct_size; /**< Size of this struct in bytes. Used to ensure that the host application and QUDA see the same struct*/ + + QudaBLASType blas_type; /**< Type of BLAS computation to perfrom */ + + // GEMM params + QudaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ + QudaBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ + int m; /**< number of rows of matrix op(A) and C. */ + int n; /**< number of columns of matrix op(B) and C. */ + int k; /**< number of columns of op(A) and rows of op(B). */ + int lda; /**< leading dimension of two-dimensional array used to store the matrix A. */ + int ldb; /**< leading dimension of two-dimensional array used to store matrix B. */ + int ldc; /**< leading dimension of two-dimensional array used to store matrix C. */ + int a_offset; /**< position of the A array from which begin read/write. */ + int b_offset; /**< position of the B array from which begin read/write. */ + int c_offset; /**< position of the C array from which begin read/write. 
*/ + int a_stride; /**< stride of the A array in strided(batched) mode */ + int b_stride; /**< stride of the B array in strided(batched) mode */ + int c_stride; /**< stride of the C array in strided(batched) mode */ + double_complex alpha; /**< scalar used for multiplication. */ + double_complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ + + // LU inversion params + int inv_mat_size; /**< The rank of the square matrix in the LU inversion */ + + // Common params + int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. */ + QudaBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ + QudaBLASDataOrder data_order; /**< Specifies if using Row or Column major */ + + } QudaBLASParam; -/** - * Declare the grid mapping ("logical topology" in QMP parlance) - * used for communications in a multi-GPU grid. This function - * should be called prior to initQuda(). The only case in which - * it's optional is when QMP is used for communication and the - * logical topology has already been declared by the application. - * - * @param nDim Number of grid dimensions. "4" is the only supported - * value currently. - * - * @param dims Array of grid dimensions. dims[0]*dims[1]*dims[2]*dims[3] - * must equal the total number of MPI ranks or QMP nodes. - * - * @param func Pointer to a user-supplied function that maps coordinates - * in the communication grid to MPI ranks (or QMP node IDs). - * If the pointer is NULL, the default mapping depends on - * whether QMP or MPI is being used for communication. With - * QMP, the existing logical topology is used if it's been - * declared. With MPI or as a fallback with QMP, the default - * ordering is lexicographical with the fourth ("t") index - * varying fastest. 
- * - * @param fdata Pointer to any data required by "func" (may be NULL) - * - * @see QudaCommsMap - */ + /* + * Interface functions, found in interface_quda.cpp + */ -void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata); + /** + * Set parameters related to status reporting. + * + * In typical usage, this function will be called once (or not at + * all) just before the call to initQuda(), but it's valid to call + * it any number of times at any point during execution. Prior to + * the first time it's called, the parameters take default values + * as indicated below. + * + * @param verbosity Default verbosity, ranging from QUDA_SILENT to + * QUDA_DEBUG_VERBOSE. Within a solver, this + * parameter is overridden by the "verbosity" + * member of QudaInvertParam. The default value + * is QUDA_SUMMARIZE. + * + * @param prefix String to prepend to all messages from QUDA. This + * defaults to the empty string (""), but you may + * wish to specify something like "QUDA: " to + * distinguish QUDA's output from that of your + * application. + * + * @param outfile File pointer (such as stdout, stderr, or a handle + * returned by fopen()) where messages should be + * printed. The default is stdout. + */ + void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], + FILE *outfile); -/** - * Initialize the library. This is a low-level interface that is - * called by initQuda. Calling initQudaDevice requires that the - * user also call initQudaMemory before using QUDA. - * - * @param device CUDA device number to use. In a multi-GPU build, - * this parameter may either be set explicitly on a - * per-process basis or set to -1 to enable a default - * allocation of devices to processes. - */ -void initQudaDevice(int device); + /** + * initCommsGridQuda() takes an optional "rank_from_coords" argument that + * should be a pointer to a user-defined function with this prototype. 
+ * + * @param coords Node coordinates + * @param fdata Any auxiliary data needed by the function + * @return MPI rank or QMP node ID cooresponding to the node coordinates + * + * @see initCommsGridQuda + */ + typedef int (*QudaCommsMap)(const int *coords, void *fdata); -/** - * Initialize the library persistant memory allocations (both host - * and device). This is a low-level interface that is called by - * initQuda. Calling initQudaMemory requires that the user has - * previously called initQudaDevice. - */ -void initQudaMemory(); + /** + * @param mycomm User provided MPI communicator in place of MPI_COMM_WORLD + */ -/** - * Initialize the library. This function is actually a wrapper - * around calls to initQudaDevice() and initQudaMemory(). - * - * @param device CUDA device number to use. In a multi-GPU build, - * this parameter may either be set explicitly on a - * per-process basis or set to -1 to enable a default - * allocation of devices to processes. - */ -void initQuda(int device); + void qudaSetCommHandle(void *mycomm); -/** - * Finalize the library. - */ -void endQuda(void); + /** + * Declare the grid mapping ("logical topology" in QMP parlance) + * used for communications in a multi-GPU grid. This function + * should be called prior to initQuda(). The only case in which + * it's optional is when QMP is used for communication and the + * logical topology has already been declared by the application. + * + * @param nDim Number of grid dimensions. "4" is the only supported + * value currently. + * + * @param dims Array of grid dimensions. dims[0]*dims[1]*dims[2]*dims[3] + * must equal the total number of MPI ranks or QMP nodes. + * + * @param func Pointer to a user-supplied function that maps coordinates + * in the communication grid to MPI ranks (or QMP node IDs). + * If the pointer is NULL, the default mapping depends on + * whether QMP or MPI is being used for communication. With + * QMP, the existing logical topology is used if it's been + * declared. 
With MPI or as a fallback with QMP, the default + * ordering is lexicographical with the fourth ("t") index + * varying fastest. + * + * @param fdata Pointer to any data required by "func" (may be NULL) + * + * @see QudaCommsMap + */ -/** - * @brief update the radius for halos. - * @details This should only be needed for automated testing when - * different partitioning is applied within a single run. - */ -void updateR(); + void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata); -/** - * A new QudaGaugeParam should always be initialized immediately - * after it's defined (and prior to explicitly setting its members) - * using this function. Typical usage is as follows: - * - * QudaGaugeParam gauge_param = newQudaGaugeParam(); - */ -QudaGaugeParam newQudaGaugeParam(void); + /** + * Initialize the library. This is a low-level interface that is + * called by initQuda. Calling initQudaDevice requires that the + * user also call initQudaMemory before using QUDA. + * + * @param device CUDA device number to use. In a multi-GPU build, + * this parameter may either be set explicitly on a + * per-process basis or set to -1 to enable a default + * allocation of devices to processes. + */ + void initQudaDevice(int device); -/** - * A new QudaInvertParam should always be initialized immediately - * after it's defined (and prior to explicitly setting its members) - * using this function. Typical usage is as follows: - * - * QudaInvertParam invert_param = newQudaInvertParam(); - */ -QudaInvertParam newQudaInvertParam(void); + /** + * Initialize the library persistent memory allocations (both host + * and device). This is a low-level interface that is called by + * initQuda. Calling initQudaMemory requires that the user has + * previously called initQudaDevice.
+ */ + void initQudaMemory(); -/** - * A new QudaMultigridParam should always be initialized immediately - * after it's defined (and prior to explicitly setting its members) - * using this function. Typical usage is as follows: - * - * QudaMultigridParam mg_param = newQudaMultigridParam(); - */ -QudaMultigridParam newQudaMultigridParam(void); + /** + * Initialize the library. This function is actually a wrapper + * around calls to initQudaDevice() and initQudaMemory(). + * + * @param device CUDA device number to use. In a multi-GPU build, + * this parameter may either be set explicitly on a + * per-process basis or set to -1 to enable a default + * allocation of devices to processes. + */ + void initQuda(int device); -/** - * A new QudaEigParam should always be initialized immediately - * after it's defined (and prior to explicitly setting its members) - * using this function. Typical usage is as follows: - * - * QudaEigParam eig_param = newQudaEigParam(); - */ -QudaEigParam newQudaEigParam(void); + /** + * Finalize the library. + */ + void endQuda(void); -/** - * A new QudaGaugeObservableParam should always be initialized - * immediately after it's defined (and prior to explicitly setting - * its members) using this function. Typical usage is as follows: - * - * QudaGaugeObservalbeParam obs_param = newQudaGaugeObservableParam(); - */ -QudaGaugeObservableParam newQudaGaugeObservableParam(void); + /** + * @brief update the radius for halos. + * @details This should only be needed for automated testing when + * different partitioning is applied within a single run. + */ + void updateR(); -/** - * A new QudaGaugeSmearParam should always be initialized - * immediately after it's defined (and prior to explicitly setting - * its members) using this function. 
Typical usage is as follows: - * - * QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam(); - */ -QudaGaugeSmearParam newQudaGaugeSmearParam(void); + /** + * A new QudaGaugeParam should always be initialized immediately + * after it's defined (and prior to explicitly setting its members) + * using this function. Typical usage is as follows: + * + * QudaGaugeParam gauge_param = newQudaGaugeParam(); + */ + QudaGaugeParam newQudaGaugeParam(void); -/** - * A new QudaBLASParam should always be initialized immediately - * after it's defined (and prior to explicitly setting its members) - * using this function. Typical usage is as follows: - * - * QudaBLASParam blas_param = newQudaBLASParam(); - */ -QudaBLASParam newQudaBLASParam(void); + /** + * A new QudaInvertParam should always be initialized immediately + * after it's defined (and prior to explicitly setting its members) + * using this function. Typical usage is as follows: + * + * QudaInvertParam invert_param = newQudaInvertParam(); + */ + QudaInvertParam newQudaInvertParam(void); -/** - * Print the members of QudaGaugeParam. - * @param param The QudaGaugeParam whose elements we are to print. - */ -void printQudaGaugeParam(QudaGaugeParam *param); + /** + * A new QudaMultigridParam should always be initialized immediately + * after it's defined (and prior to explicitly setting its members) + * using this function. Typical usage is as follows: + * + * QudaMultigridParam mg_param = newQudaMultigridParam(); + */ + QudaMultigridParam newQudaMultigridParam(void); -/** - * Print the members of QudaInvertParam. - * @param param The QudaInvertParam whose elements we are to print. - */ -void printQudaInvertParam(QudaInvertParam *param); + /** + * A new QudaEigParam should always be initialized immediately + * after it's defined (and prior to explicitly setting its members) + * using this function. 
Typical usage is as follows: + * + * QudaEigParam eig_param = newQudaEigParam(); + */ + QudaEigParam newQudaEigParam(void); -/** - * Print the members of QudaMultigridParam. - * @param param The QudaMultigridParam whose elements we are to print. - */ -void printQudaMultigridParam(QudaMultigridParam *param); + /** + * A new QudaGaugeObservableParam should always be initialized + * immediately after it's defined (and prior to explicitly setting + * its members) using this function. Typical usage is as follows: + * + * QudaGaugeObservableParam obs_param = newQudaGaugeObservableParam(); + */ + QudaGaugeObservableParam newQudaGaugeObservableParam(void); -/** - * Print the members of QudaEigParam. - * @param param The QudaEigParam whose elements we are to print. - */ -void printQudaEigParam(QudaEigParam *param); + /** + * A new QudaGaugeSmearParam should always be initialized + * immediately after it's defined (and prior to explicitly setting + * its members) using this function. Typical usage is as follows: + * + * QudaGaugeSmearParam smear_param = newQudaGaugeSmearParam(); + */ + QudaGaugeSmearParam newQudaGaugeSmearParam(void); -/** - * Print the members of QudaGaugeObservableParam. - * @param param The QudaGaugeObservableParam whose elements we are to print. - */ -void printQudaGaugeObservableParam(QudaGaugeObservableParam *param); + /** + * A new QudaBLASParam should always be initialized immediately + * after it's defined (and prior to explicitly setting its members) + * using this function. Typical usage is as follows: + * + * QudaBLASParam blas_param = newQudaBLASParam(); + */ + QudaBLASParam newQudaBLASParam(void); -/** - * Print the members of QudaBLASParam. - * @param param The QudaBLASParam whose elements we are to print. - */ -void printQudaBLASParam(QudaBLASParam *param); + /** + * Print the members of QudaGaugeParam. + * @param param The QudaGaugeParam whose elements we are to print.
+ */ + void printQudaGaugeParam(QudaGaugeParam *param); -/** - * Load the gauge field from the host. - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param param Contains all metadata regarding host and device storage - */ -void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param); + /** + * Print the members of QudaInvertParam. + * @param param The QudaInvertParam whose elements we are to print. + */ + void printQudaInvertParam(QudaInvertParam *param); -/** - * Free QUDA's internal copy of the gauge field. - */ -void freeGaugeQuda(void); + /** + * Print the members of QudaMultigridParam. + * @param param The QudaMultigridParam whose elements we are to print. + */ + void printQudaMultigridParam(QudaMultigridParam *param); -/** - * Save the gauge field to the host. - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param param Contains all metadata regarding host and device storage - */ -void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param); + /** + * Print the members of QudaEigParam. + * @param param The QudaEigParam whose elements we are to print. + */ + void printQudaEigParam(QudaEigParam *param); -/** - * Load the clover term and/or the clover inverse from the host. - * Either h_clover or h_clovinv may be set to NULL. - * @param h_clover Base pointer to host clover field - * @param h_cloverinv Base pointer to host clover inverse field - * @param inv_param Contains all metadata regarding host and device storage - */ -void loadCloverQuda(void *h_clover, void *h_clovinv, QudaInvertParam *inv_param); + /** + * Print the members of QudaGaugeObservableParam. + * @param param The QudaGaugeObservableParam whose elements we are to print. + */ + void printQudaGaugeObservableParam(QudaGaugeObservableParam *param); -/** - * Free QUDA's internal copy of the clover term and/or clover inverse. - */ -void freeCloverQuda(void); + /** + * Print the members of QudaBLASParam. 
+ * @param param The QudaBLASParam whose elements we are to print. + */ + void printQudaBLASParam(QudaBLASParam *param); -/** - * Perform the solve, according to the parameters set in param. It - * is assumed that the gauge field has already been loaded via - * loadGaugeQuda(). - * @param h_x Solution spinor field - * @param h_b Source spinor field - * @param param Contains all metadata regarding host and device - * storage and solver parameters - */ -void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V, void *hp_alpha, void *hp_beta, - QudaEigParam *eig_param); + /** + * Load the gauge field from the host. + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param param Contains all metadata regarding host and device storage + */ + void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param); -/** - * Perform the eigensolve. The problem matrix is defined by the invert param, the - * mode of solution is specified by the eig param. It is assumed that the gauge - * field has already been loaded via loadGaugeQuda(). - * @param h_evecs Array of pointers to application eigenvectors - * @param h_evals Host side eigenvalues - * @param param Contains all metadata regarding the type of solve. - */ -void eigensolveQuda(void **h_evecs, double_complex *h_evals, QudaEigParam *param); + /** + * Free QUDA's internal copy of the gauge field. + */ + void freeGaugeQuda(void); -/** - * Perform the solve, according to the parameters set in param. It - * is assumed that the gauge field has already been loaded via - * loadGaugeQuda(). - * @param h_x Solution spinor field - * @param h_b Source spinor field - * @param param Contains all metadata regarding host and device - * storage and solver parameters - */ -void invertQuda(void *h_x, void *h_b, QudaInvertParam *param); + /** + * Save the gauge field to the host. 
+ * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param param Contains all metadata regarding host and device storage + */ + void saveGaugeQuda(void *h_gauge, QudaGaugeParam *param); -/** - * @brief Perform the solve like @invertQuda but for multiple rhs by spliting the comm grid into - * sub-partitions: each sub-partition invert one or more rhs'. - * The QudaInvertParam object specifies how the solve should be performed on each sub-partition. - * Unlike @invertQuda, the interface also takes the host side gauge as input. The gauge pointer and - * gauge_param are used if for inv_param split_grid[0] * split_grid[1] * split_grid[2] * split_grid[3] - * is larger than 1, in which case gauge field is not required to be loaded beforehand; otherwise - * this interface would just work as @invertQuda, which requires gauge field to be loaded beforehand, - * and the gauge field pointer and gauge_param are not used. - * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - */ -void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param); + /** + * Load the clover term and/or the clover inverse from the host. + * Either h_clover or h_clovinv may be set to NULL. 
+ * @param h_clover Base pointer to host clover field + * @param h_cloverinv Base pointer to host clover inverse field + * @param inv_param Contains all metadata regarding host and device storage + */ + void loadCloverQuda(void *h_clover, void *h_clovinv, + QudaInvertParam *inv_param); -/** - * @brief Really the same with @invertMultiSrcQuda but for staggered-style fermions, by accepting pointers - * to fat links and long links. - * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param milc_fatlinks Base pointer to host **fat** gauge field (regardless of dimensionality) - * @param milc_longlinks Base pointer to host **long** gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - */ -void invertMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *milc_fatlinks, - void *milc_longlinks, QudaGaugeParam *gauge_param); + /** + * Free QUDA's internal copy of the clover term and/or clover inverse. + */ + void freeCloverQuda(void); -/** - * @brief Really the same with @invertMultiSrcQuda but for clover-style fermions, by accepting pointers - * to direct and inverse clover field pointers. 
- * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - * @param h_clover Base pointer to the direct clover field - * @param h_clovinv Base pointer to the inverse clover field - */ -void invertMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, - QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv); + /** + * Perform the solve, according to the parameters set in param. It + * is assumed that the gauge field has already been loaded via + * loadGaugeQuda(). + * @param h_x Solution spinor field + * @param h_b Source spinor field + * @param param Contains all metadata regarding host and device + * storage and solver parameters + */ + void lanczosQuda(int k0, int m, void *hp_Apsi, void *hp_r, void *hp_V, void *hp_alpha, void *hp_beta, + QudaEigParam *eig_param); -/** - * Solve for multiple shifts (e.g., masses). - * @param _hp_x Array of solution spinor fields - * @param _hp_b Source spinor fields - * @param param Contains all metadata regarding host and device - * storage and solver parameters - */ -void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param); + /** + * Perform the eigensolve. The problem matrix is defined by the invert param, the + * mode of solution is specified by the eig param. It is assumed that the gauge + * field has already been loaded via loadGaugeQuda(). + * @param h_evecs Array of pointers to application eigenvectors + * @param h_evals Host side eigenvalues + * @param param Contains all metadata regarding the type of solve. 
+ */ + void eigensolveQuda(void **h_evecs, double_complex *h_evals, QudaEigParam *param); -/** - * Setup the multigrid solver, according to the parameters set in param. It - * is assumed that the gauge field has already been loaded via - * loadGaugeQuda(). - * @param param Contains all metadata regarding host and device - * storage and solver parameters - */ -void *newMultigridQuda(QudaMultigridParam *param); + /** + * Perform the solve, according to the parameters set in param. It + * is assumed that the gauge field has already been loaded via + * loadGaugeQuda(). + * @param h_x Solution spinor field + * @param h_b Source spinor field + * @param param Contains all metadata regarding host and device + * storage and solver parameters + */ + void invertQuda(void *h_x, void *h_b, QudaInvertParam *param); -/** - * @brief Free resources allocated by the multigrid solver - * @param mg_instance Pointer to instance of multigrid_solver - * @param param Contains all metadata regarding host and device - * storage and solver parameters - */ -void destroyMultigridQuda(void *mg_instance); + /** + * @brief Perform the solve like @invertQuda but for multiple rhs by spliting the comm grid into + * sub-partitions: each sub-partition invert one or more rhs'. + * The QudaInvertParam object specifies how the solve should be performed on each sub-partition. + * Unlike @invertQuda, the interface also takes the host side gauge as input. The gauge pointer and + * gauge_param are used if for inv_param split_grid[0] * split_grid[1] * split_grid[2] * split_grid[3] + * is larger than 1, in which case gauge field is not required to be loaded beforehand; otherwise + * this interface would just work as @invertQuda, which requires gauge field to be loaded beforehand, + * and the gauge field pointer and gauge_param are not used. 
+ * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + */ + void invertMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, QudaGaugeParam *gauge_param); -/** - * @brief Updates the multigrid preconditioner for the new gauge / clover field - * @param mg_instance Pointer to instance of multigrid_solver - * @param param Contains all metadata regarding host and device - * storage and solver parameters, of note contains a flag specifying whether - * to do a full update or a thin update. - */ -void updateMultigridQuda(void *mg_instance, QudaMultigridParam *param); + /** + * @brief Really the same with @invertMultiSrcQuda but for staggered-style fermions, by accepting pointers + * to fat links and long links. 
+ * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param milc_fatlinks Base pointer to host **fat** gauge field (regardless of dimensionality) + * @param milc_longlinks Base pointer to host **long** gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + */ + void invertMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *milc_fatlinks, + void *milc_longlinks, QudaGaugeParam *gauge_param); -/** - * @brief Dump the null-space vectors to disk - * @param[in] mg_instance Pointer to the instance of multigrid_solver - * @param[in] param Contains all metadata regarding host and device - * storage and solver parameters (QudaMultigridParam::vec_outfile - * sets the output filename prefix). - */ -void dumpMultigridQuda(void *mg_instance, QudaMultigridParam *param); + /** + * @brief Really the same with @invertMultiSrcQuda but for clover-style fermions, by accepting pointers + * to direct and inverse clover field pointers. + * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + * @param h_clover Base pointer to the direct clover field + * @param h_clovinv Base pointer to the inverse clover field + */ + void invertMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, void *h_gauge, + QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv); -/** - * Apply the Dslash operator (D_{eo} or D_{oe}). 
- * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage - * @param parity The destination parity of the field - */ -void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); + /** + * Solve for multiple shifts (e.g., masses). + * @param _hp_x Array of solution spinor fields + * @param _hp_b Source spinor fields + * @param param Contains all metadata regarding host and device + * storage and solver parameters + */ + void invertMultiShiftQuda(void **_hp_x, void *_hp_b, QudaInvertParam *param); -/** - * Apply the Dslash operator (D_{eo} or D_{oe}). - * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage - * @param parity The destination parity of the field - */ -void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); + /** + * Setup the multigrid solver, according to the parameters set in param. It + * is assumed that the gauge field has already been loaded via + * loadGaugeQuda(). + * @param param Contains all metadata regarding host and device + * storage and solver parameters + */ + void* newMultigridQuda(QudaMultigridParam *param); -/** - * @brief Perform the solve like @dslashQuda but for multiple rhs by spliting the comm grid into - * sub-partitions: each sub-partition does one or more rhs'. - * The QudaInvertParam object specifies how the solve should be performed on each sub-partition. - * Unlike @invertQuda, the interface also takes the host side gauge as - * input - gauge field is not required to be loaded beforehand. 
- * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param parity Parity to apply dslash on - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - */ -void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, - QudaGaugeParam *gauge_param); -/** - * @brief Really the same with @dslashMultiSrcQuda but for staggered-style fermions, by accepting pointers - * to fat links and long links. - * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param parity Parity to apply dslash on - * @param milc_fatlinks Base pointer to host **fat** gauge field (regardless of dimensionality) - * @param milc_longlinks Base pointer to host **long** gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - */ + /** + * @brief Free resources allocated by the multigrid solver + * @param mg_instance Pointer to instance of multigrid_solver + * @param param Contains all metadata regarding host and device + * storage and solver parameters + */ + void destroyMultigridQuda(void *mg_instance); -void dslashMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, - void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param); + /** + * @brief Updates the multigrid preconditioner for the new gauge / clover field + * @param mg_instance Pointer to instance of multigrid_solver + * @param param Contains all metadata regarding host and device + * storage and solver parameters, of note contains a flag specifying whether + * to 
do a full update or a thin update. + */ + void updateMultigridQuda(void *mg_instance, QudaMultigridParam *param); -/** - * @brief Really the same with @dslashMultiSrcQuda but for clover-style fermions, by accepting pointers - * to direct and inverse clover field pointers. - * @param _hp_x Array of solution spinor fields - * @param _hp_b Array of source spinor fields - * @param param Contains all metadata regarding host and device storage and solver parameters - * @param parity Parity to apply dslash on - * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) - * @param gauge_param Contains all metadata regarding host and device storage for gauge field - * @param h_clover Base pointer to the direct clover field - * @param h_clovinv Base pointer to the inverse clover field - */ -void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, - QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv); + /** + * @brief Dump the null-space vectors to disk + * @param[in] mg_instance Pointer to the instance of multigrid_solver + * @param[in] param Contains all metadata regarding host and device + * storage and solver parameters (QudaMultigridParam::vec_outfile + * sets the output filename prefix). + */ + void dumpMultigridQuda(void *mg_instance, QudaMultigridParam *param); -/** - * Apply the clover operator or its inverse. - * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage - * @param parity The source and destination parity of the field - * @param inverse Whether to apply the inverse of the clover term - */ -void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse); + /** + * Apply the Dslash operator (D_{eo} or D_{oe}). 
+ * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage + * @param parity The destination parity of the field + */ + void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); -/** - * Apply the full Dslash matrix, possibly even/odd preconditioned. - * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage - */ -void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param); + /** + * Apply the Dslash operator (D_{eo} or D_{oe}). + * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage + * @param parity The destination parity of the field + */ + void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); -/** - * Apply M^{\dag}M, possibly even/odd preconditioned. - * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage - */ -void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param); + /** + * @brief Perform the solve like @dslashQuda but for multiple rhs by spliting the comm grid into + * sub-partitions: each sub-partition does one or more rhs'. + * The QudaInvertParam object specifies how the solve should be performed on each sub-partition. + * Unlike @invertQuda, the interface also takes the host side gauge as + * input - gauge field is not required to be loaded beforehand. 
+ * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param parity Parity to apply dslash on + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + */ + void dslashMultiSrcQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, + QudaGaugeParam *gauge_param); + /** + * @brief Really the same with @dslashMultiSrcQuda but for staggered-style fermions, by accepting pointers + * to fat links and long links. + * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param parity Parity to apply dslash on + * @param milc_fatlinks Base pointer to host **fat** gauge field (regardless of dimensionality) + * @param milc_longlinks Base pointer to host **long** gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + */ -/* - * The following routines are temporary additions used by the HISQ - * link-fattening code. - */ + void dslashMultiSrcStaggeredQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, + void *milc_fatlinks, void *milc_longlinks, QudaGaugeParam *gauge_param); -void set_dim(int *); -void pack_ghost(void **cpuLink, void **cpuGhost, int nFace, QudaPrecision precision); + /** + * @brief Really the same with @dslashMultiSrcQuda but for clover-style fermions, by accepting pointers + * to direct and inverse clover field pointers. 
+ * @param _hp_x Array of solution spinor fields + * @param _hp_b Array of source spinor fields + * @param param Contains all metadata regarding host and device storage and solver parameters + * @param parity Parity to apply dslash on + * @param h_gauge Base pointer to host gauge field (regardless of dimensionality) + * @param gauge_param Contains all metadata regarding host and device storage for gauge field + * @param h_clover Base pointer to the direct clover field + * @param h_clovinv Base pointer to the inverse clover field + */ + void dslashMultiSrcCloverQuda(void **_hp_x, void **_hp_b, QudaInvertParam *param, QudaParity parity, void *h_gauge, + QudaGaugeParam *gauge_param, void *h_clover, void *h_clovinv); -void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, double *path_coeff, - QudaGaugeParam *param); + /** + * Apply the clover operator or its inverse. + * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage + * @param parity The source and destination parity of the field + * @param inverse Whether to apply the inverse of the clover term + */ + void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity, int inverse); -/** - * Either downloads and sets the resident momentum field, or uploads - * and returns the resident momentum field - * - * @param[in,out] mom The external momentum field - * @param[in] param The parameters of the external field - */ -void momResidentQuda(void *mom, QudaGaugeParam *param); + /** + * Apply the full Dslash matrix, possibly even/odd preconditioned. 
+ * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage + */ + void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param); -/** - * Compute the gauge force and update the momentum field - * - * @param[in,out] mom The momentum field to be updated - * @param[in] sitelink The gauge field from which we compute the force - * @param[in] input_path_buf[dim][num_paths][path_length] - * @param[in] path_length One less that the number of links in a loop (e.g., 3 for a staple) - * @param[in] loop_coeff Coefficients of the different loops in the Symanzik action - * @param[in] num_paths How many contributions from path_length different "staples" - * @param[in] max_length The maximum number of non-zero of links in any path in the action - * @param[in] dt The integration step size (for MILC this is dt*beta/3) - * @param[in] param The parameters of the external fields and the computation settings - */ -int computeGaugeForceQuda(void *mom, void *sitelink, int ***input_path_buf, int *path_length, double *loop_coeff, - int num_paths, int max_length, double dt, QudaGaugeParam *qudaGaugeParam); + /** + * Apply M^{\dag}M, possibly even/odd preconditioned. 
+ * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage + */ + void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param); -/** - * Compute the product of gauge links along a path and add to/overwrite the output field - * - * @param[in,out] out The output field to be updated - * @param[in] sitelink The gauge field from which we compute the products of gauge links - * @param[in] input_path_buf[dim][num_paths][path_length] - * @param[in] path_length One less that the number of links in a loop (e.g., 3 for a staple) - * @param[in] loop_coeff Coefficients of the different loops in the Symanzik action - * @param[in] num_paths How many contributions from path_length different "staples" - * @param[in] max_length The maximum number of non-zero of links in any path in the action - * @param[in] dt The integration step size (for MILC this is dt*beta/3) - * @param[in] param The parameters of the external fields and the computation settings - */ -int computeGaugePathQuda(void *out, void *sitelink, int ***input_path_buf, int *path_length, double *loop_coeff, - int num_paths, int max_length, double dt, QudaGaugeParam *qudaGaugeParam); -/** - * Compute the traces of products of gauge links along paths using the resident field - * - * @param[in,out] traces The computed traces - * @param[in] sitelink The gauge field from which we compute the products of gauge links - * @param[in] path_length The number of links in each loop - * @param[in] loop_coeff Multiplicative coefficients for each loop - * @param[in] num_paths Total number of loops - * @param[in] max_length The maximum number of non-zero of links in any path in the action - * @param[in] factor An overall normalization factor - */ -void computeGaugeLoopTraceQuda(double_complex *traces, int **input_path_buf, int *path_length, double *loop_coeff, - int num_paths, int max_length, double factor); + /* + * The following routines 
are temporary additions used by the HISQ + * link-fattening code. + */ -/** - * Evolve the gauge field by step size dt, using the momentum field - * I.e., Evalulate U(t+dt) = e(dt pi) U(t) - * - * @param gauge The gauge field to be updated - * @param momentum The momentum field - * @param dt The integration step size step - * @param conj_mom Whether to conjugate the momentum matrix - * @param exact Whether to use an exact exponential or Taylor expand - * @param param The parameters of the external fields and the computation settings - */ -void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, int exact, QudaGaugeParam *param); + void set_dim(int *); + void pack_ghost(void **cpuLink, void **cpuGhost, int nFace, + QudaPrecision precision); -/** - * Apply the staggered phase factors to the gauge field. If the - * imaginary chemical potential is non-zero then the phase factor - * exp(imu/T) will be applied to the links in the temporal - * direction. - * - * @param gauge_h The gauge field - * @param param The parameters of the gauge field - */ -void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param); + void computeKSLinkQuda(void* fatlink, void* longlink, void* ulink, void* inlink, + double *path_coeff, QudaGaugeParam *param); -/** - * Project the input field on the SU(3) group. If the target - * tolerance is not met, this routine will give a runtime error. 
- * - * @param gauge_h The gauge field to be updated - * @param tol The tolerance to which we iterate - * @param param The parameters of the gauge field - */ -void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param); + /** + * Either downloads and sets the resident momentum field, or uploads + * and returns the resident momentum field + * + * @param[in,out] mom The external momentum field + * @param[in] param The parameters of the external field + */ + void momResidentQuda(void *mom, QudaGaugeParam *param); -/** - * Evaluate the momentum contribution to the Hybrid Monte Carlo - * action. - * - * @param momentum The momentum field - * @param param The parameters of the external fields and the computation settings - * @return momentum action - */ -double momActionQuda(void *momentum, QudaGaugeParam *param); + /** + * Compute the gauge force and update the momentum field + * + * @param[in,out] mom The momentum field to be updated + * @param[in] sitelink The gauge field from which we compute the force + * @param[in] input_path_buf[dim][num_paths][path_length] + * @param[in] path_length One less that the number of links in a loop (e.g., 3 for a staple) + * @param[in] loop_coeff Coefficients of the different loops in the Symanzik action + * @param[in] num_paths How many contributions from path_length different "staples" + * @param[in] max_length The maximum number of non-zero of links in any path in the action + * @param[in] dt The integration step size (for MILC this is dt*beta/3) + * @param[in] param The parameters of the external fields and the computation settings + */ + int computeGaugeForceQuda(void *mom, void *sitelink, int ***input_path_buf, int *path_length, double *loop_coeff, + int num_paths, int max_length, double dt, QudaGaugeParam *qudaGaugeParam); -/** - * Allocate a gauge (matrix) field on the device and optionally download a host gauge field. 
- * - * @param gauge The host gauge field (optional - if set to 0 then the gauge field zeroed) - * @param geometry The geometry of the matrix field to create (1 - scalar, 4 - vector, 6 - tensor) - * @param param The parameters of the external field and the field to be created - * @return Pointer to the gauge field (cast as a void*) - */ -void *createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param); + /** + * Compute the product of gauge links along a path and add to/overwrite the output field + * + * @param[in,out] out The output field to be updated + * @param[in] sitelink The gauge field from which we compute the products of gauge links + * @param[in] input_path_buf[dim][num_paths][path_length] + * @param[in] path_length One less that the number of links in a loop (e.g., 3 for a staple) + * @param[in] loop_coeff Coefficients of the different loops in the Symanzik action + * @param[in] num_paths How many contributions from path_length different "staples" + * @param[in] max_length The maximum number of non-zero of links in any path in the action + * @param[in] dt The integration step size (for MILC this is dt*beta/3) + * @param[in] param The parameters of the external fields and the computation settings + */ + int computeGaugePathQuda(void *out, void *sitelink, int ***input_path_buf, int *path_length, double *loop_coeff, + int num_paths, int max_length, double dt, QudaGaugeParam *qudaGaugeParam); -/** - * Copy the QUDA gauge (matrix) field on the device to the CPU - * - * @param outGauge Pointer to the host gauge field - * @param inGauge Pointer to the device gauge field (QUDA device field) - * @param param The parameters of the host and device fields - */ -void saveGaugeFieldQuda(void *outGauge, void *inGauge, QudaGaugeParam *param); + /** + * Compute the traces of products of gauge links along paths using the resident field + * + * @param[in,out] traces The computed traces + * @param[in] sitelink The gauge field from which we compute the products 
of gauge links + * @param[in] path_length The number of links in each loop + * @param[in] loop_coeff Multiplicative coefficients for each loop + * @param[in] num_paths Total number of loops + * @param[in] max_length The maximum number of non-zero of links in any path in the action + * @param[in] factor An overall normalization factor + */ + void computeGaugeLoopTraceQuda(double_complex *traces, int **input_path_buf, int *path_length, double *loop_coeff, + int num_paths, int max_length, double factor); -/** - * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. - * - * @param gauge Gauge field to be freed - */ -void destroyGaugeFieldQuda(void *gauge); + /** + * Evolve the gauge field by step size dt, using the momentum field + * I.e., Evalulate U(t+dt) = e(dt pi) U(t) + * + * @param gauge The gauge field to be updated + * @param momentum The momentum field + * @param dt The integration step size step + * @param conj_mom Whether to conjugate the momentum matrix + * @param exact Whether to use an exact exponential or Taylor expand + * @param param The parameters of the external fields and the computation settings + */ + void updateGaugeFieldQuda(void* gauge, void* momentum, double dt, + int conj_mom, int exact, QudaGaugeParam* param); -/** - * Compute the clover field and its inverse from the resident gauge field. - * - * @param param The parameters of the clover field to create - */ -void createCloverQuda(QudaInvertParam *param); + /** + * Apply the staggered phase factors to the gauge field. If the + * imaginary chemical potential is non-zero then the phase factor + * exp(imu/T) will be applied to the links in the temporal + * direction. + * + * @param gauge_h The gauge field + * @param param The parameters of the gauge field + */ + void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param); -/** - * Compute the clover force contributions in each dimension mu given - * the array of solution fields, and compute the resulting momentum - * field. 
- * - * @param mom Force matrix - * @param dt Integrating step size - * @param x Array of solution vectors - * @param p Array of intermediate vectors - * @param coeff Array of residues for each contribution (multiplied by stepsize) - * @param kappa2 -kappa*kappa parameter - * @param ck -clover_coefficient * kappa / 8 - * @param nvec Number of vectors - * @param multiplicity Number fermions this bilinear reresents - * @param gauge Gauge Field - * @param gauge_param Gauge field meta data - * @param inv_param Dirac and solver meta data - */ -void computeCloverForceQuda(void *mom, double dt, void **x, void **p, double *coeff, double kappa2, double ck, - int nvector, double multiplicity, void *gauge, QudaGaugeParam *gauge_param, - QudaInvertParam *inv_param); + /** + * Project the input field on the SU(3) group. If the target + * tolerance is not met, this routine will give a runtime error. + * + * @param gauge_h The gauge field to be updated + * @param tol The tolerance to which we iterate + * @param param The parameters of the gauge field + */ + void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param); -/** - * Compute the naive staggered force. All fields must be in the same precision. - * - * @param mom Momentum field - * @param dt Integrating step size - * @param delta Additional scale factor when updating momentum (mom += delta * [force]_TA - * @param gauge Gauge field (at present only supports resident gauge field) - * @param x Array of single-parity solution vectors (at present only supports resident solutions) - * @param gauge_param Gauge field meta data - * @param invert_param Dirac and solver meta data - */ -void computeStaggeredForceQuda(void *mom, double dt, double delta, void *gauge, void **x, QudaGaugeParam *gauge_param, - QudaInvertParam *invert_param); + /** + * Evaluate the momentum contribution to the Hybrid Monte Carlo + * action. 
+ * + * @param momentum The momentum field + * @param param The parameters of the external fields and the computation settings + * @return momentum action + */ + double momActionQuda(void* momentum, QudaGaugeParam* param); -/** - * Compute the fermion force for the HISQ quark action and integrate the momentum. - * @param momentum The momentum field we are integrating - * @param dt The stepsize used to integrate the momentum - * @param level2_coeff The coefficients for the second level of smearing in the quark action. - * @param fat7_coeff The coefficients for the first level of smearing (fat7) in the quark action. - * @param w_link Unitarized link variables obtained by applying fat7 smearing and unitarization to the original links. - * @param v_link Fat7 link variables. - * @param u_link SU(3) think link variables. - * @param quark The input fermion field. - * @param num The number of quark fields - * @param num_naik The number of naik contributions - * @param coeff The coefficient multiplying the fermion fields in the outer product - * @param param. The field parameters. - */ -void computeHISQForceQuda(void *momentum, double dt, const double level2_coeff[6], const double fat7_coeff[6], - const void *const w_link, const void *const v_link, const void *const u_link, void **quark, - int num, int num_naik, double **coeff, QudaGaugeParam *param); + /** + * Allocate a gauge (matrix) field on the device and optionally download a host gauge field. + * + * @param gauge The host gauge field (optional - if set to 0 then the gauge field zeroed) + * @param geometry The geometry of the matrix field to create (1 - scalar, 4 - vector, 6 - tensor) + * @param param The parameters of the external field and the field to be created + * @return Pointer to the gauge field (cast as a void*) + */ + void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param); -/** - @brief Generate Gaussian distributed fields and store in the - resident gauge field. 
We create a Gaussian-distributed su(n) - field and exponentiate it, e.g., U = exp(sigma * H), where H is - the distributed su(n) field and sigma is the width of the - distribution (sigma = 0 results in a free field, and sigma = 1 has - maximum disorder). - - @param seed The seed used for the RNG - @param sigma Width of Gaussian distrubution -*/ -void gaussGaugeQuda(unsigned long long seed, double sigma); + /** + * Copy the QUDA gauge (matrix) field on the device to the CPU + * + * @param outGauge Pointer to the host gauge field + * @param inGauge Pointer to the device gauge field (QUDA device field) + * @param param The parameters of the host and device fields + */ + void saveGaugeFieldQuda(void* outGauge, void* inGauge, QudaGaugeParam* param); -/** - * @brief Generate Gaussian distributed fields and store in the - * resident momentum field. We create a Gaussian-distributed su(n) - * field, e.g., sigma * H, where H is the distributed su(n) field - * and sigma is the width of the distribution (sigma = 0 results - * in a free field, and sigma = 1 has maximum disorder). - * - * @param seed The seed used for the RNG - * @param sigma Width of Gaussian distrubution - */ -void gaussMomQuda(unsigned long long seed, double sigma); + /** + * Reinterpret gauge as a pointer to cudaGaugeField and call destructor. + * + * @param gauge Gauge field to be freed + */ + void destroyGaugeFieldQuda(void* gauge); -/** - * Computes the total, spatial and temporal plaquette averages of the loaded gauge configuration. - * @param[out] Array for storing the averages (total, spatial, temporal) - */ -void plaqQuda(double plaq[3]); + /** + * Compute the clover field and its inverse from the resident gauge field. + * + * @param param The parameters of the clover field to create + */ + void createCloverQuda(QudaInvertParam* param); -/** - @brief Computes the trace of the Polyakov loop of the current resident field - in a given direction. 
+ /** + * Compute the clover force contributions in each dimension mu given + * the array of solution fields, and compute the resulting momentum + * field. + * + * @param mom Force matrix + * @param dt Integrating step size + * @param x Array of solution vectors + * @param p Array of intermediate vectors + * @param coeff Array of residues for each contribution (multiplied by stepsize) + * @param kappa2 -kappa*kappa parameter + * @param ck -clover_coefficient * kappa / 8 + * @param nvec Number of vectors + * @param multiplicity Number fermions this bilinear reresents + * @param gauge Gauge Field + * @param gauge_param Gauge field meta data + * @param inv_param Dirac and solver meta data + */ + void computeCloverForceQuda(void *mom, double dt, void **x, void **p, double *coeff, double kappa2, double ck, + int nvector, double multiplicity, void *gauge, + QudaGaugeParam *gauge_param, QudaInvertParam *inv_param); - @param[out] ploop Trace of the Polyakov loop in direction dir - @param[in] dir Direction of Polyakov loop -*/ -void polyakovLoopQuda(double ploop[2], int dir); + /** + * Compute the naive staggered force. All fields must be in the same precision. + * + * @param mom Momentum field + * @param dt Integrating step size + * @param delta Additional scale factor when updating momentum (mom += delta * [force]_TA + * @param gauge Gauge field (at present only supports resident gauge field) + * @param x Array of single-parity solution vectors (at present only supports resident solutions) + * @param gauge_param Gauge field meta data + * @param invert_param Dirac and solver meta data + */ + void computeStaggeredForceQuda(void *mom, double dt, double delta, void *gauge, void **x, QudaGaugeParam *gauge_param, + QudaInvertParam *invert_param); -/** - * Performs a deep copy from the internal extendedGaugeResident field. 
- * @param Pointer to externally allocated GaugeField - */ -void copyExtendedResidentGaugeQuda(void *resident_gauge); + /** + * Compute the fermion force for the HISQ quark action and integrate the momentum. + * @param momentum The momentum field we are integrating + * @param dt The stepsize used to integrate the momentum + * @param level2_coeff The coefficients for the second level of smearing in the quark action. + * @param fat7_coeff The coefficients for the first level of smearing (fat7) in the quark action. + * @param w_link Unitarized link variables obtained by applying fat7 smearing and unitarization to the original links. + * @param v_link Fat7 link variables. + * @param u_link SU(3) think link variables. + * @param quark The input fermion field. + * @param num The number of quark fields + * @param num_naik The number of naik contributions + * @param coeff The coefficient multiplying the fermion fields in the outer product + * @param param. The field parameters. + */ + void computeHISQForceQuda(void* momentum, + double dt, + const double level2_coeff[6], + const double fat7_coeff[6], + const void* const w_link, + const void* const v_link, + const void* const u_link, + void** quark, + int num, + int num_naik, + double** coeff, + QudaGaugeParam* param); -/** - * Performs Wuppertal smearing on a given spinor using the gauge field - * gaugeSmeared, if it exist, or gaugePrecise if no smeared field is present. - * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage and operator which will be applied to the spinor - * @param n_steps Number of steps to apply. - * @param alpha Alpha coefficient for Wuppertal smearing. - */ -void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *param, unsigned int n_steps, double alpha); + /** + @brief Generate Gaussian distributed fields and store in the + resident gauge field. 
We create a Gaussian-distributed su(n) + field and exponentiate it, e.g., U = exp(sigma * H), where H is + the distributed su(n) field and sigma is the width of the + distribution (sigma = 0 results in a free field, and sigma = 1 has + maximum disorder). + + @param seed The seed used for the RNG + @param sigma Width of Gaussian distrubution + */ + void gaussGaugeQuda(unsigned long long seed, double sigma); -/** - * Performs APE, Stout, or Over Imroved STOUT smearing on gaugePrecise and stores it in gaugeSmeared - * @param[in] smear_param Parameter struct that defines the computation parameters - * @param[in,out] obs_param Parameter struct that defines which - * observables we are making and the resulting observables. - */ -void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param); + /** + * @brief Generate Gaussian distributed fields and store in the + * resident momentum field. We create a Gaussian-distributed su(n) + * field, e.g., sigma * H, where H is the distributed su(n) field + * and sigma is the width of the distribution (sigma = 0 results + * in a free field, and sigma = 1 has maximum disorder). + * + * @param seed The seed used for the RNG + * @param sigma Width of Gaussian distrubution + */ + void gaussMomQuda(unsigned long long seed, double sigma); -/** - * Performs Wilson Flow on gaugePrecise and stores it in gaugeSmeared - * @param[in] smear_param Parameter struct that defines the computation parameters - * @param[in,out] obs_param Parameter struct that defines which - * observables we are making and the resulting observables. - */ -void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param); + /** + * Computes the total, spatial and temporal plaquette averages of the loaded gauge configuration. + * @param[out] Array for storing the averages (total, spatial, temporal) + */ + void plaqQuda(double plaq[3]); -/** - * @brief Calculates a variety of gauge-field observables. 
If a - * smeared gauge field is presently loaded (in gaugeSmeared) the - * observables are computed on this, else the resident gauge field - * will be used. - * @param[in,out] param Parameter struct that defines which - * observables we are making and the resulting observables. - */ -void gaugeObservablesQuda(QudaGaugeObservableParam *param); + /** + @brief Computes the trace of the Polyakov loop of the current resident field + in a given direction. -/** - * Public function to perform color contractions of the host spinors x and y. - * @param[in] x pointer to host data - * @param[in] y pointer to host data - * @param[out] result pointer to the 16 spin projections per lattice site - * @param[in] cType Which type of contraction (open, degrand-rossi, etc) - * @param[in] param meta data for construction of ColorSpinorFields. - * @param[in] X spacetime data for construction of ColorSpinorFields. - */ -void contractQuda(const void *x, const void *y, void *result, const QudaContractType cType, QudaInvertParam *param, - const int *X); + @param[out] ploop Trace of the Polyakov loop in direction dir + @param[in] dir Direction of Polyakov loop + */ + void polyakovLoopQuda(double ploop[2], int dir); -/** - * @brief Gauge fixing with overrelaxation with support for single and multi GPU. - * @param[in,out] gauge, gauge field to be fixed - * @param[in] gauge_dir, 3 for Coulomb gauge fixing, other for Landau gauge fixing - * @param[in] Nsteps, maximum number of steps to perform gauge fixing - * @param[in] verbose_interval, print gauge fixing info when iteration count is a multiple of this - * @param[in] relax_boost, gauge fixing parameter of the overrelaxation method, most common value is 1.5 or 1.7. 
- * @param[in] tolerance, torelance value to stop the method, if this value is zero then the method stops when - * iteration reachs the maximum number of steps defined by Nsteps - * @param[in] reunit_interval, reunitarize gauge field when iteration count is a multiple of this - * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value - * @param[in] param The parameters of the external fields and the computation settings - * @param[out] timeinfo - */ -int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, - const unsigned int verbose_interval, const double relax_boost, const double tolerance, - const unsigned int reunit_interval, const unsigned int stopWtheta, QudaGaugeParam *param, - double *timeinfo); + /** + * Performs a deep copy from the internal extendedGaugeResident field. + * @param Pointer to externally allocated GaugeField + */ + void copyExtendedResidentGaugeQuda(void *resident_gauge); -/** - * @brief Gauge fixing with Steepest descent method with FFTs with support for single GPU only. 
- * @param[in,out] gauge, gauge field to be fixed - * @param[in] gauge_dir, 3 for Coulomb gauge fixing, other for Landau gauge fixing - * @param[in] Nsteps, maximum number of steps to perform gauge fixing - * @param[in] verbose_interval, print gauge fixing info when iteration count is a multiple of this - * @param[in] alpha, gauge fixing parameter of the method, most common value is 0.08 - * @param[in] autotune, 1 to autotune the method, i.e., if the Fg inverts its tendency we decrease the alpha value - * @param[in] tolerance, torelance value to stop the method, if this value is zero then the method stops when - * iteration reachs the maximum number of steps defined by Nsteps - * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value - * @param[in] param The parameters of the external fields and the computation settings - * @param[out] timeinfo - */ -int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, - const unsigned int verbose_interval, const double alpha, const unsigned int autotune, - const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, - double *timeinfo); + /** + * Performs Wuppertal smearing on a given spinor using the gauge field + * gaugeSmeared, if it exist, or gaugePrecise if no smeared field is present. + * @param h_out Result spinor field + * @param h_in Input spinor field + * @param param Contains all metadata regarding host and device + * storage and operator which will be applied to the spinor + * @param n_steps Number of steps to apply. + * @param alpha Alpha coefficient for Wuppertal smearing. 
+ */ + void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *param, unsigned int n_steps, double alpha); -/** - * @brief Strided Batched GEMM - * @param[in] arrayA The array containing the A matrix data - * @param[in] arrayB The array containing the B matrix data - * @param[in] arrayC The array containing the C matrix data - * @param[in] native Boolean to use either the native or generic version - * @param[in] param The data defining the problem execution. - */ -void blasGEMMQuda(void *arrayA, void *arrayB, void *arrayC, QudaBoolean native, QudaBLASParam *param); + /** + * Performs APE, Stout, or Over Imroved STOUT smearing on gaugePrecise and stores it in gaugeSmeared + * @param[in] smear_param Parameter struct that defines the computation parameters + * @param[in,out] obs_param Parameter struct that defines which + * observables we are making and the resulting observables. + */ + void performGaugeSmearQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param); -/** - * @brief Strided Batched in-place matrix inversion via LU - * @param[in] Ainv The array containing the A inverse matrix data - * @param[in] A The array containing the A matrix data - * @param[in] use_native Boolean to use either the native or generic version - * @param[in] param The data defining the problem execution. - */ -void blasLUInvQuda(void *Ainv, void *A, QudaBoolean use_native, QudaBLASParam *param); + /** + * Performs Wilson Flow on gaugePrecise and stores it in gaugeSmeared + * @param[in] smear_param Parameter struct that defines the computation parameters + * @param[in,out] obs_param Parameter struct that defines which + * observables we are making and the resulting observables. 
+ */ + void performWFlowQuda(QudaGaugeSmearParam *smear_param, QudaGaugeObservableParam *obs_param); -/** - * @brief Flush the chronological history for the given index - * @param[in] index Index for which we are flushing - */ -void flushChronoQuda(int index); + /** + * @brief Calculates a variety of gauge-field observables. If a + * smeared gauge field is presently loaded (in gaugeSmeared) the + * observables are computed on this, else the resident gauge field + * will be used. + * @param[in,out] param Parameter struct that defines which + * observables we are making and the resulting observables. + */ + void gaugeObservablesQuda(QudaGaugeObservableParam *param); -/** - * Create deflation solver resources. - * - **/ + /** + * Public function to perform color contractions of the host spinors x and y. + * @param[in] x pointer to host data + * @param[in] y pointer to host data + * @param[out] result pointer to the 16 spin projections per lattice site + * @param[in] cType Which type of contraction (open, degrand-rossi, etc) + * @param[in] param meta data for construction of ColorSpinorFields. + * @param[in] X spacetime data for construction of ColorSpinorFields. + */ + void contractQuda(const void *x, const void *y, void *result, const QudaContractType cType, QudaInvertParam *param, + const int *X); -void *newDeflationQuda(QudaEigParam *param); + /** + * @brief Gauge fixing with overrelaxation with support for single and multi GPU. + * @param[in,out] gauge, gauge field to be fixed + * @param[in] gauge_dir, 3 for Coulomb gauge fixing, other for Landau gauge fixing + * @param[in] Nsteps, maximum number of steps to perform gauge fixing + * @param[in] verbose_interval, print gauge fixing info when iteration count is a multiple of this + * @param[in] relax_boost, gauge fixing parameter of the overrelaxation method, most common value is 1.5 or 1.7. 
+ * @param[in] tolerance, torelance value to stop the method, if this value is zero then the method stops when + * iteration reachs the maximum number of steps defined by Nsteps + * @param[in] reunit_interval, reunitarize gauge field when iteration count is a multiple of this + * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value + * @param[in] param The parameters of the external fields and the computation settings + * @param[out] timeinfo + */ + int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, + const unsigned int verbose_interval, const double relax_boost, const double tolerance, + const unsigned int reunit_interval, const unsigned int stopWtheta, + QudaGaugeParam *param, double *timeinfo); -/** - * Free resources allocated by the deflated solver - */ -void destroyDeflationQuda(void *df_instance); + /** + * @brief Gauge fixing with Steepest descent method with FFTs with support for single GPU only. + * @param[in,out] gauge, gauge field to be fixed + * @param[in] gauge_dir, 3 for Coulomb gauge fixing, other for Landau gauge fixing + * @param[in] Nsteps, maximum number of steps to perform gauge fixing + * @param[in] verbose_interval, print gauge fixing info when iteration count is a multiple of this + * @param[in] alpha, gauge fixing parameter of the method, most common value is 0.08 + * @param[in] autotune, 1 to autotune the method, i.e., if the Fg inverts its tendency we decrease the alpha value + * @param[in] tolerance, torelance value to stop the method, if this value is zero then the method stops when + * iteration reachs the maximum number of steps defined by Nsteps + * @param[in] stopWtheta, 0 for MILC criterion and 1 to use the theta value + * @param[in] param The parameters of the external fields and the computation settings + * @param[out] timeinfo + */ + int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, + const unsigned int 
verbose_interval, const double alpha, const unsigned int autotune, + const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, + double *timeinfo); + + /** + * @brief Strided Batched GEMM + * @param[in] arrayA The array containing the A matrix data + * @param[in] arrayB The array containing the B matrix data + * @param[in] arrayC The array containing the C matrix data + * @param[in] native Boolean to use either the native or generic version + * @param[in] param The data defining the problem execution. + */ + void blasGEMMQuda(void *arrayA, void *arrayB, void *arrayC, QudaBoolean native, QudaBLASParam *param); -void setMPICommHandleQuda(void *mycomm); + /** + * @brief Strided Batched in-place matrix inversion via LU + * @param[in] Ainv The array containing the A inverse matrix data + * @param[in] A The array containing the A matrix data + * @param[in] use_native Boolean to use either the native or generic version + * @param[in] param The data defining the problem execution. + */ + void blasLUInvQuda(void *Ainv, void *A, QudaBoolean use_native, QudaBLASParam *param); + + /** + * @brief Flush the chronological history for the given index + * @param[in] index Index for which we are flushing + */ + void flushChronoQuda(int index); + + + /** + * Create deflation solver resources. + * + **/ + + void* newDeflationQuda(QudaEigParam *param); + + /** + * Free resources allocated by the deflated solver + */ + void destroyDeflationQuda(void *df_instance); + + void setMPICommHandleQuda(void *mycomm); #ifdef __cplusplus } @@ -1719,3 +1732,4 @@ void setMPICommHandleQuda(void *mycomm); #undef double_complex /* #include */ + diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index b3bb200746..a6a461daa8 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -39,8 +39,8 @@ #include #include -#define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define TDIFF(a, b) (b.tv_sec - a.tv_sec + 0.000001 * (b.tv_usec - a.tv_usec)) +#define MAX(a,b) ((a)>(b)? 
(a):(b)) +#define TDIFF(a,b) (b.tv_sec - a.tv_sec + 0.000001*(b.tv_usec - a.tv_usec)) // define newQudaGaugeParam() and newQudaInvertParam() #define INIT_PARAM @@ -70,6 +70,7 @@ static bool redundant_comms = false; #include + cudaGaugeField *gaugePrecise = nullptr; cudaGaugeField *gaugeSloppy = nullptr; cudaGaugeField *gaugePrecondition = nullptr; @@ -148,28 +149,28 @@ static TimeProfile profileGaugeForce("computeGaugeForceQuda"); //!< Profiler for computeGaugePathQuda static TimeProfile profileGaugePath("computeGaugePathQuda"); -//!< Profiler for updateGaugeFieldQuda +//!> target) { - target_list.push_back(target); - if (target_stream.peek() == ',') target_stream.ignore(); - } - - if (target_list.size() > 0) { - std::sort(target_list.begin(), target_list.end()); - target_list.erase(unique(target_list.begin(), target_list.end()), target_list.end()); - warningQuda("Targeted profiling enabled for %lu functions\n", target_list.size()); - enable = true; - } - } - - char *donotprofile_env = getenv("QUDA_DO_NOT_PROFILE"); // disable profiling of QUDA parts - if (donotprofile_env && (!(strcmp(donotprofile_env, "0") == 0))) { - do_not_profile_quda = true; + while(target_stream >> target) { + target_list.push_back(target); + if (target_stream.peek() == ',') target_stream.ignore(); + } + + if (target_list.size() > 0) { + std::sort(target_list.begin(), target_list.end()); + target_list.erase( unique( target_list.begin(), target_list.end() ), target_list.end() ); + warningQuda("Targeted profiling enabled for %lu functions\n", target_list.size()); + enable = true; + } + } + + char* donotprofile_env = getenv("QUDA_DO_NOT_PROFILE"); // disable profiling of QUDA parts + if (donotprofile_env && (!(strcmp(donotprofile_env, "0") == 0))) { + do_not_profile_quda=true; printfQuda("Disabling profiling in QUDA\n"); } init = true; @@ -248,7 +249,7 @@ static void profilerStart(const char *f) static int target_count = 0; static unsigned int i = 0; - if (do_not_profile_quda) { + if 
(do_not_profile_quda){ device::profile::stop(); printfQuda("Stopping profiling in QUDA\n"); } else { @@ -258,13 +259,12 @@ static void profilerStart(const char *f) printfQuda("Starting profiling for %s\n", f); device::profile::start(); i++; // advance to next target - } } } } +} -static void profilerStop(const char *f) -{ +static void profilerStop(const char *f) { if (do_not_profile_quda) { device::profile::start(); } else { @@ -277,8 +277,8 @@ static void profilerStop(const char *f) } } -namespace quda -{ + +namespace quda { void printLaunchTimer(); } @@ -289,6 +289,7 @@ void setVerbosityQuda(QudaVerbosity verbosity, const char prefix[], FILE *outfil setOutputFile(outfile); } + typedef struct { int ndim; int dims[QUDA_MAX_DIM]; @@ -302,7 +303,9 @@ static int lex_rank_from_coords(const int *coords, void *fdata) auto *md = static_cast(fdata); int rank = coords[0]; - for (int i = 1; i < md->ndim; i++) { rank = md->dims[i] * rank + coords[i]; } + for (int i = 1; i < md->ndim; i++) { + rank = md->dims[i] * rank + coords[i]; + } return rank; } @@ -337,17 +340,21 @@ void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata { if (comms_initialized) return; - if (nDim != 4) { errorQuda("Number of communication grid dimensions must be 4"); } + if (nDim != 4) { + errorQuda("Number of communication grid dimensions must be 4"); + } LexMapData map_data; if (!func) { #if QMP_COMMS if (QMP_logical_topology_is_declared()) { - if (QMP_get_logical_number_of_dimensions() != 4) { errorQuda("QMP logical topology must have 4 dimensions"); } - for (int i = 0; i < nDim; i++) { + if (QMP_get_logical_number_of_dimensions() != 4) { + errorQuda("QMP logical topology must have 4 dimensions"); + } + for (int i=0; i= QUDA_SUMMARIZE) { #ifdef GITVERSION - printfQuda("QUDA %s (git %s)\n", quda_version.c_str(), gitversion); + printfQuda("QUDA %s (git %s)\n",quda_version.c_str(),gitversion); #else - printfQuda("QUDA %s\n", quda_version.c_str()); + printfQuda("QUDA 
%s\n",quda_version.c_str()); #endif } @@ -441,7 +452,7 @@ void initQudaDevice(int dev) { // determine if we will do CPU or GPU data reordering (default is GPU) char *reorder_str = getenv("QUDA_REORDER_LOCATION"); - if (!reorder_str || (strcmp(reorder_str, "CPU") && strcmp(reorder_str, "cpu"))) { + if (!reorder_str || (strcmp(reorder_str,"CPU") && strcmp(reorder_str,"cpu")) ) { warningQuda("Data reordering done on GPU (set with QUDA_REORDER_LOCATION=GPU/CPU)"); reorder_location_set(QUDA_CUDA_FIELD_LOCATION); } else { @@ -480,7 +491,7 @@ void initQudaMemory() num_failures_h = static_cast(mapped_malloc(sizeof(int))); num_failures_d = static_cast(get_mapped_device_pointer(num_failures_h)); - for (int d = 0; d < 4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); + for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); profileInit.TPSTOP(QUDA_PROFILE_INIT); profileInit.TPSTOP(QUDA_PROFILE_TOTAL); @@ -488,7 +499,7 @@ void initQudaMemory() void updateR() { - for (int d = 0; d < 4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); + for (int d=0; d<4; d++) R[d] = 2 * (redundant_comms || commDimPartitioned(d)); } void initQuda(int dev) @@ -523,10 +534,10 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; GaugeField *in = (param->location == QUDA_CPU_FIELD_LOCATION) ? 
- static_cast(new cpuGaugeField(gauge_param)) : - static_cast(new cudaGaugeField(gauge_param)); + static_cast(new cpuGaugeField(gauge_param)) : + static_cast(new cudaGaugeField(gauge_param)); - if (in->Order() == QUDA_BQCD_GAUGE_ORDER) { + if (in->Order() == QUDA_BQCD_GAUGE_ORDER) { static size_t checksum = SIZE_MAX; size_t in_checksum = in->checksum(true); if (in_checksum == checksum) { @@ -544,62 +555,63 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) // free any current gauge field before new allocations to reduce memory overhead switch (param->type) { - case QUDA_WILSON_LINKS: - if (gaugeRefinement != gaugeSloppy && gaugeRefinement != gaugeEigensolver && gaugeRefinement) - delete gaugeRefinement; + case QUDA_WILSON_LINKS: + if (gaugeRefinement != gaugeSloppy && gaugeRefinement != gaugeEigensolver && gaugeRefinement) + delete gaugeRefinement; - if (gaugePrecondition != gaugeSloppy && gaugePrecondition != gaugeEigensolver && gaugePrecondition != gaugePrecise - && gaugePrecondition) - delete gaugePrecondition; + if (gaugePrecondition != gaugeSloppy && gaugePrecondition != gaugeEigensolver && gaugePrecondition != gaugePrecise + && gaugePrecondition) + delete gaugePrecondition; - if (gaugeEigensolver != gaugeSloppy && gaugeEigensolver != gaugePrecise && gaugeEigensolver != gaugePrecondition - && gaugeEigensolver) - delete gaugeEigensolver; + if (gaugeEigensolver != gaugeSloppy && gaugeEigensolver != gaugePrecise && gaugeEigensolver != gaugePrecondition + && gaugeEigensolver) + delete gaugeEigensolver; - if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy; + if (gaugePrecise != gaugeSloppy && gaugeSloppy) delete gaugeSloppy; - if (gaugePrecise && !param->use_resident_gauge) delete gaugePrecise; + if (gaugePrecise && !param->use_resident_gauge) delete gaugePrecise; - break; - case QUDA_ASQTAD_FAT_LINKS: - if (gaugeFatRefinement != gaugeFatSloppy && gaugeFatRefinement != gaugeFatEigensolver && gaugeFatRefinement) - delete gaugeFatRefinement; 
+ break; + case QUDA_ASQTAD_FAT_LINKS: + if (gaugeFatRefinement != gaugeFatSloppy && gaugeFatRefinement != gaugeFatEigensolver && gaugeFatRefinement) + delete gaugeFatRefinement; - if (gaugeFatPrecondition != gaugeFatSloppy && gaugeFatPrecondition != gaugeFatEigensolver - && gaugeFatPrecondition != gaugeFatPrecise && gaugeFatPrecondition) - delete gaugeFatPrecondition; + if (gaugeFatPrecondition != gaugeFatSloppy && gaugeFatPrecondition != gaugeFatEigensolver + && gaugeFatPrecondition != gaugeFatPrecise && gaugeFatPrecondition) + delete gaugeFatPrecondition; - if (gaugeFatEigensolver != gaugeFatSloppy && gaugeFatEigensolver != gaugeFatPrecise - && gaugeFatEigensolver != gaugeFatPrecondition && gaugeFatEigensolver) - delete gaugeFatEigensolver; + if (gaugeFatEigensolver != gaugeFatSloppy && gaugeFatEigensolver != gaugeFatPrecise + && gaugeFatEigensolver != gaugeFatPrecondition && gaugeFatEigensolver) + delete gaugeFatEigensolver; - if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy; + if (gaugeFatPrecise != gaugeFatSloppy && gaugeFatSloppy) delete gaugeFatSloppy; - if (gaugeFatPrecise && !param->use_resident_gauge) delete gaugeFatPrecise; + if (gaugeFatPrecise && !param->use_resident_gauge) delete gaugeFatPrecise; - break; - case QUDA_ASQTAD_LONG_LINKS: + break; + case QUDA_ASQTAD_LONG_LINKS: - if (gaugeLongRefinement != gaugeLongSloppy && gaugeLongRefinement != gaugeLongEigensolver && gaugeLongRefinement) - delete gaugeLongRefinement; + if (gaugeLongRefinement != gaugeLongSloppy && gaugeLongRefinement != gaugeLongEigensolver && gaugeLongRefinement) + delete gaugeLongRefinement; - if (gaugeLongPrecondition != gaugeLongSloppy && gaugeLongPrecondition != gaugeLongEigensolver - && gaugeLongPrecondition != gaugeLongPrecise && gaugeLongPrecondition) - delete gaugeLongPrecondition; + if (gaugeLongPrecondition != gaugeLongSloppy && gaugeLongPrecondition != gaugeLongEigensolver + && gaugeLongPrecondition != gaugeLongPrecise && 
gaugeLongPrecondition) + delete gaugeLongPrecondition; - if (gaugeLongEigensolver != gaugeLongSloppy && gaugeLongEigensolver != gaugeLongPrecise - && gaugeLongEigensolver != gaugeLongPrecondition && gaugeLongEigensolver) - delete gaugeLongEigensolver; + if (gaugeLongEigensolver != gaugeLongSloppy && gaugeLongEigensolver != gaugeLongPrecise + && gaugeLongEigensolver != gaugeLongPrecondition && gaugeLongEigensolver) + delete gaugeLongEigensolver; - if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy; + if (gaugeLongPrecise != gaugeLongSloppy && gaugeLongSloppy) delete gaugeLongSloppy; - if (gaugeLongPrecise) delete gaugeLongPrecise; + if (gaugeLongPrecise) delete gaugeLongPrecise; - break; - case QUDA_SMEARED_LINKS: - if (gaugeSmeared) delete gaugeSmeared; - break; - default: errorQuda("Invalid gauge type %d", param->type); + break; + case QUDA_SMEARED_LINKS: + if (gaugeSmeared) delete gaugeSmeared; + break; + default: + errorQuda("Invalid gauge type %d", param->type); } // if not preserving then copy the gauge field passed in @@ -616,7 +628,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) precise = new cudaGaugeField(gauge_param); if (param->use_resident_gauge) { - if (gaugePrecise == nullptr) errorQuda("No resident gauge field"); + if(gaugePrecise == nullptr) errorQuda("No resident gauge field"); // copy rather than point at to ensure that the padded region is filled in precise->copy(*gaugePrecise); precise->exchangeGhost(); @@ -703,48 +715,49 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) profileGauge.TPSTOP(QUDA_PROFILE_COMPUTE); // create an extended preconditioning field - cudaGaugeField *extended = nullptr; - if (param->overlap) { + cudaGaugeField* extended = nullptr; + if (param->overlap){ lat_dim_t R; // domain-overlap widths in different directions - for (int i = 0; i < 4; ++i) R[i] = param->overlap * commDimPartitioned(i); + for (int i=0; i<4; ++i) R[i] = param->overlap*commDimPartitioned(i); 
extended = createExtendedGauge(*precondition, R, profileGauge); } switch (param->type) { - case QUDA_WILSON_LINKS: - gaugePrecise = precise; - gaugeSloppy = sloppy; - gaugePrecondition = precondition; - gaugeRefinement = refinement; - gaugeEigensolver = eigensolver; - - if (param->overlap) gaugeExtended = extended; - break; - case QUDA_ASQTAD_FAT_LINKS: - gaugeFatPrecise = precise; - gaugeFatSloppy = sloppy; - gaugeFatPrecondition = precondition; - gaugeFatRefinement = refinement; - gaugeFatEigensolver = eigensolver; - - if (param->overlap) { - if (gaugeFatExtended) errorQuda("Extended gauge fat field already allocated"); - gaugeFatExtended = extended; - } - break; - case QUDA_ASQTAD_LONG_LINKS: - gaugeLongPrecise = precise; - gaugeLongSloppy = sloppy; - gaugeLongPrecondition = precondition; - gaugeLongRefinement = refinement; - gaugeLongEigensolver = eigensolver; - - if (param->overlap) { - if (gaugeLongExtended) errorQuda("Extended gauge long field already allocated"); - gaugeLongExtended = extended; - } - break; - default: errorQuda("Invalid gauge type %d", param->type); + case QUDA_WILSON_LINKS: + gaugePrecise = precise; + gaugeSloppy = sloppy; + gaugePrecondition = precondition; + gaugeRefinement = refinement; + gaugeEigensolver = eigensolver; + + if(param->overlap) gaugeExtended = extended; + break; + case QUDA_ASQTAD_FAT_LINKS: + gaugeFatPrecise = precise; + gaugeFatSloppy = sloppy; + gaugeFatPrecondition = precondition; + gaugeFatRefinement = refinement; + gaugeFatEigensolver = eigensolver; + + if(param->overlap){ + if(gaugeFatExtended) errorQuda("Extended gauge fat field already allocated"); + gaugeFatExtended = extended; + } + break; + case QUDA_ASQTAD_LONG_LINKS: + gaugeLongPrecise = precise; + gaugeLongSloppy = sloppy; + gaugeLongPrecondition = precondition; + gaugeLongRefinement = refinement; + gaugeLongEigensolver = eigensolver; + + if(param->overlap){ + if(gaugeLongExtended) errorQuda("Extended gauge long field already allocated"); + 
gaugeLongExtended = extended; + } + break; + default: + errorQuda("Invalid gauge type %d", param->type); } profileGauge.TPSTART(QUDA_PROFILE_FREE); @@ -1003,6 +1016,7 @@ void loadSloppyCloverQuda(const QudaPrecision *prec) cloverEigensolver->copy(*cloverPrecise); } } + } // just free the sloppy fields used in mixed-precision solvers @@ -1351,7 +1365,8 @@ void freeCloverQuda(void) void flushChronoQuda(int i) { - if (i >= QUDA_MAX_CHRONO) errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO); + if (i >= QUDA_MAX_CHRONO) + errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO); chronoResident[i].clear(); } @@ -1369,7 +1384,7 @@ void endQuda(void) solutionResident.clear(); - if (momResident) delete momResident; + if(momResident) delete momResident; LatticeField::freeGhostBuffer(); ColorSpinorField::freeGhostBuffer(); @@ -1446,17 +1461,23 @@ void endQuda(void) device::destroy(); } -namespace quda -{ + +namespace quda { void setDiracParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) { double kappa = inv_param->kappa; - if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { kappa *= gaugePrecise->Anisotropy(); } + if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { + kappa *= gaugePrecise->Anisotropy(); + } switch (inv_param->dslash_type) { - case QUDA_WILSON_DSLASH: diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC; break; - case QUDA_CLOVER_WILSON_DSLASH: diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC; break; + case QUDA_WILSON_DSLASH: + diracParam.type = pc ? QUDA_WILSONPC_DIRAC : QUDA_WILSON_DIRAC; + break; + case QUDA_CLOVER_WILSON_DSLASH: + diracParam.type = pc ? QUDA_CLOVERPC_DIRAC : QUDA_CLOVER_DIRAC; + break; case QUDA_CLOVER_HASENBUSCH_TWIST_DSLASH: diracParam.type = pc ? 
QUDA_CLOVER_HASENBUSCH_TWISTPC_DIRAC : QUDA_CLOVER_HASENBUSCH_TWIST_DIRAC; break; @@ -1487,7 +1508,7 @@ namespace quda break; case QUDA_MOBIUS_DWF_DSLASH: if (inv_param->Ls > QUDA_MAX_DWF_LS) - errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS); + errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS); diracParam.type = pc ? QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC : QUDA_MOBIUS_DOMAIN_WALL_DIRAC; diracParam.Ls = inv_param->Ls; if (sizeof(Complex) != sizeof(double _Complex)) { @@ -1499,41 +1520,48 @@ namespace quda printfQuda("Printing b_5 and c_5 values\n"); for (int i = 0; i < diracParam.Ls; i++) { printfQuda("fromQUDA diracParam: b5[%d] = %f + i%f, c5[%d] = %f + i%f\n", i, diracParam.b_5[i].real(), - diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag()); + diracParam.b_5[i].imag(), i, diracParam.c_5[i].real(), diracParam.c_5[i].imag()); // printfQuda("fromQUDA inv_param: b5[%d] = %f %f c5[%d] = %f %f\n", i, inv_param->b_5[i], i, // inv_param->c_5[i] ); printfQuda("fromQUDA creal: b5[%d] = %f %f c5[%d] = %f %f \n", i, // creal(inv_param->b_5[i]), cimag(inv_param->b_5[i]), i, creal(inv_param->c_5[i]), cimag(inv_param->c_5[i]) ); } } break; - case QUDA_STAGGERED_DSLASH: diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC; break; - case QUDA_ASQTAD_DSLASH: diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC; break; + case QUDA_STAGGERED_DSLASH: + diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC; + break; + case QUDA_ASQTAD_DSLASH: + diracParam.type = pc ? QUDA_ASQTADPC_DIRAC : QUDA_ASQTAD_DIRAC; + break; case QUDA_TWISTED_MASS_DSLASH: diracParam.type = pc ? 
QUDA_TWISTED_MASSPC_DIRAC : QUDA_TWISTED_MASS_DIRAC; if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) { - diracParam.Ls = 1; - diracParam.epsilon = 0.0; + diracParam.Ls = 1; + diracParam.epsilon = 0.0; } else { - diracParam.Ls = 2; - diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; + diracParam.Ls = 2; + diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; } break; case QUDA_TWISTED_CLOVER_DSLASH: diracParam.type = pc ? QUDA_TWISTED_CLOVERPC_DIRAC : QUDA_TWISTED_CLOVER_DIRAC; - if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) { - diracParam.Ls = 1; - diracParam.epsilon = 0.0; + if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) { + diracParam.Ls = 1; + diracParam.epsilon = 0.0; } else { - diracParam.Ls = 2; - diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; + diracParam.Ls = 2; + diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; } break; case QUDA_LAPLACE_DSLASH: diracParam.type = pc ? 
QUDA_GAUGE_LAPLACEPC_DIRAC : QUDA_GAUGE_LAPLACE_DIRAC; diracParam.laplace3D = inv_param->laplace3D; break; - case QUDA_COVDEV_DSLASH: diracParam.type = QUDA_GAUGE_COVDEV_DIRAC; break; - default: errorQuda("Unsupported dslash_type %d", inv_param->dslash_type); + case QUDA_COVDEV_DSLASH: + diracParam.type = QUDA_GAUGE_COVDEV_DIRAC; + break; + default: + errorQuda("Unsupported dslash_type %d", inv_param->dslash_type); } diracParam.matpcType = inv_param->matpc_type; @@ -1548,7 +1576,7 @@ namespace quda diracParam.mu = inv_param->mu; diracParam.tm_rho = inv_param->tm_rho; - for (int i = 0; i < 4; i++) diracParam.commDim[i] = 1; // comms are always on + for (int i=0; i<4; i++) diracParam.commDim[i] = 1; // comms are always on if (diracParam.gauge->Precision() != inv_param->cuda_prec) errorQuda("Gauge precision %d does not match requested precision %d\n", diracParam.gauge->Precision(), @@ -1557,6 +1585,7 @@ namespace quda diracParam.use_mobius_fused_kernel = inv_param->use_mobius_fused_kernel; } + void setDiracSloppyParam(DiracParam &diracParam, QudaInvertParam *inv_param, const bool pc) { setDiracParam(diracParam, inv_param, pc); @@ -1566,8 +1595,8 @@ namespace quda diracParam.longGauge = gaugeLongSloppy; diracParam.clover = cloverSloppy; - for (int i = 0; i < 4; i++) { - diracParam.commDim[i] = 1; // comms are always on + for (int i=0; i<4; i++) { + diracParam.commDim[i] = 1; // comms are always on } if (diracParam.gauge->Precision() != inv_param->cuda_prec_sloppy) @@ -1584,8 +1613,8 @@ namespace quda diracParam.longGauge = gaugeLongRefinement; diracParam.clover = cloverRefinement; - for (int i = 0; i < 4; i++) { - diracParam.commDim[i] = 1; // comms are always on + for (int i=0; i<4; i++) { + diracParam.commDim[i] = 1; // comms are always on } if (diracParam.gauge->Precision() != inv_param->cuda_prec_refinement_sloppy) @@ -1609,13 +1638,15 @@ namespace quda } diracParam.clover = cloverPrecondition; - for (int i = 0; i < 4; i++) { diracParam.commDim[i] = comms ? 
1 : 0; } + for (int i=0; i<4; i++) { + diracParam.commDim[i] = comms ? 1 : 0; + } // In the preconditioned staggered CG allow a different dslash type in the preconditioning - if (inv_param->inv_type == QUDA_PCG_INVERTER && inv_param->dslash_type == QUDA_ASQTAD_DSLASH - && inv_param->dslash_type_precondition == QUDA_STAGGERED_DSLASH) { - diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC; - diracParam.gauge = gaugeFatPrecondition; + if(inv_param->inv_type == QUDA_PCG_INVERTER && inv_param->dslash_type == QUDA_ASQTAD_DSLASH + && inv_param->dslash_type_precondition == QUDA_STAGGERED_DSLASH) { + diracParam.type = pc ? QUDA_STAGGEREDPC_DIRAC : QUDA_STAGGERED_DIRAC; + diracParam.gauge = gaugeFatPrecondition; } if (diracParam.gauge->Precision() != inv_param->cuda_prec_precondition) @@ -1715,7 +1746,7 @@ namespace quda void massRescale(ColorSpinorField &b, QudaInvertParam ¶m, bool for_multishift) { - double kappa5 = (0.5 / (5.0 + param.m5)); + double kappa5 = (0.5/(5.0 + param.m5)); double kappa = (param.dslash_type == QUDA_DOMAIN_WALL_DSLASH || param.dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || param.dslash_type == QUDA_MOBIUS_DWF_DSLASH || param.dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) ? 
kappa5 : @@ -1728,15 +1759,16 @@ namespace quda // staggered dslash uses mass normalization internally if (param.dslash_type == QUDA_ASQTAD_DSLASH || param.dslash_type == QUDA_STAGGERED_DSLASH) { switch (param.solution_type) { - case QUDA_MAT_SOLUTION: - case QUDA_MATPC_SOLUTION: - if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(2.0 * param.mass, b); - break; - case QUDA_MATDAG_MAT_SOLUTION: - case QUDA_MATPCDAG_MATPC_SOLUTION: - if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(4.0 * param.mass * param.mass, b); - break; - default: errorQuda("Not implemented"); + case QUDA_MAT_SOLUTION: + case QUDA_MATPC_SOLUTION: + if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(2.0*param.mass, b); + break; + case QUDA_MATDAG_MAT_SOLUTION: + case QUDA_MATPCDAG_MATPC_SOLUTION: + if (param.mass_normalization == QUDA_KAPPA_NORMALIZATION) blas::ax(4.0*param.mass*param.mass, b); + break; + default: + errorQuda("Not implemented"); } return; } @@ -1744,50 +1776,51 @@ namespace quda // multiply the source to compensate for normalization of the Dirac operator, if necessary // you are responsible for restoring what's in param.offset switch (param.solution_type) { - case QUDA_MAT_SOLUTION: - if (param.mass_normalization == QUDA_MASS_NORMALIZATION - || param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(2.0 * kappa, b); - if (for_multishift) - for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa; - } - break; - case QUDA_MATDAG_MAT_SOLUTION: - if (param.mass_normalization == QUDA_MASS_NORMALIZATION - || param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(4.0 * kappa * kappa, b); - if (for_multishift) - for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; - } - break; - case QUDA_MATPC_SOLUTION: - if (param.mass_normalization == QUDA_MASS_NORMALIZATION) { - blas::ax(4.0 * kappa * kappa, b); - if (for_multishift) - for (int i = 0; i < 
param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; - } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(2.0 * kappa, b); - if (for_multishift) - for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa; - } - break; - case QUDA_MATPCDAG_MATPC_SOLUTION: - if (param.mass_normalization == QUDA_MASS_NORMALIZATION) { - blas::ax(16.0 * std::pow(kappa, 4), b); - if (for_multishift) - for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 16.0 * std::pow(kappa, 4); - } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(4.0 * kappa * kappa, b); - if (for_multishift) - for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; - } - break; - default: errorQuda("Solution type %d not supported", param.solution_type); + case QUDA_MAT_SOLUTION: + if (param.mass_normalization == QUDA_MASS_NORMALIZATION || + param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(2.0*kappa, b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa; + } + break; + case QUDA_MATDAG_MAT_SOLUTION: + if (param.mass_normalization == QUDA_MASS_NORMALIZATION || + param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(4.0*kappa*kappa, b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; + } + break; + case QUDA_MATPC_SOLUTION: + if (param.mass_normalization == QUDA_MASS_NORMALIZATION) { + blas::ax(4.0*kappa*kappa, b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; + } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(2.0*kappa, b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa; + } + break; + case QUDA_MATPCDAG_MATPC_SOLUTION: + if (param.mass_normalization == QUDA_MASS_NORMALIZATION) { + 
blas::ax(16.0*std::pow(kappa,4), b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 16.0 * std::pow(kappa, 4); + } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(4.0*kappa*kappa, b); + if (for_multishift) + for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; + } + break; + default: + errorQuda("Solution type %d not supported", param.solution_type); } logQuda(QUDA_DEBUG_VERBOSE, "Mass rescale: norm of source out = %g\n", blas::norm2(b)); } -} // namespace quda +} void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) { @@ -1799,8 +1832,7 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH) || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) errorQuda("Gauge field not allocated"); - if (cloverPrecise == nullptr - && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) + if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); pushVerbosity(inv_param->verbosity); @@ -1831,9 +1863,10 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); - if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION - && (inv_param->dslash_type == QUDA_STAGGERED_DSLASH || inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) - blas::ax(1.0 / (2.0 * inv_param->mass), in); + if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION && + (inv_param->dslash_type == QUDA_STAGGERED_DSLASH || + inv_param->dslash_type == QUDA_ASQTAD_DSLASH) ) + blas::ax(1.0/(2.0*inv_param->mass), in); if 
(inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { if (parity == QUDA_EVEN_PARITY) { @@ -1848,8 +1881,8 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) { cudaParam.create = QUDA_NULL_FIELD_CREATE; ColorSpinorField tmp1(cudaParam); - ((DiracTwistedCloverPC *)dirac)->TwistCloverInv(tmp1, in, (parity + 1) % 2); // apply the clover-twist - dirac->Dslash(out, tmp1, parity); // apply the operator + ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist + dirac->Dslash(out, tmp1, parity); // apply the operator } else if (inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) { dirac->Dslash4(out, in, parity); @@ -1873,6 +1906,7 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity profileDslash.TPSTOP(QUDA_PROFILE_TOTAL); } + // #if 0 void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) { @@ -1959,6 +1993,7 @@ void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaPar } // #endif + void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) { pushVerbosity(inv_param->verbosity); @@ -1968,12 +2003,12 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH) || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) errorQuda("Gauge field not allocated"); - if (cloverPrecise == nullptr - && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) + if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); if 
(getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); - bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || + inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location); ColorSpinorField in_h(cpuParam); @@ -1992,20 +2027,20 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) setDiracParam(diracParam, inv_param, pc); Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator - dirac->M(out, in); // apply the operator - delete dirac; // clean up + dirac->M(out, in); // apply the operator + delete dirac; // clean up double kappa = inv_param->kappa; if (pc) { if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) { - blas::ax(0.25 / (kappa * kappa), out); + blas::ax(0.25/(kappa*kappa), out); } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(0.5 / kappa, out); + blas::ax(0.5/kappa, out); } } else { - if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION - || inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(0.5 / kappa, out); + if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION || + inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(0.5/kappa, out); } } @@ -2019,6 +2054,7 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) popVerbosity(); } + void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) { pushVerbosity(inv_param->verbosity); @@ -2028,12 +2064,12 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH) || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) errorQuda("Gauge field not allocated"); - if (cloverPrecise 
== nullptr - && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) + if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) errorQuda("Clover field not allocated"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); - bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || + inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), pc, inv_param->input_location); ColorSpinorField in_h(cpuParam); @@ -2054,20 +2090,20 @@ void MatDagMatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) setDiracParam(diracParam, inv_param, pc); Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator - dirac->MdagM(out, in); // apply the operator - delete dirac; // clean up + dirac->MdagM(out, in); // apply the operator + delete dirac; // clean up double kappa = inv_param->kappa; if (pc) { if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) { - blas::ax(1.0 / std::pow(2.0 * kappa, 4), out); + blas::ax(1.0/std::pow(2.0*kappa,4), out); } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(0.25 / (kappa * kappa), out); + blas::ax(0.25/(kappa*kappa), out); } } else { - if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION - || inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(0.25 / (kappa * kappa), out); + if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION || + inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(0.25/(kappa*kappa), out); } } @@ -2096,10 +2132,11 @@ namespace quda } // namespace quda -void checkClover(QudaInvertParam *param) -{ +void checkClover(QudaInvertParam *param) 
{ - if (param->dslash_type != QUDA_CLOVER_WILSON_DSLASH && param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) { return; } + if (param->dslash_type != QUDA_CLOVER_WILSON_DSLASH && param->dslash_type != QUDA_TWISTED_CLOVER_DSLASH) { + return; + } if (param->cuda_prec != cloverPrecise->Precision()) { errorQuda("Solve precision %d doesn't match clover precision %d", param->cuda_prec, cloverPrecise->Precision()); @@ -2239,12 +2276,10 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity DiracParam diracParam; setDiracParam(diracParam, inv_param, pc); - // FIXME: Do we need this for twisted clover??? + //FIXME: Do we need this for twisted clover??? DiracCloverPC dirac(diracParam); // create the Dirac operator - if (!inverse) - dirac.Clover(out, in, parity); // apply the clover operator - else - dirac.CloverInv(out, in, parity); + if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator + else dirac.CloverInv(out, in, parity); cpuParam.v = h_out; cpuParam.location = inv_param->output_location; @@ -2436,8 +2471,8 @@ void eigensolveQuda(void **host_evecs, double _Complex *host_evals, QudaEigParam profileEigensolve.TPSTOP(QUDA_PROFILE_TOTAL); } -multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile) : profile(profile) -{ +multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &profile) + : profile(profile) { profile.TPSTART(QUDA_PROFILE_INIT); QudaInvertParam *param = mg_param.invert_param; // set whether we are going use native or generic blas @@ -2449,20 +2484,22 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr // check MG params (needs to go somewhere else) if (mg_param.n_level > QUDA_MAX_MG_LEVEL) errorQuda("Requested MG levels %d greater than allowed maximum %d", mg_param.n_level, QUDA_MAX_MG_LEVEL); - for (int i = 0; i < mg_param.n_level; i++) { + for (int i=0; isolve_type != QUDA_DIRECT_SOLVE) errorQuda("Outer MG solver can only use 
QUDA_DIRECT_SOLVE at present"); + if (param->solve_type != QUDA_DIRECT_SOLVE) + errorQuda("Outer MG solver can only use QUDA_DIRECT_SOLVE at present"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaMultigridParam(&mg_param); mg_param.secs = 0; mg_param.gflops = 0; - bool pc_solution - = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || + (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); - bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE); + bool outer_pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || + (param->solve_type == QUDA_NORMOP_PC_SOLVE); // create the dirac operators for the fine grid @@ -2474,8 +2511,8 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr // this is the Dirac operator we use for smoothing DiracParam diracSmoothParam; - bool fine_grid_pc_solve = (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) - || (mg_param.smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE); + bool fine_grid_pc_solve = (mg_param.smoother_solve_type[0] == QUDA_DIRECT_PC_SOLVE) || + (mg_param.smoother_solve_type[0] == QUDA_NORMOP_PC_SOLVE); setDiracSloppyParam(diracSmoothParam, param, fine_grid_pc_solve); diracSmoothParam.halo_precision = mg_param.smoother_halo_precision[0]; dSmooth = Dirac::create(diracSmoothParam); @@ -2484,7 +2521,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this) DiracParam diracSmoothSloppyParam; setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, - mg_param.smoother_schwarz_type[0] == QUDA_INVALID_SCHWARZ ? true : false); + mg_param.smoother_schwarz_type[0] == QUDA_INVALID_SCHWARZ ? 
true : false); diracSmoothSloppyParam.halo_precision = mg_param.smoother_halo_precision[0]; dSmoothSloppy = Dirac::create(diracSmoothSloppyParam); @@ -2493,9 +2530,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr ColorSpinorParam csParam(nullptr, *param, cudaGauge->X(), pc_solution, mg_param.setup_location[0]); csParam.create = QUDA_NULL_FIELD_CREATE; QudaPrecision Bprec = mg_param.precision_null[0]; - Bprec - = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : - Bprec); + Bprec = (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION && Bprec < QUDA_SINGLE_PRECISION ? QUDA_SINGLE_PRECISION : Bprec); csParam.setPrecision(Bprec, Bprec, true); if (mg_param.setup_location[0] == QUDA_CPU_FIELD_LOCATION) csParam.fieldOrder = QUDA_SPACE_SPIN_COLOR_FIELD_ORDER; csParam.mem_type = mg_param.setup_minimize_memory == QUDA_BOOLEAN_TRUE ? QUDA_MEMORY_MAPPED : QUDA_MEMORY_DEVICE; @@ -2520,8 +2555,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr profile.TPSTOP(QUDA_PROFILE_INIT); } -void *newMultigridQuda(QudaMultigridParam *mg_param) -{ +void* newMultigridQuda(QudaMultigridParam *mg_param) { profilerStart(__func__); pushVerbosity(mg_param->invert_param->verbosity); @@ -2535,10 +2569,12 @@ void *newMultigridQuda(QudaMultigridParam *mg_param) popVerbosity(); profilerStop(__func__); - return static_cast(mg); + return static_cast(mg); } -void destroyMultigridQuda(void *mg) { delete static_cast(mg); } +void destroyMultigridQuda(void *mg) { + delete static_cast(mg); +} void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) { @@ -2549,7 +2585,7 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) profileInvert.TPSTART(QUDA_PROFILE_TOTAL); profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE); - auto *mg = static_cast(mg_); + auto *mg = static_cast(mg_); checkMultigridParam(mg_param); QudaInvertParam *param = mg_param->invert_param; 
@@ -2559,7 +2595,7 @@ void updateMultigridQuda(void *mg_, QudaMultigridParam *mg_param) // for reporting level 1 is the fine level but internally use level 0 for indexing // sprintf(mg->prefix,"MG level 1 (%s): ", param.location == QUDA_CUDA_FIELD_LOCATION ? "GPU" : "CPU" ); // setOutputPrefix(prefix); - setOutputPrefix("MG level 1 (GPU): "); // fix me + setOutputPrefix("MG level 1 (GPU): "); //fix me // Check if we're doing a thin update only if (mg_param->thin_update_only) { @@ -2661,7 +2697,7 @@ void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param) pushVerbosity(mg_param->invert_param->verbosity); profileInvert.TPSTART(QUDA_PROFILE_TOTAL); - auto *mg = static_cast(mg_); + auto *mg = static_cast(mg_); checkMultigridParam(mg_param); checkGauge(mg_param->invert_param); @@ -2672,9 +2708,8 @@ void dumpMultigridQuda(void *mg_, QudaMultigridParam *mg_param) profilerStop(__func__); } -deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) : - d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) -{ +deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) + : d(nullptr), m(nullptr), RV(nullptr), deflParam(nullptr), defl(nullptr), profile(profile) { QudaInvertParam *param = eig_param.invert_param; @@ -2683,52 +2718,49 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) profile.TPSTART(QUDA_PROFILE_INIT); cudaGaugeField *cudaGauge = checkGauge(param); - eig_param.secs = 0; + eig_param.secs = 0; eig_param.gflops = 0; DiracParam diracParam; - if (eig_param.cuda_prec_ritz == param->cuda_prec) { - setDiracParam(diracParam, param, - (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE)); + if(eig_param.cuda_prec_ritz == param->cuda_prec) + { + setDiracParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE)); } else { - setDiracSloppyParam(diracParam, param, - 
(param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE)); + setDiracSloppyParam(diracParam, param, (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE)); } const bool pc_solve = (param->solve_type == QUDA_NORMOP_PC_SOLVE); d = Dirac::create(diracParam); - m = pc_solve ? static_cast(new DiracMdagM(*d)) : static_cast(new DiracM(*d)); + m = pc_solve ? static_cast( new DiracMdagM(*d) ) : static_cast( new DiracM(*d)); ColorSpinorParam ritzParam(nullptr, *param, cudaGauge->X(), pc_solve, eig_param.location); - ritzParam.create = QUDA_ZERO_FIELD_CREATE; - ritzParam.is_composite = true; - ritzParam.is_component = false; + ritzParam.create = QUDA_ZERO_FIELD_CREATE; + ritzParam.is_composite = true; + ritzParam.is_component = false; ritzParam.composite_dim = param->n_ev * param->deflation_grid; ritzParam.setPrecision(param->cuda_prec_ritz); - if (ritzParam.location == QUDA_CUDA_FIELD_LOCATION) { + if (ritzParam.location==QUDA_CUDA_FIELD_LOCATION) { ritzParam.setPrecision(param->cuda_prec_ritz, param->cuda_prec_ritz, true); // set native field order if (ritzParam.nSpin != 1) ritzParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; - // select memory location here, by default ritz vectors will be allocated on the device - // but if not sufficient device memory, then the user may choose mapped type of memory + //select memory location here, by default ritz vectors will be allocated on the device + //but if not sufficient device memory, then the user may choose mapped type of memory ritzParam.mem_type = eig_param.mem_type_ritz; - } else { // host location + } else { //host location ritzParam.mem_type = QUDA_MEMORY_PINNED; } int ritzVolume = 1; - for (int d = 0; d < ritzParam.nDim; d++) ritzVolume *= ritzParam.x[d]; + for(int d = 0; d < ritzParam.nDim; d++) ritzVolume *= ritzParam.x[d]; if (getVerbosity() == QUDA_DEBUG_VERBOSE) { - size_t byte_estimate = (size_t)ritzParam.composite_dim * (size_t)ritzVolume - * 
(ritzParam.nColor * ritzParam.nSpin * ritzParam.Precision()); + size_t byte_estimate = (size_t)ritzParam.composite_dim*(size_t)ritzVolume*(ritzParam.nColor*ritzParam.nSpin*ritzParam.Precision()); printfQuda("allocating bytes: %lu (lattice volume %d, prec %d)", byte_estimate, ritzVolume, ritzParam.Precision()); - if (ritzParam.mem_type == QUDA_MEMORY_DEVICE) - printfQuda("Using device memory type.\n"); + if(ritzParam.mem_type == QUDA_MEMORY_DEVICE) printfQuda("Using device memory type.\n"); else if (ritzParam.mem_type == QUDA_MEMORY_MAPPED) printfQuda("Using mapped memory type.\n"); } @@ -2742,8 +2774,7 @@ deflated_solver::deflated_solver(QudaEigParam &eig_param, TimeProfile &profile) profile.TPSTOP(QUDA_PROFILE_INIT); } -void *newDeflationQuda(QudaEigParam *eig_param) -{ +void* newDeflationQuda(QudaEigParam *eig_param) { profileInvert.TPSTART(QUDA_PROFILE_TOTAL); auto *defl = new deflated_solver(*eig_param, profileInvert); @@ -2751,10 +2782,12 @@ void *newDeflationQuda(QudaEigParam *eig_param) saveProfile(__func__); flushProfile(); - return static_cast(defl); + return static_cast(defl); } -void destroyDeflationQuda(void *df) { delete static_cast(df); } +void destroyDeflationQuda(void *df) { + delete static_cast(df); +} void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) { @@ -2776,13 +2809,16 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) // solve_type and solution_type, rather than in separate members of QudaInvertParam. We're stuck with it // for now, though, so here we factorize everything for convenience. 
- bool pc_solution - = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); - bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE) - || (param->solve_type == QUDA_NORMERR_PC_SOLVE); - bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION); - bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE); - bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE); + bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || + (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || + (param->solve_type == QUDA_NORMOP_PC_SOLVE) || (param->solve_type == QUDA_NORMERR_PC_SOLVE); + bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || + (param->solution_type == QUDA_MATPC_SOLUTION); + bool direct_solve = (param->solve_type == QUDA_DIRECT_SOLVE) || + (param->solve_type == QUDA_DIRECT_PC_SOLVE); + bool norm_error_solve = (param->solve_type == QUDA_NORMERR_SOLVE) || + (param->solve_type == QUDA_NORMERR_PC_SOLVE); param->secs = 0; param->gflops = 0; @@ -2845,13 +2881,13 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) if (param->use_init_guess == QUDA_USE_INIT_GUESS_YES && !param->chrono_use_resident) { // download initial guess // initial guess only supported for single-pass solvers - if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) - && (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) { + if ((param->solution_type == QUDA_MATDAG_MAT_SOLUTION || param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION) && + (param->solve_type == QUDA_DIRECT_SOLVE || param->solve_type == QUDA_DIRECT_PC_SOLVE)) { errorQuda("Initial 
guess not supported for two-pass solver"); } x = h_x; // solution - } else { // zero initial guess + } else { // zero initial guess blas::zero(x); } @@ -2868,7 +2904,7 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) profileInvert.TPSTART(QUDA_PROFILE_PREAMBLE); double nb = blas::norm2(b); - if (nb == 0.0) errorQuda("Source has zero norm"); + if (nb==0.0) errorQuda("Source has zero norm"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Source: CPU = %g, CUDA copy = %g\n", blas::norm2(h_b), nb); @@ -2923,19 +2959,23 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) // taken care of by Dirac::prepare() and Dirac::reconstruct(), // respectively. - if (pc_solution && !pc_solve) { errorQuda("Preconditioned (PC) solution_type requires a PC solve_type"); } + if (pc_solution && !pc_solve) { + errorQuda("Preconditioned (PC) solution_type requires a PC solve_type"); + } if (!mat_solution && !pc_solution && pc_solve) { errorQuda("Unpreconditioned MATDAG_MAT solution_type requires an unpreconditioned solve_type"); } - if (!mat_solution && norm_error_solve) { errorQuda("Normal-error solve requires Mat solution"); } + if (!mat_solution && norm_error_solve) { + errorQuda("Normal-error solve requires Mat solution"); + } if (param->inv_type_precondition == QUDA_MG_INVERTER && (!direct_solve || !mat_solution)) { errorQuda("Multigrid preconditioning only supported for direct solves"); } - if (param->chrono_use_resident && (norm_error_solve)) { + if (param->chrono_use_resident && ( norm_error_solve) ){ errorQuda("Chronological forcasting only presently supported for M^dagger M solver"); } @@ -3041,8 +3081,8 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) ColorSpinorField tmp(*out); SolverParam solverParam(*param); Solver *solve = Solver::create(solverParam, m, mSloppy, mPre, mEig, profileInvert); - (*solve)(tmp, *in); // y = (M M^\dag) b - dirac.Mdag(*out, tmp); // x = M^dag y + (*solve)(tmp, *in); // y = (M M^\dag) b + 
dirac.Mdag(*out, tmp); // x = M^dag y delete solve; solverParam.updateInvertParam(*param); } @@ -3051,21 +3091,21 @@ void invertQuda(void *hp_x, void *hp_b, QudaInvertParam *param) profileInvert.TPSTART(QUDA_PROFILE_EPILOGUE); if (param->chrono_make_resident) { - if (param->chrono_max_dim < 1) { + if(param->chrono_max_dim < 1){ errorQuda("Cannot chrono_make_resident with chrono_max_dim %i", param->chrono_max_dim); } const int i = param->chrono_index; - if (i >= QUDA_MAX_CHRONO) errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO); + if (i >= QUDA_MAX_CHRONO) + errorQuda("Requested chrono index %d is outside of max %d\n", i, QUDA_MAX_CHRONO); auto &basis = chronoResident[i]; if (param->chrono_max_dim < (int)basis.size()) { - errorQuda("Requested chrono_max_dim %i is smaller than already existing chronology %lu", param->chrono_max_dim, - basis.size()); + errorQuda("Requested chrono_max_dim %i is smaller than already existing chronology %lu", param->chrono_max_dim, basis.size()); } - if (not param->chrono_replace_last) { + if(not param->chrono_replace_last){ // if we have not filled the space yet just augment if ((int)basis.size() < param->chrono_max_dim) { ColorSpinorParam cs_param(*out); @@ -3550,13 +3590,13 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) pushVerbosity(param->verbosity); - bool pc_solution - = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + bool pc_solution = (param->solution_type == QUDA_MATPC_SOLUTION) || (param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); bool pc_solve = (param->solve_type == QUDA_DIRECT_PC_SOLVE) || (param->solve_type == QUDA_NORMOP_PC_SOLVE); - bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION); + bool mat_solution = (param->solution_type == QUDA_MAT_SOLUTION) || (param->solution_type == QUDA_MATPC_SOLUTION); bool direct_solve = (param->solve_type 
== QUDA_DIRECT_SOLVE) || (param->solve_type == QUDA_DIRECT_PC_SOLVE); - if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { + if (param->dslash_type == QUDA_ASQTAD_DSLASH || + param->dslash_type == QUDA_STAGGERED_DSLASH) { if (param->solution_type != QUDA_MATPC_SOLUTION) { errorQuda("For Staggered-type fermions, multi-shift solver only suports MATPC solution type"); @@ -3578,8 +3618,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) errorQuda("For Wilson-type fermions, preconditioned (PC) solution_type requires a PC solve_type"); } if (!pc_solution & pc_solve) { - errorQuda("For Wilson-type fermions, in multi-shift solver, a preconditioned (PC) solve_type requires a PC " - "solution_type"); + errorQuda("For Wilson-type fermions, in multi-shift solver, a preconditioned (PC) solve_type requires a PC solution_type"); } } @@ -3588,9 +3627,10 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) param->gflops = 0; param->iter = 0; - for (int i = 0; i < param->num_offset - 1; i++) { - for (int j = i + 1; j < param->num_offset; j++) { - if (param->offset[i] > param->offset[j]) errorQuda("Offsets must be ordered from smallest to largest"); + for (int i=0; inum_offset-1; i++) { + for (int j=i+1; jnum_offset; j++) { + if (param->offset[i] > param->offset[j]) + errorQuda("Offsets must be ordered from smallest to largest"); } } @@ -3601,8 +3641,9 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) // Balint: Isn't there a nice construction pattern we could use here? This is // expedient but yucky. 
// DiracParam diracParam; - if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { - param->mass = sqrt(param->offset[0] / 4); + if (param->dslash_type == QUDA_ASQTAD_DSLASH || + param->dslash_type == QUDA_STAGGERED_DSLASH){ + param->mass = sqrt(param->offset[0]/4); } Dirac *d = nullptr; @@ -3634,7 +3675,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) h_x.resize(param->num_offset); cpuParam.location = param->output_location; - for (int i = 0; i < param->num_offset; i++) { + for(int i=0; i < param->num_offset; i++) { cpuParam.v = hp_x[i]; h_x[i] = std::make_unique(cpuParam); } @@ -3676,7 +3717,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) // Check source norms double nb = blas::norm2(b); - if (nb == 0.0) errorQuda("Source has zero norm"); + if (nb==0.0) errorQuda("Source has zero norm"); if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { printfQuda("Source: CPU = %g, CUDA copy = %g\n", blas::norm2(h_b), nb); @@ -3697,7 +3738,8 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) DiracMatrix *m, *mSloppy; - if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { + if (param->dslash_type == QUDA_ASQTAD_DSLASH || + param->dslash_type == QUDA_STAGGERED_DSLASH) { m = new DiracM(dirac); mSloppy = new DiracM(diracSloppy); } else { @@ -3729,38 +3771,42 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) #define REFINE_INCREASING_MASS #ifdef REFINE_INCREASING_MASS - for (int i = 0; i < param->num_offset; i++) { + for(int i=0; i < param->num_offset; i++) { #else - for (int i = param->num_offset - 1; i >= 0; i--) { + for(int i=param->num_offset-1; i >= 0; i--) { #endif - double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? param->true_res_hq_offset[i] : 0; - double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? 
param->tol_hq_offset[i] : 0; + double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? + param->true_res_hq_offset[i] : 0; + double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? + param->tol_hq_offset[i] : 0; /* - In the case where the shifted systems have zero tolerance - specified, we refine these systems until either the limit of - precision is reached (prec_tol) or until the tolerance reaches - the iterated residual tolerance of the previous multi-shift - solver (iter_res_offset[i]), which ever is greater. + In the case where the shifted systems have zero tolerance + specified, we refine these systems until either the limit of + precision is reached (prec_tol) or until the tolerance reaches + the iterated residual tolerance of the previous multi-shift + solver (iter_res_offset[i]), which ever is greater. */ - const double prec_tol = std::pow(10., (-2 * (int)param->cuda_prec + 4)); // implicit refinment limit of 1e-12 - const double iter_tol = (param->iter_res_offset[i] < prec_tol ? prec_tol : (param->iter_res_offset[i] * 1.1)); + const double prec_tol = std::pow(10.,(-2*(int)param->cuda_prec+4)); // implicit refinment limit of 1e-12 + const double iter_tol = (param->iter_res_offset[i] < prec_tol ? prec_tol : (param->iter_res_offset[i] *1.1)); const double refine_tol = (param->tol_offset[i] == 0.0 ? 
iter_tol : param->tol_offset[i]); // refine if either L2 or heavy quark residual tolerances have not been met, only if desired residual is > 0 if (param->true_res_offset[i] > refine_tol || rsd_hq > tol_hq) { - if (getVerbosity() >= QUDA_SUMMARIZE) - printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n", i, - param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq); + if (getVerbosity() >= QUDA_SUMMARIZE) + printfQuda("Refining shift %d: L2 residual %e / %e, heavy quark %e / %e (actual / requested)\n", + i, param->true_res_offset[i], param->tol_offset[i], rsd_hq, tol_hq); // for staggered the shift is just a change in mass term (FIXME: for twisted mass also) - if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { - dirac.setMass(sqrt(param->offset[i] / 4)); - diracSloppy.setMass(sqrt(param->offset[i] / 4)); + if (param->dslash_type == QUDA_ASQTAD_DSLASH || + param->dslash_type == QUDA_STAGGERED_DSLASH) { + dirac.setMass(sqrt(param->offset[i]/4)); + diracSloppy.setMass(sqrt(param->offset[i]/4)); } DiracMatrix *m, *mSloppy; - if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { + if (param->dslash_type == QUDA_ASQTAD_DSLASH || + param->dslash_type == QUDA_STAGGERED_DSLASH) { m = new DiracM(dirac); mSloppy = new DiracM(diracSloppy); } else { @@ -3777,9 +3823,9 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) if (false) { // experimenting with Minimum residual extrapolation // only perform MRE using current and previously refined solutions #ifdef REFINE_INCREASING_MASS - const int nRefine = i + 1; + const int nRefine = i+1; #else - const int nRefine = param->num_offset - i + 1; + const int nRefine = param->num_offset - i + 1; #endif cudaParam.create = QUDA_NULL_FIELD_CREATE; @@ -3796,7 +3842,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) bool orthogonal = false; bool apply_mat = 
true; bool hermitian = true; - MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti); + MinResExt mre(*m, orthogonal, apply_mat, hermitian, profileMulti); mre(x[i], b, z, q); } @@ -3809,7 +3855,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) { CG cg(*m, *mSloppy, *mSloppy, *mSloppy, solverParam, profileMulti); - if (i == 0) + if (i==0) cg(x[i], b, &p[i], r2_old[i]); else cg(x[i], b); @@ -3817,11 +3863,12 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) solverParam.true_res_offset[i] = solverParam.true_res; solverParam.true_res_hq_offset[i] = solverParam.true_res_hq; - solverParam.updateInvertParam(*param, i); + solverParam.updateInvertParam(*param,i); - if (param->dslash_type == QUDA_ASQTAD_DSLASH || param->dslash_type == QUDA_STAGGERED_DSLASH) { - dirac.setMass(sqrt(param->offset[0] / 4)); // restore just in case - diracSloppy.setMass(sqrt(param->offset[0] / 4)); // restore just in case + if (param->dslash_type == QUDA_ASQTAD_DSLASH || + param->dslash_type == QUDA_STAGGERED_DSLASH) { + dirac.setMass(sqrt(param->offset[0]/4)); // restore just in case + diracSloppy.setMass(sqrt(param->offset[0]/4)); // restore just in case } delete m; @@ -3842,7 +3889,7 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) param->action[1] = action.imag(); } - for (int i = 0; i < param->num_offset; i++) { + for(int i=0; i < param->num_offset; i++) { if (param->solver_normalization == QUDA_SOURCE_NORMALIZATION) { // rescale the solution blas::ax(sqrt(nb), x[i]); } @@ -3885,14 +3932,14 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, GaugeFieldParam gParam(*param, fatlink, QUDA_GENERAL_LINKS); gParam.location = QUDA_CPU_FIELD_LOCATION; - cpuGaugeField cpuFatLink(gParam); // create the host fatlink + cpuGaugeField cpuFatLink(gParam); // create the host fatlink gParam.gauge = longlink; - cpuGaugeField cpuLongLink(gParam); // create the host longlink + 
cpuGaugeField cpuLongLink(gParam); // create the host longlink gParam.gauge = ulink; cpuGaugeField cpuUnitarizedLink(gParam); gParam.link_type = param->type; gParam.gauge = inlink; - cpuGaugeField cpuInLink(gParam); // create the host sitelink + cpuGaugeField cpuInLink(gParam); // create the host sitelink // create the device fields gParam.location = QUDA_CUDA_FIELD_LOCATION; @@ -3978,8 +4025,8 @@ void computeKSLinkQuda(void *fatlink, void *longlink, void *ulink, void *inlink, profileFatLink.TPSTOP(QUDA_PROFILE_TOTAL); } -int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int *path_length, double *loop_coeff, - int num_paths, int max_length, double eb3, QudaGaugeParam *qudaGaugeParam) +int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length, + double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam) { profileGaugeForce.TPSTART(QUDA_PROFILE_TOTAL); profileGaugeForce.TPSTART(QUDA_PROFILE_INIT); @@ -3992,7 +4039,7 @@ int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int gParam.site_size = qudaGaugeParam->site_size; cpuGaugeField *cpuSiteLink = (!qudaGaugeParam->use_resident_gauge) ? new cpuGaugeField(gParam) : nullptr; - cudaGaugeField *cudaSiteLink = nullptr; + cudaGaugeField* cudaSiteLink = nullptr; if (qudaGaugeParam->use_resident_gauge) { if (!gaugePrecise) errorQuda("No resident gauge field to use"); @@ -4022,9 +4069,9 @@ int computeGaugeForceQuda(void *mom, void *siteLink, int ***input_path_buf, int gParamMom.site_offset = qudaGaugeParam->mom_offset; gParamMom.site_size = qudaGaugeParam->site_size; - cpuGaugeField *cpuMom = (!qudaGaugeParam->use_resident_mom) ? new cpuGaugeField(gParamMom) : nullptr; + cpuGaugeField* cpuMom = (!qudaGaugeParam->use_resident_mom) ? 
new cpuGaugeField(gParamMom) : nullptr; - cudaGaugeField *cudaMom = nullptr; + cudaGaugeField* cudaMom = nullptr; if (qudaGaugeParam->use_resident_mom) { if (!momResident) errorQuda("No resident momentum field to use"); cudaMom = momResident; @@ -4267,18 +4314,16 @@ void momResidentQuda(void *mom, QudaGaugeParam *param) profileGaugeForce.TPSTOP(QUDA_PROFILE_TOTAL); } -void createCloverQuda(QudaInvertParam *invertParam) +void createCloverQuda(QudaInvertParam* invertParam) { profileClover.TPSTART(QUDA_PROFILE_TOTAL); if (!cloverPrecise) errorQuda("Clover field not allocated"); - QudaReconstructType recon - = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct(); + QudaReconstructType recon = (gaugePrecise->Reconstruct() == QUDA_RECONSTRUCT_8) ? QUDA_RECONSTRUCT_12 : gaugePrecise->Reconstruct(); // for clover we optimize to only send depth 1 halos in y/z/t (FIXME - make work for x, make robust in general) lat_dim_t R; - for (int d = 0; d < 4; d++) R[d] = (d == 0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d)); - cudaGaugeField *gauge = extendedGaugeResident ? extendedGaugeResident : - createExtendedGauge(*gaugePrecise, R, profileClover, false, recon); + for (int d=0; d<4; d++) R[d] = (d==0 ? 2 : 1) * (redundant_comms || commDimPartitioned(d)); + cudaGaugeField *gauge = extendedGaugeResident ? 
extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profileClover, false, recon); profileClover.TPSTART(QUDA_PROFILE_INIT); @@ -4311,7 +4356,7 @@ void createCloverQuda(QudaInvertParam *invertParam) extendedGaugeResident = gauge; } -void *createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param) +void* createGaugeFieldQuda(void* gauge, int geometry, QudaGaugeParam* param) { GaugeFieldParam gParam(*param, gauge, QUDA_GENERAL_LINKS); gParam.geometry = static_cast(geometry); @@ -4323,7 +4368,7 @@ void *createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param) gParam.order = QUDA_FLOAT2_GAUGE_ORDER; gParam.create = QUDA_ZERO_FIELD_CREATE; - auto *cudaGauge = new cudaGaugeField(gParam); + auto* cudaGauge = new cudaGaugeField(gParam); if (gauge) { cudaGauge->loadCPUField(*cpuGauge); @@ -4335,7 +4380,7 @@ void *createGaugeFieldQuda(void *gauge, int geometry, QudaGaugeParam *param) void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param) { - auto *cudaGauge = reinterpret_cast(inGauge); + auto* cudaGauge = reinterpret_cast(inGauge); GaugeFieldParam gParam(*param, gauge, QUDA_GENERAL_LINKS); gParam.geometry = cudaGauge->Geometry(); @@ -4346,7 +4391,7 @@ void saveGaugeFieldQuda(void *gauge, void *inGauge, QudaGaugeParam *param) void destroyGaugeFieldQuda(void *gauge) { - auto *g = reinterpret_cast(gauge); + auto* g = reinterpret_cast(gauge); delete g; } @@ -4389,7 +4434,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi qParam.pc_type = QUDA_4D_PC; qParam.setPrecision(gParam.Precision()); qParam.pad = 0; - for (int dir = 0; dir < 4; ++dir) qParam.x[dir] = gParam.x[dir]; + for(int dir=0; dir<4; ++dir) qParam.x[dir] = gParam.x[dir]; qParam.x[4] = 1; qParam.create = QUDA_NULL_FIELD_CREATE; qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; @@ -4407,7 +4452,8 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi } // resident gauge field is required - if 
(!gauge_param->use_resident_gauge || !gaugePrecise) errorQuda("Resident gauge field is required"); + if (!gauge_param->use_resident_gauge || !gaugePrecise) + errorQuda("Resident gauge field is required"); if (!gaugePrecise->StaggeredPhaseApplied()) { errorQuda("Gauge field requires the staggered phase factors to be applied"); @@ -4415,39 +4461,41 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi // check if staggered phase is the desired one if (gauge_param->staggered_phase_type != gaugePrecise->StaggeredPhase()) { - errorQuda("Requested staggered phase %d, but found %d\n", gauge_param->staggered_phase_type, - gaugePrecise->StaggeredPhase()); + errorQuda("Requested staggered phase %d, but found %d\n", + gauge_param->staggered_phase_type, gaugePrecise->StaggeredPhase()); } profileStaggeredForce.TPSTOP(QUDA_PROFILE_H2D); profileStaggeredForce.TPSTART(QUDA_PROFILE_INIT); const int nvector = inv_param->num_offset; - std::vector X(nvector); - for (int i = 0; i < nvector; i++) X[i] = ColorSpinorField::Create(qParam); + std::vector X(nvector); + for ( int i=0; iuse_resident_solution) { if (solutionResident.size() < (unsigned int)nvector) - errorQuda("solutionResident.size() %lu does not match number of shifts %d", solutionResident.size(), nvector); + errorQuda("solutionResident.size() %lu does not match number of shifts %d", + solutionResident.size(), nvector); } // create the staggered operator DiracParam diracParam; - bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); - if (!pc_solve) errorQuda("Preconditioned solve type required not %d\n", inv_param->solve_type); + bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || + (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); + if (!pc_solve) + errorQuda("Preconditioned solve type required not %d\n", inv_param->solve_type); setDiracParam(diracParam, inv_param, pc_solve); Dirac *dirac = Dirac::create(diracParam); 
profileStaggeredForce.TPSTOP(QUDA_PROFILE_INIT); profileStaggeredForce.TPSTART(QUDA_PROFILE_PREAMBLE); - for (int i = 0; i < nvector; i++) { + for (int i=0; iuse_resident_solution) x.Even() = solutionResident[i]; - else - errorQuda("%s requires resident solution", __func__); + else errorQuda("%s requires resident solution", __func__); // set the odd solution component dirac->Dslash(x.Odd(), x.Even(), QUDA_ODD_PARITY); @@ -4465,7 +4513,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi profileStaggeredForce.TPSTART(QUDA_PROFILE_COMPUTE); // compute quark-field outer product - for (int i = 0; i < nvector; i++) { + for (int i=0; iresidue[i], 0.0}; @@ -4497,15 +4545,24 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi profileStaggeredForce.TPSTOP(QUDA_PROFILE_D2H); profileStaggeredForce.TPSTART(QUDA_PROFILE_FREE); - for (int i = 0; i < nvector; i++) delete X[i]; + for (int i=0; iuse_resident_mom) ? new cpuGaugeField(param) : nullptr; + param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; + cpuGaugeField* cpuMom = (!gParam->use_resident_mom) ? 
new cpuGaugeField(param) : nullptr; param.link_type = QUDA_GENERAL_LINKS; param.reconstruct = QUDA_RECONSTRUCT_NO; - param.gauge = (void *)w_link; + param.gauge = (void*)w_link; cpuGaugeField cpuWLink(param); - param.gauge = (void *)v_link; + param.gauge = (void*)v_link; cpuGaugeField cpuVLink(param); - param.gauge = (void *)u_link; + param.gauge = (void*)u_link; cpuGaugeField cpuULink(param); param.create = QUDA_ZERO_FIELD_CREATE; - param.order = QUDA_FLOAT2_GAUGE_ORDER; + param.order = QUDA_FLOAT2_GAUGE_ORDER; param.link_type = QUDA_ASQTAD_MOM_LINKS; param.reconstruct = QUDA_RECONSTRUCT_10; GaugeFieldParam momParam(param); @@ -4573,8 +4629,8 @@ void computeHISQForceQuda(void *const milc_momentum, double dt, const double lev lat_dim_t R = {2 * comm_dim_partitioned(0), 2 * comm_dim_partitioned(1), 2 * comm_dim_partitioned(2), 2 * comm_dim_partitioned(3)}; - for (int dir = 0; dir < 4; ++dir) { - param.x[dir] += 2 * R[dir]; + for (int dir=0; dir<4; ++dir) { + param.x[dir] += 2*R[dir]; param.r[dir] = R[dir]; } @@ -4595,7 +4651,7 @@ void computeHISQForceQuda(void *const milc_momentum, double dt, const double lev qParam.pc_type = QUDA_4D_PC; qParam.setPrecision(oParam.Precision()); qParam.pad = 0; - for (int dir = 0; dir < 4; ++dir) qParam.x[dir] = oParam.x[dir]; + for (int dir=0; dir<4; ++dir) qParam.x[dir] = oParam.x[dir]; // create the device quark field qParam.create = QUDA_NULL_FIELD_CREATE; @@ -4613,7 +4669,7 @@ void computeHISQForceQuda(void *const milc_momentum, double dt, const double lev GaugeField *oprod[2] = {stapleOprod, naikOprod}; // loop over different quark fields - for (int i = 0; i < num_terms; ++i) { + for(int i=0; iloadCPUField(cpuWLink, profileHISQForce); - cudaInForce->exchangeExtendedGhost(R, profileHISQForce, true); - cudaGauge->exchangeExtendedGhost(R, profileHISQForce, true); - cudaOutForce->exchangeExtendedGhost(R, profileHISQForce, true); + cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true); + 
cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true); + cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true); profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); hisqStaplesForce(*cudaOutForce, *cudaInForce, *cudaGauge, act_path_coeff); @@ -4684,7 +4740,7 @@ void computeHISQForceQuda(void *const milc_momentum, double dt, const double lev // Load naik outer product copyExtendedGauge(*cudaInForce, *naikOprod, QUDA_CUDA_FIELD_LOCATION); - cudaInForce->exchangeExtendedGhost(R, profileHISQForce, true); + cudaInForce->exchangeExtendedGhost(R,profileHISQForce,true); delete naikOprod; // Compute Naik three-link term @@ -4693,18 +4749,17 @@ void computeHISQForceQuda(void *const milc_momentum, double dt, const double lev qudaDeviceSynchronize(); profileHISQForce.TPSTOP(QUDA_PROFILE_COMPUTE); - cudaOutForce->exchangeExtendedGhost(R, profileHISQForce, true); + cudaOutForce->exchangeExtendedGhost(R,profileHISQForce,true); // load v-link cudaGauge->loadCPUField(cpuVLink, profileHISQForce); - cudaGauge->exchangeExtendedGhost(R, profileHISQForce, true); + cudaGauge->exchangeExtendedGhost(R,profileHISQForce,true); profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); *num_failures_h = 0; unitarizeForce(*cudaInForce, *cudaOutForce, *cudaGauge, num_failures_d); - if (*num_failures_h > 0) - errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h); + if (*num_failures_h>0) errorQuda("Error in the unitarization component of the hisq fermion force: %d failures\n", *num_failures_h); cudaOutForce->zero(); qudaDeviceSynchronize(); @@ -4722,7 +4777,7 @@ void computeHISQForceQuda(void *const milc_momentum, double dt, const double lev delete cudaInForce; momParam.location = QUDA_CUDA_FIELD_LOCATION; - cudaGaugeField *cudaMom = new cudaGaugeField(momParam); + cudaGaugeField* cudaMom = new cudaGaugeField(momParam); profileHISQForce.TPSTART(QUDA_PROFILE_COMPUTE); hisqCompleteForce(*cudaOutForce, *cudaGauge); @@ -4797,15 +4852,15 @@ void 
computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double qParam.nDim = 4; qParam.setPrecision(fParam.Precision()); qParam.pad = 0; - for (int dir = 0; dir < 4; ++dir) qParam.x[dir] = fParam.x[dir]; + for(int dir=0; dir<4; ++dir) qParam.x[dir] = fParam.x[dir]; // create the device quark field qParam.create = QUDA_NULL_FIELD_CREATE; qParam.fieldOrder = QUDA_FLOAT2_FIELD_ORDER; qParam.gammaBasis = QUDA_UKQCD_GAMMA_BASIS; - std::vector quarkX, quarkP; - for (int i = 0; i < nvector; i++) { + std::vector quarkX, quarkP; + for (int i=0; isolve_type == QUDA_DIRECT_PC_SOLVE) || (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); + bool pc_solve = (inv_param->solve_type == QUDA_DIRECT_PC_SOLVE) || + (inv_param->solve_type == QUDA_NORMOP_PC_SOLVE); DiracParam diracParam; setDiracParam(diracParam, inv_param, pc_solve); diracParam.tmp1 = &tmp; // use as temporary for dirac->M @@ -4828,7 +4884,8 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double if (inv_param->use_resident_solution) { if (solutionResident.size() < (unsigned int)nvector) - errorQuda("solutionResident.size() %lu does not match number of shifts %d", solutionResident.size(), nvector); + errorQuda("solutionResident.size() %lu does not match number of shifts %d", + solutionResident.size(), nvector); } cudaGaugeField &gaugeEx = *extendedGaugeResident; @@ -4842,7 +4899,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double std::vector force_coeff(nvector); // loop over different quark fields - for (int i = 0; i < nvector; i++) { + for(int i=0; icopy(gaugeEx); + u -> copy(gaugeEx); } - computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0 * ck * multiplicity * dt); + computeCloverSigmaTrace(oprod, *cloverPrecise, 2.0*ck*multiplicity*dt); /* Now the U dA/dU terms */ - std::vector> ferm_epsilon(nvector); + std::vector< std::vector > ferm_epsilon(nvector); for (int shift = 0; shift < nvector; shift++) { ferm_epsilon[shift].reserve(2); - 
ferm_epsilon[shift][0] = 2.0 * ck * coeff[shift] * dt; - ferm_epsilon[shift][1] = -kappa2 * 2.0 * ck * coeff[shift] * dt; + ferm_epsilon[shift][0] = 2.0*ck*coeff[shift]*dt; + ferm_epsilon[shift][1] = -kappa2 * 2.0*ck*coeff[shift]*dt; } computeCloverSigmaOprod(oprod, quarkX, quarkP, ferm_epsilon); @@ -4922,7 +4979,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double profileCloverForce.TPSTART(QUDA_PROFILE_FREE); - for (int i = 0; i < nvector; i++) { + for (int i=0; imom_offset; gParamMom.site_size = param->site_size; @@ -4980,7 +5041,7 @@ void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, profileGaugeUpdate.TPSTART(QUDA_PROFILE_H2D); - if (!param->use_resident_gauge) { // load fields onto the device + if (!param->use_resident_gauge) { // load fields onto the device cudaInGauge->loadCPUField(*cpuGauge); } else { // or use resident fields already present if (!gaugePrecise) errorQuda("No resident gauge field allocated"); @@ -5000,7 +5061,8 @@ void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, // perform the update profileGaugeUpdate.TPSTART(QUDA_PROFILE_COMPUTE); - updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom, (bool)conj_mom, (bool)exact); + updateGaugeField(*cudaOutGauge, dt, *cudaInGauge, *cudaMom, + (bool)conj_mom, (bool)exact); profileGaugeUpdate.TPSTOP(QUDA_PROFILE_COMPUTE); if (param->return_result_gauge) { @@ -5033,130 +5095,127 @@ void updateGaugeFieldQuda(void *gauge, void *momentum, double dt, int conj_mom, profileGaugeUpdate.TPSTOP(QUDA_PROFILE_TOTAL); } -void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) -{ - profileProject.TPSTART(QUDA_PROFILE_TOTAL); - - profileProject.TPSTART(QUDA_PROFILE_INIT); - checkGaugeParam(param); - - // create the gauge field - GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); - gParam.location = QUDA_CPU_FIELD_LOCATION; - gParam.site_offset = param->gauge_offset; - gParam.site_size = 
param->site_size; - bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; - cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr; - - // create the device fields - gParam.location = QUDA_CUDA_FIELD_LOCATION; - gParam.create = QUDA_NULL_FIELD_CREATE; - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; - gParam.reconstruct = param->reconstruct; - cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr; - profileProject.TPSTOP(QUDA_PROFILE_INIT); - - if (param->use_resident_gauge) { - if (!gaugePrecise) errorQuda("No resident gauge field to use"); - cudaGauge = gaugePrecise; - gaugePrecise = nullptr; - } else { - profileProject.TPSTART(QUDA_PROFILE_H2D); - cudaGauge->loadCPUField(*cpuGauge); - profileProject.TPSTOP(QUDA_PROFILE_H2D); - } - - profileProject.TPSTART(QUDA_PROFILE_COMPUTE); - *num_failures_h = 0; - - // project onto SU(3) - if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase(); - projectSU3(*cudaGauge, tol, num_failures_d); - if (!cudaGauge->StaggeredPhaseApplied() && param->staggered_phase_applied) cudaGauge->applyStaggeredPhase(); - - profileProject.TPSTOP(QUDA_PROFILE_COMPUTE); - - if (*num_failures_h > 0) errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); - - profileProject.TPSTART(QUDA_PROFILE_D2H); - if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge); - profileProject.TPSTOP(QUDA_PROFILE_D2H); - - if (param->make_resident_gauge) { - if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise; - gaugePrecise = cudaGauge; - } else { - delete cudaGauge; - } - - profileProject.TPSTART(QUDA_PROFILE_FREE); - if (cpuGauge) delete cpuGauge; - profileProject.TPSTOP(QUDA_PROFILE_FREE); - - profileProject.TPSTOP(QUDA_PROFILE_TOTAL); -} - -void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) -{ - profilePhase.TPSTART(QUDA_PROFILE_TOTAL); - - profilePhase.TPSTART(QUDA_PROFILE_INIT); - 
checkGaugeParam(param); - - // create the gauge field - GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); - bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; - gParam.location = QUDA_CPU_FIELD_LOCATION; - cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr; - - // create the device fields - gParam.location = QUDA_CUDA_FIELD_LOCATION; - gParam.create = QUDA_NULL_FIELD_CREATE; - gParam.order = QUDA_FLOAT2_GAUGE_ORDER; - gParam.reconstruct = param->reconstruct; - cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr; - profilePhase.TPSTOP(QUDA_PROFILE_INIT); - - if (param->use_resident_gauge) { - if (!gaugePrecise) errorQuda("No resident gauge field to use"); - cudaGauge = gaugePrecise; - } else { - profilePhase.TPSTART(QUDA_PROFILE_H2D); - cudaGauge->loadCPUField(*cpuGauge); - profilePhase.TPSTOP(QUDA_PROFILE_H2D); - } - - profilePhase.TPSTART(QUDA_PROFILE_COMPUTE); - *num_failures_h = 0; - - // apply / remove phase as appropriate - if (!cudaGauge->StaggeredPhaseApplied()) - cudaGauge->applyStaggeredPhase(); - else - cudaGauge->removeStaggeredPhase(); - - profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE); - - profilePhase.TPSTART(QUDA_PROFILE_D2H); - if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge); - profilePhase.TPSTOP(QUDA_PROFILE_D2H); - - if (param->make_resident_gauge) { - if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise; - gaugePrecise = cudaGauge; - } else { - delete cudaGauge; - } - - profilePhase.TPSTART(QUDA_PROFILE_FREE); - if (cpuGauge) delete cpuGauge; - profilePhase.TPSTOP(QUDA_PROFILE_FREE); - - profilePhase.TPSTOP(QUDA_PROFILE_TOTAL); -} + void projectSU3Quda(void *gauge_h, double tol, QudaGaugeParam *param) { + profileProject.TPSTART(QUDA_PROFILE_TOTAL); + + profileProject.TPSTART(QUDA_PROFILE_INIT); + checkGaugeParam(param); + + // create the gauge field + GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); 
+ gParam.location = QUDA_CPU_FIELD_LOCATION; + gParam.site_offset = param->gauge_offset; + gParam.site_size = param->site_size; + bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; + cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr; + + // create the device fields + gParam.location = QUDA_CUDA_FIELD_LOCATION; + gParam.create = QUDA_NULL_FIELD_CREATE; + gParam.order = QUDA_FLOAT2_GAUGE_ORDER; + gParam.reconstruct = param->reconstruct; + cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr; + profileProject.TPSTOP(QUDA_PROFILE_INIT); + + if (param->use_resident_gauge) { + if (!gaugePrecise) errorQuda("No resident gauge field to use"); + cudaGauge = gaugePrecise; + gaugePrecise = nullptr; + } else { + profileProject.TPSTART(QUDA_PROFILE_H2D); + cudaGauge->loadCPUField(*cpuGauge); + profileProject.TPSTOP(QUDA_PROFILE_H2D); + } + + profileProject.TPSTART(QUDA_PROFILE_COMPUTE); + *num_failures_h = 0; + + // project onto SU(3) + if (cudaGauge->StaggeredPhaseApplied()) cudaGauge->removeStaggeredPhase(); + projectSU3(*cudaGauge, tol, num_failures_d); + if (!cudaGauge->StaggeredPhaseApplied() && param->staggered_phase_applied) cudaGauge->applyStaggeredPhase(); + + profileProject.TPSTOP(QUDA_PROFILE_COMPUTE); + + if(*num_failures_h>0) + errorQuda("Error in the SU(3) unitarization: %d failures\n", *num_failures_h); + + profileProject.TPSTART(QUDA_PROFILE_D2H); + if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge); + profileProject.TPSTOP(QUDA_PROFILE_D2H); + + if (param->make_resident_gauge) { + if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise; + gaugePrecise = cudaGauge; + } else { + delete cudaGauge; + } + + profileProject.TPSTART(QUDA_PROFILE_FREE); + if (cpuGauge) delete cpuGauge; + profileProject.TPSTOP(QUDA_PROFILE_FREE); + + profileProject.TPSTOP(QUDA_PROFILE_TOTAL); + } + + void staggeredPhaseQuda(void *gauge_h, QudaGaugeParam *param) { + 
profilePhase.TPSTART(QUDA_PROFILE_TOTAL); + + profilePhase.TPSTART(QUDA_PROFILE_INIT); + checkGaugeParam(param); + + // create the gauge field + GaugeFieldParam gParam(*param, gauge_h, QUDA_GENERAL_LINKS); + bool need_cpu = !param->use_resident_gauge || param->return_result_gauge; + gParam.location = QUDA_CPU_FIELD_LOCATION; + cpuGaugeField *cpuGauge = need_cpu ? new cpuGaugeField(gParam) : nullptr; + + // create the device fields + gParam.location = QUDA_CUDA_FIELD_LOCATION; + gParam.create = QUDA_NULL_FIELD_CREATE; + gParam.order = QUDA_FLOAT2_GAUGE_ORDER; + gParam.reconstruct = param->reconstruct; + cudaGaugeField *cudaGauge = !param->use_resident_gauge ? new cudaGaugeField(gParam) : nullptr; + profilePhase.TPSTOP(QUDA_PROFILE_INIT); + + if (param->use_resident_gauge) { + if (!gaugePrecise) errorQuda("No resident gauge field to use"); + cudaGauge = gaugePrecise; + } else { + profilePhase.TPSTART(QUDA_PROFILE_H2D); + cudaGauge->loadCPUField(*cpuGauge); + profilePhase.TPSTOP(QUDA_PROFILE_H2D); + } + + profilePhase.TPSTART(QUDA_PROFILE_COMPUTE); + *num_failures_h = 0; + + // apply / remove phase as appropriate + if (!cudaGauge->StaggeredPhaseApplied()) cudaGauge->applyStaggeredPhase(); + else cudaGauge->removeStaggeredPhase(); + + profilePhase.TPSTOP(QUDA_PROFILE_COMPUTE); + + profilePhase.TPSTART(QUDA_PROFILE_D2H); + if (param->return_result_gauge) cudaGauge->saveCPUField(*cpuGauge); + profilePhase.TPSTOP(QUDA_PROFILE_D2H); + + if (param->make_resident_gauge) { + if (gaugePrecise != nullptr && cudaGauge != gaugePrecise) delete gaugePrecise; + gaugePrecise = cudaGauge; + } else { + delete cudaGauge; + } + + profilePhase.TPSTART(QUDA_PROFILE_FREE); + if (cpuGauge) delete cpuGauge; + profilePhase.TPSTOP(QUDA_PROFILE_FREE); + + profilePhase.TPSTOP(QUDA_PROFILE_TOTAL); + } // evaluate the momentum action -double momActionQuda(void *momentum, QudaGaugeParam *param) +double momActionQuda(void* momentum, QudaGaugeParam* param) { 
profileMomAction.TPSTART(QUDA_PROFILE_TOTAL); @@ -5167,8 +5226,7 @@ double momActionQuda(void *momentum, QudaGaugeParam *param) GaugeFieldParam gParam(*param, momentum, QUDA_ASQTAD_MOM_LINKS); gParam.location = QUDA_CPU_FIELD_LOCATION; gParam.reconstruct = (gParam.order == QUDA_TIFR_GAUGE_ORDER || gParam.order == QUDA_TIFR_PADDED_GAUGE_ORDER) ? - QUDA_RECONSTRUCT_NO : - QUDA_RECONSTRUCT_10; + QUDA_RECONSTRUCT_NO : QUDA_RECONSTRUCT_10; gParam.site_offset = param->mom_offset; gParam.site_size = param->site_size; @@ -5206,7 +5264,9 @@ double momActionQuda(void *momentum, QudaGaugeParam *param) delete cudaMom; momResident = nullptr; } - if (cpuMom) { delete cpuMom; } + if (cpuMom) { + delete cpuMom; + } profileMomAction.TPSTOP(QUDA_PROFILE_FREE); profileMomAction.TPSTOP(QUDA_PROFILE_TOTAL); @@ -5258,8 +5318,7 @@ void plaqQuda(double plaq[3]) if (!gaugePrecise) errorQuda("Cannot compute plaquette as there is no resident gauge field"); - cudaGaugeField *data - = extendedGaugeResident ? extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq); + cudaGaugeField *data = extendedGaugeResident ? 
extendedGaugeResident : createExtendedGauge(*gaugePrecise, R, profilePlaq); extendedGaugeResident = data; profilePlaq.TPSTART(QUDA_PROFILE_COMPUTE); @@ -5340,7 +5399,8 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, copyExtendedGauge(*precise, *gaugeSmeared, QUDA_CUDA_FIELD_LOCATION); precise->exchangeGhost(); } else { - if (getVerbosity() >= QUDA_VERBOSE) printfQuda("Wuppertal smearing done with gaugePrecise\n"); + if (getVerbosity() >= QUDA_VERBOSE) + printfQuda("Wuppertal smearing done with gaugePrecise\n"); precise = gaugePrecise; } @@ -5385,7 +5445,8 @@ void performWuppertalnStep(void *h_out, void *h_in, QudaInvertParam *inv_param, printfQuda("Out CPU %e CUDA %e\n", cpu, gpu); } - if (gaugeSmeared != nullptr) delete precise; + if (gaugeSmeared != nullptr) + delete precise; popVerbosity(); @@ -5565,7 +5626,7 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u delete cpuGauge; - if (timeinfo) { + if(timeinfo){ timeinfo[0] = GaugeFixOVRQuda.Last(QUDA_PROFILE_H2D); timeinfo[1] = GaugeFixOVRQuda.Last(QUDA_PROFILE_COMPUTE); timeinfo[2] = GaugeFixOVRQuda.Last(QUDA_PROFILE_D2H); @@ -5574,10 +5635,9 @@ int computeGaugeFixingOVRQuda(void *gauge, const unsigned int gauge_dir, const u return 0; } -int computeGaugeFixingFFTQuda(void *gauge, const unsigned int gauge_dir, const unsigned int Nsteps, - const unsigned int verbose_interval, const double alpha, const unsigned int autotune, - const double tolerance, const unsigned int stopWtheta, QudaGaugeParam *param, - double *timeinfo) +int computeGaugeFixingFFTQuda(void* gauge, const unsigned int gauge_dir, const unsigned int Nsteps, \ + const unsigned int verbose_interval, const double alpha, const unsigned int autotune, const double tolerance, \ + const unsigned int stopWtheta, QudaGaugeParam* param , double* timeinfo) { GaugeFixFFTQuda.TPSTART(QUDA_PROFILE_TOTAL); From b0076deb7007149c0a549ab9ce7232fbc68c82d5 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: 
Sun, 26 Mar 2023 22:03:41 +0200 Subject: [PATCH 031/148] small corrs + formatting --- lib/copy_color_spinor.cuh | 8 +++++++- lib/openqcd_interface.cpp | 10 ++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index b5f4665f9b..c59c5b3605 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -112,13 +112,16 @@ namespace quda { #else errorQuda("QDPJIT interface has not been built\n"); #endif + } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { + #ifdef BUILD_OPENQCD_INTERFACE - using O = OpenQCDDiracOrder; // TODO: Seems OK + using O = OpenQCDDiracOrder; // TODO: Seems OK CopyColorSpinor(out, in, param); // TODO: Seems OK #else errorQuda("OpenQCD interface has not been built\n"); #endif + } else { errorQuda("Order %d not defined (Ns = %d, Nc = %d, precision = %d)", out.FieldOrder(), Ns, Nc, out.Precision()); } @@ -160,13 +163,16 @@ namespace quda { #else errorQuda("QDPJIT interface has not been built\n"); #endif + } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { + #ifdef BUILD_OPENQCD_INTERFACE using ColorSpinor = OpenQCDDiracOrder; // TODO: Seems OK genericCopyColorSpinor(param); // TODO: Seems OK #else errorQuda("OpenQCD interface has not been built\n"); #endif + } else { errorQuda("Order %d not defined (Ns=%d, Nc=%d, precision = %d)", in.FieldOrder(), Ns, Nc, in.Precision()); } diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 49c7abde24..d8a2561198 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -3,9 +3,6 @@ #include #include #include -#include -#include -#include #include #include @@ -351,6 +348,7 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud invertParam->compute_action = 0; } +// FIXME: static void setColorSpinorParams(const int dim[4], QudaPrecision precision, ColorSpinorParam *param) { param->nColor = 3; @@ -358,7 +356,7 @@ static void setColorSpinorParams(const 
int dim[4], QudaPrecision precision, Colo param->nDim = 4; // TODO: check how to adapt this for openqxd for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; - // param->x[0] /= 2; // for staggered sites only? + // param->x[0] /= 2; // for staggered sites only FIXME:? param->setPrecision(precision); param->pad = 0; @@ -418,6 +416,7 @@ void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_Qu } // #if 0 +/* FIXME: This function is buggy: */ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, void *dst, void *gauge) { @@ -450,6 +449,9 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda // dslashQuda(dst, src, &invertParam, local_parity); dslashQudaTest(dst, src, &invertParam, local_parity); + // Original: + // dslashQuda(static_cast(dst), static_cast(src), &invertParam, local_parity); + // TODO: need save?? // saveGaugeQuda(gauge, &qudaGaugeParam); From 2525a5ba3f5ac585f5262c7bdde58026397337d0 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Sun, 26 Mar 2023 23:00:43 +0200 Subject: [PATCH 032/148] gauge save load --- include/quda_openqcd_interface.h | 2 ++ lib/openqcd_interface.cpp | 57 +++++++++++++------------------- 2 files changed, 25 insertions(+), 34 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 34e8eb63d7..8ffb31dbb8 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -165,6 +165,8 @@ void openQCD_qudaLoadGaugeField(int external_precision, int quda_precision, open void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge); void openQCD_gaugeloadsave(int precision, void *gauge); +void openQCD_gaugeload(int precision, void *gauge); +void openQCD_gaugesave(QudaGaugeParam *qudaGaugeParam, void *gauge); // int openQCD_ipt(int iy); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index d8a2561198..2f764c6436 100644 --- 
a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -11,13 +11,6 @@ #include #include -// #include "../../openQxD-devel/include/su3.h" -// #include "../../openQxD-devel/include/flags.h" -// #include "../../openQxD-devel/include/utils.h" -// #include "../../openQxD-devel/include/lattice.h" -// #include "../../openQxD-devel/include/global.h" - -// #include #define MAX(a, b) ((a) > (b) ? (a) : (b)) @@ -83,13 +76,13 @@ using namespace quda; // template void inline qudamilc_called(const char *func) { qudamilc_called(func, getVerbosity()); } -static int safe_mod(int x, int y) -{ - if (x >= 0) - return x % y; - else - return (y - (abs(x) % y)) % y; -} +// static int safe_mod(int x, int y) +// { +// if (x >= 0) +// return x % y; +// else +// return (y - (abs(x) % y)) % y; +// } // fdata should point to 4 integers in order {NPROC0, NPROC1, NPROC2, NPROC3} // coords is the 4D cartesian coordinate of a rank. @@ -167,7 +160,7 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t input) openQCD_qudaSetLayout(input.layout); initialized = true; // qudamilc_called(__func__); - // geometry(); // TODO: Establish helper indexes from openQxD?? + // geometry_openQxD(); // TODO: in the future establish ipt and other helper indexes from openQxD? } void openQCD_qudaFinalize() { endQuda(); } @@ -252,29 +245,25 @@ void openQCD_gaugeloadsave(int precision, void *gauge) return; } -// static int openQCD_index() -// { -// // This function is the helper for ipt in QUDA +void openQCD_gaugeload(int precision, void *gauge) +{ -// return ix; -// } + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); // FIXME: -// int openQCD_ipt(int iy) -// { -// // This function computes the ipt index from iy (lexicographical index) -// int x0,x1,x2,x3; -// int k,mu,ix,iy,iz,iw; -// int bo[4],bs[4],ifc[8]; + loadGaugeQuda(gauge, &qudaGaugeParam); -// } + return; +} + +void openQCD_gaugesave(QudaGaugeParam *qudaGaugeParam, void *gauge) +{ + + saveGaugeQuda(gauge, qudaGaugeParam); + + return; +} -// static int getLinkPadding(const int dim[4]) -// { -// int padding = MAX(dim[1] * dim[2] * dim[3] / 2, dim[0] * dim[2] * dim[3] / 2); -// padding = MAX(padding, dim[0] * dim[1] * dim[3] / 2); -// padding = MAX(padding, dim[0] * dim[1] * dim[2] / 2); -// return padding; -// } // set the params for the single mass solver static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, From 9dce279429607b0b24e3522ed3944a40b56ff9f5 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Sun, 26 Mar 2023 23:33:31 +0200 Subject: [PATCH 033/148] quda openqcd gauge loadsave, load, and save --- include/quda_openqcd_interface.h | 2 +- lib/openqcd_interface.cpp | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 8ffb31dbb8..961d35e5cd 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -166,7 +166,7 @@ void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge); void openQCD_gaugeloadsave(int precision, void *gauge); void openQCD_gaugeload(int precision, void *gauge); -void openQCD_gaugesave(QudaGaugeParam *qudaGaugeParam, void *gauge); +void openQCD_gaugesave(int precision, void *gauge); // int openQCD_ipt(int iy); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 2f764c6436..0223fd9f3a 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -236,7 +236,7 @@ void openQCD_gaugeloadsave(int precision, void *gauge) { 
QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); // FIXME: + = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); loadGaugeQuda(gauge, &qudaGaugeParam); @@ -249,17 +249,20 @@ void openQCD_gaugeload(int precision, void *gauge) { QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); // FIXME: + = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); loadGaugeQuda(gauge, &qudaGaugeParam); return; } -void openQCD_gaugesave(QudaGaugeParam *qudaGaugeParam, void *gauge) +void openQCD_gaugesave(int precision, void *gauge) { - saveGaugeQuda(gauge, qudaGaugeParam); + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + + saveGaugeQuda(gauge, &qudaGaugeParam); return; } From 4ff5cce94918591719658ac818bcc0f4b8f890dc Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Mon, 27 Mar 2023 22:56:25 +0200 Subject: [PATCH 034/148] removed comments --- include/color_spinor_field_order.h | 2 +- include/gauge_field_order.h | 6 ++-- include/quda_openqcd_interface.h | 56 ++++-------------------------- 3 files changed, 11 insertions(+), 53 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 8a4bd87b9d..65ee160583 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1795,7 +1795,7 @@ namespace quda auto in = &field[iy_OpenQxD * length]; // This is how they're accessed within OpenQxd (length = 24 doubles // = 12 complex doubles = 4 spinor x 3 colors) // - printf("Loading site iy: %d with field value %.10e \n", iy_OpenQxD, field[iy_OpenQxD * length]); + // printf("Loading site iy: %d with field value %.10e \n", iy_OpenQxD, field[iy_OpenQxD * length]); block_load(v, 
reinterpret_cast(in)); } diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 7e766bb55b..04b635478c 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2361,9 +2361,9 @@ namespace quda { // TODO: Determine whether coord[mu] is local or global int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - coord1 in QUDA is x2 in OpenQxD (y) - coord2 in QUDA is x3 in OpenQxD (z) - coord3 in QUDA is x0 in OpenQxD (t) + coord1 in QUDA is x2 in OpenQxD (y) + coord2 in QUDA is x3 in OpenQxD (z) + coord3 in QUDA is x0 in OpenQxD (t) */ // int ix_OpenQxD = ipt[iy_OpenQxD]; int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 961d35e5cd..2676f4bc1a 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -3,14 +3,6 @@ #include #include -// TODO: (later) The ipt and other functions can be incorporated here (so no reordering needed in OpenQXD side) -// OpenQxD helpers: -// #include "../../openQxD-devel/include/su3.h" -// #include "../../openQxD-devel/include/flags.h" -// #include "../../openQxD-devel/include/utils.h" -// #include "../../openQxD-devel/include/lattice.h" -// #include "../../openQxD-devel/include/global.h" - /** * @file quda_openqcd_interface.h * @@ -32,7 +24,7 @@ typedef struct { const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ // FIXME: const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ // FIXME: int device; /** GPU device number */ - // const int *ipt; + // const int *ipt; // TODO: IN THE FUTURE } openQCD_QudaLayout_t; /** @@ -50,47 +42,18 @@ typedef struct { */ void openQCD_qudaInit(openQCD_QudaInitArgs_t input); -// /** -// * Set set the local dimensions and machine topology for QUDA to use -// * 
-// * @param layout Struct defining local dimensions and machine topology -// */ -// void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout); +/** + * Set set the local dimensions and machine topology for QUDA to use + * + * @param layout Struct defining local dimensions and machine topology + */ +void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout); /** * Destroy the QUDA context. */ void openQCD_qudaFinalize(void); -#if 0 -// leave that here for now - /** - * Allocate pinned memory suitable for CPU-GPU transfers - * @param bytes The size of the requested allocation - * @return Pointer to allocated memory - */ - void* openQCD_qudaAllocatePinned(size_t bytes); - - /** - * Free pinned memory - * @param ptr Pointer to memory to be free - */ - void openQCD_qudaFreePinned(void *ptr); - - /** - * Allocate managed memory to reduce CPU-GPU transfers - * @param bytes The size of the requested allocation - * @return Pointer to allocated memory - */ - void *openQCD_qudaAllocateManaged(size_t bytes); - - /** - * Free managed memory - * @param ptr Pointer to memory to be free - */ - void openQCD_qudaFreeManaged(void *ptr); - -#endif /** * Parameters related to linear solvers. @@ -159,17 +122,12 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, * @param inv_args Meta data * @param milc_link Base pointer to host gauge field (regardless of dimensionality) */ -void openQCD_qudaLoadGaugeField(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, - const void *milc_link); void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge); - void openQCD_gaugeloadsave(int precision, void *gauge); void openQCD_gaugeload(int precision, void *gauge); void openQCD_gaugesave(int precision, void *gauge); -// int openQCD_ipt(int iy); - /** Free the gauge field allocated in QUDA. 
*/ From b2d74460a9983c12b7365a044a81bb5319ed15d4 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Mon, 27 Mar 2023 23:41:43 +0200 Subject: [PATCH 035/148] rearrangement --- lib/openqcd_interface.cpp | 297 +++++++++++++++++++------------------- 1 file changed, 148 insertions(+), 149 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 0223fd9f3a..93d1da7787 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -11,7 +11,6 @@ #include #include - #define MAX(a, b) ((a) > (b) ? (a) : (b)) // code for NVTX taken from Jiri Kraus' blog post: @@ -76,13 +75,11 @@ using namespace quda; // template void inline qudamilc_called(const char *func) { qudamilc_called(func, getVerbosity()); } -// static int safe_mod(int x, int y) -// { -// if (x >= 0) -// return x % y; -// else -// return (y - (abs(x) % y)) % y; -// } +/******************************************* + * + * LAYOUT AND INIT + * + *******************************************/ // fdata should point to 4 integers in order {NPROC0, NPROC1, NPROC2, NPROC3} // coords is the 4D cartesian coordinate of a rank. 
@@ -165,16 +162,6 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t input) void openQCD_qudaFinalize() { endQuda(); } -// not sure we want to use allocators, but in case we want to -#if 0 -void *qudaAllocatePinned(size_t bytes) { return pool_pinned_malloc(bytes); } - -void qudaFreePinned(void *ptr) { pool_pinned_free(ptr); } - -void *qudaAllocateManaged(size_t bytes) { return managed_malloc(bytes); } - -void qudaFreeManaged(void *ptr) { managed_free(ptr); } -#endif static int getLinkPadding(const int dim[4]) { @@ -184,6 +171,13 @@ static int getLinkPadding(const int dim[4]) return padding; } +/******************************************* + * + * SETTINGS AND PARAMETERS + * + *******************************************/ + +/* OPENQCD GAUGE PARAMS */ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) { QudaGaugeParam gParam = newQudaGaugeParam(); @@ -201,74 +195,79 @@ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) gParam.scale = 0; gParam.ga_pad = getLinkPadding(dim); - // gParam.return_result_gauge = 1; // I think this is not needed ? - return gParam; } -void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) +void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_QudaInvertArgs_t &inv_args, + int external_precision, int quda_precision) { - // qudamilc_called(__func__); - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); // FIXME: - // reateGaugeParamForObservables(precision, arg, phase_in); - - loadGaugeQuda(gauge, &qudaGaugeParam); - - QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); - obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; - obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; // - // phase_in ? 
QUDA_BOOLEAN_TRUE : QUDA_BOOLEAN_FALSE; - gaugeObservablesQuda(&obsParam); - - // Let MILC apply its own Nc normalization - plaq[0] = obsParam.plaquette[0]; - plaq[1] = obsParam.plaquette[1]; - plaq[2] = obsParam.plaquette[2]; - - saveGaugeQuda(gauge, &qudaGaugeParam); + const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + const QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision_sloppy; - return; -} + switch (inv_args.mixed_precision) { + case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break; + case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break; + default: device_precision_sloppy = device_precision; + } -void openQCD_gaugeloadsave(int precision, void *gauge) -{ + for (int dir = 0; dir < 4; ++dir) qudaGaugeParam.X[dir] = dim[dir]; - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + qudaGaugeParam.anisotropy = 1.0; + qudaGaugeParam.type = QUDA_WILSON_LINKS; + qudaGaugeParam.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; - loadGaugeQuda(gauge, &qudaGaugeParam); + // Check the boundary conditions + // Can't have twisted or anti-periodic boundary conditions in the spatial + // directions with 12 reconstruct at the moment. + bool trivial_phase = true; + for (int dir = 0; dir < 3; ++dir) { + if (inv_args.boundary_phase[dir] != 0) trivial_phase = false; + } + if (inv_args.boundary_phase[3] != 0 && inv_args.boundary_phase[3] != 1) trivial_phase = false; - saveGaugeQuda(gauge, &qudaGaugeParam); + if (trivial_phase) { + qudaGaugeParam.t_boundary = (inv_args.boundary_phase[3]) ? 
QUDA_ANTI_PERIODIC_T : QUDA_PERIODIC_T; + qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12; + qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12; + } else { + qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; + qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; + qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; + } - return; + qudaGaugeParam.cpu_prec = host_precision; + qudaGaugeParam.cuda_prec = device_precision; + qudaGaugeParam.cuda_prec_sloppy = device_precision_sloppy; + qudaGaugeParam.cuda_prec_precondition = device_precision_sloppy; + qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; + // qudaGaugeParam.ga_pad = getLinkPadding(dim); } -void openQCD_gaugeload(int precision, void *gauge) -{ - - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - - loadGaugeQuda(gauge, &qudaGaugeParam); - - return; -} -void openQCD_gaugesave(int precision, void *gauge) +/* PARAMS FOR SPINOR FIELDS */ +static void setColorSpinorParams(const int dim[4], QudaPrecision precision, ColorSpinorParam *param) { + param->nColor = 3; + param->nSpin = 4; // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor + param->nDim = 4; // TODO: check how to adapt this for openqxd - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - - saveGaugeQuda(gauge, &qudaGaugeParam); + for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; + // param->x[0] /= 2; // for staggered sites only FIXME:? 
- return; + param->setPrecision(precision); + param->pad = 0; + param->siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME: check how to adapt this for openqxd + param->siteOrder + = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd // EVEN-ODD is only about inner ordering in quda + param->fieldOrder = QUDA_OPENQCD_FIELD_ORDER; // FIXME: + param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. // // FIXME:: + param->create = QUDA_ZERO_FIELD_CREATE; // // FIXME:: check how to adapt this for openqxd ?? created -0 in weird places } -// set the params for the single mass solver +/* PARAMS FOR DSLASH AND INVERSION */ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, double mass, double target_residual, double target_residual_hq, int maxiter, double reliable_delta, QudaParity parity, QudaVerbosity verbosity, @@ -340,75 +339,99 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud invertParam->compute_action = 0; } -// FIXME: -static void setColorSpinorParams(const int dim[4], QudaPrecision precision, ColorSpinorParam *param) + + + + +/******************************************* + * + * FUNCTIONS + * + *******************************************/ + +/* + * GAUGE FIELDS + */ + +void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) { - param->nColor = 3; - param->nSpin = 4; // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor - param->nDim = 4; // TODO: check how to adapt this for openqxd - for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; - // param->x[0] /= 2; // for staggered sites only FIXME:? + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - param->setPrecision(precision); - param->pad = 0; - param->siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME: check how to adapt this for openqxd - param->siteOrder - = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd // EVEN-ODD is only about inner ordering in quda - param->fieldOrder = QUDA_OPENQCD_FIELD_ORDER; // FIXME: - param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. // // FIXME:: - param->create = QUDA_ZERO_FIELD_CREATE; // // FIXME:: check how to adapt this for openqxd ?? created -0 in weird places + loadGaugeQuda(gauge, &qudaGaugeParam); + + QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); + obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; + obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; // + gaugeObservablesQuda(&obsParam); + + // Note different Nc normalization! + plaq[0] = obsParam.plaquette[0]; + plaq[1] = obsParam.plaquette[1]; + plaq[2] = obsParam.plaquette[2]; + + saveGaugeQuda(gauge, &qudaGaugeParam); + + return; } -void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_QudaInvertArgs_t &inv_args, - int external_precision, int quda_precision) +void openQCD_gaugeloadsave(int precision, void *gauge) { - const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - const QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision_sloppy; + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - switch (inv_args.mixed_precision) { - case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break; - case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break; - default: device_precision_sloppy = device_precision; - } + loadGaugeQuda(gauge, &qudaGaugeParam); - for (int dir = 0; dir < 4; ++dir) qudaGaugeParam.X[dir] = dim[dir]; + saveGaugeQuda(gauge, &qudaGaugeParam); - qudaGaugeParam.anisotropy = 1.0; - qudaGaugeParam.type = QUDA_WILSON_LINKS; - qudaGaugeParam.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; + return; +} - // Check the boundary conditions - // Can't have twisted or anti-periodic boundary conditions in the spatial - // directions with 12 reconstruct at the moment. - bool trivial_phase = true; - for (int dir = 0; dir < 3; ++dir) { - if (inv_args.boundary_phase[dir] != 0) trivial_phase = false; - } - if (inv_args.boundary_phase[3] != 0 && inv_args.boundary_phase[3] != 1) trivial_phase = false; +void openQCD_gaugeload(int precision, void *gauge) +{ - if (trivial_phase) { - qudaGaugeParam.t_boundary = (inv_args.boundary_phase[3]) ? QUDA_ANTI_PERIODIC_T : QUDA_PERIODIC_T; - qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12; - qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12; - } else { - qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; - qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; - qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; - } + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - qudaGaugeParam.cpu_prec = host_precision; - qudaGaugeParam.cuda_prec = device_precision; - qudaGaugeParam.cuda_prec_sloppy = device_precision_sloppy; - qudaGaugeParam.cuda_prec_precondition = device_precision_sloppy; - qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; - // qudaGaugeParam.ga_pad = getLinkPadding(dim); + loadGaugeQuda(gauge, &qudaGaugeParam); + + return; +} + +void openQCD_gaugesave(int precision, void *gauge) +{ + + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + + saveGaugeQuda(gauge, &qudaGaugeParam); + + return; } +void openQCD_qudaFreeGaugeField() +{ + freeGaugeQuda(); +} // qudaFreeGaugeField + + +/* + * SPINOR FIELDS + */ + + +/* + * SPINOR AND GAUGE FIELDS + */ + + + + + // #if 0 -/* FIXME: This function is buggy: */ +/* FIXME: */ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, void *dst, void *gauge) { @@ -428,7 +451,8 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda QudaParity local_parity = inv_args.evenodd; QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? QUDA_ODD_PARITY : QUDA_EVEN_PARITY; - /* setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, + /* For reference: + setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, double mass, double target_residual, double target_residual_hq, int maxiter, double reliable_delta, QudaParity parity, QudaVerbosity verbosity, QudaInverterType inverter, QudaInvertParam *invertParam) */ @@ -441,25 +465,19 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda // dslashQuda(dst, src, &invertParam, local_parity); dslashQudaTest(dst, src, &invertParam, local_parity); - // Original: + // Original: TODO: is cast necessary? 
// dslashQuda(static_cast(dst), static_cast(src), &invertParam, local_parity); - // TODO: need save?? - - // saveGaugeQuda(gauge, &qudaGaugeParam); - return; } // qudaDslash -// #endif #if 0 void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, openQCD_QudaInvertArgs_t inv_args, - double target_residual, double target_fermilab_residual, const void *const fatlink, - const void *const longlink, void *source, void *solution, double *const final_residual, - double *const final_fermilab_residual, int *num_iters) + double target_residual, double target_fermilab_residual, const void *const fatlink, + const void *const longlink, void *source, void *solution, double *const final_residual, + double *const final_fermilab_residual, int *num_iters) { static const QudaVerbosity verbosity = getVerbosity(); - // qudamilc_called(__func__, verbosity); if (target_fermilab_residual == 0 && target_residual == 0) errorQuda("qudaInvert: requesting zero residual\n"); @@ -530,7 +548,6 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, if (!create_quda_gauge) invalidateGaugeQuda(); - qudamilc_called(__func__, verbosity); } // qudaInvert #endif @@ -603,22 +620,4 @@ void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_ invertParam.compute_action = 0; } -void openQCD_qudaLoadGaugeField(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, - const void *milc_link) -{ - // qudamilc_called(__func__); - QudaGaugeParam qudaGaugeParam = newQudaGaugeParam(); - setGaugeParams(qudaGaugeParam, localDim, inv_args, external_precision, quda_precision); - - loadGaugeQuda(const_cast(milc_link), &qudaGaugeParam); - // qudamilc_called(__func__); -} // qudaLoadGaugeField - -void openQCD_qudaFreeGaugeField() -{ - // qudamilc_called(__func__); - freeGaugeQuda(); - // qudamilc_called(__func__); -} // qudaFreeGaugeField - // TODO: OpenQCDMultigridPack functions a la MILC (cf. 
milc_interface.cpp) From fdf641c1c1f5fe9b564b654166012eec4bafde50 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Tue, 28 Mar 2023 01:16:53 +0200 Subject: [PATCH 036/148] plaquette only function --- include/quda_openqcd_interface.h | 1 + lib/openqcd_interface.cpp | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 2676f4bc1a..e239d261e9 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -124,6 +124,7 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, */ void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge); +void openQCD_qudaPlaquetteOnly(int precision, double plaq[3]); void openQCD_gaugeloadsave(int precision, void *gauge); void openQCD_gaugeload(int precision, void *gauge); void openQCD_gaugesave(int precision, void *gauge); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 93d1da7787..d72066cde7 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -376,6 +376,31 @@ void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) return; } + +void openQCD_qudaPlaquetteOnly(int precision, double plaq[3]) +{ + + // QudaGaugeParam qudaGaugeParam + // = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + + // loadGaugeQuda(gauge, &qudaGaugeParam); + + QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); + obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; + obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; // + gaugeObservablesQuda(&obsParam); + + // Note different Nc normalization! 
+ plaq[0] = obsParam.plaquette[0]; + plaq[1] = obsParam.plaquette[1]; + plaq[2] = obsParam.plaquette[2]; + + // saveGaugeQuda(gauge, &qudaGaugeParam); + + return; +} + + void openQCD_gaugeloadsave(int precision, void *gauge) { From ca66195f12a10f4b793f2d88f973723acf486dc4 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Tue, 28 Mar 2023 01:57:21 +0200 Subject: [PATCH 037/148] small correction --- lib/openqcd_interface.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index d72066cde7..1f4241d7de 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -439,6 +439,7 @@ void openQCD_gaugesave(int precision, void *gauge) void openQCD_qudaFreeGaugeField() { freeGaugeQuda(); + return; } // qudaFreeGaugeField From bc950792fc64bfe899c221dab80cb00eb653f374 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Mon, 3 Apr 2023 00:50:40 +0200 Subject: [PATCH 038/148] gauge working, color using spacespinorcolor totest --- include/color_spinor_field.h | 2 +- include/color_spinor_field_order.h | 22 ++++---- include/quda_openqcd_interface.h | 16 ++++++ lib/copy_color_spinor.cuh | 10 ++-- lib/interface_quda.cpp | 85 ++++++++++++++++++++++++++++++ lib/openqcd_interface.cpp | 73 +++++++++++++++++++++++-- 6 files changed, 188 insertions(+), 20 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 03063bd5c5..4ee0af51e2 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -232,7 +232,7 @@ namespace quda } else if (inv_param.dirac_order == QUDA_OPENQCD_DIRAC_ORDER) { fieldOrder = QUDA_OPENQCD_FIELD_ORDER; siteOrder - = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL, OR IN QUDA_OPENQCD_FIELD_ORDER THIS WORKS OUT ALREADY ??? 
+ = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL?, OR VIA FULL IMPLEMENTATION VIA IPT ARRAY IN QUDA } else { errorQuda("Dirac order %d not supported", inv_param.dirac_order); } diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 65ee160583..e2736d2db5 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1739,8 +1739,7 @@ namespace quda offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for?? volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), - dim {a.X(0), a.X(1), a.X(2), a.X(3)} - // dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions?? + dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions?? { // TODO: ARE GHOSTS NEEDED?? // for (int i = 0; i < 4; i++) { // ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0; @@ -1802,16 +1801,16 @@ namespace quda __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0) const { /* INDEXING */ - // int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) - // getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site + int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
+ getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site - // int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; + int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; - // // Loading as per QUDA style - // auto out = &field[iy_OpenQxD * length]; + // Loading as per QUDA style + auto out = &field[iy_OpenQxD * length]; // printf("Saving site iy: %d with field value %.10e \n",iy_OpenQxD,field[iy_OpenQxD * length]); - // block_store(reinterpret_cast(out), v); + block_store(reinterpret_cast(out), v); } /** @@ -1849,8 +1848,8 @@ namespace quda // } // } - size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } // FIXME: ?? - }; // openQCDDiracOrder + size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } + }; // openQCDDiracOrder } // namespace colorspinor @@ -1930,6 +1929,9 @@ namespace quda template struct colorspinor_order_mapper { typedef colorspinor::SpaceSpinorColorOrder type; }; + // template struct colorspinor_order_mapper { + // typedef colorspinor::OpenQCDDiracOrder type; + // }; // TODO: ? template struct colorspinor_order_mapper { typedef colorspinor::FloatNOrder type; }; diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index e239d261e9..0cec64e039 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -89,6 +89,22 @@ typedef struct { void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *source, void *solution, void *gauge); + +/** + * ALL the following except the Dirac operator application + * Apply the improved staggered operator to a field. All fields + * passed and returned are host (CPU) field in MILC order. 
+ * + * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) + * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) + * @param inv_args Struct setting some solver metadata + * @param source Right-hand side source field + * @param solution Solution spinor field + */ +void openQCD_colorspinorloadsave(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, + void *dst, void *gauge); + + /** * Solve Ax=b for an improved staggered operator. All fields are fields * passed and returned are host (CPU) field in MILC order. This diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index c59c5b3605..486c2b6f1d 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -116,8 +116,9 @@ namespace quda { } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { #ifdef BUILD_OPENQCD_INTERFACE - using O = OpenQCDDiracOrder; // TODO: Seems OK - CopyColorSpinor(out, in, param); // TODO: Seems OK + // using O = OpenQCDDiracOrder; // TODO: Seems OK + using O = SpaceSpinorColorOrder; // FIXME: This is a test + CopyColorSpinor(out, in, param); #else errorQuda("OpenQCD interface has not been built\n"); #endif @@ -167,8 +168,9 @@ namespace quda { } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { #ifdef BUILD_OPENQCD_INTERFACE - using ColorSpinor = OpenQCDDiracOrder; // TODO: Seems OK - genericCopyColorSpinor(param); // TODO: Seems OK + // using ColorSpinor = OpenQCDDiracOrder; // TODO: Seems OK + using ColorSpinor = SpaceSpinorColorOrder; // FIXME: This is a test + genericCopyColorSpinor(param); #else errorQuda("OpenQCD interface has not been built\n"); #endif diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index a6a461daa8..913082b001 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -1906,6 +1906,91 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity profileDslash.TPSTOP(QUDA_PROFILE_TOTAL); } 
+#if 0 // FIXME: +void dslashQudaNoLoads(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) +{ + profileDslash.TPSTART(QUDA_PROFILE_TOTAL); + profileDslash.TPSTART(QUDA_PROFILE_INIT); + + const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise; + + if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH) + || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) + errorQuda("Gauge field not allocated"); + if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) + errorQuda("Clover field not allocated"); + + pushVerbosity(inv_param->verbosity); + if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); + + ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location); + ColorSpinorField in_h(cpuParam); + ColorSpinorParam cudaParam(cpuParam, *inv_param, QUDA_CUDA_FIELD_LOCATION); + + cpuParam.v = h_out; + cpuParam.location = inv_param->output_location; + ColorSpinorField out_h(cpuParam); + + ColorSpinorField in(cudaParam); + ColorSpinorField out(cudaParam); + + bool pc = true; + DiracParam diracParam; + setDiracParam(diracParam, inv_param, pc); + + profileDslash.TPSTOP(QUDA_PROFILE_INIT); + + profileDslash.TPSTART(QUDA_PROFILE_H2D); + in = in_h; + profileDslash.TPSTOP(QUDA_PROFILE_H2D); + + profileDslash.TPSTART(QUDA_PROFILE_COMPUTE); + + if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); + + if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION && + (inv_param->dslash_type == QUDA_STAGGERED_DSLASH || + inv_param->dslash_type == QUDA_ASQTAD_DSLASH) ) + blas::ax(1.0/(2.0*inv_param->mass), in); + + if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { + if (parity == QUDA_EVEN_PARITY) { + parity = QUDA_ODD_PARITY; + } else { + parity = 
QUDA_EVEN_PARITY; + } + blas::ax(gauge.Anisotropy(), in); + } + + Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator + if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) { + cudaParam.create = QUDA_NULL_FIELD_CREATE; + ColorSpinorField tmp1(cudaParam); + ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist + dirac->Dslash(out, tmp1, parity); // apply the operator + } else if (inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH + || inv_param->dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) { + dirac->Dslash4(out, in, parity); + } else { + dirac->Dslash(out, in, parity); // apply the operator + } + profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE); + + profileDslash.TPSTART(QUDA_PROFILE_D2H); + out_h = out; + profileDslash.TPSTOP(QUDA_PROFILE_D2H); + + if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); + + profileDslash.TPSTART(QUDA_PROFILE_FREE); + delete dirac; // clean up + + profileDslash.TPSTOP(QUDA_PROFILE_FREE); + + popVerbosity(); + profileDslash.TPSTOP(QUDA_PROFILE_TOTAL); +} +#endif // #if 0 void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 1f4241d7de..1307db7c5c 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -488,14 +488,77 @@ void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_Quda ColorSpinorParam csParam; setColorSpinorParams(localDim, host_precision, &csParam); - // dslashQuda(dst, src, &invertParam, local_parity); - dslashQudaTest(dst, src, &invertParam, local_parity); + dslashQuda(static_cast(dst), static_cast(src), &invertParam, local_parity); - // Original: TODO: is cast necessary? 
- // dslashQuda(static_cast(dst), static_cast(src), &invertParam, local_parity); + return; +} // openQCD_qudaDslash + +#if 0 // FIXME: +void openQCD_qudaDslashNoLoads(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, + void *dst, void *gauge) +{ + static const QudaVerbosity verbosity = getVerbosity(); + + QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision_sloppy = device_precision; + + QudaInvertParam invertParam = newQudaInvertParam(); + + QudaParity local_parity = inv_args.evenodd; + QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? QUDA_ODD_PARITY : QUDA_EVEN_PARITY; + + /* For reference: + setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, + double mass, double target_residual, double target_residual_hq, int maxiter, + double reliable_delta, QudaParity parity, QudaVerbosity verbosity, + QudaInverterType inverter, QudaInvertParam *invertParam) */ + setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, + QUDA_CG_INVERTER, &invertParam); + + ColorSpinorParam csParam; + setColorSpinorParams(localDim, host_precision, &csParam); + + dslashQudaNoLoads(static_cast(dst), static_cast(src), &invertParam, local_parity); + + return; +} // openQCD_qudaDslash +#endif + +void openQCD_colorspinorloadsave(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, + void *dst, void *gauge) +{ + static const QudaVerbosity verbosity = getVerbosity(); + + QudaGaugeParam qudaGaugeParam + = newOpenQCDGaugeParam(localDim, (quda_precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + + loadGaugeQuda(gauge, &qudaGaugeParam); + + QudaPrecision host_precision = (external_precision == 2) ? 
QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + QudaPrecision device_precision_sloppy = device_precision; + + QudaInvertParam invertParam = newQudaInvertParam(); + + QudaParity local_parity = inv_args.evenodd; + QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? QUDA_ODD_PARITY : QUDA_EVEN_PARITY; + + /* For reference: + setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, + double mass, double target_residual, double target_residual_hq, int maxiter, + double reliable_delta, QudaParity parity, QudaVerbosity verbosity, + QudaInverterType inverter, QudaInvertParam *invertParam) */ + setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, + QUDA_CG_INVERTER, &invertParam); + + ColorSpinorParam csParam; + setColorSpinorParams(localDim, host_precision, &csParam); + + dslashQudaTest(static_cast(dst), static_cast(src), &invertParam, local_parity); return; -} // qudaDslash +} // openQCD_colorspinorloadsave #if 0 void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, openQCD_QudaInvertArgs_t inv_args, From e1bed3601d6ccad0eeb771dc3c16106964a34b13 Mon Sep 17 00:00:00 2001 From: fernandezdlg Date: Thu, 13 Apr 2023 23:59:11 +0200 Subject: [PATCH 039/148] final --- lib/copy_color_spinor.cuh | 2 +- lib/cpu_gauge_field.cpp | 3 ++- lib/openqcd_interface.cpp | 4 +--- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index 486c2b6f1d..7688377b61 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -116,7 +116,7 @@ namespace quda { } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { #ifdef BUILD_OPENQCD_INTERFACE - // using O = OpenQCDDiracOrder; // TODO: Seems OK + // using O = OpenQCDDiracOrder; // TODO: NOT working using O = 
SpaceSpinorColorOrder; // FIXME: This is a test CopyColorSpinor(out, in, param); #else diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp index f3dbc8172d..5a2ee4d15f 100644 --- a/lib/cpu_gauge_field.cpp +++ b/lib/cpu_gauge_field.cpp @@ -420,7 +420,8 @@ namespace quda { for (int d = 0; d < 4; d++) { std::memcpy(p[d], &dst_buffer[d * dbytes], dbytes); } } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER - || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { + || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER + || Order() == QUDA_OPENQCD_GAUGE_ORDER) { void *p = Gauge_p(); size_t bytes = Bytes(); std::memcpy(p, buffer, bytes); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 1307db7c5c..067df5bac9 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -153,11 +153,9 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t input) { if (initialized) return; setVerbosityQuda(input.verbosity, "", stdout); - // qudamilc_called(__func__); openQCD_qudaSetLayout(input.layout); initialized = true; - // qudamilc_called(__func__); - // geometry_openQxD(); // TODO: in the future establish ipt and other helper indexes from openQxD? 
+ // geometry_openQxD(); // TODO: in the future establish ipt and other helper indexes from openQxD } void openQCD_qudaFinalize() { endQuda(); } From 5e60bc53e0376e04963b09523e74669f07ebfa58 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 14 Aug 2023 19:50:27 +0200 Subject: [PATCH 040/148] stated to clean up mess a little --- include/quda_openqcd_interface.h | 9 ++- lib/openqcd_interface.cpp | 95 ++++++++++++++++++++++---------- 2 files changed, 72 insertions(+), 32 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 0cec64e039..d2ef4fecbd 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -23,7 +23,7 @@ typedef struct { const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ // FIXME: const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ // FIXME: const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ // FIXME: - int device; /** GPU device number */ + int device; /** GPU device number */ // const int *ipt; // TODO: IN THE FUTURE } openQCD_QudaLayout_t; @@ -33,6 +33,10 @@ typedef struct { typedef struct { QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */ openQCD_QudaLayout_t layout; /** Layout for QUDA to use */ + FILE *logfile; + /*void (*reorder_gauge)(void *gauge); + int VOLUME; + int sizeof_su3_dble;*/ } openQCD_QudaInitArgs_t; // passed to the initialization struct /** @@ -140,10 +144,11 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, */ void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge); -void openQCD_qudaPlaquetteOnly(int precision, double plaq[3]); +void openQCD_qudaPlaquetteOnly(double plaq[3]); void openQCD_gaugeloadsave(int precision, void *gauge); void openQCD_gaugeload(int precision, void *gauge); void openQCD_gaugesave(int precision, void *gauge); +void openQCD_qudaFreeGaugeField(void); 
/** Free the gauge field allocated in QUDA. diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 067df5bac9..6a050e0cb3 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -47,6 +47,7 @@ static const int num_colors = sizeof(colors) / sizeof(uint32_t); #endif static bool initialized = false; +static openQCD_QudaInitArgs_t input; static int commsGridDim[4]; static int localDim[4]; @@ -112,28 +113,38 @@ static int rankFromCoords(const int *coords, void *fdata) // TODO: ib = ib * NPROC1_OpenQxD + n1_OpenQxD; ib = ib * NPROC2_OpenQxD + n2_OpenQxD; ib = ib * NPROC3_OpenQxD + n3_OpenQxD; - printf("Coords are: %d,%d,%d,%d \n Rank is: %d \n\n", coords[0], coords[1], coords[2], coords[3], ib); + + printfQuda("Coords are: %d,%d,%d,%d, Rank is: %d \n", coords[0], coords[1], coords[2], coords[3], ib); return ib; } -void openQCD_qudaSetLayout(openQCD_QudaLayout_t input) +void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) { int local_dim[4]; - for (int dir = 0; dir < 4; ++dir) { local_dim[dir] = input.latsize[dir]; } + for (int dir = 0; dir < 4; ++dir) { + local_dim[dir] = layout.latsize[dir]; + } + #ifdef MULTI_GPU - for (int dir = 0; dir < 4; ++dir) { local_dim[dir] /= input.machsize[dir]; } + for (int dir = 0; dir < 4; ++dir) { + local_dim[dir] /= layout.machsize[dir]; + } #endif for (int dir = 0; dir < 4; ++dir) { if (local_dim[dir] % 2 != 0) { - printf("Error: Odd lattice dimensions are not supported\n"); + printfQuda("Error: Odd lattice dimensions are not supported\n"); exit(1); } } // TODO: do we need to track this here - for (int dir = 0; dir < 4; ++dir) localDim[dir] = local_dim[dir]; + for (int dir = 0; dir < 4; ++dir) { + localDim[dir] = local_dim[dir]; + } #ifdef MULTI_GPU - for (int dir = 0; dir < 4; ++dir) commsGridDim[dir] = input.machsize[dir]; + for (int dir = 0; dir < 4; ++dir) { + commsGridDim[dir] = layout.machsize[dir]; + } // TODO: would we ever want to run with QMP COMMS? 
#ifdef QMP_COMMS initCommsGridQuda(4, commsGridDim, nullptr, nullptr); @@ -143,22 +154,26 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t input) static int device = -1; #else - static int device = input.device; + static int device = layout.device; #endif initQuda(device); } -void openQCD_qudaInit(openQCD_QudaInitArgs_t input) +void openQCD_qudaInit(openQCD_QudaInitArgs_t in) { if (initialized) return; - setVerbosityQuda(input.verbosity, "", stdout); - openQCD_qudaSetLayout(input.layout); + setVerbosityQuda(in.verbosity, "QUDA: ", in.logfile); + openQCD_qudaSetLayout(in.layout); + + input = in; initialized = true; // geometry_openQxD(); // TODO: in the future establish ipt and other helper indexes from openQxD } -void openQCD_qudaFinalize() { endQuda(); } +void openQCD_qudaFinalize() { + endQuda(); +} static int getLinkPadding(const int dim[4]) @@ -347,13 +362,15 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud * *******************************************/ -/* - * GAUGE FIELDS +/** + * @brief Calculate the plaquette, given gauge fields + * + * @param[in] precision The precision + * @param[out] plaq The plaquette + * @param[in,out] gauge The gauge fields in correct order */ - void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) { - QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); @@ -375,14 +392,13 @@ void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) } -void openQCD_qudaPlaquetteOnly(int precision, double plaq[3]) +/** + * @brief Calculate the plaquette + * + * @param[out] plaq The plaquette + */ +void openQCD_qudaPlaquetteOnly(double plaq[3]) { - - // QudaGaugeParam qudaGaugeParam - // = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - - // loadGaugeQuda(gauge, &qudaGaugeParam); - QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; // @@ -393,39 +409,58 @@ void openQCD_qudaPlaquetteOnly(int precision, double plaq[3]) plaq[1] = obsParam.plaquette[1]; plaq[2] = obsParam.plaquette[2]; - // saveGaugeQuda(gauge, &qudaGaugeParam); - return; } +/** + * @brief Load and save the gauge fields + * + * @param[in] precision The precision + * @param[in,out] gauge The gauge fields + */ void openQCD_gaugeloadsave(int precision, void *gauge) { - QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); loadGaugeQuda(gauge, &qudaGaugeParam); - saveGaugeQuda(gauge, &qudaGaugeParam); return; } + +/** + * @brief Load the gauge fields from host to quda + * + * @param[in] precision The precision + * @param[in] gauge The gauge fields (in lexicographical order) + */ void openQCD_gaugeload(int precision, void *gauge) { - QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + /*printfQuda("input.VOLUME = %d\n", input.VOLUME); + printfQuda("input.sizeof_su3_dble = %d\n", input.sizeof_su3_dble);*/ + /*buffer = malloc(4*VOLUME*sizeof(su3_dble));*/ + /*input.reorder_gauge();*/ + loadGaugeQuda(gauge, &qudaGaugeParam); return; } + +/** + * @brief Save the gauge fields from quda to host + * + * @param[in] precision The precision + * @param[out] gauge The gauge fields + */ void openQCD_gaugesave(int precision, void *gauge) { - QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); @@ -438,7 +473,7 @@ void openQCD_qudaFreeGaugeField() { freeGaugeQuda(); return; -} // qudaFreeGaugeField +} /* From 402cab6b272769085a376afb36d32a7dfa7a8de9 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 15 Aug 2023 19:48:43 +0200 Subject: [PATCH 041/148] cleaned up --- include/quda_openqcd_interface.h | 86 +++++--- lib/openqcd_interface.cpp | 353 ++++++++++++------------------- 2 files changed, 201 insertions(+), 238 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index d2ef4fecbd..9fbc06358b 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -8,7 +8,7 @@ * * @section Description * - * The header file defines the milc interface to enable easy + * The header file defines the interface to enable easy * interfacing between QUDA and the OpenQCD software. */ @@ -20,9 +20,9 @@ extern "C" { * Parameters related to problem size and machine topology. */ typedef struct { - const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ // FIXME: - const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ // FIXME: - const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ // FIXME: + const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ + const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ + const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ int device; /** GPU device number */ // const int *ipt; // TODO: IN THE FUTURE } openQCD_QudaLayout_t; @@ -34,11 +34,20 @@ typedef struct { QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */ openQCD_QudaLayout_t layout; /** Layout for QUDA to use */ FILE *logfile; - /*void (*reorder_gauge)(void *gauge); - int VOLUME; - int sizeof_su3_dble;*/ + int volume; /* VOLUME */ + int sizeof_su3_dble; /* sizeof(su3_dble) */ + void 
(*reorder_gauge_openqcd_to_quda)(void *in, void *out); + void (*reorder_gauge_quda_to_openqcd)(void *in, void *out); } openQCD_QudaInitArgs_t; // passed to the initialization struct + +typedef struct { + int initialized; + int gauge_loaded; + int dslash_setup; +} openQCD_QudaState_t; + + /** * Initialize the QUDA context. * @@ -80,20 +89,35 @@ typedef struct { QudaDslashType dslash_type; } openQCD_QudaInvertArgs_t; + /** - * Apply the improved staggered operator to a field. All fields - * passed and returned are host (CPU) field in MILC order. + * @brief Setup Dslash * - * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) - * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) - * @param inv_args Struct setting some solver metadata - * @param source Right-hand side source field - * @param solution Solution spinor field + * @param[in] kappa kappa + * @param[in] mu twisted mass */ -void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *source, - void *solution, void *gauge); +void openQCD_qudaSetDslashOptions(double kappa, double mu); +/** + * @brief Apply the Wilson-Clover Dirac operator to a field. All fields + * passed and returned are host (CPU) fields in openQCD order. + * + * @param[in] src Source spinor field + * @param[out] dst Destination spinor field + */ +void openQCD_qudaDslash(void *src, void *dst); + + +/** + * @brief Set metadata, options for Dslash. + * + * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) + * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single) + * @param[in] inv_args Struct containing arguments, metadata + */ +/*void openQCD_qudaSetDslashOptions(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args);*/ + /** * ALL the following except the Dirac operator application * Apply the improved staggered operator to a field. 
All fields @@ -135,25 +159,35 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, double *const final_rel_resid, int *num_iters); /** - * Load the gauge field from the host. + * @brief Calculate the plaquette * - * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) - * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) - * @param inv_args Meta data - * @param milc_link Base pointer to host gauge field (regardless of dimensionality) + * @param[out] plaq array to store the 3 plaquette values */ +void openQCD_qudaPlaquette(double plaq[3]); -void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge); -void openQCD_qudaPlaquetteOnly(double plaq[3]); -void openQCD_gaugeloadsave(int precision, void *gauge); + +/** + * @brief Load the gauge fields from host to quda + * + * @param[in] precision The precision + * @param[in] gauge The gauge fields (in openqcd order) + */ void openQCD_gaugeload(int precision, void *gauge); + + +/** + * @brief Save the gauge fields from quda to host + * + * @param[in] precision The precision + * @param[out] gauge The gauge fields (will be stored in openqcd order) + */ void openQCD_gaugesave(int precision, void *gauge); -void openQCD_qudaFreeGaugeField(void); + /** Free the gauge field allocated in QUDA. 
*/ -void openQCD_qudaFreeGaugeField(); +void openQCD_qudaFreeGaugeField(void); #ifdef __cplusplus } diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 6a050e0cb3..610c867689 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -46,8 +46,9 @@ static const int num_colors = sizeof(colors) / sizeof(uint32_t); #define POP_RANGE #endif -static bool initialized = false; static openQCD_QudaInitArgs_t input; +static QudaInvertParam invertParam = newQudaInvertParam(); +static openQCD_QudaState_t qudaState = {false, false, false}; static int commsGridDim[4]; static int localDim[4]; @@ -82,42 +83,34 @@ using namespace quda; * *******************************************/ -// fdata should point to 4 integers in order {NPROC0, NPROC1, NPROC2, NPROC3} -// coords is the 4D cartesian coordinate of a rank. +/** + * @brief Calculate the rank from coordinates. + * + * @param[in] coords coords is the 4D cartesian coordinate of a rank + * @param[in] fdata should point to 4 integers in order {NPROC0, NPROC1, + * NPROC2, NPROC3} + * + * @return rank + */ static int rankFromCoords(const int *coords, void *fdata) // TODO: { int *NPROC = static_cast(fdata); - // int *NPROC = BLK_NPROC + 4; - int ib; - int n0_OpenQxD; - int n1_OpenQxD; - int n2_OpenQxD; - int n3_OpenQxD; - // int NPROC0_OpenQxD; - int NPROC1_OpenQxD; - int NPROC2_OpenQxD; - int NPROC3_OpenQxD; - - n0_OpenQxD = coords[3]; - n1_OpenQxD = coords[0]; - n2_OpenQxD = coords[1]; - n3_OpenQxD = coords[2]; - - // NPROC0_OpenQxD=NPROC[3]; - NPROC1_OpenQxD = NPROC[0]; - NPROC2_OpenQxD = NPROC[1]; - NPROC3_OpenQxD = NPROC[2]; - - ib = n0_OpenQxD; - ib = ib * NPROC1_OpenQxD + n1_OpenQxD; - ib = ib * NPROC2_OpenQxD + n2_OpenQxD; - ib = ib * NPROC3_OpenQxD + n3_OpenQxD; - - printfQuda("Coords are: %d,%d,%d,%d, Rank is: %d \n", coords[0], coords[1], coords[2], coords[3], ib); + + ib = coords[3]; + ib = ib*NPROC[0] + coords[0]; + ib = ib*NPROC[1] + coords[1]; + ib = ib*NPROC[2] + coords[2]; + return 
ib; } + +/** + * @brief Set layout parameters. + * + * @param[in] layout The layout + */ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) { int local_dim[4]; @@ -160,14 +153,15 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) initQuda(device); } + void openQCD_qudaInit(openQCD_QudaInitArgs_t in) { - if (initialized) return; + if (qudaState.initialized) return; setVerbosityQuda(in.verbosity, "QUDA: ", in.logfile); openQCD_qudaSetLayout(in.layout); input = in; - initialized = true; + qudaState.initialized = true; // geometry_openQxD(); // TODO: in the future establish ipt and other helper indexes from openQxD } @@ -190,7 +184,14 @@ static int getLinkPadding(const int dim[4]) * *******************************************/ -/* OPENQCD GAUGE PARAMS */ +/** + * @brief OPENQCD GAUGE PARAMS + * + * @param[in] dim dimensions + * @param[in] prec precision + * + * @return The quda gauge parameter. + */ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) { QudaGaugeParam gParam = newQudaGaugeParam(); @@ -272,8 +273,7 @@ static void setColorSpinorParams(const int dim[4], QudaPrecision precision, Colo param->setPrecision(precision); param->pad = 0; param->siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME: check how to adapt this for openqxd - param->siteOrder - = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd // EVEN-ODD is only about inner ordering in quda + param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd // EVEN-ODD is only about inner ordering in quda param->fieldOrder = QUDA_OPENQCD_FIELD_ORDER; // FIXME: param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. // // FIXME:: param->create = QUDA_ZERO_FIELD_CREATE; // // FIXME:: check how to adapt this for openqxd ?? 
created -0 in weird places @@ -284,72 +284,72 @@ static void setColorSpinorParams(const int dim[4], QudaPrecision precision, Colo static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, double mass, double target_residual, double target_residual_hq, int maxiter, double reliable_delta, QudaParity parity, QudaVerbosity verbosity, - QudaInverterType inverter, QudaInvertParam *invertParam) + QudaInverterType inverter) { - invertParam->verbosity = verbosity; - invertParam->mass = mass; - invertParam->tol = target_residual; - invertParam->tol_hq = target_residual_hq; - - invertParam->residual_type = static_cast(0); - invertParam->residual_type = (target_residual != 0) ? - static_cast(invertParam->residual_type | QUDA_L2_RELATIVE_RESIDUAL) : - invertParam->residual_type; - invertParam->residual_type = (target_residual_hq != 0) ? - static_cast(invertParam->residual_type | QUDA_HEAVY_QUARK_RESIDUAL) : - invertParam->residual_type; - - invertParam->heavy_quark_check = (invertParam->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? 1 : 0); - if (invertParam->heavy_quark_check) { - invertParam->max_hq_res_increase = 5; // this caps the number of consecutive hq residual increases - invertParam->max_hq_res_restart_total = 10; // this caps the number of hq restarts in case of solver stalling + invertParam.verbosity = verbosity; + invertParam.mass = mass; + invertParam.tol = target_residual; + invertParam.tol_hq = target_residual_hq; + + invertParam.residual_type = static_cast(0); + invertParam.residual_type = (target_residual != 0) ? + static_cast(invertParam.residual_type | QUDA_L2_RELATIVE_RESIDUAL) : + invertParam.residual_type; + invertParam.residual_type = (target_residual_hq != 0) ? + static_cast(invertParam.residual_type | QUDA_HEAVY_QUARK_RESIDUAL) : + invertParam.residual_type; + + invertParam.heavy_quark_check = (invertParam.residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? 
1 : 0); + if (invertParam.heavy_quark_check) { + invertParam.max_hq_res_increase = 5; // this caps the number of consecutive hq residual increases + invertParam.max_hq_res_restart_total = 10; // this caps the number of hq restarts in case of solver stalling } - invertParam->use_sloppy_partial_accumulator = 0; - invertParam->num_offset = 0; + invertParam.use_sloppy_partial_accumulator = 0; + invertParam.num_offset = 0; - invertParam->inv_type = inverter; - invertParam->maxiter = maxiter; - invertParam->reliable_delta = reliable_delta; + invertParam.inv_type = inverter; + invertParam.maxiter = maxiter; + invertParam.reliable_delta = reliable_delta; - invertParam->mass_normalization = QUDA_MASS_NORMALIZATION; - invertParam->cpu_prec = cpu_prec; - invertParam->cuda_prec = cuda_prec; - invertParam->cuda_prec_sloppy = invertParam->heavy_quark_check ? cuda_prec : cuda_prec_sloppy; - invertParam->cuda_prec_precondition = cuda_prec_sloppy; + invertParam.mass_normalization = QUDA_MASS_NORMALIZATION; + invertParam.cpu_prec = cpu_prec; + invertParam.cuda_prec = cuda_prec; + invertParam.cuda_prec_sloppy = invertParam.heavy_quark_check ? cuda_prec : cuda_prec_sloppy; + invertParam.cuda_prec_precondition = cuda_prec_sloppy; - invertParam->gcrNkrylov = 10; + invertParam.gcrNkrylov = 10; - invertParam->solution_type = QUDA_MATPC_SOLUTION; - invertParam->solve_type = QUDA_DIRECT_PC_SOLVE; - invertParam->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // not used, but required by the code. - invertParam->dirac_order = QUDA_OPENQCD_DIRAC_ORDER; + invertParam.solution_type = QUDA_MATPC_SOLUTION; + invertParam.solve_type = QUDA_DIRECT_PC_SOLVE; + invertParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // not used, but required by the code. 
+ invertParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - invertParam->dslash_type = QUDA_WILSON_DSLASH; // FIXME: OR THIS; QUDA_ASQTAD_DSLASH; - invertParam->Ls = 1; - invertParam->gflops = 0.0; + invertParam.dslash_type = QUDA_WILSON_DSLASH; // FIXME: OR THIS; QUDA_ASQTAD_DSLASH; + invertParam.Ls = 1; + invertParam.gflops = 0.0; - invertParam->input_location = QUDA_CPU_FIELD_LOCATION; - invertParam->output_location = QUDA_CPU_FIELD_LOCATION; + invertParam.input_location = QUDA_CPU_FIELD_LOCATION; + invertParam.output_location = QUDA_CPU_FIELD_LOCATION; if (parity == QUDA_EVEN_PARITY) { // even parity - invertParam->matpc_type = QUDA_MATPC_EVEN_EVEN; + invertParam.matpc_type = QUDA_MATPC_EVEN_EVEN; } else if (parity == QUDA_ODD_PARITY) { - invertParam->matpc_type = QUDA_MATPC_ODD_ODD; + invertParam.matpc_type = QUDA_MATPC_ODD_ODD; } else { errorQuda("Invalid parity\n"); } - invertParam->dagger = QUDA_DAG_NO; - invertParam->use_init_guess = QUDA_USE_INIT_GUESS_YES; + invertParam.dagger = QUDA_DAG_NO; + invertParam.use_init_guess = QUDA_USE_INIT_GUESS_YES; // for the preconditioner - invertParam->inv_type_precondition = QUDA_CG_INVERTER; - invertParam->tol_precondition = 1e-1; - invertParam->maxiter_precondition = 2; - invertParam->verbosity_precondition = QUDA_SILENT; + invertParam.inv_type_precondition = QUDA_CG_INVERTER; + invertParam.tol_precondition = 1e-1; + invertParam.maxiter_precondition = 2; + invertParam.verbosity_precondition = QUDA_SILENT; - invertParam->compute_action = 0; + invertParam.compute_action = 0; } @@ -362,46 +362,17 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud * *******************************************/ -/** - * @brief Calculate the plaquette, given gauge fields - * - * @param[in] precision The precision - * @param[out] plaq The plaquette - * @param[in,out] gauge The gauge fields in correct order - */ -void openQCD_qudaPlaquette(int precision, double plaq[3], void *gauge) -{ - QudaGaugeParam 
qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - - loadGaugeQuda(gauge, &qudaGaugeParam); - - QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); - obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; - obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; // - gaugeObservablesQuda(&obsParam); - - // Note different Nc normalization! - plaq[0] = obsParam.plaquette[0]; - plaq[1] = obsParam.plaquette[1]; - plaq[2] = obsParam.plaquette[2]; - - saveGaugeQuda(gauge, &qudaGaugeParam); - - return; -} - -/** - * @brief Calculate the plaquette - * - * @param[out] plaq The plaquette - */ -void openQCD_qudaPlaquetteOnly(double plaq[3]) +void openQCD_qudaPlaquette(double plaq[3]) { + if (!qudaState.gauge_loaded) { + errorQuda("Gauge field not loaded into QUDA, cannot calculate plaquette. Call openQCD_gaugeload() first."); + return; + } + QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; - obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; // + obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; gaugeObservablesQuda(&obsParam); // Note different Nc normalization! @@ -413,65 +384,43 @@ void openQCD_qudaPlaquetteOnly(double plaq[3]) } -/** - * @brief Load and save the gauge fields - * - * @param[in] precision The precision - * @param[in,out] gauge The gauge fields - */ -void openQCD_gaugeloadsave(int precision, void *gauge) -{ - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - - loadGaugeQuda(gauge, &qudaGaugeParam); - saveGaugeQuda(gauge, &qudaGaugeParam); - - return; -} - - -/** - * @brief Load the gauge fields from host to quda - * - * @param[in] precision The precision - * @param[in] gauge The gauge fields (in lexicographical order) - */ void openQCD_gaugeload(int precision, void *gauge) { + void *buffer; + QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - /*printfQuda("input.VOLUME = %d\n", input.VOLUME); - printfQuda("input.sizeof_su3_dble = %d\n", input.sizeof_su3_dble);*/ - /*buffer = malloc(4*VOLUME*sizeof(su3_dble));*/ - /*input.reorder_gauge();*/ + buffer = malloc(4*input.volume*input.sizeof_su3_dble); + input.reorder_gauge_openqcd_to_quda(gauge, buffer); + loadGaugeQuda(buffer, &qudaGaugeParam); + free(buffer); - loadGaugeQuda(gauge, &qudaGaugeParam); + qudaState.gauge_loaded = true; return; } -/** - * @brief Save the gauge fields from quda to host - * - * @param[in] precision The precision - * @param[out] gauge The gauge fields - */ void openQCD_gaugesave(int precision, void *gauge) { + void *buffer; + QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - saveGaugeQuda(gauge, &qudaGaugeParam); + buffer = malloc(4*input.volume*input.sizeof_su3_dble); + saveGaugeQuda(buffer, &qudaGaugeParam); + input.reorder_gauge_quda_to_openqcd(buffer, gauge); + free(buffer); return; } -void openQCD_qudaFreeGaugeField() +void openQCD_qudaFreeGaugeField(void) { freeGaugeQuda(); + qudaState.gauge_loaded = false; return; } @@ -486,77 +435,57 @@ void openQCD_qudaFreeGaugeField() */ - - - -// #if 0 -/* FIXME: */ -void openQCD_qudaDslash(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, - void *dst, void *gauge) +void openQCD_qudaSetDslashOptions(double kappa, double mu) { static const QudaVerbosity verbosity = getVerbosity(); - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (quda_precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); - - loadGaugeQuda(gauge, &qudaGaugeParam); - - QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision = (quda_precision == 2) ? 
QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision_sloppy = device_precision; + invertParam.input_location = QUDA_CPU_FIELD_LOCATION; + invertParam.output_location = QUDA_CPU_FIELD_LOCATION; + invertParam.dslash_type = QUDA_WILSON_DSLASH; + invertParam.inv_type = QUDA_CG_INVERTER; /* just set some */ + invertParam.kappa = kappa; + invertParam.dagger = QUDA_DAG_NO; + invertParam.mass_normalization = QUDA_KAPPA_NORMALIZATION; + invertParam.Ls = 1; /**< Extent of the 5th dimension (for domain wall) */ + invertParam.mu = mu; /**< Twisted mass parameter */ + /*invertParam.tm_rho = ?;*/ /**< Hasenbusch mass shift applied like twisted mass to diagonal (but not inverse) */ + /*invertParam.epsilon = ?;*/ /**< Twisted mass parameter */ + /*invertParam.twist_flavor = ??;*/ /**< Twisted mass flavor */ + invertParam.laplace3D = -1; /**< omit this direction from laplace operator: x,y,z,t -> 0,1,2,3 (-1 is full 4D) */ - QudaInvertParam invertParam = newQudaInvertParam(); + invertParam.cpu_prec = QUDA_DOUBLE_PRECISION; /**< The precision used by the input fermion fields */ + invertParam.cuda_prec = QUDA_DOUBLE_PRECISION; /**< The precision used by the QUDA solver */ - QudaParity local_parity = inv_args.evenodd; - QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? 
QUDA_ODD_PARITY : QUDA_EVEN_PARITY; + invertParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; /**< The order of the input and output fermion fields */ + invertParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; /**< Gamma basis of the input and output host fields */ - /* For reference: - setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, - double mass, double target_residual, double target_residual_hq, int maxiter, - double reliable_delta, QudaParity parity, QudaVerbosity verbosity, - QudaInverterType inverter, QudaInvertParam *invertParam) */ - setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, - QUDA_CG_INVERTER, &invertParam); + invertParam.verbosity = verbosity; /**< The verbosity setting to use in the solver */ + invertParam.compute_action = 0; ColorSpinorParam csParam; - setColorSpinorParams(localDim, host_precision, &csParam); + setColorSpinorParams(localDim, invertParam.cpu_prec, &csParam); - dslashQuda(static_cast(dst), static_cast(src), &invertParam, local_parity); + qudaState.dslash_setup = true; +} - return; -} // openQCD_qudaDslash -#if 0 // FIXME: -void openQCD_qudaDslashNoLoads(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, - void *dst, void *gauge) +void openQCD_qudaDslash(void *src, void *dst) { - static const QudaVerbosity verbosity = getVerbosity(); - - QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision_sloppy = device_precision; - - QudaInvertParam invertParam = newQudaInvertParam(); - - QudaParity local_parity = inv_args.evenodd; - QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? 
QUDA_ODD_PARITY : QUDA_EVEN_PARITY; - - /* For reference: - setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, - double mass, double target_residual, double target_residual_hq, int maxiter, - double reliable_delta, QudaParity parity, QudaVerbosity verbosity, - QudaInverterType inverter, QudaInvertParam *invertParam) */ - setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, - QUDA_CG_INVERTER, &invertParam); + if (!qudaState.gauge_loaded) { + errorQuda("Gauge field not loaded into QUDA, cannot apply Dslash. Call openQCD_gaugeload() first."); + return; + } - ColorSpinorParam csParam; - setColorSpinorParams(localDim, host_precision, &csParam); + if (!qudaState.dslash_setup) { + errorQuda("Dslash parameters are not set, cannot apply Dslash!"); + return; + } - dslashQudaNoLoads(static_cast(dst), static_cast(src), &invertParam, local_parity); + dslashQuda(static_cast(dst), static_cast(src), &invertParam, QUDA_EVEN_PARITY); return; -} // openQCD_qudaDslash -#endif +} + void openQCD_colorspinorloadsave(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, void *dst, void *gauge) @@ -582,8 +511,8 @@ void openQCD_colorspinorloadsave(int external_precision, int quda_precision, ope double mass, double target_residual, double target_residual_hq, int maxiter, double reliable_delta, QudaParity parity, QudaVerbosity verbosity, QudaInverterType inverter, QudaInvertParam *invertParam) */ - setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, - QUDA_CG_INVERTER, &invertParam); + /*setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, + QUDA_CG_INVERTER, &invertParam);*/ ColorSpinorParam csParam; setColorSpinorParams(localDim, host_precision, &csParam); From ed234bd2481d8a705adde564d0f90addeaba3168 Mon Sep 17 
00:00:00 2001 From: Roman Gruber Date: Mon, 21 Aug 2023 13:55:12 +0200 Subject: [PATCH 042/148] refactored plaquette --- include/quda_openqcd_interface.h | 5 +++-- lib/openqcd_interface.cpp | 19 ++++++++++++++----- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 9fbc06358b..e4d564d9d6 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -161,9 +161,10 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, /** * @brief Calculate the plaquette * - * @param[out] plaq array to store the 3 plaquette values + * @return plaquette value + * @see https://github.com/lattice/quda/wiki/Gauge-Measurements#wilson-plaquette-action */ -void openQCD_qudaPlaquette(double plaq[3]); +double openQCD_qudaPlaquette(void); /** diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 610c867689..60b8bee9e5 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -363,14 +363,16 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud *******************************************/ -void openQCD_qudaPlaquette(double plaq[3]) +double openQCD_qudaPlaquette(void) { + double plaq[3]; + if (!qudaState.gauge_loaded) { errorQuda("Gauge field not loaded into QUDA, cannot calculate plaquette. Call openQCD_gaugeload() first."); - return; + return 0.0; } - QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); + /*QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; gaugeObservablesQuda(&obsParam); @@ -378,9 +380,16 @@ void openQCD_qudaPlaquette(double plaq[3]) // Note different Nc normalization! 
plaq[0] = obsParam.plaquette[0]; plaq[1] = obsParam.plaquette[1]; - plaq[2] = obsParam.plaquette[2]; + plaq[2] = obsParam.plaquette[2];*/ - return; + plaqQuda(plaq); + +/* plaq[1] *= 3.0; + plaq[2] *= 3.0; + plaq[0] *= 3.0;*/ + + // Note different Nc normalization wrt openQCD! + return 3.0*plaq[0]; } From bfe766f54c274698074b980a22775e7f8bc96417 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 23 Aug 2023 19:00:42 +0200 Subject: [PATCH 043/148] Only have generic lexicographical ordering in OpenQCDOrder --- include/gauge_field_order.h | 115 ++++++++++++------------------------ 1 file changed, 39 insertions(+), 76 deletions(-) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 04b635478c..dd454a0235 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2315,102 +2315,64 @@ namespace quda { }; -/** - struct to define OpenQCD ordered gauge fields: - [volumecb][dim][parity*][row][col] parity*: uplink/downlink (link attached to closest odd site) - */ + /** + * struct to define order of gauge fields in OpenQCD + */ template struct OpenQCDOrder : LegacyOrder { + using Accessor = OpenQCDOrder; using real = typename mapper::type; using complex = complex; + Float *gauge; const int volumeCB; - // int ipt; static constexpr int Nc = 3; - const int dim[4]; + const int L[4]; + OpenQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), - gauge(gauge_ ? gauge_ : (Float *)u.Gauge_p()), - volumeCB( - u.VolumeCB()), // NOTE: Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice - dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]} // GLOBAL dimensions + gauge(gauge_ ? 
gauge_ : (Float *)u.Gauge_p()), // pointer to the gauge field on CPU + volumeCB(u.VolumeCB()), // Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice + L {u.X()[0], u.X()[1], u.X()[2], u.X()[3]} // initialized dim with *local* lattice dimensions { - if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); + if constexpr (length != 18) { + errorQuda("Gauge field length %d not supported", length); + } } + /** + * @brief Gets the offset in Floats from the base pointer of the gauge fields. + * + * @param[in] x Checkerboard index coming from quda + * @param[in] dir The direction coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. + */ + __device__ __host__ inline int getGaugeOffset(int x, int dir, int parity) const { + int coord[4]; + getCoords(coord, x, L, parity); + int idx = coord[3] + L[3]*coord[2] + L[3]*L[2]*coord[1] + L[3]*L[2]*L[1]*coord[0]; + return (4*idx + dir)*length; + } - // TODO: make this function - // __device__ __host__ inline int QUDAtoOpenQxD(int x_cb_QUDA, int dir_QUDA, int parity_QUDA) const - // TODO: Implement ipt and iup functions - // { - - // } - - - __device__ __host__ inline void load(complex v[9], int x, int dir, int parity, - Float = 1.0) const + __device__ __host__ inline void load(complex v[length/2], int x, int dir, int parity, Float = 1.0) const { - // With ''natural'' order: lexicographical 0123 = txyz , t fastest, links 0123 = txyz in pos directions - - // Indexing fun: - int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
- - getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site - - // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; - // TODO: Determine whether coord[mu] is local or global - int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; - /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - coord1 in QUDA is x2 in OpenQxD (y) - coord2 in QUDA is x3 in OpenQxD (z) - coord3 in QUDA is x0 in OpenQxD (t) - */ - // int ix_OpenQxD = ipt[iy_OpenQxD]; - int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD - - // Loading as per QUDA style - auto in - = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; // This is how they're accessed within OpenQxd (length = 18 - // doubles = 9 complex doubles = 1 su3dble struct) - // auto in = &gauge[ (8*(ix_OpenQxD - volumeCB) + 2*dir_OpenQxD)* length]; // This is how they're accessed - // within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) - block_load(v, reinterpret_cast(in)); - - + auto in = &gauge[getGaugeOffset(x, dir, parity)]; + block_load(v, reinterpret_cast(in)); } - __device__ __host__ inline void save(const complex v[9], int x, int dir, int parity) const + __device__ __host__ inline void save(const complex v[length/2], int x, int dir, int parity) const { - // Indexing fun: - int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
- - getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site - - // int iy_OpenQxD = x3 + L3*x2 + L3*L2*x1 + L3*L2*L1*x0; - // TODO: Determine whether coord[mu] is local or global - /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - coord1 in QUDA is x2 in OpenQxD (y) - coord2 in QUDA is x3 in OpenQxD (z) - coord3 in QUDA is x0 in OpenQxD (t) - */ - int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; - - // int ix_OpenQxD = ipt[iy_OpenQxD]; - int dir_OpenQxD = (dir + 1) % 4; // rotation of axes QUDA -> OpenQxD - - // Loading as per QUDA style - // This is how they're accessed within OpenQxd (length = 18 - // doubles = 9 complex doubles = 1 su3dble struct) - auto out = &gauge[(4 * iy_OpenQxD + dir_OpenQxD) * length]; - // within OpenQxd (length = 18 doubles = 9 complex doubles = 1 su3dble struct) - block_store(reinterpret_cast(out), v); + auto out = &gauge[getGaugeOffset(x, dir, parity)]; + block_store(reinterpret_cast(out), v); } /** @brief This accessor routine returns a gauge_wrapper to this object, allowing us to overload various operators for manipulating at the site level interms of matrix operations. 
- @param[in] dir Which dimension are we requesting + @param[in] dim Which dimension are we requesting @param[in] x_cb Checkerboarded space-time index we are requesting @param[in] parity Parity we are requesting @return Instance of a gauge_wrapper that curries in access to @@ -2423,9 +2385,10 @@ namespace quda { size_t Bytes() const { - return Nc * Nc * 2 * sizeof(Float); - } // Double => Float = 1.0 => 1 byte per float, 18 floats per complex 3x3 matrix - }; + return 2*Nc*Nc*sizeof(Float); + } + }; // class OpenQCDOrder + } // namespace gauge From b837622ae460c6c59e32c772782bc46adccd9bca Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Thu, 24 Aug 2023 17:37:46 +0200 Subject: [PATCH 044/148] norms on both sides coincide --- include/color_spinor_field.h | 3 +- include/color_spinor_field_order.h | 120 +++++++---------------------- include/gauge_field_order.h | 5 +- include/quda_openqcd_interface.h | 44 ++++++----- lib/copy_color_spinor.cuh | 7 +- lib/openqcd_interface.cpp | 107 ++++++++++++++----------- 6 files changed, 123 insertions(+), 163 deletions(-) diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index 4ee0af51e2..e525c99369 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -231,8 +231,7 @@ namespace quda siteOrder = QUDA_EVEN_ODD_SITE_ORDER; } else if (inv_param.dirac_order == QUDA_OPENQCD_DIRAC_ORDER) { fieldOrder = QUDA_OPENQCD_FIELD_ORDER; - siteOrder - = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL?, OR VIA FULL IMPLEMENTATION VIA IPT ARRAY IN QUDA + siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL?, OR VIA FULL IMPLEMENTATION VIA IPT ARRAY IN QUDA } else { errorQuda("Dirac order %d not supported", inv_param.dirac_order); } diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index e2736d2db5..a61785ae19 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1720,97 +1720,53 @@ 
namespace quda size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; - // Based on ''SpaceSpinorColorOrder'' TODO: + /** + * struct to define order of spinor fields in OpenQCD + */ template struct OpenQCDDiracOrder { + using Accessor = OpenQCDDiracOrder; using real = typename mapper::type; using complex = complex; - static const int length = 2 * Ns * Nc; // 12 complex (2 floats) numbers per spinor color field + + static const int length = 2*Ns*Nc; // 12 complex (2 floats) numbers per spinor color field Float *field; size_t offset; Float *ghost[8]; int volumeCB; int faceVolumeCB[4]; int nParity; - const int dim[4]; + const int L[4]; OpenQCDDiracOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : field(field_ ? field_ : (Float *)a.V()), offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for?? volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), - dim {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // GLOBAL dimensions?? - { // TODO: ARE GHOSTS NEEDED?? - // for (int i = 0; i < 4; i++) { - // ghost[2 * i] = ghost_ ? ghost_[2 * i] : 0; - // ghost[2 * i + 1] = ghost_ ? ghost_[2 * i + 1] : 0; - // faceVolumeCB[i] = a.SurfaceCB(i) * nFace; - // } - if constexpr (length != 24) errorQuda("Spinor field length %d not supported", length); - } - - /** - @brief Convert from 1-dimensional index to the n-dimensional - spatial index. With full fields, we assume that the field is - even-odd ordered. The lattice coordinates that are computed - here are full-field coordinates. 
- */ - // __device__ __host__ inline void LatticeIndexOpenQCD(int y[4], int i) const - // { - // // if (siteSubset == QUDA_FULL_SITE_SUBSET) - // x[0] /= 2; - - // for (int d = 0; d < 4; d++) { - // y[d] = i % x[d]; - // i /= x[d]; - // } - // int parity = i; // parity is the slowest running dimension - - // // convert into the full-field lattice coordinate - // // if (siteSubset == QUDA_FULL_SITE_SUBSET) { - // for (int d = 1; d < nDim; d++) parity += y[d]; - // parity = parity & 1; - // x[0] *= 2; // restore x[0] - // // } - - // y[0] = 2 * y[0] + parity; // compute the full x coordinate - // } - - /* lexicographical index: coord0 in QUDA is x1 in OpenQxD (x) - coord1 in QUDA is x2 in OpenQxD (y) - coord2 in QUDA is x3 in OpenQxD (z) - coord3 in QUDA is x0 in OpenQxD (t) - */ - __device__ __host__ inline void load(complex v[length / 2], int x, int parity = 0) const + L {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // local dimensions { + if constexpr (length != 24) { + errorQuda("Spinor field length %d not supported", length); + } + } - /* INDEXING */ - int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
- getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site - - int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; - - // Loading as per QUDA style - auto in = &field[iy_OpenQxD * length]; // This is how they're accessed within OpenQxd (length = 24 doubles - // = 12 complex doubles = 4 spinor x 3 colors) - // - // printf("Loading site iy: %d with field value %.10e \n", iy_OpenQxD, field[iy_OpenQxD * length]); - block_load(v, reinterpret_cast(in)); + __device__ __host__ inline int getSpinorOffset(int x, int parity) const { + int coord[4]; + getCoords(coord, x, L, parity); + int idx = coord[3] + L[3]*coord[2] + L[3]*L[2]*coord[1] + L[3]*L[2]*L[1]*coord[0]; + return idx*length; } - __device__ __host__ inline void save(const complex v[length / 2], int x, int parity = 0) const + __device__ __host__ inline void load(complex v[length/2], int x, int parity = 0) const { - /* INDEXING */ - int coord[4]; // declare a 4D vector x0, x1, x2, x3 = (xyzt), t fastest (ix = x0 + x1 * L0 + ...) 
- getCoords(coord, x, dim, parity); // from x, dim, parity obtain coordinate of the site - - int iy_OpenQxD = coord[2] + dim[2] * coord[1] + dim[2] * dim[1] * coord[0] + dim[0] * dim[2] * dim[1] * coord[3]; - - // Loading as per QUDA style - auto out = &field[iy_OpenQxD * length]; - // printf("Saving site iy: %d with field value %.10e \n",iy_OpenQxD,field[iy_OpenQxD * length]); + auto in = &field[getSpinorOffset(x, parity)]; + block_load(v, reinterpret_cast(in)); + } - block_store(reinterpret_cast(out), v); + __device__ __host__ inline void save(const complex v[length/2], int x, int parity = 0) const + { + auto out = &field[getSpinorOffset(x, parity)]; + block_store(reinterpret_cast(out), v); } /** @@ -1827,28 +1783,10 @@ namespace quda return colorspinor_wrapper(*this, x_cb, parity); } - // __device__ __host__ inline void loadGhost(complex v[length / 2], int x, int dim, int dir, int parity = 0) const - // { - // for (int s = 0; s < Ns; s++) { - // for (int c = 0; c < Nc; c++) { - // v[s * Nc + c] - // = complex(ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 0], - // ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 1]); - // } - // } - // } - - // __device__ __host__ inline void saveGhost(const complex v[length / 2], int x, int dim, int dir, int parity = 0) const - // { - // for (int s = 0; s < Ns; s++) { - // for (int c = 0; c < Nc; c++) { - // ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 0] = v[s * Nc + c].real(); - // ghost[2 * dim + dir][(((parity * faceVolumeCB[dim] + x) * Ns + s) * Nc + c) * 2 + 1] = v[s * Nc + c].imag(); - // } - // } - // } - - size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } + size_t Bytes() const + { + return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); + } }; // openQCDDiracOrder } // namespace colorspinor diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 
dd454a0235..b53ae36ee1 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2341,7 +2341,10 @@ namespace quda { } /** - * @brief Gets the offset in Floats from the base pointer of the gauge fields. + * @brief Gets the offset in Floats from the openQCD base pointer of + * the gauge fields. At this point, fields are reorder with a + * xyzt-lexicographical spacetime index, so nothing special to + * do here. * * @param[in] x Checkerboard index coming from quda * @param[in] dir The direction coming from quda diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index e4d564d9d6..138a862b7a 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -20,31 +20,34 @@ extern "C" { * Parameters related to problem size and machine topology. */ typedef struct { - const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ - const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ - const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ - int device; /** GPU device number */ - // const int *ipt; // TODO: IN THE FUTURE + const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ + const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ + const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ + int device; /** GPU device number */ } openQCD_QudaLayout_t; /** * Parameters used to create a QUDA context. 
*/ typedef struct { - QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */ - openQCD_QudaLayout_t layout; /** Layout for QUDA to use */ - FILE *logfile; - int volume; /* VOLUME */ - int sizeof_su3_dble; /* sizeof(su3_dble) */ + QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */ + openQCD_QudaLayout_t layout; /** Layout for QUDA to use */ + FILE *logfile; /** log file handler */ + void *gauge; /** base pointer to the gauge fields */ + int volume; /** VOLUME */ + int sizeof_su3_dble; /** sizeof(su3_dble) */ + int sizeof_spinor_dble; /** sizeof(spinor_dble) */ void (*reorder_gauge_openqcd_to_quda)(void *in, void *out); void (*reorder_gauge_quda_to_openqcd)(void *in, void *out); -} openQCD_QudaInitArgs_t; // passed to the initialization struct + void (*reorder_spinor_openqcd_to_quda)(void *in, void *out); + void (*reorder_spinor_quda_to_openqcd)(void *in, void *out); +} openQCD_QudaInitArgs_t; typedef struct { - int initialized; - int gauge_loaded; - int dslash_setup; + int initialized; /** Whether openQCD_qudaInit() was called or not */ + int gauge_loaded; /** Whether openQCD_gaugeload() was called or not */ + int dslash_setup; /** Whether openQCD_qudaSetDslashOptions() was called or not */ } openQCD_QudaState_t; @@ -98,6 +101,7 @@ typedef struct { */ void openQCD_qudaSetDslashOptions(double kappa, double mu); +double openQCD_qudaNorm(void *h_in); /** * @brief Apply the Wilson-Clover Dirac operator to a field. All fields @@ -159,9 +163,11 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, double *const final_rel_resid, int *num_iters); /** - * @brief Calculate the plaquette + * @brief Wrapper for the plaquette. 
We could call plaqQuda() directly in + * openQCD, but we have to make sure manually that the gauge field + * is loaded * - * @return plaquette value + * @return Plaquette value * @see https://github.com/lattice/quda/wiki/Gauge-Measurements#wilson-plaquette-action */ double openQCD_qudaPlaquette(void); @@ -170,19 +176,17 @@ double openQCD_qudaPlaquette(void); /** * @brief Load the gauge fields from host to quda * - * @param[in] precision The precision * @param[in] gauge The gauge fields (in openqcd order) */ -void openQCD_gaugeload(int precision, void *gauge); +void openQCD_gaugeload(void *gauge); /** * @brief Save the gauge fields from quda to host * - * @param[in] precision The precision * @param[out] gauge The gauge fields (will be stored in openqcd order) */ -void openQCD_gaugesave(int precision, void *gauge); +void openQCD_gaugesave(void *gauge); /** diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index 7688377b61..36bda6af08 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -116,8 +116,7 @@ namespace quda { } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { #ifdef BUILD_OPENQCD_INTERFACE - // using O = OpenQCDDiracOrder; // TODO: NOT working - using O = SpaceSpinorColorOrder; // FIXME: This is a test + using O = OpenQCDDiracOrder; CopyColorSpinor(out, in, param); #else errorQuda("OpenQCD interface has not been built\n"); @@ -168,8 +167,8 @@ namespace quda { } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { #ifdef BUILD_OPENQCD_INTERFACE - // using ColorSpinor = OpenQCDDiracOrder; // TODO: Seems OK - using ColorSpinor = SpaceSpinorColorOrder; // FIXME: This is a test + using ColorSpinor = OpenQCDDiracOrder; // TODO: Seems OK + //using ColorSpinor = SpaceSpinorColorOrder; // FIXME: This is a test genericCopyColorSpinor(param); #else errorQuda("OpenQCD interface has not been built\n"); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 60b8bee9e5..8b9931cb66 100644 --- 
a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -157,12 +157,11 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) void openQCD_qudaInit(openQCD_QudaInitArgs_t in) { if (qudaState.initialized) return; - setVerbosityQuda(in.verbosity, "QUDA: ", in.logfile); - openQCD_qudaSetLayout(in.layout); - input = in; + + setVerbosityQuda(input.verbosity, "QUDA: ", input.logfile); + openQCD_qudaSetLayout(input.layout); qudaState.initialized = true; - // geometry_openQxD(); // TODO: in the future establish ipt and other helper indexes from openQxD } void openQCD_qudaFinalize() { @@ -185,17 +184,21 @@ static int getLinkPadding(const int dim[4]) *******************************************/ /** - * @brief OPENQCD GAUGE PARAMS + * @brief Initialize gauge param struct * * @param[in] dim dimensions * @param[in] prec precision * - * @return The quda gauge parameter. + * @return The quda gauge parameter struct. */ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) { QudaGaugeParam gParam = newQudaGaugeParam(); - for (int dir = 0; dir < 4; ++dir) gParam.X[dir] = dim[dir]; + + for (int dir = 0; dir < 4; ++dir) { + gParam.X[dir] = dim[dir]; + } + gParam.cuda_prec_sloppy = gParam.cpu_prec = gParam.cuda_prec = prec; gParam.type = QUDA_SU3_LINKS; @@ -207,12 +210,13 @@ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) gParam.anisotropy = 1.0; gParam.tadpole_coeff = 1.0; gParam.scale = 0; - gParam.ga_pad = getLinkPadding(dim); + gParam.ga_pad = getLinkPadding(dim); /* Why this? 
*/ return gParam; } -void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_QudaInvertArgs_t &inv_args, + +/*void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_QudaInvertArgs_t &inv_args, int external_precision, int quda_precision) { @@ -257,7 +261,7 @@ void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_Qu qudaGaugeParam.cuda_prec_precondition = device_precision_sloppy; qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; // qudaGaugeParam.ga_pad = getLinkPadding(dim); -} +}*/ /* PARAMS FOR SPINOR FIELDS */ @@ -281,7 +285,7 @@ static void setColorSpinorParams(const int dim[4], QudaPrecision precision, Colo /* PARAMS FOR DSLASH AND INVERSION */ -static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, +/*static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, double mass, double target_residual, double target_residual_hq, int maxiter, double reliable_delta, QudaParity parity, QudaVerbosity verbosity, QudaInverterType inverter) @@ -351,7 +355,7 @@ static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, Qud invertParam.compute_action = 0; } - +*/ @@ -393,57 +397,36 @@ double openQCD_qudaPlaquette(void) } -void openQCD_gaugeload(int precision, void *gauge) +void openQCD_gaugeload(void *gauge) { - void *buffer; - - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? 
QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, QUDA_DOUBLE_PRECISION); - buffer = malloc(4*input.volume*input.sizeof_su3_dble); + void* buffer = malloc(4*input.volume*input.sizeof_su3_dble); input.reorder_gauge_openqcd_to_quda(gauge, buffer); + input.gauge = gauge; loadGaugeQuda(buffer, &qudaGaugeParam); free(buffer); qudaState.gauge_loaded = true; - - return; } -void openQCD_gaugesave(int precision, void *gauge) +void openQCD_gaugesave(void *gauge) { - void *buffer; - - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, QUDA_DOUBLE_PRECISION); - buffer = malloc(4*input.volume*input.sizeof_su3_dble); + void* buffer = malloc(4*input.volume*input.sizeof_su3_dble); saveGaugeQuda(buffer, &qudaGaugeParam); input.reorder_gauge_quda_to_openqcd(buffer, gauge); free(buffer); - - return; } void openQCD_qudaFreeGaugeField(void) { freeGaugeQuda(); qudaState.gauge_loaded = false; - return; } - -/* - * SPINOR FIELDS - */ - - -/* - * SPINOR AND GAUGE FIELDS - */ - - void openQCD_qudaSetDslashOptions(double kappa, double mu) { static const QudaVerbosity verbosity = getVerbosity(); @@ -478,8 +461,34 @@ void openQCD_qudaSetDslashOptions(double kappa, double mu) } +double openQCD_qudaNorm(void *h_in) +{ + lat_dim_t X; + + for (int i = 0; i<4; i++) { + X[i] = localDim[i]; + } + + QudaInvertParam sParam = newQudaInvertParam(); + sParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; + sParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + sParam.cpu_prec = QUDA_DOUBLE_PRECISION; + sParam.cuda_prec = QUDA_DOUBLE_PRECISION; + + ColorSpinorParam cpuParam(h_in, sParam, X, false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam); + ColorSpinorParam cudaParam(cpuParam, sParam, QUDA_CUDA_FIELD_LOCATION); + ColorSpinorField in(cudaParam); + in = in_h; + + return blas::norm2(in); +} + 
+ void openQCD_qudaDslash(void *src, void *dst) { + void *buffer1, *buffer2; + if (!qudaState.gauge_loaded) { errorQuda("Gauge field not loaded into QUDA, cannot apply Dslash. Call openQCD_gaugeload() first."); return; @@ -490,7 +499,15 @@ void openQCD_qudaDslash(void *src, void *dst) return; } - dslashQuda(static_cast(dst), static_cast(src), &invertParam, QUDA_EVEN_PARITY); + buffer1 = malloc(input.volume*input.sizeof_spinor_dble); + buffer2 = malloc(input.volume*input.sizeof_spinor_dble); + + input.reorder_spinor_openqcd_to_quda(src, buffer1); + MatQuda(static_cast(buffer2), static_cast(buffer1), &invertParam); + input.reorder_spinor_quda_to_openqcd(buffer2, dst); + + free(buffer1); + free(buffer2); return; } @@ -499,7 +516,7 @@ void openQCD_qudaDslash(void *src, void *dst) void openQCD_colorspinorloadsave(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, void *dst, void *gauge) { - static const QudaVerbosity verbosity = getVerbosity(); + //static const QudaVerbosity verbosity = getVerbosity(); QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, (quda_precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); @@ -507,13 +524,13 @@ void openQCD_colorspinorloadsave(int external_precision, int quda_precision, ope loadGaugeQuda(gauge, &qudaGaugeParam); QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision_sloppy = device_precision; + //QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; + //QudaPrecision device_precision_sloppy = device_precision; QudaInvertParam invertParam = newQudaInvertParam(); QudaParity local_parity = inv_args.evenodd; - QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? 
QUDA_ODD_PARITY : QUDA_EVEN_PARITY; + //QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? QUDA_ODD_PARITY : QUDA_EVEN_PARITY; /* For reference: setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, From 23b1e0ba9d45de4f0ec2eaea1e624d30cf0afe68 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 25 Aug 2023 17:23:09 +0200 Subject: [PATCH 045/148] all reordering of spinor fields done in quda now --- include/color_spinor_field_order.h | 98 +++++++++++++++++++++++++++--- include/gauge_field_order.h | 8 +-- 2 files changed, 95 insertions(+), 11 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index a61785ae19..e6100915a0 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1737,24 +1737,108 @@ namespace quda int faceVolumeCB[4]; int nParity; const int L[4]; + const int rank; - OpenQCDDiracOrder(const ColorSpinorField &a, int nFace = 1, Float *field_ = 0, float * = 0, Float **ghost_ = 0) : + OpenQCDDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) : field(field_ ? field_ : (Float *)a.V()), offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for?? volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), - L {a.X()[0], a.X()[1], a.X()[2], a.X()[3]} // local dimensions + L {a.X()[0], a.X()[1], a.X()[2], a.X()[3]}, // local dimensions (xyzt) + rank(comm_rank()) { if constexpr (length != 24) { errorQuda("Spinor field length %d not supported", length); } } - __device__ __host__ inline int getSpinorOffset(int x, int parity) const { - int coord[4]; - getCoords(coord, x, L, parity); - int idx = coord[3] + L[3]*coord[2] + L[3]*L[2]*coord[1] + L[3]*L[2]*L[1]*coord[0]; - return idx*length; + + /** + * @brief Pure function to return ipt[iy], where + * iy=x3+L3*x2+L2*L3*x1+L1*L2*L3*x0 without accessing the + * ipt-array, but calculating the index on the fly. 
Notice that + * xi and Li are in openQCD (txyz) convention. If they come + * from QUDA, you have to rotate them first. + * + * @param[in] x Carthesian local lattice corrdinates, 0 <= x[i] < Li + * + * @return ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0] = the local flat index of + * openQCD + */ + __device__ __host__ inline int ipt(int *x) const + { + int xb[4], xn[4], ib, in, is, cbs[4], mu, L_[4]; + + rotate_coords(L, L_); + + /* cache_block */ + for (mu=1;mu<4;mu++) { + if ((L[mu]%4)==0) { + cbs[mu]=4; + } else if ((L[mu]%3)==0) { + cbs[mu]=3; + } else if ((L[mu]%2)==0) { + cbs[mu]=2; + } else { + cbs[mu]=1; + } + } + + xb[0] = x[0]; + xb[1] = x[1] % cbs[1]; + xb[2] = x[2] % cbs[2]; + xb[3] = x[3] % cbs[3]; + + xn[1] = x[1]/cbs[1]; + xn[2] = x[2]/cbs[2]; + xn[3] = x[3]/cbs[3]; + + /** + * This is essentially what cbix[...] does. + * Notice integer division; truncated towards zero, i.e. 5/2=2 + */ + ib = (xb[3] + cbs[3]*xb[2] + cbs[2]*cbs[3]*xb[1] + cbs[1]*cbs[2]*cbs[3]*xb[0])/2; + + in = xn[3] + (L_[3]/cbs[3])*xn[2] + (L_[3]/cbs[3])*(L_[2]/cbs[2])*xn[1]; + is = x[0] + x[1] + x[2] + x[3]; + + return ib + (L_[0]*cbs[1]*cbs[2]*cbs[3]*in)/2 + (is%2)*(L_[0]*L_[1]*L_[2]*L_[3]/2); + } + + /** + * @brief Rotate corrdinates (xyzt -> txyz) + * + * @param[in] x_quda Carthesian local lattice coordinates in quda + * convention (xyzt) + * @param[out] x_openQCD Carthesian local lattice coordinates in openQCD + * convention (txyz) + */ + __device__ __host__ inline void rotate_coords(const int *x_quda, int *x_openQCD) const + { + x_openQCD[1] = x_quda[0]; + x_openQCD[2] = x_quda[1]; + x_openQCD[3] = x_quda[2]; + x_openQCD[0] = x_quda[3]; + } + + /** + * @brief Gets the offset in Floats from the openQCD base pointer to + * the spinor field. + * + * @param[in] x Checkerboard index coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. 
+ */ + __device__ __host__ inline int getSpinorOffset(int x, int parity) const + { + int coord_quda[4], coord_openQCD[4]; + + /* coord_quda contains xyzt local Carthesian corrdinates */ + getCoords(coord_quda, x, L, parity); + rotate_coords(coord_quda, coord_openQCD); /* xyzt -> txyz */ + + return ipt(coord_openQCD)*length; } __device__ __host__ inline void load(complex v[length/2], int x, int parity = 0) const diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index b53ae36ee1..12657973b8 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2341,10 +2341,10 @@ namespace quda { } /** - * @brief Gets the offset in Floats from the openQCD base pointer of - * the gauge fields. At this point, fields are reorder with a - * xyzt-lexicographical spacetime index, so nothing special to - * do here. + * @brief Gets the offset in Floats from the openQCD base pointer to + * the gauge fields. At this point, fields are already + * reordered with a xyzt-lexicographical spacetime index, so + * nothing special to do here. 
* * @param[in] x Checkerboard index coming from quda * @param[in] dir The direction coming from quda From b577392863e138c31dcd52b65fbd2abc215fc67f Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 25 Aug 2023 17:23:41 +0200 Subject: [PATCH 046/148] added functions gamma{0,2,3}() that don't work --- include/dslash_quda.h | 6 +++ include/quda_openqcd_interface.h | 10 +++++ lib/dslash_gamma_helper.cu | 5 +++ lib/openqcd_interface.cpp | 69 +++++++++++++++++++++++++++----- 4 files changed, 79 insertions(+), 11 deletions(-) diff --git a/include/dslash_quda.h b/include/dslash_quda.h index 02b13c5326..eb3438af34 100644 --- a/include/dslash_quda.h +++ b/include/dslash_quda.h @@ -792,4 +792,10 @@ namespace quda */ void gamma5(ColorSpinorField &out, const ColorSpinorField &in); + /* RG: I have written these */ + void gamma0(ColorSpinorField &out, const ColorSpinorField &in); + void gamma1(ColorSpinorField &out, const ColorSpinorField &in); + void gamma2(ColorSpinorField &out, const ColorSpinorField &in); + void gamma3(ColorSpinorField &out, const ColorSpinorField &in); + } // namespace quda diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 138a862b7a..a426324c94 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -101,8 +101,18 @@ typedef struct { */ void openQCD_qudaSetDslashOptions(double kappa, double mu); + +/** + * @brief Norm square on QUDA. + * + * @param[in] h_in Input field (from openQCD) + * + * @return The norm + */ double openQCD_qudaNorm(void *h_in); +void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out); + /** * @brief Apply the Wilson-Clover Dirac operator to a field. All fields * passed and returned are host (CPU) fields in openQCD order. 
diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu index 4b7ef2458c..6f93c6ab33 100644 --- a/lib/dslash_gamma_helper.cu +++ b/lib/dslash_gamma_helper.cu @@ -97,4 +97,9 @@ namespace quda { // Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma) void gamma5(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,4); } + /* RG: I have written these */ + void gamma0(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,0); } + void gamma1(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,1); } + void gamma2(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,2); } + void gamma3(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,3); } } diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 8b9931cb66..adb77903dc 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -485,10 +485,64 @@ double openQCD_qudaNorm(void *h_in) } -void openQCD_qudaDslash(void *src, void *dst) +void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out) { - void *buffer1, *buffer2; + lat_dim_t X; + + for (int i = 0; i<4; i++) { + X[i] = localDim[i]; + } + + // sets up the necessary parameters + QudaInvertParam sParam = newQudaInvertParam(); + sParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; + sParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + sParam.cpu_prec = QUDA_DOUBLE_PRECISION; + sParam.cuda_prec = QUDA_DOUBLE_PRECISION; + + // creates a field on the CPU + ColorSpinorParam cpuParam(openQCD_in, sParam, X, false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam); + + // creates a field on the GPU with the same parameter set as the CPU field + ColorSpinorParam cudaParam(cpuParam, sParam, QUDA_CUDA_FIELD_LOCATION); + ColorSpinorField in(cudaParam); + + // transfer the CPU field to GPU + in = in_h; + + // creates a zero-field on the GPU + cudaParam.create = QUDA_NULL_FIELD_CREATE; + cudaParam.location = 
QUDA_CUDA_FIELD_LOCATION; + ColorSpinorField out(cudaParam); + + // gamma5 runs within QUDA using QUDA fields + if (dir == 5 || dir == 4) { + gamma5(out, in); + } else if (dir == 0) { + gamma0(out, in); + } else if (dir == 1) { + gamma1(out, in); + } else if (dir == 2) { + gamma2(out, in); + } else if (dir == 3) { + gamma3(out, in); + } else { + errorQuda("Unknown gamma: %d\n", dir); + } + + // creates a field on the CPU + cpuParam.v = openQCD_out; + cpuParam.location = QUDA_CPU_FIELD_LOCATION; + ColorSpinorField out_h(cpuParam); + // transfer the GPU field back to CPU + out_h = out; +} + + +void openQCD_qudaDslash(void *src, void *dst) +{ if (!qudaState.gauge_loaded) { errorQuda("Gauge field not loaded into QUDA, cannot apply Dslash. Call openQCD_gaugeload() first."); return; @@ -499,15 +553,8 @@ void openQCD_qudaDslash(void *src, void *dst) return; } - buffer1 = malloc(input.volume*input.sizeof_spinor_dble); - buffer2 = malloc(input.volume*input.sizeof_spinor_dble); - - input.reorder_spinor_openqcd_to_quda(src, buffer1); - MatQuda(static_cast(buffer2), static_cast(buffer1), &invertParam); - input.reorder_spinor_quda_to_openqcd(buffer2, dst); - - free(buffer1); - free(buffer2); + MatQuda(static_cast(dst), static_cast(src), &invertParam); + /*dslashQuda(static_cast(dst), static_cast(src), &invertParam, QUDA_ODD_PARITY);*/ return; } From 26918e6910f06e59e7e2aa4dd86b0c6032403a85 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Thu, 31 Aug 2023 16:36:45 +0200 Subject: [PATCH 047/148] Add QUDA_OPENQCD_GAMMA_BASIS --- include/enum_quda.h | 3 +- include/gamma.cuh | 104 ++++++++++++++++++++++++-- include/kernels/copy_color_spinor.cuh | 62 +++++++++++++++ lib/copy_color_spinor.cuh | 6 ++ lib/openqcd_interface.cpp | 19 +++-- 5 files changed, 180 insertions(+), 14 deletions(-) diff --git a/include/enum_quda.h b/include/enum_quda.h index cc228446da..167e36bb57 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -372,7 +372,8 @@ typedef enum QudaFieldCreate_s { 
typedef enum QudaGammaBasis_s { QUDA_DEGRAND_ROSSI_GAMMA_BASIS, QUDA_UKQCD_GAMMA_BASIS, - QUDA_CHIRAL_GAMMA_BASIS, // check ? TODO: use this for quda ? + QUDA_CHIRAL_GAMMA_BASIS, + QUDA_OPENQCD_GAMMA_BASIS, QUDA_INVALID_GAMMA_BASIS = QUDA_INVALID_ENUM } QudaGammaBasis; diff --git a/include/gamma.cuh b/include/gamma.cuh index c108ee9725..ec0c7986c7 100644 --- a/include/gamma.cuh +++ b/include/gamma.cuh @@ -26,10 +26,11 @@ namespace quda { Gamma(const Gamma &g) = default; __device__ __host__ inline int getcol(int row) const { - if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) { + if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS || + basis == QUDA_OPENQCD_GAMMA_BASIS) { switch(dir) { - case 0: - case 1: + case 0: /* gamma1 */ + case 1: /* gamma2 */ switch(row) { case 0: return 3; case 1: return 2; @@ -37,8 +38,8 @@ namespace quda { case 3: return 0; } break; - case 2: - case 3: + case 2: /* gamma3 */ + case 3: /* gamma0 */ switch(row) { case 0: return 2; case 1: return 3; @@ -46,7 +47,7 @@ namespace quda { case 3: return 1; } break; - case 4: + case 4: /* gamma5 */ switch(row) { case 0: return 0; case 1: return 1; @@ -201,7 +202,61 @@ namespace quda { } break; } - } + } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) { + switch(dir) { + case 0: /* gamma1 */ + switch(row) { + case 0: + case 1: + return -I; + case 2: + case 3: + return I; + } + break; + case 1: /* gamma2 */ + switch(row) { + case 0: + case 3: + return -1; + case 1: + case 2: + return 1; + } + break; + case 2: /* gamma3 */ + switch(row) { + case 0: + case 3: + return -I; + case 1: + case 2: + return I; + } + break; + case 3: /* gamma0 */ + switch(row) { + case 0: + case 1: + case 2: + case 3: + return -1; + } + break; + case 4: /* gamma5 */ + switch(row) { + case 0: + case 1: + return 1; + case 2: + case 3: + return -1; + } + break; + } + } + + return 0; } @@ -283,7 +338,40 @@ namespace quda { } break; } - } + } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) { + switch(dir) { + case 0: /* gamma1 */ + switch(row) { + 
case 0: case 1: return complex(a.imag(), -a.real()); // -I + case 2: case 3: return complex(-a.imag(), a.real()); // I + } + break; + case 1: /* gamma2 */ + switch(row) { + case 0: case 3: return -a; + case 1: case 2: return a; + } + break; + case 2: /* gamma3 */ + switch(row) { + case 0: case 3: return complex(a.imag(), -a.real()); // -I + case 1: case 2: return complex(-a.imag(), a.real()); // I + } + break; + case 3: /* gamma0 */ + switch(row) { + case 0: case 1: case 2: case 3: return -a; + } + break; + case 4: /* gamma5 */ + switch(row) { + case 0: case 1: return a; + case 2: case 3: return -a; + } + break; + } + } + return a; } diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index ac35b468c9..99228ca080 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -51,10 +51,20 @@ namespace quda { __device__ __host__ inline void operator()(complex out[Ns*Nc], const complex in[Ns*Nc]) const { int s1[4] = {1, 2, 3, 0}; int s2[4] = {3, 0, 1, 2}; + /* K1 = [1, -1, -1, -1] / sqrt(2) */ FloatOut K1[4] = {static_cast(kP), static_cast(-kP), static_cast(-kP), static_cast(-kP)}; + /* K2 = [1, -1, 1, 1] / sqrt(2) */ FloatOut K2[4] = {static_cast(kP), static_cast(-kP), static_cast(kP), static_cast(kP)}; for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); } } @@ -89,6 +99,14 @@ namespace quda { FloatOut K2[4] = {static_cast(kP), static_cast(kP), static_cast(kP), static_cast(kP)}; for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); } } @@ -112,6 +130,50 @@ namespace quda { } }; + /** Transform from openqcd into non-relativistic basis */ + template + struct ReverseOpenqcdBasis { + template + __device__ __host__ inline void operator()(complex out[Ns*Nc], const complex in[Ns*Nc]) const { + int s1[4] = {0, 1, 0, 1}; + int s2[4] = {2, 3, 2, 3}; + FloatOut K1[4] = {static_cast(kP), static_cast(kP), static_cast(-kP), static_cast(-kP)}; + FloatOut K2[4] = 
{static_cast(kP), static_cast(kP), static_cast(kP), static_cast(kP)}; + /* U = [1 0 1 0] + [0 1 0 1] + [-1 0 1 0] + [0 -1 0 1] / sqrt(2) */ + for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); + } + } + } + }; + + /** Transform from non-relativistic into openqcd basis */ + template + struct OpenqcdBasis { + template + __device__ __host__ inline void operator()(complex out[Ns*Nc], const complex in[Ns*Nc]) const { + int s1[4] = {0, 1, 0, 1}; + int s2[4] = {2, 3, 2, 3}; + FloatOut K1[4] = {static_cast(kU), static_cast(kU), static_cast(kU), static_cast(kU)}; + FloatOut K2[4] = {static_cast(-kU), static_cast(-kU), static_cast(kU), static_cast(kU)}; + /* U = [1 0 -1 0] + [0 1 0 -1] + [1 0 1 0] + [0 1 0 1] / sqrt(2) */ + for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); + } + } + } + }; + + + template struct CopyColorSpinor_ { const Arg &arg; constexpr CopyColorSpinor_(const Arg &arg): arg(arg) {} diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index 36bda6af08..b7c18766a0 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -32,6 +32,8 @@ namespace quda { else if (out.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",RelBasis"); else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS) strcat(aux, ",ChiralToNonRelBasis"); else if (out.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",NonRelToChiralBasis"); + else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS) strcat(aux, ",ReverseOpenqcdBasis"); + else if (out.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",OpenqcdBasis"); else errorQuda("Basis change from %d to %d not supported", in.GammaBasis(), out.GammaBasis()); apply(device::get_default_stream()); @@ -60,6 +62,10 @@ namespace quda { 
launch(tp, stream, Arg(out, in, Out_, In_)); } else if (out.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) { launch(tp, stream, Arg(out, in, Out_, In_)); + } else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS) { + launch(tp, stream, Arg(out, in, Out_, In_)); + } else if (out.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) { + launch(tp, stream, Arg(out, in, Out_, In_)); } else { errorQuda("Unexpected basis change from %d to %d", in.GammaBasis(), out.GammaBasis()); } diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index adb77903dc..3e3caea2c5 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -279,7 +279,7 @@ static void setColorSpinorParams(const int dim[4], QudaPrecision precision, Colo param->siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME: check how to adapt this for openqxd param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd // EVEN-ODD is only about inner ordering in quda param->fieldOrder = QUDA_OPENQCD_FIELD_ORDER; // FIXME: - param->gammaBasis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // meaningless, but required by the code. // // FIXME:: + param->gammaBasis = QUDA_OPENQCD_GAMMA_BASIS; // meaningless, but required by the code. // // FIXME:: param->create = QUDA_ZERO_FIELD_CREATE; // // FIXME:: check how to adapt this for openqxd ?? 
created -0 in weird places } @@ -449,7 +449,7 @@ void openQCD_qudaSetDslashOptions(double kappa, double mu) invertParam.cuda_prec = QUDA_DOUBLE_PRECISION; /**< The precision used by the QUDA solver */ invertParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; /**< The order of the input and output fermion fields */ - invertParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; /**< Gamma basis of the input and output host fields */ + invertParam.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; /**< Gamma basis of the input and output host fields */ invertParam.verbosity = verbosity; /**< The verbosity setting to use in the solver */ invertParam.compute_action = 0; @@ -471,7 +471,7 @@ double openQCD_qudaNorm(void *h_in) QudaInvertParam sParam = newQudaInvertParam(); sParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - sParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + sParam.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; sParam.cpu_prec = QUDA_DOUBLE_PRECISION; sParam.cuda_prec = QUDA_DOUBLE_PRECISION; @@ -485,6 +485,15 @@ double openQCD_qudaNorm(void *h_in) } +/** + * @brief Applies Dirac matrix to spinor. 
+ * + * openQCD_out = gamma[dir] * openQCD_in + * + * @param dir Dirac index, 0 <= dir <= 5 + * @param openQCD_in of type spinor_dble[NSPIN] + * @param openQCD_out of type spinor_dble[NSPIN] + */ void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out) { lat_dim_t X; @@ -496,7 +505,7 @@ void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out) // sets up the necessary parameters QudaInvertParam sParam = newQudaInvertParam(); sParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - sParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + sParam.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; sParam.cpu_prec = QUDA_DOUBLE_PRECISION; sParam.cuda_prec = QUDA_DOUBLE_PRECISION; @@ -733,7 +742,7 @@ void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_ invertParam.cpu_prec = host_precision; invertParam.cuda_prec = device_precision; invertParam.cuda_prec_sloppy = device_precision_sloppy; - invertParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + invertParam.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; invertParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; invertParam.clover_cpu_prec = host_precision; invertParam.clover_cuda_prec = device_precision; From d3fdc43eeb13dccc5a31e64dbd8a3a63852518cb Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 11 Sep 2023 12:11:47 +0200 Subject: [PATCH 048/148] fixed openQCD_qudaGamma --- include/kernels/copy_color_spinor.cuh | 28 ++++++++++++++------ lib/openqcd_interface.cpp | 38 ++++++++++++++++----------- 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index 99228ca080..c2c825a9cb 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -137,8 +137,14 @@ namespace quda { __device__ __host__ inline void operator()(complex out[Ns*Nc], const complex in[Ns*Nc]) const { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; - FloatOut K1[4] = {static_cast(kP), static_cast(kP), 
static_cast(-kP), static_cast(-kP)}; - FloatOut K2[4] = {static_cast(kP), static_cast(kP), static_cast(kP), static_cast(kP)}; + FloatOut K1[4] = {static_cast(kP), + static_cast(kP), + static_cast(-kP), + static_cast(-kP)}; + FloatOut K2[4] = {static_cast(kP), + static_cast(kP), + static_cast(kP), + static_cast(kP)}; /* U = [1 0 1 0] [0 1 0 1] [-1 0 1 0] @@ -158,12 +164,18 @@ namespace quda { __device__ __host__ inline void operator()(complex out[Ns*Nc], const complex in[Ns*Nc]) const { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; - FloatOut K1[4] = {static_cast(kU), static_cast(kU), static_cast(kU), static_cast(kU)}; - FloatOut K2[4] = {static_cast(-kU), static_cast(-kU), static_cast(kU), static_cast(kU)}; - /* U = [1 0 -1 0] - [0 1 0 -1] - [1 0 1 0] - [0 1 0 1] / sqrt(2) */ + FloatOut K1[4] = {static_cast(-kU), + static_cast(-kU), + static_cast(-kU), + static_cast(-kU)}; + FloatOut K2[4] = {static_cast(kU), + static_cast(kU), + static_cast(-kU), + static_cast(-kU)}; + /* U = [-1 0 1 0] + [ 0 -1 0 1] + [-1 0 -1 0] + [ 0 -1 0 -1] / sqrt(2) */ for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 3e3caea2c5..89b6e84afa 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -461,6 +461,13 @@ void openQCD_qudaSetDslashOptions(double kappa, double mu) } +/** + * @brief Calculates the norm of a spinor. + * + * @param[in] h_in input spinor of type spinor_dble[NSPIN] + * + * @return norm + */ double openQCD_qudaNorm(void *h_in) { lat_dim_t X; @@ -486,15 +493,16 @@ double openQCD_qudaNorm(void *h_in) /** - * @brief Applies Dirac matrix to spinor. - * - * openQCD_out = gamma[dir] * openQCD_in - * - * @param dir Dirac index, 0 <= dir <= 5 - * @param openQCD_in of type spinor_dble[NSPIN] - * @param openQCD_out of type spinor_dble[NSPIN] + * @brief Applies Dirac matrix to spinor. 
+ * + * openQCD_out = gamma[dir] * openQCD_in + * + * @param[in] dir Dirac index, 0 <= dir <= 5, notice that dir is in + * openQCD convention, ie. (0: t, 1: x, 2: y, 3: z, 4: 5, 5: 5) + * @param[in] openQCD_in of type spinor_dble[NSPIN] + * @param[out] openQCD_out of type spinor_dble[NSPIN] */ -void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out) +void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) { lat_dim_t X; @@ -528,14 +536,14 @@ void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out) // gamma5 runs within QUDA using QUDA fields if (dir == 5 || dir == 4) { gamma5(out, in); - } else if (dir == 0) { + } else if (dir == 0) { // t direction + gamma3(out, in); + } else if (dir == 1) { // x direction gamma0(out, in); - } else if (dir == 1) { + } else if (dir == 2) { // y direction gamma1(out, in); - } else if (dir == 2) { + } else if (dir == 3) { // z direction gamma2(out, in); - } else if (dir == 3) { - gamma3(out, in); } else { errorQuda("Unknown gamma: %d\n", dir); } @@ -713,7 +721,7 @@ void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, // void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_args, int external_precision, // int quda_precision, double kappa, double reliable_delta); -void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_args, int external_precision, +/*void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_args, int external_precision, int quda_precision, double kappa, double reliable_delta) { @@ -751,6 +759,6 @@ void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_ invertParam.clover_order = QUDA_PACKED_CLOVER_ORDER; invertParam.compute_action = 0; -} +}*/ // TODO: OpenQCDMultigridPack functions a la MILC (cf. 
milc_interface.cpp) From fe01653a52b00d1a02fa315cc6ebf7a334aede52 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 11 Sep 2023 12:12:10 +0200 Subject: [PATCH 049/148] removed implicit fallthrough in switch case of gammai() --- include/kernels/dslash_gamma_helper.cuh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/include/kernels/dslash_gamma_helper.cuh b/include/kernels/dslash_gamma_helper.cuh index 3b5e27492a..83e89f0ef5 100644 --- a/include/kernels/dslash_gamma_helper.cuh +++ b/include/kernels/dslash_gamma_helper.cuh @@ -77,12 +77,14 @@ namespace quda { __device__ __host__ void operator()(int x_cb, int parity) { ColorSpinor in = arg.in(x_cb, parity); + + /* RG: I had to add the break, else there is an implicit fallthrough */ switch(arg.d) { - case 0: arg.out(x_cb, parity) = in.gamma(0); - case 1: arg.out(x_cb, parity) = in.gamma(1); - case 2: arg.out(x_cb, parity) = in.gamma(2); - case 3: arg.out(x_cb, parity) = in.gamma(3); - case 4: arg.out(x_cb, parity) = in.gamma(4); + case 0: arg.out(x_cb, parity) = in.gamma(0); break; + case 1: arg.out(x_cb, parity) = in.gamma(1); break; + case 2: arg.out(x_cb, parity) = in.gamma(2); break; + case 3: arg.out(x_cb, parity) = in.gamma(3); break; + case 4: arg.out(x_cb, parity) = in.gamma(4); break; } } }; From 34d388d5096a4a796b5219b1ec5faccaa1934871 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 11 Sep 2023 12:22:01 +0200 Subject: [PATCH 050/148] global minus for openqcd gamma matrix transformation --- include/kernels/copy_color_spinor.cuh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index c2c825a9cb..88b91bfc0a 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -164,6 +164,8 @@ namespace quda { __device__ __host__ inline void operator()(complex out[Ns*Nc], const complex in[Ns*Nc]) const { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; + + /* RG: 
I added a global minus here to fix the global minus imposed by the transformation */ FloatOut K1[4] = {static_cast(-kU), static_cast(-kU), static_cast(-kU), From b6157c53e456a881a00e8a784a76bb3d4671a84c Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 12 Sep 2023 12:47:46 +0200 Subject: [PATCH 051/148] dirac operator seems to work now --- lib/openqcd_interface.cpp | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 89b6e84afa..2a79dc53fe 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -203,13 +203,22 @@ static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) gParam.type = QUDA_SU3_LINKS; gParam.reconstruct_sloppy = gParam.reconstruct = QUDA_RECONSTRUCT_NO; + + /** + * This instantiates the object OpenQCDOrder + */ gParam.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; + + /** + * Seems to have no effect ... + */ gParam.t_boundary = QUDA_PERIODIC_T; + gParam.gauge_fix = QUDA_GAUGE_FIXED_NO; gParam.scale = 1.0; gParam.anisotropy = 1.0; - gParam.tadpole_coeff = 1.0; - gParam.scale = 0; + //gParam.tadpole_coeff = 1.0; + //gParam.scale = 0; gParam.ga_pad = getLinkPadding(dim); /* Why this? */ return gParam; @@ -401,6 +410,8 @@ void openQCD_gaugeload(void *gauge) { QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, QUDA_DOUBLE_PRECISION); + printQudaGaugeParam(&qudaGaugeParam); + void* buffer = malloc(4*input.volume*input.sizeof_su3_dble); input.reorder_gauge_openqcd_to_quda(gauge, buffer); input.gauge = gauge; @@ -437,7 +448,7 @@ void openQCD_qudaSetDslashOptions(double kappa, double mu) invertParam.inv_type = QUDA_CG_INVERTER; /* just set some */ invertParam.kappa = kappa; invertParam.dagger = QUDA_DAG_NO; - invertParam.mass_normalization = QUDA_KAPPA_NORMALIZATION; + invertParam.mass_normalization = QUDA_MASS_NORMALIZATION; /* what is the difference? 
only works with QUDA_MASS_NORMALIZATION */ invertParam.Ls = 1; /**< Extent of the 5th dimension (for domain wall) */ invertParam.mu = mu; /**< Twisted mass parameter */ /*invertParam.tm_rho = ?;*/ /**< Hasenbusch mass shift applied like twisted mass to diagonal (but not inverse) */ From fdd1b62a232d8979fbf64fe86b67341a3d30ac2e Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 13 Sep 2023 15:34:24 +0200 Subject: [PATCH 052/148] clover term works --- include/color_spinor_field_order.h | 5 +- include/quda_openqcd_interface.h | 91 ++-- lib/copy_color_spinor.cuh | 1 - lib/openqcd_interface.cpp | 680 ++++++++--------------------- 4 files changed, 241 insertions(+), 536 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index e6100915a0..48e0e6b3f6 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1769,7 +1769,7 @@ namespace quda { int xb[4], xn[4], ib, in, is, cbs[4], mu, L_[4]; - rotate_coords(L, L_); + rotate_coords(L, L_); // L_ local lattice dimensions in openQCD format (txyz) /* cache_block */ for (mu=1;mu<4;mu++) { @@ -1951,9 +1951,6 @@ namespace quda template struct colorspinor_order_mapper { typedef colorspinor::SpaceSpinorColorOrder type; }; - // template struct colorspinor_order_mapper { - // typedef colorspinor::OpenQCDDiracOrder type; - // }; // TODO: ? template struct colorspinor_order_mapper { typedef colorspinor::FloatNOrder type; }; diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index a426324c94..4cec2fd675 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -17,13 +17,16 @@ extern "C" { #endif /** - * Parameters related to problem size and machine topology. + * Parameters related to problem size and machine topology. They should hold the + * numbers in quda format, i.e. xyzt convention. For example L[0] = L1, L[1] = + * L2, ... 
*/ typedef struct { - const int *latsize; /** Local lattice dimensions L0, L1, L2, L3 */ - const int *machsize; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ - const int *blksize; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ - int device; /** GPU device number */ + int L[4]; /** Local lattice dimensions L0, L1, L2, L3 */ + int nproc[4]; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ + int nproc_blk[4]; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ + int N[4]; /** Global lattice dimensions N0, N1, N2, N3 */ + int device; /** GPU device number */ } openQCD_QudaLayout_t; /** @@ -46,7 +49,8 @@ typedef struct { typedef struct { int initialized; /** Whether openQCD_qudaInit() was called or not */ - int gauge_loaded; /** Whether openQCD_gaugeload() was called or not */ + int gauge_loaded; /** Whether openQCD_qudaGaugeLoad() was called or not */ + int clover_loaded; /** Whether openQCD_qudaCloverLoad() was called or not */ int dslash_setup; /** Whether openQCD_qudaSetDslashOptions() was called or not */ } openQCD_QudaState_t; @@ -94,12 +98,12 @@ typedef struct { /** - * @brief Setup Dslash + * @brief Setup Dirac operator * - * @param[in] kappa kappa - * @param[in] mu twisted mass + * @param[in] kappa kappa + * @param[in] mu twisted mass */ -void openQCD_qudaSetDslashOptions(double kappa, double mu); +void openQCD_qudaSetDwOptions(double kappa, double mu); /** @@ -117,34 +121,12 @@ void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out); * @brief Apply the Wilson-Clover Dirac operator to a field. All fields * passed and returned are host (CPU) fields in openQCD order. 
* - * @param[in] src Source spinor field - * @param[out] dst Destination spinor field + * @param[in] src Source spinor field + * @param[out] dst Destination spinor field + * @param[in] dagger Whether we are using the Hermitian conjugate system or + * not (QUDA_DAG_NO or QUDA_DAG_YES) */ -void openQCD_qudaDslash(void *src, void *dst); - - -/** - * @brief Set metadata, options for Dslash. - * - * @param[in] external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) - * @param[in] quda_precision Precision for QUDA to use (2 - double, 1 - single) - * @param[in] inv_args Struct containing arguments, metadata - */ -/*void openQCD_qudaSetDslashOptions(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args);*/ - -/** - * ALL the following except the Dirac operator application - * Apply the improved staggered operator to a field. All fields - * passed and returned are host (CPU) field in MILC order. - * - * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) - * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) - * @param inv_args Struct setting some solver metadata - * @param source Right-hand side source field - * @param solution Solution spinor field - */ -void openQCD_colorspinorloadsave(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, - void *dst, void *gauge); +void openQCD_qudaDw(void *src, void *dst, QudaDagType dagger); /** @@ -184,25 +166,48 @@ double openQCD_qudaPlaquette(void); /** - * @brief Load the gauge fields from host to quda + * @brief Load the gauge fields from host to quda. * * @param[in] gauge The gauge fields (in openqcd order) */ -void openQCD_gaugeload(void *gauge); +void openQCD_qudaGaugeLoad(void *gauge); /** - * @brief Save the gauge fields from quda to host + * @brief Save the gauge fields from quda to host. 
* * @param[out] gauge The gauge fields (will be stored in openqcd order) */ -void openQCD_gaugesave(void *gauge); +void openQCD_qudaGaugeSave(void *gauge); /** - Free the gauge field allocated in QUDA. + * @brief Free the gauge field allocated in quda. */ -void openQCD_qudaFreeGaugeField(void); +void openQCD_qudaGaugeFree(void); + + +/** + * @brief Load the clover fields from host to quda. + * + * @param[in] clover The clover fields (in openqcd order) + */ +void openQCD_qudaCloverLoad(void *clover); + + +/** + * @brief Calculates the clover field and its inverse + * + * @param[in] su3csw The csw coefficient + */ +void openQCD_qudaCloverCreate(double su3csw); + + +/** + * @brief Free the clover field allocated in quda. + */ +void openQCD_qudaCloverFree(void); + #ifdef __cplusplus } diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index b7c18766a0..20833e911e 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -174,7 +174,6 @@ namespace quda { #ifdef BUILD_OPENQCD_INTERFACE using ColorSpinor = OpenQCDDiracOrder; // TODO: Seems OK - //using ColorSpinor = SpaceSpinorColorOrder; // FIXME: This is a test genericCopyColorSpinor(param); #else errorQuda("OpenQCD interface has not been built\n"); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 2a79dc53fe..5bc89d4f45 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -47,41 +47,32 @@ static const int num_colors = sizeof(colors) / sizeof(uint32_t); #endif static openQCD_QudaInitArgs_t input; -static QudaInvertParam invertParam = newQudaInvertParam(); -static openQCD_QudaState_t qudaState = {false, false, false}; -static int commsGridDim[4]; -static int localDim[4]; +static QudaInvertParam invertParam; +static openQCD_QudaState_t qudaState = {false, false, false, false}; using namespace quda; -// #define QUDAMILC_VERBOSE 1 -// template void inline qudamilc_called(const char *func, QudaVerbosity verb) -// { -// // add NVTX markup if enabled 
-// if (start) { -// PUSH_RANGE(func, 1); -// } else { -// POP_RANGE; -// } - -// #ifdef QUDAMILC_VERBOSE -// if (verb >= QUDA_VERBOSE) { -// if (start) { -// printfQuda("QUDA_MILC_INTERFACE: %s (called) \n", func); -// } else { -// printfQuda("QUDA_MILC_INTERFACE: %s (return) \n", func); -// } -// } -// #endif -// } - -// template void inline qudamilc_called(const char *func) { qudamilc_called(func, getVerbosity()); } - -/******************************************* - * - * LAYOUT AND INIT - * - *******************************************/ + +/** + * @brief Returns the local lattice dimensions as lat_dim_t + * + * @return The local dimensions. + */ +static lat_dim_t get_local_dims(int *fill = nullptr) +{ + lat_dim_t X; + + for (int i=0; i<4; i++) { + if (fill) { + fill[i] = input.layout.L[i]; + } else { + X[i] = input.layout.L[i]; + } + } + + return X; +} + /** * @brief Calculate the rank from coordinates. @@ -113,39 +104,21 @@ static int rankFromCoords(const int *coords, void *fdata) // TODO: */ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) { - int local_dim[4]; for (int dir = 0; dir < 4; ++dir) { - local_dim[dir] = layout.latsize[dir]; - } - -#ifdef MULTI_GPU - for (int dir = 0; dir < 4; ++dir) { - local_dim[dir] /= layout.machsize[dir]; - } -#endif - for (int dir = 0; dir < 4; ++dir) { - if (local_dim[dir] % 2 != 0) { - printfQuda("Error: Odd lattice dimensions are not supported\n"); + if (layout.N[dir] % 2 != 0) { + errorQuda("Error: Odd lattice dimensions are not supported\n"); exit(1); } } - // TODO: do we need to track this here - for (int dir = 0; dir < 4; ++dir) { - localDim[dir] = local_dim[dir]; - } #ifdef MULTI_GPU - for (int dir = 0; dir < 4; ++dir) { - commsGridDim[dir] = layout.machsize[dir]; - } // TODO: would we ever want to run with QMP COMMS? 
#ifdef QMP_COMMS - initCommsGridQuda(4, commsGridDim, nullptr, nullptr); + initCommsGridQuda(4, layout.nproc, nullptr, nullptr); #else - initCommsGridQuda(4, commsGridDim, rankFromCoords, (void *)(commsGridDim)); + initCommsGridQuda(4, layout.nproc, rankFromCoords, (void *)(layout.nproc)); #endif - - static int device = -1; + static int device = -1; // enable a default allocation of devices to processes #else static int device = layout.device; #endif @@ -154,21 +127,6 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) } -void openQCD_qudaInit(openQCD_QudaInitArgs_t in) -{ - if (qudaState.initialized) return; - input = in; - - setVerbosityQuda(input.verbosity, "QUDA: ", input.logfile); - openQCD_qudaSetLayout(input.layout); - qudaState.initialized = true; -} - -void openQCD_qudaFinalize() { - endQuda(); -} - - static int getLinkPadding(const int dim[4]) { int padding = MAX(dim[1] * dim[2] * dim[3] / 2, dim[0] * dim[2] * dim[3] / 2); @@ -177,203 +135,116 @@ static int getLinkPadding(const int dim[4]) return padding; } -/******************************************* - * - * SETTINGS AND PARAMETERS - * - *******************************************/ /** - * @brief Initialize gauge param struct + * @brief Initialize invert param struct * - * @param[in] dim dimensions - * @param[in] prec precision - * - * @return The quda gauge parameter struct. + * @return The quda invert parameter struct. 
*/ -static QudaGaugeParam newOpenQCDGaugeParam(const int *dim, QudaPrecision prec) +static QudaInvertParam newOpenQCDParam(void) { - QudaGaugeParam gParam = newQudaGaugeParam(); - - for (int dir = 0; dir < 4; ++dir) { - gParam.X[dir] = dim[dir]; - } + static const QudaVerbosity verbosity = getVerbosity(); - gParam.cuda_prec_sloppy = gParam.cpu_prec = gParam.cuda_prec = prec; - gParam.type = QUDA_SU3_LINKS; + QudaInvertParam param = newQudaInvertParam(); - gParam.reconstruct_sloppy = gParam.reconstruct = QUDA_RECONSTRUCT_NO; + param.verbosity = verbosity; - /** - * This instantiates the object OpenQCDOrder - */ - gParam.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; + param.cpu_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the input fermion fields */ + param.cuda_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA solver */ /** - * Seems to have no effect ... + * The order of the input and output fermion fields. Imposes fieldOrder = + * QUDA_OPENQCD_FIELD_ORDER in color_spinor_field.h and + * QUDA_OPENQCD_FIELD_ORDER makes quda to instantiate OpenQCDDiracOrder. */ - gParam.t_boundary = QUDA_PERIODIC_T; + param.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - gParam.gauge_fix = QUDA_GAUGE_FIXED_NO; - gParam.scale = 1.0; - gParam.anisotropy = 1.0; - //gParam.tadpole_coeff = 1.0; - //gParam.scale = 0; - gParam.ga_pad = getLinkPadding(dim); /* Why this? */ + /* Gamma basis of the input and output host fields */ + param.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; - return gParam; + return param; } -/*void setGaugeParams(QudaGaugeParam &qudaGaugeParam, const int dim[4], openQCD_QudaInvertArgs_t &inv_args, - int external_precision, int quda_precision) +/** + * @brief Initialize gauge param struct + * + * @param[in] prec precision + * + * @return The quda gauge parameter struct. + */ +static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec) { + QudaGaugeParam param = newQudaGaugeParam(); - const QudaPrecision host_precision = (external_precision == 2) ? 
QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - const QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision_sloppy; - - switch (inv_args.mixed_precision) { - case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break; - case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break; - default: device_precision_sloppy = device_precision; - } - - for (int dir = 0; dir < 4; ++dir) qudaGaugeParam.X[dir] = dim[dir]; + get_local_dims(param.X); + param.cuda_prec_sloppy = param.cpu_prec = param.cuda_prec = prec; + param.type = QUDA_SU3_LINKS; - qudaGaugeParam.anisotropy = 1.0; - qudaGaugeParam.type = QUDA_WILSON_LINKS; - qudaGaugeParam.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; + param.reconstruct_sloppy = param.reconstruct = QUDA_RECONSTRUCT_NO; - // Check the boundary conditions - // Can't have twisted or anti-periodic boundary conditions in the spatial - // directions with 12 reconstruct at the moment. - bool trivial_phase = true; - for (int dir = 0; dir < 3; ++dir) { - if (inv_args.boundary_phase[dir] != 0) trivial_phase = false; - } - if (inv_args.boundary_phase[3] != 0 && inv_args.boundary_phase[3] != 1) trivial_phase = false; - - if (trivial_phase) { - qudaGaugeParam.t_boundary = (inv_args.boundary_phase[3]) ? 
QUDA_ANTI_PERIODIC_T : QUDA_PERIODIC_T; - qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_12; - qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_12; - } else { - qudaGaugeParam.t_boundary = QUDA_PERIODIC_T; - qudaGaugeParam.reconstruct = QUDA_RECONSTRUCT_NO; - qudaGaugeParam.reconstruct_sloppy = QUDA_RECONSTRUCT_NO; - } + /** + * This make quda to instantiate OpenQCDOrder + */ + param.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; - qudaGaugeParam.cpu_prec = host_precision; - qudaGaugeParam.cuda_prec = device_precision; - qudaGaugeParam.cuda_prec_sloppy = device_precision_sloppy; - qudaGaugeParam.cuda_prec_precondition = device_precision_sloppy; - qudaGaugeParam.gauge_fix = QUDA_GAUGE_FIXED_NO; - // qudaGaugeParam.ga_pad = getLinkPadding(dim); -}*/ + /** + * Seems to have no effect ... + */ + param.t_boundary = QUDA_PERIODIC_T; + param.gauge_fix = QUDA_GAUGE_FIXED_NO; + param.scale = 1.0; + param.anisotropy = 1.0; // 1.0 means not anisotropic + //param.tadpole_coeff = 1.0; + //param.scale = 0; + param.ga_pad = getLinkPadding(param.X); /* Why this? */ -/* PARAMS FOR SPINOR FIELDS */ -static void setColorSpinorParams(const int dim[4], QudaPrecision precision, ColorSpinorParam *param) -{ - param->nColor = 3; - param->nSpin = 4; // =1 for staggered, =2 for coarse Dslash, =4 for 4d spinor - param->nDim = 4; // TODO: check how to adapt this for openqxd - - for (int dir = 0; dir < 4; ++dir) param->x[dir] = dim[dir]; - // param->x[0] /= 2; // for staggered sites only FIXME:? - - param->setPrecision(precision); - param->pad = 0; - param->siteSubset = QUDA_FULL_SITE_SUBSET; // FIXME: check how to adapt this for openqxd - param->siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: check how to adapt this for openqxd // EVEN-ODD is only about inner ordering in quda - param->fieldOrder = QUDA_OPENQCD_FIELD_ORDER; // FIXME: - param->gammaBasis = QUDA_OPENQCD_GAMMA_BASIS; // meaningless, but required by the code. 
// // FIXME:: - param->create = QUDA_ZERO_FIELD_CREATE; // // FIXME:: check how to adapt this for openqxd ?? created -0 in weird places + return param; } -/* PARAMS FOR DSLASH AND INVERSION */ -/*static void setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, - double mass, double target_residual, double target_residual_hq, int maxiter, - double reliable_delta, QudaParity parity, QudaVerbosity verbosity, - QudaInverterType inverter) +/** + * @brief Initialize clover param struct + * + * @param[in] kappa hopping parameter + * @param[in] su3csw The su 3 csw + * + * @return The quda gauge parameter struct. + */ +static QudaInvertParam newOpenQCDCloverParam(double kappa, double su3csw) { - invertParam.verbosity = verbosity; - invertParam.mass = mass; - invertParam.tol = target_residual; - invertParam.tol_hq = target_residual_hq; - - invertParam.residual_type = static_cast(0); - invertParam.residual_type = (target_residual != 0) ? - static_cast(invertParam.residual_type | QUDA_L2_RELATIVE_RESIDUAL) : - invertParam.residual_type; - invertParam.residual_type = (target_residual_hq != 0) ? - static_cast(invertParam.residual_type | QUDA_HEAVY_QUARK_RESIDUAL) : - invertParam.residual_type; - - invertParam.heavy_quark_check = (invertParam.residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? 1 : 0); - if (invertParam.heavy_quark_check) { - invertParam.max_hq_res_increase = 5; // this caps the number of consecutive hq residual increases - invertParam.max_hq_res_restart_total = 10; // this caps the number of hq restarts in case of solver stalling - } - - invertParam.use_sloppy_partial_accumulator = 0; - invertParam.num_offset = 0; - - invertParam.inv_type = inverter; - invertParam.maxiter = maxiter; - invertParam.reliable_delta = reliable_delta; - - invertParam.mass_normalization = QUDA_MASS_NORMALIZATION; - invertParam.cpu_prec = cpu_prec; - invertParam.cuda_prec = cuda_prec; - invertParam.cuda_prec_sloppy = invertParam.heavy_quark_check ? 
cuda_prec : cuda_prec_sloppy; - invertParam.cuda_prec_precondition = cuda_prec_sloppy; - - invertParam.gcrNkrylov = 10; - - invertParam.solution_type = QUDA_MATPC_SOLUTION; - invertParam.solve_type = QUDA_DIRECT_PC_SOLVE; - invertParam.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; // not used, but required by the code. - invertParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - - invertParam.dslash_type = QUDA_WILSON_DSLASH; // FIXME: OR THIS; QUDA_ASQTAD_DSLASH; - invertParam.Ls = 1; - invertParam.gflops = 0.0; - - invertParam.input_location = QUDA_CPU_FIELD_LOCATION; - invertParam.output_location = QUDA_CPU_FIELD_LOCATION; - - if (parity == QUDA_EVEN_PARITY) { // even parity - invertParam.matpc_type = QUDA_MATPC_EVEN_EVEN; - } else if (parity == QUDA_ODD_PARITY) { - invertParam.matpc_type = QUDA_MATPC_ODD_ODD; - } else { - errorQuda("Invalid parity\n"); - } + QudaInvertParam param = newOpenQCDParam(); - invertParam.dagger = QUDA_DAG_NO; - invertParam.use_init_guess = QUDA_USE_INIT_GUESS_YES; + param.clover_location = QUDA_CPU_FIELD_LOCATION; + param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; + param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param.clover_order = QUDA_FLOAT8_CLOVER_ORDER; /*QUDA_OPENQCD_CLOVER_ORDER; */ - // for the preconditioner - invertParam.inv_type_precondition = QUDA_CG_INVERTER; - invertParam.tol_precondition = 1e-1; - invertParam.maxiter_precondition = 2; - invertParam.verbosity_precondition = QUDA_SILENT; + param.compute_clover = true; + param.kappa = kappa; + param.clover_csw = su3csw; + param.clover_coeff = 0.0; + param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; - invertParam.compute_action = 0; + return param; } -*/ +void openQCD_qudaInit(openQCD_QudaInitArgs_t in) +{ + if (qudaState.initialized) return; + input = in; + setVerbosityQuda(input.verbosity, "QUDA: ", input.logfile); + openQCD_qudaSetLayout(input.layout); + qudaState.initialized = true; +} -/******************************************* - * - * FUNCTIONS - * - 
*******************************************/ +void openQCD_qudaFinalize() { + qudaState.initialized = false; + endQuda(); +} double openQCD_qudaPlaquette(void) @@ -381,7 +252,7 @@ double openQCD_qudaPlaquette(void) double plaq[3]; if (!qudaState.gauge_loaded) { - errorQuda("Gauge field not loaded into QUDA, cannot calculate plaquette. Call openQCD_gaugeload() first."); + errorQuda("Gauge field not loaded into QUDA, cannot calculate plaquette. Call openQCD_qudaGaugeLoad() first."); return 0.0; } @@ -406,25 +277,23 @@ double openQCD_qudaPlaquette(void) } -void openQCD_gaugeload(void *gauge) +void openQCD_qudaGaugeLoad(void *gauge) { - QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, QUDA_DOUBLE_PRECISION); - - printQudaGaugeParam(&qudaGaugeParam); + QudaGaugeParam param = newOpenQCDGaugeParam(QUDA_DOUBLE_PRECISION); void* buffer = malloc(4*input.volume*input.sizeof_su3_dble); input.reorder_gauge_openqcd_to_quda(gauge, buffer); input.gauge = gauge; - loadGaugeQuda(buffer, &qudaGaugeParam); + loadGaugeQuda(buffer, ¶m); free(buffer); qudaState.gauge_loaded = true; } -void openQCD_gaugesave(void *gauge) +void openQCD_qudaGaugeSave(void *gauge) { - QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(localDim, QUDA_DOUBLE_PRECISION); + QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(QUDA_DOUBLE_PRECISION); void* buffer = malloc(4*input.volume*input.sizeof_su3_dble); saveGaugeQuda(buffer, &qudaGaugeParam); @@ -432,41 +301,71 @@ void openQCD_gaugesave(void *gauge) free(buffer); } -void openQCD_qudaFreeGaugeField(void) +void openQCD_qudaGaugeFree(void) { freeGaugeQuda(); qudaState.gauge_loaded = false; } -void openQCD_qudaSetDslashOptions(double kappa, double mu) + +void openQCD_qudaCloverLoad(void *clover) { - static const QudaVerbosity verbosity = getVerbosity(); + /*QudaInvertParam qudaCloverParam = newOpenQCDCloverParam(); + loadCloverQuda(clover, NULL, &qudaCloverParam);*/ + errorQuda("openQCD_qudaCloverLoad() is not implemented yet."); + 
qudaState.clover_loaded = true; +} - invertParam.input_location = QUDA_CPU_FIELD_LOCATION; - invertParam.output_location = QUDA_CPU_FIELD_LOCATION; +void openQCD_qudaCloverCreate(double su3csw) +{ + if (!qudaState.dslash_setup) { + errorQuda("Need to call openQCD_qudaSetDwOptions() first!"); + } + + QudaInvertParam param = newOpenQCDCloverParam(invertParam.kappa, su3csw); + + /* Set to Wilson Dirac operator with Clover term */ + invertParam.dslash_type = QUDA_CLOVER_WILSON_DSLASH; + + /** + * Leaving both h_clover = h_clovinv = NULL allocates the clover field on the + * GPU and finally calls @createCloverQuda to calculate the clover field. + */ + loadCloverQuda(NULL, NULL, ¶m); + qudaState.clover_loaded = true; +} + +void openQCD_qudaCloverFree(void) +{ + freeCloverQuda(); + qudaState.clover_loaded = false; +} + + +void openQCD_qudaSetDwOptions(double kappa, double mu) +{ + if (mu != 0.0) { + errorQuda("twisted mass not implemented yet."); + } + + invertParam = newOpenQCDParam(); + + /* Set to Wilson Dirac operator */ invertParam.dslash_type = QUDA_WILSON_DSLASH; - invertParam.inv_type = QUDA_CG_INVERTER; /* just set some */ + + /* Hopping parameter */ invertParam.kappa = kappa; - invertParam.dagger = QUDA_DAG_NO; - invertParam.mass_normalization = QUDA_MASS_NORMALIZATION; /* what is the difference? 
only works with QUDA_MASS_NORMALIZATION */ - invertParam.Ls = 1; /**< Extent of the 5th dimension (for domain wall) */ - invertParam.mu = mu; /**< Twisted mass parameter */ - /*invertParam.tm_rho = ?;*/ /**< Hasenbusch mass shift applied like twisted mass to diagonal (but not inverse) */ - /*invertParam.epsilon = ?;*/ /**< Twisted mass parameter */ - /*invertParam.twist_flavor = ??;*/ /**< Twisted mass flavor */ - invertParam.laplace3D = -1; /**< omit this direction from laplace operator: x,y,z,t -> 0,1,2,3 (-1 is full 4D) */ - invertParam.cpu_prec = QUDA_DOUBLE_PRECISION; /**< The precision used by the input fermion fields */ - invertParam.cuda_prec = QUDA_DOUBLE_PRECISION; /**< The precision used by the QUDA solver */ + invertParam.inv_type = QUDA_CG_INVERTER; /* just set some, needed? */ - invertParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; /**< The order of the input and output fermion fields */ - invertParam.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; /**< Gamma basis of the input and output host fields */ + /* What is the difference? 
only works with QUDA_MASS_NORMALIZATION */ + invertParam.mass_normalization = QUDA_MASS_NORMALIZATION; - invertParam.verbosity = verbosity; /**< The verbosity setting to use in the solver */ - invertParam.compute_action = 0; + /* Extent of the 5th dimension (for domain wall) */ + invertParam.Ls = 1; - ColorSpinorParam csParam; - setColorSpinorParams(localDim, invertParam.cpu_prec, &csParam); + /* Twisted mass parameter */ + invertParam.mu = mu; qudaState.dslash_setup = true; } @@ -481,22 +380,14 @@ void openQCD_qudaSetDslashOptions(double kappa, double mu) */ double openQCD_qudaNorm(void *h_in) { - lat_dim_t X; + QudaInvertParam param = newOpenQCDParam(); - for (int i = 0; i<4; i++) { - X[i] = localDim[i]; - } - - QudaInvertParam sParam = newQudaInvertParam(); - sParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - sParam.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; - sParam.cpu_prec = QUDA_DOUBLE_PRECISION; - sParam.cuda_prec = QUDA_DOUBLE_PRECISION; - - ColorSpinorParam cpuParam(h_in, sParam, X, false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorParam cpuParam(h_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); ColorSpinorField in_h(cpuParam); - ColorSpinorParam cudaParam(cpuParam, sParam, QUDA_CUDA_FIELD_LOCATION); + + ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); ColorSpinorField in(cudaParam); + in = in_h; return blas::norm2(in); @@ -515,25 +406,15 @@ double openQCD_qudaNorm(void *h_in) */ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) { - lat_dim_t X; - - for (int i = 0; i<4; i++) { - X[i] = localDim[i]; - } - // sets up the necessary parameters - QudaInvertParam sParam = newQudaInvertParam(); - sParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - sParam.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; - sParam.cpu_prec = QUDA_DOUBLE_PRECISION; - sParam.cuda_prec = QUDA_DOUBLE_PRECISION; + QudaInvertParam param = newOpenQCDParam(); // creates a field on the CPU - ColorSpinorParam cpuParam(openQCD_in, sParam, X, false, 
QUDA_CPU_FIELD_LOCATION); + ColorSpinorParam cpuParam(openQCD_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); ColorSpinorField in_h(cpuParam); // creates a field on the GPU with the same parameter set as the CPU field - ColorSpinorParam cudaParam(cpuParam, sParam, QUDA_CUDA_FIELD_LOCATION); + ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); ColorSpinorField in(cudaParam); // transfer the CPU field to GPU @@ -544,18 +425,25 @@ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) cudaParam.location = QUDA_CUDA_FIELD_LOCATION; ColorSpinorField out(cudaParam); - // gamma5 runs within QUDA using QUDA fields - if (dir == 5 || dir == 4) { - gamma5(out, in); - } else if (dir == 0) { // t direction + // gamma_i run within QUDA using QUDA fields + switch (dir) { + case 0: // t direction gamma3(out, in); - } else if (dir == 1) { // x direction + break; + case 1: // x direction gamma0(out, in); - } else if (dir == 2) { // y direction + break; + case 2: // y direction gamma1(out, in); - } else if (dir == 3) { // z direction + break; + case 3: // z direction gamma2(out, in); - } else { + break; + case 4: + case 5: + gamma5(out, in); + break; + default: errorQuda("Unknown gamma: %d\n", dir); } @@ -569,10 +457,10 @@ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) } -void openQCD_qudaDslash(void *src, void *dst) +void openQCD_qudaDw(void *src, void *dst, QudaDagType dagger) { if (!qudaState.gauge_loaded) { - errorQuda("Gauge field not loaded into QUDA, cannot apply Dslash. Call openQCD_gaugeload() first."); + errorQuda("Gauge field not loaded into QUDA, cannot apply Dslash. 
Call openQCD_qudaGaugeLoad() first."); return; } @@ -581,195 +469,11 @@ void openQCD_qudaDslash(void *src, void *dst) return; } - MatQuda(static_cast(dst), static_cast(src), &invertParam); - /*dslashQuda(static_cast(dst), static_cast(src), &invertParam, QUDA_ODD_PARITY);*/ - - return; -} - - -void openQCD_colorspinorloadsave(int external_precision, int quda_precision, openQCD_QudaInvertArgs_t inv_args, void *src, - void *dst, void *gauge) -{ - //static const QudaVerbosity verbosity = getVerbosity(); - - QudaGaugeParam qudaGaugeParam - = newOpenQCDGaugeParam(localDim, (quda_precision == 1) ? QUDA_SINGLE_PRECISION : QUDA_DOUBLE_PRECISION); + invertParam.dagger = dagger; - loadGaugeQuda(gauge, &qudaGaugeParam); - - QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - //QudaPrecision device_precision = (quda_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - //QudaPrecision device_precision_sloppy = device_precision; - - QudaInvertParam invertParam = newQudaInvertParam(); - - QudaParity local_parity = inv_args.evenodd; - //QudaParity other_parity = local_parity == QUDA_EVEN_PARITY ? 
QUDA_ODD_PARITY : QUDA_EVEN_PARITY; - - /* For reference: - setInvertParams(QudaPrecision cpu_prec, QudaPrecision cuda_prec, QudaPrecision cuda_prec_sloppy, - double mass, double target_residual, double target_residual_hq, int maxiter, - double reliable_delta, QudaParity parity, QudaVerbosity verbosity, - QudaInverterType inverter, QudaInvertParam *invertParam) */ - /*setInvertParams(host_precision, device_precision, device_precision_sloppy, 0.0, 0, 0, 0, 0.0, local_parity, verbosity, - QUDA_CG_INVERTER, &invertParam);*/ - - ColorSpinorParam csParam; - setColorSpinorParams(localDim, host_precision, &csParam); - - dslashQudaTest(static_cast(dst), static_cast(src), &invertParam, local_parity); - - return; -} // openQCD_colorspinorloadsave - -#if 0 -void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, openQCD_QudaInvertArgs_t inv_args, - double target_residual, double target_fermilab_residual, const void *const fatlink, - const void *const longlink, void *source, void *solution, double *const final_residual, - double *const final_fermilab_residual, int *num_iters) -{ - static const QudaVerbosity verbosity = getVerbosity(); - - if (target_fermilab_residual == 0 && target_residual == 0) errorQuda("qudaInvert: requesting zero residual\n"); - - QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - - static bool force_double_queried = false; - static bool do_not_force_double = false; - if (!force_double_queried) { - char *donotusedouble_env = getenv("QUDA_MILC_OVERRIDE_DOUBLE_MULTISHIFT"); // disable forcing outer double precision - if (donotusedouble_env && (!(strcmp(donotusedouble_env, "0") == 0))) { - do_not_force_double = true; - printfQuda("Disabling always using double as fine precision for MILC multishift\n"); - } - force_double_queried = true; - } - - QudaPrecision device_precision = (quda_precision == 2) ? 
QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - - QudaPrecision device_precision_sloppy; - switch (inv_args.mixed_precision) { - case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break; - case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break; - default: device_precision_sloppy = device_precision; - } - - // override fine precision to double, switch to mixed as necessary - if (!do_not_force_double && device_precision == QUDA_SINGLE_PRECISION) { - // force outer double - device_precision = QUDA_DOUBLE_PRECISION; - } - - QudaGaugeParam fat_param = newQudaGaugeParam(); - QudaGaugeParam long_param = newQudaGaugeParam(); - setGaugeParams(fat_param, long_param, longlink, localDim, host_precision, device_precision, device_precision_sloppy, - inv_args.tadpole, inv_args.naik_epsilon); - - QudaInvertParam invertParam = newQudaInvertParam(); - - QudaParity local_parity = inv_args.evenodd; - const double reliable_delta = 1e-1; - - setInvertParams(host_precision, device_precision, device_precision_sloppy, mass, target_residual, - target_fermilab_residual, inv_args.max_iter, reliable_delta, local_parity, verbosity, - QUDA_CG_INVERTER, &invertParam); - - ColorSpinorParam csParam; - setColorSpinorParams(localDim, host_precision, &csParam); - - // dirty hack to invalidate the cached gauge field without breaking interface compatability - if (*num_iters == -1 || !canReuseResidentGauge(&invertParam)) invalidateGaugeQuda(); - - if (invalidate_quda_gauge || !create_quda_gauge) { - loadGaugeQuda(const_cast(fatlink), &fat_param); - if (longlink != nullptr) loadGaugeQuda(const_cast(longlink), &long_param); - invalidate_quda_gauge = false; - } - - if (longlink == nullptr) invertParam.dslash_type = QUDA_STAGGERED_DSLASH; - - int quark_offset = getColorVectorOffset(local_parity, false, localDim) * host_precision; - - invertQuda(static_cast(solution) + quark_offset, static_cast(source) + quark_offset, &invertParam); - - // return the number of iterations taken by the inverter - 
*num_iters = invertParam.iter; - *final_residual = invertParam.true_res; - *final_fermilab_residual = invertParam.true_res_hq; - - if (!create_quda_gauge) invalidateGaugeQuda(); - -} // qudaInvert -#endif - -// void* openQCD_qudaCreateGaugeField(void *gauge, int geometry, int precision) -// { -// qudamilc_called(__func__); -// QudaPrecision qudaPrecision = (precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; -// QudaGaugeParam qudaGaugeParam -// = newMILCGaugeParam(localDim, qudaPrecision, (geometry == 1) ? QUDA_GENERAL_LINKS : QUDA_SU3_LINKS); // TODO: -// change MILC to openQCD -// qudamilc_called(__func__); -// return createGaugeFieldQuda(gauge, geometry, &qudaGaugeParam); -// } - -// void qudaSaveGaugeField(void *gauge, void *inGauge) -// { -// qudamilc_called(__func__); -// cudaGaugeField *cudaGauge = reinterpret_cast(inGauge); -// QudaGaugeParam qudaGaugeParam = newMILCGaugeParam(localDim, cudaGauge->Precision(), QUDA_GENERAL_LINKS); // TODO: -// change MILC to openQCD saveGaugeFieldQuda(gauge, inGauge, &qudaGaugeParam); qudamilc_called(__func__); -// } - -// void qudaDestroyGaugeField(void *gauge) -// { -// qudamilc_called(__func__); -// destroyGaugeFieldQuda(gauge); -// qudamilc_called(__func__); -// } - -// void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_args, int external_precision, -// int quda_precision, double kappa, double reliable_delta); - -/*void setInvertParam(QudaInvertParam &invertParam, openQCD_QudaInvertArgs_t &inv_args, int external_precision, - int quda_precision, double kappa, double reliable_delta) -{ - - const QudaPrecision host_precision = (external_precision == 2) ? QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - const QudaPrecision device_precision = (quda_precision == 2) ? 
QUDA_DOUBLE_PRECISION : QUDA_SINGLE_PRECISION; - QudaPrecision device_precision_sloppy; - switch (inv_args.mixed_precision) { - case 2: device_precision_sloppy = QUDA_HALF_PRECISION; break; - case 1: device_precision_sloppy = QUDA_SINGLE_PRECISION; break; - default: device_precision_sloppy = device_precision; - } - - static const QudaVerbosity verbosity = getVerbosity(); + /* both fields reside on the CPU */ + invertParam.input_location = QUDA_CPU_FIELD_LOCATION; + invertParam.output_location = QUDA_CPU_FIELD_LOCATION; - invertParam.dslash_type = QUDA_WILSON_DSLASH; - invertParam.kappa = kappa; - invertParam.dagger = QUDA_DAG_NO; - invertParam.mass_normalization = QUDA_KAPPA_NORMALIZATION; - invertParam.gcrNkrylov = 30; - invertParam.reliable_delta = reliable_delta; - invertParam.maxiter = inv_args.max_iter; - - invertParam.cuda_prec_precondition = device_precision_sloppy; - invertParam.verbosity_precondition = verbosity; - invertParam.verbosity = verbosity; - invertParam.cpu_prec = host_precision; - invertParam.cuda_prec = device_precision; - invertParam.cuda_prec_sloppy = device_precision_sloppy; - invertParam.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; - invertParam.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - invertParam.clover_cpu_prec = host_precision; - invertParam.clover_cuda_prec = device_precision; - invertParam.clover_cuda_prec_sloppy = device_precision_sloppy; - invertParam.clover_cuda_prec_precondition = device_precision_sloppy; - invertParam.clover_order = QUDA_PACKED_CLOVER_ORDER; - - invertParam.compute_action = 0; -}*/ - -// TODO: OpenQCDMultigridPack functions a la MILC (cf. 
milc_interface.cpp) + MatQuda(static_cast(dst), static_cast(src), &invertParam); +} From be90f9d724c11935acd527a00af745924d38003d Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Thu, 14 Sep 2023 16:48:15 +0200 Subject: [PATCH 053/148] GCR inverter works --- include/quda_openqcd_interface.h | 126 +++++++++-------- lib/openqcd_interface.cpp | 231 +++++++++++++++++-------------- 2 files changed, 192 insertions(+), 165 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 4cec2fd675..78bb8cd8fc 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -29,6 +29,7 @@ typedef struct { int device; /** GPU device number */ } openQCD_QudaLayout_t; + /** * Parameters used to create a QUDA context. */ @@ -52,15 +53,35 @@ typedef struct { int gauge_loaded; /** Whether openQCD_qudaGaugeLoad() was called or not */ int clover_loaded; /** Whether openQCD_qudaCloverLoad() was called or not */ int dslash_setup; /** Whether openQCD_qudaSetDslashOptions() was called or not */ + openQCD_QudaInitArgs_t init; + openQCD_QudaLayout_t layout; } openQCD_QudaState_t; +typedef struct { + double kappa; + double mu; + double su3csw; + int dagger; +} openQCD_QudaDiracParam_t; + + +typedef struct { + double tol; + double nmx; + int nkv; + double reliable_delta; +} openQCD_QudaGCRParam_t; + + /** * Initialize the QUDA context. * - * @param input Meta data for the QUDA context + * @param[in] init Meta data for the QUDA context + * @param[in] layout The layout */ -void openQCD_qudaInit(openQCD_QudaInitArgs_t input); +void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout); + /** * Set set the local dimensions and machine topology for QUDA to use @@ -69,43 +90,13 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t input); */ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout); + /** * Destroy the QUDA context. */ void openQCD_qudaFinalize(void); -/** - * Parameters related to linear solvers. 
- */ - -typedef struct { - // TODO: work out what we want to expose here - int max_iter; /** Maximum number of iterations */ - QudaParity - evenodd; /** Which parity are we working on ? (options are QUDA_EVEN_PARITY, QUDA_ODD_PARITY, QUDA_INVALID_PARITY */ - int mixed_precision; /** Whether to use mixed precision or not (1 - yes, 0 - no) */ - double boundary_phase[4]; /** Boundary conditions */ - int make_resident_solution; /** Make the solution resident and don't copy back */ - int use_resident_solution; /** Use the resident solution */ - QudaInverterType solver_type; /** Type of solver to use */ - double tadpole; /** Tadpole improvement factor - set to 1.0 for - HISQ fermions since the tadpole factor is - baked into the links during their construction */ - double naik_epsilon; /** Naik epsilon parameter (HISQ fermions only).*/ - QudaDslashType dslash_type; -} openQCD_QudaInvertArgs_t; - - -/** - * @brief Setup Dirac operator - * - * @param[in] kappa kappa - * @param[in] mu twisted mass - */ -void openQCD_qudaSetDwOptions(double kappa, double mu); - - /** * @brief Norm square on QUDA. * @@ -115,8 +106,20 @@ void openQCD_qudaSetDwOptions(double kappa, double mu); */ double openQCD_qudaNorm(void *h_in); + +/** + * @brief Applies Dirac matrix to spinor. + * + * openQCD_out = gamma[dir] * openQCD_in + * + * @param[in] dir Dirac index, 0 <= dir <= 5, notice that dir is in + * openQCD convention, ie. (0: t, 1: x, 2: y, 3: z, 4: 5, 5: 5) + * @param[in] openQCD_in of type spinor_dble[NSPIN] + * @param[out] openQCD_out of type spinor_dble[NSPIN] + */ void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out); + /** * @brief Apply the Wilson-Clover Dirac operator to a field. All fields * passed and returned are host (CPU) fields in openQCD order. 
@@ -126,33 +129,36 @@ void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out); * @param[in] dagger Whether we are using the Hermitian conjugate system or * not (QUDA_DAG_NO or QUDA_DAG_YES) */ -void openQCD_qudaDw(void *src, void *dst, QudaDagType dagger); +void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p); /** - * Solve Ax=b for an improved staggered operator. All fields are fields - * passed and returned are host (CPU) field in MILC order. This - * function requires that persistent gauge and clover fields have - * been created prior. This interface is experimental. + * Solve Ax=b for a Clover Wilson operator using QUDAs GCR algorithm. All fields + * are fields passed and returned are host (CPU) field in openQCD order. This + * function requires that persistent gauge and clover fields have been created + * prior. * - * @param external_precision Precision of host fields passed to QUDA (2 - double, 1 - single) - * @param quda_precision Precision for QUDA to use (2 - double, 1 - single) - * @param mass Fermion mass parameter - * @param inv_args Struct setting some solver metadata - * @param target_residual Target residual - * @param target_relative_residual Target Fermilab residual - * @param milc_fatlink Fat-link field on the host - * @param milc_longlink Long-link field on the host - * @param source Right-hand side source field - * @param solution Solution spinor field - * @param final_residual True residual - * @param final_relative_residual True Fermilab residual - * @param num_iters Number of iterations taken - */ -void openQCD_qudaInvert(int external_precision, int quda_precision, double mass, openQCD_QudaInvertArgs_t inv_args, - double target_residual, double target_fermilab_residual, const void *const milc_fatlink, - const void *const milc_longlink, void *source, void *solution, double *const final_resid, - double *const final_rel_resid, int *num_iters); + * @param[in] source Source spinor + * @param[out] solution Solution 
spinor + * @param[in] dirac_param Dirac parameter struct + * @param[in] gcr_param GCR parameter struct + */ +void openQCD_qudaGCR(void *source, void *solution, + openQCD_QudaDiracParam_t dirac_param, openQCD_QudaGCRParam_t gcr_param); + + +/** + * Solve Ax=b for an Clover Wilson operator. All fields are fields passed and + * returned are host (CPU) field in openQCD order. This function requires that + * persistent gauge and clover fields have been created prior. + * + * @param[in] source Right-hand side source field + * @param[out] solution Solution spinor field + * @param[in] tol The tolerance + * @param[in] maxiter The maxiter + */ +void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param); + /** * @brief Wrapper for the plaquette. We could call plaqQuda() directly in @@ -195,14 +201,6 @@ void openQCD_qudaGaugeFree(void); void openQCD_qudaCloverLoad(void *clover); -/** - * @brief Calculates the clover field and its inverse - * - * @param[in] su3csw The csw coefficient - */ -void openQCD_qudaCloverCreate(double su3csw); - - /** * @brief Free the clover field allocated in quda. 
*/ diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 5bc89d4f45..91eaff18cc 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -46,9 +46,7 @@ static const int num_colors = sizeof(colors) / sizeof(uint32_t); #define POP_RANGE #endif -static openQCD_QudaInitArgs_t input; -static QudaInvertParam invertParam; -static openQCD_QudaState_t qudaState = {false, false, false, false}; +static openQCD_QudaState_t qudaState = {false, false, false, false, {}, {}}; using namespace quda; @@ -64,9 +62,9 @@ static lat_dim_t get_local_dims(int *fill = nullptr) for (int i=0; i<4; i++) { if (fill) { - fill[i] = input.layout.L[i]; + fill[i] = qudaState.layout.L[i]; } else { - X[i] = input.layout.L[i]; + X[i] = qudaState.layout.L[i]; } } @@ -196,48 +194,20 @@ static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec) param.gauge_fix = QUDA_GAUGE_FIXED_NO; param.scale = 1.0; param.anisotropy = 1.0; // 1.0 means not anisotropic - //param.tadpole_coeff = 1.0; - //param.scale = 0; param.ga_pad = getLinkPadding(param.X); /* Why this? */ return param; } -/** - * @brief Initialize clover param struct - * - * @param[in] kappa hopping parameter - * @param[in] su3csw The su 3 csw - * - * @return The quda gauge parameter struct. 
- */ -static QudaInvertParam newOpenQCDCloverParam(double kappa, double su3csw) -{ - QudaInvertParam param = newOpenQCDParam(); - - param.clover_location = QUDA_CPU_FIELD_LOCATION; - param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; - param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; - param.clover_order = QUDA_FLOAT8_CLOVER_ORDER; /*QUDA_OPENQCD_CLOVER_ORDER; */ - - param.compute_clover = true; - param.kappa = kappa; - param.clover_csw = su3csw; - param.clover_coeff = 0.0; - param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; - - return param; -} - - -void openQCD_qudaInit(openQCD_QudaInitArgs_t in) +void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout) { if (qudaState.initialized) return; - input = in; + qudaState.init = init; + qudaState.layout = layout; - setVerbosityQuda(input.verbosity, "QUDA: ", input.logfile); - openQCD_qudaSetLayout(input.layout); + setVerbosityQuda(qudaState.init.verbosity, "QUDA: ", qudaState.init.logfile); + openQCD_qudaSetLayout(qudaState.layout); qudaState.initialized = true; } @@ -281,9 +251,9 @@ void openQCD_qudaGaugeLoad(void *gauge) { QudaGaugeParam param = newOpenQCDGaugeParam(QUDA_DOUBLE_PRECISION); - void* buffer = malloc(4*input.volume*input.sizeof_su3_dble); - input.reorder_gauge_openqcd_to_quda(gauge, buffer); - input.gauge = gauge; + void* buffer = malloc(4*qudaState.init.volume*qudaState.init.sizeof_su3_dble); + qudaState.init.reorder_gauge_openqcd_to_quda(gauge, buffer); + qudaState.init.gauge = gauge; loadGaugeQuda(buffer, ¶m); free(buffer); @@ -293,11 +263,11 @@ void openQCD_qudaGaugeLoad(void *gauge) void openQCD_qudaGaugeSave(void *gauge) { - QudaGaugeParam qudaGaugeParam = newOpenQCDGaugeParam(QUDA_DOUBLE_PRECISION); + QudaGaugeParam param = newOpenQCDGaugeParam(QUDA_DOUBLE_PRECISION); - void* buffer = malloc(4*input.volume*input.sizeof_su3_dble); - saveGaugeQuda(buffer, &qudaGaugeParam); - input.reorder_gauge_quda_to_openqcd(buffer, gauge); + void* buffer = 
malloc(4*qudaState.init.volume*qudaState.init.sizeof_su3_dble); + saveGaugeQuda(buffer, ¶m); + qudaState.init.reorder_gauge_quda_to_openqcd(buffer, gauge); free(buffer); } @@ -316,24 +286,6 @@ void openQCD_qudaCloverLoad(void *clover) qudaState.clover_loaded = true; } -void openQCD_qudaCloverCreate(double su3csw) -{ - if (!qudaState.dslash_setup) { - errorQuda("Need to call openQCD_qudaSetDwOptions() first!"); - } - - QudaInvertParam param = newOpenQCDCloverParam(invertParam.kappa, su3csw); - - /* Set to Wilson Dirac operator with Clover term */ - invertParam.dslash_type = QUDA_CLOVER_WILSON_DSLASH; - - /** - * Leaving both h_clover = h_clovinv = NULL allocates the clover field on the - * GPU and finally calls @createCloverQuda to calculate the clover field. - */ - loadCloverQuda(NULL, NULL, ¶m); - qudaState.clover_loaded = true; -} void openQCD_qudaCloverFree(void) { @@ -342,32 +294,72 @@ void openQCD_qudaCloverFree(void) } -void openQCD_qudaSetDwOptions(double kappa, double mu) +QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) { - if (mu != 0.0) { - errorQuda("twisted mass not implemented yet."); + if (!qudaState.gauge_loaded) { + errorQuda("Gauge field not loaded into QUDA, cannot setup Dirac operator / clover term. Call openQCD_qudaGaugeLoad() first."); } - invertParam = newOpenQCDParam(); - - /* Set to Wilson Dirac operator */ - invertParam.dslash_type = QUDA_WILSON_DSLASH; + QudaInvertParam param = newOpenQCDParam(); - /* Hopping parameter */ - invertParam.kappa = kappa; + param.dslash_type = QUDA_WILSON_DSLASH; + param.kappa = p.kappa; + param.mu = p.mu; + param.dagger = p.dagger ? QUDA_DAG_YES : QUDA_DAG_NO; + + if (p.su3csw != 0.0) { + param.clover_location = QUDA_CPU_FIELD_LOCATION; // TODO: ?? not GPU?? + param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; + param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param.clover_order = QUDA_FLOAT8_CLOVER_ORDER; // what implication has this? 
+ + param.compute_clover = true; + param.clover_csw = p.su3csw; + param.clover_coeff = 0.0; + + // Set to Wilson Dirac operator with Clover term + param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; + + if (!qudaState.clover_loaded) { + /** + * Leaving both h_clover = h_clovinv = NULL allocates the clover field on + * the GPU and finally calls @createCloverQuda to calculate the clover + * field. + */ + loadCloverQuda(NULL, NULL, ¶m); // Create the clover field + qudaState.clover_loaded = true; + } + } - invertParam.inv_type = QUDA_CG_INVERTER; /* just set some, needed? */ + param.inv_type = QUDA_CG_INVERTER; /* just set some, needed? */ /* What is the difference? only works with QUDA_MASS_NORMALIZATION */ - invertParam.mass_normalization = QUDA_MASS_NORMALIZATION; + param.mass_normalization = QUDA_MASS_NORMALIZATION; /* Extent of the 5th dimension (for domain wall) */ - invertParam.Ls = 1; + param.Ls = 1; - /* Twisted mass parameter */ - invertParam.mu = mu; + return param; +} + +QudaInvertParam newOpenQCDSolverParam(openQCD_QudaDiracParam_t p) +{ + QudaInvertParam param = newOpenQCDDiracParam(p); + + param.compute_true_res = true; + + param.solution_type = QUDA_MAT_SOLUTION; + param.solve_type = QUDA_DIRECT_SOLVE; + param.matpc_type = QUDA_MATPC_EVEN_EVEN; + param.dagger = QUDA_DAG_YES; + param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; + param.inv_type_precondition = QUDA_INVALID_INVERTER; // disables any preconditioning + + // both fields reside on the CPU + param.input_location = QUDA_CPU_FIELD_LOCATION; + param.output_location = QUDA_CPU_FIELD_LOCATION; - qudaState.dslash_setup = true; + return param; } @@ -394,16 +386,6 @@ double openQCD_qudaNorm(void *h_in) } -/** - * @brief Applies Dirac matrix to spinor. - * - * openQCD_out = gamma[dir] * openQCD_in - * - * @param[in] dir Dirac index, 0 <= dir <= 5, notice that dir is in - * openQCD convention, ie. 
(0: t, 1: x, 2: y, 3: z, 4: 5, 5: 5) - * @param[in] openQCD_in of type spinor_dble[NSPIN] - * @param[out] openQCD_out of type spinor_dble[NSPIN] - */ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) { // sets up the necessary parameters @@ -457,23 +439,70 @@ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) } -void openQCD_qudaDw(void *src, void *dst, QudaDagType dagger) +void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) { - if (!qudaState.gauge_loaded) { - errorQuda("Gauge field not loaded into QUDA, cannot apply Dslash. Call openQCD_qudaGaugeLoad() first."); - return; - } + QudaInvertParam param = newOpenQCDDiracParam(p); - if (!qudaState.dslash_setup) { - errorQuda("Dslash parameters are not set, cannot apply Dslash!"); - return; - } + /* both fields reside on the CPU */ + param.input_location = QUDA_CPU_FIELD_LOCATION; + param.output_location = QUDA_CPU_FIELD_LOCATION; + + MatQuda(static_cast(dst), static_cast(src), ¶m); +} + + +void openQCD_qudaGCR(void *source, void *solution, + openQCD_QudaDiracParam_t dirac_param, openQCD_QudaGCRParam_t gcr_param) +{ + QudaInvertParam param = newOpenQCDSolverParam(dirac_param); + + param.inv_type = QUDA_GCR_INVERTER; + param.tol = gcr_param.tol; + param.maxiter = gcr_param.nmx; + param.gcrNkrylov = gcr_param.nkv; + param.reliable_delta = gcr_param.reliable_delta; + + invertQuda(static_cast(solution), static_cast(source), ¶m); + + printfQuda("true_res = %.2e\n", param.true_res); + printfQuda("true_res_hq = %.2e\n", param.true_res_hq); + printfQuda("iter = %d\n", param.iter); + printfQuda("gflops = %.2e\n", param.gflops); + printfQuda("secs = %.2e\n", param.secs); + printfQuda("Nsteps = %d\n", param.Nsteps); +} + + +void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param) +{ + QudaInvertParam param = newOpenQCDSolverParam(dirac_param); - invertParam.dagger = dagger; + /*param.verbosity = QUDA_VERBOSE;*/ + param.inv_type = 
QUDA_GCR_INVERTER; /*QUDA_CG_INVERTER*/ + param.tol = 1e-2; + param.compute_true_res = true; + param.maxiter = 100; + + param.gcrNkrylov = 20; + param.reliable_delta = 1e-5; + + param.solution_type = QUDA_MAT_SOLUTION; + param.solve_type = QUDA_DIRECT_SOLVE; + param.matpc_type = QUDA_MATPC_EVEN_EVEN; + param.dagger = QUDA_DAG_YES; + param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; + param.inv_type_precondition = QUDA_INVALID_INVERTER; /* disables any preconditioning */ /* both fields reside on the CPU */ - invertParam.input_location = QUDA_CPU_FIELD_LOCATION; - invertParam.output_location = QUDA_CPU_FIELD_LOCATION; + param.input_location = QUDA_CPU_FIELD_LOCATION; + param.output_location = QUDA_CPU_FIELD_LOCATION; + + invertQuda(static_cast(solution), static_cast(source), ¶m); - MatQuda(static_cast(dst), static_cast(src), &invertParam); + printfQuda("true_res = %.2e\n", param.true_res); + printfQuda("true_res_hq = %.2e\n", param.true_res_hq); + printfQuda("iter = %d\n", param.iter); + printfQuda("gflops = %.2e\n", param.gflops); + printfQuda("secs = %.2e\n", param.secs); + printfQuda("Nsteps = %d\n", param.Nsteps); } From 736d2cddbbb1f9deba0e19970101e225de40e8d1 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Thu, 14 Sep 2023 19:27:19 +0200 Subject: [PATCH 054/148] cleaned up a little --- include/quda_openqcd_interface.h | 54 ++++++++---------- lib/openqcd_interface.cpp | 95 ++++++++++++++++++-------------- 2 files changed, 76 insertions(+), 73 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 78bb8cd8fc..8601fe09e0 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -39,8 +39,6 @@ typedef struct { FILE *logfile; /** log file handler */ void *gauge; /** base pointer to the gauge fields */ int volume; /** VOLUME */ - int sizeof_su3_dble; /** sizeof(su3_dble) */ - int sizeof_spinor_dble; /** sizeof(spinor_dble) */ void (*reorder_gauge_openqcd_to_quda)(void *in, void *out); 
void (*reorder_gauge_quda_to_openqcd)(void *in, void *out); void (*reorder_spinor_openqcd_to_quda)(void *in, void *out); @@ -59,18 +57,18 @@ typedef struct { typedef struct { - double kappa; - double mu; - double su3csw; - int dagger; + double kappa; /* kappa: hopping parameter */ + double mu; /* mu: twisted mass */ + double su3csw; /* su3csw: csw coefficient */ + int dagger; /* dagger: whether to apply D or D^dagger */ } openQCD_QudaDiracParam_t; typedef struct { - double tol; - double nmx; - int nkv; - double reliable_delta; + double tol; /* solver tolerance (relative residual) */ + double nmx; /* maximal number of steps */ + int nkv; /* number of Krylov vector to keep */ + double reliable_delta; /* ???? */ } openQCD_QudaGCRParam_t; @@ -78,19 +76,11 @@ typedef struct { * Initialize the QUDA context. * * @param[in] init Meta data for the QUDA context - * @param[in] layout The layout + * @param[in] layout The layout struct */ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout); -/** - * Set set the local dimensions and machine topology for QUDA to use - * - * @param layout Struct defining local dimensions and machine topology - */ -void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout); - - /** * Destroy the QUDA context. */ @@ -100,7 +90,7 @@ void openQCD_qudaFinalize(void); /** * @brief Norm square on QUDA. * - * @param[in] h_in Input field (from openQCD) + * @param[in] h_in Spinor input field (from openQCD) * * @return The norm */ @@ -124,10 +114,9 @@ void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out); * @brief Apply the Wilson-Clover Dirac operator to a field. All fields * passed and returned are host (CPU) fields in openQCD order. 
* - * @param[in] src Source spinor field - * @param[out] dst Destination spinor field - * @param[in] dagger Whether we are using the Hermitian conjugate system or - * not (QUDA_DAG_NO or QUDA_DAG_YES) + * @param[in] src Source spinor field + * @param[out] dst Destination spinor field + * @param[in] p Dirac parameter struct */ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p); @@ -152,10 +141,9 @@ void openQCD_qudaGCR(void *source, void *solution, * returned are host (CPU) field in openQCD order. This function requires that * persistent gauge and clover fields have been created prior. * - * @param[in] source Right-hand side source field - * @param[out] solution Solution spinor field - * @param[in] tol The tolerance - * @param[in] maxiter The maxiter + * @param[in] source Right-hand side source field + * @param[out] solution Solution spinor field + * @param[in] dirac_param Dirac parameter struct */ void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param); @@ -174,17 +162,19 @@ double openQCD_qudaPlaquette(void); /** * @brief Load the gauge fields from host to quda. * - * @param[in] gauge The gauge fields (in openqcd order) + * @param[in] gauge The gauge fields (in openqcd order) + * @param[in] prec Precision of the gauge field */ -void openQCD_qudaGaugeLoad(void *gauge); +void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec); /** * @brief Save the gauge fields from quda to host. 
* - * @param[out] gauge The gauge fields (will be stored in openqcd order) + * @param[out] gauge The gauge fields (will be stored in openqcd order) + * @param[in] prec Precision of the gauge field */ -void openQCD_qudaGaugeSave(void *gauge); +void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec); /** diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 91eaff18cc..f22b8a0937 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -96,9 +96,9 @@ static int rankFromCoords(const int *coords, void *fdata) // TODO: /** - * @brief Set layout parameters. + * Set set the local dimensions and machine topology for QUDA to use * - * @param[in] layout The layout + * @param layout Struct defining local dimensions and machine topology */ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) { @@ -135,9 +135,9 @@ static int getLinkPadding(const int dim[4]) /** - * @brief Initialize invert param struct - * - * @return The quda invert parameter struct. + * @brief Creates a new quda parameter struct + * + * @return The quda parameter struct. */ static QudaInvertParam newOpenQCDParam(void) { @@ -147,8 +147,8 @@ static QudaInvertParam newOpenQCDParam(void) param.verbosity = verbosity; - param.cpu_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the input fermion fields */ - param.cuda_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA solver */ + param.cpu_prec = QUDA_DOUBLE_PRECISION; // The precision used by the input fermion fields + param.cuda_prec = QUDA_DOUBLE_PRECISION; // The precision used by the QUDA solver /** * The order of the input and output fermion fields. 
Imposes fieldOrder = @@ -157,7 +157,7 @@ static QudaInvertParam newOpenQCDParam(void) */ param.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - /* Gamma basis of the input and output host fields */ + // Gamma basis of the input and output host fields param.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; return param; @@ -165,7 +165,7 @@ static QudaInvertParam newOpenQCDParam(void) /** - * @brief Initialize gauge param struct + * @brief Initialize quda gauge param struct * * @param[in] prec precision * @@ -181,20 +181,16 @@ static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec) param.reconstruct_sloppy = param.reconstruct = QUDA_RECONSTRUCT_NO; - /** - * This make quda to instantiate OpenQCDOrder - */ + // This make quda to instantiate OpenQCDOrder param.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; - /** - * Seems to have no effect ... - */ + // Seems to have no effect ... param.t_boundary = QUDA_PERIODIC_T; param.gauge_fix = QUDA_GAUGE_FIXED_NO; param.scale = 1.0; param.anisotropy = 1.0; // 1.0 means not anisotropic - param.ga_pad = getLinkPadding(param.X); /* Why this? */ + param.ga_pad = getLinkPadding(param.X); // Why this? 
return param; } @@ -247,13 +243,12 @@ double openQCD_qudaPlaquette(void) } -void openQCD_qudaGaugeLoad(void *gauge) +void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec) { - QudaGaugeParam param = newOpenQCDGaugeParam(QUDA_DOUBLE_PRECISION); + QudaGaugeParam param = newOpenQCDGaugeParam(prec); - void* buffer = malloc(4*qudaState.init.volume*qudaState.init.sizeof_su3_dble); + void* buffer = malloc(4*qudaState.init.volume*18*prec); qudaState.init.reorder_gauge_openqcd_to_quda(gauge, buffer); - qudaState.init.gauge = gauge; loadGaugeQuda(buffer, ¶m); free(buffer); @@ -261,16 +256,17 @@ void openQCD_qudaGaugeLoad(void *gauge) } -void openQCD_qudaGaugeSave(void *gauge) +void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec) { - QudaGaugeParam param = newOpenQCDGaugeParam(QUDA_DOUBLE_PRECISION); + QudaGaugeParam param = newOpenQCDGaugeParam(prec); - void* buffer = malloc(4*qudaState.init.volume*qudaState.init.sizeof_su3_dble); + void* buffer = malloc(4*qudaState.init.volume*18*prec); saveGaugeQuda(buffer, ¶m); qudaState.init.reorder_gauge_quda_to_openqcd(buffer, gauge); free(buffer); } + void openQCD_qudaGaugeFree(void) { freeGaugeQuda(); @@ -294,10 +290,17 @@ void openQCD_qudaCloverFree(void) } -QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) +/** + * @brief Creates a new quda Dirac parameter struct + * + * @param[in] p OpenQCD Dirac parameter struct + * + * @return The quda Dirac parameter struct. + */ +static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) { if (!qudaState.gauge_loaded) { - errorQuda("Gauge field not loaded into QUDA, cannot setup Dirac operator / clover term. Call openQCD_qudaGaugeLoad() first."); + errorQuda("Gauge field not loaded into QUDA, cannot setup Dirac operator / Clover term. Call openQCD_qudaGaugeLoad() first."); } QudaInvertParam param = newOpenQCDParam(); @@ -308,7 +311,7 @@ QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) param.dagger = p.dagger ? 
QUDA_DAG_YES : QUDA_DAG_NO; if (p.su3csw != 0.0) { - param.clover_location = QUDA_CPU_FIELD_LOCATION; // TODO: ?? not GPU?? + param.clover_location = QUDA_CUDA_FIELD_LOCATION; // seems to have no effect? param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; param.clover_order = QUDA_FLOAT8_CLOVER_ORDER; // what implication has this? @@ -331,18 +334,26 @@ QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) } } - param.inv_type = QUDA_CG_INVERTER; /* just set some, needed? */ + param.inv_type = QUDA_CG_INVERTER; // just set some, needed? - /* What is the difference? only works with QUDA_MASS_NORMALIZATION */ + // What is the difference? only works with QUDA_MASS_NORMALIZATION param.mass_normalization = QUDA_MASS_NORMALIZATION; - /* Extent of the 5th dimension (for domain wall) */ + // Extent of the 5th dimension (for domain wall) param.Ls = 1; return param; } -QudaInvertParam newOpenQCDSolverParam(openQCD_QudaDiracParam_t p) + +/** + * @brief Creates a new quda solver parameter struct + * + * @param[in] p OpenQCD Dirac parameter struct + * + * @return The quda solver parameter struct. 
+ */ +static QudaInvertParam newOpenQCDSolverParam(openQCD_QudaDiracParam_t p) { QudaInvertParam param = newOpenQCDDiracParam(p); @@ -351,14 +362,9 @@ QudaInvertParam newOpenQCDSolverParam(openQCD_QudaDiracParam_t p) param.solution_type = QUDA_MAT_SOLUTION; param.solve_type = QUDA_DIRECT_SOLVE; param.matpc_type = QUDA_MATPC_EVEN_EVEN; - param.dagger = QUDA_DAG_YES; param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; param.inv_type_precondition = QUDA_INVALID_INVERTER; // disables any preconditioning - // both fields reside on the CPU - param.input_location = QUDA_CPU_FIELD_LOCATION; - param.output_location = QUDA_CPU_FIELD_LOCATION; - return param; } @@ -443,7 +449,7 @@ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) { QudaInvertParam param = newOpenQCDDiracParam(p); - /* both fields reside on the CPU */ + // both fields reside on the CPU param.input_location = QUDA_CPU_FIELD_LOCATION; param.output_location = QUDA_CPU_FIELD_LOCATION; @@ -456,6 +462,10 @@ void openQCD_qudaGCR(void *source, void *solution, { QudaInvertParam param = newOpenQCDSolverParam(dirac_param); + // both fields reside on the CPU + param.input_location = QUDA_CPU_FIELD_LOCATION; + param.output_location = QUDA_CPU_FIELD_LOCATION; + param.inv_type = QUDA_GCR_INVERTER; param.tol = gcr_param.tol; param.maxiter = gcr_param.nmx; @@ -477,8 +487,12 @@ void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t d { QudaInvertParam param = newOpenQCDSolverParam(dirac_param); - /*param.verbosity = QUDA_VERBOSE;*/ - param.inv_type = QUDA_GCR_INVERTER; /*QUDA_CG_INVERTER*/ + // both fields reside on the CPU + param.input_location = QUDA_CPU_FIELD_LOCATION; + param.output_location = QUDA_CPU_FIELD_LOCATION; + + //param.verbosity = QUDA_VERBOSE; + param.inv_type = QUDA_GCR_INVERTER; // QUDA_CG_INVERTER param.tol = 1e-2; param.compute_true_res = true; param.maxiter = 100; @@ -489,11 +503,10 @@ void openQCD_qudaInvert(void *source, void *solution, 
openQCD_QudaDiracParam_t d param.solution_type = QUDA_MAT_SOLUTION; param.solve_type = QUDA_DIRECT_SOLVE; param.matpc_type = QUDA_MATPC_EVEN_EVEN; - param.dagger = QUDA_DAG_YES; param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; - param.inv_type_precondition = QUDA_INVALID_INVERTER; /* disables any preconditioning */ + param.inv_type_precondition = QUDA_INVALID_INVERTER; // disables any preconditioning - /* both fields reside on the CPU */ + // both fields reside on the CPU param.input_location = QUDA_CPU_FIELD_LOCATION; param.output_location = QUDA_CPU_FIELD_LOCATION; From 3f131b02cc58b4adc709b5f3dcb1ac1aece3c52b Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Tue, 19 Sep 2023 16:27:28 +0200 Subject: [PATCH 055/148] Add fvtx for profiler --- include/quda_openqcd_interface.h | 4 ++-- lib/copy_color_spinor.cuh | 1 + lib/openqcd_interface.cpp | 29 ++++++++++++++++++++++++++++- 3 files changed, 31 insertions(+), 3 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 8601fe09e0..27b7b94144 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -68,7 +68,7 @@ typedef struct { double tol; /* solver tolerance (relative residual) */ double nmx; /* maximal number of steps */ int nkv; /* number of Krylov vector to keep */ - double reliable_delta; /* ???? 
*/ + double reliable_delta; /* controls interval at wich accurate residual is updated */ } openQCD_QudaGCRParam_t; @@ -132,7 +132,7 @@ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p); * @param[in] dirac_param Dirac parameter struct * @param[in] gcr_param GCR parameter struct */ -void openQCD_qudaGCR(void *source, void *solution, +double openQCD_qudaGCR(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param, openQCD_QudaGCRParam_t gcr_param); diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index 20833e911e..f290b97f00 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -27,6 +27,7 @@ namespace quda { in(in) { strcat(aux, out.AuxString()); + /* AA: for tune cache */ if (out.GammaBasis()==in.GammaBasis()) strcat(aux, ",PreserveBasis"); else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS) strcat(aux, ",NonRelBasis"); else if (out.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",RelBasis"); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index f22b8a0937..5cb5535ca9 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -50,6 +50,28 @@ static openQCD_QudaState_t qudaState = {false, false, false, false, {}, {}}; using namespace quda; +template void inline qudaopenqcd_called(const char *func, QudaVerbosity verb) +{ + // add NVTX markup if enabled + if (start) { + PUSH_RANGE(func, 1); + } else { + POP_RANGE; + } + + #ifdef QUDAMILC_VERBOSE + if (verb >= QUDA_VERBOSE) { + if (start) { + printfQuda("QUDA_OPENQCD_INTERFACE: %s (called) \n", func); + } else { + printfQuda("QUDA_OPENQCD_INTERFACE: %s (return) \n", func); + } + } +#endif +} + +template void inline qudaopenqcd_called(const char *func) { qudaopenqcd_called(func, getVerbosity()); } + /** * @brief Returns the local lattice dimensions as lat_dim_t @@ -203,7 +225,9 @@ void 
openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout) qudaState.layout = layout; setVerbosityQuda(qudaState.init.verbosity, "QUDA: ", qudaState.init.logfile); + qudaopenqcd_called(__func__); openQCD_qudaSetLayout(qudaState.layout); + qudaopenqcd_called(__func__); qudaState.initialized = true; } @@ -247,6 +271,7 @@ void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec) { QudaGaugeParam param = newOpenQCDGaugeParam(prec); + /* Matthias Wagner: optimize that */ void* buffer = malloc(4*qudaState.init.volume*18*prec); qudaState.init.reorder_gauge_openqcd_to_quda(gauge, buffer); loadGaugeQuda(buffer, ¶m); @@ -457,7 +482,7 @@ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) } -void openQCD_qudaGCR(void *source, void *solution, +double openQCD_qudaGCR(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param, openQCD_QudaGCRParam_t gcr_param) { QudaInvertParam param = newOpenQCDSolverParam(dirac_param); @@ -480,6 +505,8 @@ void openQCD_qudaGCR(void *source, void *solution, printfQuda("gflops = %.2e\n", param.gflops); printfQuda("secs = %.2e\n", param.secs); printfQuda("Nsteps = %d\n", param.Nsteps); + + return param.true_res; } From 92a8b50a46dd3f2b9aef1e3953d81b21228f25ba Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Tue, 19 Sep 2023 16:58:38 +0200 Subject: [PATCH 056/148] Optimize malloc --- lib/openqcd_interface.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 5cb5535ca9..137e330818 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -272,10 +272,10 @@ void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec) QudaGaugeParam param = newOpenQCDGaugeParam(prec); /* Matthias Wagner: optimize that */ - void* buffer = malloc(4*qudaState.init.volume*18*prec); + void* buffer = pool_pinned_malloc(4*qudaState.init.volume*18*prec); qudaState.init.reorder_gauge_openqcd_to_quda(gauge, buffer); 
loadGaugeQuda(buffer, ¶m); - free(buffer); + pool_pinned_free(buffer); qudaState.gauge_loaded = true; } @@ -285,10 +285,10 @@ void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec) { QudaGaugeParam param = newOpenQCDGaugeParam(prec); - void* buffer = malloc(4*qudaState.init.volume*18*prec); + void* buffer = pool_pinned_malloc(4*qudaState.init.volume*18*prec); saveGaugeQuda(buffer, ¶m); qudaState.init.reorder_gauge_quda_to_openqcd(buffer, gauge); - free(buffer); + pool_pinned_free(buffer); } From 088678ef2d99965c21601666aff8f7b708b113b7 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Wed, 20 Sep 2023 17:26:00 +0200 Subject: [PATCH 057/148] Add parameters for multigrid (incomplete) --- include/quda_openqcd_interface.h | 11 ++++++ lib/openqcd_interface.cpp | 64 ++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 27b7b94144..692b78f831 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -147,6 +147,17 @@ double openQCD_qudaGCR(void *source, void *solution, */ void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param); +/** + * Solve Ax=b for an Clover Wilson operator with a multigrid solver. All fields are fields passed and + * returned are host (CPU) field in openQCD order. This function requires that + * persistent gauge and clover fields have been created prior. + * + * @param[in] source Right-hand side source field + * @param[out] solution Solution spinor field + * @param[in] dirac_param Dirac parameter struct + */ +void openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param); + /** * @brief Wrapper for the plaquette. 
We could call plaqQuda() directly in diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 137e330818..0ef220978a 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -546,3 +546,67 @@ void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t d printfQuda("secs = %.2e\n", param.secs); printfQuda("Nsteps = %d\n", param.Nsteps); } + +void openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param) +{ + QudaInvertParam invert_param = newOpenQCDSolverParam(dirac_param); + QudaMultigridParam multigrid_param = newQudaMultigridParam(); + + //param.verbosity = QUDA_VERBOSE; + invert_param.reliable_delta = 1e-5; + invert_param.gcrNkrylov = 20; + invert_param.maxiter = 2000; + invert_param.tol = 1e-5; + invert_param.inv_type = QUDA_GCR_INVERTER; + invert_param.solution_type = QUDA_MAT_SOLUTION; + invert_param.solve_type = QUDA_DIRECT_SOLVE; + invert_param.matpc_type = QUDA_MATPC_EVEN_EVEN; + invert_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; + invert_param.inv_type_precondition = QUDA_MG_INVERTER; + + // set the params, hard code the solver + // parameters copied from recommended settings from Wiki + multigrid_param.n_level = 2; + multigrid_param.generate_all_levels = QUDA_BOOLEAN_TRUE; + multigrid_param.run_verify = QUDA_BOOLEAN_TRUE; + multigrid_param.invert_param = &invert_param; + + // try setting minimal parameters - leave rest to default + // level 0 fine + multigrid_param.geo_block_size[0][0] = 4; // xytz + multigrid_param.geo_block_size[0][1] = 4; + multigrid_param.geo_block_size[0][2] = 4; + multigrid_param.geo_block_size[0][3] = 4; + multigrid_param.n_vec[0] = 24; + multigrid_param.precision_null[0] = QUDA_HALF_PRECISION; + multigrid_param.smoother[0] = QUDA_CA_GCR_INVERTER; + multigrid_param.nu_pre[0] = 0; + multigrid_param.nu_post[0] = 8; + multigrid_param.omega[0] = 0.8; + multigrid_param.smoother_solve_type[0] = QUDA_DIRECT_PC_SOLVE; + 
multigrid_param.cycle_type[0] = QUDA_MG_CYCLE_RECURSIVE; + + // level 1 coarse + // no smoother required for innermost + // so no blocks + multigrid_param.precision_null[1] = QUDA_HALF_PRECISION; + multigrid_param.coarse_solver[1] = QUDA_CA_GCR_INVERTER; + multigrid_param.coarse_solver_tol[1] = 0.25; + multigrid_param.coarse_solver_maxiter[1] = 50; + multigrid_param.coarse_grid_solution_type[1] = QUDA_MATPC_SOLUTION; + multigrid_param.smoother_solve_type[1] = QUDA_DIRECT_PC_SOLVE; + multigrid_param.cycle_type[1] = QUDA_MG_CYCLE_RECURSIVE; + + void *mgprec = newMultigridQuda(&multigrid_param); + invert_param.preconditioner = mgprec; + + invertQuda(static_cast(solution), static_cast(source), &invert_param); + + destroyMultigridQuda(mgprec); + + printfQuda("true_res = %.2e\n", invert_param.true_res); + printfQuda("true_res_hq = %.2e\n", invert_param.true_res_hq); + printfQuda("iter = %d\n", invert_param.iter); + printfQuda("gflops = %.2e\n", invert_param.gflops); + printfQuda("secs = %.2e\n", invert_param.secs); +} From 271f8285dc6972ae7c8a58e16f8b466c2176493d Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Thu, 21 Sep 2023 10:24:14 +0200 Subject: [PATCH 058/148] Undo minus sign in back transformation --- include/kernels/copy_color_spinor.cuh | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index 88b91bfc0a..6c67fa7f42 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -166,18 +166,18 @@ namespace quda { int s2[4] = {2, 3, 2, 3}; /* RG: I added a global minus here to ix the global minus imposed by the transformation */ - FloatOut K1[4] = {static_cast(-kU), - static_cast(-kU), - static_cast(-kU), - static_cast(-kU)}; - FloatOut K2[4] = {static_cast(kU), + FloatOut K1[4] = {static_cast(kU), static_cast(kU), + static_cast(kU), + static_cast(kU)}; + FloatOut K2[4] = {static_cast(-kU), static_cast(-kU), - 
static_cast(-kU)}; - /* U = [-1 0 1 0] - [ 0 -1 0 1] - [-1 0 -1 0] - [ 0 -1 0 -1] / sqrt(2) */ + static_cast(kU), + static_cast(kU)}; + /* U = [1 0 -1 0] + [0 1 0 -1] + [1 0 1 0] + [0 1 0 1] / sqrt(2) */ for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); From 00136bfe268b9995933fe8da20f4e8eb30f1d351 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Thu, 21 Sep 2023 11:13:22 +0200 Subject: [PATCH 059/148] Add global minus in Dirac --- include/color_spinor.h | 6 ++++++ include/kernels/copy_color_spinor.cuh | 2 +- lib/openqcd_interface.cpp | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) diff --git a/include/color_spinor.h b/include/color_spinor.h index ceae596e04..4f24e8d742 100644 --- a/include/color_spinor.h +++ b/include/color_spinor.h @@ -789,6 +789,12 @@ namespace quda { case 1: // positive projector #pragma unroll for (int i = 0; i < Nc; i++) { + /* + recon[0] = input[0]; + recon[1] = input[1]; + recon[2] = input[0]; + recon[3] = input[0]; + */ recon(0, i) = t(0, i); recon(1, i) = t(1, i); recon(2, i) = t(0, i); diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index 6c67fa7f42..db730ad170 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -165,7 +165,7 @@ namespace quda { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; - /* RG: I added a global minus here to ix the global minus imposed by the transformation */ + /* RG: I added a global minus here to fix the global minus imposed by the transformation */ FloatOut K1[4] = {static_cast(kU), static_cast(kU), static_cast(kU), diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 137e330818..972285fe7e 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -479,6 +479,8 @@ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) param.output_location = QUDA_CPU_FIELD_LOCATION; MatQuda(static_cast(dst), static_cast(src), ¶m); + /* AA: QUDA applies - 
Dw */ + blas::ax(-1.0, dst); } From 076cf0e94237b08fd18d5a41c56d4968d8a36cc0 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Thu, 21 Sep 2023 11:20:26 +0200 Subject: [PATCH 060/148] Add back and forth test --- include/kernels/copy_color_spinor.cuh | 1 - include/quda_openqcd_interface.h | 2 ++ lib/openqcd_interface.cpp | 25 +++++++++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index db730ad170..40704bec19 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -165,7 +165,6 @@ namespace quda { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; - /* RG: I added a global minus here to fix the global minus imposed by the transformation */ FloatOut K1[4] = {static_cast(kU), static_cast(kU), static_cast(kU), diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 27b7b94144..193717efa9 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -86,6 +86,8 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout); */ void openQCD_qudaFinalize(void); +void openQCD_back_and_forth(void *h_in, void *h_out); + /** * @brief Norm square on QUDA. diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 972285fe7e..3567d6e2ce 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -394,6 +394,31 @@ static QudaInvertParam newOpenQCDSolverParam(openQCD_QudaDiracParam_t p) } +/** + * @brief Calculates the norm of a spinor. 
+ * + * @param[in] h_in input spinor of type spinor_dble[NSPIN] + * + * @return norm + */ +void openQCD_back_and_forth(void *h_in, void *h_out) +{ + QudaInvertParam param = newOpenQCDParam(); + + ColorSpinorParam cpuParam(h_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam); + + ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); + ColorSpinorField in(cudaParam); + + ColorSpinorParam cpuParam_out(h_out, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField out_h(cpuParam); + + in = in_h; + out_h = in; +} + + /** * @brief Calculates the norm of a spinor. * From d9c4fff67d2a94d2f369b6cf2f2fd05a40cbece5 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Thu, 21 Sep 2023 11:26:18 +0200 Subject: [PATCH 061/148] Remove *= -1 --- lib/openqcd_interface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 3567d6e2ce..d2ec5edc17 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -505,7 +505,7 @@ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) MatQuda(static_cast(dst), static_cast(src), ¶m); /* AA: QUDA applies - Dw */ - blas::ax(-1.0, dst); + /* blas::ax(-1.0, dst); */ } From a37c2861f078f874892ca246655b13e1aa4e67ee Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Thu, 21 Sep 2023 12:17:01 +0200 Subject: [PATCH 062/148] Revert "Undo minus sign in back transformation" This reverts commit 271f8285dc6972ae7c8a58e16f8b466c2176493d. 
--- include/kernels/copy_color_spinor.cuh | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index 40704bec19..88b91bfc0a 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -165,18 +165,19 @@ namespace quda { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; - FloatOut K1[4] = {static_cast(kU), - static_cast(kU), - static_cast(kU), - static_cast(kU)}; - FloatOut K2[4] = {static_cast(-kU), + /* RG: I added a global minus here to ix the global minus imposed by the transformation */ + FloatOut K1[4] = {static_cast(-kU), static_cast(-kU), + static_cast(-kU), + static_cast(-kU)}; + FloatOut K2[4] = {static_cast(kU), static_cast(kU), - static_cast(kU)}; - /* U = [1 0 -1 0] - [0 1 0 -1] - [1 0 1 0] - [0 1 0 1] / sqrt(2) */ + static_cast(-kU), + static_cast(-kU)}; + /* U = [-1 0 1 0] + [ 0 -1 0 1] + [-1 0 -1 0] + [ 0 -1 0 -1] / sqrt(2) */ for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); From 7c8936202e92011b0fffc9faeb004ee6731e1d3e Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Thu, 21 Sep 2023 13:50:46 +0200 Subject: [PATCH 063/148] Fix back_and_forth test --- lib/openqcd_interface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index d2ec5edc17..bab95ef746 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -412,7 +412,7 @@ void openQCD_back_and_forth(void *h_in, void *h_out) ColorSpinorField in(cudaParam); ColorSpinorParam cpuParam_out(h_out, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); - ColorSpinorField out_h(cpuParam); + ColorSpinorField out_h(cpuParam_out); in = in_h; out_h = in; From 30ae3bdfa742aa842531078a579ff5279a0352da Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Thu, 21 Sep 2023 14:34:18 +0200 Subject: [PATCH 064/148] Fix mistake in transformation and 
remove Romans' minus sign in transformation --- include/kernels/copy_color_spinor.cuh | 35 +++++++++++++-------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index 88b91bfc0a..cf13a01136 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -137,18 +137,18 @@ namespace quda { __device__ __host__ inline void operator()(complex out[Ns*Nc], const complex in[Ns*Nc]) const { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; - FloatOut K1[4] = {static_cast(kP), - static_cast(kP), + FloatOut K1[4] = {static_cast(-kP), static_cast(-kP), - static_cast(-kP)}; + static_cast(kP), + static_cast(kP)}; FloatOut K2[4] = {static_cast(kP), static_cast(kP), static_cast(kP), static_cast(kP)}; - /* U = [1 0 1 0] - [0 1 0 1] - [-1 0 1 0] - [0 -1 0 1] / sqrt(2) */ + /* U = [-1 0 1 0] + [0 -1 0 1] + [1 0 1 0] + [0 1 0 1] / sqrt(2) */ for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); @@ -165,19 +165,18 @@ namespace quda { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; - /* RG: I added a global minus here to ix the global minus imposed by the transformation */ - FloatOut K1[4] = {static_cast(-kU), - static_cast(-kU), - static_cast(-kU), - static_cast(-kU)}; - FloatOut K2[4] = {static_cast(kU), + FloatOut K1[4] = {static_cast(kU), + static_cast(kU), static_cast(kU), + static_cast(kU)}; + FloatOut K2[4] = {static_cast(-kU), static_cast(-kU), - static_cast(-kU)}; - /* U = [-1 0 1 0] - [ 0 -1 0 1] - [-1 0 -1 0] - [ 0 -1 0 -1] / sqrt(2) */ + static_cast(kU), + static_cast(kU)}; + /* U = [-1 0 1 0] + [ 0 -1 0 1] + [1 0 1 0] + [ 0 1 0 1] / sqrt(2) */ for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); From 7985910cdd75103dbbf87faa32b810490a2e1745 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Thu, 21 Sep 2023 14:47:52 +0200 Subject: [PATCH 065/148] Fix mistake in Udagger --- 
include/kernels/copy_color_spinor.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index cf13a01136..778bd36d9a 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -165,12 +165,12 @@ namespace quda { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; - FloatOut K1[4] = {static_cast(kU), - static_cast(kU), + FloatOut K1[4] = {static_cast(-kU), + static_cast(-kU), static_cast(kU), static_cast(kU)}; - FloatOut K2[4] = {static_cast(-kU), - static_cast(-kU), + FloatOut K2[4] = {static_cast(kU), + static_cast(kU), static_cast(kU), static_cast(kU)}; /* U = [-1 0 1 0] From 5f57224fc6f8f9e0f320cb58aa8a852bea0e0128 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Thu, 21 Sep 2023 15:47:48 +0200 Subject: [PATCH 066/148] Modify gamma5 gamma5_openqcd = -1 * U * gamma5_ukqcd * U^dagger --- lib/openqcd_interface.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index bab95ef746..06df29256b 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -480,6 +480,8 @@ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) case 4: case 5: gamma5(out, in); + /* gamma5_openqcd = -1 * gamma5_ukqcd */ + blas::caxpby(Complex(-1.0, 0.0), out, 0.0, out); break; default: errorQuda("Unknown gamma: %d\n", dir); From fd9975e15af39404afc6e34d58ebfeb83225e56f Mon Sep 17 00:00:00 2001 From: Tim Harris Date: Fri, 22 Sep 2023 11:16:28 +0200 Subject: [PATCH 067/148] Changes for MG support. 
--- lib/openqcd_interface.cpp | 47 ++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 0ef220978a..79932ff2b9 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -171,12 +171,16 @@ static QudaInvertParam newOpenQCDParam(void) param.cpu_prec = QUDA_DOUBLE_PRECISION; // The precision used by the input fermion fields param.cuda_prec = QUDA_DOUBLE_PRECISION; // The precision used by the QUDA solver + /* TH added for MG support */ + param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; // The precision used by the QUDA solver + param.cuda_prec_precondition = QUDA_HALF_PRECISION; // The precision used by the QUDA solver /** * The order of the input and output fermion fields. Imposes fieldOrder = * QUDA_OPENQCD_FIELD_ORDER in color_spinor_field.h and * QUDA_OPENQCD_FIELD_ORDER makes quda to instantiate OpenQCDDiracOrder. */ + param.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; // Gamma basis of the input and output host fields @@ -518,7 +522,7 @@ void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t d param.input_location = QUDA_CPU_FIELD_LOCATION; param.output_location = QUDA_CPU_FIELD_LOCATION; - //param.verbosity = QUDA_VERBOSE; + param.verbosity = QUDA_VERBOSE; param.inv_type = QUDA_GCR_INVERTER; // QUDA_CG_INVERTER param.tol = 1e-2; param.compute_true_res = true; @@ -550,13 +554,14 @@ void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t d void openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param) { QudaInvertParam invert_param = newOpenQCDSolverParam(dirac_param); + QudaInvertParam invert_param_mg = newOpenQCDSolverParam(dirac_param); QudaMultigridParam multigrid_param = newQudaMultigridParam(); //param.verbosity = QUDA_VERBOSE; invert_param.reliable_delta = 1e-5; invert_param.gcrNkrylov = 20; invert_param.maxiter = 2000; - invert_param.tol = 1e-5; + 
invert_param.tol = 1e-12; invert_param.inv_type = QUDA_GCR_INVERTER; invert_param.solution_type = QUDA_MAT_SOLUTION; invert_param.solve_type = QUDA_DIRECT_SOLVE; @@ -564,12 +569,26 @@ void openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_ invert_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; invert_param.inv_type_precondition = QUDA_MG_INVERTER; + invert_param_mg.reliable_delta = 1e-5; + invert_param_mg.gcrNkrylov = 20; + invert_param_mg.maxiter = 2000; + invert_param_mg.tol = 1e-12; + invert_param_mg.inv_type = QUDA_GCR_INVERTER; + invert_param_mg.solution_type = QUDA_MAT_SOLUTION; + invert_param_mg.solve_type = QUDA_DIRECT_SOLVE; + invert_param_mg.matpc_type = QUDA_MATPC_EVEN_EVEN; + invert_param_mg.solver_normalization = QUDA_DEFAULT_NORMALIZATION; + invert_param_mg.inv_type_precondition = QUDA_MG_INVERTER; + invert_param_mg.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + invert_param_mg.dirac_order = QUDA_DIRAC_ORDER; + // set the params, hard code the solver // parameters copied from recommended settings from Wiki multigrid_param.n_level = 2; multigrid_param.generate_all_levels = QUDA_BOOLEAN_TRUE; - multigrid_param.run_verify = QUDA_BOOLEAN_TRUE; - multigrid_param.invert_param = &invert_param; + multigrid_param.run_verify = QUDA_BOOLEAN_FALSE; + multigrid_param.invert_param = &invert_param_mg; + multigrid_param.compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES; // try setting minimal parameters - leave rest to default // level 0 fine @@ -578,29 +597,47 @@ void openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_ multigrid_param.geo_block_size[0][2] = 4; multigrid_param.geo_block_size[0][3] = 4; multigrid_param.n_vec[0] = 24; - multigrid_param.precision_null[0] = QUDA_HALF_PRECISION; + multigrid_param.spin_block_size[0] = 2; + multigrid_param.precision_null[0] = QUDA_HALF_PRECISION; multigrid_param.smoother[0] = QUDA_CA_GCR_INVERTER; + multigrid_param.smoother_tol[0] = 0.25; + 
multigrid_param.location[0] = QUDA_CUDA_FIELD_LOCATION; multigrid_param.nu_pre[0] = 0; multigrid_param.nu_post[0] = 8; multigrid_param.omega[0] = 0.8; multigrid_param.smoother_solve_type[0] = QUDA_DIRECT_PC_SOLVE; multigrid_param.cycle_type[0] = QUDA_MG_CYCLE_RECURSIVE; + multigrid_param.coarse_solver[0] = QUDA_GCR_INVERTER; + multigrid_param.coarse_solver_tol[0] = 0.25; + multigrid_param.coarse_solver_maxiter[0] = 50; + multigrid_param.coarse_grid_solution_type[0] = QUDA_MAT_SOLUTION; // level 1 coarse // no smoother required for innermost // so no blocks multigrid_param.precision_null[1] = QUDA_HALF_PRECISION; multigrid_param.coarse_solver[1] = QUDA_CA_GCR_INVERTER; + multigrid_param.smoother[1] = QUDA_CA_GCR_INVERTER; + multigrid_param.smoother_tol[1] = 0.25; + multigrid_param.spin_block_size[1] = 1; multigrid_param.coarse_solver_tol[1] = 0.25; multigrid_param.coarse_solver_maxiter[1] = 50; multigrid_param.coarse_grid_solution_type[1] = QUDA_MATPC_SOLUTION; multigrid_param.smoother_solve_type[1] = QUDA_DIRECT_PC_SOLVE; multigrid_param.cycle_type[1] = QUDA_MG_CYCLE_RECURSIVE; + multigrid_param.location[1] = QUDA_CUDA_FIELD_LOCATION; + multigrid_param.nu_pre[1] = 0; + multigrid_param.nu_post[1] = 8; + multigrid_param.omega[1] = 0.8; + PUSH_RANGE("newMultigridQuda",4); void *mgprec = newMultigridQuda(&multigrid_param); invert_param.preconditioner = mgprec; + POP_RANGE; + PUSH_RANGE("invertQUDA",5); invertQuda(static_cast(solution), static_cast(source), &invert_param); + POP_RANGE; destroyMultigridQuda(mgprec); From 9f52f30c778990baa3a869f1776025fd3f4f8364 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Fri, 22 Sep 2023 11:48:25 +0200 Subject: [PATCH 068/148] Add debugging information --- include/communicator_quda.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 4199ea9c8c..6e12484593 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -130,6 +130,9 @@ namespace 
quda coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i], comm_dims(topo)[i]) : 0; } + std::cout << ": " << coords[0] << " " << coords[1] << " " << coords[2] << + " " << coords[3] << " yields rank" << comm_rank_from_coords(topo, coords) << std::endl; + return comm_rank_from_coords(topo, coords); } @@ -365,13 +368,18 @@ namespace quda Topology *topology = topo ? topo : default_topo; // use default topology if topo is NULL if (!topology) { errorQuda("Topology not specified"); } + const int *rank_grid = comm_coords_from_rank(topology, comm_rank()); for (int d = 0; d < 4; ++d) { int pos_displacement[QUDA_MAX_DIM] = {}; int neg_displacement[QUDA_MAX_DIM] = {}; pos_displacement[d] = +1; neg_displacement[d] = -1; + std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] + << "," << rank_grid[2] << "," << rank_grid[3] << " negative " << d << std::endl; neighbor_rank[0][d] = comm_rank_displaced(topology, neg_displacement); + std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] + << "," << rank_grid[2] << "," << rank_grid[3] << " positive " << d << std::endl; neighbor_rank[1][d] = comm_rank_displaced(topology, pos_displacement); } neighbors_cached = true; From 61e49857f534096c3f9fcc7973dffbce15149448 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 25 Sep 2023 10:48:32 +0200 Subject: [PATCH 069/148] Polish code --- include/color_spinor.h | 6 ------ include/color_spinor_field.h | 2 +- include/color_spinor_field_order.h | 7 +++---- include/gamma.cuh | 30 +++++++++++++-------------- include/kernels/copy_color_spinor.cuh | 25 +++++++++++++--------- include/quda_openqcd_interface.h | 8 +++++++ lib/openqcd_interface.cpp | 29 +++++++++++++------------- 7 files changed, 56 insertions(+), 51 deletions(-) diff --git a/include/color_spinor.h b/include/color_spinor.h index 4f24e8d742..ceae596e04 100644 --- a/include/color_spinor.h +++ b/include/color_spinor.h @@ -789,12 +789,6 @@ namespace quda { case 1: // positive projector #pragma unroll for 
(int i = 0; i < Nc; i++) { - /* - recon[0] = input[0]; - recon[1] = input[1]; - recon[2] = input[0]; - recon[3] = input[0]; - */ recon(0, i) = t(0, i); recon(1, i) = t(1, i); recon(2, i) = t(0, i); diff --git a/include/color_spinor_field.h b/include/color_spinor_field.h index e525c99369..4fdbc1c02d 100644 --- a/include/color_spinor_field.h +++ b/include/color_spinor_field.h @@ -231,7 +231,7 @@ namespace quda siteOrder = QUDA_EVEN_ODD_SITE_ORDER; } else if (inv_param.dirac_order == QUDA_OPENQCD_DIRAC_ORDER) { fieldOrder = QUDA_OPENQCD_FIELD_ORDER; - siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // FIXME: SHOULD THIS BE LEXICOGRAPHICAL?, OR VIA FULL IMPLEMENTATION VIA IPT ARRAY IN QUDA + siteOrder = QUDA_EVEN_ODD_SITE_ORDER; // internal QUDA site order } else { errorQuda("Dirac order %d not supported", inv_param.dirac_order); } diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 48e0e6b3f6..e2eec4ccbe 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1724,7 +1724,6 @@ namespace quda * struct to define order of spinor fields in OpenQCD */ template struct OpenQCDDiracOrder { - using Accessor = OpenQCDDiracOrder; using real = typename mapper::type; using complex = complex; @@ -1806,11 +1805,11 @@ namespace quda } /** - * @brief Rotate corrdinates (xyzt -> txyz) + * @brief Rotate coordinates (xyzt -> txyz) * - * @param[in] x_quda Carthesian local lattice coordinates in quda + * @param[in] x_quda Cartesian local lattice coordinates in quda * convention (xyzt) - * @param[out] x_openQCD Carthesian local lattice coordinates in openQCD + * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD * convention (txyz) */ __device__ __host__ inline void rotate_coords(const int *x_quda, int *x_openQCD) const diff --git a/include/gamma.cuh b/include/gamma.cuh index ec0c7986c7..27961e7099 100644 --- a/include/gamma.cuh +++ b/include/gamma.cuh @@ -29,8 +29,8 @@ namespace quda { if (basis == 
QUDA_DEGRAND_ROSSI_GAMMA_BASIS || basis == QUDA_OPENQCD_GAMMA_BASIS) { switch(dir) { - case 0: /* gamma1 */ - case 1: /* gamma2 */ + case 0: + case 1: switch(row) { case 0: return 3; case 1: return 2; @@ -38,8 +38,8 @@ namespace quda { case 3: return 0; } break; - case 2: /* gamma3 */ - case 3: /* gamma0 */ + case 2: + case 3: switch(row) { case 0: return 2; case 1: return 3; @@ -47,7 +47,7 @@ namespace quda { case 3: return 1; } break; - case 4: /* gamma5 */ + case 4: switch(row) { case 0: return 0; case 1: return 1; @@ -204,7 +204,7 @@ namespace quda { } } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) { switch(dir) { - case 0: /* gamma1 */ + case 0: /* corresponds to gamma1 in OpenQCD convention */ switch(row) { case 0: case 1: @@ -214,7 +214,7 @@ namespace quda { return I; } break; - case 1: /* gamma2 */ + case 1: /* gamma2 in openQCD */ switch(row) { case 0: case 3: @@ -224,7 +224,7 @@ namespace quda { return 1; } break; - case 2: /* gamma3 */ + case 2: /* gamma3 in openQCD */ switch(row) { case 0: case 3: @@ -234,7 +234,7 @@ namespace quda { return I; } break; - case 3: /* gamma0 */ + case 3: /* gamma0 in openQCD */ switch(row) { case 0: case 1: @@ -243,7 +243,7 @@ namespace quda { return -1; } break; - case 4: /* gamma5 */ + case 4: /* gamma5 in openQCD */ switch(row) { case 0: case 1: @@ -340,30 +340,30 @@ namespace quda { } } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) { switch(dir) { - case 0: /* gamma1 */ + case 0: /* gamma1 in openQCD convention */ switch(row) { case 0: case 1: return complex(a.imag(), -a.real()); // I case 2: case 3: return complex(-a.imag(), a.real()); // -I } break; - case 1: /* gamma2 */ + case 1: /* gamma2 in openQCD */ switch(row) { case 0: case 3: return -a; case 1: case 2: return a; } break; - case 2: /* gamma3 */ + case 2: /* gamma3 in openQCD */ switch(row) { case 0: case 3: return complex(a.imag(), -a.real()); // I case 1: case 2: return complex(-a.imag(), a.real()); // -I } break; - case 3: /* gamma0 */ + case 3: /* gamma0 in 
openQCD */ switch(row) { case 0: case 1: case 2: case 3: return -a; } break; - case 4: /* gamma5 */ + case 4: /* gamma5 in openQCD */ switch(row) { case 0: case 1: return a; case 2: case 3: return -a; diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index 778bd36d9a..0d98b1a917 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -130,7 +130,14 @@ namespace quda { } }; - /** Transform from openqcd into non-relativistic basis */ + /** Transform from openqcd into non-relativistic basis (a.k.a UKQCD basis): + * gamma_ukqcd = U gamma_openqcd U^dagger with + * U = [-1 0 1 0] + [ 0 -1 0 1] + [ 1 0 1 0] + [ 0 1 0 1] / sqrt(2), + * see https://github.com/JeffersonLab/chroma/blob/master/docs/notes/gamma_conventions.tex + for further notes. */ template struct ReverseOpenqcdBasis { template @@ -145,10 +152,6 @@ namespace quda { static_cast(kP), static_cast(kP), static_cast(kP)}; - /* U = [-1 0 1 0] - [0 -1 0 1] - [1 0 1 0] - [0 1 0 1] / sqrt(2) */ for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); @@ -157,7 +160,13 @@ namespace quda { } }; - /** Transform from non-relativistic into openqcd basis */ + /** Transform from non-relativistic (aka ukqcd) into openqcd basis: + * gamma_ukqcd = U gamma_openqcd U^dagger with + * U = [-1 0 1 0] + * [ 0 -1 0 1] + * [ 1 0 1 0] + * [ 0 1 0 1] / sqrt(2) + */ template struct OpenqcdBasis { template @@ -173,10 +182,6 @@ namespace quda { static_cast(kU), static_cast(kU), static_cast(kU)}; - /* U = [-1 0 1 0] - [ 0 -1 0 1] - [1 0 1 0] - [ 0 1 0 1] / sqrt(2) */ for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 193717efa9..b94a6e0f10 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -27,6 +27,7 @@ typedef struct { int nproc_blk[4]; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK 
*/ int N[4]; /** Glocal lattice dimensions N0, N1, N2, N3 */ int device; /** GPU device number */ + int cstar; /** number of cstar directions */ } openQCD_QudaLayout_t; @@ -86,6 +87,13 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout); */ void openQCD_qudaFinalize(void); + +/** + * Copy a spinor to GPU and back to CPU. + * + * @param[in] h_in Spinor input field (from openQCD) + * @param[out] h_out Spinor output field + */ void openQCD_back_and_forth(void *h_in, void *h_out); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 06df29256b..f074dde697 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -13,6 +13,11 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) + +static openQCD_QudaState_t qudaState = {false, false, false, false, {}, {}}; + +using namespace quda; + // code for NVTX taken from Jiri Kraus' blog post: // http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ @@ -46,10 +51,6 @@ static const int num_colors = sizeof(colors) / sizeof(uint32_t); #define POP_RANGE #endif -static openQCD_QudaState_t qudaState = {false, false, false, false, {}, {}}; - -using namespace quda; - template void inline qudaopenqcd_called(const char *func, QudaVerbosity verb) { // add NVTX markup if enabled @@ -394,19 +395,12 @@ static QudaInvertParam newOpenQCDSolverParam(openQCD_QudaDiracParam_t p) } -/** - * @brief Calculates the norm of a spinor. 
- * - * @param[in] h_in input spinor of type spinor_dble[NSPIN] - * - * @return norm - */ void openQCD_back_and_forth(void *h_in, void *h_out) { QudaInvertParam param = newOpenQCDParam(); - ColorSpinorParam cpuParam(h_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); - ColorSpinorField in_h(cpuParam); + ColorSpinorParam cpuParam_in(h_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam_in); ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); ColorSpinorField in(cudaParam); @@ -480,8 +474,13 @@ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) case 4: case 5: gamma5(out, in); - /* gamma5_openqcd = -1 * gamma5_ukqcd */ - blas::caxpby(Complex(-1.0, 0.0), out, 0.0, out); + /* UKQCD uses a different convention for Gamma matrices: + * gamma5_ukqcd = gammax gammay gammaz gammat, + * gamma5_openqcd = gammat gammax gammay gammaz, + * and thus + * gamma5_openqcd = -1 * U gamma5_ukqcd U^dagger, + * with U the transformation matrix from OpenQCD to UKQCD. 
*/ + blas::ax(-1.0, out); break; default: errorQuda("Unknown gamma: %d\n", dir); From ef78041fc0a443428d13545f8a0f2d4872aac9a4 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 25 Sep 2023 10:58:51 +0200 Subject: [PATCH 070/148] Change back and forth test --- lib/openqcd_interface.cpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index f074dde697..8438bdfb24 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -397,18 +397,26 @@ static QudaInvertParam newOpenQCDSolverParam(openQCD_QudaDiracParam_t p) void openQCD_back_and_forth(void *h_in, void *h_out) { + // sets up the necessary parameters QudaInvertParam param = newOpenQCDParam(); - ColorSpinorParam cpuParam_in(h_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); - ColorSpinorField in_h(cpuParam_in); + // creates a field on the CPU + ColorSpinorParam cpuParam(h_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam); + // creates a field on the GPU with the same parameter set as the CPU field ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); ColorSpinorField in(cudaParam); - ColorSpinorParam cpuParam_out(h_out, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); - ColorSpinorField out_h(cpuParam_out); - + // transfer the CPU field to GPU in = in_h; + + // creates a field on the CPU + cpuParam.v = h_out; + cpuParam.location = QUDA_CPU_FIELD_LOCATION; + ColorSpinorField out_h(cpuParam); + + // transfer the GPU field back to CPU out_h = in; } From 29da4688d5f243f60f9149ad32c45452e915df07 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 25 Sep 2023 11:20:52 +0200 Subject: [PATCH 071/148] Modify back and forth test --- lib/openqcd_interface.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 8c8477a14a..6aaaad5f20 100644 --- 
a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -416,8 +416,15 @@ void openQCD_back_and_forth(void *h_in, void *h_out) cpuParam.location = QUDA_CPU_FIELD_LOCATION; ColorSpinorField out_h(cpuParam); + // creates a zero-field on the GPU + cudaParam.create = QUDA_NULL_FIELD_CREATE; + cudaParam.location = QUDA_CUDA_FIELD_LOCATION; + ColorSpinorField out(cudaParam); + + out = in; + // transfer the GPU field back to CPU - out_h = in; + out_h = out; } From 8ababe5af8118b5b94fb97d7801f9181208179b4 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 25 Sep 2023 11:49:16 +0200 Subject: [PATCH 072/148] Modify printfQuda --- lib/openqcd_interface.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 6aaaad5f20..5e6f431a8a 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -542,12 +542,13 @@ double openQCD_qudaGCR(void *source, void *solution, invertQuda(static_cast(solution), static_cast(source), ¶m); - printfQuda("true_res = %.2e\n", param.true_res); + printfQuda("true_res = %e\n", param.true_res); printfQuda("true_res_hq = %.2e\n", param.true_res_hq); printfQuda("iter = %d\n", param.iter); printfQuda("gflops = %.2e\n", param.gflops); printfQuda("secs = %.2e\n", param.secs); - printfQuda("Nsteps = %d\n", param.Nsteps); + /* this is not properly set */ + /* printfQuda("Nsteps = %d\n", param.Nsteps); */ return param.true_res; } From 4a9ba76b04f6c0311b79a21300e48ab7459d902f Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 25 Sep 2023 12:27:43 +0200 Subject: [PATCH 073/148] Return true_res and disable half precision in multigrid --- include/quda_openqcd_interface.h | 2 +- lib/openqcd_interface.cpp | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 4a1c1ea45e..1567ee71fe 100644 --- a/include/quda_openqcd_interface.h +++ 
b/include/quda_openqcd_interface.h @@ -166,7 +166,7 @@ void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t d * @param[out] solution Solution spinor field * @param[in] dirac_param Dirac parameter struct */ -void openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param); +double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param); /** diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 0aefe59fbc..376c912617 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -172,9 +172,11 @@ static QudaInvertParam newOpenQCDParam(void) param.cpu_prec = QUDA_DOUBLE_PRECISION; // The precision used by the input fermion fields param.cuda_prec = QUDA_DOUBLE_PRECISION; // The precision used by the QUDA solver - /* TH added for MG support */ - param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; // The precision used by the QUDA solver - param.cuda_prec_precondition = QUDA_HALF_PRECISION; // The precision used by the QUDA solver + + /* AA: This breaks GCR */ + // /* TH added for MG support */ + // param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; // The precision used by the QUDA solver + // param.cuda_prec_precondition = QUDA_HALF_PRECISION; // The precision used by the QUDA solver /** * The order of the input and output fermion fields. 
Imposes fieldOrder = @@ -595,7 +597,7 @@ void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t d printfQuda("Nsteps = %d\n", param.Nsteps); } -void openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param) +double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param) { QudaInvertParam invert_param = newOpenQCDSolverParam(dirac_param); QudaInvertParam invert_param_mg = newOpenQCDSolverParam(dirac_param); @@ -685,9 +687,11 @@ void openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_ destroyMultigridQuda(mgprec); - printfQuda("true_res = %.2e\n", invert_param.true_res); + printfQuda("true_res = %e\n", invert_param.true_res); printfQuda("true_res_hq = %.2e\n", invert_param.true_res_hq); printfQuda("iter = %d\n", invert_param.iter); printfQuda("gflops = %.2e\n", invert_param.gflops); printfQuda("secs = %.2e\n", invert_param.secs); + + return invert_param.true_res; } From 1cac7c0c54e4cf563b15fdef981128cc9f22b7bc Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 25 Sep 2023 12:51:32 +0200 Subject: [PATCH 074/148] Remove qudaInvert --- include/quda_openqcd_interface.h | 11 --------- lib/openqcd_interface.cpp | 38 -------------------------------- 2 files changed, 49 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 1567ee71fe..54199b68ef 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -146,17 +146,6 @@ double openQCD_qudaGCR(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param, openQCD_QudaGCRParam_t gcr_param); -/** - * Solve Ax=b for an Clover Wilson operator. All fields are fields passed and - * returned are host (CPU) field in openQCD order. This function requires that - * persistent gauge and clover fields have been created prior. 
- * - * @param[in] source Right-hand side source field - * @param[out] solution Solution spinor field - * @param[in] dirac_param Dirac parameter struct - */ -void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param); - /** * Solve Ax=b for an Clover Wilson operator with a multigrid solver. All fields are fields passed and * returned are host (CPU) field in openQCD order. This function requires that diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 376c912617..cbb05df4a8 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -559,44 +559,6 @@ double openQCD_qudaGCR(void *source, void *solution, return param.true_res; } - -void openQCD_qudaInvert(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param) -{ - QudaInvertParam param = newOpenQCDSolverParam(dirac_param); - - // both fields reside on the CPU - param.input_location = QUDA_CPU_FIELD_LOCATION; - param.output_location = QUDA_CPU_FIELD_LOCATION; - - param.verbosity = QUDA_VERBOSE; - param.inv_type = QUDA_GCR_INVERTER; // QUDA_CG_INVERTER - param.tol = 1e-2; - param.compute_true_res = true; - param.maxiter = 100; - - param.gcrNkrylov = 20; - param.reliable_delta = 1e-5; - - param.solution_type = QUDA_MAT_SOLUTION; - param.solve_type = QUDA_DIRECT_SOLVE; - param.matpc_type = QUDA_MATPC_EVEN_EVEN; - param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; - param.inv_type_precondition = QUDA_INVALID_INVERTER; // disables any preconditioning - - // both fields reside on the CPU - param.input_location = QUDA_CPU_FIELD_LOCATION; - param.output_location = QUDA_CPU_FIELD_LOCATION; - - invertQuda(static_cast(solution), static_cast(source), ¶m); - - printfQuda("true_res = %.2e\n", param.true_res); - printfQuda("true_res_hq = %.2e\n", param.true_res_hq); - printfQuda("iter = %d\n", param.iter); - printfQuda("gflops = %.2e\n", param.gflops); - printfQuda("secs = %.2e\n", param.secs); - printfQuda("Nsteps = %d\n", param.Nsteps); -} - 
double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param) { QudaInvertParam invert_param = newOpenQCDSolverParam(dirac_param); From ed254d5b888cd483073a482aa1f24e66854d17b3 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 25 Sep 2023 15:20:41 +0200 Subject: [PATCH 075/148] Add comment about Precision for multigrid --- include/quda_openqcd_interface.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 54199b68ef..81388c693e 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -150,6 +150,8 @@ double openQCD_qudaGCR(void *source, void *solution, * Solve Ax=b for an Clover Wilson operator with a multigrid solver. All fields are fields passed and * returned are host (CPU) field in openQCD order. This function requires that * persistent gauge and clover fields have been created prior. + * + * Requires QUDA_PRECISION & 2 != 0, e.g. QUDA_PRECISON = 14 * * @param[in] source Right-hand side source field * @param[out] solution Solution spinor field From febf4fb3ba6b26b7c555f3d30c83fee4c3860465 Mon Sep 17 00:00:00 2001 From: Tim Harris Date: Tue, 26 Sep 2023 09:56:31 +0200 Subject: [PATCH 076/148] Moved init of cuda_prec_sloppy into openQCD_qudaMultigrid. 
--- lib/openqcd_interface.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index cbb05df4a8..cf996ef555 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -576,6 +576,8 @@ double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracPara invert_param.matpc_type = QUDA_MATPC_EVEN_EVEN; invert_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; invert_param.inv_type_precondition = QUDA_MG_INVERTER; + invert_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; // The precision used by the QUDA solver + invert_param.cuda_prec_precondition = QUDA_HALF_PRECISION; // The precision used by the QUDA solver invert_param_mg.reliable_delta = 1e-5; invert_param_mg.gcrNkrylov = 20; From 83ce801a7cb98d37635b572647d920a0031e464d Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 27 Sep 2023 17:50:58 +0200 Subject: [PATCH 077/148] added clover field load function to load cloer field from openQCD --- include/clover_field_order.h | 167 +++++++++++++++++++++++++++++ include/color_spinor_field_order.h | 25 +++-- include/enum_quda.h | 1 + include/enum_quda_fortran.h | 1 + include/quda_openqcd_interface.h | 12 ++- lib/copy_clover.cu | 6 ++ lib/copy_clover_offset.cu | 10 ++ lib/openqcd_interface.cpp | 30 +++--- 8 files changed, 217 insertions(+), 35 deletions(-) diff --git a/include/clover_field_order.h b/include/clover_field_order.h index 7c17b25b51..cfee2bf73a 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -1015,6 +1016,172 @@ namespace quda { size_t Bytes() const { return length*sizeof(Float); } }; + /** + * OpenQCD ordering for clover fields + */ + template struct OpenQCDOrder { + static constexpr bool enable_reconstruct = false; + typedef typename mapper::type RegType; + Float *clover; + const int volumeCB; + const QudaTwistFlavorType twist_flavor; + const Float mu2; + 
const Float epsilon2; + const int L[4]; + const double coeff; + const double csw; + const double kappa; + + OpenQCDOrder(const CloverField &clover, bool inverse, Float *clover_ = nullptr, void * = nullptr) : + volumeCB(clover.Stride()), + twist_flavor(clover.TwistFlavor()), + mu2(clover.Mu2()), + epsilon2(clover.Epsilon2()), + L {clover.X()[0], clover.X()[1], clover.X()[2], clover.X()[3]}, // local dimensions (xyzt) + coeff(clover.Coeff()), + csw(clover.Csw()), + kappa(clover.Coeff()/clover.Csw()) + { + if (clover.Order() != QUDA_OPENQCD_CLOVER_ORDER) { + errorQuda("Invalid clover order %d for this accessor", clover.Order()); + } + this->clover = clover_ ? clover_ : (Float *)(clover.V(inverse)); + if (clover.Coeff() == 0.0 || clover.Csw() == 0.0) { + errorQuda("Neither coeff nor csw may be zero!"); + } + } + + QudaTwistFlavorType TwistFlavor() const { return twist_flavor; } + Float Mu2() const { return mu2; } + Float Epsilon2() const { return epsilon2; } + + /** + * @brief Pure function to return ipt[iy], where + * iy=x3+L3*x2+L2*L3*x1+L1*L2*L3*x0 without accessing the + * ipt-array, but calculating the index on the fly. Notice + * that xi and Li are in openQCD (txyz) convention. If they + * come from QUDA, you have to rotate them first. 
+ * + * @param[in] x Carthesian local lattice corrdinates, 0 <= x[i] < + * Li + * + * @return ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0] = the local flat index + * of openQCD + */ + __device__ __host__ inline int ipt(int *x) const + { + int xb[4], xn[4], ib, in, is, cbs[4], mu, L_[4]; + + rotate_coords(L, L_); // L_ local lattice dimensions in openQCD format (txyz) + + /* cache_block */ + for (mu=1;mu<4;mu++) { + if ((L[mu]%4)==0) { + cbs[mu]=4; + } else if ((L[mu]%3)==0) { + cbs[mu]=3; + } else if ((L[mu]%2)==0) { + cbs[mu]=2; + } else { + cbs[mu]=1; + } + } + + xb[0] = x[0]; + xb[1] = x[1] % cbs[1]; + xb[2] = x[2] % cbs[2]; + xb[3] = x[3] % cbs[3]; + + xn[1] = x[1]/cbs[1]; + xn[2] = x[2]/cbs[2]; + xn[3] = x[3]/cbs[3]; + + /** + * This is essentially what cbix[...] does. + * Notice integer division; truncated towards zero, i.e. 5/2=2 + */ + ib = (xb[3] + cbs[3]*xb[2] + cbs[2]*cbs[3]*xb[1] + cbs[1]*cbs[2]*cbs[3]*xb[0])/2; + + in = xn[3] + (L_[3]/cbs[3])*xn[2] + (L_[3]/cbs[3])*(L_[2]/cbs[2])*xn[1]; + is = x[0] + x[1] + x[2] + x[3]; + + return ib + (L_[0]*cbs[1]*cbs[2]*cbs[3]*in)/2 + (is%2)*(L_[0]*L_[1]*L_[2]*L_[3]/2); + } + + /** + * @brief Rotate coordinates (xyzt -> txyz) + * + * @param[in] x_quda Cartesian local lattice coordinates in quda + * convention (xyzt) + * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD + * convention (txyz) + */ + __device__ __host__ inline void rotate_coords(const int *x_quda, int *x_openQCD) const + { + x_openQCD[1] = x_quda[0]; + x_openQCD[2] = x_quda[1]; + x_openQCD[3] = x_quda[2]; + x_openQCD[0] = x_quda[3]; + } + + /** + * @brief Gets the offset in Floats from the openQCD base pointer to + * the spinor field. + * + * @param[in] x Checkerboard index coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. 
+ */ + __device__ __host__ inline int getCloverOffset(int x, int parity) const + { + int coord_quda[4], coord_openQCD[4]; + + /* coord_quda contains xyzt local Carthesian corrdinates */ + getCoords(coord_quda, x, L, parity); + rotate_coords(coord_quda, coord_openQCD); /* xyzt -> txyz */ + + return ipt(coord_openQCD)*length; + } + + /** + * @brief Load a clover field at lattice point x + * + * @param v The output clover matrix in QUDA order + * @param x The checkerboarded lattice site + * @param parity The parity of the lattice site + */ + __device__ __host__ inline void load(RegType v[length], int x, int parity) const { + int sign[36] = {-1,-1,-1,-1,-1,-1, // diagonals (idx 0-5) + -1,+1,-1,+1,-1,-1,-1,-1,-1,-1, // column 0 (idx 6-15) + -1,+1,-1,-1,-1,-1,-1,-1, // column 1 (idx 16-23) + -1,-1,-1,-1,-1,-1, // column 2 (idx 24-29) + -1,+1,-1,+1, // column 3 (idx 30-33) + -1,+1}; // column 4 (idx 34-35) + int map[36] = {0,1,2,3,4,5, + 6,7,8,9,10,11,18,19,24,25, + 16,17,12,13,20,21,26,27, + 14,15,22,23,28,29, + 30,31,32,33, + 34,35}; + const int M = length/2; + int offset = getCloverOffset(x, parity); + auto Ap = &clover[offset]; // A_+ + auto Am = &clover[offset + M]; // A_- + +#pragma unroll + for (int i = 0; i < M; i++) { + v[ i] = sign[i]*(kappa*Am[map[i]] - (i<6)); + v[M + i] = sign[i]*(kappa*Ap[map[i]] - (i<6)); + } + } + + // FIXME implement the save routine for OpenQCD ordered fields + __device__ __host__ inline void save(RegType[length], int, int) const { } + + size_t Bytes() const { return length*sizeof(Float); } + }; + } // namespace clover // Use traits to reduce the template explosion diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index e2eec4ccbe..6cf52ed0ff 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1751,19 +1751,18 @@ namespace quda } } - - /** - * @brief Pure function to return ipt[iy], where - * iy=x3+L3*x2+L2*L3*x1+L1*L2*L3*x0 without accessing the - * ipt-array, 
but calculating the index on the fly. Notice that - * xi and Li are in openQCD (txyz) convention. If they come - * from QUDA, you have to rotate them first. - * - * @param[in] x Carthesian local lattice corrdinates, 0 <= x[i] < Li - * - * @return ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0] = the local flat index of - * openQCD - */ + /** + * @brief Pure function to return ipt[iy], where + * iy=x3+L3*x2+L2*L3*x1+L1*L2*L3*x0 without accessing the + * ipt-array, but calculating the index on the fly. Notice that + * xi and Li are in openQCD (txyz) convention. If they come + * from QUDA, you have to rotate them first. + * + * @param[in] x Carthesian local lattice corrdinates, 0 <= x[i] < Li + * + * @return ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0] = the local flat index of + * openQCD + */ __device__ __host__ inline int ipt(int *x) const { int xb[4], xn[4], ib, in, is, cbs[4], mu, L_[4]; diff --git a/include/enum_quda.h b/include/enum_quda.h index 167e36bb57..9d8e188406 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -262,6 +262,7 @@ typedef enum QudaCloverFieldOrder_s { QUDA_PACKED_CLOVER_ORDER, // even-odd, QDP packed QUDA_QDPJIT_CLOVER_ORDER, // (diagonal / off-diagonal)-chirality-spacetime QUDA_BQCD_CLOVER_ORDER, // even-odd, super-diagonal packed and reordered + QUDA_OPENQCD_CLOVER_ORDER, // openqcd QUDA_INVALID_CLOVER_ORDER = QUDA_INVALID_ENUM } QudaCloverFieldOrder; diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index d6b954d0bf..dcc7f0410f 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -244,6 +244,7 @@ #define QUDA_PACKED_CLOVER_ORDER 9 // even-odd packed #define QUDA_QDPJIT_CLOVER_ORDER 10 // lexicographical order packed #define QUDA_BQCD_CLOVER_ORDER 11 // BQCD order which is a packed super-diagonal form +#define QUDA_OPENQCD_CLOVER_ORDER 12 // openqcd #define QUDA_INVALID_CLOVER_ORDER QUDA_INVALID_ENUM #define QudaVerbosity integer(4) diff --git a/include/quda_openqcd_interface.h 
b/include/quda_openqcd_interface.h index 81388c693e..50c7447bde 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -175,7 +175,7 @@ double openQCD_qudaPlaquette(void); * @brief Load the gauge fields from host to quda. * * @param[in] gauge The gauge fields (in openqcd order) - * @param[in] prec Precision of the gauge field + * @param[in] prec Precision of the incoming gauge field */ void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec); @@ -184,7 +184,7 @@ void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec); * @brief Save the gauge fields from quda to host. * * @param[out] gauge The gauge fields (will be stored in openqcd order) - * @param[in] prec Precision of the gauge field + * @param[in] prec Precision of the outgoing gauge field */ void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec); @@ -198,9 +198,13 @@ void openQCD_qudaGaugeFree(void); /** * @brief Load the clover fields from host to quda. * - * @param[in] clover The clover fields (in openqcd order) + * @param[in] clover The clover fields (in openqcd order) + * @param[in] kappa The kappa (we need this, because quda has its clover + * field multiplied by kappa and we have to reverse this + * when loading ours) + * @param[in] csw The csw coefficient of the clover field */ -void openQCD_qudaCloverLoad(void *clover); +void openQCD_qudaCloverLoad(void *clover, double kappa, double csw); /** diff --git a/lib/copy_clover.cu b/lib/copy_clover.cu index 25d6b06ff5..126650f0e6 100644 --- a/lib/copy_clover.cu +++ b/lib/copy_clover.cu @@ -129,6 +129,12 @@ namespace quda { copyClover, FloatOut, FloatIn>(out, in, inverse, location, Out, In); #else errorQuda("BQCD interface has not been built\n"); +#endif + } else if (in.Order() == QUDA_OPENQCD_CLOVER_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + copyClover, FloatOut, FloatIn>(out, in, inverse, location, Out, In); +#else + errorQuda("OpenQCD interface has not been built\n"); #endif } else { errorQuda("Clover field %d 
order not supported", in.Order()); diff --git a/lib/copy_clover_offset.cu b/lib/copy_clover_offset.cu index 1300082c24..58301245b3 100644 --- a/lib/copy_clover_offset.cu +++ b/lib/copy_clover_offset.cu @@ -45,6 +45,16 @@ namespace quda CopyFieldOffset copier(arg, in); #else errorQuda("BQCD interface has not been built\n"); +#endif + } else if (in.Order() == QUDA_OPENQCD_CLOVER_ORDER) { +#ifdef BUILD_OPENQCD_INTERFACE + using C = OpenQCDOrder; + C out_accessor(out, inverse); + C in_accessor(in, inverse); + CopyFieldOffsetArg arg(out_accessor, out, in_accessor, in, offset); + CopyFieldOffset copier(arg, in); +#else + errorQuda("OpenQCD interface has not been built\n"); #endif } else { errorQuda("Clover field %d order not supported", in.Order()); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index cbb05df4a8..bde026aced 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -253,22 +253,8 @@ double openQCD_qudaPlaquette(void) return 0.0; } - /*QudaGaugeObservableParam obsParam = newQudaGaugeObservableParam(); - obsParam.compute_plaquette = QUDA_BOOLEAN_TRUE; - obsParam.remove_staggered_phase = QUDA_BOOLEAN_FALSE; - gaugeObservablesQuda(&obsParam); - - // Note different Nc normalization! - plaq[0] = obsParam.plaquette[0]; - plaq[1] = obsParam.plaquette[1]; - plaq[2] = obsParam.plaquette[2];*/ - plaqQuda(plaq); -/* plaq[1] *= 3.0; - plaq[2] *= 3.0; - plaq[0] *= 3.0;*/ - // Note different Nc normalization wrt openQCD! 
return 3.0*plaq[0]; } @@ -306,11 +292,19 @@ void openQCD_qudaGaugeFree(void) } -void openQCD_qudaCloverLoad(void *clover) +void openQCD_qudaCloverLoad(void *clover, double kappa, double csw) { - /*QudaInvertParam qudaCloverParam = newOpenQCDCloverParam(); - loadCloverQuda(clover, NULL, &qudaCloverParam);*/ - errorQuda("openQCD_qudaCloverLoad() is not implemented yet."); + QudaInvertParam param = newOpenQCDParam(); + param.clover_order = QUDA_OPENQCD_CLOVER_ORDER; + param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; + param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; + param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; + + param.kappa = kappa; + param.clover_csw = csw; + param.clover_coeff = 0.0; + + loadCloverQuda(clover, NULL, ¶m); qudaState.clover_loaded = true; } From 0c9bcf07056317a3351d0bd447e9bb8c185cfc40 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 27 Sep 2023 17:51:27 +0200 Subject: [PATCH 078/148] we need csw and coeff to determine kappa in clover_field_order.h OpenQCDOrder --- include/clover_field.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/clover_field.h b/include/clover_field.h index d4b0dff107..50accd9093 100644 --- a/include/clover_field.h +++ b/include/clover_field.h @@ -140,6 +140,8 @@ namespace quda { inverse(param.inverse), clover(param.clover), cloverInv(param.cloverInv), + csw(param.csw), + coeff(param.coeff), twist_flavor(param.twist_flavor), mu2(param.mu2), epsilon2(param.epsilon2), From 3a0012c58aec2b6481368152d9e0f4c5ba756db9 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 29 Sep 2023 15:30:17 +0200 Subject: [PATCH 079/148] first attempt for no loads --- include/quda_openqcd_interface.h | 9 +++++- lib/openqcd_interface.cpp | 51 ++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 1 deletion(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 50c7447bde..f59a667b64 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -105,6 +105,7 
@@ void openQCD_back_and_forth(void *h_in, void *h_out); * @return The norm */ double openQCD_qudaNorm(void *h_in); +double openQCD_qudaNorm_NoLoads(void *d_in); /** @@ -117,7 +118,12 @@ double openQCD_qudaNorm(void *h_in); * @param[in] openQCD_in of type spinor_dble[NSPIN] * @param[out] openQCD_out of type spinor_dble[NSPIN] */ -void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out); +void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out); + + +void* openQCD_qudaH2D(void *openQCD_field); +void openQCD_qudaD2H(void *quda_field, void *openQCD_field); +void openQCD_qudaSpinorFree(void** quda_field); /** @@ -129,6 +135,7 @@ void openQCD_qudaGamma(int dir, void *openQCD_in, void *openQCD_out); * @param[in] p Dirac parameter struct */ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p); +void openQCD_qudaDw_NoLoads(void *src, void *dst, openQCD_QudaDiracParam_t p); /** diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 23b2630b25..12b8b32c62 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -450,6 +450,11 @@ double openQCD_qudaNorm(void *h_in) return blas::norm2(in); } +double openQCD_qudaNorm_NoLoads(void *d_in) +{ + return blas::norm2(*reinterpret_cast(d_in)); +} + void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) { @@ -511,6 +516,52 @@ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) } +void* openQCD_qudaH2D(void *openQCD_field) +{ + // sets up the necessary parameters + QudaInvertParam param = newOpenQCDParam(); + + // creates a field on the CPU + ColorSpinorParam cpuParam(openQCD_field, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField in_h(cpuParam); + + // creates a field on the GPU with the same parameter set as the CPU field + ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); + ColorSpinorField *in = new ColorSpinorField(cudaParam); + + *in = in_h; // transfer the CPU 
field to GPU + + return in; +} + + +void openQCD_qudaSpinorFree(void** quda_field) +{ + delete reinterpret_cast(*quda_field); + *quda_field = nullptr; +} + +void openQCD_qudaD2H(void *quda_field, void *openQCD_field) +{ + // sets up the necessary parameters + QudaInvertParam param = newOpenQCDParam(); + + // creates a field on the CPU + ColorSpinorParam cpuParam(openQCD_field, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); + ColorSpinorField out_h(cpuParam); + + ColorSpinorField* in = reinterpret_cast(quda_field); + ColorSpinorField out(*in); + + out_h = out; // transfer the GPU field to CPU +} + + +void openQCD_qudaDw_NoLoads(void *src, void *dst, openQCD_QudaDiracParam_t p) +{ +} + + void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) { QudaInvertParam param = newOpenQCDDiracParam(p); From 14baf0f45666b9c025d2c7b32aaf37594d94a58a Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Fri, 22 Sep 2023 11:45:23 +0200 Subject: [PATCH 080/148] Debug cstar error --- include/communicator_quda.h | 18 ++++++++++++++---- lib/comm_common.cpp | 10 ++++++---- lib/communicator_mpi.cpp | 4 ++-- lib/interface_quda.cpp | 2 +- lib/openqcd_interface.cpp | 4 +++- tests/utils/host_utils.cpp | 4 ++-- 6 files changed, 28 insertions(+), 14 deletions(-) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 6e12484593..940bcc87b3 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -40,6 +40,7 @@ namespace quda int (*coords)[QUDA_MAX_DIM]; int my_rank; int my_coords[QUDA_MAX_DIM]; + int cstar[QUDA_MAX_DIM]; // It might be worth adding communicators to allow for efficient reductions: // #if defined(MPI_COMMS) // MPI_Comm comm; @@ -126,8 +127,12 @@ namespace quda { int coords[QUDA_MAX_DIM]; - for (int i = 0; i < QUDA_MAX_DIM; i++) { - coords[i] = (i < topo->ndim) ? 
mod(comm_coords(topo)[i] + displacement[i], comm_dims(topo)[i]) : 0; + int Nx_displacement = 0; + for (int i = QUDA_MAX_DIM-1; i >=0; i--) { + if(topo->cstar[i]==1 && i < topo->ndim){ + Nx_displacement += ((comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i])/comm_dims(topo)[i] -1) * (comm_dims(topo)[0]/2); + } + coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i] + (i==0 ? Nx_displacement :0), comm_dims(topo)[i]) : 0; } std::cout << ": " << coords[0] << " " << coords[1] << " " << coords[2] << @@ -258,7 +263,12 @@ namespace quda const int gpuid = comm_gpuid(); comm_set_neighbor_ranks(); - + for (int dir = 0; dir < 2; ++dir) { // forward/backward directions + for (int dim = 0; dim < 4; ++dim) { + printfQuda("my (%i):neighbors in dim/dir %i/%i: %i\n",comm_rank(),dim,dir,comm_neighbor_rank(dir, dim)); + } + } + char *hostname = comm_hostname(); int *gpuid_recv_buf = (int *)safe_malloc(sizeof(int) * comm_size()); @@ -524,7 +534,7 @@ namespace quda void comm_init_common(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data) { - Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data, comm_rank()); + Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data, comm_rank()); comm_set_default_topology(topo); // determine which GPU this rank will use diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index abecd4822b..c58b788350 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -106,8 +106,9 @@ namespace quda int nodes = 1; for (int i = 0; i < ndim; i++) { - topo->dims[i] = dims[i]; - nodes *= dims[i]; + topo->dims[i] = abs(dims[i]); + topo->cstar[i] = dims[i] < 0 ? 
1:0; + nodes *= topo->dims[i]; } topo->ranks = new int[nodes]; @@ -118,9 +119,10 @@ namespace quda do { int rank = rank_from_coords(x, map_data); - topo->ranks[index(ndim, dims, x)] = rank; + topo->ranks[index(ndim, topo->dims, x)] = rank; + if(rank<0) errorQuda("rank <0"); for (int i = 0; i < ndim; i++) { topo->coords[rank][i] = x[i]; } - } while (advance_coords(ndim, dims, x)); + } while (advance_coords(ndim, topo->dims, x)); topo->my_rank = my_rank; for (int i = 0; i < ndim; i++) { topo->my_coords[i] = topo->coords[my_rank][i]; } diff --git a/lib/communicator_mpi.cpp b/lib/communicator_mpi.cpp index 61869c5811..5e4c0bed9b 100644 --- a/lib/communicator_mpi.cpp +++ b/lib/communicator_mpi.cpp @@ -52,7 +52,7 @@ namespace quda } comm_init(nDim, commDims, rank_from_coords, map_data); - globalReduce.push(true); + globalReduce.push(true); } Communicator::Communicator(Communicator &other, const int *comm_split) : globalReduce(other.globalReduce) @@ -113,7 +113,7 @@ namespace quda MPI_CHECK(MPI_Comm_size(MPI_COMM_HANDLE, &size)); int grid_size = 1; - for (int i = 0; i < ndim; i++) { grid_size *= dims[i]; } + for (int i = 0; i < ndim; i++) { grid_size *= abs(dims[i]); } if (grid_size != size) { errorQuda("Communication grid size declared via initCommsGridQuda() does not match" " total number of MPI ranks (%d != %d)", diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 913082b001..7745de703e 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -366,7 +366,7 @@ void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata map_data.ndim = nDim; for (int i=0; i= 0; i--) { rank = gridsize_from_cmdline[i] * rank + coords[i]; } + for (int i = 2; i >= 0; i--) { rank = abs(gridsize_from_cmdline[i]) * rank + coords[i]; } return rank; } From 1f7fb9af23c6ecde4a487acc0ac1f27f51f26a17 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Fri, 22 Sep 2023 11:45:23 +0200 Subject: [PATCH 081/148] Debug cstar error --- 
include/communicator_quda.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 940bcc87b3..3cba04ee4c 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -134,7 +134,6 @@ namespace quda } coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i] + (i==0 ? Nx_displacement :0), comm_dims(topo)[i]) : 0; } - std::cout << ": " << coords[0] << " " << coords[1] << " " << coords[2] << " " << coords[3] << " yields rank" << comm_rank_from_coords(topo, coords) << std::endl; From ac3d0d0690e12a9ffbdc407e04af6e46dc28a637 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 25 Sep 2023 12:59:45 +0200 Subject: [PATCH 082/148] Use layout.cstar for setting cstar --- lib/openqcd_interface.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index d0b19cecd6..39b015cbf5 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -131,8 +131,15 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) errorQuda("Error: Odd lattice dimensions are not supported\n"); exit(1); } - mynproc[dir] = (dir==2 || dir==1) ? -layout.nproc[dir] : layout.nproc[dir]; + mynproc[dir] = layout.nproc[dir]; } + if(layout.cstar > 1) { + mynproc[1] *= -1; /* y direction */ + } + if(layout.cstar > 2) { + mynproc[2] *= -1; /* z direction */ + } +} #ifdef MULTI_GPU // TODO: would we ever want to run with QMP COMMS? 
From e7eaaa9500da52516a8d79ab8324b1e7fe178d01 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 25 Sep 2023 13:07:42 +0200 Subject: [PATCH 083/148] Fix } --- lib/openqcd_interface.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 39b015cbf5..d1964d812e 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -139,7 +139,6 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) if(layout.cstar > 2) { mynproc[2] *= -1; /* z direction */ } -} #ifdef MULTI_GPU // TODO: would we ever want to run with QMP COMMS? From abdcbccdce5e3b01a58ee608a1020ea8a7e74b0d Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Wed, 4 Oct 2023 14:19:37 +0200 Subject: [PATCH 084/148] Use global_ipr for ranksFromCoords --- include/communicator_quda.h | 33 ++++++++++++++++++++++---------- include/quda_openqcd_interface.h | 16 +++++++++++----- lib/interface_quda.cpp | 1 + lib/openqcd_interface.cpp | 25 +++++++++++++++--------- 4 files changed, 51 insertions(+), 24 deletions(-) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 3cba04ee4c..93e634c6ac 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -134,8 +134,12 @@ namespace quda } coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i] + (i==0 ? 
Nx_displacement :0), comm_dims(topo)[i]) : 0; } - std::cout << ": " << coords[0] << " " << coords[1] << " " << coords[2] << + + // CSTAR_DEBUG + if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { + std::cout << ": " << coords[0] << " " << coords[1] << " " << coords[2] << " " << coords[3] << " yields rank" << comm_rank_from_coords(topo, coords) << std::endl; + } return comm_rank_from_coords(topo, coords); } @@ -262,9 +266,13 @@ namespace quda const int gpuid = comm_gpuid(); comm_set_neighbor_ranks(); - for (int dir = 0; dir < 2; ++dir) { // forward/backward directions - for (int dim = 0; dim < 4; ++dim) { - printfQuda("my (%i):neighbors in dim/dir %i/%i: %i\n",comm_rank(),dim,dir,comm_neighbor_rank(dir, dim)); + if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { + // CSTAR_DEBUG + for (int dir = 0; dir < 2; ++dir) { // forward/backward directions + for (int dim = 0; dim < 4; ++dim) { + printf("my (%i):neighbors in dim/dir %i/%i: %i\n",comm_rank(),dim,dir,comm_neighbor_rank(dir, dim)); + } + } } } @@ -384,13 +392,18 @@ namespace quda int neg_displacement[QUDA_MAX_DIM] = {}; pos_displacement[d] = +1; neg_displacement[d] = -1; - std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] - << "," << rank_grid[2] << "," << rank_grid[3] << " negative " << d << std::endl; - neighbor_rank[0][d] = comm_rank_displaced(topology, neg_displacement); - std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] - << "," << rank_grid[2] << "," << rank_grid[3] << " positive " << d << std::endl; - neighbor_rank[1][d] = comm_rank_displaced(topology, pos_displacement); + + // CSTAR_DEBUG + if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { + std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] + << "," << rank_grid[2] << "," << rank_grid[3] << " negative " << d << std::endl; + neighbor_rank[0][d] = comm_rank_displaced(topology, neg_displacement); + std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] + << "," << rank_grid[2] << "," << rank_grid[3] << " positive " << d << std::endl; + 
neighbor_rank[1][d] = comm_rank_displaced(topology, pos_displacement); + } } + neighbors_cached = true; } diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index f59a667b64..c66784eff6 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -22,12 +22,18 @@ extern "C" { * L2, ... */ typedef struct { - int L[4]; /** Local lattice dimensions L0, L1, L2, L3 */ - int nproc[4]; /** Machine grid size NPROC0, NPROC1, NPROC2, NPROC3*/ - int nproc_blk[4]; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK */ - int N[4]; /** Glocal lattice dimensions N0, N1, N2, N3 */ + int L[4]; /** Local lattice dimensions L1, L2, L3, L0 */ + int nproc[4]; /** Machine grid size NPROC1, NPROC2, NPROC3, NPROC0*/ + int nproc_blk[4]; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK, + is assumed to be [1, 1, 1, 1] */ + int N[4]; /** Glocal lattice dimensions N1, N2, N3, N3 */ int device; /** GPU device number */ - int cstar; /** number of cstar directions */ + int cstar; /** number of cstar directions, equals bc_cstar() */ + int *ranks; /** rank topology, length 4 + NPROC1*NPROC2*NPROC3*NPROC0: + ranks[i] = nproc[i] for 0 <= i < 4 + ranks[4+lex(ix,iy,iz,it)] returns rank number in + openQCD, where lex stands for lexicographical + indexing (in QUDA order (xyzt)) */ } openQCD_QudaLayout_t; diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 7745de703e..48a891e63c 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -377,6 +377,7 @@ void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata } + #if defined(QMP_COMMS) || defined(MPI_COMMS) comm_init(nDim, dims, func, fdata, user_set_comm_handle, (void *)&MPI_COMM_HANDLE_USER); #else diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index d1964d812e..8d03961ba8 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -99,22 +99,29 @@ static lat_dim_t get_local_dims(int 
*fill = nullptr) * @brief Calculate the rank from coordinates. * * @param[in] coords coords is the 4D cartesian coordinate of a rank - * @param[in] fdata should point to 4 integers in order {NPROC0, NPROC1, - * NPROC2, NPROC3} + * @param[in] fdata should point to an instance of qudaLayout.ranks, + * @see struct openQCD_QudaLayout_t in + * @file include/quda_openqcd_interface.h * * @return rank */ static int rankFromCoords(const int *coords, void *fdata) // TODO: { int *NPROC = static_cast(fdata); - int ib; + int *ranks = NPROC + 4; + int i; - ib = coords[3]; - ib = ib*NPROC[0] + coords[0]; - ib = ib*NPROC[1] + coords[1]; - ib = ib*NPROC[2] + coords[2]; + i = coords[3] + NPROC[3]*(coords[2] + NPROC[2]*(coords[1] + NPROC[1]*(coords[0]))); + return ranks[i]; - return ib; + // Juan's version, not needed anymore + // int ib; + // ib = coords[3]; + // ib = ib*NPROC[0] + coords[0]; + // ib = ib*NPROC[1] + coords[1]; + // ib = ib*NPROC[2] + coords[2]; + + // return ib; } @@ -145,7 +152,7 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) #ifdef QMP_COMMS initCommsGridQuda(4, layout.nproc, nullptr, nullptr); #else - initCommsGridQuda(4, mynproc, rankFromCoords, (void *)(layout.nproc)); + initCommsGridQuda(4, mynproc, rankFromCoords, (void *)(layout.ranks)); #endif static int device = -1; // enable a default allocation of devices to processes #else From 732d760147eef9aa018b88ef1cf6e31b4813044d Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Fri, 29 Sep 2023 15:53:37 +0200 Subject: [PATCH 085/148] Print debug --- lib/comm_common.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index c58b788350..16d9cfcdf7 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -104,6 +104,10 @@ namespace quda topo->ndim = ndim; + for (int i = 0; i < QUDA_MAX_DIM; i++) { + topo->cstar[i] = 0; + } + int nodes = 1; for (int i = 0; i < ndim; i++) { topo->dims[i] = abs(dims[i]); @@ -111,6 +115,10 @@ namespace quda nodes *= 
topo->dims[i]; } + for(int i=0; icstar[i] = " << topo->cstar[i] << std::endl; + } + topo->ranks = new int[nodes]; topo->coords = (int(*)[QUDA_MAX_DIM]) new int[QUDA_MAX_DIM * nodes]; From 25ed05613e8bde70bc22da65d7ecdc2c4073c2f8 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Wed, 4 Oct 2023 14:28:37 +0200 Subject: [PATCH 086/148] Remove debug info --- lib/comm_common.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index 16d9cfcdf7..985196c67c 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -115,10 +115,6 @@ namespace quda nodes *= topo->dims[i]; } - for(int i=0; icstar[i] = " << topo->cstar[i] << std::endl; - } - topo->ranks = new int[nodes]; topo->coords = (int(*)[QUDA_MAX_DIM]) new int[QUDA_MAX_DIM * nodes]; From b69b67ade6da457460ca7a59d6328341ee6b9d3c Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Wed, 4 Oct 2023 14:37:23 +0200 Subject: [PATCH 087/148] Fix bracket, clean up --- include/communicator_quda.h | 7 +++---- lib/comm_common.cpp | 4 ---- lib/communicator_mpi.cpp | 2 +- lib/interface_quda.cpp | 1 - 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 93e634c6ac..177d6b1fcd 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -271,7 +271,6 @@ namespace quda for (int dir = 0; dir < 2; ++dir) { // forward/backward directions for (int dim = 0; dim < 4; ++dim) { printf("my (%i):neighbors in dim/dir %i/%i: %i\n",comm_rank(),dim,dir,comm_neighbor_rank(dir, dim)); - } } } } @@ -392,15 +391,15 @@ namespace quda int neg_displacement[QUDA_MAX_DIM] = {}; pos_displacement[d] = +1; neg_displacement[d] = -1; + neighbor_rank[0][d] = comm_rank_displaced(topology, neg_displacement); + neighbor_rank[1][d] = comm_rank_displaced(topology, pos_displacement); // CSTAR_DEBUG if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] << "," << rank_grid[2] << "," 
<< rank_grid[3] << " negative " << d << std::endl; - neighbor_rank[0][d] = comm_rank_displaced(topology, neg_displacement); std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] << "," << rank_grid[2] << "," << rank_grid[3] << " positive " << d << std::endl; - neighbor_rank[1][d] = comm_rank_displaced(topology, pos_displacement); } } @@ -546,7 +545,7 @@ namespace quda void comm_init_common(int ndim, const int *dims, QudaCommsMap rank_from_coords, void *map_data) { - Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data, comm_rank()); + Topology *topo = comm_create_topology(ndim, dims, rank_from_coords, map_data, comm_rank()); comm_set_default_topology(topo); // determine which GPU this rank will use diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index 985196c67c..c58b788350 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -104,10 +104,6 @@ namespace quda topo->ndim = ndim; - for (int i = 0; i < QUDA_MAX_DIM; i++) { - topo->cstar[i] = 0; - } - int nodes = 1; for (int i = 0; i < ndim; i++) { topo->dims[i] = abs(dims[i]); diff --git a/lib/communicator_mpi.cpp b/lib/communicator_mpi.cpp index 5e4c0bed9b..f5a6c6d200 100644 --- a/lib/communicator_mpi.cpp +++ b/lib/communicator_mpi.cpp @@ -52,7 +52,7 @@ namespace quda } comm_init(nDim, commDims, rank_from_coords, map_data); - globalReduce.push(true); + globalReduce.push(true); } Communicator::Communicator(Communicator &other, const int *comm_split) : globalReduce(other.globalReduce) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 48a891e63c..7745de703e 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -377,7 +377,6 @@ void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata } - #if defined(QMP_COMMS) || defined(MPI_COMMS) comm_init(nDim, dims, func, fdata, user_set_comm_handle, (void *)&MPI_COMM_HANDLE_USER); #else From 748693ae6005719999c60693382f2a923ad3de77 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: 
Wed, 4 Oct 2023 15:28:49 +0200 Subject: [PATCH 088/148] Comment debugging --- include/communicator_quda.h | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 177d6b1fcd..9c1e9177d8 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -136,10 +136,10 @@ namespace quda } // CSTAR_DEBUG - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - std::cout << ": " << coords[0] << " " << coords[1] << " " << coords[2] << - " " << coords[3] << " yields rank" << comm_rank_from_coords(topo, coords) << std::endl; - } + // if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { + // std::cout << ": " << coords[0] << " " << coords[1] << " " << coords[2] << + // " " << coords[3] << " yields rank" << comm_rank_from_coords(topo, coords) << std::endl; + // } return comm_rank_from_coords(topo, coords); } @@ -266,14 +266,14 @@ namespace quda const int gpuid = comm_gpuid(); comm_set_neighbor_ranks(); - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - // CSTAR_DEBUG - for (int dir = 0; dir < 2; ++dir) { // forward/backward directions - for (int dim = 0; dim < 4; ++dim) { - printf("my (%i):neighbors in dim/dir %i/%i: %i\n",comm_rank(),dim,dir,comm_neighbor_rank(dir, dim)); - } - } - } + // CSTAR_DEBUG + // if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { + // for (int dir = 0; dir < 2; ++dir) { // forward/backward directions + // for (int dim = 0; dim < 4; ++dim) { + // printf("my (%i):neighbors in dim/dir %i/%i: %i\n",comm_rank(),dim,dir,comm_neighbor_rank(dir, dim)); + // } + // } + // } char *hostname = comm_hostname(); int *gpuid_recv_buf = (int *)safe_malloc(sizeof(int) * comm_size()); @@ -395,12 +395,12 @@ namespace quda neighbor_rank[1][d] = comm_rank_displaced(topology, pos_displacement); // CSTAR_DEBUG - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] - << "," << rank_grid[2] << "," << rank_grid[3] << " 
negative " << d << std::endl; - std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] - << "," << rank_grid[2] << "," << rank_grid[3] << " positive " << d << std::endl; - } + // if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { + // std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] + // << "," << rank_grid[2] << "," << rank_grid[3] << " negative " << d << std::endl; + // std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] + // << "," << rank_grid[2] << "," << rank_grid[3] << " positive " << d << std::endl; + // } } neighbors_cached = true; From f663ec2c37f75e9f29dad8d919b77ca232f954f0 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Wed, 4 Oct 2023 16:23:08 +0200 Subject: [PATCH 089/148] Remove comment --- lib/openqcd_interface.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 8d03961ba8..4491257b19 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -113,15 +113,6 @@ static int rankFromCoords(const int *coords, void *fdata) // TODO: i = coords[3] + NPROC[3]*(coords[2] + NPROC[2]*(coords[1] + NPROC[1]*(coords[0]))); return ranks[i]; - - // Juan's version, not needed anymore - // int ib; - // ib = coords[3]; - // ib = ib*NPROC[0] + coords[0]; - // ib = ib*NPROC[1] + coords[1]; - // ib = ib*NPROC[2] + coords[2]; - - // return ib; } From 129c578a91b32b12d91092bd31384a1bd313eb05 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Fri, 6 Oct 2023 16:19:25 +0200 Subject: [PATCH 090/148] Restore info about communicator --- include/communicator_quda.h | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 9c1e9177d8..6c7dded070 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -301,19 +301,21 @@ namespace quda // if (canAccessPeer[0] * canAccessPeer[1] != 0 || gpuid == neighbor_gpuid) { if ((can_access_peer && access_rank <= 
enable_p2p_max_access_rank) || gpuid == neighbor_gpuid) { peer2peer_enabled[dir][dim] = true; - if (getVerbosity() > QUDA_SILENT) { - printf("Peer-to-peer enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, dim=%d, " - "access rank = (%3d)\n", - comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim, access_rank); - } + // CSTAR_DEBUG + // if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { + // printf("Peer-to-peer enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, dim=%d, " + // "access rank = (%3d)\n", + // comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim, access_rank); + // } } else { intranode_enabled[dir][dim] = true; - if (getVerbosity() > QUDA_SILENT) { - printf( - "Intra-node (non peer-to-peer) enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, " - "dim=%d\n", - comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim); - } + // CSTAR_DEBUG + // if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { + // printf( + // "Intra-node (non peer-to-peer) enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, " + // "dim=%d\n", + // comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim); + // } } } // on the same node From 512e3e92b6885fd865dd49484544b10cc2b1d444 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Fri, 6 Oct 2023 16:46:36 +0200 Subject: [PATCH 091/148] Rename variables --- include/communicator_quda.h | 4 ++-- include/quda_openqcd_interface.h | 7 ++++--- lib/comm_common.cpp | 4 +++- lib/openqcd_interface.cpp | 7 ++++--- 4 files changed, 13 insertions(+), 9 deletions(-) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 6c7dded070..234d965394 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -40,7 +40,7 @@ namespace quda int (*coords)[QUDA_MAX_DIM]; int my_rank; int my_coords[QUDA_MAX_DIM]; - int cstar[QUDA_MAX_DIM]; + int shift_boundary[QUDA_MAX_DIM]; // It might be worth adding communicators to allow for efficient reductions: // #if 
defined(MPI_COMMS) // MPI_Comm comm; @@ -129,7 +129,7 @@ namespace quda int Nx_displacement = 0; for (int i = QUDA_MAX_DIM-1; i >=0; i--) { - if(topo->cstar[i]==1 && i < topo->ndim){ + if(topo->shift_boundary[i]==1 && i < topo->ndim){ Nx_displacement += ((comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i])/comm_dims(topo)[i] -1) * (comm_dims(topo)[0]/2); } coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i] + (i==0 ? Nx_displacement :0), comm_dims(topo)[i]) : 0; diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index c66784eff6..1fadc1d5e0 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -29,9 +29,10 @@ typedef struct { int N[4]; /** Glocal lattice dimensions N1, N2, N3, N3 */ int device; /** GPU device number */ int cstar; /** number of cstar directions, equals bc_cstar() */ - int *ranks; /** rank topology, length 4 + NPROC1*NPROC2*NPROC3*NPROC0: - ranks[i] = nproc[i] for 0 <= i < 4 - ranks[4+lex(ix,iy,iz,it)] returns rank number in + int *data; /** rank topology, length 5 + NPROC1*NPROC2*NPROC3*NPROC0: + data[0] = cstar; + data[1+i] = nproc[i] for 0 <= i < 4 + data[5+lex(ix,iy,iz,it)] returns rank number in openQCD, where lex stands for lexicographical indexing (in QUDA order (xyzt)) */ } openQCD_QudaLayout_t; diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index c58b788350..910bedfe9d 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -107,7 +107,9 @@ namespace quda int nodes = 1; for (int i = 0; i < ndim; i++) { topo->dims[i] = abs(dims[i]); - topo->cstar[i] = dims[i] < 0 ? 1:0; + // We pass negative dimensions from openQxD + // @file lib/openqcd_interface.cpp:openQCD_qudaSetLayout + topo->shift_boundary[i] = dims[i] < 0 ? 
1:0; nodes *= topo->dims[i]; } diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 4491257b19..38b8c4989f 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -107,8 +107,9 @@ static lat_dim_t get_local_dims(int *fill = nullptr) */ static int rankFromCoords(const int *coords, void *fdata) // TODO: { - int *NPROC = static_cast(fdata); - int *ranks = NPROC + 4; + int *base = static_cast(fdata); + int *NPROC = base + 1; + int *ranks = base + 5; int i; i = coords[3] + NPROC[3]*(coords[2] + NPROC[2]*(coords[1] + NPROC[1]*(coords[0]))); @@ -143,7 +144,7 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) #ifdef QMP_COMMS initCommsGridQuda(4, layout.nproc, nullptr, nullptr); #else - initCommsGridQuda(4, mynproc, rankFromCoords, (void *)(layout.ranks)); + initCommsGridQuda(4, mynproc, rankFromCoords, (void *)(layout.data)); #endif static int device = -1; // enable a default allocation of devices to processes #else From 4d9b28a705f943372277298f4060d0782f721d2b Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Fri, 6 Oct 2023 16:49:46 +0200 Subject: [PATCH 092/148] Remove debug statements --- include/communicator_quda.h | 52 ++++++++++--------------------------- lib/openqcd_interface.cpp | 2 ++ 2 files changed, 16 insertions(+), 38 deletions(-) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 234d965394..623ef632d8 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -130,17 +130,13 @@ namespace quda int Nx_displacement = 0; for (int i = QUDA_MAX_DIM-1; i >=0; i--) { if(topo->shift_boundary[i]==1 && i < topo->ndim){ + // if we go over the boundary and have a shifted boundary condition, + // we shift Nx/2 ranks in x-direction Nx_displacement += ((comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i])/comm_dims(topo)[i] -1) * (comm_dims(topo)[0]/2); } coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i] + (i==0 ? 
Nx_displacement :0), comm_dims(topo)[i]) : 0; } - // CSTAR_DEBUG - // if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - // std::cout << ": " << coords[0] << " " << coords[1] << " " << coords[2] << - // " " << coords[3] << " yields rank" << comm_rank_from_coords(topo, coords) << std::endl; - // } - return comm_rank_from_coords(topo, coords); } @@ -266,15 +262,7 @@ namespace quda const int gpuid = comm_gpuid(); comm_set_neighbor_ranks(); - // CSTAR_DEBUG - // if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - // for (int dir = 0; dir < 2; ++dir) { // forward/backward directions - // for (int dim = 0; dim < 4; ++dim) { - // printf("my (%i):neighbors in dim/dir %i/%i: %i\n",comm_rank(),dim,dir,comm_neighbor_rank(dir, dim)); - // } - // } - // } - + char *hostname = comm_hostname(); int *gpuid_recv_buf = (int *)safe_malloc(sizeof(int) * comm_size()); @@ -301,21 +289,19 @@ namespace quda // if (canAccessPeer[0] * canAccessPeer[1] != 0 || gpuid == neighbor_gpuid) { if ((can_access_peer && access_rank <= enable_p2p_max_access_rank) || gpuid == neighbor_gpuid) { peer2peer_enabled[dir][dim] = true; - // CSTAR_DEBUG - // if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - // printf("Peer-to-peer enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, dim=%d, " - // "access rank = (%3d)\n", - // comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim, access_rank); - // } + if (getVerbosity() > QUDA_SILENT) { + printf("Peer-to-peer enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, dim=%d, " + "access rank = (%3d)\n", + comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim, access_rank); + } } else { intranode_enabled[dir][dim] = true; - // CSTAR_DEBUG - // if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - // printf( - // "Intra-node (non peer-to-peer) enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, " - // "dim=%d\n", - // comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim); - // } + if (getVerbosity() > QUDA_SILENT) { + printf( + 
"Intra-node (non peer-to-peer) enabled for rank %3d (gpu=%d) with neighbor %3d (gpu=%d) dir=%d, " + "dim=%d\n", + comm_rank(), gpuid, neighbor_rank, neighbor_gpuid, dir, dim); + } } } // on the same node @@ -386,7 +372,6 @@ namespace quda Topology *topology = topo ? topo : default_topo; // use default topology if topo is NULL if (!topology) { errorQuda("Topology not specified"); } - const int *rank_grid = comm_coords_from_rank(topology, comm_rank()); for (int d = 0; d < 4; ++d) { int pos_displacement[QUDA_MAX_DIM] = {}; @@ -395,16 +380,7 @@ namespace quda neg_displacement[d] = -1; neighbor_rank[0][d] = comm_rank_displaced(topology, neg_displacement); neighbor_rank[1][d] = comm_rank_displaced(topology, pos_displacement); - - // CSTAR_DEBUG - // if (getVerbosity() >= QUDA_DEBUG_VERBOSE) { - // std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] - // << "," << rank_grid[2] << "," << rank_grid[3] << " negative " << d << std::endl; - // std::cout << "rank: " << rank_grid[0] << "," << rank_grid[1] - // << "," << rank_grid[2] << "," << rank_grid[3] << " positive " << d << std::endl; - // } } - neighbors_cached = true; } diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 38b8c4989f..a3ce970ede 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -132,6 +132,8 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) } mynproc[dir] = layout.nproc[dir]; } + // Negative dimensions are used to indicate shifted boundary conditions, + // @see lib/comm_common.cpp:comm_create_topology() if(layout.cstar > 1) { mynproc[1] *= -1; /* y direction */ } From 236f86942dad4f26ecab0184f45af6cd3fc6fef9 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 6 Oct 2023 18:37:58 +0200 Subject: [PATCH 093/148] added relevant parameters to Dirac struct --- include/quda_openqcd_interface.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 
1fadc1d5e0..a899860493 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -67,7 +67,9 @@ typedef struct { typedef struct { double kappa; /* kappa: hopping parameter */ double mu; /* mu: twisted mass */ - double su3csw; /* su3csw: csw coefficient */ + double su3csw; /* su3csw: csw coefficient for SU(3) fields */ + double u1csw; /* u1csw: csw coefficient for U(1) fields, quda doesn't respect that parameter (yet) */ + int qhat; /* qhat: quda doesn't respect that parameter (yet) */ int dagger; /* dagger: whether to apply D or D^dagger */ } openQCD_QudaDiracParam_t; From cdd360e93e42ee7f411a7439c88caae3669dac50 Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 9 Oct 2023 16:33:15 +0200 Subject: [PATCH 094/148] Use BUILD_INTERFACE_OPENQCD to set cstar --- include/communicator_quda.h | 11 ++++++++++- lib/comm_common.cpp | 12 ++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 623ef632d8..458316c11e 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -41,6 +41,7 @@ namespace quda int my_rank; int my_coords[QUDA_MAX_DIM]; int shift_boundary[QUDA_MAX_DIM]; + int cstar; // It might be worth adding communicators to allow for efficient reductions: // #if defined(MPI_COMMS) // MPI_Comm comm; @@ -129,7 +130,15 @@ namespace quda int Nx_displacement = 0; for (int i = QUDA_MAX_DIM-1; i >=0; i--) { - if(topo->shift_boundary[i]==1 && i < topo->ndim){ + // cstar shift[x] shift[y] shift[z] shift[t] + // 0 0 0 0 0 + // 1 0 0 0 0 + // 2 0 1 0 0 + // 3 0 1 1 0 + if(i < topo->ndim && ( + (i==1 && topo->cstar >= 2) || + (i==2 && topo->cstar >= 3) + )) { // if we go over the boundary and have a shifted boundary condition, // we shift Nx/2 ranks in x-direction Nx_displacement += ((comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i])/comm_dims(topo)[i] -1) * (comm_dims(topo)[0]/2); diff --git a/lib/comm_common.cpp 
b/lib/comm_common.cpp index 910bedfe9d..6b674e9ab4 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -102,6 +102,14 @@ namespace quda Topology *topo = new Topology; + + #ifdef BUILD_OPENQCD_INTERFACE + int *data = static_cast(map_data); + topo->cstar = data[0]; + #else + topo->cstar = 0; + #endif + topo->ndim = ndim; int nodes = 1; @@ -110,6 +118,10 @@ namespace quda // We pass negative dimensions from openQxD // @file lib/openqcd_interface.cpp:openQCD_qudaSetLayout topo->shift_boundary[i] = dims[i] < 0 ? 1:0; + // cstar = 3: shift_boundary = [0, 1, 1, 0] (xyzt convention) + // cstar = 2: shift_boundary = [0, 1, 0, 0] (xyzt convention) + // cstar = 1: shift_boundary = [0, 0, 0, 0] (xyzt convention) + // cstar = 0: shift_boundary = [0, 0, 0, 0] (xyzt convention) nodes *= topo->dims[i]; } From dd0e4a5c1dfef0731821f71dc2bac101b0a5656f Mon Sep 17 00:00:00 2001 From: Anian Altherr Date: Mon, 9 Oct 2023 16:52:56 +0200 Subject: [PATCH 095/148] Remove abs(dims) --- include/communicator_quda.h | 10 ++++++---- lib/comm_common.cpp | 16 ++++------------ lib/communicator_mpi.cpp | 2 +- lib/interface_quda.cpp | 2 +- lib/openqcd_interface.cpp | 12 +----------- tests/utils/host_utils.cpp | 4 ++-- 6 files changed, 15 insertions(+), 31 deletions(-) diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 458316c11e..dc077ca52a 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -40,8 +40,7 @@ namespace quda int (*coords)[QUDA_MAX_DIM]; int my_rank; int my_coords[QUDA_MAX_DIM]; - int shift_boundary[QUDA_MAX_DIM]; - int cstar; + int cstar; // number of C* direction as per openQxD convention // It might be worth adding communicators to allow for efficient reductions: // #if defined(MPI_COMMS) // MPI_Comm comm; @@ -127,6 +126,7 @@ namespace quda inline int comm_rank_displaced(const Topology *topo, const int displacement[]) { int coords[QUDA_MAX_DIM]; + int shift_integer; int Nx_displacement = 0; for (int i = QUDA_MAX_DIM-1; i 
>=0; i--) { @@ -140,8 +140,10 @@ namespace quda (i==2 && topo->cstar >= 3) )) { // if we go over the boundary and have a shifted boundary condition, - // we shift Nx/2 ranks in x-direction - Nx_displacement += ((comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i])/comm_dims(topo)[i] -1) * (comm_dims(topo)[0]/2); + // we shift Nx/2 ranks in x-direction: + // shift_integer in {-1, 0, 1} + shift_integer = (comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i]) / comm_dims(topo)[i]; + Nx_displacement += (shift_integer - 1) * (comm_dims(topo)[0]/2); } coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i] + (i==0 ? Nx_displacement :0), comm_dims(topo)[i]) : 0; } diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index 6b674e9ab4..412136b397 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -114,15 +114,8 @@ namespace quda int nodes = 1; for (int i = 0; i < ndim; i++) { - topo->dims[i] = abs(dims[i]); - // We pass negative dimensions from openQxD - // @file lib/openqcd_interface.cpp:openQCD_qudaSetLayout - topo->shift_boundary[i] = dims[i] < 0 ? 
1:0; - // cstar = 3: shift_boundary = [0, 1, 1, 0] (xyzt convention) - // cstar = 2: shift_boundary = [0, 1, 0, 0] (xyzt convention) - // cstar = 1: shift_boundary = [0, 0, 0, 0] (xyzt convention) - // cstar = 0: shift_boundary = [0, 0, 0, 0] (xyzt convention) - nodes *= topo->dims[i]; + topo->dims[i] = dims[i]; + nodes *= dims[i]; } topo->ranks = new int[nodes]; @@ -133,10 +126,9 @@ namespace quda do { int rank = rank_from_coords(x, map_data); - topo->ranks[index(ndim, topo->dims, x)] = rank; - if(rank<0) errorQuda("rank <0"); + topo->ranks[index(ndim, dims, x)] = rank; for (int i = 0; i < ndim; i++) { topo->coords[rank][i] = x[i]; } - } while (advance_coords(ndim, topo->dims, x)); + } while (advance_coords(ndim, dims, x)); topo->my_rank = my_rank; for (int i = 0; i < ndim; i++) { topo->my_coords[i] = topo->coords[my_rank][i]; } diff --git a/lib/communicator_mpi.cpp b/lib/communicator_mpi.cpp index f5a6c6d200..61869c5811 100644 --- a/lib/communicator_mpi.cpp +++ b/lib/communicator_mpi.cpp @@ -113,7 +113,7 @@ namespace quda MPI_CHECK(MPI_Comm_size(MPI_COMM_HANDLE, &size)); int grid_size = 1; - for (int i = 0; i < ndim; i++) { grid_size *= abs(dims[i]); } + for (int i = 0; i < ndim; i++) { grid_size *= dims[i]; } if (grid_size != size) { errorQuda("Communication grid size declared via initCommsGridQuda() does not match" " total number of MPI ranks (%d != %d)", diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 7745de703e..913082b001 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -366,7 +366,7 @@ void initCommsGridQuda(int nDim, const int *dims, QudaCommsMap func, void *fdata map_data.ndim = nDim; for (int i=0; i 1) { - mynproc[1] *= -1; /* y direction */ - } - if(layout.cstar > 2) { - mynproc[2] *= -1; /* z direction */ } #ifdef MULTI_GPU @@ -146,7 +136,7 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) #ifdef QMP_COMMS initCommsGridQuda(4, layout.nproc, nullptr, nullptr); #else - initCommsGridQuda(4, mynproc, 
rankFromCoords, (void *)(layout.data)); + initCommsGridQuda(4, layout.nproc, rankFromCoords, (void *)(layout.data)); #endif static int device = -1; // enable a default allocation of devices to processes #else diff --git a/tests/utils/host_utils.cpp b/tests/utils/host_utils.cpp index f40782e37c..ce188dbdcc 100644 --- a/tests/utils/host_utils.cpp +++ b/tests/utils/host_utils.cpp @@ -674,14 +674,14 @@ void get_size_from_env(int *const dims, const char env[]) int lex_rank_from_coords_t(const int *coords, void *) { int rank = coords[0]; - for (int i = 1; i < 4; i++) { rank = abs(gridsize_from_cmdline[i]) * rank + coords[i]; } + for (int i = 1; i < 4; i++) { rank = gridsize_from_cmdline[i] * rank + coords[i]; } return rank; } int lex_rank_from_coords_x(const int *coords, void *) { int rank = coords[3]; - for (int i = 2; i >= 0; i--) { rank = abs(gridsize_from_cmdline[i]) * rank + coords[i]; } + for (int i = 2; i >= 0; i--) { rank = gridsize_from_cmdline[i] * rank + coords[i]; } return rank; } From 86134fb0aa8244213618d1ae103300c96c19f552 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 17 Oct 2023 18:50:02 +0200 Subject: [PATCH 096/148] towards a general solver interface --- include/quda_openqcd_interface.h | 111 +++++- lib/openqcd_interface.cpp | 652 ++++++++++++++++++++++++++++++- 2 files changed, 744 insertions(+), 19 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index a899860493..9313640b7f 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -16,6 +16,46 @@ extern "C" { #endif + + +/** + * Copied from flags.h + * ############################################# + */ +#ifndef FLAGS_H +typedef struct +{ + int type; + int cstar; + double phi3[2][3]; + double phi1[2]; +} bc_parms_t; + +typedef struct +{ + int qhat; + double m0,su3csw,u1csw,cF[2],theta[3]; +} dirac_parms_t; + +typedef struct +{ + int gauge; + int nfl; +} flds_parms_t; +#endif +/** + * 
############################################# + */ + + +typedef enum OpenQCDGaugeGroup_s { + OPENQCD_GAUGE_SU3 = 1, + OPENQCD_GAUGE_U1 = 2, + OPENQCD_GAUGE_SU3xU1 = 3, + OPENQCD_GAUGE_INVALID = QUDA_INVALID_ENUM +} OpenQCDGaugeGroup; + + /** * Parameters related to problem size and machine topology. They should hold the * numbers in quda format, i.e. xyzt convention. For example L[0] = L1, L[1] = @@ -35,6 +75,11 @@ typedef struct { data[5+lex(ix,iy,iz,it)] returns rank number in openQCD, where lex stands for lexicographical indexing (in QUDA order (xyzt)) */ + bc_parms_t bc_parms; + dirac_parms_t dirac_parms; + flds_parms_t flds_parms; + void *h_gauge; + void *h_sw; } openQCD_QudaLayout_t; @@ -71,6 +116,8 @@ typedef struct { double u1csw; /* u1csw: csw coefficient for U(1) fields, quda doesn't respect that parameter (yet) */ int qhat; /* qhat: quda doesn't respect that parameter (yet) */ int dagger; /* dagger: whether to apply D or D^dagger */ + void *h_gauge; + void *h_sw; } openQCD_QudaDiracParam_t; @@ -144,7 +191,6 @@ void openQCD_qudaSpinorFree(void** quda_field); * @param[in] p Dirac parameter struct */ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p); -void openQCD_qudaDw_NoLoads(void *src, void *dst, openQCD_QudaDiracParam_t p); /** @@ -157,25 +203,80 @@ void openQCD_qudaDw_NoLoads(void *src, void *dst, openQCD_QudaDiracParam_t p); * @param[out] solution Solution spinor * @param[in] dirac_param Dirac parameter struct * @param[in] gcr_param GCR parameter struct + * + * @return residual */ double openQCD_qudaGCR(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param, openQCD_QudaGCRParam_t gcr_param); /** - * Solve Ax=b for an Clover Wilson operator with a multigrid solver. All fields are fields passed and - * returned are host (CPU) field in openQCD order. This function requires that - * persistent gauge and clover fields have been created prior. - * + * Solve Ax=b for an Clover Wilson operator with a multigrid solver. 
All fields + * are fields passed and returned are host (CPU) field in openQCD order. This + * function requires that persistent gauge and clover fields have been created + * prior. + * * Requires QUDA_PRECISION & 2 != 0, e.g. QUDA_PRECISON = 14 * * @param[in] source Right-hand side source field * @param[out] solution Solution spinor field * @param[in] dirac_param Dirac parameter struct + * + * @return residual */ double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param); +/** + * Setup the solver interface to quda. This function parses the file given by + * [infile] as an openQCD ini file. The solver section given by the [section] + * parameter must have a key-value pair like solver = QUDA and may contain every + * member of the struct [QudaInvertParam]. If one sets inv_type_precondition = + * QUDA_MG_INVERTER, one can additionally use all the members from the struct + * [QudaMultigridParam] in a section called "{section} Multigrid", where + * {section} is replaced by [section]. For every level given by n_level in the + * above section, one has to provide a subsection called + * "{section} Multigrid Level {level}", where {level} runs from 0 to n_level-1. + * All these subsections may have keys given by all the array-valued members of + * QudaMultigridParam, for example smoother_tol may appear in all subsections. + * + * @param[in] infile Ini-file containing sections about the solver + * @param[in] section The section name + * + * @return Pointer to the solver context + */ +void* openQCD_qudaSolverSetup(char *infile, char *section); + + +/** + * @brief Solve Ax=b for an Clover Wilson operator with a multigrid + * solver. All fields are fields passed and returned are host + * (CPU) field in openQCD order. 
This function requires an + * existing solver context created with openQCD_qudaSolverSetup() + * + * @param[inout] param Pointer returned by openQCD_qudaSolverSetup() + * @param[in] mu Twisted mass + * @param[in] source The source + * @param[out] solution The solution + * @param[out] status If the function is able to solve the Dirac equation + * to the desired accuracy (invert_param->tol), status + * reports the total number of iteration steps. -1 + * indicates that the inversion failed. + * + * @return Residual + */ +double openQCD_qudaInvert(void *param, double mu, void *source, void *solution, int *status); + + +/** + * @brief Destroys an existing solver context and frees all involed + * structs. + * + * @param param Pointer to the context to destroy + */ +void openQCD_qudaSolverDestroy(void *param); + + /** * @brief Wrapper for the plaquette. We could call plaqQuda() directly in * openQCD, but we have to make sure manually that the gauge field diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 2872c1eccf..cbabd91123 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -3,6 +3,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include @@ -10,9 +15,13 @@ #include #include #include +#include -#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define CHECK_PARAM +#include "check_params.h" +#undef CHECK_PARAM +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) static openQCD_QudaState_t qudaState = {false, false, false, false, {}, {}}; @@ -51,6 +60,8 @@ static const int num_colors = sizeof(colors) / sizeof(uint32_t); #define POP_RANGE #endif +#define QUDA_OPENQCD_VERBOSE 1 + template void inline qudaopenqcd_called(const char *func, QudaVerbosity verb) { // add NVTX markup if enabled @@ -60,7 +71,7 @@ template void inline qudaopenqcd_called(const char *func, QudaVerbo POP_RANGE; } - #ifdef QUDAMILC_VERBOSE + #ifdef QUDA_OPENQCD_VERBOSE if (verb >= QUDA_VERBOSE) { if (start) { printfQuda("QUDA_OPENQCD_INTERFACE: %s (called) \n", func); @@ -74,6 +85,152 @@ template void inline qudaopenqcd_called(const char *func, QudaVerbo template void inline qudaopenqcd_called(const char *func) { qudaopenqcd_called(func, getVerbosity()); } +/** + * @brief Just a simple key-value store + */ +class KeyValueStore { +private: + std::unordered_map>> store; + std::unordered_map *map = nullptr; + +public: + /** + * @brief Sets a key value pair + * + * @param[in] section The section + * @param[in] key The key + * @param[in] value The value + */ + void set(const std::string& section, const std::string& key, const std::string& value) { + if (map != nullptr) { + auto mvalue = map->find(value); + if (mvalue != map->end()) { + //store[section][key] = mvalue->second; + std::get<0>(store[section][key]) = mvalue->second; + std::get<1>(store[section][key]) = value; + return; + } + } + std::get<0>(store[section][key]) = value; + std::get<1>(store[section][key]) = value; + //store[section][key] = value; + } + + void set_map(std::unordered_map *_map) { + map = _map; + } + + /** + * @brief Gets the specified key. 
+ * + * @param[in] section The section + * @param[in] key The key + * @param[in] default_value The default value if section/key is absent + * + * @tparam T Desired return type + * + * @return The corresponding value + */ + template + T get(const std::string& section, const std::string& key, T default_value = T()) { + int idx; + std::string rkey; + std::smatch match; + std::regex p_key("([^\\[]+)\\[(\\d+)\\]"); // key[idx] + auto sec = store.find(section); + + if (sec != store.end()) { + if (std::regex_search(key, match, p_key)) { + rkey = match[1]; + idx = std::stoi(match[2]); + } else { + rkey = key; + idx = 0; + } + + auto item = sec->second.find(rkey); + if (item != sec->second.end()) { + std::stringstream ss(std::get<0>(item->second)); + if constexpr (std::is_enum_v) { + typename std::underlying_type::type result, dummy; + for (int i=0; i> dummy; + } + if (ss >> result) { + return static_cast(result); + } + } else { + T result, dummy; + for (int i=0; i> dummy; + } + if (ss >> result) { + return result; + } + } + } + } + return default_value; // Return default value for non-existent keys + } + + /** + * @brief Fill the store with entries from an ini-file + * + * @param[in] filename The filename + */ + void load(const std::string& filename) { + std::string line, section; + std::smatch match; + std::ifstream file(filename.c_str()); + + std::regex p_section("^\\s*\\[([\\w\\ ]+)\\].*$"); // [section] + std::regex p_comment("^[^#]*(\\s*#.*)$"); // line # comment + std::regex p_key_val("^([^\\s]+)\\s+(.*[^\\s]+)\\s*$"); // key value + + if (file.is_open()) { + + while (std::getline(file, line)) { + + // remove all comments + if (std::regex_search(line, match, p_comment)) { + line.erase(match.position(1)); + } + + if (std::regex_search(line, match, p_section)) { + section = match[1]; + } else if (std::regex_search(line, match, p_key_val)) { + std::string key = match[1]; + std::string val = match[2]; + this->set(section, key, val); + } + } + + file.close(); + } else { + 
std::cerr << "Error opening file: " << filename << std::endl; + } + } + + /** + * @brief Dumps all entries in the store. + */ + void dump(std::string _section = "") { + for (const auto& section : store) { + if (_section == "" || _section == section.first) { + std::cout << "[" << section.first << "]" << std::endl; + for (const auto& pair : section.second) { + std::cout << " " << pair.first << " = " << std::get<1>(pair.second); + if (std::get<0>(pair.second) != std::get<1>(pair.second)) { + std::cout << " # " << std::get<0>(pair.second); + } + std::cout << std::endl; + } + } + } + } +}; + + /** * @brief Returns the local lattice dimensions as lat_dim_t * @@ -174,15 +331,14 @@ static QudaInvertParam newOpenQCDParam(void) /* AA: This breaks GCR */ // /* TH added for MG support */ - // param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; // The precision used by the QUDA solver - // param.cuda_prec_precondition = QUDA_HALF_PRECISION; // The precision used by the QUDA solver + param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; // The precision used by the QUDA solver + param.cuda_prec_precondition = QUDA_HALF_PRECISION; // The precision used by the QUDA solver /** * The order of the input and output fermion fields. Imposes fieldOrder = * QUDA_OPENQCD_FIELD_ORDER in color_spinor_field.h and * QUDA_OPENQCD_FIELD_ORDER makes quda to instantiate OpenQCDDiracOrder. */ - param.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; // Gamma basis of the input and output host fields @@ -220,6 +376,8 @@ static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec) param.anisotropy = 1.0; // 1.0 means not anisotropic param.ga_pad = getLinkPadding(param.X); // Why this? 
+ checkGaugeParam(&param); + return param; } @@ -556,11 +714,6 @@ void openQCD_qudaD2H(void *quda_field, void *openQCD_field) } -void openQCD_qudaDw_NoLoads(void *src, void *dst, openQCD_QudaDiracParam_t p) -{ -} - - void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) { QudaInvertParam param = newOpenQCDDiracParam(p); @@ -592,17 +745,488 @@ double openQCD_qudaGCR(void *source, void *solution, invertQuda(static_cast(solution), static_cast(source), &param); - printfQuda("true_res = %e\n", param.true_res); + printfQuda("true_res = %.2e\n", param.true_res); printfQuda("true_res_hq = %.2e\n", param.true_res_hq); printfQuda("iter = %d\n", param.iter); printfQuda("gflops = %.2e\n", param.gflops); printfQuda("secs = %.2e\n", param.secs); - /* this is not properly set */ - /* printfQuda("Nsteps = %d\n", param.Nsteps); */ return param.true_res; } + +void* openQCD_qudaSolverSetup(char *infile, char *section) +{ + int my_rank; + void *mgprec; + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + + // Allocate on the heap + QudaInvertParam* param = new QudaInvertParam(newQudaInvertParam()); + QudaInvertParam* invert_param_mg = new QudaInvertParam(newQudaInvertParam()); + QudaMultigridParam* multigrid_param = new QudaMultigridParam(newQudaMultigridParam()); + + // Some default settings + // Some of them should not be changed + param->verbosity = QUDA_SUMMARIZE; + param->cpu_prec = QUDA_DOUBLE_PRECISION; + param->cuda_prec = QUDA_DOUBLE_PRECISION; + param->cuda_prec_sloppy = QUDA_SINGLE_PRECISION; + param->cuda_prec_precondition = QUDA_HALF_PRECISION; + param->dirac_order = QUDA_OPENQCD_DIRAC_ORDER; + param->gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; + param->dslash_type = QUDA_WILSON_DSLASH; + param->kappa = 1.0/(2.0*(qudaState.layout.dirac_parms.m0+4.0)); + param->mu = 0.0; + param->dagger = QUDA_DAG_NO; + param->solution_type = QUDA_MAT_SOLUTION; + param->solve_type = QUDA_DIRECT_SOLVE; + param->matpc_type = QUDA_MATPC_EVEN_EVEN; + param->solver_normalization =
QUDA_DEFAULT_NORMALIZATION; + param->inv_type_precondition = QUDA_INVALID_INVERTER; // disables any preconditioning + param->mass_normalization = QUDA_MASS_NORMALIZATION; + + if (qudaState.layout.dirac_parms.su3csw != 0.0) { + param->clover_location = QUDA_CUDA_FIELD_LOCATION; // seems to have no effect? + param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; + param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; // what implication has this? + + param->compute_clover = true; + param->clover_csw = qudaState.layout.dirac_parms.su3csw; + param->clover_coeff = 0.0; + + // Set to Wilson Dirac operator with Clover term + param->dslash_type = QUDA_CLOVER_WILSON_DSLASH; + } + + if (my_rank == 0) { + + std::unordered_map enum_map = { + {"QUDA_CG_INVERTER", std::to_string(QUDA_CG_INVERTER)}, + {"QUDA_BICGSTAB_INVERTER", std::to_string(QUDA_BICGSTAB_INVERTER)}, + {"QUDA_GCR_INVERTER", std::to_string(QUDA_GCR_INVERTER)}, + {"QUDA_MR_INVERTER", std::to_string(QUDA_MR_INVERTER)}, + {"QUDA_SD_INVERTER", std::to_string(QUDA_SD_INVERTER)}, + {"QUDA_PCG_INVERTER", std::to_string(QUDA_PCG_INVERTER)}, + {"QUDA_EIGCG_INVERTER", std::to_string(QUDA_EIGCG_INVERTER)}, + {"QUDA_INC_EIGCG_INVERTER", std::to_string(QUDA_INC_EIGCG_INVERTER)}, + {"QUDA_GMRESDR_INVERTER", std::to_string(QUDA_GMRESDR_INVERTER)}, + {"QUDA_GMRESDR_PROJ_INVERTER", std::to_string(QUDA_GMRESDR_PROJ_INVERTER)}, + {"QUDA_GMRESDR_SH_INVERTER", std::to_string(QUDA_GMRESDR_SH_INVERTER)}, + {"QUDA_FGMRESDR_INVERTER", std::to_string(QUDA_FGMRESDR_INVERTER)}, + {"QUDA_MG_INVERTER", std::to_string(QUDA_MG_INVERTER)}, + {"QUDA_BICGSTABL_INVERTER", std::to_string(QUDA_BICGSTABL_INVERTER)}, + {"QUDA_CGNE_INVERTER", std::to_string(QUDA_CGNE_INVERTER)}, + {"QUDA_CGNR_INVERTER", std::to_string(QUDA_CGNR_INVERTER)}, + {"QUDA_CG3_INVERTER", std::to_string(QUDA_CG3_INVERTER)}, + {"QUDA_CG3NE_INVERTER", std::to_string(QUDA_CG3NE_INVERTER)}, + {"QUDA_CG3NR_INVERTER", 
std::to_string(QUDA_CG3NR_INVERTER)}, + {"QUDA_CA_CG_INVERTER", std::to_string(QUDA_CA_CG_INVERTER)}, + {"QUDA_CA_CGNE_INVERTER", std::to_string(QUDA_CA_CGNE_INVERTER)}, + {"QUDA_CA_CGNR_INVERTER", std::to_string(QUDA_CA_CGNR_INVERTER)}, + {"QUDA_CA_GCR_INVERTER", std::to_string(QUDA_CA_GCR_INVERTER)}, + {"QUDA_INVALID_INVERTER", std::to_string(QUDA_INVALID_INVERTER)}, + {"QUDA_MAT_SOLUTION", std::to_string(QUDA_MAT_SOLUTION)}, + {"QUDA_MATDAG_MAT_SOLUTION", std::to_string(QUDA_MATDAG_MAT_SOLUTION)}, + {"QUDA_MATPC_SOLUTION", std::to_string(QUDA_MATPC_SOLUTION)}, + {"QUDA_MATPC_DAG_SOLUTION", std::to_string(QUDA_MATPC_DAG_SOLUTION)}, + {"QUDA_MATPCDAG_MATPC_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SOLUTION)}, + {"QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION)}, + {"QUDA_INVALID_SOLUTION", std::to_string(QUDA_INVALID_SOLUTION)}, + {"QUDA_DIRECT_SOLVE", std::to_string(QUDA_DIRECT_SOLVE)}, + {"QUDA_NORMOP_SOLVE", std::to_string(QUDA_NORMOP_SOLVE)}, + {"QUDA_DIRECT_PC_SOLVE", std::to_string(QUDA_DIRECT_PC_SOLVE)}, + {"QUDA_NORMOP_PC_SOLVE", std::to_string(QUDA_NORMOP_PC_SOLVE)}, + {"QUDA_NORMERR_SOLVE", std::to_string(QUDA_NORMERR_SOLVE)}, + {"QUDA_NORMERR_PC_SOLVE", std::to_string(QUDA_NORMERR_PC_SOLVE)}, + {"QUDA_NORMEQ_SOLVE", std::to_string(QUDA_NORMEQ_SOLVE)}, + {"QUDA_NORMEQ_PC_SOLVE", std::to_string(QUDA_NORMEQ_PC_SOLVE)}, + {"QUDA_INVALID_SOLVE", std::to_string(QUDA_INVALID_SOLVE)}, + {"QUDA_MATPC_EVEN_EVEN", std::to_string(QUDA_MATPC_EVEN_EVEN)}, + {"QUDA_MATPC_ODD_ODD", std::to_string(QUDA_MATPC_ODD_ODD)}, + {"QUDA_MATPC_EVEN_EVEN_ASYMMETRIC", std::to_string(QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)}, + {"QUDA_MATPC_ODD_ODD_ASYMMETRIC", std::to_string(QUDA_MATPC_ODD_ODD_ASYMMETRIC)}, + {"QUDA_MATPC_INVALID", std::to_string(QUDA_MATPC_INVALID)}, + {"QUDA_DEFAULT_NORMALIZATION", std::to_string(QUDA_DEFAULT_NORMALIZATION)}, + {"QUDA_SOURCE_NORMALIZATION", std::to_string(QUDA_SOURCE_NORMALIZATION)}, + 
{"QUDA_QUARTER_PRECISION", std::to_string(QUDA_QUARTER_PRECISION)}, + {"QUDA_HALF_PRECISION", std::to_string(QUDA_HALF_PRECISION)}, + {"QUDA_SINGLE_PRECISION", std::to_string(QUDA_SINGLE_PRECISION)}, + {"QUDA_DOUBLE_PRECISION", std::to_string(QUDA_DOUBLE_PRECISION)}, + {"QUDA_INVALID_PRECISION", std::to_string(QUDA_INVALID_PRECISION)}, + {"QUDA_BOOLEAN_FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"QUDA_BOOLEAN_TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"QUDA_BOOLEAN_INVALID", std::to_string(QUDA_BOOLEAN_INVALID)}, + {"QUDA_COMPUTE_NULL_VECTOR_NO", std::to_string(QUDA_COMPUTE_NULL_VECTOR_NO)}, + {"QUDA_COMPUTE_NULL_VECTOR_YES", std::to_string(QUDA_COMPUTE_NULL_VECTOR_YES)}, + {"QUDA_COMPUTE_NULL_VECTOR_INVALID", std::to_string(QUDA_COMPUTE_NULL_VECTOR_INVALID)}, + {"QUDA_MG_CYCLE_VCYCLE", std::to_string(QUDA_MG_CYCLE_VCYCLE)}, + {"QUDA_MG_CYCLE_FCYCLE", std::to_string(QUDA_MG_CYCLE_FCYCLE)}, + {"QUDA_MG_CYCLE_WCYCLE", std::to_string(QUDA_MG_CYCLE_WCYCLE)}, + {"QUDA_MG_CYCLE_RECURSIVE", std::to_string(QUDA_MG_CYCLE_RECURSIVE)}, + {"QUDA_MG_CYCLE_INVALID", std::to_string(QUDA_MG_CYCLE_INVALID)}, + {"QUDA_CPU_FIELD_LOCATION", std::to_string(QUDA_CPU_FIELD_LOCATION)}, + {"QUDA_CUDA_FIELD_LOCATION", std::to_string(QUDA_CUDA_FIELD_LOCATION)}, + {"QUDA_INVALID_FIELD_LOCATION", std::to_string(QUDA_INVALID_FIELD_LOCATION)}, + {"QUDA_TWIST_SINGLET", std::to_string(QUDA_TWIST_SINGLET)}, + {"QUDA_TWIST_NONDEG_DOUBLET", std::to_string(QUDA_TWIST_NONDEG_DOUBLET)}, + {"QUDA_TWIST_NO", std::to_string(QUDA_TWIST_NO)}, + {"QUDA_TWIST_INVALID", std::to_string(QUDA_TWIST_INVALID)}, + {"QUDA_DAG_NO", std::to_string(QUDA_DAG_NO)}, + {"QUDA_DAG_YES", std::to_string(QUDA_DAG_YES)}, + {"QUDA_DAG_INVALID", std::to_string(QUDA_DAG_INVALID)}, + {"QUDA_KAPPA_NORMALIZATION", std::to_string(QUDA_KAPPA_NORMALIZATION)}, + {"QUDA_MASS_NORMALIZATION", std::to_string(QUDA_MASS_NORMALIZATION)}, + {"QUDA_ASYMMETRIC_MASS_NORMALIZATION", std::to_string(QUDA_ASYMMETRIC_MASS_NORMALIZATION)}, + 
{"QUDA_INVALID_NORMALIZATION", std::to_string(QUDA_INVALID_NORMALIZATION)}, + {"QUDA_PRESERVE_SOURCE_NO", std::to_string(QUDA_PRESERVE_SOURCE_NO)}, + {"QUDA_PRESERVE_SOURCE_YES", std::to_string(QUDA_PRESERVE_SOURCE_YES)}, + {"QUDA_PRESERVE_SOURCE_INVALID", std::to_string(QUDA_PRESERVE_SOURCE_INVALID)}, + {"QUDA_USE_INIT_GUESS_NO", std::to_string(QUDA_USE_INIT_GUESS_NO)}, + {"QUDA_USE_INIT_GUESS_YES", std::to_string(QUDA_USE_INIT_GUESS_YES)}, + {"QUDA_USE_INIT_GUESS_INVALID", std::to_string(QUDA_USE_INIT_GUESS_INVALID)}, + {"QUDA_SILENT", std::to_string(QUDA_SILENT)}, + {"QUDA_SUMMARIZE", std::to_string(QUDA_SUMMARIZE)}, + {"QUDA_VERBOSE", std::to_string(QUDA_VERBOSE)}, + {"QUDA_DEBUG_VERBOSE", std::to_string(QUDA_DEBUG_VERBOSE)}, + {"QUDA_INVALID_VERBOSITY", std::to_string(QUDA_INVALID_VERBOSITY)}, + {"QUDA_TUNE_NO", std::to_string(QUDA_TUNE_NO)}, + {"QUDA_TUNE_YES", std::to_string(QUDA_TUNE_YES)}, + {"QUDA_TUNE_INVALID", std::to_string(QUDA_TUNE_INVALID)}, + {"QUDA_POWER_BASIS", std::to_string(QUDA_POWER_BASIS)}, + {"QUDA_CHEBYSHEV_BASIS", std::to_string(QUDA_CHEBYSHEV_BASIS)}, + {"QUDA_INVALID_BASIS", std::to_string(QUDA_INVALID_BASIS)}, + {"QUDA_ADDITIVE_SCHWARZ", std::to_string(QUDA_ADDITIVE_SCHWARZ)}, + {"QUDA_MULTIPLICATIVE_SCHWARZ", std::to_string(QUDA_MULTIPLICATIVE_SCHWARZ)}, + {"QUDA_INVALID_SCHWARZ", std::to_string(QUDA_INVALID_SCHWARZ)}, + {"QUDA_MADWF_ACCELERATOR", std::to_string(QUDA_MADWF_ACCELERATOR)}, + {"QUDA_INVALID_ACCELERATOR", std::to_string(QUDA_INVALID_ACCELERATOR)}, + {"QUDA_L2_RELATIVE_RESIDUAL", std::to_string(QUDA_L2_RELATIVE_RESIDUAL)}, + {"QUDA_L2_ABSOLUTE_RESIDUAL", std::to_string(QUDA_L2_ABSOLUTE_RESIDUAL)}, + {"QUDA_HEAVY_QUARK_RESIDUAL", std::to_string(QUDA_HEAVY_QUARK_RESIDUAL)}, + {"QUDA_INVALID_RESIDUAL", std::to_string(QUDA_INVALID_RESIDUAL)}, + {"QUDA_NULL_VECTOR_SETUP", std::to_string(QUDA_NULL_VECTOR_SETUP)}, + {"QUDA_TEST_VECTOR_SETUP", std::to_string(QUDA_TEST_VECTOR_SETUP)}, + {"QUDA_INVALID_SETUP_TYPE", 
std::to_string(QUDA_INVALID_SETUP_TYPE)}, + {"QUDA_TRANSFER_AGGREGATE", std::to_string(QUDA_TRANSFER_AGGREGATE)}, + {"QUDA_TRANSFER_COARSE_KD", std::to_string(QUDA_TRANSFER_COARSE_KD)}, + {"QUDA_TRANSFER_OPTIMIZED_KD", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD)}, + {"QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG)}, + {"QUDA_TRANSFER_INVALID", std::to_string(QUDA_TRANSFER_INVALID)} + }; + + KeyValueStore kv; + kv.set_map(&enum_map); + kv.load(infile); + + if (param->verbosity >= QUDA_VERBOSE) { + kv.dump(); + } + + if (kv.get(section, "solver") != "QUDA") { + errorQuda("Solver section %s is not a quda-solver section\n", section); + } + + // both fields reside on the CPU + param->input_location = kv.get(section, "input_location", QUDA_CPU_FIELD_LOCATION); + param->output_location = kv.get(section, "output_location", QUDA_CPU_FIELD_LOCATION); + + param->inv_type = kv.get(section, "inv_type", param->inv_type); + param->kappa = kv.get(section, "kappa", param->kappa); + param->mu = kv.get(section, "mu", param->mu); + param->tm_rho = kv.get(section, "tm_rho", param->tm_rho); + param->epsilon = kv.get(section, "epsilon", param->epsilon); + param->twist_flavor = kv.get(section, "twist_flavor", param->twist_flavor); + param->laplace3D = kv.get(section, "laplace3D", param->laplace3D); + + /* Solver settings */ + param->tol = kv.get(section, "tol", param->tol); + param->tol_restart = kv.get(section, "tol_restart", param->tol_restart); + param->tol_hq = kv.get(section, "tol_hq", param->tol_hq); + + param->compute_true_res = kv.get(section, "compute_true_res", param->compute_true_res); + param->true_res = kv.get(section, "true_res", param->true_res); + param->true_res_hq = kv.get(section, "true_res_hq", param->true_res_hq); + param->maxiter = kv.get(section, "maxiter", param->maxiter); + param->reliable_delta = kv.get(section, "reliable_delta", param->reliable_delta); + param->reliable_delta_refinement = kv.get(section, 
"reliable_delta_refinement", param->reliable_delta_refinement); + param->use_alternative_reliable = kv.get(section, "use_alternative_reliable", param->use_alternative_reliable); + param->use_sloppy_partial_accumulator = kv.get(section, "use_sloppy_partial_accumulator", param->use_sloppy_partial_accumulator); + + param->solution_accumulator_pipeline = kv.get(section, "solution_accumulator_pipeline", param->solution_accumulator_pipeline); + + param->max_res_increase = kv.get(section, "max_res_increase", param->max_res_increase); + param->max_res_increase_total = kv.get(section, "max_res_increase_total", param->max_res_increase_total); + param->max_hq_res_increase = kv.get(section, "max_hq_res_increase", param->max_hq_res_increase); + param->max_hq_res_restart_total = kv.get(section, "max_hq_res_restart_total", param->max_hq_res_restart_total); + + param->heavy_quark_check = kv.get(section, "heavy_quark_check", param->heavy_quark_check); + + param->pipeline = kv.get(section, "pipeline", param->pipeline); + param->num_offset = kv.get(section, "num_offset", param->num_offset); + param->num_src = kv.get(section, "num_src", param->num_src); + param->num_src_per_sub_partition = kv.get(section, "num_src_per_sub_partition", param->num_src_per_sub_partition); + + param->split_grid[0] = kv.get(section, "split_grid[1]", param->split_grid[0]); + param->split_grid[1] = kv.get(section, "split_grid[2]", param->split_grid[1]); + param->split_grid[2] = kv.get(section, "split_grid[3]", param->split_grid[2]); + param->split_grid[3] = kv.get(section, "split_grid[0]", param->split_grid[3]); + + param->overlap = kv.get(section, "overlap", param->overlap); + + /*param->offset = kv.get(section, "offset", param->offset)[QUDA_MAX_MULTI_SHIFT]; + param->tol_offset = kv.get(section, "tol_offset", param->tol_offset)[QUDA_MAX_MULTI_SHIFT]; + param->tol_hq_offset = kv.get(section, "tol_hq_offset", param->tol_hq_offset)[QUDA_MAX_MULTI_SHIFT];*/ + + param->compute_action = kv.get(section, 
"compute_action", param->compute_action); + + param->solution_type = kv.get(section, "solution_type", param->solution_type); + param->solve_type = kv.get(section, "solve_type", param->solve_type); + param->matpc_type = kv.get(section, "matpc_type", param->matpc_type); + param->dagger = kv.get(section, "dagger", param->dagger); + param->mass_normalization = kv.get(section, "mass_normalization", param->mass_normalization); + param->solver_normalization = kv.get(section, "solver_normalization", param->solver_normalization); + + param->preserve_source = kv.get(section, "preserve_source", param->preserve_source); + + param->cpu_prec = kv.get(section, "cpu_prec", param->cpu_prec); + param->cuda_prec = kv.get(section, "cuda_prec", param->cuda_prec); + param->cuda_prec_sloppy = kv.get(section, "cuda_prec_sloppy", param->cuda_prec_sloppy); + param->cuda_prec_refinement_sloppy = kv.get(section, "cuda_prec_refinement_sloppy", param->cuda_prec_refinement_sloppy); + param->cuda_prec_precondition = kv.get(section, "cuda_prec_precondition", param->cuda_prec_precondition); + param->cuda_prec_eigensolver = kv.get(section, "cuda_prec_eigensolver", param->cuda_prec_eigensolver); + + param->clover_location = kv.get(section, "clover_location", param->clover_location); + param->clover_cpu_prec = kv.get(section, "clover_cpu_prec", param->clover_cpu_prec); + param->clover_cuda_prec = kv.get(section, "clover_cuda_prec", param->clover_cuda_prec); + param->clover_cuda_prec_sloppy = kv.get(section, "clover_cuda_prec_sloppy", param->clover_cuda_prec_sloppy); + param->clover_cuda_prec_refinement_sloppy = kv.get(section, "clover_cuda_prec_refinement_sloppy", param->clover_cuda_prec_refinement_sloppy); + param->clover_cuda_prec_precondition = kv.get(section, "clover_cuda_prec_precondition", param->clover_cuda_prec_precondition); + param->clover_cuda_prec_eigensolver = kv.get(section, "clover_cuda_prec_eigensolver", param->clover_cuda_prec_eigensolver); + + param->use_init_guess = kv.get(section, 
"use_init_guess", param->use_init_guess); + + param->clover_csw = kv.get(section, "clover_csw", param->clover_csw); + param->clover_coeff = kv.get(section, "clover_coeff", param->clover_coeff); + param->clover_rho = kv.get(section, "clover_rho", param->clover_rho); + param->compute_clover_trlog = kv.get(section, "compute_clover_trlog", param->compute_clover_trlog); + param->verbosity = kv.get(section, "verbosity", param->verbosity); + param->tune = kv.get(section, "tune", param->tune); + param->Nsteps = kv.get(section, "Nsteps", param->Nsteps); + param->gcrNkrylov = kv.get(section, "gcrNkrylov", param->gcrNkrylov); + + param->inv_type_precondition = kv.get(section, "inv_type_precondition", param->inv_type_precondition); + param->deflate = kv.get(section, "deflate", param->deflate); + param->verbosity_precondition = kv.get(section, "verbosity_precondition", param->verbosity_precondition); + param->tol_precondition = kv.get(section, "tol_precondition", param->tol_precondition); + param->maxiter_precondition = kv.get(section, "maxiter_precondition", param->maxiter_precondition); + param->omega = kv.get(section, "omega", param->omega); + param->ca_basis = kv.get(section, "ca_basis", param->ca_basis); + param->ca_lambda_min = kv.get(section, "ca_lambda_min", param->ca_lambda_min); + param->ca_lambda_max = kv.get(section, "ca_lambda_max", param->ca_lambda_max); + param->ca_basis_precondition = kv.get(section, "ca_basis_precondition", param->ca_basis_precondition); + param->ca_lambda_min_precondition = kv.get(section, "ca_lambda_min_precondition", param->ca_lambda_min_precondition); + param->ca_lambda_max_precondition = kv.get(section, "ca_lambda_max_precondition", param->ca_lambda_max_precondition); + param->precondition_cycle = kv.get(section, "precondition_cycle", param->precondition_cycle); + param->schwarz_type = kv.get(section, "schwarz_type", param->schwarz_type); + param->accelerator_type_precondition = kv.get(section, "accelerator_type_precondition", 
param->accelerator_type_precondition); + + param->madwf_diagonal_suppressor = kv.get(section, "madwf_diagonal_suppressor", param->madwf_diagonal_suppressor); + param->madwf_ls = kv.get(section, "madwf_ls", param->madwf_ls); + param->madwf_null_miniter = kv.get(section, "madwf_null_miniter", param->madwf_null_miniter); + param->madwf_null_tol = kv.get(section, "madwf_null_tol", param->madwf_null_tol); + param->madwf_train_maxiter = kv.get(section, "madwf_train_maxiter", param->madwf_train_maxiter); + + param->madwf_param_load = kv.get(section, "madwf_param_load", param->madwf_param_load); + param->madwf_param_save = kv.get(section, "madwf_param_save", param->madwf_param_save); + + /*param->madwf_param_infile = kv.get(section, "madwf_param_infile", param->madwf_param_infile); + param->madwf_param_outfile = kv.get(section, "madwf_param_outfile", param->madwf_param_outfile);*/ + + param->residual_type = kv.get(section, "residual_type", param->residual_type); + + if (param->inv_type_precondition == QUDA_MG_INVERTER) { + + std::string mg_section = std::string(section) + " Multigrid"; + + // (shallow) copy the struct + *invert_param_mg = *param; + + // these have to be fixed + invert_param_mg->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; + invert_param_mg->dirac_order = QUDA_DIRAC_ORDER; + + multigrid_param->n_level = kv.get(mg_section, "n_level", multigrid_param->n_level); + multigrid_param->setup_type = kv.get(mg_section, "setup_type", multigrid_param->setup_type); + multigrid_param->pre_orthonormalize = kv.get(mg_section, "pre_orthonormalize", multigrid_param->pre_orthonormalize); + multigrid_param->post_orthonormalize = kv.get(mg_section, "post_orthonormalize", multigrid_param->post_orthonormalize); + multigrid_param->setup_minimize_memory = kv.get(mg_section, "setup_minimize_memory", multigrid_param->setup_minimize_memory); + multigrid_param->compute_null_vector = kv.get(mg_section, "compute_null_vector", multigrid_param->compute_null_vector); + 
multigrid_param->generate_all_levels = kv.get(mg_section, "generate_all_levels", multigrid_param->generate_all_levels); + multigrid_param->run_verify = kv.get(mg_section, "run_verify", multigrid_param->run_verify); + multigrid_param->run_low_mode_check = kv.get(mg_section, "run_low_mode_check", multigrid_param->run_low_mode_check); + multigrid_param->run_oblique_proj_check = kv.get(mg_section, "run_oblique_proj_check", multigrid_param->run_oblique_proj_check); + multigrid_param->coarse_guess = kv.get(mg_section, "coarse_guess", multigrid_param->coarse_guess); + multigrid_param->preserve_deflation = kv.get(mg_section, "preserve_deflation", multigrid_param->preserve_deflation); + multigrid_param->allow_truncation = kv.get(mg_section, "allow_truncation", multigrid_param->allow_truncation); + multigrid_param->staggered_kd_dagger_approximation = kv.get(mg_section, "staggered_kd_dagger_approximation", multigrid_param->staggered_kd_dagger_approximation); + multigrid_param->use_mma = kv.get(mg_section, "use_mma", multigrid_param->use_mma); + multigrid_param->thin_update_only = kv.get(mg_section, "thin_update_only", multigrid_param->thin_update_only); + + for (int i=0; in_level; i++) { + std::string subsection = std::string(section) + " Multigrid Level " + std::to_string(i); + + multigrid_param->geo_block_size[i][0] = kv.get(subsection, "geo_block_size[1]", multigrid_param->geo_block_size[i][0]); + multigrid_param->geo_block_size[i][1] = kv.get(subsection, "geo_block_size[2]", multigrid_param->geo_block_size[i][1]); + multigrid_param->geo_block_size[i][2] = kv.get(subsection, "geo_block_size[3]", multigrid_param->geo_block_size[i][2]); + multigrid_param->geo_block_size[i][3] = kv.get(subsection, "geo_block_size[0]", multigrid_param->geo_block_size[i][3]); + + if (i==0) { + multigrid_param->geo_block_size[i][0] = 4; + multigrid_param->geo_block_size[i][1] = 4; + multigrid_param->geo_block_size[i][2] = 4; + multigrid_param->geo_block_size[i][3] = 4; + } + + 
multigrid_param->spin_block_size[i] = kv.get(subsection, "spin_block_size", multigrid_param->spin_block_size[i]); + multigrid_param->n_vec[i] = kv.get(subsection, "n_vec", multigrid_param->n_vec[i]); + multigrid_param->precision_null[i] = kv.get(subsection, "precision_null", multigrid_param->precision_null[i]); + multigrid_param->n_block_ortho[i] = kv.get(subsection, "n_block_ortho", multigrid_param->n_block_ortho[i]); + multigrid_param->block_ortho_two_pass[i] = kv.get(subsection, "block_ortho_two_pass", multigrid_param->block_ortho_two_pass[i]); + multigrid_param->verbosity[i] = kv.get(subsection, "verbosity", multigrid_param->verbosity[i]); + multigrid_param->setup_inv_type[i] = kv.get(subsection, "setup_inv_type", multigrid_param->setup_inv_type[i]); + multigrid_param->num_setup_iter[i] = kv.get(subsection, "num_setup_iter", multigrid_param->num_setup_iter[i]); + multigrid_param->setup_tol[i] = kv.get(subsection, "setup_tol", multigrid_param->setup_tol[i]); + multigrid_param->setup_maxiter[i] = kv.get(subsection, "setup_maxiter", multigrid_param->setup_maxiter[i]); + multigrid_param->setup_maxiter_refresh[i] = kv.get(subsection, "setup_maxiter_refresh", multigrid_param->setup_maxiter_refresh[i]); + multigrid_param->setup_ca_basis[i] = kv.get(subsection, "setup_ca_basis", multigrid_param->setup_ca_basis[i]); + multigrid_param->setup_ca_basis_size[i] = kv.get(subsection, "setup_ca_basis_size", multigrid_param->setup_ca_basis_size[i]); + multigrid_param->setup_ca_lambda_min[i] = kv.get(subsection, "setup_ca_lambda_min", multigrid_param->setup_ca_lambda_min[i]); + multigrid_param->setup_ca_lambda_max[i] = kv.get(subsection, "setup_ca_lambda_max", multigrid_param->setup_ca_lambda_max[i]); + + multigrid_param->coarse_solver[i] = kv.get(subsection, "coarse_solver", multigrid_param->coarse_solver[i]); + multigrid_param->coarse_solver_tol[i] = kv.get(subsection, "coarse_solver_tol", multigrid_param->coarse_solver_tol[i]); + multigrid_param->coarse_solver_maxiter[i] = 
kv.get(subsection, "coarse_solver_maxiter", multigrid_param->coarse_solver_maxiter[i]); + multigrid_param->coarse_solver_ca_basis[i] = kv.get(subsection, "coarse_solver_ca_basis", multigrid_param->coarse_solver_ca_basis[i]); + multigrid_param->coarse_solver_ca_basis_size[i] = kv.get(subsection, "coarse_solver_ca_basis_size", multigrid_param->coarse_solver_ca_basis_size[i]); + multigrid_param->coarse_solver_ca_lambda_min[i] = kv.get(subsection, "coarse_solver_ca_lambda_min", multigrid_param->coarse_solver_ca_lambda_min[i]); + multigrid_param->coarse_solver_ca_lambda_max[i] = kv.get(subsection, "coarse_solver_ca_lambda_max", multigrid_param->coarse_solver_ca_lambda_max[i]); + multigrid_param->smoother[i] = kv.get(subsection, "smoother", multigrid_param->smoother[i]); + multigrid_param->smoother_tol[i] = kv.get(subsection, "smoother_tol", multigrid_param->smoother_tol[i]); + multigrid_param->nu_pre[i] = kv.get(subsection, "nu_pre", multigrid_param->nu_pre[i]); + multigrid_param->nu_post[i] = kv.get(subsection, "nu_post", multigrid_param->nu_post[i]); + multigrid_param->smoother_solver_ca_basis[i] = kv.get(subsection, "smoother_solver_ca_basis", multigrid_param->smoother_solver_ca_basis[i]); + multigrid_param->smoother_solver_ca_lambda_min[i] = kv.get(subsection, "smoother_solver_ca_lambda_min", multigrid_param->smoother_solver_ca_lambda_min[i]); + multigrid_param->smoother_solver_ca_lambda_max[i] = kv.get(subsection, "smoother_solver_ca_lambda_max", multigrid_param->smoother_solver_ca_lambda_max[i]); + multigrid_param->omega[i] = kv.get(subsection, "omega", multigrid_param->omega[i]); + multigrid_param->smoother_halo_precision[i] = kv.get(subsection, "smoother_halo_precision", multigrid_param->smoother_halo_precision[i]); + multigrid_param->smoother_schwarz_type[i] = kv.get(subsection, "smoother_schwarz_type", multigrid_param->smoother_schwarz_type[i]); + multigrid_param->smoother_schwarz_cycle[i] = kv.get(subsection, "smoother_schwarz_cycle", 
multigrid_param->smoother_schwarz_cycle[i]); + multigrid_param->coarse_grid_solution_type[i] = kv.get(subsection, "coarse_grid_solution_type", multigrid_param->coarse_grid_solution_type[i]); + multigrid_param->smoother_solve_type[i] = kv.get(subsection, "smoother_solve_type", multigrid_param->smoother_solve_type[i]); + multigrid_param->cycle_type[i] = kv.get(subsection, "cycle_type", multigrid_param->cycle_type[i]); + multigrid_param->global_reduction[i] = kv.get(subsection, "global_reduction", multigrid_param->global_reduction[i]); + multigrid_param->location[i] = kv.get(subsection, "location", multigrid_param->location[i]); + multigrid_param->setup_location[i] = kv.get(subsection, "setup_location", multigrid_param->setup_location[i]); + multigrid_param->use_eig_solver[i] = kv.get(subsection, "use_eig_solver", multigrid_param->use_eig_solver[i]); + + /*multigrid_param->vec_load[i] = kv.get(subsection, "vec_load", multigrid_param->vec_load[i]); + multigrid_param->vec_infile[i] = kv.get(subsection, "vec_infile", multigrid_param->vec_infile[i]); + multigrid_param->vec_store[i] = kv.get(subsection, "vec_store", multigrid_param->vec_store[i]); + multigrid_param->vec_outfile[i] = kv.get(subsection, "vec_outfile", multigrid_param->vec_outfile[i]);*/ + + multigrid_param->mu_factor[i] = kv.get(subsection, "mu_factor", multigrid_param->mu_factor[i]); + multigrid_param->transfer_type[i] = kv.get(subsection, "transfer_type", multigrid_param->transfer_type[i]); + } + } + } + + // transfer of the struct to all the processes + MPI_Bcast((void*) param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast((void*) invert_param_mg, sizeof(*invert_param_mg), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast((void*) multigrid_param, sizeof(*multigrid_param), MPI_BYTE, 0, MPI_COMM_WORLD); + multigrid_param->invert_param = invert_param_mg; + + if (qudaState.layout.h_gauge != nullptr) { + openQCD_qudaGaugeLoad(qudaState.layout.h_gauge, QUDA_DOUBLE_PRECISION); + } + + if 
(qudaState.layout.dirac_parms.su3csw != 0.0) { + if (qudaState.layout.flds_parms.gauge == OPENQCD_GAUGE_SU3) { + /** + * Leaving both h_clover = h_clovinv = NULL allocates the clover field on + * the GPU and finally calls @createCloverQuda to calculate the clover + * field. + */ + loadCloverQuda(NULL, NULL, param); + } else { + /** + * Transfer the SW-field from openQCD. + */ + loadCloverQuda(qudaState.layout.h_sw, NULL, param); + } + } + + if (param->inv_type_precondition == QUDA_MG_INVERTER) { + PUSH_RANGE("newMultigridQuda",4); + mgprec = newMultigridQuda(multigrid_param); + param->preconditioner = mgprec; + POP_RANGE; + } + + checkInvertParam(param); + if (param->verbosity >= QUDA_DEBUG_VERBOSE) { + printQudaInvertParam(param); + } + + if (param->inv_type_precondition == QUDA_MG_INVERTER) { + checkMultigridParam(multigrid_param); + if (param->verbosity >= QUDA_DEBUG_VERBOSE) { + printQudaMultigridParam(multigrid_param); + } + } + + return (void*) param; +} + +double openQCD_qudaInvert(void *param, double mu, void *source, void *solution, int *status) +{ + QudaInvertParam* invert_param = static_cast(param); + invert_param->mu = mu; + + PUSH_RANGE("invertQUDA",5); + invertQuda(static_cast(solution), static_cast(source), invert_param); + POP_RANGE; + + if (invert_param->verbosity >= QUDA_VERBOSE) { + printfQuda("openQCD_qudaInvert()\n"); + printfQuda(" true_res = %.2e\n", invert_param->true_res); + printfQuda(" true_res_hq = %.2e\n", invert_param->true_res_hq); + printfQuda(" iter = %d\n", invert_param->iter); + printfQuda(" gflops = %.2e\n", invert_param->gflops); + printfQuda(" secs = %.2e\n", invert_param->secs); + } + + *status = invert_param->true_res <= invert_param->tol ? 
invert_param->iter : -1; + + return invert_param->true_res; +} + + +void openQCD_qudaSolverDestroy(void *param) +{ + QudaInvertParam* invert_param = static_cast(param); + + if (invert_param->inv_type_precondition == QUDA_MG_INVERTER) { + destroyMultigridQuda(invert_param->preconditioner); + } + + delete invert_param; +} + + double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param) { QudaInvertParam invert_param = newOpenQCDSolverParam(dirac_param); @@ -695,7 +1319,7 @@ double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracPara destroyMultigridQuda(mgprec); - printfQuda("true_res = %e\n", invert_param.true_res); + printfQuda("true_res = %.2e\n", invert_param.true_res); printfQuda("true_res_hq = %.2e\n", invert_param.true_res_hq); printfQuda("iter = %d\n", invert_param.iter); printfQuda("gflops = %.2e\n", invert_param.gflops); From 3ad9b063c277c2eae1d64f78d1795647f309b08c Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 18 Oct 2023 17:34:41 +0200 Subject: [PATCH 097/148] works with QCD+QED (clover term is transfered always) --- lib/openqcd_interface.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index cbabd91123..3a23fdd600 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -791,9 +791,11 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->clover_location = QUDA_CUDA_FIELD_LOCATION; // seems to have no effect? param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; - param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; // what implication has this? - param->compute_clover = true; + //param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; // what implication has this? 
+ //param->compute_clover = true; + param->clover_order = QUDA_OPENQCD_CLOVER_ORDER; + param->clover_csw = qudaState.layout.dirac_parms.su3csw; param->clover_coeff = 0.0; @@ -921,6 +923,9 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) kv.set_map(&enum_map); kv.load(infile); + param->verbosity = kv.get(section, "verbosity", param->verbosity); + setVerbosity(param->verbosity); + if (param->verbosity >= QUDA_VERBOSE) { kv.dump(); } @@ -1012,7 +1017,6 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->clover_coeff = kv.get(section, "clover_coeff", param->clover_coeff); param->clover_rho = kv.get(section, "clover_rho", param->clover_rho); param->compute_clover_trlog = kv.get(section, "compute_clover_trlog", param->compute_clover_trlog); - param->verbosity = kv.get(section, "verbosity", param->verbosity); param->tune = kv.get(section, "tune", param->tune); param->Nsteps = kv.get(section, "Nsteps", param->Nsteps); param->gcrNkrylov = kv.get(section, "gcrNkrylov", param->gcrNkrylov); @@ -1154,7 +1158,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) } if (qudaState.layout.dirac_parms.su3csw != 0.0) { - if (qudaState.layout.flds_parms.gauge == OPENQCD_GAUGE_SU3) { + if (false && qudaState.layout.flds_parms.gauge == OPENQCD_GAUGE_SU3) { /** * Leaving both h_clover = h_clovinv = NULL allocates the clover field on * the GPU and finally calls @createCloverQuda to calculate the clover @@ -1165,7 +1169,9 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) /** * Transfer the SW-field from openQCD. 
*/ - loadCloverQuda(qudaState.layout.h_sw, NULL, param); + printfQuda("loading Clover field\n"); + openQCD_qudaCloverLoad(qudaState.layout.h_sw, param->kappa, param->clover_csw); + //loadCloverQuda(qudaState.layout.h_sw, NULL, param); } } From 36251b5de28af19cf63416ee01575d5f85897cc3 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 18 Oct 2023 19:19:50 +0200 Subject: [PATCH 098/148] finally transfer clover only when needed (ie QCD+QED) --- include/quda_openqcd_interface.h | 7 ++--- lib/openqcd_interface.cpp | 52 +++++++++++++++++++++----------- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 9313640b7f..207b0732b5 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -78,8 +78,8 @@ typedef struct { bc_parms_t bc_parms; dirac_parms_t dirac_parms; flds_parms_t flds_parms; - void *h_gauge; - void *h_sw; + void* (*h_gauge)(void); + void* (*h_sw)(void); } openQCD_QudaLayout_t; @@ -115,9 +115,6 @@ typedef struct { double su3csw; /* su3csw: csw coefficient for SU(3) fields */ double u1csw; /* u1csw: csw coefficient for U(1) fields, quda doesn't respect that parameter (yet) */ int qhat; /* qhat: quda doesn't respect that parameter (yet) */ - int dagger; /* dagger: whether to apply D or D^dagger */ - void *h_gauge; - void *h_sw; } openQCD_QudaDiracParam_t; diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 3a23fdd600..637210fc15 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -491,7 +491,7 @@ static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) param.dslash_type = QUDA_WILSON_DSLASH; param.kappa = p.kappa; param.mu = p.mu; - param.dagger = p.dagger ? QUDA_DAG_YES : QUDA_DAG_NO; + param.dagger = QUDA_DAG_NO; if (p.su3csw != 0.0) { param.clover_location = QUDA_CUDA_FIELD_LOCATION; // seems to have no effect? 
@@ -792,15 +792,18 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; - //param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; // what implication has this? - //param->compute_clover = true; - param->clover_order = QUDA_OPENQCD_CLOVER_ORDER; - param->clover_csw = qudaState.layout.dirac_parms.su3csw; param->clover_coeff = 0.0; // Set to Wilson Dirac operator with Clover term param->dslash_type = QUDA_CLOVER_WILSON_DSLASH; + + if (qudaState.layout.flds_parms.gauge == OPENQCD_GAUGE_SU3) { + param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; // what implication has this? + param->compute_clover = true; + } else { + param->clover_order = QUDA_OPENQCD_CLOVER_ORDER; + } } if (my_rank == 0) { @@ -1154,24 +1157,38 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) multigrid_param->invert_param = invert_param_mg; if (qudaState.layout.h_gauge != nullptr) { - openQCD_qudaGaugeLoad(qudaState.layout.h_gauge, QUDA_DOUBLE_PRECISION); + logQuda(QUDA_VERBOSE, "Loading gauge field from openQCD ...\n"); + PUSH_RANGE("openQCD_qudaGaugeLoad",3); + openQCD_qudaGaugeLoad(qudaState.layout.h_gauge(), QUDA_DOUBLE_PRECISION); + POP_RANGE; } if (qudaState.layout.dirac_parms.su3csw != 0.0) { - if (false && qudaState.layout.flds_parms.gauge == OPENQCD_GAUGE_SU3) { + if (qudaState.layout.flds_parms.gauge == OPENQCD_GAUGE_SU3) { /** * Leaving both h_clover = h_clovinv = NULL allocates the clover field on * the GPU and finally calls @createCloverQuda to calculate the clover * field. */ + logQuda(QUDA_VERBOSE, "Generating clover field in QUDA ...\n"); + PUSH_RANGE("loadCloverQuda",3); loadCloverQuda(NULL, NULL, param); + POP_RANGE; } else { /** * Transfer the SW-field from openQCD. 
*/ - printfQuda("loading Clover field\n"); - openQCD_qudaCloverLoad(qudaState.layout.h_sw, param->kappa, param->clover_csw); - //loadCloverQuda(qudaState.layout.h_sw, NULL, param); + logQuda(QUDA_VERBOSE, "Loading clover field from openQCD ...\n"); + PUSH_RANGE("openQCD_qudaCloverLoad",3); + openQCD_qudaCloverLoad(qudaState.layout.h_sw(), param->kappa, param->clover_csw); + POP_RANGE; + + //loadCloverQuda(qudaState.layout.h_sw(), NULL, param); + // The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why? + //QUDA: ERROR: qudaEventRecord_ returned CUDA_ERROR_ILLEGAL_ADDRESS + // (timer.h:82 in start()) + // (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const char*, const char*, const char*, bool)()) + //QUDA: last kernel called was (name=N4quda10CopyCloverINS_6clover11FloatNOrderIdLi72ELi2ELb0ELb1ELb0EEENS1_12OpenQCDOrderIdLi72EEEddEE,volume=32x16x16x64,aux=GPU-offline,vol=524288precision=8Nc=3,compute_diagonal) } } @@ -1202,17 +1219,18 @@ double openQCD_qudaInvert(void *param, double mu, void *source, void *solution, QudaInvertParam* invert_param = static_cast(param); invert_param->mu = mu; - PUSH_RANGE("invertQUDA",5); + logQuda(QUDA_VERBOSE, "Calling invertQuda() ...\n"); + PUSH_RANGE("invertQuda",5); invertQuda(static_cast(solution), static_cast(source), invert_param); POP_RANGE; if (invert_param->verbosity >= QUDA_VERBOSE) { - printfQuda("openQCD_qudaInvert()\n"); - printfQuda(" true_res = %.2e\n", invert_param->true_res); - printfQuda(" true_res_hq = %.2e\n", invert_param->true_res_hq); - printfQuda(" iter = %d\n", invert_param->iter); - printfQuda(" gflops = %.2e\n", invert_param->gflops); - printfQuda(" secs = %.2e\n", invert_param->secs); + logQuda(QUDA_VERBOSE, "openQCD_qudaInvert()\n"); + logQuda(QUDA_VERBOSE, " true_res = %.2e\n", invert_param->true_res); + logQuda(QUDA_VERBOSE, " true_res_hq = %.2e\n", invert_param->true_res_hq); + logQuda(QUDA_VERBOSE, " 
iter = %d\n", invert_param->iter); + logQuda(QUDA_VERBOSE, " gflops = %.2e\n", invert_param->gflops); + logQuda(QUDA_VERBOSE, " secs = %.2e\n", invert_param->secs); } *status = invert_param->true_res <= invert_param->tol ? invert_param->iter : -1; From a0d86223a457dccadf7639897b2d724632baf99b Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Thu, 19 Oct 2023 12:41:16 +0200 Subject: [PATCH 099/148] added remaining settings --- lib/openqcd_interface.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 637210fc15..8e61a40c03 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -984,9 +984,14 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->overlap = kv.get(section, "overlap", param->overlap); - /*param->offset = kv.get(section, "offset", param->offset)[QUDA_MAX_MULTI_SHIFT]; - param->tol_offset = kv.get(section, "tol_offset", param->tol_offset)[QUDA_MAX_MULTI_SHIFT]; - param->tol_hq_offset = kv.get(section, "tol_hq_offset", param->tol_hq_offset)[QUDA_MAX_MULTI_SHIFT];*/ + for(int i=0; inum_offset; i++) { + std::string sub_key = "offset[" + std::to_string(i) + "]"; + param->offset[i] = kv.get(section, sub_key, param->offset[i]); + sub_key = "tol_offset[" + std::to_string(i) + "]"; + param->tol_offset[i] = kv.get(section, sub_key, param->tol_offset[i]); + sub_key = "tol_hq_offset[" + std::to_string(i) + "]"; + param->tol_hq_offset[i] = kv.get(section, sub_key, param->tol_hq_offset[i]); + } param->compute_action = kv.get(section, "compute_action", param->compute_action); @@ -1048,9 +1053,8 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->madwf_param_load = kv.get(section, "madwf_param_load", param->madwf_param_load); param->madwf_param_save = kv.get(section, "madwf_param_save", param->madwf_param_save); - - /*param->madwf_param_infile = kv.get(section, "madwf_param_infile", param->madwf_param_infile); - 
param->madwf_param_outfile = kv.get(section, "madwf_param_outfile", param->madwf_param_outfile);*/ + strcpy(param->madwf_param_infile, kv.get(section, "madwf_param_infile", param->madwf_param_infile).c_str()); + strcpy(param->madwf_param_outfile, kv.get(section, "madwf_param_outfile", param->madwf_param_outfile).c_str()); param->residual_type = kv.get(section, "residual_type", param->residual_type); @@ -1139,10 +1143,10 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) multigrid_param->setup_location[i] = kv.get(subsection, "setup_location", multigrid_param->setup_location[i]); multigrid_param->use_eig_solver[i] = kv.get(subsection, "use_eig_solver", multigrid_param->use_eig_solver[i]); - /*multigrid_param->vec_load[i] = kv.get(subsection, "vec_load", multigrid_param->vec_load[i]); - multigrid_param->vec_infile[i] = kv.get(subsection, "vec_infile", multigrid_param->vec_infile[i]); + multigrid_param->vec_load[i] = kv.get(subsection, "vec_load", multigrid_param->vec_load[i]); multigrid_param->vec_store[i] = kv.get(subsection, "vec_store", multigrid_param->vec_store[i]); - multigrid_param->vec_outfile[i] = kv.get(subsection, "vec_outfile", multigrid_param->vec_outfile[i]);*/ + strcpy(multigrid_param->vec_infile[i], kv.get(subsection, "vec_infile", multigrid_param->vec_infile[i]).c_str()); + strcpy(multigrid_param->vec_outfile[i], kv.get(subsection, "vec_outfile", multigrid_param->vec_outfile[i]).c_str()); multigrid_param->mu_factor[i] = kv.get(subsection, "mu_factor", multigrid_param->mu_factor[i]); multigrid_param->transfer_type[i] = kv.get(subsection, "transfer_type", multigrid_param->transfer_type[i]); From ec3e911a868a90bab5762bcadab02cc80bd92e24 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 1 Nov 2023 16:00:02 +0100 Subject: [PATCH 100/148] solved a bug that prevented reordering on CPU to work --- lib/color_spinor_field.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/color_spinor_field.cpp 
b/lib/color_spinor_field.cpp index 4bf7457584..7c42045bbd 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -517,7 +517,8 @@ namespace quda if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // reorder on the host void *buffer = pool_pinned_malloc(bytes); - qudaMemcpy(buffer, v, bytes, qudaMemcpyDefault); + // this is a bug: v should be src.v, else reordering onCPU doesn't work + qudaMemcpy(buffer, src.v, bytes, qudaMemcpyDefault); copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION, 0, buffer); pool_pinned_free(buffer); From 86ff1d559849527c36fde61f5c82d4015a375d76 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 1 Nov 2023 16:00:43 +0100 Subject: [PATCH 101/148] in case of openQCD gauge fields, allocate and transfer 4*VOLUME + 7*BNDRY/4 su3 matrices --- lib/gauge_field.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 1181ecb733..6e4cec077c 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -97,6 +97,25 @@ namespace quda { phase_offset = half_gauge_bytes; phase_bytes = half_phase_bytes*2; bytes = (half_gauge_bytes + half_phase_bytes)*2; + } else if (order == QUDA_OPENQCD_GAUGE_ORDER) { + /** + * With an openQCD gauge field, we need all the links of even lattice + * points in positive direction. These are links that lie in the buffer + * space that spans 7*BNDRY/4 gauge fields. These boundary fields are + * located at base_ptr + 4*VOLUME. Therefore, we need to transfer more + * than 4*VOLUME matrices. 
+ */ + + /* analogue to BNDRY in openQCD:include/global.h */ + long int bndry = 0; + bndry += (1-(comm_dim(0)%2))*x[1]*x[2]*x[3]; + bndry += (1-(comm_dim(1)%2))*x[0]*x[2]*x[3]; + bndry += (1-(comm_dim(2)%2))*x[0]*x[1]*x[3]; + bndry += (1-(comm_dim(3)%2))*x[0]*x[1]*x[2]; + bndry *= 2; + + length += 18*7*bndry/4; + bytes = length * precision; } else { bytes = length * precision; if (isNative()) bytes = 2*ALIGNMENT_ADJUST(bytes/2); From b8bc138634074af7caf58ab2c77e9b09adf673f9 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 1 Nov 2023 16:02:17 +0100 Subject: [PATCH 102/148] reordering of gauge field done purely within QUDA (only load, not store) --- include/color_spinor_field_order.h | 143 ++++++++------- include/gauge_field_order.h | 267 ++++++++++++++++++++++++++++- 2 files changed, 340 insertions(+), 70 deletions(-) diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 6cf52ed0ff..b774f78eb7 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1735,22 +1735,81 @@ namespace quda int volumeCB; int faceVolumeCB[4]; int nParity; - const int L[4]; - const int rank; + const int L[4]; // xyzt convention + const int L_[4]; // txyz convention + const int volume; + const int cbs[4]; // openQCDs cache block size + const int cbn[4]; // openQCDs cache block grid OpenQCDDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) : field(field_ ? field_ : (Float *)a.V()), offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for?? 
volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), - L {a.X()[0], a.X()[1], a.X()[2], a.X()[3]}, // local dimensions (xyzt) - rank(comm_rank()) + L {a.X()[0], a.X()[1], a.X()[2], a.X()[3]}, // *local* lattice dimensions, xyzt + L_ {a.X()[3], a.X()[0], a.X()[1], a.X()[2]}, // *local* lattice dimensions, txyz + volume(L_[0]*L_[1]*L_[2]*L_[3]), // *local* lattice volume + cbs {setup_cbs(0, L_), setup_cbs(1, L_), setup_cbs(2, L_), setup_cbs(3, L_)}, // txyz + cbn {L_[0]/cbs[0], L_[1]/cbs[1], L_[2]/cbs[2], L_[3]/cbs[3]} // txyz { if constexpr (length != 24) { errorQuda("Spinor field length %d not supported", length); } } + __device__ __host__ inline int setup_cbs(const int mu, const int *X) const + { + if (mu==0) { + return X[0]; + } else if ((X[mu]%4)==0) { + return 4; + } else if ((X[mu]%3)==0) { + return 3; + } else if ((X[mu]%2)==0) { + return 2; + } else { + return 1; + } + } + + /** + * @brief Rotate coordinates (xyzt -> txyz) + * + * @param[in] x_quda Cartesian local lattice coordinates in quda + * convention (xyzt) + * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD + * convention (txyz) + */ + __device__ __host__ inline void rotate_coords(const int *x_quda, int *x_openQCD) const + { + x_openQCD[1] = x_quda[0]; + x_openQCD[2] = x_quda[1]; + x_openQCD[3] = x_quda[2]; + x_openQCD[0] = x_quda[3]; + } + + /** + * @brief Generate a lexicographical index with x[Ndims-1] running + * fastest, for example if Ndims=4: + * ix = X3*X2*X1*x0 + X3*X2*x1 + X3*x2 + x3. 
+ * + * @param[in] x Integer array of dimension Ndims with coordinates + * @param[in] X Integer array of dimension Ndims with extents + * @param[in] Ndims The number of dimensions + * + * @return Lexicographical index + */ + __device__ __host__ inline int lexi(const int *x, const int *X, const int Ndims) const + { + int i, ix = x[0]; + + #pragma unroll + for (i=1; i txyz) - * - * @param[in] x_quda Cartesian local lattice coordinates in quda - * convention (xyzt) - * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD - * convention (txyz) - */ - __device__ __host__ inline void rotate_coords(const int *x_quda, int *x_openQCD) const - { - x_openQCD[1] = x_quda[0]; - x_openQCD[2] = x_quda[1]; - x_openQCD[3] = x_quda[2]; - x_openQCD[0] = x_quda[3]; + return ( + lexi(xb, cbs, 4)/2 + + cbs[0]*cbs[1]*cbs[2]*cbs[3]*lexi(xn, cbn, 4)/2 + + ((x[0]+x[1]+x[2]+x[3]) % 2 != 0)*(volume/2) /* odd -> +VOLUME/2 */ + ); } /** @@ -1828,26 +1852,23 @@ namespace quda * * @return The offset. */ - __device__ __host__ inline int getSpinorOffset(int x, int parity) const + __device__ __host__ inline int getSpinorOffset(int x_cb, int parity) const { - int coord_quda[4], coord_openQCD[4]; - - /* coord_quda contains xyzt local Carthesian corrdinates */ - getCoords(coord_quda, x, L, parity); - rotate_coords(coord_quda, coord_openQCD); /* xyzt -> txyz */ - - return ipt(coord_openQCD)*length; + int x_quda[4], x[4]; + getCoords(x_quda, x_cb, L, parity); // x_quda contains xyzt local Carthesian corrdinates + rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate + return ipt(x)*length; } - __device__ __host__ inline void load(complex v[length/2], int x, int parity = 0) const + __device__ __host__ inline void load(complex v[length/2], int x_cb, int parity = 0) const { - auto in = &field[getSpinorOffset(x, parity)]; + auto in = &field[getSpinorOffset(x_cb, parity)]; block_load(v, reinterpret_cast(in)); } - __device__ __host__ inline void save(const 
complex v[length/2], int x, int parity = 0) const + __device__ __host__ inline void save(const complex v[length/2], int x_cb, int parity = 0) const { - auto out = &field[getSpinorOffset(x, parity)]; + auto out = &field[getSpinorOffset(x_cb, parity)]; block_store(reinterpret_cast(out), v); } diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 12657973b8..7e89d76fb0 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -23,6 +23,7 @@ #include #include #include +#include // TODO: The ipt functions can be incorporated here (so no reordering needed in OpenQXD side) // OpenQxD helpers: @@ -2327,22 +2328,235 @@ namespace quda { Float *gauge; const int volumeCB; static constexpr int Nc = 3; - const int L[4]; + const int L[4]; // xyzt convention + const int L_[4]; // txyz convention + const int volume; + const int nproc[4]; + const int face[4]; + const int bndry; + const int ifc[4]; + const int face_offset[4]; + const int cbs[4]; // openQCDs cache block size + const int cbn[4]; // openQCDs cache block grid OpenQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), gauge(gauge_ ? 
gauge_ : (Float *)u.Gauge_p()), // pointer to the gauge field on CPU volumeCB(u.VolumeCB()), // Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice - L {u.X()[0], u.X()[1], u.X()[2], u.X()[3]} // initialized dim with *local* lattice dimensions + L {u.X()[0], u.X()[1], u.X()[2], u.X()[3]}, // *local* lattice dimensions, xyzt + L_ {u.X()[3], u.X()[0], u.X()[1], u.X()[2]}, // *local* lattice dimensions, txyz + volume(L_[0]*L_[1]*L_[2]*L_[3]), // *local* lattice volume + nproc {comm_dim(3), comm_dim(0), comm_dim(1), comm_dim(2)}, // txyz + face {((1-(nproc[0]%2))*L_[1]*L_[2]*L_[3]), + ((1-(nproc[1]%2))*L_[2]*L_[3]*L_[0]), + ((1-(nproc[2]%2))*L_[3]*L_[0]*L_[1]), + ((1-(nproc[3]%2))*L_[0]*L_[1]*L_[2])}, // txyz + bndry(2*(face[0]+face[1]+face[2]+face[3])), + ifc {(face[0]/2), + face[0] + (face[1]/2), + face[0] + face[1] + (face[2]/2), + face[0] + face[1] + face[2] + (face[3]/2)}, // txyz + face_offset {0, face[0]/2, face[0]/2 + face[1]/2, face[0]/2 + face[1]/2 + face[2]/2}, //txyz + cbs {setup_cbs(0, L_), setup_cbs(1, L_), setup_cbs(2, L_), setup_cbs(3, L_)}, // txyz + cbn {L_[0]/cbs[0], L_[1]/cbs[1], L_[2]/cbs[2], L_[3]/cbs[3]} // txyz { if constexpr (length != 18) { errorQuda("Gauge field length %d not supported", length); } } + __device__ __host__ inline int setup_cbs(const int mu, const int *X) const + { + if (mu==0) { + return X[0]; + } else if ((X[mu]%4)==0) { + return 4; + } else if ((X[mu]%3)==0) { + return 3; + } else if ((X[mu]%2)==0) { + return 2; + } else { + return 1; + } + } + + /** + * @brief Rotate coordinates (xyzt -> txyz) + * + * @param[in] x_quda Cartesian local lattice coordinates in quda + * convention (xyzt) + * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD + * convention (txyz) + */ + __device__ __host__ inline void rotate_coords(const int *x_quda, int *x_openQCD) const + { + x_openQCD[1] = x_quda[0]; + x_openQCD[2] = x_quda[1]; + x_openQCD[3] = x_quda[2]; + x_openQCD[0] = x_quda[3]; + } + + 
/** + * @brief Generate a lexicographical index with x[Ndims-1] running + * fastest, for example if Ndims=4: + * ix = X3*X2*X1*x0 + X3*X2*x1 + X3*x2 + x3. + * + * @param[in] x Integer array of dimension Ndims with coordinates + * @param[in] X Integer array of dimension Ndims with extents + * @param[in] Ndims The number of dimensions + * + * @return Lexicographical index + */ + __device__ __host__ inline int lexi(const int *x, const int *X, const int Ndims) const + { + int i, ix = x[0]; + + #pragma unroll + for (i=1; i +VOLUME/2 */ + ); + } + + /** + * @brief Pure implementation of iup[ix][mu] + * + * @param[in] x Cartesian local lattice corrdinates, 0 <= x[i] < Li, length + * 4 in txyz convention + * @param[in] mu Direction in txyz convention + * + * @return iup[ix][mu] + */ + __device__ __host__ inline int iup(const int *x, const int mu) const + { + int i, ret, xb[4], xn[4]; + + if ((x[mu]==(L_[mu]-1))&&(nproc[mu]>1)) { + + xb[0] = x[0] % cbs[0]; + xb[1] = x[1] % cbs[1]; + xb[2] = x[2] % cbs[2]; + xb[3] = x[3] % cbs[3]; + + xn[0] = x[0]/cbs[0]; + xn[1] = x[1]/cbs[1]; + xn[2] = x[2]/cbs[2]; + xn[3] = x[3]/cbs[3]; + + ret = volume + ifc[mu]; + if ((x[0]+x[1]+x[2]+x[3]) % 2 == 0) { + ret += bndry/2; + } + + ret += surface(cbs, mu)*boundary_pts(mu, xn, cbn)/2; + ret += boundary_pts(mu, xb, cbs)/2; + return ret; + + } else { + #pragma unroll + for (i=0; i<4; i++) { + xb[i] = x[i]; + } + + xb[mu] = (xb[mu] + 1) % (L_[mu]*nproc[mu]); + return ipt(xb); + } + } + + /** + * @brief Obtains the offset in Floats from the openQCD base pointer + * to the gauge fields. At this point, fields are already * reordered with a xyzt-lexicographical spacetime index, so * nothing special to do here. * @@ -2352,22 +2566,57 @@ namespace quda { * * @return The offset. 
*/ - __device__ __host__ inline int getGaugeOffset(int x, int dir, int parity) const { + __device__ __host__ inline int getGaugeOffset_old(int x, int dir, int parity) const { int coord[4]; getCoords(coord, x, L, parity); int idx = coord[3] + L[3]*coord[2] + L[3]*L[2]*coord[1] + L[3]*L[2]*L[1]*coord[0]; return (4*idx + dir)*length; } - __device__ __host__ inline void load(complex v[length/2], int x, int dir, int parity, Float = 1.0) const + + /** + * @brief Obtains the offset in Floats from the openQCD base pointer + * to the gauge fields. + * + * @param[in] x_cb Checkerboard index coming from quda + * @param[in] dir The direction coming from quda + * @param[in] parity The parity coming from quda + * + * @return The offset. + */ + __device__ __host__ inline int getGaugeOffset(int x_cb, int dir, int parity) const { + int quda_x[4], x[4]; + getCoords(quda_x, x_cb, L, parity); // x_quda = quda local lattice coordinates + rotate_coords(quda_x, x); // x = openQCD local lattice coordinates + + int mu = (dir+1) % 4; // mu = openQCD direction + int ix = ipt(x); + int iz = iup(x, mu); + int ofs = 0; + + if (ix < volume/2) { // ix even -> iz odd + if (iz < volume) { // iz in interior + ofs = 8*(iz - volume/2) + 2*mu + 1; + } else { + int ib = iz - volume - ifc[mu] - bndry/2; // iz in exterior + ofs = 4*volume + face_offset[mu] + ib; + } + } else if (volume/2 <= ix && ix < volume) { // ix odd + ofs = 8*(ix - volume/2) + 2*mu; + } + + return ofs*length; + } + + __device__ __host__ inline void load(complex v[length/2], int x_cb, int dir, int parity, Float = 1.0) const { - auto in = &gauge[getGaugeOffset(x, dir, parity)]; + auto in = &gauge[getGaugeOffset(x_cb, dir, parity)]; block_load(v, reinterpret_cast(in)); } - __device__ __host__ inline void save(const complex v[length/2], int x, int dir, int parity) const + __device__ __host__ inline void save(const complex v[length/2], int x_cb, int dir, int parity) const { - auto out = &gauge[getGaugeOffset(x, dir, parity)]; + auto out = 
&gauge[getGaugeOffset_old(x_cb, dir, parity)]; block_store(reinterpret_cast(out), v); } From 1ce6b194fd7ebe5e8866e24a30f8d696824d46e9 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 1 Nov 2023 16:03:07 +0100 Subject: [PATCH 103/148] changed gauge loading and added eigensolver interface (not working yet) --- include/quda_openqcd_interface.h | 6 + lib/openqcd_interface.cpp | 403 ++++++++++++++++++++----------- 2 files changed, 272 insertions(+), 137 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 207b0732b5..e14275b5e8 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -92,6 +92,7 @@ typedef struct { FILE *logfile; /** log file handler */ void *gauge; /** base pointer to the gauge fields */ int volume; /** VOLUME */ + int bndry; /** BNDRY */ void (*reorder_gauge_openqcd_to_quda)(void *in, void *out); void (*reorder_gauge_quda_to_openqcd)(void *in, void *out); void (*reorder_spinor_openqcd_to_quda)(void *in, void *out); @@ -274,6 +275,11 @@ double openQCD_qudaInvert(void *param, double mu, void *source, void *solution, void openQCD_qudaSolverDestroy(void *param); +void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_section); +void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals); +void openQCD_qudaEigensolverDestroy(void *param); + + /** * @brief Wrapper for the plaquette. 
We could call plaqQuda() directly in * openQCD, but we have to make sure manually that the gauge field diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 8e61a40c03..a94505680d 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -85,6 +85,140 @@ template void inline qudaopenqcd_called(const char *func, QudaVerbo template void inline qudaopenqcd_called(const char *func) { qudaopenqcd_called(func, getVerbosity()); } +std::unordered_map enum_map = { + {"QUDA_CG_INVERTER", std::to_string(QUDA_CG_INVERTER)}, + {"QUDA_BICGSTAB_INVERTER", std::to_string(QUDA_BICGSTAB_INVERTER)}, + {"QUDA_GCR_INVERTER", std::to_string(QUDA_GCR_INVERTER)}, + {"QUDA_MR_INVERTER", std::to_string(QUDA_MR_INVERTER)}, + {"QUDA_SD_INVERTER", std::to_string(QUDA_SD_INVERTER)}, + {"QUDA_PCG_INVERTER", std::to_string(QUDA_PCG_INVERTER)}, + {"QUDA_EIGCG_INVERTER", std::to_string(QUDA_EIGCG_INVERTER)}, + {"QUDA_INC_EIGCG_INVERTER", std::to_string(QUDA_INC_EIGCG_INVERTER)}, + {"QUDA_GMRESDR_INVERTER", std::to_string(QUDA_GMRESDR_INVERTER)}, + {"QUDA_GMRESDR_PROJ_INVERTER", std::to_string(QUDA_GMRESDR_PROJ_INVERTER)}, + {"QUDA_GMRESDR_SH_INVERTER", std::to_string(QUDA_GMRESDR_SH_INVERTER)}, + {"QUDA_FGMRESDR_INVERTER", std::to_string(QUDA_FGMRESDR_INVERTER)}, + {"QUDA_MG_INVERTER", std::to_string(QUDA_MG_INVERTER)}, + {"QUDA_BICGSTABL_INVERTER", std::to_string(QUDA_BICGSTABL_INVERTER)}, + {"QUDA_CGNE_INVERTER", std::to_string(QUDA_CGNE_INVERTER)}, + {"QUDA_CGNR_INVERTER", std::to_string(QUDA_CGNR_INVERTER)}, + {"QUDA_CG3_INVERTER", std::to_string(QUDA_CG3_INVERTER)}, + {"QUDA_CG3NE_INVERTER", std::to_string(QUDA_CG3NE_INVERTER)}, + {"QUDA_CG3NR_INVERTER", std::to_string(QUDA_CG3NR_INVERTER)}, + {"QUDA_CA_CG_INVERTER", std::to_string(QUDA_CA_CG_INVERTER)}, + {"QUDA_CA_CGNE_INVERTER", std::to_string(QUDA_CA_CGNE_INVERTER)}, + {"QUDA_CA_CGNR_INVERTER", std::to_string(QUDA_CA_CGNR_INVERTER)}, + {"QUDA_CA_GCR_INVERTER", std::to_string(QUDA_CA_GCR_INVERTER)}, + 
{"QUDA_INVALID_INVERTER", std::to_string(QUDA_INVALID_INVERTER)}, + {"QUDA_MAT_SOLUTION", std::to_string(QUDA_MAT_SOLUTION)}, + {"QUDA_MATDAG_MAT_SOLUTION", std::to_string(QUDA_MATDAG_MAT_SOLUTION)}, + {"QUDA_MATPC_SOLUTION", std::to_string(QUDA_MATPC_SOLUTION)}, + {"QUDA_MATPC_DAG_SOLUTION", std::to_string(QUDA_MATPC_DAG_SOLUTION)}, + {"QUDA_MATPCDAG_MATPC_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SOLUTION)}, + {"QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION)}, + {"QUDA_INVALID_SOLUTION", std::to_string(QUDA_INVALID_SOLUTION)}, + {"QUDA_DIRECT_SOLVE", std::to_string(QUDA_DIRECT_SOLVE)}, + {"QUDA_NORMOP_SOLVE", std::to_string(QUDA_NORMOP_SOLVE)}, + {"QUDA_DIRECT_PC_SOLVE", std::to_string(QUDA_DIRECT_PC_SOLVE)}, + {"QUDA_NORMOP_PC_SOLVE", std::to_string(QUDA_NORMOP_PC_SOLVE)}, + {"QUDA_NORMERR_SOLVE", std::to_string(QUDA_NORMERR_SOLVE)}, + {"QUDA_NORMERR_PC_SOLVE", std::to_string(QUDA_NORMERR_PC_SOLVE)}, + {"QUDA_NORMEQ_SOLVE", std::to_string(QUDA_NORMEQ_SOLVE)}, + {"QUDA_NORMEQ_PC_SOLVE", std::to_string(QUDA_NORMEQ_PC_SOLVE)}, + {"QUDA_INVALID_SOLVE", std::to_string(QUDA_INVALID_SOLVE)}, + {"QUDA_MATPC_EVEN_EVEN", std::to_string(QUDA_MATPC_EVEN_EVEN)}, + {"QUDA_MATPC_ODD_ODD", std::to_string(QUDA_MATPC_ODD_ODD)}, + {"QUDA_MATPC_EVEN_EVEN_ASYMMETRIC", std::to_string(QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)}, + {"QUDA_MATPC_ODD_ODD_ASYMMETRIC", std::to_string(QUDA_MATPC_ODD_ODD_ASYMMETRIC)}, + {"QUDA_MATPC_INVALID", std::to_string(QUDA_MATPC_INVALID)}, + {"QUDA_DEFAULT_NORMALIZATION", std::to_string(QUDA_DEFAULT_NORMALIZATION)}, + {"QUDA_SOURCE_NORMALIZATION", std::to_string(QUDA_SOURCE_NORMALIZATION)}, + {"QUDA_QUARTER_PRECISION", std::to_string(QUDA_QUARTER_PRECISION)}, + {"QUDA_HALF_PRECISION", std::to_string(QUDA_HALF_PRECISION)}, + {"QUDA_SINGLE_PRECISION", std::to_string(QUDA_SINGLE_PRECISION)}, + {"QUDA_DOUBLE_PRECISION", std::to_string(QUDA_DOUBLE_PRECISION)}, + {"QUDA_INVALID_PRECISION", 
std::to_string(QUDA_INVALID_PRECISION)}, + {"QUDA_BOOLEAN_FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"QUDA_BOOLEAN_TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"QUDA_BOOLEAN_INVALID", std::to_string(QUDA_BOOLEAN_INVALID)}, + {"QUDA_COMPUTE_NULL_VECTOR_NO", std::to_string(QUDA_COMPUTE_NULL_VECTOR_NO)}, + {"QUDA_COMPUTE_NULL_VECTOR_YES", std::to_string(QUDA_COMPUTE_NULL_VECTOR_YES)}, + {"QUDA_COMPUTE_NULL_VECTOR_INVALID", std::to_string(QUDA_COMPUTE_NULL_VECTOR_INVALID)}, + {"QUDA_MG_CYCLE_VCYCLE", std::to_string(QUDA_MG_CYCLE_VCYCLE)}, + {"QUDA_MG_CYCLE_FCYCLE", std::to_string(QUDA_MG_CYCLE_FCYCLE)}, + {"QUDA_MG_CYCLE_WCYCLE", std::to_string(QUDA_MG_CYCLE_WCYCLE)}, + {"QUDA_MG_CYCLE_RECURSIVE", std::to_string(QUDA_MG_CYCLE_RECURSIVE)}, + {"QUDA_MG_CYCLE_INVALID", std::to_string(QUDA_MG_CYCLE_INVALID)}, + {"QUDA_CPU_FIELD_LOCATION", std::to_string(QUDA_CPU_FIELD_LOCATION)}, + {"QUDA_CUDA_FIELD_LOCATION", std::to_string(QUDA_CUDA_FIELD_LOCATION)}, + {"QUDA_INVALID_FIELD_LOCATION", std::to_string(QUDA_INVALID_FIELD_LOCATION)}, + {"QUDA_TWIST_SINGLET", std::to_string(QUDA_TWIST_SINGLET)}, + {"QUDA_TWIST_NONDEG_DOUBLET", std::to_string(QUDA_TWIST_NONDEG_DOUBLET)}, + {"QUDA_TWIST_NO", std::to_string(QUDA_TWIST_NO)}, + {"QUDA_TWIST_INVALID", std::to_string(QUDA_TWIST_INVALID)}, + {"QUDA_DAG_NO", std::to_string(QUDA_DAG_NO)}, + {"QUDA_DAG_YES", std::to_string(QUDA_DAG_YES)}, + {"QUDA_DAG_INVALID", std::to_string(QUDA_DAG_INVALID)}, + {"QUDA_KAPPA_NORMALIZATION", std::to_string(QUDA_KAPPA_NORMALIZATION)}, + {"QUDA_MASS_NORMALIZATION", std::to_string(QUDA_MASS_NORMALIZATION)}, + {"QUDA_ASYMMETRIC_MASS_NORMALIZATION", std::to_string(QUDA_ASYMMETRIC_MASS_NORMALIZATION)}, + {"QUDA_INVALID_NORMALIZATION", std::to_string(QUDA_INVALID_NORMALIZATION)}, + {"QUDA_PRESERVE_SOURCE_NO", std::to_string(QUDA_PRESERVE_SOURCE_NO)}, + {"QUDA_PRESERVE_SOURCE_YES", std::to_string(QUDA_PRESERVE_SOURCE_YES)}, + {"QUDA_PRESERVE_SOURCE_INVALID", std::to_string(QUDA_PRESERVE_SOURCE_INVALID)}, 
+ {"QUDA_USE_INIT_GUESS_NO", std::to_string(QUDA_USE_INIT_GUESS_NO)}, + {"QUDA_USE_INIT_GUESS_YES", std::to_string(QUDA_USE_INIT_GUESS_YES)}, + {"QUDA_USE_INIT_GUESS_INVALID", std::to_string(QUDA_USE_INIT_GUESS_INVALID)}, + {"QUDA_SILENT", std::to_string(QUDA_SILENT)}, + {"QUDA_SUMMARIZE", std::to_string(QUDA_SUMMARIZE)}, + {"QUDA_VERBOSE", std::to_string(QUDA_VERBOSE)}, + {"QUDA_DEBUG_VERBOSE", std::to_string(QUDA_DEBUG_VERBOSE)}, + {"QUDA_INVALID_VERBOSITY", std::to_string(QUDA_INVALID_VERBOSITY)}, + {"QUDA_TUNE_NO", std::to_string(QUDA_TUNE_NO)}, + {"QUDA_TUNE_YES", std::to_string(QUDA_TUNE_YES)}, + {"QUDA_TUNE_INVALID", std::to_string(QUDA_TUNE_INVALID)}, + {"QUDA_POWER_BASIS", std::to_string(QUDA_POWER_BASIS)}, + {"QUDA_CHEBYSHEV_BASIS", std::to_string(QUDA_CHEBYSHEV_BASIS)}, + {"QUDA_INVALID_BASIS", std::to_string(QUDA_INVALID_BASIS)}, + {"QUDA_ADDITIVE_SCHWARZ", std::to_string(QUDA_ADDITIVE_SCHWARZ)}, + {"QUDA_MULTIPLICATIVE_SCHWARZ", std::to_string(QUDA_MULTIPLICATIVE_SCHWARZ)}, + {"QUDA_INVALID_SCHWARZ", std::to_string(QUDA_INVALID_SCHWARZ)}, + {"QUDA_MADWF_ACCELERATOR", std::to_string(QUDA_MADWF_ACCELERATOR)}, + {"QUDA_INVALID_ACCELERATOR", std::to_string(QUDA_INVALID_ACCELERATOR)}, + {"QUDA_L2_RELATIVE_RESIDUAL", std::to_string(QUDA_L2_RELATIVE_RESIDUAL)}, + {"QUDA_L2_ABSOLUTE_RESIDUAL", std::to_string(QUDA_L2_ABSOLUTE_RESIDUAL)}, + {"QUDA_HEAVY_QUARK_RESIDUAL", std::to_string(QUDA_HEAVY_QUARK_RESIDUAL)}, + {"QUDA_INVALID_RESIDUAL", std::to_string(QUDA_INVALID_RESIDUAL)}, + {"QUDA_NULL_VECTOR_SETUP", std::to_string(QUDA_NULL_VECTOR_SETUP)}, + {"QUDA_TEST_VECTOR_SETUP", std::to_string(QUDA_TEST_VECTOR_SETUP)}, + {"QUDA_INVALID_SETUP_TYPE", std::to_string(QUDA_INVALID_SETUP_TYPE)}, + {"QUDA_TRANSFER_AGGREGATE", std::to_string(QUDA_TRANSFER_AGGREGATE)}, + {"QUDA_TRANSFER_COARSE_KD", std::to_string(QUDA_TRANSFER_COARSE_KD)}, + {"QUDA_TRANSFER_OPTIMIZED_KD", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD)}, + {"QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG", 
std::to_string(QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG)}, + {"QUDA_TRANSFER_INVALID", std::to_string(QUDA_TRANSFER_INVALID)}, + {"QUDA_EIG_TR_LANCZOS", std::to_string(QUDA_EIG_TR_LANCZOS)}, + {"QUDA_EIG_BLK_TR_LANCZOS", std::to_string(QUDA_EIG_BLK_TR_LANCZOS)}, + {"QUDA_EIG_IR_ARNOLDI", std::to_string(QUDA_EIG_IR_ARNOLDI)}, + {"QUDA_EIG_BLK_IR_ARNOLDI", std::to_string(QUDA_EIG_BLK_IR_ARNOLDI)}, + {"QUDA_EIG_INVALID", std::to_string(QUDA_EIG_INVALID)}, + {"QUDA_SPECTRUM_LM_EIG", std::to_string(QUDA_SPECTRUM_LM_EIG)}, + {"QUDA_SPECTRUM_SM_EIG", std::to_string(QUDA_SPECTRUM_SM_EIG)}, + {"QUDA_SPECTRUM_LR_EIG", std::to_string(QUDA_SPECTRUM_LR_EIG)}, + {"QUDA_SPECTRUM_SR_EIG", std::to_string(QUDA_SPECTRUM_SR_EIG)}, + {"QUDA_SPECTRUM_LI_EIG", std::to_string(QUDA_SPECTRUM_LI_EIG)}, + {"QUDA_SPECTRUM_SI_EIG", std::to_string(QUDA_SPECTRUM_SI_EIG)}, + {"QUDA_SPECTRUM_INVALID", std::to_string(QUDA_SPECTRUM_INVALID)}, + {"QUDA_MEMORY_DEVICE", std::to_string(QUDA_MEMORY_DEVICE)}, + {"QUDA_MEMORY_PINNED", std::to_string(QUDA_MEMORY_PINNED)}, + {"QUDA_MEMORY_MAPPED", std::to_string(QUDA_MEMORY_MAPPED)}, + {"QUDA_MEMORY_INVALID", std::to_string(QUDA_MEMORY_INVALID)}, + {"QUDA_CUSOLVE_EXTLIB", std::to_string(QUDA_CUSOLVE_EXTLIB)}, + {"QUDA_EIGEN_EXTLIB", std::to_string(QUDA_EIGEN_EXTLIB)}, + {"QUDA_EXTLIB_INVALID", std::to_string(QUDA_EXTLIB_INVALID)} +}; + + /** * @brief Just a simple key-value store */ @@ -281,6 +415,9 @@ static int rankFromCoords(const int *coords, void *fdata) // TODO: */ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) { + int my_rank; + char prefix[20]; + for (int dir = 0; dir < 4; ++dir) { if (layout.N[dir] % 2 != 0) { errorQuda("Error: Odd lattice dimensions are not supported\n"); @@ -300,6 +437,11 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) static int device = layout.device; #endif + // must happen *after* communication initialization + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + sprintf(prefix, "QUDA (rank=%d): ", my_rank); + + 
setVerbosityQuda(qudaState.init.verbosity, prefix, qudaState.init.logfile); initQuda(device); } @@ -388,7 +530,6 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout) qudaState.init = init; qudaState.layout = layout; - setVerbosityQuda(qudaState.init.verbosity, "QUDA: ", qudaState.init.logfile); qudaopenqcd_called(__func__); openQCD_qudaSetLayout(qudaState.layout); qudaopenqcd_called(__func__); @@ -420,13 +561,7 @@ double openQCD_qudaPlaquette(void) void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec) { QudaGaugeParam param = newOpenQCDGaugeParam(prec); - - /* Matthias Wagner: optimize that */ - void* buffer = pool_pinned_malloc(4*qudaState.init.volume*18*prec); - qudaState.init.reorder_gauge_openqcd_to_quda(gauge, buffer); - loadGaugeQuda(buffer, &param); - pool_pinned_free(buffer); - + loadGaugeQuda(gauge, &param); qudaState.gauge_loaded = true; } @@ -435,7 +570,7 @@ void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec) { QudaGaugeParam param = newOpenQCDGaugeParam(prec); - void* buffer = pool_pinned_malloc(4*qudaState.init.volume*18*prec); + void* buffer = pool_pinned_malloc((4*qudaState.init.volume + 7*qudaState.init.bndry/4)*18*prec); saveGaugeQuda(buffer, &param); qudaState.init.reorder_gauge_quda_to_openqcd(buffer, gauge); pool_pinned_free(buffer); @@ -700,6 +835,9 @@ void openQCD_qudaSpinorFree(void** quda_field) void openQCD_qudaD2H(void *quda_field, void *openQCD_field) { + int my_rank; + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + // sets up the necessary parameters QudaInvertParam param = newOpenQCDParam(); @@ -707,10 +845,8 @@ void openQCD_qudaD2H(void *quda_field, void *openQCD_field) ColorSpinorParam cpuParam(openQCD_field, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); ColorSpinorField out_h(cpuParam); - ColorSpinorField* in = reinterpret_cast(quda_field); - ColorSpinorField out(*in); - - out_h = out; // transfer the GPU field to CPU + // transfer the GPU field to CPU + out_h = *reinterpret_cast(quda_field); }
@@ -808,120 +944,6 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) if (my_rank == 0) { - std::unordered_map enum_map = { - {"QUDA_CG_INVERTER", std::to_string(QUDA_CG_INVERTER)}, - {"QUDA_BICGSTAB_INVERTER", std::to_string(QUDA_BICGSTAB_INVERTER)}, - {"QUDA_GCR_INVERTER", std::to_string(QUDA_GCR_INVERTER)}, - {"QUDA_MR_INVERTER", std::to_string(QUDA_MR_INVERTER)}, - {"QUDA_SD_INVERTER", std::to_string(QUDA_SD_INVERTER)}, - {"QUDA_PCG_INVERTER", std::to_string(QUDA_PCG_INVERTER)}, - {"QUDA_EIGCG_INVERTER", std::to_string(QUDA_EIGCG_INVERTER)}, - {"QUDA_INC_EIGCG_INVERTER", std::to_string(QUDA_INC_EIGCG_INVERTER)}, - {"QUDA_GMRESDR_INVERTER", std::to_string(QUDA_GMRESDR_INVERTER)}, - {"QUDA_GMRESDR_PROJ_INVERTER", std::to_string(QUDA_GMRESDR_PROJ_INVERTER)}, - {"QUDA_GMRESDR_SH_INVERTER", std::to_string(QUDA_GMRESDR_SH_INVERTER)}, - {"QUDA_FGMRESDR_INVERTER", std::to_string(QUDA_FGMRESDR_INVERTER)}, - {"QUDA_MG_INVERTER", std::to_string(QUDA_MG_INVERTER)}, - {"QUDA_BICGSTABL_INVERTER", std::to_string(QUDA_BICGSTABL_INVERTER)}, - {"QUDA_CGNE_INVERTER", std::to_string(QUDA_CGNE_INVERTER)}, - {"QUDA_CGNR_INVERTER", std::to_string(QUDA_CGNR_INVERTER)}, - {"QUDA_CG3_INVERTER", std::to_string(QUDA_CG3_INVERTER)}, - {"QUDA_CG3NE_INVERTER", std::to_string(QUDA_CG3NE_INVERTER)}, - {"QUDA_CG3NR_INVERTER", std::to_string(QUDA_CG3NR_INVERTER)}, - {"QUDA_CA_CG_INVERTER", std::to_string(QUDA_CA_CG_INVERTER)}, - {"QUDA_CA_CGNE_INVERTER", std::to_string(QUDA_CA_CGNE_INVERTER)}, - {"QUDA_CA_CGNR_INVERTER", std::to_string(QUDA_CA_CGNR_INVERTER)}, - {"QUDA_CA_GCR_INVERTER", std::to_string(QUDA_CA_GCR_INVERTER)}, - {"QUDA_INVALID_INVERTER", std::to_string(QUDA_INVALID_INVERTER)}, - {"QUDA_MAT_SOLUTION", std::to_string(QUDA_MAT_SOLUTION)}, - {"QUDA_MATDAG_MAT_SOLUTION", std::to_string(QUDA_MATDAG_MAT_SOLUTION)}, - {"QUDA_MATPC_SOLUTION", std::to_string(QUDA_MATPC_SOLUTION)}, - {"QUDA_MATPC_DAG_SOLUTION", std::to_string(QUDA_MATPC_DAG_SOLUTION)}, - 
{"QUDA_MATPCDAG_MATPC_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SOLUTION)}, - {"QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION)}, - {"QUDA_INVALID_SOLUTION", std::to_string(QUDA_INVALID_SOLUTION)}, - {"QUDA_DIRECT_SOLVE", std::to_string(QUDA_DIRECT_SOLVE)}, - {"QUDA_NORMOP_SOLVE", std::to_string(QUDA_NORMOP_SOLVE)}, - {"QUDA_DIRECT_PC_SOLVE", std::to_string(QUDA_DIRECT_PC_SOLVE)}, - {"QUDA_NORMOP_PC_SOLVE", std::to_string(QUDA_NORMOP_PC_SOLVE)}, - {"QUDA_NORMERR_SOLVE", std::to_string(QUDA_NORMERR_SOLVE)}, - {"QUDA_NORMERR_PC_SOLVE", std::to_string(QUDA_NORMERR_PC_SOLVE)}, - {"QUDA_NORMEQ_SOLVE", std::to_string(QUDA_NORMEQ_SOLVE)}, - {"QUDA_NORMEQ_PC_SOLVE", std::to_string(QUDA_NORMEQ_PC_SOLVE)}, - {"QUDA_INVALID_SOLVE", std::to_string(QUDA_INVALID_SOLVE)}, - {"QUDA_MATPC_EVEN_EVEN", std::to_string(QUDA_MATPC_EVEN_EVEN)}, - {"QUDA_MATPC_ODD_ODD", std::to_string(QUDA_MATPC_ODD_ODD)}, - {"QUDA_MATPC_EVEN_EVEN_ASYMMETRIC", std::to_string(QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)}, - {"QUDA_MATPC_ODD_ODD_ASYMMETRIC", std::to_string(QUDA_MATPC_ODD_ODD_ASYMMETRIC)}, - {"QUDA_MATPC_INVALID", std::to_string(QUDA_MATPC_INVALID)}, - {"QUDA_DEFAULT_NORMALIZATION", std::to_string(QUDA_DEFAULT_NORMALIZATION)}, - {"QUDA_SOURCE_NORMALIZATION", std::to_string(QUDA_SOURCE_NORMALIZATION)}, - {"QUDA_QUARTER_PRECISION", std::to_string(QUDA_QUARTER_PRECISION)}, - {"QUDA_HALF_PRECISION", std::to_string(QUDA_HALF_PRECISION)}, - {"QUDA_SINGLE_PRECISION", std::to_string(QUDA_SINGLE_PRECISION)}, - {"QUDA_DOUBLE_PRECISION", std::to_string(QUDA_DOUBLE_PRECISION)}, - {"QUDA_INVALID_PRECISION", std::to_string(QUDA_INVALID_PRECISION)}, - {"QUDA_BOOLEAN_FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, - {"QUDA_BOOLEAN_TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, - {"QUDA_BOOLEAN_INVALID", std::to_string(QUDA_BOOLEAN_INVALID)}, - {"QUDA_COMPUTE_NULL_VECTOR_NO", std::to_string(QUDA_COMPUTE_NULL_VECTOR_NO)}, - {"QUDA_COMPUTE_NULL_VECTOR_YES", 
std::to_string(QUDA_COMPUTE_NULL_VECTOR_YES)}, - {"QUDA_COMPUTE_NULL_VECTOR_INVALID", std::to_string(QUDA_COMPUTE_NULL_VECTOR_INVALID)}, - {"QUDA_MG_CYCLE_VCYCLE", std::to_string(QUDA_MG_CYCLE_VCYCLE)}, - {"QUDA_MG_CYCLE_FCYCLE", std::to_string(QUDA_MG_CYCLE_FCYCLE)}, - {"QUDA_MG_CYCLE_WCYCLE", std::to_string(QUDA_MG_CYCLE_WCYCLE)}, - {"QUDA_MG_CYCLE_RECURSIVE", std::to_string(QUDA_MG_CYCLE_RECURSIVE)}, - {"QUDA_MG_CYCLE_INVALID", std::to_string(QUDA_MG_CYCLE_INVALID)}, - {"QUDA_CPU_FIELD_LOCATION", std::to_string(QUDA_CPU_FIELD_LOCATION)}, - {"QUDA_CUDA_FIELD_LOCATION", std::to_string(QUDA_CUDA_FIELD_LOCATION)}, - {"QUDA_INVALID_FIELD_LOCATION", std::to_string(QUDA_INVALID_FIELD_LOCATION)}, - {"QUDA_TWIST_SINGLET", std::to_string(QUDA_TWIST_SINGLET)}, - {"QUDA_TWIST_NONDEG_DOUBLET", std::to_string(QUDA_TWIST_NONDEG_DOUBLET)}, - {"QUDA_TWIST_NO", std::to_string(QUDA_TWIST_NO)}, - {"QUDA_TWIST_INVALID", std::to_string(QUDA_TWIST_INVALID)}, - {"QUDA_DAG_NO", std::to_string(QUDA_DAG_NO)}, - {"QUDA_DAG_YES", std::to_string(QUDA_DAG_YES)}, - {"QUDA_DAG_INVALID", std::to_string(QUDA_DAG_INVALID)}, - {"QUDA_KAPPA_NORMALIZATION", std::to_string(QUDA_KAPPA_NORMALIZATION)}, - {"QUDA_MASS_NORMALIZATION", std::to_string(QUDA_MASS_NORMALIZATION)}, - {"QUDA_ASYMMETRIC_MASS_NORMALIZATION", std::to_string(QUDA_ASYMMETRIC_MASS_NORMALIZATION)}, - {"QUDA_INVALID_NORMALIZATION", std::to_string(QUDA_INVALID_NORMALIZATION)}, - {"QUDA_PRESERVE_SOURCE_NO", std::to_string(QUDA_PRESERVE_SOURCE_NO)}, - {"QUDA_PRESERVE_SOURCE_YES", std::to_string(QUDA_PRESERVE_SOURCE_YES)}, - {"QUDA_PRESERVE_SOURCE_INVALID", std::to_string(QUDA_PRESERVE_SOURCE_INVALID)}, - {"QUDA_USE_INIT_GUESS_NO", std::to_string(QUDA_USE_INIT_GUESS_NO)}, - {"QUDA_USE_INIT_GUESS_YES", std::to_string(QUDA_USE_INIT_GUESS_YES)}, - {"QUDA_USE_INIT_GUESS_INVALID", std::to_string(QUDA_USE_INIT_GUESS_INVALID)}, - {"QUDA_SILENT", std::to_string(QUDA_SILENT)}, - {"QUDA_SUMMARIZE", std::to_string(QUDA_SUMMARIZE)}, - {"QUDA_VERBOSE", 
std::to_string(QUDA_VERBOSE)}, - {"QUDA_DEBUG_VERBOSE", std::to_string(QUDA_DEBUG_VERBOSE)}, - {"QUDA_INVALID_VERBOSITY", std::to_string(QUDA_INVALID_VERBOSITY)}, - {"QUDA_TUNE_NO", std::to_string(QUDA_TUNE_NO)}, - {"QUDA_TUNE_YES", std::to_string(QUDA_TUNE_YES)}, - {"QUDA_TUNE_INVALID", std::to_string(QUDA_TUNE_INVALID)}, - {"QUDA_POWER_BASIS", std::to_string(QUDA_POWER_BASIS)}, - {"QUDA_CHEBYSHEV_BASIS", std::to_string(QUDA_CHEBYSHEV_BASIS)}, - {"QUDA_INVALID_BASIS", std::to_string(QUDA_INVALID_BASIS)}, - {"QUDA_ADDITIVE_SCHWARZ", std::to_string(QUDA_ADDITIVE_SCHWARZ)}, - {"QUDA_MULTIPLICATIVE_SCHWARZ", std::to_string(QUDA_MULTIPLICATIVE_SCHWARZ)}, - {"QUDA_INVALID_SCHWARZ", std::to_string(QUDA_INVALID_SCHWARZ)}, - {"QUDA_MADWF_ACCELERATOR", std::to_string(QUDA_MADWF_ACCELERATOR)}, - {"QUDA_INVALID_ACCELERATOR", std::to_string(QUDA_INVALID_ACCELERATOR)}, - {"QUDA_L2_RELATIVE_RESIDUAL", std::to_string(QUDA_L2_RELATIVE_RESIDUAL)}, - {"QUDA_L2_ABSOLUTE_RESIDUAL", std::to_string(QUDA_L2_ABSOLUTE_RESIDUAL)}, - {"QUDA_HEAVY_QUARK_RESIDUAL", std::to_string(QUDA_HEAVY_QUARK_RESIDUAL)}, - {"QUDA_INVALID_RESIDUAL", std::to_string(QUDA_INVALID_RESIDUAL)}, - {"QUDA_NULL_VECTOR_SETUP", std::to_string(QUDA_NULL_VECTOR_SETUP)}, - {"QUDA_TEST_VECTOR_SETUP", std::to_string(QUDA_TEST_VECTOR_SETUP)}, - {"QUDA_INVALID_SETUP_TYPE", std::to_string(QUDA_INVALID_SETUP_TYPE)}, - {"QUDA_TRANSFER_AGGREGATE", std::to_string(QUDA_TRANSFER_AGGREGATE)}, - {"QUDA_TRANSFER_COARSE_KD", std::to_string(QUDA_TRANSFER_COARSE_KD)}, - {"QUDA_TRANSFER_OPTIMIZED_KD", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD)}, - {"QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG)}, - {"QUDA_TRANSFER_INVALID", std::to_string(QUDA_TRANSFER_INVALID)} - }; - KeyValueStore kv; kv.set_map(&enum_map); kv.load(infile); @@ -1162,8 +1184,9 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) if (qudaState.layout.h_gauge != nullptr) { logQuda(QUDA_VERBOSE, "Loading gauge field 
from openQCD ...\n"); + void *h_gauge = qudaState.layout.h_gauge(); PUSH_RANGE("openQCD_qudaGaugeLoad",3); - openQCD_qudaGaugeLoad(qudaState.layout.h_gauge(), QUDA_DOUBLE_PRECISION); + openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION); POP_RANGE; } @@ -1183,8 +1206,9 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) * Transfer the SW-field from openQCD. */ logQuda(QUDA_VERBOSE, "Loading clover field from openQCD ...\n"); + void *h_sw = qudaState.layout.h_sw(); PUSH_RANGE("openQCD_qudaCloverLoad",3); - openQCD_qudaCloverLoad(qudaState.layout.h_sw(), param->kappa, param->clover_csw); + openQCD_qudaCloverLoad(h_sw, param->kappa, param->clover_csw); POP_RANGE; //loadCloverQuda(qudaState.layout.h_sw(), NULL, param); @@ -1218,6 +1242,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) return (void*) param; } + double openQCD_qudaInvert(void *param, double mu, void *source, void *solution, int *status) { QudaInvertParam* invert_param = static_cast(param); @@ -1228,14 +1253,12 @@ double openQCD_qudaInvert(void *param, double mu, void *source, void *solution, invertQuda(static_cast(solution), static_cast(source), invert_param); POP_RANGE; - if (invert_param->verbosity >= QUDA_VERBOSE) { - logQuda(QUDA_VERBOSE, "openQCD_qudaInvert()\n"); - logQuda(QUDA_VERBOSE, " true_res = %.2e\n", invert_param->true_res); - logQuda(QUDA_VERBOSE, " true_res_hq = %.2e\n", invert_param->true_res_hq); - logQuda(QUDA_VERBOSE, " iter = %d\n", invert_param->iter); - logQuda(QUDA_VERBOSE, " gflops = %.2e\n", invert_param->gflops); - logQuda(QUDA_VERBOSE, " secs = %.2e\n", invert_param->secs); - } + logQuda(QUDA_VERBOSE, "openQCD_qudaInvert()\n"); + logQuda(QUDA_VERBOSE, " true_res = %.2e\n", invert_param->true_res); + logQuda(QUDA_VERBOSE, " true_res_hq = %.2e\n", invert_param->true_res_hq); + logQuda(QUDA_VERBOSE, " iter = %d\n", invert_param->iter); + logQuda(QUDA_VERBOSE, " gflops = %.2e\n", invert_param->gflops); + logQuda(QUDA_VERBOSE, " secs = %.2e\n", 
invert_param->secs); *status = invert_param->true_res <= invert_param->tol ? invert_param->iter : -1; @@ -1255,6 +1278,112 @@ void openQCD_qudaSolverDestroy(void *param) } +void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_section) +{ + int my_rank; + + MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); + QudaVerbosity verbosity = QUDA_SUMMARIZE; + + // Allocate on the heap + QudaEigParam* param = new QudaEigParam(newQudaEigParam()); + + if (my_rank == 0) { + + KeyValueStore kv; + kv.set_map(&enum_map); + kv.load(infile); + + verbosity = kv.get(section, "verbosity", verbosity); + setVerbosity(verbosity); + + if (verbosity >= QUDA_VERBOSE) { + kv.dump(); + } + + param->eig_type = kv.get(section, "eig_type", param->eig_type); + param->use_poly_acc = kv.get(section, "use_poly_acc", param->use_poly_acc); + param->poly_deg = kv.get(section, "poly_deg", param->poly_deg); + param->a_min = kv.get(section, "a_min", param->a_min); + param->a_max = kv.get(section, "a_max", param->a_max); + param->preserve_deflation = kv.get(section, "preserve_deflation", param->preserve_deflation); + //param->*preserve_deflation_space = kv.get(section, *"*preserve_deflation_space", param->preserve_deflation_space); + param->preserve_evals = kv.get(section, "preserve_evals", param->preserve_evals); + param->use_dagger = kv.get(section, "use_dagger", param->use_dagger); + param->use_norm_op = kv.get(section, "use_norm_op", param->use_norm_op); + param->use_pc = kv.get(section, "use_pc", param->use_pc); + param->use_eigen_qr = kv.get(section, "use_eigen_qr", param->use_eigen_qr); + param->compute_svd = kv.get(section, "compute_svd", param->compute_svd); + param->compute_gamma5 = kv.get(section, "compute_gamma5", param->compute_gamma5); + param->require_convergence = kv.get(section, "require_convergence", param->require_convergence); + param->spectrum = kv.get(section, "spectrum", param->spectrum); + param->n_ev = kv.get(section, "n_ev", param->n_ev); + param->n_kr = kv.get(section, 
"n_kr", param->n_kr); + param->n_conv = kv.get(section, "n_conv", param->n_conv); + param->n_ev_deflate = kv.get(section, "n_ev_deflate", param->n_ev_deflate); + param->tol = kv.get(section, "tol", param->tol); + param->qr_tol = kv.get(section, "qr_tol", param->qr_tol); + param->check_interval = kv.get(section, "check_interval", param->check_interval); + param->max_restarts = kv.get(section, "max_restarts", param->max_restarts); + param->batched_rotate = kv.get(section, "batched_rotate", param->batched_rotate); + param->block_size = kv.get(section, "block_size", param->block_size); + param->arpack_check = kv.get(section, "arpack_check", param->arpack_check); + strcpy(param->QUDA_logfile, kv.get(section, "QUDA_logfile", param->QUDA_logfile).c_str()); + strcpy(param->arpack_logfile, kv.get(section, "arpack_logfile", param->arpack_logfile).c_str()); + + param->nk = kv.get(section, "nk", param->nk); + param->np = kv.get(section, "np", param->np); + param->import_vectors = kv.get(section, "import_vectors", param->import_vectors); + param->cuda_prec_ritz = kv.get(section, "cuda_prec_ritz", param->cuda_prec_ritz); + param->mem_type_ritz = kv.get(section, "mem_type_ritz", param->mem_type_ritz); + param->location = kv.get(section, "location", param->location); + param->run_verify = kv.get(section, "run_verify", param->run_verify); + //strcpy(param->vec_infile, kv.get(section, "vec_infile", param->vec_infile).c_str()); + //strcpy(param->vec_outfile, kv.get(section, "vec_outfile", param->vec_outfile).c_str()); + param->save_prec = kv.get(section, "save_prec", param->save_prec); + param->io_parity_inflate = kv.get(section, "io_parity_inflate", param->io_parity_inflate); + param->extlib_type = kv.get(section, "extlib_type", param->extlib_type); + } + + // transfer of the struct to all the processes + MPI_Bcast((void*) param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); + + void *inv_param = openQCD_qudaSolverSetup(infile, inv_section); + param->invert_param = 
static_cast(inv_param); + + if (verbosity >= QUDA_DEBUG_VERBOSE) { + printQudaEigParam(param); + } + checkEigParam(param); + if (verbosity >= QUDA_DEBUG_VERBOSE) { + printQudaEigParam(param); + } + + return (void*) param; +} + + +void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) +{ + QudaEigParam* eig_param = static_cast(param); + + logQuda(QUDA_VERBOSE, "Calling eigensolveQuda() ...\n"); + PUSH_RANGE("eigensolveQuda",6); + eigensolveQuda(h_evecs, static_cast(h_evals), eig_param); + POP_RANGE; + + logQuda(QUDA_VERBOSE, "openQCD_qudaEigensolve()\n"); + logQuda(QUDA_VERBOSE, " gflops = %.2e\n", eig_param->gflops); + logQuda(QUDA_VERBOSE, " secs = %.2e\n", eig_param->secs); +} + +void openQCD_qudaEigensolverDestroy(void *param) +{ + QudaInvertParam* eig_param = static_cast(param); + delete eig_param; +} + + double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param) { QudaInvertParam invert_param = newOpenQCDSolverParam(dirac_param); From 9e81a4c6a6f68652cb45999489e13610669641a0 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 1 Nov 2023 18:40:56 +0100 Subject: [PATCH 104/148] solved the mystical local lattice direction = 6 bug --- include/clover_field_order.h | 152 ++++++++++++++++++++--------------- 1 file changed, 87 insertions(+), 65 deletions(-) diff --git a/include/clover_field_order.h b/include/clover_field_order.h index cfee2bf73a..8af947617f 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -1027,20 +1027,28 @@ namespace quda { const QudaTwistFlavorType twist_flavor; const Float mu2; const Float epsilon2; - const int L[4]; const double coeff; const double csw; const double kappa; + const int L[4]; // xyzt convention + const int L_[4]; // txyz convention + const int volume; + const int cbs[4]; // openQCDs cache block size + const int cbn[4]; // openQCDs cache block grid OpenQCDOrder(const CloverField &clover, bool inverse, Float *clover_ = nullptr, void * = nullptr) 
: volumeCB(clover.Stride()), twist_flavor(clover.TwistFlavor()), mu2(clover.Mu2()), epsilon2(clover.Epsilon2()), - L {clover.X()[0], clover.X()[1], clover.X()[2], clover.X()[3]}, // local dimensions (xyzt) coeff(clover.Coeff()), csw(clover.Csw()), - kappa(clover.Coeff()/clover.Csw()) + kappa(clover.Coeff()/clover.Csw()), + L {clover.X()[0], clover.X()[1], clover.X()[2], clover.X()[3]}, // *local* lattice dimensions, xyzt + L_ {clover.X()[3], clover.X()[0], clover.X()[1], clover.X()[2]}, // *local* lattice dimensions, txyz + volume(L_[0]*L_[1]*L_[2]*L_[3]), // *local* lattice volume + cbs {setup_cbs(0, L_), setup_cbs(1, L_), setup_cbs(2, L_), setup_cbs(3, L_)}, // txyz + cbn {L_[0]/cbs[0], L_[1]/cbs[1], L_[2]/cbs[2], L_[3]/cbs[3]} // txyz { if (clover.Order() != QUDA_OPENQCD_CLOVER_ORDER) { errorQuda("Invalid clover order %d for this accessor", clover.Order()); @@ -1055,57 +1063,19 @@ namespace quda { Float Mu2() const { return mu2; } Float Epsilon2() const { return epsilon2; } - /** - * @brief Pure function to return ipt[iy], where - * iy=x3+L3*x2+L2*L3*x1+L1*L2*L3*x0 without accessing the - * ipt-array, but calculating the index on the fly. Notice - * that xi and Li are in openQCD (txyz) convention. If they - * come from QUDA, you have to rotate them first. 
- * - * @param[in] x Carthesian local lattice corrdinates, 0 <= x[i] < - * Li - * - * @return ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0] = the local flat index - * of openQCD - */ - __device__ __host__ inline int ipt(int *x) const + __device__ __host__ inline int setup_cbs(const int mu, const int *X) const { - int xb[4], xn[4], ib, in, is, cbs[4], mu, L_[4]; - - rotate_coords(L, L_); // L_ local lattice dimensions in openQCD format (txyz) - - /* cache_block */ - for (mu=1;mu<4;mu++) { - if ((L[mu]%4)==0) { - cbs[mu]=4; - } else if ((L[mu]%3)==0) { - cbs[mu]=3; - } else if ((L[mu]%2)==0) { - cbs[mu]=2; - } else { - cbs[mu]=1; - } + if (mu==0) { + return X[0]; + } else if ((X[mu]%4)==0) { + return 4; + } else if ((X[mu]%3)==0) { + return 3; + } else if ((X[mu]%2)==0) { + return 2; + } else { + return 1; } - - xb[0] = x[0]; - xb[1] = x[1] % cbs[1]; - xb[2] = x[2] % cbs[2]; - xb[3] = x[3] % cbs[3]; - - xn[1] = x[1]/cbs[1]; - xn[2] = x[2]/cbs[2]; - xn[3] = x[3]/cbs[3]; - - /** - * This is essentially what cbix[...] does. - * Notice integer division; truncated towards zero, i.e. 5/2=2 - */ - ib = (xb[3] + cbs[3]*xb[2] + cbs[2]*cbs[3]*xb[1] + cbs[1]*cbs[2]*cbs[3]*xb[0])/2; - - in = xn[3] + (L_[3]/cbs[3])*xn[2] + (L_[3]/cbs[3])*(L_[2]/cbs[2])*xn[1]; - is = x[0] + x[1] + x[2] + x[3]; - - return ib + (L_[0]*cbs[1]*cbs[2]*cbs[3]*in)/2 + (is%2)*(L_[0]*L_[1]*L_[2]*L_[3]/2); } /** @@ -1124,34 +1094,86 @@ namespace quda { x_openQCD[0] = x_quda[3]; } + /** + * @brief Generate a lexicographical index with x[Ndims-1] running + * fastest, for example if Ndims=4: + * ix = X3*X2*X1*x0 + X3*X2*x1 + X3*x2 + x3. 
+ * + * @param[in] x Integer array of dimension Ndims with coordinates + * @param[in] X Integer array of dimension Ndims with extents + * @param[in] Ndims The number of dimensions + * + * @return Lexicographical index + */ + __device__ __host__ inline int lexi(const int *x, const int *X, const int Ndims) const + { + int i, ix = x[0]; + + #pragma unroll + for (i=1; i +VOLUME/2 */ + ); + } + /** * @brief Gets the offset in Floats from the openQCD base pointer to * the spinor field. * - * @param[in] x Checkerboard index coming from quda + * @param[in] x_cb Checkerboard index coming from quda * @param[in] parity The parity coming from quda * * @return The offset. */ - __device__ __host__ inline int getCloverOffset(int x, int parity) const + __device__ __host__ inline int getCloverOffset(int x_cb, int parity) const { - int coord_quda[4], coord_openQCD[4]; - - /* coord_quda contains xyzt local Carthesian corrdinates */ - getCoords(coord_quda, x, L, parity); - rotate_coords(coord_quda, coord_openQCD); /* xyzt -> txyz */ - - return ipt(coord_openQCD)*length; + int x_quda[4], x[4]; + getCoords(x_quda, x_cb, L, parity); // x_quda contains xyzt local Carthesian corrdinates + rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate + return ipt(x)*length; } /** - * @brief Load a clover field at lattice point x + * @brief Load a clover field at lattice point x_cb * * @param v The output clover matrix in QUDA order - * @param x The checkerboarded lattice site + * @param x_cb The checkerboarded lattice site * @param parity The parity of the lattice site */ - __device__ __host__ inline void load(RegType v[length], int x, int parity) const { + __device__ __host__ inline void load(RegType v[length], int x_cb, int parity) const { int sign[36] = {-1,-1,-1,-1,-1,-1, // diagonals (idx 0-5) -1,+1,-1,+1,-1,-1,-1,-1,-1,-1, // column 0 (idx 6-15) -1,+1,-1,-1,-1,-1,-1,-1, // column 1 (idx 16-23) @@ -1165,7 +1187,7 @@ namespace quda { 30,31,32,33, 34,35}; 
const int M = length/2; - int offset = getCloverOffset(x, parity); + int offset = getCloverOffset(x_cb, parity); auto Ap = &clover[offset]; // A_+ auto Am = &clover[offset + M]; // A_- From 97780110c3a93205735de17187c94bd4d9845826 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Thu, 2 Nov 2023 19:07:31 +0100 Subject: [PATCH 105/148] changed comments to C89 style (such that we can link C89 programs against it) --- include/enum_quda.h | 223 ++++++++++++++++--------------- include/quda.h | 29 ++-- include/quda_openqcd_interface.h | 1 - lib/openqcd_interface.cpp | 197 +++++++++++++-------------- 4 files changed, 225 insertions(+), 225 deletions(-) diff --git a/include/enum_quda.h b/include/enum_quda.h index 9d8e188406..38a16ceb8d 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -15,18 +15,18 @@ typedef enum QudaMemoryType_s { QUDA_MEMORY_INVALID = QUDA_INVALID_ENUM } QudaMemoryType; -// -// Types used in QudaGaugeParam -// +/** + * Types used in QudaGaugeParam + */ typedef enum QudaLinkType_s { QUDA_SU3_LINKS, QUDA_GENERAL_LINKS, QUDA_THREE_LINKS, QUDA_MOMENTUM_LINKS, - QUDA_COARSE_LINKS, // used for coarse-gauge field with multigrid - QUDA_SMEARED_LINKS, // used for loading and saving gaugeSmeared in the interface - QUDA_WILSON_LINKS = QUDA_SU3_LINKS, // used by wilson, clover, twisted mass, and domain wall + QUDA_COARSE_LINKS, /* used for coarse-gauge field with multigrid */ + QUDA_SMEARED_LINKS, /* used for loading and saving gaugeSmeared in the interface */ + QUDA_WILSON_LINKS = QUDA_SU3_LINKS, /* used by wilson, clover, twisted mass, and domain wall */ QUDA_ASQTAD_FAT_LINKS = QUDA_GENERAL_LINKS, QUDA_ASQTAD_LONG_LINKS = QUDA_THREE_LINKS, QUDA_ASQTAD_MOM_LINKS = QUDA_MOMENTUM_LINKS, @@ -36,19 +36,19 @@ typedef enum QudaLinkType_s { typedef enum QudaGaugeFieldOrder_s { QUDA_FLOAT_GAUGE_ORDER = 1, - QUDA_FLOAT2_GAUGE_ORDER = 2, // no reconstruct and double precision - QUDA_FLOAT4_GAUGE_ORDER = 4, // 8 reconstruct single, and 12 reconstruct single, 
half, quarter - QUDA_FLOAT8_GAUGE_ORDER = 8, // 8 reconstruct half and quarter - QUDA_NATIVE_GAUGE_ORDER, // used to denote one of the above types in a trait, not used directly - QUDA_QDP_GAUGE_ORDER, // expect *gauge[mu], even-odd, spacetime, row-column color - QUDA_QDPJIT_GAUGE_ORDER, // expect *gauge[mu], even-odd, complex-column-row-spacetime - QUDA_CPS_WILSON_GAUGE_ORDER, // expect *gauge, even-odd, mu, spacetime, column-row color - QUDA_MILC_GAUGE_ORDER, // expect *gauge, even-odd, mu, spacetime, row-column order - QUDA_MILC_SITE_GAUGE_ORDER, // packed into MILC site AoS [even-odd][spacetime] array, and [dir][row][col] inside - QUDA_BQCD_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime+halos, column-row order - QUDA_TIFR_GAUGE_ORDER, // expect *gauge, mu, even-odd, spacetime, column-row order - QUDA_TIFR_PADDED_GAUGE_ORDER, // expect *gauge, mu, parity, t, z+halo, y, x/2, column-row order - QUDA_OPENQCD_GAUGE_ORDER, // expect *gauge, spacetime, mu, parity (uplink/downlink), row-column order -- links attached to odd points only + QUDA_FLOAT2_GAUGE_ORDER = 2, /* no reconstruct and double precision */ + QUDA_FLOAT4_GAUGE_ORDER = 4, /* 8 reconstruct single, and 12 reconstruct single, half, quarter */ + QUDA_FLOAT8_GAUGE_ORDER = 8, /* 8 reconstruct half and quarter */ + QUDA_NATIVE_GAUGE_ORDER, /* used to denote one of the above types in a trait, not used directly */ + QUDA_QDP_GAUGE_ORDER, /* expect *gauge[mu], even-odd, spacetime, row-column color */ + QUDA_QDPJIT_GAUGE_ORDER, /* expect *gauge[mu], even-odd, complex-column-row-spacetime */ + QUDA_CPS_WILSON_GAUGE_ORDER, /* expect *gauge, even-odd, mu, spacetime, column-row color */ + QUDA_MILC_GAUGE_ORDER, /* expect *gauge, even-odd, mu, spacetime, row-column order */ + QUDA_MILC_SITE_GAUGE_ORDER, /* packed into MILC site AoS [even-odd][spacetime] array, and [dir][row][col] inside */ + QUDA_BQCD_GAUGE_ORDER, /* expect *gauge, mu, even-odd, spacetime+halos, column-row order */ + QUDA_TIFR_GAUGE_ORDER, /* 
expect *gauge, mu, even-odd, spacetime, column-row order */ + QUDA_TIFR_PADDED_GAUGE_ORDER, /* expect *gauge, mu, parity, t, z+halo, y, x/2, column-row order */ + QUDA_OPENQCD_GAUGE_ORDER, /* expect *gauge, spacetime, mu, parity row-column order -- links attached to odd points only */ QUDA_INVALID_GAUGE_ORDER = QUDA_INVALID_ENUM } QudaGaugeFieldOrder; @@ -67,24 +67,24 @@ typedef enum QudaPrecision_s { } QudaPrecision; typedef enum QudaReconstructType_s { - QUDA_RECONSTRUCT_NO = 18, // store all 18 real numbers explicitly - QUDA_RECONSTRUCT_12 = 12, // reconstruct from 12 real numbers - QUDA_RECONSTRUCT_8 = 8, // reconstruct from 8 real numbers - QUDA_RECONSTRUCT_9 = 9, // used for storing HISQ long-link variables - QUDA_RECONSTRUCT_13 = 13, // used for storing HISQ long-link variables - QUDA_RECONSTRUCT_10 = 10, // 10-number parameterization used for storing the momentum field + QUDA_RECONSTRUCT_NO = 18, /* store all 18 real numbers explicitly */ + QUDA_RECONSTRUCT_12 = 12, /* reconstruct from 12 real numbers */ + QUDA_RECONSTRUCT_8 = 8, /* reconstruct from 8 real numbers */ + QUDA_RECONSTRUCT_9 = 9, /* used for storing HISQ long-link variables */ + QUDA_RECONSTRUCT_13 = 13, /* used for storing HISQ long-link variables */ + QUDA_RECONSTRUCT_10 = 10, /* 10-number parameterization used for storing the momentum field */ QUDA_RECONSTRUCT_INVALID = QUDA_INVALID_ENUM } QudaReconstructType; typedef enum QudaGaugeFixed_s { - QUDA_GAUGE_FIXED_NO, // no gauge fixing - QUDA_GAUGE_FIXED_YES, // gauge field stored in temporal gauge + QUDA_GAUGE_FIXED_NO, /* no gauge fixing */ + QUDA_GAUGE_FIXED_YES, /* gauge field stored in temporal gauge */ QUDA_GAUGE_FIXED_INVALID = QUDA_INVALID_ENUM } QudaGaugeFixed; -// -// Types used in QudaInvertParam -// +/** + * Types used in QudaInvertParam + */ typedef enum QudaDslashType_s { QUDA_WILSON_DSLASH, @@ -131,10 +131,10 @@ typedef enum QudaInverterType_s { } QudaInverterType; typedef enum QudaEigType_s { - QUDA_EIG_TR_LANCZOS, // Thick 
restarted lanczos solver - QUDA_EIG_BLK_TR_LANCZOS, // Block Thick restarted lanczos solver - QUDA_EIG_IR_ARNOLDI, // Implicitly Restarted Arnoldi solver - QUDA_EIG_BLK_IR_ARNOLDI, // Block Implicitly Restarted Arnoldi solver + QUDA_EIG_TR_LANCZOS, /* Thick restarted lanczos solver */ + QUDA_EIG_BLK_TR_LANCZOS, /* Block Thick restarted lanczos solver */ + QUDA_EIG_IR_ARNOLDI, /* Implicitly Restarted Arnoldi solver */ + QUDA_EIG_BLK_IR_ARNOLDI, /* Block Implicitly Restarted Arnoldi solver */ QUDA_EIG_INVALID = QUDA_INVALID_ENUM } QudaEigType; @@ -167,8 +167,8 @@ typedef enum QudaSolveType_s { QUDA_NORMOP_PC_SOLVE, QUDA_NORMERR_SOLVE, QUDA_NORMERR_PC_SOLVE, - QUDA_NORMEQ_SOLVE = QUDA_NORMOP_SOLVE, // deprecated - QUDA_NORMEQ_PC_SOLVE = QUDA_NORMOP_PC_SOLVE, // deprecated + QUDA_NORMEQ_SOLVE = QUDA_NORMOP_SOLVE, /* deprecated */ + QUDA_NORMEQ_PC_SOLVE = QUDA_NORMOP_PC_SOLVE, /* deprecated */ QUDA_INVALID_SOLVE = QUDA_INVALID_ENUM } QudaSolveType; @@ -187,33 +187,34 @@ typedef enum QudaSchwarzType_s { } QudaSchwarzType; typedef enum QudaAcceleratorType_s { - QUDA_MADWF_ACCELERATOR = 0, // Use the MADWF accelerator + QUDA_MADWF_ACCELERATOR = 0, /* Use the MADWF accelerator */ QUDA_INVALID_ACCELERATOR = QUDA_INVALID_ENUM } QudaAcceleratorType; typedef enum QudaResidualType_s { - QUDA_L2_RELATIVE_RESIDUAL = 1, // L2 relative residual (default) - QUDA_L2_ABSOLUTE_RESIDUAL = 2, // L2 absolute residual - QUDA_HEAVY_QUARK_RESIDUAL = 4, // Fermilab heavy quark residual + QUDA_L2_RELATIVE_RESIDUAL = 1, /* L2 relative residual (default) */ + QUDA_L2_ABSOLUTE_RESIDUAL = 2, /* L2 absolute residual */ + QUDA_HEAVY_QUARK_RESIDUAL = 4, /* Fermilab heavy quark residual */ QUDA_INVALID_RESIDUAL = QUDA_INVALID_ENUM } QudaResidualType; -// Which basis to use for CA algorithms +/* Which basis to use for CA algorithms */ typedef enum QudaCABasis_s { QUDA_POWER_BASIS, QUDA_CHEBYSHEV_BASIS, QUDA_INVALID_BASIS = QUDA_INVALID_ENUM } QudaCABasis; -// Whether the preconditioned matrix is (1-k^2 
Deo Doe) or (1-k^2 Doe Deo) -// -// For the clover-improved Wilson Dirac operator, QUDA_MATPC_EVEN_EVEN -// defaults to the "symmetric" form, (1 - k^2 A_ee^-1 D_eo A_oo^-1 D_oe), -// and likewise for QUDA_MATPC_ODD_ODD. -// -// For the "asymmetric" form, (A_ee - k^2 D_eo A_oo^-1 D_oe), select -// QUDA_MATPC_EVEN_EVEN_ASYMMETRIC. -// +/** + * Whether the preconditioned matrix is (1-k^2 Deo Doe) or (1-k^2 Doe Deo) + * + * For the clover-improved Wilson Dirac operator, QUDA_MATPC_EVEN_EVEN + * defaults to the "symmetric" form, (1 - k^2 A_ee^-1 D_eo A_oo^-1 D_oe), + * and likewise for QUDA_MATPC_ODD_ODD. + * + * For the "asymmetric" form, (A_ee - k^2 D_eo A_oo^-1 D_oe), select + * QUDA_MATPC_EVEN_EVEN_ASYMMETRIC. + */ typedef enum QudaMatPCType_s { QUDA_MATPC_EVEN_EVEN, QUDA_MATPC_ODD_ODD, @@ -232,37 +233,37 @@ typedef enum QudaMassNormalization_s { } QudaMassNormalization; typedef enum QudaSolverNormalization_s { - QUDA_DEFAULT_NORMALIZATION, // leave source and solution untouched - QUDA_SOURCE_NORMALIZATION // normalize such that || src || = 1 + QUDA_DEFAULT_NORMALIZATION, /* leave source and solution untouched */ + QUDA_SOURCE_NORMALIZATION /* normalize such that || src || = 1 */ } QudaSolverNormalization; typedef enum QudaPreserveSource_s { - QUDA_PRESERVE_SOURCE_NO, // use the source for the residual - QUDA_PRESERVE_SOURCE_YES, // keep the source intact + QUDA_PRESERVE_SOURCE_NO, /* use the source for the residual */ + QUDA_PRESERVE_SOURCE_YES, /* keep the source intact */ QUDA_PRESERVE_SOURCE_INVALID = QUDA_INVALID_ENUM } QudaPreserveSource; typedef enum QudaDiracFieldOrder_s { - QUDA_INTERNAL_DIRAC_ORDER, // internal dirac order used, varies on precision and dslash type - QUDA_DIRAC_ORDER, // even-odd, color inside spin - QUDA_QDP_DIRAC_ORDER, // even-odd, spin inside color - QUDA_QDPJIT_DIRAC_ORDER, // even-odd, complex-color-spin-spacetime - QUDA_CPS_WILSON_DIRAC_ORDER, // odd-even, color inside spin - QUDA_LEX_DIRAC_ORDER, // lexicographical order, color 
inside spin - QUDA_TIFR_PADDED_DIRAC_ORDER, // padded z dimension for TIFR RHMC code - QUDA_OPENQCD_DIRAC_ORDER, // openqcd + QUDA_INTERNAL_DIRAC_ORDER, /* internal dirac order used, varies on precision and dslash type */ + QUDA_DIRAC_ORDER, /* even-odd, color inside spin */ + QUDA_QDP_DIRAC_ORDER, /* even-odd, spin inside color */ + QUDA_QDPJIT_DIRAC_ORDER, /* even-odd, complex-color-spin-spacetime */ + QUDA_CPS_WILSON_DIRAC_ORDER, /* odd-even, color inside spin */ + QUDA_LEX_DIRAC_ORDER, /* lexicographical order, color inside spin */ + QUDA_TIFR_PADDED_DIRAC_ORDER, /* padded z dimension for TIFR RHMC code */ + QUDA_OPENQCD_DIRAC_ORDER, /* openqcd */ QUDA_INVALID_DIRAC_ORDER = QUDA_INVALID_ENUM } QudaDiracFieldOrder; typedef enum QudaCloverFieldOrder_s { - QUDA_FLOAT_CLOVER_ORDER = 1, // even-odd float ordering - QUDA_FLOAT2_CLOVER_ORDER = 2, // even-odd float2 ordering - QUDA_FLOAT4_CLOVER_ORDER = 4, // even-odd float4 ordering - QUDA_FLOAT8_CLOVER_ORDER = 8, // even-odd float8 ordering - QUDA_PACKED_CLOVER_ORDER, // even-odd, QDP packed - QUDA_QDPJIT_CLOVER_ORDER, // (diagonal / off-diagonal)-chirality-spacetime - QUDA_BQCD_CLOVER_ORDER, // even-odd, super-diagonal packed and reordered - QUDA_OPENQCD_CLOVER_ORDER, // openqcd + QUDA_FLOAT_CLOVER_ORDER = 1, /* even-odd float ordering */ + QUDA_FLOAT2_CLOVER_ORDER = 2, /* even-odd float2 ordering */ + QUDA_FLOAT4_CLOVER_ORDER = 4, /* even-odd float4 ordering */ + QUDA_FLOAT8_CLOVER_ORDER = 8, /* even-odd float8 ordering */ + QUDA_PACKED_CLOVER_ORDER, /* even-odd, QDP packed */ + QUDA_QDPJIT_CLOVER_ORDER, /* (diagonal / off-diagonal)-chirality-spacetime */ + QUDA_BQCD_CLOVER_ORDER, /* even-odd, super-diagonal packed and reordered */ + QUDA_OPENQCD_CLOVER_ORDER, /* openqcd */ QUDA_INVALID_CLOVER_ORDER = QUDA_INVALID_ENUM } QudaCloverFieldOrder; @@ -282,15 +283,15 @@ typedef enum QudaPreserveDirac_s { QUDA_PRESERVE_DIRAC_INVALID = QUDA_INVALID_ENUM } QudaPreserveDirac; -// -// Type used for "parity" argument to 
dslashQuda() -// +/** + * Type used for "parity" argument to dslashQuda() + */ typedef enum QudaParity_s { QUDA_EVEN_PARITY = 0, QUDA_ODD_PARITY, QUDA_INVALID_PARITY = QUDA_INVALID_ENUM } QudaParity; -// -// Types used only internally -// +/** + * Types used only internally + */ typedef enum QudaDiracType_s { QUDA_WILSON_DIRAC, @@ -325,48 +326,48 @@ typedef enum QudaDiracType_s { QUDA_INVALID_DIRAC = QUDA_INVALID_ENUM } QudaDiracType; -// Where the field is stored +/* Where the field is stored */ typedef enum QudaFieldLocation_s { QUDA_CPU_FIELD_LOCATION = 1, QUDA_CUDA_FIELD_LOCATION = 2, QUDA_INVALID_FIELD_LOCATION = QUDA_INVALID_ENUM } QudaFieldLocation; -// Which sites are included +/* Which sites are included */ typedef enum QudaSiteSubset_s { QUDA_PARITY_SITE_SUBSET = 1, QUDA_FULL_SITE_SUBSET = 2, QUDA_INVALID_SITE_SUBSET = QUDA_INVALID_ENUM } QudaSiteSubset; -// Site ordering (always t-z-y-x, with rightmost varying fastest) +/* Site ordering (always t-z-y-x, with rightmost varying fastest) */ typedef enum QudaSiteOrder_s { - QUDA_LEXICOGRAPHIC_SITE_ORDER, // lexicographic ordering - QUDA_EVEN_ODD_SITE_ORDER, // QUDA and QDP use this - QUDA_ODD_EVEN_SITE_ORDER, // CPS uses this + QUDA_LEXICOGRAPHIC_SITE_ORDER, /* lexicographic ordering */ + QUDA_EVEN_ODD_SITE_ORDER, /* QUDA and QDP use this */ + QUDA_ODD_EVEN_SITE_ORDER, /* CPS uses this */ QUDA_INVALID_SITE_ORDER = QUDA_INVALID_ENUM } QudaSiteOrder; -// Degree of freedom ordering +/* Degree of freedom ordering */ typedef enum QudaFieldOrder_s { - QUDA_FLOAT_FIELD_ORDER = 1, // spin-color-complex-space - QUDA_FLOAT2_FIELD_ORDER = 2, // (spin-color-complex)/2-space-(spin-color-complex)%2 - QUDA_FLOAT4_FIELD_ORDER = 4, // (spin-color-complex)/4-space-(spin-color-complex)%4 - QUDA_FLOAT8_FIELD_ORDER = 8, // (spin-color-complex)/8-space-(spin-color-complex)%8 - QUDA_SPACE_SPIN_COLOR_FIELD_ORDER, // CPS/QDP++ ordering - QUDA_SPACE_COLOR_SPIN_FIELD_ORDER, // QLA ordering (spin inside color) - 
QUDA_QDPJIT_FIELD_ORDER, // QDP field ordering (complex-color-spin-spacetime) - QUDA_QOP_DOMAIN_WALL_FIELD_ORDER, // QOP domain-wall ordering - QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER, // TIFR RHMC ordering - QUDA_OPENQCD_FIELD_ORDER, // OPENQCD geometry ordering (at the moment lexicographical w/ rotation zyxt = x3x2x1x0 |-> xyzt x0x1x2x3 ) + QUDA_FLOAT_FIELD_ORDER = 1, /* spin-color-complex-space */ + QUDA_FLOAT2_FIELD_ORDER = 2, /* (spin-color-complex)/2-space-(spin-color-complex)%2 */ + QUDA_FLOAT4_FIELD_ORDER = 4, /* (spin-color-complex)/4-space-(spin-color-complex)%4 */ + QUDA_FLOAT8_FIELD_ORDER = 8, /* (spin-color-complex)/8-space-(spin-color-complex)%8 */ + QUDA_SPACE_SPIN_COLOR_FIELD_ORDER, /* CPS/QDP++ ordering */ + QUDA_SPACE_COLOR_SPIN_FIELD_ORDER, /* QLA ordering (spin inside color) */ + QUDA_QDPJIT_FIELD_ORDER, /* QDP field ordering (complex-color-spin-spacetime) */ + QUDA_QOP_DOMAIN_WALL_FIELD_ORDER, /* QOP domain-wall ordering */ + QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER, /* TIFR RHMC ordering */ + QUDA_OPENQCD_FIELD_ORDER, /* OPENQCD ordering */ QUDA_INVALID_FIELD_ORDER = QUDA_INVALID_ENUM } QudaFieldOrder; typedef enum QudaFieldCreate_s { - QUDA_NULL_FIELD_CREATE, // create new field - QUDA_ZERO_FIELD_CREATE, // create new field and zero it - QUDA_COPY_FIELD_CREATE, // create copy to field - QUDA_REFERENCE_FIELD_CREATE, // create reference to field + QUDA_NULL_FIELD_CREATE, /* create new field */ + QUDA_ZERO_FIELD_CREATE, /* create new field and zero it */ + QUDA_COPY_FIELD_CREATE, /* create copy to field */ + QUDA_REFERENCE_FIELD_CREATE, /* create reference to field */ QUDA_INVALID_FIELD_CREATE = QUDA_INVALID_ENUM } QudaFieldCreate; @@ -401,14 +402,14 @@ typedef enum QudaDilutionType_s { QUDA_DILUTION_INVALID = QUDA_INVALID_ENUM } QudaDilutionType; -// used to select projection method for deflated solvers +/* used to select projection method for deflated solvers */ typedef enum QudaProjectionType_s { QUDA_MINRES_PROJECTION, 
QUDA_GALERKIN_PROJECTION, QUDA_INVALID_PROJECTION = QUDA_INVALID_ENUM } QudaProjectionType; -// used to select checkerboard preconditioning method +/* used to select checkerboard preconditioning method */ typedef enum QudaPCType_s { QUDA_4D_PC = 4, QUDA_5D_PC = 5, QUDA_PC_INVALID = QUDA_INVALID_ENUM } QudaPCType; typedef enum QudaTwistFlavorType_s { @@ -477,7 +478,7 @@ typedef enum QudaBoolean_s { QUDA_BOOLEAN_INVALID = QUDA_INVALID_ENUM } QudaBoolean; -// define these for backwards compatibility +/* define these for backwards compatibility */ #define QUDA_BOOLEAN_NO QUDA_BOOLEAN_FALSE #define QUDA_BOOLEAN_YES QUDA_BOOLEAN_TRUE @@ -488,17 +489,17 @@ typedef enum QudaBLASType_s { } QudaBLASType; typedef enum QudaBLASOperation_s { - QUDA_BLAS_OP_N = 0, // No transpose - QUDA_BLAS_OP_T = 1, // Transpose only - QUDA_BLAS_OP_C = 2, // Conjugate transpose + QUDA_BLAS_OP_N = 0, /* No transpose */ + QUDA_BLAS_OP_T = 1, /* Transpose only */ + QUDA_BLAS_OP_C = 2, /* Conjugate transpose */ QUDA_BLAS_OP_INVALID = QUDA_INVALID_ENUM } QudaBLASOperation; typedef enum QudaBLASDataType_s { - QUDA_BLAS_DATATYPE_S = 0, // Single - QUDA_BLAS_DATATYPE_D = 1, // Double - QUDA_BLAS_DATATYPE_C = 2, // Complex(single) - QUDA_BLAS_DATATYPE_Z = 3, // Complex(double) + QUDA_BLAS_DATATYPE_S = 0, /* Single */ + QUDA_BLAS_DATATYPE_D = 1, /* Double */ + QUDA_BLAS_DATATYPE_C = 2, /* Complex(single) */ + QUDA_BLAS_DATATYPE_Z = 3, /* Complex(double) */ QUDA_BLAS_DATATYPE_INVALID = QUDA_INVALID_ENUM } QudaBLASDataType; @@ -522,7 +523,7 @@ typedef enum QudaFieldGeometry_s { QUDA_VECTOR_GEOMETRY = 4, QUDA_TENSOR_GEOMETRY = 6, QUDA_COARSE_GEOMETRY = 8, - QUDA_KDINVERSE_GEOMETRY = 16, // Decomposition of Kahler-Dirac block + QUDA_KDINVERSE_GEOMETRY = 16, /* Decomposition of Kahler-Dirac block */ QUDA_INVALID_GEOMETRY = QUDA_INVALID_ENUM } QudaFieldGeometry; @@ -542,8 +543,8 @@ typedef enum QudaStaggeredPhase_s { } QudaStaggeredPhase; typedef enum QudaContractType_s { - QUDA_CONTRACT_TYPE_OPEN, // Open 
spin elementals - QUDA_CONTRACT_TYPE_DR, // DegrandRossi + QUDA_CONTRACT_TYPE_OPEN, /* Open spin elementals */ + QUDA_CONTRACT_TYPE_DR, /* DegrandRossi */ QUDA_CONTRACT_TYPE_INVALID = QUDA_INVALID_ENUM } QudaContractType; @@ -576,7 +577,7 @@ typedef enum QudaGaugeSmearType_s { QUDA_GAUGE_SMEAR_INVALID = QUDA_INVALID_ENUM } QudaGaugeSmearType; -// Allows to choose an appropriate external library +/* Allows to choose an appropriate external library */ typedef enum QudaExtLibType_s { QUDA_CUSOLVE_EXTLIB, QUDA_EIGEN_EXTLIB, diff --git a/include/quda.h b/include/quda.h index 70a14bc804..2798f3e42b 100644 --- a/include/quda.h +++ b/include/quda.h @@ -16,7 +16,7 @@ #ifndef __CUDACC_RTC__ #define double_complex double _Complex -#else // keep NVRTC happy since it can't handle C types +#else /* keep NVRTC happy since it can't handle C types */ #define double_complex double2 #endif @@ -449,13 +449,13 @@ extern "C" { } QudaInvertParam; - // Parameter set for solving eigenvalue problems. + /* Parameter set for solving eigenvalue problems. */ typedef struct QudaEigParam_s { /** Size of this struct in bytes. 
Used to ensure that the host application and QUDA see the same struct size */ size_t struct_size; - // EIGENSOLVER PARAMS - //------------------------------------------------- + /* EIGENSOLVER PARAMS */ + /*-------------------------------------------------*/ /** Used to store information pertinent to the operator **/ QudaInvertParam *invert_param; @@ -545,10 +545,9 @@ extern "C" { /** Name of the QUDA logfile (residua, upper Hessenberg/tridiag matrix updates) **/ char QUDA_logfile[512]; - //------------------------------------------------- - - // EIG-CG PARAMS - //------------------------------------------------- + /*-------------------------------------------------*/ + /* EIG-CG PARAMS */ + /*-------------------------------------------------*/ int nk; int np; @@ -589,7 +588,7 @@ extern "C" { /** Which external library to use in the deflation operations (Eigen) */ QudaExtLibType extlib_type; - //------------------------------------------------- + /*-------------------------------------------------*/ } QudaEigParam; typedef struct QudaMultigridParam_s { @@ -842,7 +841,7 @@ extern "C" { QudaBLASType blas_type; /**< Type of BLAS computation to perfrom */ - // GEMM params + /* GEMM params */ QudaBLASOperation trans_a; /**< operation op(A) that is non- or (conj.) transpose. */ QudaBLASOperation trans_b; /**< operation op(B) that is non- or (conj.) transpose. */ int m; /**< number of rows of matrix op(A) and C. */ @@ -860,10 +859,10 @@ extern "C" { double_complex alpha; /**< scalar used for multiplication. */ double_complex beta; /**< scalar used for multiplication. If beta==0, C does not have to be a valid input. */ - // LU inversion params + /* LU inversion params */ int inv_mat_size; /**< The rank of the square matrix in the LU inversion */ - // Common params + /* Common params */ int batch_count; /**< number of pointers contained in arrayA, arrayB and arrayC. 
*/ QudaBLASDataType data_type; /**< Specifies if using S(C) or D(Z) BLAS type */ QudaBLASDataOrder data_order; /**< Specifies if using Row or Column major */ @@ -967,7 +966,7 @@ extern "C" { * initQuda. Calling initQudaMemory requires that the user has * previously called initQudaDevice. */ - void initQudaMemory(); + void initQudaMemory(void); /** * Initialize the library. This function is actually a wrapper @@ -990,7 +989,7 @@ extern "C" { * @details This should only be needed for automated testing when * different partitioning is applied within a single run. */ - void updateR(); + void updateR(void); /** * A new QudaGaugeParam should always be initialized immediately @@ -1728,7 +1727,7 @@ extern "C" { } #endif -// remove NVRTC WAR +/* remove NVRTC WAR */ #undef double_complex /* #include */ diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index e14275b5e8..4c7366cbdb 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -1,6 +1,5 @@ #pragma once -#include #include /** diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index a94505680d..cf575a4822 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -27,8 +27,10 @@ static openQCD_QudaState_t qudaState = {false, false, false, false, {}, {}}; using namespace quda; -// code for NVTX taken from Jiri Kraus' blog post: -// http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ +/** + * code for NVTX taken from Jiri Kraus' blog post: + * http://devblogs.nvidia.com/parallelforall/cuda-pro-tip-generate-custom-application-profile-timelines-nvtx/ + */ #ifdef INTERFACE_NVTX @@ -64,7 +66,7 @@ static const int num_colors = sizeof(colors) / sizeof(uint32_t); template void inline qudaopenqcd_called(const char *func, QudaVerbosity verb) { - // add NVTX markup if enabled + /* add NVTX markup if enabled */ if (start) { PUSH_RANGE(func, 1); } else { @@ -239,7 +241,6 @@ class 
KeyValueStore { if (map != nullptr) { auto mvalue = map->find(value); if (mvalue != map->end()) { - //store[section][key] = mvalue->second; std::get<0>(store[section][key]) = mvalue->second; std::get<1>(store[section][key]) = value; return; @@ -247,7 +248,6 @@ class KeyValueStore { } std::get<0>(store[section][key]) = value; std::get<1>(store[section][key]) = value; - //store[section][key] = value; } void set_map(std::unordered_map *_map) { @@ -270,7 +270,7 @@ class KeyValueStore { int idx; std::string rkey; std::smatch match; - std::regex p_key("([^\\[]+)\\[(\\d+)\\]"); // key[idx] + std::regex p_key("([^\\[]+)\\[(\\d+)\\]"); /* key[idx] */ auto sec = store.find(section); if (sec != store.end()) { @@ -304,7 +304,7 @@ class KeyValueStore { } } } - return default_value; // Return default value for non-existent keys + return default_value; /* Return default value for non-existent keys */ } /** @@ -317,15 +317,15 @@ class KeyValueStore { std::smatch match; std::ifstream file(filename.c_str()); - std::regex p_section("^\\s*\\[([\\w\\ ]+)\\].*$"); // [section] - std::regex p_comment("^[^#]*(\\s*#.*)$"); // line # comment - std::regex p_key_val("^([^\\s]+)\\s+(.*[^\\s]+)\\s*$"); // key value + std::regex p_section("^\\s*\\[([\\w\\ ]+)\\].*$"); /* [section] */ + std::regex p_comment("^[^#]*(\\s*#.*)$"); /* line # comment */ + std::regex p_key_val("^([^\\s]+)\\s+(.*[^\\s]+)\\s*$"); /* key value */ if (file.is_open()) { while (std::getline(file, line)) { - // remove all comments + /* remove all comments */ if (std::regex_search(line, match, p_comment)) { line.erase(match.position(1)); } @@ -396,7 +396,7 @@ static lat_dim_t get_local_dims(int *fill = nullptr) * * @return rank */ -static int rankFromCoords(const int *coords, void *fdata) // TODO: +static int rankFromCoords(const int *coords, void *fdata) { int *base = static_cast(fdata); int *NPROC = base + 1; @@ -426,18 +426,18 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) } #ifdef MULTI_GPU -// TODO: would we 
ever want to run with QMP COMMS? +/* TODO: would we ever want to run with QMP COMMS? */ #ifdef QMP_COMMS initCommsGridQuda(4, layout.nproc, nullptr, nullptr); #else initCommsGridQuda(4, layout.nproc, rankFromCoords, (void *)(layout.data)); #endif - static int device = -1; // enable a default allocation of devices to processes + static int device = -1; /* enable a default allocation of devices to processes */ #else static int device = layout.device; #endif - // must happen *after* communication initialization + /* must happen *after* communication initialization */ MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); sprintf(prefix, "QUDA (rank=%d): ", my_rank); @@ -468,13 +468,11 @@ static QudaInvertParam newOpenQCDParam(void) param.verbosity = verbosity; - param.cpu_prec = QUDA_DOUBLE_PRECISION; // The precision used by the input fermion fields - param.cuda_prec = QUDA_DOUBLE_PRECISION; // The precision used by the QUDA solver + param.cpu_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the input fermion fields */ + param.cuda_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA solver */ - /* AA: This breaks GCR */ - // /* TH added for MG support */ - param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; // The precision used by the QUDA solver - param.cuda_prec_precondition = QUDA_HALF_PRECISION; // The precision used by the QUDA solver + param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; /* The precision used by the QUDA solver */ + param.cuda_prec_precondition = QUDA_HALF_PRECISION; /* The precision used by the QUDA solver */ /** * The order of the input and output fermion fields. 
Imposes fieldOrder = @@ -483,7 +481,7 @@ static QudaInvertParam newOpenQCDParam(void) */ param.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - // Gamma basis of the input and output host fields + /* Gamma basis of the input and output host fields */ param.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; return param; @@ -507,16 +505,16 @@ static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec) param.reconstruct_sloppy = param.reconstruct = QUDA_RECONSTRUCT_NO; - // This make quda to instantiate OpenQCDOrder + /* This make quda to instantiate OpenQCDOrder */ param.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; - // Seems to have no effect ... + /* Seems to have no effect ... */ param.t_boundary = QUDA_PERIODIC_T; param.gauge_fix = QUDA_GAUGE_FIXED_NO; param.scale = 1.0; - param.anisotropy = 1.0; // 1.0 means not anisotropic - param.ga_pad = getLinkPadding(param.X); // Why this? + param.anisotropy = 1.0; /* 1.0 means not anisotropic */ + param.ga_pad = getLinkPadding(param.X); /* Why this? */ checkGaugeParam(¶m); @@ -553,7 +551,7 @@ double openQCD_qudaPlaquette(void) plaqQuda(plaq); - // Note different Nc normalization wrt openQCD! + /* Note different Nc normalization wrt openQCD! */ return 3.0*plaq[0]; } @@ -629,16 +627,16 @@ static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) param.dagger = QUDA_DAG_NO; if (p.su3csw != 0.0) { - param.clover_location = QUDA_CUDA_FIELD_LOCATION; // seems to have no effect? + param.clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; - param.clover_order = QUDA_FLOAT8_CLOVER_ORDER; // what implication has this? + param.clover_order = QUDA_FLOAT8_CLOVER_ORDER; /* what implication has this? 
*/ param.compute_clover = true; param.clover_csw = p.su3csw; param.clover_coeff = 0.0; - // Set to Wilson Dirac operator with Clover term + /* Set to Wilson Dirac operator with Clover term */ param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; if (!qudaState.clover_loaded) { @@ -647,17 +645,17 @@ static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) * the GPU and finally calls @createCloverQuda to calculate the clover * field. */ - loadCloverQuda(NULL, NULL, ¶m); // Create the clover field + loadCloverQuda(NULL, NULL, ¶m); /* Create the clover field */ qudaState.clover_loaded = true; } } - param.inv_type = QUDA_CG_INVERTER; // just set some, needed? + param.inv_type = QUDA_CG_INVERTER; /* just set some, needed? */ - // What is the difference? only works with QUDA_MASS_NORMALIZATION + /* What is the difference? only works with QUDA_MASS_NORMALIZATION */ param.mass_normalization = QUDA_MASS_NORMALIZATION; - // Extent of the 5th dimension (for domain wall) + /* Extent of the 5th dimension (for domain wall) */ param.Ls = 1; return param; @@ -681,7 +679,7 @@ static QudaInvertParam newOpenQCDSolverParam(openQCD_QudaDiracParam_t p) param.solve_type = QUDA_DIRECT_SOLVE; param.matpc_type = QUDA_MATPC_EVEN_EVEN; param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; - param.inv_type_precondition = QUDA_INVALID_INVERTER; // disables any preconditioning + param.inv_type_precondition = QUDA_INVALID_INVERTER; /* disables any preconditioning */ return param; } @@ -689,33 +687,33 @@ static QudaInvertParam newOpenQCDSolverParam(openQCD_QudaDiracParam_t p) void openQCD_back_and_forth(void *h_in, void *h_out) { - // sets up the necessary parameters + /* sets up the necessary parameters */ QudaInvertParam param = newOpenQCDParam(); - // creates a field on the CPU + /* creates a field on the CPU */ ColorSpinorParam cpuParam(h_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); ColorSpinorField in_h(cpuParam); - // creates a field on the GPU with the same 
parameter set as the CPU field + /* creates a field on the GPU with the same parameter set as the CPU field */ ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); ColorSpinorField in(cudaParam); - // transfer the CPU field to GPU + /* transfer the CPU field to GPU */ in = in_h; - // creates a field on the CPU + /* creates a field on the CPU */ cpuParam.v = h_out; cpuParam.location = QUDA_CPU_FIELD_LOCATION; ColorSpinorField out_h(cpuParam); - // creates a zero-field on the GPU + /* creates a zero-field on the GPU */ cudaParam.create = QUDA_NULL_FIELD_CREATE; cudaParam.location = QUDA_CUDA_FIELD_LOCATION; ColorSpinorField out(cudaParam); out = in; - // transfer the GPU field back to CPU + /* transfer the GPU field back to CPU */ out_h = out; } @@ -750,37 +748,37 @@ double openQCD_qudaNorm_NoLoads(void *d_in) void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) { - // sets up the necessary parameters + /* sets up the necessary parameters */ QudaInvertParam param = newOpenQCDParam(); - // creates a field on the CPU + /* creates a field on the CPU */ ColorSpinorParam cpuParam(openQCD_in, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); ColorSpinorField in_h(cpuParam); - // creates a field on the GPU with the same parameter set as the CPU field + /* creates a field on the GPU with the same parameter set as the CPU field */ ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); ColorSpinorField in(cudaParam); - // transfer the CPU field to GPU + /* transfer the CPU field to GPU */ in = in_h; - // creates a zero-field on the GPU + /* creates a zero-field on the GPU */ cudaParam.create = QUDA_NULL_FIELD_CREATE; cudaParam.location = QUDA_CUDA_FIELD_LOCATION; ColorSpinorField out(cudaParam); - // gamma_i run within QUDA using QUDA fields + /* gamma_i run within QUDA using QUDA fields */ switch (dir) { - case 0: // t direction + case 0: /* t direction */ gamma3(out, in); break; - case 1: // x direction + case 
1: /* x direction */ gamma0(out, in); break; - case 2: // y direction + case 2: /* y direction */ gamma1(out, in); break; - case 3: // z direction + case 3: /* z direction */ gamma2(out, in); break; case 4: @@ -798,30 +796,30 @@ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) errorQuda("Unknown gamma: %d\n", dir); } - // creates a field on the CPU + /* creates a field on the CPU */ cpuParam.v = openQCD_out; cpuParam.location = QUDA_CPU_FIELD_LOCATION; ColorSpinorField out_h(cpuParam); - // transfer the GPU field back to CPU + /* transfer the GPU field back to CPU */ out_h = out; } void* openQCD_qudaH2D(void *openQCD_field) { - // sets up the necessary parameters + /* sets up the necessary parameters */ QudaInvertParam param = newOpenQCDParam(); - // creates a field on the CPU + /* creates a field on the CPU */ ColorSpinorParam cpuParam(openQCD_field, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); ColorSpinorField in_h(cpuParam); - // creates a field on the GPU with the same parameter set as the CPU field + /* creates a field on the GPU with the same parameter set as the CPU field */ ColorSpinorParam cudaParam(cpuParam, param, QUDA_CUDA_FIELD_LOCATION); ColorSpinorField *in = new ColorSpinorField(cudaParam); - *in = in_h; // transfer the CPU field to GPU + *in = in_h; /* transfer the CPU field to GPU */ return in; } @@ -838,14 +836,14 @@ void openQCD_qudaD2H(void *quda_field, void *openQCD_field) int my_rank; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - // sets up the necessary parameters + /* sets up the necessary parameters */ QudaInvertParam param = newOpenQCDParam(); - // creates a field on the CPU + /* creates a field on the CPU */ ColorSpinorParam cpuParam(openQCD_field, param, get_local_dims(), false, QUDA_CPU_FIELD_LOCATION); ColorSpinorField out_h(cpuParam); - // transfer the GPU field to CPU + /* transfer the GPU field to CPU */ out_h = *reinterpret_cast(quda_field); } @@ -854,13 +852,11 @@ void openQCD_qudaDw(void *src, 
void *dst, openQCD_QudaDiracParam_t p) { QudaInvertParam param = newOpenQCDDiracParam(p); - // both fields reside on the CPU + /* both fields reside on the CPU */ param.input_location = QUDA_CPU_FIELD_LOCATION; param.output_location = QUDA_CPU_FIELD_LOCATION; MatQuda(static_cast(dst), static_cast(src), &param); - /* AA: QUDA applies - Dw */ - /* blas::ax(-1.0, dst); */ } @@ -869,7 +865,7 @@ double openQCD_qudaGCR(void *source, void *solution, { QudaInvertParam param = newOpenQCDSolverParam(dirac_param); - // both fields reside on the CPU + /* both fields reside on the CPU */ param.input_location = QUDA_CPU_FIELD_LOCATION; param.output_location = QUDA_CPU_FIELD_LOCATION; @@ -898,13 +894,13 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - // Allocate on the heap + /* Allocate on the heap */ QudaInvertParam* param = new QudaInvertParam(newQudaInvertParam()); QudaInvertParam* invert_param_mg = new QudaInvertParam(newQudaInvertParam()); QudaMultigridParam* multigrid_param = new QudaMultigridParam(newQudaMultigridParam()); - // Some default settings - // Some of them should not be changed + /* Some default settings */ + /* Some of them should not be changed */ param->verbosity = QUDA_SUMMARIZE; param->cpu_prec = QUDA_DOUBLE_PRECISION; param->cuda_prec = QUDA_DOUBLE_PRECISION; @@ -920,22 +916,22 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->solve_type = QUDA_DIRECT_SOLVE; param->matpc_type = QUDA_MATPC_EVEN_EVEN; param->solver_normalization = QUDA_DEFAULT_NORMALIZATION; - param->inv_type_precondition = QUDA_INVALID_INVERTER; // disables any preconditioning + param->inv_type_precondition = QUDA_INVALID_INVERTER; /* disables any preconditioning */ param->mass_normalization = QUDA_MASS_NORMALIZATION; if (qudaState.layout.dirac_parms.su3csw != 0.0) { - param->clover_location = QUDA_CUDA_FIELD_LOCATION; // seems to have no effect?
+ param->clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; param->clover_csw = qudaState.layout.dirac_parms.su3csw; param->clover_coeff = 0.0; - // Set to Wilson Dirac operator with Clover term + /* Set to Wilson Dirac operator with Clover term */ param->dslash_type = QUDA_CLOVER_WILSON_DSLASH; if (qudaState.layout.flds_parms.gauge == OPENQCD_GAUGE_SU3) { - param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; // what implication has this? + param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; /* what implication has this? */ param->compute_clover = true; } else { param->clover_order = QUDA_OPENQCD_CLOVER_ORDER; @@ -959,7 +955,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) errorQuda("Solver section %s is not a quda-solver section\n", section); } - // both fields reside on the CPU + /* both fields reside on the CPU */ param->input_location = kv.get(section, "input_location", QUDA_CPU_FIELD_LOCATION); param->output_location = kv.get(section, "output_location", QUDA_CPU_FIELD_LOCATION); @@ -1084,10 +1080,10 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) std::string mg_section = std::string(section) + " Multigrid"; - // (shallow) copy the struct + /* (shallow) copy the struct */ *invert_param_mg = *param; - // these have to be fixed + /* these have to be fixed */ invert_param_mg->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; invert_param_mg->dirac_order = QUDA_DIRAC_ORDER; @@ -1176,7 +1172,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) } } - // transfer of the struct to all the processes + /* transfer of the struct to all the processes */ MPI_Bcast((void*) param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); MPI_Bcast((void*) invert_param_mg, sizeof(*invert_param_mg), MPI_BYTE, 0, MPI_COMM_WORLD); MPI_Bcast((void*) multigrid_param, sizeof(*multigrid_param), MPI_BYTE, 0, MPI_COMM_WORLD); @@ -1211,12 +1207,12 @@ 
void* openQCD_qudaSolverSetup(char *infile, char *section) openQCD_qudaCloverLoad(h_sw, param->kappa, param->clover_csw); POP_RANGE; - //loadCloverQuda(qudaState.layout.h_sw(), NULL, param); - // The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why? - //QUDA: ERROR: qudaEventRecord_ returned CUDA_ERROR_ILLEGAL_ADDRESS - // (timer.h:82 in start()) - // (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const char*, const char*, const char*, bool)()) - //QUDA: last kernel called was (name=N4quda10CopyCloverINS_6clover11FloatNOrderIdLi72ELi2ELb0ELb1ELb0EEENS1_12OpenQCDOrderIdLi72EEEddEE,volume=32x16x16x64,aux=GPU-offline,vol=524288precision=8Nc=3,compute_diagonal) + /*loadCloverQuda(qudaState.layout.h_sw(), NULL, param);*/ + /* The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why? + QUDA: ERROR: qudaEventRecord_ returned CUDA_ERROR_ILLEGAL_ADDRESS + (timer.h:82 in start()) + (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const char*, const char*, const char*, bool)()) + QUDA: last kernel called was (name=N4quda10CopyCloverINS_6clover11FloatNOrderIdLi72ELi2ELb0ELb1ELb0EEENS1_12OpenQCDOrderIdLi72EEEddEE,volume=32x16x16x64,aux=GPU-offline,vol=524288precision=8Nc=3,compute_diagonal)*/ } } @@ -1285,7 +1281,7 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); QudaVerbosity verbosity = QUDA_SUMMARIZE; - // Allocate on the heap + /* Allocate on the heap */ QudaEigParam* param = new QudaEigParam(newQudaEigParam()); if (my_rank == 0) { @@ -1307,7 +1303,7 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio param->a_min = kv.get(section, "a_min", param->a_min); param->a_max = kv.get(section, "a_max", param->a_max); param->preserve_deflation = kv.get(section, "preserve_deflation", 
param->preserve_deflation); - //param->*preserve_deflation_space = kv.get(section, *"*preserve_deflation_space", param->preserve_deflation_space); + /*param->*preserve_deflation_space = kv.get(section, *"*preserve_deflation_space", param->preserve_deflation_space);*/ param->preserve_evals = kv.get(section, "preserve_evals", param->preserve_evals); param->use_dagger = kv.get(section, "use_dagger", param->use_dagger); param->use_norm_op = kv.get(section, "use_norm_op", param->use_norm_op); @@ -1338,14 +1334,14 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio param->mem_type_ritz = kv.get(section, "mem_type_ritz", param->mem_type_ritz); param->location = kv.get(section, "location", param->location); param->run_verify = kv.get(section, "run_verify", param->run_verify); - //strcpy(param->vec_infile, kv.get(section, "vec_infile", param->vec_infile).c_str()); - //strcpy(param->vec_outfile, kv.get(section, "vec_outfile", param->vec_outfile).c_str()); + /*strcpy(param->vec_infile, kv.get(section, "vec_infile", param->vec_infile).c_str());*/ + /*strcpy(param->vec_outfile, kv.get(section, "vec_outfile", param->vec_outfile).c_str());*/ param->save_prec = kv.get(section, "save_prec", param->save_prec); param->io_parity_inflate = kv.get(section, "io_parity_inflate", param->io_parity_inflate); param->extlib_type = kv.get(section, "extlib_type", param->extlib_type); } - // transfer of the struct to all the processes + /* transfer of the struct to all the processes */ MPI_Bcast((void*) param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); void *inv_param = openQCD_qudaSolverSetup(infile, inv_section); @@ -1390,7 +1386,6 @@ double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracPara QudaInvertParam invert_param_mg = newOpenQCDSolverParam(dirac_param); QudaMultigridParam multigrid_param = newQudaMultigridParam(); - //param.verbosity = QUDA_VERBOSE; invert_param.reliable_delta = 1e-5; invert_param.gcrNkrylov = 20; 
invert_param.maxiter = 2000; @@ -1401,8 +1396,8 @@ double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracPara invert_param.matpc_type = QUDA_MATPC_EVEN_EVEN; invert_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; invert_param.inv_type_precondition = QUDA_MG_INVERTER; - invert_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; // The precision used by the QUDA solver - invert_param.cuda_prec_precondition = QUDA_HALF_PRECISION; // The precision used by the QUDA solver + invert_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; /* The precision used by the QUDA solver */ + invert_param.cuda_prec_precondition = QUDA_HALF_PRECISION; /* The precision used by the QUDA solver */ invert_param_mg.reliable_delta = 1e-5; invert_param_mg.gcrNkrylov = 20; @@ -1417,17 +1412,21 @@ double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracPara invert_param_mg.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; invert_param_mg.dirac_order = QUDA_DIRAC_ORDER; - // set the params, hard code the solver - // parameters copied from recommended settings from Wiki + /** + * set the params, hard code the solver + * parameters copied from recommended settings from Wiki + */ multigrid_param.n_level = 2; multigrid_param.generate_all_levels = QUDA_BOOLEAN_TRUE; multigrid_param.run_verify = QUDA_BOOLEAN_FALSE; multigrid_param.invert_param = &invert_param_mg; multigrid_param.compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES; - // try setting minimal parameters - leave rest to default - // level 0 fine - multigrid_param.geo_block_size[0][0] = 4; // xytz + /** + * try setting minimal parameters - leave rest to default + * level 0 fine + */ + multigrid_param.geo_block_size[0][0] = 4; /* xytz */ multigrid_param.geo_block_size[0][1] = 4; multigrid_param.geo_block_size[0][2] = 4; multigrid_param.geo_block_size[0][3] = 4; @@ -1447,9 +1446,11 @@ double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracPara multigrid_param.coarse_solver_maxiter[0] = 
50; multigrid_param.coarse_grid_solution_type[0] = QUDA_MAT_SOLUTION; - // level 1 coarse - // no smoother required for innermost - // so no blocks + /** + * level 1 coarse + * no smoother required for innermost + * so no blocks + */ multigrid_param.precision_null[1] = QUDA_HALF_PRECISION; multigrid_param.coarse_solver[1] = QUDA_CA_GCR_INVERTER; multigrid_param.smoother[1] = QUDA_CA_GCR_INVERTER; From 8f4c7dcccebc80cd9741ec73aacdce8d2554f43a Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 3 Nov 2023 10:01:29 +0100 Subject: [PATCH 106/148] setup verbosity --- lib/openqcd_interface.cpp | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index cf575a4822..939dc14852 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -1163,8 +1163,8 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) multigrid_param->vec_load[i] = kv.get(subsection, "vec_load", multigrid_param->vec_load[i]); multigrid_param->vec_store[i] = kv.get(subsection, "vec_store", multigrid_param->vec_store[i]); - strcpy(multigrid_param->vec_infile[i], kv.get(subsection, "vec_infile", multigrid_param->vec_infile[i]).c_str()); - strcpy(multigrid_param->vec_outfile[i], kv.get(subsection, "vec_outfile", multigrid_param->vec_outfile[i]).c_str()); + /*strcpy(multigrid_param->vec_infile[i], kv.get(subsection, "vec_infile", multigrid_param->vec_infile[i]).c_str()); + strcpy(multigrid_param->vec_outfile[i], kv.get(subsection, "vec_outfile", multigrid_param->vec_outfile[i]).c_str());*/ multigrid_param->mu_factor[i] = kv.get(subsection, "mu_factor", multigrid_param->mu_factor[i]); multigrid_param->transfer_type[i] = kv.get(subsection, "transfer_type", multigrid_param->transfer_type[i]); @@ -1217,6 +1217,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) } if (param->inv_type_precondition == QUDA_MG_INVERTER) { + logQuda(QUDA_VERBOSE, "Setting up multigrid solver ...\n"); 
PUSH_RANGE("newMultigridQuda",4); mgprec = newMultigridQuda(multigrid_param); param->preconditioner = mgprec; @@ -1249,12 +1250,12 @@ double openQCD_qudaInvert(void *param, double mu, void *source, void *solution, invertQuda(static_cast(solution), static_cast(source), invert_param); POP_RANGE; - logQuda(QUDA_VERBOSE, "openQCD_qudaInvert()\n"); - logQuda(QUDA_VERBOSE, " true_res = %.2e\n", invert_param->true_res); - logQuda(QUDA_VERBOSE, " true_res_hq = %.2e\n", invert_param->true_res_hq); - logQuda(QUDA_VERBOSE, " iter = %d\n", invert_param->iter); - logQuda(QUDA_VERBOSE, " gflops = %.2e\n", invert_param->gflops); - logQuda(QUDA_VERBOSE, " secs = %.2e\n", invert_param->secs); + logQuda(QUDA_SUMMARIZE, "openQCD_qudaInvert()\n"); + logQuda(QUDA_SUMMARIZE, " true_res = %.2e\n", invert_param->true_res); + logQuda(QUDA_SUMMARIZE, " true_res_hq = %.2e\n", invert_param->true_res_hq); + logQuda(QUDA_SUMMARIZE, " iter = %d\n", invert_param->iter); + logQuda(QUDA_SUMMARIZE, " gflops = %.2e\n", invert_param->gflops); + logQuda(QUDA_SUMMARIZE, " secs = %.2e\n", invert_param->secs); *status = invert_param->true_res <= invert_param->tol ? 
invert_param->iter : -1; @@ -1368,9 +1369,9 @@ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) eigensolveQuda(h_evecs, static_cast(h_evals), eig_param); POP_RANGE; - logQuda(QUDA_VERBOSE, "openQCD_qudaEigensolve()\n"); - logQuda(QUDA_VERBOSE, " gflops = %.2e\n", eig_param->gflops); - logQuda(QUDA_VERBOSE, " secs = %.2e\n", eig_param->secs); + logQuda(QUDA_SUMMARIZE, "openQCD_qudaEigensolve()\n"); + logQuda(QUDA_SUMMARIZE, " gflops = %.2e\n", eig_param->gflops); + logQuda(QUDA_SUMMARIZE, " secs = %.2e\n", eig_param->secs); } void openQCD_qudaEigensolverDestroy(void *param) From 1c85393fa56b9564a37264260b5ccf0695f95f4a Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 6 Nov 2023 10:29:12 +0100 Subject: [PATCH 107/148] fixed eigensolverDestroy --- include/quda_openqcd_interface.h | 2 ++ lib/openqcd_interface.cpp | 29 ++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 4c7366cbdb..8f476b6cca 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -188,6 +188,8 @@ void openQCD_qudaSpinorFree(void** quda_field); * @param[in] p Dirac parameter struct */ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p); +void openQCD_qudaDdagD(void *src, void *dst, openQCD_QudaDiracParam_t p); +void openQCD_qudaDw2(void *param, double mu, void *src, void *dst); /** diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 939dc14852..65042e481c 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -860,6 +860,30 @@ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) } +void openQCD_qudaDdagD(void *src, void *dst, openQCD_QudaDiracParam_t p) +{ + QudaInvertParam param = newOpenQCDDiracParam(p); + + /* both fields reside on the CPU */ + param.input_location = QUDA_CPU_FIELD_LOCATION; + param.output_location = QUDA_CPU_FIELD_LOCATION; + + 
MatDagMatQuda(static_cast(dst), static_cast(src), &param); +} + + +void openQCD_qudaDw2(void *param, double mu, void *src, void *dst) +{ + QudaInvertParam* inv_param = static_cast(param); + inv_param->mu = mu; + + /* both fields reside on the CPU */ + inv_param->input_location = QUDA_CPU_FIELD_LOCATION; + inv_param->output_location = QUDA_CPU_FIELD_LOCATION; + + MatQuda(static_cast(dst), static_cast(src), inv_param); +} + double openQCD_qudaGCR(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param, openQCD_QudaGCRParam_t gcr_param) { @@ -1337,6 +1361,8 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio param->run_verify = kv.get(section, "run_verify", param->run_verify); /*strcpy(param->vec_infile, kv.get(section, "vec_infile", param->vec_infile).c_str());*/ /*strcpy(param->vec_outfile, kv.get(section, "vec_outfile", param->vec_outfile).c_str());*/ + param->vec_outfile[0] = '\0'; + param->vec_infile[0] = '\0'; param->save_prec = kv.get(section, "save_prec", param->save_prec); param->io_parity_inflate = kv.get(section, "io_parity_inflate", param->io_parity_inflate); param->extlib_type = kv.get(section, "extlib_type", param->extlib_type); @@ -1376,7 +1402,8 @@ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) void openQCD_qudaEigensolverDestroy(void *param) { - QudaInvertParam* eig_param = static_cast(param); + QudaEigParam* eig_param = static_cast(param); + openQCD_qudaSolverDestroy(eig_param->invert_param); delete eig_param; } From 0842ed03cd18efd0521f2862a6791deee70c25a4 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 6 Nov 2023 14:08:42 +0100 Subject: [PATCH 108/148] cleaned up tabs/spaces, comment, unnecessary additions --- .gitignore | 3 +- CMakeLists.txt | 14 - include/enum_quda_fortran.h | 2 +- include/gauge_field_order.h | 208 ++++++------- include/index_helper.cuh | 24 +- include/kernels/copy_color_spinor.cuh | 20 -- include/quda.h | 20 +- lib/color_spinor_field.cpp | 1 -
lib/comm_common.cpp | 7 +- lib/copy_gauge_extended.cu | 2 + lib/copy_gauge_inc.cu | 1 + lib/cpu_gauge_field.cpp | 433 -------------------------- lib/interface_quda.cpp | 237 ++------------ 13 files changed, 149 insertions(+), 823 deletions(-) delete mode 100644 lib/cpu_gauge_field.cpp diff --git a/.gitignore b/.gitignore index 7665c11baa..a4b290fbb2 100644 --- a/.gitignore +++ b/.gitignore @@ -19,5 +19,4 @@ include/quda_define.h include/jitify_options.hpp .tags* autom4te.cache/* -.vscode -build \ No newline at end of file +.vscode \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index d5b5c56477..61f69c2faf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -702,17 +702,3 @@ include(CTest) add_subdirectory(lib) add_subdirectory(tests) add_subdirectory(doc) - - - -# ###################################################################################################################### -# OpenQxD -# ###################################################################################################################### -# We might only want to do that if using QUDA_DOWNLOAD_USQCD, but this does not work if not set on the initial run -# if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) - # set(CMAKE_INSTALL_PREFIX - # ${CMAKE_BINARY_DIR}/usqcd - # CACHE PATH "..." 
FORCE) -# endif() - -# include_directories(/scratch/jfernande/openQxD-devel/include) \ No newline at end of file diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 32e86d0e18..5de7760732 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -330,7 +330,7 @@ #define QUDA_QDPJIT_FIELD_ORDER 11 // QDP field ordering (complex-color-spin-spacetime) #define QUDA_QOP_DOMAIN_WALL_FIELD_ORDER 12 // QOP domain-wall ordering #define QUDA_PADDED_SPACE_SPIN_COLOR_FIELD_ORDER 13 // TIFR RHMC ordering -#define QUDA_OPENQCD_FIELD_ORDER 14 // OPENQCD geometry ordering (at the moment lexicographical w/ rotation zyxt = x3x2x1x0 |-> xyzt x0x1x2x3 ) +#define QUDA_OPENQCD_FIELD_ORDER 14 // openQCD ordering #define QUDA_INVALID_FIELD_ORDER QUDA_INVALID_ENUM #define QudaFieldCreate integer(4) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index 4f9df56f45..fba5e31dfb 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -25,11 +25,6 @@ #include #include -// TODO: The ipt functions can be incorporated here (so no reordering needed in OpenQXD side) -// OpenQxD helpers: -// #include "../../openQxD-devel/include/lattice.h" - - namespace quda { /** @@ -52,11 +47,11 @@ namespace quda { T &gauge; /** - @brief gauge_wrapper constructor - @param[in] gauge Gauge field accessor we are wrapping - @param[in] dim Dimension we are accessing - @param[in] x_cb Checkerboarded space-time index we are accessing - @param[in] parity Parity we are accessing + @brief gauge_wrapper constructor + @param[in] gauge Gauge field accessor we are wrapping + @param[in] dim Dimension we are accessing + @param[in] x_cb Checkerboarded space-time index we are accessing + @param[in] parity Parity we are accessing */ __device__ __host__ inline gauge_wrapper(T &gauge, int dim, int x_cb, int parity, Float phase = 1.0) : dim(dim), x_cb(x_cb), parity(parity), phase(phase), gauge(gauge) @@ -64,8 +59,8 @@ namespace quda { } /** - @brief 
Assignment operator with Matrix instance as input - @param[in] M Matrix we want to store in this accessor + @brief Assignment operator with Matrix instance as input + @param[in] M Matrix we want to store in this accessor */ template __device__ __host__ inline void operator=(const M &a) const { @@ -113,11 +108,11 @@ namespace quda { T &gauge; /** - @brief gauge_wrapper constructor - @param[in] gauge Gauge field accessor we are wrapping - @param[in] dim Dimension we are accessing - @param[in] ghost_idx Ghost index we are accessing - @param[in] parity Parity we are accessing + @brief gauge_wrapper constructor + @param[in] gauge Gauge field accessor we are wrapping + @param[in] dim Dimension we are accessing + @param[in] ghost_idx Ghost index we are accessing + @param[in] parity Parity we are accessing */ __device__ __host__ inline gauge_ghost_wrapper(T &gauge, int dim, int ghost_idx, int parity, Float phase = 1.0) : @@ -126,8 +121,8 @@ namespace quda { } /** - @brief Assignment operator with Matrix instance as input - @param[in] M Matrix we want to store in this accessot + @brief Assignment operator with Matrix instance as input + @param[in] M Matrix we want to store in this accessot */ template __device__ __host__ inline void operator=(const M &a) const { @@ -642,14 +637,14 @@ namespace quda { scale(static_cast(1.0)), scale_inv(static_cast(1.0)) { - resetScale(U.Scale()); + resetScale(U.Scale()); } void resetScale(Float max) { if (fixed) { scale = static_cast(std::numeric_limits::max()) / max; - scale_inv = max / static_cast(std::numeric_limits::max()); + scale_inv = max / static_cast(std::numeric_limits::max()); } } @@ -791,14 +786,14 @@ namespace quda { ghostAccessor(U, (void *)gauge_, (void **)ghost_) { if (U.Reconstruct() != QUDA_RECONSTRUCT_NO) errorQuda("GaugeField ordering not supported with reconstruction"); - } + } - void resetScale(double max) { - accessor.resetScale(max); - ghostAccessor.resetScale(max); - } + void resetScale(double max) { + 
accessor.resetScale(max); + ghostAccessor.resetScale(max); + } - static constexpr bool fixedPoint() { return fixed_point(); } + static constexpr bool fixedPoint() { return fixed_point(); } /** * accessor function @@ -880,7 +875,7 @@ namespace quda { } /** Returns the number of field colors */ - __device__ __host__ inline int Ncolor() const { return nColor; } + __device__ __host__ inline int Ncolor() const { return nColor; } /** Returns the field volume */ __device__ __host__ inline auto Volume() const { return 2 * volumeCB; } @@ -891,21 +886,21 @@ namespace quda { /** Returns the field geometric dimension */ __device__ __host__ inline int Ndim() const { return nDim; } - /** Returns the field geometry */ - __device__ __host__ inline int Geometry() const { return geometry; } + /** Returns the field geometry */ + __device__ __host__ inline int Geometry() const { return geometry; } - /** Returns the number of coarse gauge field spins */ - __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; } + /** Returns the number of coarse gauge field spins */ + __device__ __host__ inline int NspinCoarse() const { return nSpinCoarse; } - /** Returns the number of coarse gauge field colors */ - __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; } + /** Returns the number of coarse gauge field colors */ + __device__ __host__ inline int NcolorCoarse() const { return nColorCoarse; } - /** - * @brief Returns the L1 norm of the field in a given dimension - * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) - * @return L1 norm - */ - __host__ double norm1(int dim=-1, bool global=true) const { + /** + * @brief Returns the L1 norm of the field in a given dimension + * @param[in] dim Which dimension we are taking the norm of (dim=-1 mean all dimensions) + * @return L1 norm + */ + __host__ double norm1(int dim=-1, bool global=true) const { commGlobalReductionPush(global); double nrm1 = accessor.template 
transform_reduce>(location, dim, abs_(accessor.scale_inv)); @@ -1059,12 +1054,12 @@ namespace quda { __device__ __host__ inline Float milcStaggeredPhase(int dim, const int x[], const I R[]) { // could consider non-extended variant too? Float sign = static_cast(1.0); - switch (dim) { - case 0: if ( ((x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; - case 1: if ( ((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; - case 2: if ( ((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; - } - return sign; + switch (dim) { + case 0: if ( ((x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; + case 1: if ( ((x[0] - R[0] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; + case 2: if ( ((x[0] - R[0] + x[1] - R[1] + x[3] - R[3]) & 1) != 0) sign = -static_cast(1.0); break; + } + return sign; } /** @@ -1561,7 +1556,7 @@ namespace quda { errorQuda("This accessor does not support coarse-link fields (lacks support for bidirectional ghost zone"); // static_assert( !(stag_phase!=QUDA_STAGGERED_PHASE_NO && reconLenParam != 18 && reconLenParam != 12), - // "staggered phase only presently supported for 18 and 12 reconstruct"); + // "staggered phase only presently supported for 18 and 12 reconstruct"); for (int i = 0; i < 4; i++) { X[i] = u.X()[i]; R[i] = u.R()[i]; @@ -1601,11 +1596,11 @@ namespace quda { #pragma unroll for (int i=0; i(&vecTmp)[j], tmp[i*N+j]); - // second do vectorized copy into memory + for (int j=0; j(&vecTmp)[j], tmp[i*N+j]); + // second do vectorized copy into memory vector_store(gauge, parity * offset + x + (dir * M + i) * stride, vecTmp); } if constexpr (hasPhase) { @@ -1615,14 +1610,14 @@ namespace quda { } /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. 
- @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. */ __device__ __host__ inline auto operator()(int dim, int x_cb, int parity, real phase = 1.0) const { @@ -1640,7 +1635,7 @@ namespace quda { #pragma unroll for (int i=0; i( ghost[dir] + parity * faceVolumeCB[dir] * (M * N + hasPhase), i * faceVolumeCB[dir] + x); // second do copy converting into register type @@ -1673,12 +1668,12 @@ namespace quda { #pragma unroll for (int i=0; i(&vecTmp)[j], tmp[i*N+j]); - // second do vectorized copy into memory - vector_store(ghost[dir]+parity*faceVolumeCB[dir]*(M*N + hasPhase), i*faceVolumeCB[dir]+x, vecTmp); + for (int j=0; j(&vecTmp)[j], tmp[i*N+j]); + // second do vectorized copy into memory + vector_store(ghost[dir]+parity*faceVolumeCB[dir]*(M*N + hasPhase), i*faceVolumeCB[dir]+x, vecTmp); } if constexpr (hasPhase) { @@ -1728,14 +1723,14 @@ namespace quda { real tmp[reconLen]; #pragma unroll - for (int i=0; i(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase), - +i*R[dim]*faceVolumeCB[dim]+buff_idx); - // second do copy converting into register type + for (int i=0; i(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase), + +i*R[dim]*faceVolumeCB[dim]+buff_idx); + // second do copy converting into register type #pragma unroll - for (int j=0; j(&vecTmp)[j]); - } 
+ for (int j=0; j(&vecTmp)[j]); + } real phase = 0.; if constexpr (hasPhase) copy(phase, @@ -1754,15 +1749,15 @@ namespace quda { reconstruct.Pack(tmp, v); #pragma unroll - for (int i=0; i(&vecTmp)[j], tmp[i*N+j]); - // second do vectorized copy to memory - vector_store(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase), - i*R[dim]*faceVolumeCB[dim]+buff_idx, vecTmp); - } + for (int j=0; j(&vecTmp)[j], tmp[i*N+j]); + // second do vectorized copy to memory + vector_store(ghost[dim] + ((dir*2+parity)*geometry+g)*R[dim]*faceVolumeCB[dim]*(M*N + hasPhase), + i*R[dim]*faceVolumeCB[dim]+buff_idx, vecTmp); + } if constexpr (hasPhase) { real phase = reconstruct.getPhase(v); copy(ghost[dim][((dir * 2 + parity) * geometry + g) * R[dim] * faceVolumeCB[dim] * (M * N + 1) @@ -1880,14 +1875,14 @@ namespace quda { } /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. 
*/ __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const { @@ -1930,14 +1925,14 @@ namespace quda { } /** - @brief This accessor routine returns a gauge_wrapper to this object, - allowing us to overload various operators for manipulating at - the site level interms of matrix operations. - @param[in] dir Which dimension are we requesting - @param[in] x_cb Checkerboarded space-time index we are requesting - @param[in] parity Parity we are requesting - @return Instance of a gauge_wrapper that curries in access to - this field at the above coordinates. + @brief This accessor routine returns a gauge_wrapper to this object, + allowing us to overload various operators for manipulating at + the site level interms of matrix operations. + @param[in] dir Which dimension are we requesting + @param[in] x_cb Checkerboarded space-time index we are requesting + @param[in] parity Parity we are requesting + @return Instance of a gauge_wrapper that curries in access to + this field at the above coordinates. */ __device__ __host__ inline auto operator()(int dim, int x_cb, int parity) const { @@ -2154,7 +2149,7 @@ namespace quda { if constexpr (length != 18) errorQuda("Gauge length %d not supported", length); // compute volumeCB + halo region exVolumeCB = u.X()[0]/2 + 2; - for (int i=1; i<4; i++) exVolumeCB *= u.X()[i] + 2; + for (int i=1; i<4; i++) exVolumeCB *= u.X()[i] + 2; } // we need to transpose for BQCD ordering @@ -2292,21 +2287,21 @@ namespace quda { // exVolumeCB is the padded checkboard volume for (int i=0; i<4; i++) exVolumeCB *= exDim[i]; - exVolumeCB /= 2; + exVolumeCB /= 2; } /** - @brief Compute the index into the padded field. Assumes that - parity doesn't change from unpadded to padded. + @brief Compute the index into the padded field. Assumes that + parity doesn't change from unpadded to padded. 
*/ __device__ __host__ inline int getPaddedIndex(int x_cb, int parity) const { - // find coordinates - int coord[4]; - getCoords(coord, x_cb, dim, parity); + // find coordinates + int coord[4]; + getCoords(coord, x_cb, dim, parity); - // get z-extended index - coord[2] += 2; // offset for halo - return linkIndex(coord, exDim); + // get z-extended index + coord[2] += 2; // offset for halo + return linkIndex(coord, exDim); } // we need to transpose for TIFR ordering @@ -2680,7 +2675,6 @@ namespace quda { }; // class OpenQCDOrder } // namespace gauge - template __device__ __host__ inline auto diff --git a/include/index_helper.cuh b/include/index_helper.cuh index 8381a5b194..688c997043 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -2,10 +2,6 @@ #include -// TODO: The ipt functions can be incorporated here (so no reordering needed in OpenQXD side) -// OpenQxD helpers: -// #include "../../openQxD-devel/include/lattice.h" - namespace quda { /** Compute the checkerboard 1-d index from the 4-d coordinate x[] + dx[] @@ -1112,23 +1108,9 @@ namespace quda { } // namespace quda -// namespace OpenQxD_Helpers { -// /** -// Compute the 4-d spatial index from the checkerboarded 1-d index -// at parity parity. Wrapper around getCoordsCB. - -// @param[out] x Computed spatial index -// @param[in] cb_index 1-d checkerboarded index -// @param[in] X Full lattice dimensions -// @param[in] X0h Half of x-dim lattice dimension -// @param[in] parity Site parity -// @return Full linear lattice index -// */ -// template __device__ __host__ inline int getCoords(Coord &x, int cb_index, const I &X, int parity) +// namespace openqcd { +// __device__ __host__ inline int ipt(...) 
// { -// return getCoordsCB(x, cb_index, X, X[0] >> 1, parity); +// return ...; // } - - // } - diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index 0d98b1a917..c3b234e42c 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -51,20 +51,10 @@ namespace quda { __device__ __host__ inline void operator()(complex out[Ns*Nc], const complex in[Ns*Nc]) const { int s1[4] = {1, 2, 3, 0}; int s2[4] = {3, 0, 1, 2}; - /* K1 = [1, -1, -1, -1] / sqrt(2) */ FloatOut K1[4] = {static_cast(kP), static_cast(-kP), static_cast(-kP), static_cast(-kP)}; - /* K2 = [1, -1, 1, 1] / sqrt(2) */ FloatOut K2[4] = {static_cast(kP), static_cast(-kP), static_cast(kP), static_cast(kP)}; for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); } } @@ -99,14 +89,6 @@ namespace quda { FloatOut K2[4] = {static_cast(kP), static_cast(kP), static_cast(kP), static_cast(kP)}; for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); } } @@ -190,8 +172,6 @@ namespace quda { } }; - - template struct CopyColorSpinor_ { const Arg &arg; constexpr CopyColorSpinor_(const Arg &arg): arg(arg) {} diff --git a/include/quda.h b/include/quda.h index fa31a25c61..dcee3e3189 100644 --- a/include/quda.h +++ b/include/quda.h @@ -223,8 +223,8 @@ extern "C" { int compute_action; /** Computed value of the bilinear action (complex-valued) - invert: \phi^\dagger A^{-1} \phi - multishift: \phi^\dagger r(x) \phi = \phi^\dagger (sum_k residue[k] * (A + offset[k])^{-1} ) \phi */ + invert: \phi^\dagger A^{-1} \phi + multishift: \phi^\dagger r(x) \phi = \phi^\dagger (sum_k residue[k] * (A + offset[k])^{-1} ) \phi */ double action[2]; QudaSolutionType solution_type; /**< Type of system to solve */ @@ -728,7 +728,7 @@ extern "C" { int smoother_schwarz_cycle[QUDA_MAX_MG_LEVEL]; /** The type of residual to send to the next coarse grid, and thus the - type of solution to receive back from this coarse grid */ + type of 
solution to receive back from this coarse grid */ QudaSolutionType coarse_grid_solution_type[QUDA_MAX_MG_LEVEL]; /** The type of smoother solve to do on each grid (e/o preconditioning or not)*/ @@ -1279,16 +1279,6 @@ extern "C" { */ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); - /** - * Apply the Dslash operator (D_{eo} or D_{oe}). - * @param h_out Result spinor field - * @param h_in Input spinor field - * @param param Contains all metadata regarding host and device - * storage - * @param parity The destination parity of the field - */ - void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity); - /** * @brief Perform the solve like @dslashQuda but for multiple rhs by spliting the comm grid into * sub-partitions: each sub-partition does one or more rhs'. @@ -1538,8 +1528,8 @@ extern "C" { * @param inv_param Dirac and solver meta data */ void computeCloverForceQuda(void *mom, double dt, void **x, void **p, double *coeff, double kappa2, double ck, - int nvector, double multiplicity, void *gauge, - QudaGaugeParam *gauge_param, QudaInvertParam *inv_param); + int nvector, double multiplicity, void *gauge, + QudaGaugeParam *gauge_param, QudaInvertParam *inv_param); /** * Compute the naive staggered force. All fields must be in the same precision. 
diff --git a/lib/color_spinor_field.cpp b/lib/color_spinor_field.cpp index 4300d6eab5..13d93f16bc 100644 --- a/lib/color_spinor_field.cpp +++ b/lib/color_spinor_field.cpp @@ -454,7 +454,6 @@ namespace quda if (reorder_location() == QUDA_CPU_FIELD_LOCATION) { // reorder on the host void *buffer = pool_pinned_malloc(bytes); - // Roman Gruber: this is a bug: v should be src.v, else reordering on CPU doesn't work qudaMemcpy(buffer, src.v.data(), bytes, qudaMemcpyDefault); copyGenericColorSpinor(*this, src, QUDA_CPU_FIELD_LOCATION, 0, buffer); pool_pinned_free(buffer); diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index 9d7939f62b..2b39110f52 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -102,13 +102,12 @@ namespace quda Topology *topo = new Topology; - - #ifdef BUILD_OPENQCD_INTERFACE +#ifdef BUILD_OPENQCD_INTERFACE int *data = static_cast(map_data); topo->cstar = data[0]; - #else +#else topo->cstar = 0; - #endif +#endif topo->ndim = ndim; diff --git a/lib/copy_gauge_extended.cu b/lib/copy_gauge_extended.cu index fde877951f..dede657c76 100644 --- a/lib/copy_gauge_extended.cu +++ b/lib/copy_gauge_extended.cu @@ -122,6 +122,7 @@ namespace quda { } else { errorQuda("Gauge field %d order not supported", out.Order()); } + } template @@ -203,6 +204,7 @@ namespace quda { } else { errorQuda("Gauge field %d order not supported", in.Order()); } + } template diff --git a/lib/copy_gauge_inc.cu b/lib/copy_gauge_inc.cu index d5c7d42c4e..97af1e6ce2 100644 --- a/lib/copy_gauge_inc.cu +++ b/lib/copy_gauge_inc.cu @@ -143,6 +143,7 @@ namespace quda { } else { errorQuda("Gauge field %d order not supported", out.Order()); } + } template diff --git a/lib/cpu_gauge_field.cpp b/lib/cpu_gauge_field.cpp deleted file mode 100644 index 5a2ee4d15f..0000000000 --- a/lib/cpu_gauge_field.cpp +++ /dev/null @@ -1,433 +0,0 @@ -#include -#include -#include -#include -#include -#include - -namespace quda { - - cpuGaugeField::cpuGaugeField(const GaugeFieldParam ¶m) : - 
GaugeField(param) - { - if (precision == QUDA_HALF_PRECISION) { - errorQuda("CPU fields do not support half precision"); - } - if (precision == QUDA_QUARTER_PRECISION) { - errorQuda("CPU fields do not support quarter precision"); - } - if (pad != 0) { - errorQuda("CPU fields do not support non-zero padding"); - } - if (reconstruct != QUDA_RECONSTRUCT_NO && reconstruct != QUDA_RECONSTRUCT_10) { - errorQuda("Reconstruction type %d not supported", reconstruct); - } - if (reconstruct == QUDA_RECONSTRUCT_10 && link_type != QUDA_ASQTAD_MOM_LINKS) { - errorQuda("10-reconstruction only supported with momentum links"); - } - - int siteDim=0; - if (geometry == QUDA_SCALAR_GEOMETRY) siteDim = 1; - else if (geometry == QUDA_VECTOR_GEOMETRY) siteDim = nDim; - else if (geometry == QUDA_TENSOR_GEOMETRY) siteDim = nDim * (nDim-1) / 2; - else if (geometry == QUDA_COARSE_GEOMETRY) siteDim = 2*nDim; - else if (geometry == QUDA_KDINVERSE_GEOMETRY) - siteDim = 1 << nDim; - else errorQuda("Unknown geometry type %d", geometry); - - // compute the correct bytes size for these padded field orders - if (order == QUDA_TIFR_PADDED_GAUGE_ORDER) { - bytes = siteDim * (x[0]*x[1]*(x[2]+4)*x[3]) * nInternal * precision; - } else if (order == QUDA_BQCD_GAUGE_ORDER) { - bytes = siteDim * (x[0]+4)*(x[1]+2)*(x[2]+2)*(x[3]+2) * nInternal * precision; - } else if (order == QUDA_MILC_SITE_GAUGE_ORDER) { - bytes = volume * site_size; - } - - if (order == QUDA_QDP_GAUGE_ORDER) { - gauge = (void**) safe_malloc(siteDim * sizeof(void*)); - - for (int d=0; dabs_max(); - } - - - cpuGaugeField::~cpuGaugeField() - { - int siteDim = 0; - if (geometry == QUDA_SCALAR_GEOMETRY) siteDim = 1; - else if (geometry == QUDA_VECTOR_GEOMETRY) siteDim = nDim; - else if (geometry == QUDA_TENSOR_GEOMETRY) siteDim = nDim * (nDim-1) / 2; - else if (geometry == QUDA_COARSE_GEOMETRY) siteDim = 2*nDim; - else if (geometry == QUDA_KDINVERSE_GEOMETRY) - siteDim = 1 << nDim; - else errorQuda("Unknown geometry type %d", geometry); - - 
if (create == QUDA_NULL_FIELD_CREATE || create == QUDA_ZERO_FIELD_CREATE) { - if (order == QUDA_QDP_GAUGE_ORDER) { - for (int d=0; d(recv[d])+bytes[d], send[d], bytes[d]); - memcpy(recv[d], static_cast(send[d])+bytes[d], bytes[d]); - } - - // inject back into the gauge field - extractExtendedGaugeGhost(*this, d, R, recv, false); - } - - for (int d=0; d(src).Gauge_p(), src.Bytes(), qudaMemcpyDeviceToHost); - - copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, gauge, buffer); - pool_pinned_free(buffer); - - } else { // else on the GPU - - void *buffer = create_gauge_buffer(bytes, order, geometry); - size_t ghost_bytes[8]; - int dstNinternal = reconstruct != QUDA_RECONSTRUCT_NO ? reconstruct : 2*nColor*nColor; - for (int d=0; d 0) ? create_ghost_buffer(ghost_bytes, order, geometry) : nullptr; - - if (ghostExchange != QUDA_GHOST_EXCHANGE_EXTENDED) { - copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0, ghost_buffer, 0); - if (geometry == QUDA_COARSE_GEOMETRY) copyGenericGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0, ghost_buffer, 0, 3); // forwards links if bi-directional - } else { - copyExtendedGauge(*this, src, QUDA_CUDA_FIELD_LOCATION, buffer, 0); - } - - if (order == QUDA_QDP_GAUGE_ORDER) { - for (int d=0; d 4 && ghostExchange == QUDA_GHOST_EXCHANGE_PAD && src.GhostExchange() == QUDA_GHOST_EXCHANGE_PAD && nFace) - for (int d=0; d 0) free_ghost_buffer(ghost_buffer, order, geometry); - } - - } else if (typeid(src) == typeid(cpuGaugeField)) { - // copy field and ghost zone directly - copyGenericGauge(*this, src, QUDA_CPU_FIELD_LOCATION, gauge, - const_cast(static_cast(src).Gauge_p())); - } else { - errorQuda("Invalid gauge field type"); - } - - // if we have copied from a source without a pad then we need to exchange - if (ghostExchange == QUDA_GHOST_EXCHANGE_PAD && - src.GhostExchange() != QUDA_GHOST_EXCHANGE_PAD) { - exchangeGhost(geometry == QUDA_VECTOR_GEOMETRY ? 
QUDA_LINK_BACKWARDS : QUDA_LINK_BIDIRECTIONAL); - } - } - - void cpuGaugeField::setGauge(void **gauge_) - { - if(create != QUDA_REFERENCE_FIELD_CREATE) { - errorQuda("Setting gauge pointer is only allowed when create=" - "QUDA_REFERENCE_FIELD_CREATE type\n"); - } - gauge = gauge_; - } - - void cpuGaugeField::backup() const { - if (backed_up) errorQuda("Gauge field already backed up"); - - if (order == QUDA_QDP_GAUGE_ORDER) { - char **buffer = new char*[geometry]; - for (int d=0; d(buffer); - } else { - backup_h = new char[bytes]; - memcpy(backup_h, gauge, bytes); - } - - backed_up = true; - } - - void cpuGaugeField::restore() const - { - if (!backed_up) errorQuda("Cannot restore since not backed up"); - - if (order == QUDA_QDP_GAUGE_ORDER) { - char **buffer = reinterpret_cast(backup_h); - for (int d=0; d(Gauge_p()); - int dbytes = Bytes() / 4; - static_assert(sizeof(char) == 1, "Assuming sizeof(char) == 1"); - char *dst_buffer = reinterpret_cast(buffer); - for (int d = 0; d < 4; d++) { std::memcpy(&dst_buffer[d * dbytes], p[d], dbytes); } - } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == QUDA_MILC_GAUGE_ORDER - || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER - || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER - || Order() == QUDA_OPENQCD_GAUGE_ORDER) { - const void *p = Gauge_p(); - int bytes = Bytes(); - std::memcpy(buffer, p, bytes); - } else { - errorQuda("Unsupported order = %d\n", Order()); - } - } - - void cpuGaugeField::copy_from_buffer(void *buffer) - { - - if (Order() == QUDA_QDP_GAUGE_ORDER || Order() == QUDA_QDPJIT_GAUGE_ORDER) { - void **p = static_cast(Gauge_p()); - size_t dbytes = Bytes() / 4; - static_assert(sizeof(char) == 1, "Assuming sizeof(char) == 1"); - const char *dst_buffer = reinterpret_cast(buffer); - for (int d = 0; d < 4; d++) { std::memcpy(p[d], &dst_buffer[d * dbytes], dbytes); } - } else if (Order() == QUDA_CPS_WILSON_GAUGE_ORDER || Order() == 
QUDA_MILC_GAUGE_ORDER - || Order() == QUDA_MILC_SITE_GAUGE_ORDER || Order() == QUDA_BQCD_GAUGE_ORDER - || Order() == QUDA_TIFR_GAUGE_ORDER || Order() == QUDA_TIFR_PADDED_GAUGE_ORDER - || Order() == QUDA_OPENQCD_GAUGE_ORDER) { - void *p = Gauge_p(); - size_t bytes = Bytes(); - std::memcpy(p, buffer, bytes); - } else { - errorQuda("Unsupported order = %d\n", Order()); - } - } - -} // namespace quda diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index e40f7fc164..7117677300 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -564,7 +564,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) if (gauge_param.order <= 4) gauge_param.ghostExchange = QUDA_GHOST_EXCHANGE_NO; GaugeField *in = GaugeField::Create(gauge_param); - if (in->Order() == QUDA_BQCD_GAUGE_ORDER) { + if (in->Order() == QUDA_BQCD_GAUGE_ORDER) { static size_t checksum = SIZE_MAX; size_t in_checksum = in->checksum(true); if (in_checksum == checksum) { @@ -715,7 +715,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) if(param->overlap){ if(gaugeFatExtended) errorQuda("Extended gauge fat field already allocated"); - gaugeFatExtended = extended; + gaugeFatExtended = extended; } break; case QUDA_ASQTAD_LONG_LINKS: @@ -727,7 +727,7 @@ void loadGaugeQuda(void *h_gauge, QudaGaugeParam *param) if(param->overlap){ if(gaugeLongExtended) errorQuda("Extended gauge long field already allocated"); - gaugeLongExtended = extended; + gaugeLongExtended = extended; } break; default: @@ -1459,7 +1459,7 @@ namespace quda { break; case QUDA_MOBIUS_DWF_DSLASH: if (inv_param->Ls > QUDA_MAX_DWF_LS) - errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS); + errorQuda("Length of Ls dimension %d greater than QUDA_MAX_DWF_LS %d", inv_param->Ls, QUDA_MAX_DWF_LS); diracParam.type = pc ? 
QUDA_MOBIUS_DOMAIN_WALLPC_DIRAC : QUDA_MOBIUS_DOMAIN_WALL_DIRAC; diracParam.Ls = inv_param->Ls; if (sizeof(Complex) != sizeof(double _Complex)) { @@ -1483,21 +1483,21 @@ namespace quda { case QUDA_TWISTED_MASS_DSLASH: diracParam.type = pc ? QUDA_TWISTED_MASSPC_DIRAC : QUDA_TWISTED_MASS_DIRAC; if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) { - diracParam.Ls = 1; - diracParam.epsilon = 0.0; + diracParam.Ls = 1; + diracParam.epsilon = 0.0; } else { - diracParam.Ls = 2; - diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; + diracParam.Ls = 2; + diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; } break; case QUDA_TWISTED_CLOVER_DSLASH: diracParam.type = pc ? QUDA_TWISTED_CLOVERPC_DIRAC : QUDA_TWISTED_CLOVER_DIRAC; if (inv_param->twist_flavor == QUDA_TWIST_SINGLET) { - diracParam.Ls = 1; - diracParam.epsilon = 0.0; + diracParam.Ls = 1; + diracParam.epsilon = 0.0; } else { - diracParam.Ls = 2; - diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? inv_param->epsilon : 0.0; + diracParam.Ls = 2; + diracParam.epsilon = inv_param->twist_flavor == QUDA_TWIST_NONDEG_DOUBLET ? 
inv_param->epsilon : 0.0; } break; case QUDA_LAPLACE_DSLASH: @@ -1726,7 +1726,7 @@ namespace quda { case QUDA_MAT_SOLUTION: if (param.mass_normalization == QUDA_MASS_NORMALIZATION || param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(2.0*kappa, b); + blas::ax(2.0*kappa, b); if (for_multishift) for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa; } @@ -1734,29 +1734,29 @@ namespace quda { case QUDA_MATDAG_MAT_SOLUTION: if (param.mass_normalization == QUDA_MASS_NORMALIZATION || param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(4.0*kappa*kappa, b); + blas::ax(4.0*kappa*kappa, b); if (for_multishift) for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; } break; case QUDA_MATPC_SOLUTION: if (param.mass_normalization == QUDA_MASS_NORMALIZATION) { - blas::ax(4.0*kappa*kappa, b); + blas::ax(4.0*kappa*kappa, b); if (for_multishift) for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(2.0*kappa, b); + blas::ax(2.0*kappa, b); if (for_multishift) for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 2.0 * kappa; } break; case QUDA_MATPCDAG_MATPC_SOLUTION: if (param.mass_normalization == QUDA_MASS_NORMALIZATION) { - blas::ax(16.0*std::pow(kappa,4), b); + blas::ax(16.0*std::pow(kappa,4), b); if (for_multishift) for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 16.0 * std::pow(kappa, 4); } else if (param.mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { - blas::ax(4.0*kappa*kappa, b); + blas::ax(4.0*kappa*kappa, b); if (for_multishift) for (int i = 0; i < param.num_offset; i++) param.offset[i] *= 4.0 * kappa * kappa; } @@ -1842,179 +1842,6 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity popVerbosity(); } -#if 0 // FIXME: -void dslashQudaNoLoads(void *h_out, void *h_in, QudaInvertParam *inv_param, 
QudaParity parity) -{ - profileDslash.TPSTART(QUDA_PROFILE_TOTAL); - profileDslash.TPSTART(QUDA_PROFILE_INIT); - - const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? *gaugePrecise : *gaugeFatPrecise; - - if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH) - || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) - errorQuda("Gauge field not allocated"); - if (cloverPrecise == nullptr && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) - errorQuda("Clover field not allocated"); - - pushVerbosity(inv_param->verbosity); - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); - - ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location); - ColorSpinorField in_h(cpuParam); - ColorSpinorParam cudaParam(cpuParam, *inv_param, QUDA_CUDA_FIELD_LOCATION); - - cpuParam.v = h_out; - cpuParam.location = inv_param->output_location; - ColorSpinorField out_h(cpuParam); - - ColorSpinorField in(cudaParam); - ColorSpinorField out(cudaParam); - - bool pc = true; - DiracParam diracParam; - setDiracParam(diracParam, inv_param, pc); - - profileDslash.TPSTOP(QUDA_PROFILE_INIT); - - profileDslash.TPSTART(QUDA_PROFILE_H2D); - in = in_h; - profileDslash.TPSTOP(QUDA_PROFILE_H2D); - - profileDslash.TPSTART(QUDA_PROFILE_COMPUTE); - - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); - - if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION && - (inv_param->dslash_type == QUDA_STAGGERED_DSLASH || - inv_param->dslash_type == QUDA_ASQTAD_DSLASH) ) - blas::ax(1.0/(2.0*inv_param->mass), in); - - if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { - if (parity == QUDA_EVEN_PARITY) { - parity = QUDA_ODD_PARITY; - } else { - parity = QUDA_EVEN_PARITY; - } - blas::ax(gauge.Anisotropy(), in); - } - - Dirac *dirac = Dirac::create(diracParam); 
// create the Dirac operator - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) { - cudaParam.create = QUDA_NULL_FIELD_CREATE; - ColorSpinorField tmp1(cudaParam); - ((DiracTwistedCloverPC*) dirac)->TwistCloverInv(tmp1, in, (parity+1)%2); // apply the clover-twist - dirac->Dslash(out, tmp1, parity); // apply the operator - } else if (inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH - || inv_param->dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) { - dirac->Dslash4(out, in, parity); - } else { - dirac->Dslash(out, in, parity); // apply the operator - } - profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE); - - profileDslash.TPSTART(QUDA_PROFILE_D2H); - out_h = out; - profileDslash.TPSTOP(QUDA_PROFILE_D2H); - - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); - - profileDslash.TPSTART(QUDA_PROFILE_FREE); - delete dirac; // clean up - - profileDslash.TPSTOP(QUDA_PROFILE_FREE); - - popVerbosity(); - profileDslash.TPSTOP(QUDA_PROFILE_TOTAL); -} -#endif - -// #if 0 -void dslashQudaTest(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity parity) -{ - profileDslash.TPSTART(QUDA_PROFILE_TOTAL); - profileDslash.TPSTART(QUDA_PROFILE_INIT); - - const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? 
*gaugePrecise : *gaugeFatPrecise; - - if ((!gaugePrecise && inv_param->dslash_type != QUDA_ASQTAD_DSLASH) - || ((!gaugeFatPrecise || !gaugeLongPrecise) && inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) - errorQuda("Gauge field not allocated"); - if (cloverPrecise == nullptr - && ((inv_param->dslash_type == QUDA_CLOVER_WILSON_DSLASH) || (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH))) - errorQuda("Clover field not allocated"); - - pushVerbosity(inv_param->verbosity); - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printQudaInvertParam(inv_param); - - ColorSpinorParam cpuParam(h_in, *inv_param, gauge.X(), true, inv_param->input_location); - ColorSpinorField in_h(cpuParam); - ColorSpinorParam cudaParam(cpuParam, *inv_param, QUDA_CUDA_FIELD_LOCATION); - - cpuParam.v = h_out; - cpuParam.location = inv_param->output_location; - ColorSpinorField out_h(cpuParam); - - ColorSpinorField in(cudaParam); - ColorSpinorField out(cudaParam); - - bool pc = true; - DiracParam diracParam; - setDiracParam(diracParam, inv_param, pc); - - profileDslash.TPSTOP(QUDA_PROFILE_INIT); - - profileDslash.TPSTART(QUDA_PROFILE_H2D); - in = in_h; - profileDslash.TPSTOP(QUDA_PROFILE_H2D); - - profileDslash.TPSTART(QUDA_PROFILE_COMPUTE); - - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); - - if (inv_param->mass_normalization == QUDA_KAPPA_NORMALIZATION - && (inv_param->dslash_type == QUDA_STAGGERED_DSLASH || inv_param->dslash_type == QUDA_ASQTAD_DSLASH)) - blas::ax(1.0 / (2.0 * inv_param->mass), in); - - if (inv_param->dirac_order == QUDA_CPS_WILSON_DIRAC_ORDER) { - if (parity == QUDA_EVEN_PARITY) { - parity = QUDA_ODD_PARITY; - } else { - parity = QUDA_EVEN_PARITY; - } - blas::ax(gauge.Anisotropy(), in); - } - - Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator - if (inv_param->dslash_type == QUDA_TWISTED_CLOVER_DSLASH && inv_param->dagger) { - // cudaParam.create = QUDA_NULL_FIELD_CREATE; - // 
ColorSpinorField tmp1(cudaParam); - // ((DiracTwistedCloverPC *)dirac)->TwistCloverInv(tmp1, in, (parity + 1) % 2); // // DO NOT APPLY the clover-twist - // dirac->Dslash(out, tmp1, parity); // DO NOT APPLY OPERATOR - } else if (inv_param->dslash_type == QUDA_DOMAIN_WALL_4D_DSLASH || inv_param->dslash_type == QUDA_MOBIUS_DWF_DSLASH - || inv_param->dslash_type == QUDA_MOBIUS_DWF_EOFA_DSLASH) { - // dirac->Dslash4(out, in, parity); // DO NOT APPLY OPERATOR - } else { - // dirac->Dslash(out, in, parity); // DO NOT APPLY OPERATOR - } - profileDslash.TPSTOP(QUDA_PROFILE_COMPUTE); - - profileDslash.TPSTART(QUDA_PROFILE_D2H); - out_h = out; - profileDslash.TPSTOP(QUDA_PROFILE_D2H); - - if (getVerbosity() >= QUDA_DEBUG_VERBOSE) printfQuda("Out CPU %e CUDA %e\n", blas::norm2(out_h), blas::norm2(out)); - - profileDslash.TPSTART(QUDA_PROFILE_FREE); - delete dirac; // clean up - - profileDslash.TPSTOP(QUDA_PROFILE_FREE); - - popVerbosity(); - profileDslash.TPSTOP(QUDA_PROFILE_TOTAL); -} -// #endif - - void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) { pushVerbosity(inv_param->verbosity); @@ -2295,7 +2122,7 @@ void cloverQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity DiracParam diracParam; setDiracParam(diracParam, inv_param, pc); - //FIXME: Do we need this for twisted clover??? + //FIXME: Do we need this for twisted clover??? DiracCloverPC dirac(diracParam); // create the Dirac operator if (!inverse) dirac.Clover(out, in, parity); // apply the clover operator else dirac.CloverInv(out, in, parity); @@ -2528,7 +2355,7 @@ multigrid_solver::multigrid_solver(QudaMultigridParam &mg_param, TimeProfile &pr // this is the Dirac operator we use for sloppy smoothing (we use the preconditioner fields for this) DiracParam diracSmoothSloppyParam; setDiracPreParam(diracSmoothSloppyParam, param, fine_grid_pc_solve, - mg_param.smoother_schwarz_type[0] == QUDA_INVALID_SCHWARZ ? true : false); + mg_param.smoother_schwarz_type[0] == QUDA_INVALID_SCHWARZ ? 
true : false); diracSmoothSloppyParam.halo_precision = mg_param.smoother_halo_precision[0]; dSmoothSloppy = Dirac::create(diracSmoothSloppyParam); @@ -3715,16 +3542,16 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) for(int i=param->num_offset-1; i >= 0; i--) { #endif double rsd_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? - param->true_res_hq_offset[i] : 0; + param->true_res_hq_offset[i] : 0; double tol_hq = param->residual_type & QUDA_HEAVY_QUARK_RESIDUAL ? - param->tol_hq_offset[i] : 0; + param->tol_hq_offset[i] : 0; /* - In the case where the shifted systems have zero tolerance - specified, we refine these systems until either the limit of - precision is reached (prec_tol) or until the tolerance reaches - the iterated residual tolerance of the previous multi-shift - solver (iter_res_offset[i]), which ever is greater. + In the case where the shifted systems have zero tolerance + specified, we refine these systems until either the limit of + precision is reached (prec_tol) or until the tolerance reaches + the iterated residual tolerance of the previous multi-shift + solver (iter_res_offset[i]), which ever is greater. */ const double prec_tol = std::pow(10.,(-2*(int)param->cuda_prec+4)); // implicit refinment limit of 1e-12 const double iter_tol = (param->iter_res_offset[i] < prec_tol ? 
prec_tol : (param->iter_res_offset[i] *1.1)); @@ -3761,9 +3588,9 @@ void invertMultiShiftQuda(void **hp_x, void *hp_b, QudaInvertParam *param) if (false) { // experimenting with Minimum residual extrapolation // only perform MRE using current and previously refined solutions #ifdef REFINE_INCREASING_MASS - const int nRefine = i+1; + const int nRefine = i+1; #else - const int nRefine = param->num_offset - i + 1; + const int nRefine = param->num_offset - i + 1; #endif cudaParam.create = QUDA_NULL_FIELD_CREATE; @@ -3981,7 +3808,7 @@ void computeTwoLinkQuda(void *twolink, void *inlink, QudaGaugeParam *param) } int computeGaugeForceQuda(void* mom, void* siteLink, int*** input_path_buf, int* path_length, - double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam) + double* loop_coeff, int num_paths, int max_length, double eb3, QudaGaugeParam* qudaGaugeParam) { auto profile = pushProfile(profileGaugeForce); checkGaugeParam(qudaGaugeParam); @@ -4304,7 +4131,7 @@ void computeStaggeredForceQuda(void *h_mom, double dt, double delta, void *, voi if (inv_param->use_resident_solution) { if (solutionResident.size() < (unsigned int)nvector) errorQuda("solutionResident.size() %lu does not match number of shifts %d", - solutionResident.size(), nvector); + solutionResident.size(), nvector); } // create the staggered operator @@ -4729,7 +4556,7 @@ void computeCloverForceQuda(void *h_mom, double dt, void **h_x, void **, double if (inv_param->use_resident_solution) { if (solutionResident.size() < (unsigned int)nvector) errorQuda("solutionResident.size() %lu does not match number of shifts %d", - solutionResident.size(), nvector); + solutionResident.size(), nvector); } GaugeField &gaugeEx = *extendedGaugeResident; From ef57751c2172e244a8ea23c397d40c305b71527b Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 6 Nov 2023 14:09:50 +0100 Subject: [PATCH 109/148] fixed gitignore --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/.gitignore b/.gitignore index a4b290fbb2..58f2516546 100644 --- a/.gitignore +++ b/.gitignore @@ -19,4 +19,4 @@ include/quda_define.h include/jitify_options.hpp .tags* autom4te.cache/* -.vscode \ No newline at end of file +.vscode From 00209934383f8e398ba72d50fcc617e20bc0d83d Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 6 Nov 2023 14:13:07 +0100 Subject: [PATCH 110/148] removed comment --- include/kernels/dslash_gamma_helper.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/kernels/dslash_gamma_helper.cuh b/include/kernels/dslash_gamma_helper.cuh index 83e89f0ef5..5261ea5b32 100644 --- a/include/kernels/dslash_gamma_helper.cuh +++ b/include/kernels/dslash_gamma_helper.cuh @@ -77,8 +77,6 @@ namespace quda { __device__ __host__ void operator()(int x_cb, int parity) { ColorSpinor in = arg.in(x_cb, parity); - - /* RG: I had to add the break, else there is an implicit fallthrough */ switch(arg.d) { case 0: arg.out(x_cb, parity) = in.gamma(0); break; case 1: arg.out(x_cb, parity) = in.gamma(1); break; From e622ce87312f9aee0cc843d2ec96065334f09dc3 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 6 Nov 2023 15:22:16 +0100 Subject: [PATCH 111/148] cleaned up openqcd interface --- include/dslash_quda.h | 2 +- include/quda_openqcd_interface.h | 57 ++--------- lib/dslash_gamma_helper.cu | 2 +- lib/openqcd_interface.cpp | 171 ++----------------------------- 4 files changed, 21 insertions(+), 211 deletions(-) diff --git a/include/dslash_quda.h b/include/dslash_quda.h index 1f17d8d2d3..805cd5e016 100644 --- a/include/dslash_quda.h +++ b/include/dslash_quda.h @@ -808,7 +808,7 @@ namespace quda */ void gamma5(ColorSpinorField &out, const ColorSpinorField &in); - /* RG: I have written these */ + /* RG: I have added these */ void gamma0(ColorSpinorField &out, const ColorSpinorField &in); void gamma1(ColorSpinorField &out, const ColorSpinorField &in); void gamma2(ColorSpinorField &out, const ColorSpinorField &in); diff --git 
a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 8f476b6cca..145153fafb 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -92,10 +92,7 @@ typedef struct { void *gauge; /** base pointer to the gauge fields */ int volume; /** VOLUME */ int bndry; /** BNDRY */ - void (*reorder_gauge_openqcd_to_quda)(void *in, void *out); void (*reorder_gauge_quda_to_openqcd)(void *in, void *out); - void (*reorder_spinor_openqcd_to_quda)(void *in, void *out); - void (*reorder_spinor_quda_to_openqcd)(void *in, void *out); } openQCD_QudaInitArgs_t; @@ -118,14 +115,6 @@ typedef struct { } openQCD_QudaDiracParam_t; -typedef struct { - double tol; /* solver tolerance (relative residual) */ - double nmx; /* maximal number of steps */ - int nkv; /* number of Krylov vector to keep */ - double reliable_delta; /* controls interval at wich accurate residual is updated */ -} openQCD_QudaGCRParam_t; - - /** * Initialize the QUDA context. * @@ -151,13 +140,23 @@ void openQCD_back_and_forth(void *h_in, void *h_out); /** - * @brief Norm square on QUDA. + * @brief Norm square in QUDA. * * @param[in] h_in Spinor input field (from openQCD) * * @return The norm */ double openQCD_qudaNorm(void *h_in); + + +/** + * @brief Prototype function for the norm-square in QUDA without loading + * the field. + * + * @param[in] d_in Spinor input field (device pointer) + * + * @return The norm + */ double openQCD_qudaNorm_NoLoads(void *d_in); @@ -192,40 +191,6 @@ void openQCD_qudaDdagD(void *src, void *dst, openQCD_QudaDiracParam_t p); void openQCD_qudaDw2(void *param, double mu, void *src, void *dst); -/** - * Solve Ax=b for a Clover Wilson operator using QUDAs GCR algorithm. All fields - * are fields passed and returned are host (CPU) field in openQCD order. This - * function requires that persistent gauge and clover fields have been created - * prior. 
- * - * @param[in] source Source spinor - * @param[out] solution Solution spinor - * @param[in] dirac_param Dirac parameter struct - * @param[in] gcr_param GCR parameter struct - * - * @return residual - */ -double openQCD_qudaGCR(void *source, void *solution, - openQCD_QudaDiracParam_t dirac_param, openQCD_QudaGCRParam_t gcr_param); - - -/** - * Solve Ax=b for an Clover Wilson operator with a multigrid solver. All fields - * are fields passed and returned are host (CPU) field in openQCD order. This - * function requires that persistent gauge and clover fields have been created - * prior. - * - * Requires QUDA_PRECISION & 2 != 0, e.g. QUDA_PRECISON = 14 - * - * @param[in] source Right-hand side source field - * @param[out] solution Solution spinor field - * @param[in] dirac_param Dirac parameter struct - * - * @return residual - */ -double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param); - - /** * Setup the solver interface to quda. This function parses the file given by * [infile] as an openQCD ini file. 
The solver section given by the [section] diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu index 164f40ba3a..b1668cefa5 100644 --- a/lib/dslash_gamma_helper.cu +++ b/lib/dslash_gamma_helper.cu @@ -103,7 +103,7 @@ namespace quda { // Applies a gamma5 matrix to a spinor (wrapper to ApplyGamma) void gamma5(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,4); } - /* RG: I have written these */ + /* RG: I have added these */ void gamma0(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,0); } void gamma1(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,1); } void gamma2(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,2); } diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 23b6c70700..0bf40baffd 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -87,6 +87,11 @@ template void inline qudaopenqcd_called(const char *func, QudaVerbo template void inline qudaopenqcd_called(const char *func) { qudaopenqcd_called(func, getVerbosity()); } +/** + * Mapping of enums to their actual values. We have this mapping such that we + * can use the named parameters in our input files rather than the number. this + * makes reading and writing the configuration more understandable. + */ std::unordered_map enum_map = { {"QUDA_CG_INVERTER", std::to_string(QUDA_CG_INVERTER)}, {"QUDA_BICGSTAB_INVERTER", std::to_string(QUDA_BICGSTAB_INVERTER)}, @@ -665,29 +670,6 @@ static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) } -/** - * @brief Creates a new quda solver parameter struct - * - * @param[in] p OpenQCD Dirac parameter struct - * - * @return The quda solver parameter struct. 
- */ -static QudaInvertParam newOpenQCDSolverParam(openQCD_QudaDiracParam_t p) -{ - QudaInvertParam param = newOpenQCDDiracParam(p); - - param.compute_true_res = true; - - param.solution_type = QUDA_MAT_SOLUTION; - param.solve_type = QUDA_DIRECT_SOLVE; - param.matpc_type = QUDA_MATPC_EVEN_EVEN; - param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; - param.inv_type_precondition = QUDA_INVALID_INVERTER; /* disables any preconditioning */ - - return param; -} - - void openQCD_back_and_forth(void *h_in, void *h_out) { /* sets up the necessary parameters */ @@ -721,13 +703,6 @@ void openQCD_back_and_forth(void *h_in, void *h_out) } -/** - * @brief Calculates the norm of a spinor. - * - * @param[in] h_in input spinor of type spinor_dble[NSPIN] - * - * @return norm - */ double openQCD_qudaNorm(void *h_in) { QudaInvertParam param = newOpenQCDParam(); @@ -743,6 +718,7 @@ double openQCD_qudaNorm(void *h_in) return blas::norm2(in); } + double openQCD_qudaNorm_NoLoads(void *d_in) { return blas::norm2(*reinterpret_cast(d_in)); @@ -834,6 +810,7 @@ void openQCD_qudaSpinorFree(void** quda_field) *quda_field = nullptr; } + void openQCD_qudaD2H(void *quda_field, void *openQCD_field) { int my_rank; @@ -887,32 +864,6 @@ void openQCD_qudaDw2(void *param, double mu, void *src, void *dst) MatQuda(static_cast(dst), static_cast(src), inv_param); } -double openQCD_qudaGCR(void *source, void *solution, - openQCD_QudaDiracParam_t dirac_param, openQCD_QudaGCRParam_t gcr_param) -{ - QudaInvertParam param = newOpenQCDSolverParam(dirac_param); - - /* both fields reside on the CPU */ - param.input_location = QUDA_CPU_FIELD_LOCATION; - param.output_location = QUDA_CPU_FIELD_LOCATION; - - param.inv_type = QUDA_GCR_INVERTER; - param.tol = gcr_param.tol; - param.maxiter = gcr_param.nmx; - param.gcrNkrylov = gcr_param.nkv; - param.reliable_delta = gcr_param.reliable_delta; - - invertQuda(static_cast(solution), static_cast(source), ¶m); - - printfQuda("true_res = %.2e\n", param.true_res); - 
printfQuda("true_res_hq = %.2e\n", param.true_res_hq); - printfQuda("iter = %d\n", param.iter); - printfQuda("gflops = %.2e\n", param.gflops); - printfQuda("secs = %.2e\n", param.secs); - - return param.true_res; -} - void* openQCD_qudaSolverSetup(char *infile, char *section) { @@ -1404,116 +1355,10 @@ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) logQuda(QUDA_SUMMARIZE, " secs = %.2e\n", eig_param->secs); } + void openQCD_qudaEigensolverDestroy(void *param) { QudaEigParam* eig_param = static_cast(param); openQCD_qudaSolverDestroy(eig_param->invert_param); delete eig_param; } - - -double openQCD_qudaMultigrid(void *source, void *solution, openQCD_QudaDiracParam_t dirac_param) -{ - QudaInvertParam invert_param = newOpenQCDSolverParam(dirac_param); - QudaInvertParam invert_param_mg = newOpenQCDSolverParam(dirac_param); - QudaMultigridParam multigrid_param = newQudaMultigridParam(); - - invert_param.reliable_delta = 1e-5; - invert_param.gcrNkrylov = 20; - invert_param.maxiter = 2000; - invert_param.tol = 1e-12; - invert_param.inv_type = QUDA_GCR_INVERTER; - invert_param.solution_type = QUDA_MAT_SOLUTION; - invert_param.solve_type = QUDA_DIRECT_SOLVE; - invert_param.matpc_type = QUDA_MATPC_EVEN_EVEN; - invert_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION; - invert_param.inv_type_precondition = QUDA_MG_INVERTER; - invert_param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; /* The precision used by the QUDA solver */ - invert_param.cuda_prec_precondition = QUDA_HALF_PRECISION; /* The precision used by the QUDA solver */ - - invert_param_mg.reliable_delta = 1e-5; - invert_param_mg.gcrNkrylov = 20; - invert_param_mg.maxiter = 2000; - invert_param_mg.tol = 1e-12; - invert_param_mg.inv_type = QUDA_GCR_INVERTER; - invert_param_mg.solution_type = QUDA_MAT_SOLUTION; - invert_param_mg.solve_type = QUDA_DIRECT_SOLVE; - invert_param_mg.matpc_type = QUDA_MATPC_EVEN_EVEN; - invert_param_mg.solver_normalization = QUDA_DEFAULT_NORMALIZATION; - 
invert_param_mg.inv_type_precondition = QUDA_MG_INVERTER; - invert_param_mg.gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; - invert_param_mg.dirac_order = QUDA_DIRAC_ORDER; - - /** - * set the params, hard code the solver - * parameters copied from recommended settings from Wiki - */ - multigrid_param.n_level = 2; - multigrid_param.generate_all_levels = QUDA_BOOLEAN_TRUE; - multigrid_param.run_verify = QUDA_BOOLEAN_FALSE; - multigrid_param.invert_param = &invert_param_mg; - multigrid_param.compute_null_vector = QUDA_COMPUTE_NULL_VECTOR_YES; - - /** - * try setting minimal parameters - leave rest to default - * level 0 fine - */ - multigrid_param.geo_block_size[0][0] = 4; /* xytz */ - multigrid_param.geo_block_size[0][1] = 4; - multigrid_param.geo_block_size[0][2] = 4; - multigrid_param.geo_block_size[0][3] = 4; - multigrid_param.n_vec[0] = 24; - multigrid_param.spin_block_size[0] = 2; - multigrid_param.precision_null[0] = QUDA_HALF_PRECISION; - multigrid_param.smoother[0] = QUDA_CA_GCR_INVERTER; - multigrid_param.smoother_tol[0] = 0.25; - multigrid_param.location[0] = QUDA_CUDA_FIELD_LOCATION; - multigrid_param.nu_pre[0] = 0; - multigrid_param.nu_post[0] = 8; - multigrid_param.omega[0] = 0.8; - multigrid_param.smoother_solve_type[0] = QUDA_DIRECT_PC_SOLVE; - multigrid_param.cycle_type[0] = QUDA_MG_CYCLE_RECURSIVE; - multigrid_param.coarse_solver[0] = QUDA_GCR_INVERTER; - multigrid_param.coarse_solver_tol[0] = 0.25; - multigrid_param.coarse_solver_maxiter[0] = 50; - multigrid_param.coarse_grid_solution_type[0] = QUDA_MAT_SOLUTION; - - /** - * level 1 coarse - * no smoother required for innermost - * so no blocks - */ - multigrid_param.precision_null[1] = QUDA_HALF_PRECISION; - multigrid_param.coarse_solver[1] = QUDA_CA_GCR_INVERTER; - multigrid_param.smoother[1] = QUDA_CA_GCR_INVERTER; - multigrid_param.smoother_tol[1] = 0.25; - multigrid_param.spin_block_size[1] = 1; - multigrid_param.coarse_solver_tol[1] = 0.25; - multigrid_param.coarse_solver_maxiter[1] = 50; - 
multigrid_param.coarse_grid_solution_type[1] = QUDA_MATPC_SOLUTION; - multigrid_param.smoother_solve_type[1] = QUDA_DIRECT_PC_SOLVE; - multigrid_param.cycle_type[1] = QUDA_MG_CYCLE_RECURSIVE; - multigrid_param.location[1] = QUDA_CUDA_FIELD_LOCATION; - multigrid_param.nu_pre[1] = 0; - multigrid_param.nu_post[1] = 8; - multigrid_param.omega[1] = 0.8; - - PUSH_RANGE("newMultigridQuda",4); - void *mgprec = newMultigridQuda(&multigrid_param); - invert_param.preconditioner = mgprec; - POP_RANGE; - - PUSH_RANGE("invertQUDA",5); - invertQuda(static_cast(solution), static_cast(source), &invert_param); - POP_RANGE; - - destroyMultigridQuda(mgprec); - - printfQuda("true_res = %.2e\n", invert_param.true_res); - printfQuda("true_res_hq = %.2e\n", invert_param.true_res_hq); - printfQuda("iter = %d\n", invert_param.iter); - printfQuda("gflops = %.2e\n", invert_param.gflops); - printfQuda("secs = %.2e\n", invert_param.secs); - - return invert_param.true_res; -} From 7085cc538888bd9e8d83f697b14defa40095042b Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 6 Nov 2023 17:51:15 +0100 Subject: [PATCH 112/148] outsourced all openqcd index functions --- include/clover_field_order.h | 106 +--------- include/color_spinor_field_order.h | 110 ++--------- include/gauge_field_order.h | 244 ++--------------------- include/index_helper.cuh | 305 ++++++++++++++++++++++++++++- 4 files changed, 333 insertions(+), 432 deletions(-) diff --git a/include/clover_field_order.h b/include/clover_field_order.h index 5b83952567..0187228dc5 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -1030,11 +1030,8 @@ namespace quda { const double coeff; const double csw; const double kappa; - const int L[4]; // xyzt convention - const int L_[4]; // txyz convention - const int volume; - const int cbs[4]; // openQCDs cache block size - const int cbn[4]; // openQCDs cache block grid + const int dim[4]; // xyzt convention + const int L[4]; // txyz convention OpenQCDOrder(const 
CloverField &clover, bool inverse, Float *clover_ = nullptr, void * = nullptr) : volumeCB(clover.Stride()), @@ -1044,11 +1041,8 @@ namespace quda { coeff(clover.Coeff()), csw(clover.Csw()), kappa(clover.Coeff()/clover.Csw()), - L {clover.X()[0], clover.X()[1], clover.X()[2], clover.X()[3]}, // *local* lattice dimensions, xyzt - L_ {clover.X()[3], clover.X()[0], clover.X()[1], clover.X()[2]}, // *local* lattice dimensions, txyz - volume(L_[0]*L_[1]*L_[2]*L_[3]), // *local* lattice volume - cbs {setup_cbs(0, L_), setup_cbs(1, L_), setup_cbs(2, L_), setup_cbs(3, L_)}, // txyz - cbn {L_[0]/cbs[0], L_[1]/cbs[1], L_[2]/cbs[2], L_[3]/cbs[3]} // txyz + dim {clover.X()[0], clover.X()[1], clover.X()[2], clover.X()[3]}, // *local* lattice dimensions, xyzt + L {clover.X()[3], clover.X()[0], clover.X()[1], clover.X()[2]} // *local* lattice dimensions, txyz { if (clover.Order() != QUDA_OPENQCD_CLOVER_ORDER) { errorQuda("Invalid clover order %d for this accessor", clover.Order()); @@ -1063,92 +1057,6 @@ namespace quda { Float Mu2() const { return mu2; } Float Epsilon2() const { return epsilon2; } - __device__ __host__ inline int setup_cbs(const int mu, const int *X) const - { - if (mu==0) { - return X[0]; - } else if ((X[mu]%4)==0) { - return 4; - } else if ((X[mu]%3)==0) { - return 3; - } else if ((X[mu]%2)==0) { - return 2; - } else { - return 1; - } - } - - /** - * @brief Rotate coordinates (xyzt -> txyz) - * - * @param[in] x_quda Cartesian local lattice coordinates in quda - * convention (xyzt) - * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD - * convention (txyz) - */ - __device__ __host__ inline void rotate_coords(const int *x_quda, int *x_openQCD) const - { - x_openQCD[1] = x_quda[0]; - x_openQCD[2] = x_quda[1]; - x_openQCD[3] = x_quda[2]; - x_openQCD[0] = x_quda[3]; - } - - /** - * @brief Generate a lexicographical index with x[Ndims-1] running - * fastest, for example if Ndims=4: - * ix = X3*X2*X1*x0 + X3*X2*x1 + X3*x2 + x3. 
- * - * @param[in] x Integer array of dimension Ndims with coordinates - * @param[in] X Integer array of dimension Ndims with extents - * @param[in] Ndims The number of dimensions - * - * @return Lexicographical index - */ - __device__ __host__ inline int lexi(const int *x, const int *X, const int Ndims) const - { - int i, ix = x[0]; - - #pragma unroll - for (i=1; i +VOLUME/2 */ - ); - } - /** * @brief Gets the offset in Floats from the openQCD base pointer to * the spinor field. @@ -1161,9 +1069,9 @@ namespace quda { __device__ __host__ inline int getCloverOffset(int x_cb, int parity) const { int x_quda[4], x[4]; - getCoords(x_quda, x_cb, L, parity); // x_quda contains xyzt local Carthesian corrdinates - rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate - return ipt(x)*length; + getCoords(x_quda, x_cb, dim, parity); // x_quda contains xyzt local Carthesian corrdinates + openqcd::rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate + return openqcd::ipt(x, L)*length; } /** diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index e4df3cd876..89096e3202 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1813,6 +1813,10 @@ namespace quda /** * struct to define order of spinor fields in OpenQCD + * + * @tparam Float Underlying type of data (precision) + * @tparam Ns Number of spin degrees of freedom + * @tparam Nc Number of color degrees of freedom */ template struct OpenQCDDiracOrder { using Accessor = OpenQCDDiracOrder; @@ -1826,114 +1830,22 @@ namespace quda int volumeCB; int faceVolumeCB[4]; int nParity; - const int L[4]; // xyzt convention - const int L_[4]; // txyz convention - const int volume; - const int cbs[4]; // openQCDs cache block size - const int cbn[4]; // openQCDs cache block grid + const int dim[4]; // xyzt convention + const int L[4]; // txyz convention OpenQCDDiracOrder(const ColorSpinorField &a, 
int = 1, Float *field_ = 0, float * = 0) : field(field_ ? field_ : a.data()), offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for?? volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), - L {a.X()[0], a.X()[1], a.X()[2], a.X()[3]}, // *local* lattice dimensions, xyzt - L_ {a.X()[3], a.X()[0], a.X()[1], a.X()[2]}, // *local* lattice dimensions, txyz - volume(L_[0]*L_[1]*L_[2]*L_[3]), // *local* lattice volume - cbs {setup_cbs(0, L_), setup_cbs(1, L_), setup_cbs(2, L_), setup_cbs(3, L_)}, // txyz - cbn {L_[0]/cbs[0], L_[1]/cbs[1], L_[2]/cbs[2], L_[3]/cbs[3]} // txyz + dim {a.X(0), a.X(1), a.X(2), a.X(3)}, // *local* lattice dimensions, xyzt + L {a.X(3), a.X(0), a.X(1), a.X(2)} // *local* lattice dimensions, txyz { if constexpr (length != 24) { errorQuda("Spinor field length %d not supported", length); } } - __device__ __host__ inline int setup_cbs(const int mu, const int *X) const - { - if (mu==0) { - return X[0]; - } else if ((X[mu]%4)==0) { - return 4; - } else if ((X[mu]%3)==0) { - return 3; - } else if ((X[mu]%2)==0) { - return 2; - } else { - return 1; - } - } - - /** - * @brief Rotate coordinates (xyzt -> txyz) - * - * @param[in] x_quda Cartesian local lattice coordinates in quda - * convention (xyzt) - * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD - * convention (txyz) - */ - __device__ __host__ inline void rotate_coords(const int *x_quda, int *x_openQCD) const - { - x_openQCD[1] = x_quda[0]; - x_openQCD[2] = x_quda[1]; - x_openQCD[3] = x_quda[2]; - x_openQCD[0] = x_quda[3]; - } - - /** - * @brief Generate a lexicographical index with x[Ndims-1] running - * fastest, for example if Ndims=4: - * ix = X3*X2*X1*x0 + X3*X2*x1 + X3*x2 + x3. 
- * - * @param[in] x Integer array of dimension Ndims with coordinates - * @param[in] X Integer array of dimension Ndims with extents - * @param[in] Ndims The number of dimensions - * - * @return Lexicographical index - */ - __device__ __host__ inline int lexi(const int *x, const int *X, const int Ndims) const - { - int i, ix = x[0]; - - #pragma unroll - for (i=1; i +VOLUME/2 */ - ); - } - /** * @brief Gets the offset in Floats from the openQCD base pointer to * the spinor field. @@ -1946,9 +1858,9 @@ namespace quda __device__ __host__ inline int getSpinorOffset(int x_cb, int parity) const { int x_quda[4], x[4]; - getCoords(x_quda, x_cb, L, parity); // x_quda contains xyzt local Carthesian corrdinates - rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate - return ipt(x)*length; + getCoords(x_quda, x_cb, dim, parity); // x_quda contains xyzt local Carthesian corrdinates + openqcd::rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate + return openqcd::ipt(x, L)*length; } __device__ __host__ inline void load(complex v[length/2], int x_cb, int parity = 0) const diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index fba5e31dfb..b4cd6e108e 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2361,232 +2361,23 @@ namespace quda { Float *gauge; const int volumeCB; static constexpr int Nc = 3; - const int L[4]; // xyzt convention - const int L_[4]; // txyz convention - const int volume; + const int dim[4]; // xyzt convention + const int L[4]; // txyz convention const int nproc[4]; - const int face[4]; - const int bndry; - const int ifc[4]; - const int face_offset[4]; - const int cbs[4]; // openQCDs cache block size - const int cbn[4]; // openQCDs cache block grid OpenQCDOrder(const GaugeField &u, Float *gauge_ = 0, Float **ghost_ = 0) : LegacyOrder(u, ghost_), gauge(gauge_ ? 
gauge_ : (Float *)u.data()), // pointer to the gauge field on CPU volumeCB(u.VolumeCB()), // Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice - L {u.X()[0], u.X()[1], u.X()[2], u.X()[3]}, // *local* lattice dimensions, xyzt - L_ {u.X()[3], u.X()[0], u.X()[1], u.X()[2]}, // *local* lattice dimensions, txyz - volume(L_[0]*L_[1]*L_[2]*L_[3]), // *local* lattice volume - nproc {comm_dim(3), comm_dim(0), comm_dim(1), comm_dim(2)}, // txyz - face {((1-(nproc[0]%2))*L_[1]*L_[2]*L_[3]), - ((1-(nproc[1]%2))*L_[2]*L_[3]*L_[0]), - ((1-(nproc[2]%2))*L_[3]*L_[0]*L_[1]), - ((1-(nproc[3]%2))*L_[0]*L_[1]*L_[2])}, // txyz - bndry(2*(face[0]+face[1]+face[2]+face[3])), - ifc {(face[0]/2), - face[0] + (face[1]/2), - face[0] + face[1] + (face[2]/2), - face[0] + face[1] + face[2] + (face[3]/2)}, // txyz - face_offset {0, face[0]/2, face[0]/2 + face[1]/2, face[0]/2 + face[1]/2 + face[2]/2}, //txyz - cbs {setup_cbs(0, L_), setup_cbs(1, L_), setup_cbs(2, L_), setup_cbs(3, L_)}, // txyz - cbn {L_[0]/cbs[0], L_[1]/cbs[1], L_[2]/cbs[2], L_[3]/cbs[3]} // txyz + dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]}, // *local* lattice dimensions, xyzt + L {u.X()[3], u.X()[0], u.X()[1], u.X()[2]}, // *local* lattice dimensions, txyz + nproc {comm_dim(3), comm_dim(0), comm_dim(1), comm_dim(2)} // txyz { if constexpr (length != 18) { errorQuda("Gauge field length %d not supported", length); } } - __device__ __host__ inline int setup_cbs(const int mu, const int *X) const - { - if (mu==0) { - return X[0]; - } else if ((X[mu]%4)==0) { - return 4; - } else if ((X[mu]%3)==0) { - return 3; - } else if ((X[mu]%2)==0) { - return 2; - } else { - return 1; - } - } - - /** - * @brief Rotate coordinates (xyzt -> txyz) - * - * @param[in] x_quda Cartesian local lattice coordinates in quda - * convention (xyzt) - * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD - * convention (txyz) - */ - __device__ __host__ inline void rotate_coords(const int *x_quda, int 
*x_openQCD) const - { - x_openQCD[1] = x_quda[0]; - x_openQCD[2] = x_quda[1]; - x_openQCD[3] = x_quda[2]; - x_openQCD[0] = x_quda[3]; - } - - /** - * @brief Generate a lexicographical index with x[Ndims-1] running - * fastest, for example if Ndims=4: - * ix = X3*X2*X1*x0 + X3*X2*x1 + X3*x2 + x3. - * - * @param[in] x Integer array of dimension Ndims with coordinates - * @param[in] X Integer array of dimension Ndims with extents - * @param[in] Ndims The number of dimensions - * - * @return Lexicographical index - */ - __device__ __host__ inline int lexi(const int *x, const int *X, const int Ndims) const - { - int i, ix = x[0]; - - #pragma unroll - for (i=1; i +VOLUME/2 */ - ); - } - - /** - * @brief Pure implementation of iup[ix][mu] - * - * @param[in] x Cartesian local lattice corrdinates, 0 <= x[i] < Li, length - * 4 in txyz convention - * @param[in] mu Direction in txyz convention - * - * @return iup[ix][mu] - */ - __device__ __host__ inline int iup(const int *x, const int mu) const - { - int i, ret, xb[4], xn[4]; - - if ((x[mu]==(L_[mu]-1))&&(nproc[mu]>1)) { - - xb[0] = x[0] % cbs[0]; - xb[1] = x[1] % cbs[1]; - xb[2] = x[2] % cbs[2]; - xb[3] = x[3] % cbs[3]; - - xn[0] = x[0]/cbs[0]; - xn[1] = x[1]/cbs[1]; - xn[2] = x[2]/cbs[2]; - xn[3] = x[3]/cbs[3]; - - ret = volume + ifc[mu]; - if ((x[0]+x[1]+x[2]+x[3]) % 2 == 0) { - ret += bndry/2; - } - - ret += surface(cbs, mu)*boundary_pts(mu, xn, cbn)/2; - ret += boundary_pts(mu, xb, cbs)/2; - return ret; - - } else { - #pragma unroll - for (i=0; i<4; i++) { - xb[i] = x[i]; - } - - xb[mu] = (xb[mu] + 1) % (L_[mu]*nproc[mu]); - return ipt(xb); - } - } - /** * @brief Obtains the offset in Floats from the openQCD base pointer * to the gauge fields. At this point, fields are already @@ -2599,14 +2390,12 @@ namespace quda { * * @return The offset. 
*/ - __device__ __host__ inline int getGaugeOffset_old(int x, int dir, int parity) const { - int coord[4]; - getCoords(coord, x, L, parity); - int idx = coord[3] + L[3]*coord[2] + L[3]*L[2]*coord[1] + L[3]*L[2]*L[1]*coord[0]; - return (4*idx + dir)*length; + __device__ __host__ inline int getGaugeOffset_lexi(int x_cb, int dir, int parity) const { + int x[4]; + getCoords(x, x_cb, dim, parity); + return (4*openqcd::lexi(x, dim, 4) + dir)*length; } - /** * @brief Obtains the offset in Floats from the openQCD base pointer * to the gauge fields. @@ -2619,20 +2408,21 @@ namespace quda { */ __device__ __host__ inline int getGaugeOffset(int x_cb, int dir, int parity) const { int quda_x[4], x[4]; - getCoords(quda_x, x_cb, L, parity); // x_quda = quda local lattice coordinates - rotate_coords(quda_x, x); // x = openQCD local lattice coordinates + getCoords(quda_x, x_cb, dim, parity); // x_quda = quda local lattice coordinates + openqcd::rotate_coords(quda_x, x); // x = openQCD local lattice coordinates int mu = (dir+1) % 4; // mu = openQCD direction - int ix = ipt(x); - int iz = iup(x, mu); + int ix = openqcd::ipt(x, L); + int iz = openqcd::iup(x, mu, L, nproc); int ofs = 0; + int volume = openqcd::vol(L); if (ix < volume/2) { // ix even -> iz odd if (iz < volume) { // iz in interior ofs = 8*(iz - volume/2) + 2*mu + 1; } else { - int ib = iz - volume - ifc[mu] - bndry/2; // iz in exterior - ofs = 4*volume + face_offset[mu] + ib; + int ib = iz - volume - openqcd::ifc(L, nproc, mu) - openqcd::bndry(L, nproc)/2; // iz in exterior + ofs = 4*volume + openqcd::face_offset(L, nproc, mu) + ib; } } else if (volume/2 <= ix && ix < volume) { // ix odd ofs = 8*(ix - volume/2) + 2*mu; @@ -2649,7 +2439,7 @@ namespace quda { __device__ __host__ inline void save(const complex v[length/2], int x_cb, int dir, int parity) const { - auto out = &gauge[getGaugeOffset_old(x_cb, dir, parity)]; + auto out = &gauge[getGaugeOffset_lexi(x_cb, dir, parity)]; block_store(reinterpret_cast(out), v); } diff 
--git a/include/index_helper.cuh b/include/index_helper.cuh index 688c997043..2ffead559c 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -1106,11 +1106,302 @@ namespace quda { return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1; } -} // namespace quda -// namespace openqcd { -// __device__ __host__ inline int ipt(...) -// { -// return ...; -// } -// } + /** + * These are index helper functions used in the order classes of openQCD, i.e. + * + * - OpenQCDOrder in quda:include/gauge_field_order.h + * - OpenQCDDiracOrder in quda:include/color_spinor_field_order.h + * - OpenQCDOrder in quda:include/clover_field_order.h. + * + * The main helper functions are ipt() and iup(), giving pure function + * implementations of the ipt[] and iup[][] arrays (see + * openqcd:include/global.h) that are needed to calculate the correct offsets + * of the fields base pointers. + */ + namespace openqcd { + + /** + * @brief Returns the surface in direction mu + * + * @param X Extent in all 4 directions + * @param[in] mu Direction + * + * @return Surface + */ + __device__ __host__ inline int surface(const int *X, const int mu) + { + if (mu==0) { + return X[1]*X[2]*X[3]; + } else if (mu==1) { + return X[0]*X[2]*X[3]; + } else if (mu==2) { + return X[0]*X[1]*X[3]; + } + return X[0]*X[1]*X[2]; + } + + + /** + * @brief Return BNDRY (see openqcd:include/global.h) + * + * @param[in] L Local lattice extent L0-L3 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * + * @return BNDRY + */ + __device__ __host__ inline int bndry(const int *L, const int *nproc) + { + return 2*(((1-(nproc[0]%2))*surface(L, 0)) + + ((1-(nproc[1]%2))*surface(L, 1)) + + ((1-(nproc[2]%2))*surface(L, 2)) + + ((1-(nproc[3]%2))*surface(L, 3)) + ); + } + + + __device__ __host__ inline int ifc(const int *L, const int *nproc, const int mu) + { + if (mu==0) { + return ((1-(nproc[0]%2))*surface(L, 0))/2; + } else if (mu==1) { + return ((1-(nproc[0]%2))*surface(L, 0)) + + 
(((1-(nproc[1]%2))*surface(L, 1))/2); + } else if (mu==2) { + return ((1-(nproc[0]%2))*surface(L, 0)) + + ((1-(nproc[1]%2))*surface(L, 1)) + + (((1-(nproc[2]%2))*surface(L, 2))/2); + } + return ((1-(nproc[0]%2))*surface(L, 0)) + + ((1-(nproc[1]%2))*surface(L, 1)) + + ((1-(nproc[2]%2))*surface(L, 2)) + + (((1-(nproc[3]%2))*surface(L, 3))/2); + } + + + __device__ __host__ inline int face_offset(const int *L, const int *nproc, const int mu) + { + if (mu==0) { + return 0; + } else if (mu==1) { + return ((1-(nproc[0]%2))*surface(L, 0))/2; + } else if (mu==2) { + return ((1-(nproc[0]%2))*surface(L, 0))/2 + + ((1-(nproc[1]%2))*surface(L, 1))/2; + } + return ((1-(nproc[0]%2))*surface(L, 0))/2 + + ((1-(nproc[1]%2))*surface(L, 1))/2 + + ((1-(nproc[2]%2))*surface(L, 2))/2; + } + + + /** + * @brief Rotate coordinates (xyzt -> txyz) + * + * @param[in] x_quda Cartesian local lattice coordinates in quda + * convention (xyzt) + * @param[out] x_openQCD Cartesian local lattice coordinates in openQCD + * convention (txyz) + */ + __device__ __host__ inline void rotate_coords(const int *x_quda, int *x_openQCD) + { + x_openQCD[1] = x_quda[0]; + x_openQCD[2] = x_quda[1]; + x_openQCD[3] = x_quda[2]; + x_openQCD[0] = x_quda[3]; + } + + + /** + * @brief Generate a lexicographical index with x[Ndims-1] running + * fastest, for example if Ndims=4: + * ix = X3*X2*X1*x0 + X3*X2*x1 + X3*x2 + x3. 
+ * + * @param[in] x Integer array of dimension Ndims with coordinates + * @param[in] X Integer array of dimension Ndims with extents + * @param[in] Ndims The number of dimensions + * + * @return Lexicographical index + */ + __device__ __host__ inline int lexi(const int *x, const int *X, const int Ndims) + { + int i, ix = x[0]; + + #pragma unroll + for (i=1; i +VOLUME/2 + ); + } + + + /** + * @brief Determines the number of boundary points in direction mu prior to + * the Carthesian index x with dimensions X + * + * @param[in] mu Direction + * @param[in] x Lattice point + * @param[in] X Dimensions of the lattice/block + * + * @return Number of prior boundary points + */ + __device__ __host__ inline int boundary_pts(const int mu, const int *x, const int *X) + { + int ret = 0; + + if (mu==3) { + ret = lexi(x, X, 3); // lexi without x[3] + } else if (mu==2) { + ret = X[3]*lexi(x, X, 2); + if (x[2]==(X[2]-1)) { + ret += x[3]; // lexi without x[2] + } + } else if (mu==1) { + if (x[1]==(X[1]-1)) { + ret = X[2]*X[3]*x[0] + X[3]*x[2] + x[3]; // lexi without x[1] + } else { + ret = surface(X, 1); + } + } else if (mu==0) { + if (x[0]==(X[0]-1)) { + ret = lexi(x+1, X+1, 3); // lexi without x[0] + } else { + ret = surface(X, 0); + } + } + + return ret; + } + + + /** + * @brief Pure implementation of iup[ix][mu] + * + * @param[in] x Cartesian local lattice corrdinates, 0 <= x[i] < Li, + * length 4 in txyz convention + * @param[in] mu Direction in txyz convention + * @param[in] L Local lattice extents, length 4 in txyz convention + * + * @return iup[ix][mu] + */ + __device__ __host__ inline int iup(const int *x, const int mu, const int *L, const int *nproc) + { + int i, ret, xb[4], xn[4]; + + if ((x[mu]==(L[mu]-1))&&(nproc[mu]>1)) { + + int cbs[4] = {setup_cbs(0, L), setup_cbs(1, L), setup_cbs(2, L), setup_cbs(3, L)}; + int cbn[4] = {L[0]/cbs[0], L[1]/cbs[1], L[2]/cbs[2], L[3]/cbs[3]}; + + xb[0] = x[0] % cbs[0]; + xb[1] = x[1] % cbs[1]; + xb[2] = x[2] % cbs[2]; + xb[3] = x[3] % 
cbs[3]; + + xn[0] = x[0]/cbs[0]; + xn[1] = x[1]/cbs[1]; + xn[2] = x[2]/cbs[2]; + xn[3] = x[3]/cbs[3]; + + ret = vol(L) + ifc(L, nproc, mu); + if ((x[0]+x[1]+x[2]+x[3]) % 2 == 0) { + ret += bndry(L, nproc)/2; + } + + ret += surface(cbs, mu)*boundary_pts(mu, xn, cbn)/2; + ret += boundary_pts(mu, xb, cbs)/2; + return ret; + + } else { + #pragma unroll + for (i=0; i<4; i++) { + xb[i] = x[i]; + } + + xb[mu] = (xb[mu] + 1) % (L[mu]*nproc[mu]); + return ipt(xb, L); + } + } + + + } // namespace openqcd + +} // namespace quda From ad6787acbf02a6c45d05777cd4a9a5250f393fd2 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 6 Nov 2023 18:28:35 +0100 Subject: [PATCH 113/148] fixed reordering on GPU --- lib/gauge_field.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 5454c375ba..35b18bf45b 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -1013,7 +1013,7 @@ namespace quda { } else { // else reorder on the GPU if (order == QUDA_MILC_SITE_GAUGE_ORDER || order == QUDA_BQCD_GAUGE_ORDER - || order == QUDA_TIFR_PADDED_GAUGE_ORDER || order == QUDA_OPENQCD_GAUGE_ORDER) { + || order == QUDA_TIFR_PADDED_GAUGE_ORDER) { // special case where we use zero-copy memory to read/write directly from application's array void *data_d = get_mapped_device_pointer(data()); if (GhostExchange() == QUDA_GHOST_EXCHANGE_NO) { @@ -1082,7 +1082,7 @@ namespace quda { } else { // else on the GPU if (src.Order() == QUDA_MILC_SITE_GAUGE_ORDER || src.Order() == QUDA_BQCD_GAUGE_ORDER - || src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER || src.Order() == QUDA_OPENQCD_GAUGE_ORDER) { + || src.Order() == QUDA_TIFR_PADDED_GAUGE_ORDER) { // special case where we use zero-copy memory to read/write directly from application's array void *src_d = get_mapped_device_pointer(src.data()); From df1bd161918f3c00967037121db9698d175cfa76 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 7 Nov 2023 16:30:20 +0100 Subject: [PATCH 114/148] macro 
war to enable "-std=c89 -pedantic -Werror" in openqxds checks --- include/quda_openqcd_interface.h | 39 ++++++++++++++++++++++++++++++++ lib/openqcd_interface.cpp | 2 +- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 145153fafb..192a914ebf 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -1,6 +1,45 @@ #pragma once +/** + * The macro battle below is to trick quda.h to think that double_complex is + * defined to be the struct below. For this we need to set the __CUDACC_RTC__, + * which makes double_complex to be defined as double2 (see quda.h), which we + * redefine below as openqcd_complex_dble. The original definitions of + * __CUDACC_RTC__ and double2 are recovered below. We do this to be able to + * include this header file into a openQxD program and compile with flags + * "-std=C89 -pedantic -Werror". Else the compiler trows an + * "ISO C90 does not support complex types" error because of the + * "double _Complex" data types exposed in quda.h. 
+ */ + +typedef struct +{ + double re,im; +} openqcd_complex_dble; + +#ifdef __CUDACC_RTC__ +#define __CUDACC_RTC_ORIGINAL__ __CUDACC_RTC__ +#endif + +#ifdef double2 +#define double2_ORIGINAL double2 +#endif + +#define __CUDACC_RTC__ +#define double2 openqcd_complex_dble #include +#undef double2 +#undef __CUDACC_RTC__ + +#ifdef double2_ORIGINAL +#define double2 double2_ORIGINAL +#undef double2_ORIGINAL +#endif + +#ifdef __CUDACC_RTC_ORIGINAL__ +#define __CUDACC_RTC__ __CUDACC_RTC_ORIGINAL__ +#undef __CUDACC_RTC_ORIGINAL__ +#endif /** * @file quda_openqcd_interface.h diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 0bf40baffd..b5cc76ee92 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -1347,7 +1347,7 @@ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) logQuda(QUDA_VERBOSE, "Calling eigensolveQuda() ...\n"); PUSH_RANGE("eigensolveQuda",6); - eigensolveQuda(h_evecs, static_cast(h_evals), eig_param); + eigensolveQuda(h_evecs, static_cast(h_evals), eig_param); POP_RANGE; logQuda(QUDA_SUMMARIZE, "openQCD_qudaEigensolve()\n"); From 93c3f6aa8069c3f8be8e51a11900e2c49c91efa1 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 8 Nov 2023 11:51:18 +0100 Subject: [PATCH 115/148] added comment for loadGaugeQuda --- include/quda_openqcd_interface.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 192a914ebf..b9ec693c55 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -297,7 +297,9 @@ double openQCD_qudaPlaquette(void); /** - * @brief Load the gauge fields from host to quda. + * @brief Load the gauge fields from host to quda. Notice that the boundary + * fields have to be up2date; i.e. call copy_bnd_hd(), copy_bnd_ud() + * before pass fields into this function. 
* * @param[in] gauge The gauge fields (in openqcd order) * @param[in] prec Precision of the incoming gauge field From 8301dbfe82e6d6b71e78b20892faafbe1cda46a8 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 8 Nov 2023 12:03:14 +0100 Subject: [PATCH 116/148] added MatQuda profiling --- lib/interface_quda.cpp | 9 +++++++++ lib/openqcd_interface.cpp | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/lib/interface_quda.cpp b/lib/interface_quda.cpp index 7117677300..eda9dcc9ea 100644 --- a/lib/interface_quda.cpp +++ b/lib/interface_quda.cpp @@ -128,6 +128,9 @@ static TimeProfile profileClover("loadCloverQuda"); //!< Profiler for dslashQuda static TimeProfile profileDslash("dslashQuda"); +//!< Profiler for MatQuda +static TimeProfile profileMat("MatQuda"); + //!< Profiler for invertQuda static TimeProfile profileInvert("invertQuda"); @@ -1372,6 +1375,7 @@ void endQuda(void) profileGauge.Print(); profileClover.Print(); profileDslash.Print(); + profileMat.Print(); profileInvert.Print(); profileInvertMultiSrc.Print(); profileMulti.Print(); @@ -1844,6 +1848,7 @@ void dslashQuda(void *h_out, void *h_in, QudaInvertParam *inv_param, QudaParity void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) { + auto profile = pushProfile(profileMat, inv_param->secs, inv_param->gflops); pushVerbosity(inv_param->verbosity); const auto &gauge = (inv_param->dslash_type != QUDA_ASQTAD_DSLASH) ? 
*gaugePrecise : *gaugeFatPrecise; @@ -1865,6 +1870,8 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) ColorSpinorField in(cudaParam); in = in_h; + profileMat.TPSTART(QUDA_PROFILE_COMPUTE); + logQuda(QUDA_DEBUG_VERBOSE, "In CPU %e CUDA %e\n", blas::norm2(in_h), blas::norm2(in)); cudaParam.create = QUDA_NULL_FIELD_CREATE; @@ -1892,6 +1899,8 @@ void MatQuda(void *h_out, void *h_in, QudaInvertParam *inv_param) } } + profileMat.TPSTOP(QUDA_PROFILE_COMPUTE); + cpuParam.v = h_out; cpuParam.location = inv_param->output_location; ColorSpinorField out_h(cpuParam); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index b5cc76ee92..479a31fd21 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -837,6 +837,10 @@ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) param.output_location = QUDA_CPU_FIELD_LOCATION; MatQuda(static_cast(dst), static_cast(src), &param); + + logQuda(QUDA_DEBUG_VERBOSE, "MatQuda()\n"); + logQuda(QUDA_DEBUG_VERBOSE, " gflops = %.2e\n", param.gflops); + logQuda(QUDA_DEBUG_VERBOSE, " secs = %.2e\n", param.secs); } From 200295dec3d56fa900f78f9c74adb5288e607af4 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 14 Nov 2023 16:15:34 +0100 Subject: [PATCH 117/148] removed check functions and unnecessary including --- include/gauge_field_order.h | 1 - lib/openqcd_interface.cpp | 8 -------- 2 files changed, 9 deletions(-) diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index b4cd6e108e..f2c14223cb 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -23,7 +23,6 @@ #include #include #include -#include namespace quda { diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 479a31fd21..27090bd2c7 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -17,9 +17,6 @@ #include #include -#define CHECK_PARAM -#include "check_params.h" -#undef CHECK_PARAM #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) @@ -524,8 +521,6 @@ static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec) param.anisotropy = 1.0; /* 1.0 means not anisotropic */ param.ga_pad = getLinkPadding(param.X); /* Why this? */ - checkGaugeParam(&param); - return param; } @@ -1207,13 +1202,11 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) POP_RANGE; } - checkInvertParam(param); if (param->verbosity >= QUDA_DEBUG_VERBOSE) { printQudaInvertParam(param); } if (param->inv_type_precondition == QUDA_MG_INVERTER) { - checkMultigridParam(multigrid_param); if (param->verbosity >= QUDA_DEBUG_VERBOSE) { printQudaMultigridParam(multigrid_param); } @@ -1336,7 +1329,6 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio if (verbosity >= QUDA_DEBUG_VERBOSE) { printQudaEigParam(param); } - checkEigParam(param); if (verbosity >= QUDA_DEBUG_VERBOSE) { printQudaEigParam(param); } From 59578cc908207040a00090506381d2897ebf1387 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 14 Nov 2023 16:30:22 +0100 Subject: [PATCH 118/148] added requested comments --- include/index_helper.cuh | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/include/index_helper.cuh b/include/index_helper.cuh index 2ffead559c..e3c2293667 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -1160,6 +1160,15 @@ namespace quda { } + /** + * @brief Calculate the offset needed for boundary points in openQCD. + * + * @param[in] L Local lattice extent L0-L3 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * @param[in] mu Direction in txyz + * + * @return The offset + */ __device__ __host__ inline int ifc(const int *L, const int *nproc, const int mu) { if (mu==0) { @@ -1179,6 +1188,15 @@ namespace quda { } + /** + * @brief Calculate the offset of the faces in openQCD. 
+ * + * @param[in] L Local lattice extent L0-L3 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd + * @param[in] mu Direction in txyz + * + * @return The offset + */ __device__ __host__ inline int face_offset(const int *L, const int *nproc, const int mu) { if (mu==0) { @@ -1249,7 +1267,8 @@ namespace quda { /** - * @brief Return cbs[]. + * @brief Return cbs[]. This is the cache block size in openQCD, which + * the local lattice is divided into. * * @param[in] mu Direction * @param[in] X Extents @@ -1353,12 +1372,14 @@ namespace quda { /** - * @brief Pure implementation of iup[ix][mu] + * @brief Pure implementation of iup[ix][mu]. Returns neighbouring + * point of ix in positive mu direction. * - * @param[in] x Cartesian local lattice corrdinates, 0 <= x[i] < Li, - * length 4 in txyz convention - * @param[in] mu Direction in txyz convention - * @param[in] L Local lattice extents, length 4 in txyz convention + * @param[in] x Cartesian local lattice corrdinates, 0 <= x[i] < Li, + * length 4 in txyz convention + * @param[in] mu Direction in txyz convention + * @param[in] L Local lattice extents, length 4 in txyz convention + * @param[in] nproc NPROC0-NPROC3 from openqcd * * @return iup[ix][mu] */ From bab885c0d8c4e7cf745dadec1ace9e380e418826 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 5 Dec 2023 09:15:40 +0100 Subject: [PATCH 119/148] check for nullptr in comm_create_topology --- include/quda_openqcd_interface.h | 10 +++++----- lib/comm_common.cpp | 3 +-- lib/copy_gauge_extended.cu | 2 +- lib/openqcd_interface.cpp | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index b9ec693c55..60b4d1175b 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -113,11 +113,11 @@ typedef struct { data[5+lex(ix,iy,iz,it)] returns rank number in openQCD, where lex stands for lexicographical indexing (in QUDA order (xyzt)) */ - bc_parms_t 
bc_parms; - dirac_parms_t dirac_parms; - flds_parms_t flds_parms; - void* (*h_gauge)(void); - void* (*h_sw)(void); + bc_parms_t bc_parms; /** @see bc_parms() */ + dirac_parms_t dirac_parms; /** @see dirac_parms() */ + flds_parms_t flds_parms; /** @see flds_parms() */ + void* (*h_gauge)(void); /** function to return a pointer to the gauge fields */ + void* (*h_sw)(void); /** function to return a pointer to the clover fields */ } openQCD_QudaLayout_t; diff --git a/lib/comm_common.cpp b/lib/comm_common.cpp index 2b39110f52..11b55e6085 100644 --- a/lib/comm_common.cpp +++ b/lib/comm_common.cpp @@ -103,8 +103,7 @@ namespace quda Topology *topo = new Topology; #ifdef BUILD_OPENQCD_INTERFACE - int *data = static_cast(map_data); - topo->cstar = data[0]; + topo->cstar = (map_data == nullptr) ? 0 : static_cast(map_data)[0]; #else topo->cstar = 0; #endif diff --git a/lib/copy_gauge_extended.cu b/lib/copy_gauge_extended.cu index dede657c76..01045642fc 100644 --- a/lib/copy_gauge_extended.cu +++ b/lib/copy_gauge_extended.cu @@ -116,7 +116,7 @@ namespace quda { using G = OpenQCDOrder; CopyGaugeEx(out, in, location, Out, In); #else - errorQuda("OPENQCD interface has not been built\n"); + errorQuda("OPENQCD interface has not been built"); #endif } else { diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 27090bd2c7..747eebb38a 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -537,7 +537,7 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout) qudaState.initialized = true; } -void openQCD_qudaFinalize() { +void openQCD_qudaFinalize(void) { qudaState.initialized = false; endQuda(); } From dd29926a5546a3ede0b1f38a7915f218875ffd3f Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 5 Dec 2023 12:22:30 +0100 Subject: [PATCH 120/148] ipt and iup function exposed, parsing infile section "QUDA" --- include/quda_openqcd_interface.h | 37 +++++++++++++++++++++---- lib/openqcd_interface.cpp | 47 
+++++++++++++++++++++++++------- 2 files changed, 68 insertions(+), 16 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 60b4d1175b..a0d6680f7b 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -113,9 +113,9 @@ typedef struct { data[5+lex(ix,iy,iz,it)] returns rank number in openQCD, where lex stands for lexicographical indexing (in QUDA order (xyzt)) */ - bc_parms_t bc_parms; /** @see bc_parms() */ - dirac_parms_t dirac_parms; /** @see dirac_parms() */ - flds_parms_t flds_parms; /** @see flds_parms() */ + bc_parms_t (*bc_parms)(void); /** @see bc_parms() */ + flds_parms_t (*flds_parms)(void); /** @see flds_parms() */ + dirac_parms_t (*dirac_parms)(void); /** @see dirac_parms() */ void* (*h_gauge)(void); /** function to return a pointer to the gauge fields */ void* (*h_sw)(void); /** function to return a pointer to the clover fields */ } openQCD_QudaLayout_t; @@ -126,7 +126,6 @@ typedef struct { */ typedef struct { QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */ - openQCD_QudaLayout_t layout; /** Layout for QUDA to use */ FILE *logfile; /** log file handler */ void *gauge; /** base pointer to the gauge fields */ int volume; /** VOLUME */ @@ -158,9 +157,10 @@ typedef struct { * Initialize the QUDA context. 
* * @param[in] init Meta data for the QUDA context - * @param[in] layout The layout struct + * @param[in] layout Layout struct + * @param infile Input file */ -void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout); +void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, char *infile); /** @@ -178,6 +178,31 @@ void openQCD_qudaFinalize(void); void openQCD_back_and_forth(void *h_in, void *h_out); +/** + * @brief Wrapper around openqcd::ipt + * + * @param[in] x Euclidean corrdinate in txyz convention + * + * @return ipt[x] + * + * @see openqcd::ipt() + */ +int openQCD_qudaIndexIpt(const int *x); + + +/** + * @brief Wrapper around openqcd::iup + * + * @param[in] x Euclidean corrdinate in txyz convention + * @param[in] mu Direction + * + * @return iup[x][mu] + * + * @see openqcd::iup() + */ +int openQCD_qudaIndexIup(const int *x, const int mu); + + /** * @brief Norm square in QUDA. * diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 747eebb38a..89878ec217 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -416,9 +417,10 @@ static int rankFromCoords(const int *coords, void *fdata) /** * Set set the local dimensions and machine topology for QUDA to use * - * @param layout Struct defining local dimensions and machine topology + * @param layout Struct defining local dimensions and machine topology + * @param infile Input file */ -void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) +void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout, char *infile) { int my_rank; char prefix[20]; @@ -446,6 +448,14 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout) MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); sprintf(prefix, "QUDA (rank=%d): ", my_rank); + if (my_rank == 0 && infile != nullptr) { + KeyValueStore kv; + kv.set_map(&enum_map); + kv.load(infile); + qudaState.init.verbosity = kv.get("QUDA", "verbosity", 
qudaState.init.verbosity); + } + + MPI_Bcast((void*) &qudaState.init.verbosity, sizeof(qudaState.init.verbosity), MPI_INT, 0, MPI_COMM_WORLD); setVerbosityQuda(qudaState.init.verbosity, prefix, qudaState.init.logfile); initQuda(device); } @@ -525,14 +535,14 @@ static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec) } -void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout) +void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, char *infile) { if (qudaState.initialized) return; qudaState.init = init; qudaState.layout = layout; qudaopenqcd_called(__func__); - openQCD_qudaSetLayout(qudaState.layout); + openQCD_qudaSetLayout(qudaState.layout, infile); qudaopenqcd_called(__func__); qudaState.initialized = true; } @@ -698,6 +708,23 @@ void openQCD_back_and_forth(void *h_in, void *h_out) } +int openQCD_qudaIndexIpt(const int *x) +{ + int L_openqcd[4]; + openqcd::rotate_coords(qudaState.layout.L, L_openqcd); + return openqcd::ipt(x, L_openqcd); +} + + +int openQCD_qudaIndexIup(const int *x, const int mu) +{ + int L_openqcd[4], nproc_openqcd[4]; + openqcd::rotate_coords(qudaState.layout.L, L_openqcd); + openqcd::rotate_coords(qudaState.layout.nproc, nproc_openqcd); + return openqcd::iup(x, mu, L_openqcd, nproc_openqcd); +} + + double openQCD_qudaNorm(void *h_in) { QudaInvertParam param = newOpenQCDParam(); @@ -886,7 +913,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->dirac_order = QUDA_OPENQCD_DIRAC_ORDER; param->gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; param->dslash_type = QUDA_WILSON_DSLASH; - param->kappa = 1.0/(2.0*(qudaState.layout.dirac_parms.m0+4.0)); + param->kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); param->mu = 0.0; param->dagger = QUDA_DAG_NO; param->solution_type = QUDA_MAT_SOLUTION; @@ -896,18 +923,18 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->inv_type_precondition = QUDA_INVALID_INVERTER; /* disables any preconditioning */ 
param->mass_normalization = QUDA_MASS_NORMALIZATION; - if (qudaState.layout.dirac_parms.su3csw != 0.0) { + if (qudaState.layout.dirac_parms().su3csw != 0.0) { param->clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; - param->clover_csw = qudaState.layout.dirac_parms.su3csw; + param->clover_csw = qudaState.layout.dirac_parms().su3csw; param->clover_coeff = 0.0; /* Set to Wilson Dirac operator with Clover term */ param->dslash_type = QUDA_CLOVER_WILSON_DSLASH; - if (qudaState.layout.flds_parms.gauge == OPENQCD_GAUGE_SU3) { + if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; /* what implication has this? */ param->compute_clover = true; } else { @@ -1164,8 +1191,8 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) POP_RANGE; } - if (qudaState.layout.dirac_parms.su3csw != 0.0) { - if (qudaState.layout.flds_parms.gauge == OPENQCD_GAUGE_SU3) { + if (qudaState.layout.dirac_parms().su3csw != 0.0) { + if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { /** * Leaving both h_clover = h_clovinv = NULL allocates the clover field on * the GPU and finally calls @createCloverQuda to calculate the clover From 43a1b4fdea3003f20d85c4cfdeb991ae8b6bf1c2 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 6 Dec 2023 12:46:37 +0100 Subject: [PATCH 121/148] the Wilson and Wilson-clover Dirac operator are gamma5 Hermtitian --- include/dirac_quda.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/dirac_quda.h b/include/dirac_quda.h index 1aa339139f..225f4e6567 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -2606,6 +2606,9 @@ namespace quda { && (pc_type == QUDA_MATPC_EVEN_EVEN || pc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)) return true; + if (dirac_type == QUDA_WILSON_DIRAC || dirac_type == QUDA_CLOVER_DIRAC) + return true; + return false; } }; From 
23fefb1699f3deaf9b21f5901e8d543833b5be7e Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 6 Dec 2023 13:11:35 +0100 Subject: [PATCH 122/148] added functionality to compute both (left and right) singular vectors in computeSVD --- include/eigensolve_quda.h | 7 ++++--- lib/eig_block_trlm.cpp | 2 +- lib/eig_iram.cpp | 2 +- lib/eig_trlm.cpp | 2 +- lib/eigensolve_quda.cpp | 11 ++++++++--- 5 files changed, 15 insertions(+), 9 deletions(-) diff --git a/include/eigensolve_quda.h b/include/eigensolve_quda.h index 06472f069b..f249aa2660 100644 --- a/include/eigensolve_quda.h +++ b/include/eigensolve_quda.h @@ -258,10 +258,11 @@ namespace quda /** @brief Computes Left/Right SVD from pre computed Right/Left - @param[in] evecs Computed eigenvectors of NormOp - @param[in] evals Computed eigenvalues of NormOp + @param[in,out] evecs Computed eigenvectors of NormOp + @param[in,out] evals Computed eigenvalues of NormOp + @param[in] dagger Whether NormOp was MdagM (false) or MMdag (true) */ - void computeSVD(std::vector &evecs, std::vector &evals); + void computeSVD(std::vector &evecs, std::vector &evals, bool dagger = false); /** @brief Compute eigenvalues and their residiua diff --git a/lib/eig_block_trlm.cpp b/lib/eig_block_trlm.cpp index 160af9dff4..5c3065794a 100644 --- a/lib/eig_block_trlm.cpp +++ b/lib/eig_block_trlm.cpp @@ -197,7 +197,7 @@ namespace quda // Compute eigenvalues computeEvals(kSpace, evals); - if (compute_svd) computeSVD(kSpace, evals); + if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger); } // Local clean-up diff --git a/lib/eig_iram.cpp b/lib/eig_iram.cpp index 4817cf0fc7..626d552d92 100644 --- a/lib/eig_iram.cpp +++ b/lib/eig_iram.cpp @@ -515,7 +515,7 @@ namespace quda // Compute the eigen/singular values. 
profile.TPSTART(QUDA_PROFILE_COMPUTE); computeEvals(kSpace, evals); - if (compute_svd) computeSVD(kSpace, evals); + if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger); converged = true; } else { diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp index 00d3941527..0381f6e6a9 100644 --- a/lib/eig_trlm.cpp +++ b/lib/eig_trlm.cpp @@ -173,7 +173,7 @@ namespace quda // Compute eigenvalues/singular values computeEvals(kSpace, evals); - if (compute_svd) computeSVD(kSpace, evals); + if (compute_svd) computeSVD(kSpace, evals, eig_param->use_dagger); } // Local clean-up diff --git a/lib/eigensolve_quda.cpp b/lib/eigensolve_quda.cpp index 710d6ac13a..071a192784 100644 --- a/lib/eigensolve_quda.cpp +++ b/lib/eigensolve_quda.cpp @@ -502,7 +502,7 @@ namespace quda } } - void EigenSolver::computeSVD(std::vector &evecs, std::vector &evals) + void EigenSolver::computeSVD(std::vector &evecs, std::vector &evals, bool dagger) { logQuda(QUDA_SUMMARIZE, "Computing SVD of M\n"); @@ -527,8 +527,13 @@ namespace quda // Lambda already contains the square root of the eigenvalue of the norm op. 
Complex lambda = evals[i]; - // M*Rev_i = M*Rsv_i = sigma_i Lsv_i - mat.Expose()->M(evecs[n_conv + i], evecs[i]); + if (dagger) { + // Mdag*Lev_i = Mdag*Lsv_i = sigma_i Rsv_i + mat.Expose()->Mdag(evecs[n_conv + i], evecs[i]); + } else { + // M*Rev_i = M*Rsv_i = sigma_i Lsv_i + mat.Expose()->M(evecs[n_conv + i], evecs[i]); + } // sigma_i = sqrt(sigma_i (Lsv_i)^dag * sigma_i * Lsv_i ) sigma_tmp[i] = sqrt(blas::norm2(evecs[n_conv + i])); From 848e6d919426b9737954e83ade7b57490b649894 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 6 Dec 2023 13:12:55 +0100 Subject: [PATCH 123/148] added comments for eigensolver interface --- include/quda_openqcd_interface.h | 37 +++++++++++++++++++++++++++++++- lib/openqcd_interface.cpp | 31 ++++++++++---------------- 2 files changed, 47 insertions(+), 21 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index a0d6680f7b..bba33f8d98 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -251,7 +251,6 @@ void openQCD_qudaSpinorFree(void** quda_field); * @param[in] p Dirac parameter struct */ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p); -void openQCD_qudaDdagD(void *src, void *dst, openQCD_QudaDiracParam_t p); void openQCD_qudaDw2(void *param, double mu, void *src, void *dst); @@ -305,8 +304,44 @@ double openQCD_qudaInvert(void *param, double mu, void *source, void *solution, void openQCD_qudaSolverDestroy(void *param); +/** + * Setup the eigen-solver interface to quda. This function parses the file + * given by [infile] as an openQCD ini file. The solver section given by the + * [inv_section] parameter must have a key-value pair like solver = QUDA and may + * contain every member of the struct [QudaInvertParam]. See + * [openQCD_qudaSolverSetup] for more details about the solver. The eigen-solver + * section given by the [section] parameter may contain every member of the + * struct [QudaEigParam]. 
+ * + * @param[in] infile Ini-file containing sections about the eigen-solver + * @param[in] section The section name of the eigen-solver + * @param[in] inv_section The section name of the solver + * + * @return Pointer to the eigen-solver context + */ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_section); + + +/** + * @brief Solve the eigenvalue problem configured by the given + * eigen-solver context. All fields passed and returned are host + * (CPU) fields in openQCD order. This function requires an + * existing eigen-solver context created with openQCD_qudaEigensolverSetup(). + * + * @param[inout] param Pointer returned by openQCD_qudaEigensolverSetup() + * @param[inout] h_evecs Allocated array of void-pointers to param->n_conv + * fields + * @param[out] h_evals Allocated array of param->n_conv complex_dbles + */ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals); + + +/** + * @brief Destroys an existing eigen-solver context and frees all involved + * structs. 
+ * + * @param param Pointer to the context to destroy + */ void openQCD_qudaEigensolverDestroy(void *param); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 89878ec217..0dc8a9792a 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -866,18 +866,6 @@ void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) } -void openQCD_qudaDdagD(void *src, void *dst, openQCD_QudaDiracParam_t p) -{ - QudaInvertParam param = newOpenQCDDiracParam(p); - - /* both fields reside on the CPU */ - param.input_location = QUDA_CPU_FIELD_LOCATION; - param.output_location = QUDA_CPU_FIELD_LOCATION; - - MatDagMatQuda(static_cast(dst), static_cast(src), &param); -} - - void openQCD_qudaDw2(void *param, double mu, void *src, void *dst) { QudaInvertParam* inv_param = static_cast(param); @@ -949,9 +937,8 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) kv.load(infile); param->verbosity = kv.get(section, "verbosity", param->verbosity); - setVerbosity(param->verbosity); - if (param->verbosity >= QUDA_VERBOSE) { + if (param->verbosity >= QUDA_DEBUG_VERBOSE) { kv.dump(); } @@ -1295,9 +1282,8 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio kv.load(infile); verbosity = kv.get(section, "verbosity", verbosity); - setVerbosity(verbosity); - if (verbosity >= QUDA_VERBOSE) { + if (verbosity >= QUDA_DEBUG_VERBOSE) { kv.dump(); } @@ -1350,13 +1336,17 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio /* transfer of the struct to all the processes */ MPI_Bcast((void*) param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); - void *inv_param = openQCD_qudaSolverSetup(infile, inv_section); - param->invert_param = static_cast(inv_param); + if (inv_section != nullptr) { + void *inv_param = openQCD_qudaSolverSetup(infile, inv_section); + param->invert_param = static_cast(inv_param); + } else { + param->invert_param = new QudaInvertParam(newQudaInvertParam()); + } - if (verbosity >= 
QUDA_DEBUG_VERBOSE) { + if (inv_section != nullptr && verbosity >= QUDA_DEBUG_VERBOSE) { printQudaEigParam(param); } - if (verbosity >= QUDA_DEBUG_VERBOSE) { + if (inv_section != nullptr && verbosity >= QUDA_DEBUG_VERBOSE) { printQudaEigParam(param); } @@ -1376,6 +1366,7 @@ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) logQuda(QUDA_SUMMARIZE, "openQCD_qudaEigensolve()\n"); logQuda(QUDA_SUMMARIZE, " gflops = %.2e\n", eig_param->gflops); logQuda(QUDA_SUMMARIZE, " secs = %.2e\n", eig_param->secs); + logQuda(QUDA_SUMMARIZE, " iter = %d\n", eig_param->invert_param->iter); } From 02b84810ec43a857e66b4f4997657e7bb792f6ce Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 8 Dec 2023 14:12:02 +0100 Subject: [PATCH 124/148] some convenience mappings for the enums --- lib/openqcd_interface.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 0dc8a9792a..aa830be542 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -144,7 +144,17 @@ std::unordered_map enum_map = { {"QUDA_DOUBLE_PRECISION", std::to_string(QUDA_DOUBLE_PRECISION)}, {"QUDA_INVALID_PRECISION", std::to_string(QUDA_INVALID_PRECISION)}, {"QUDA_BOOLEAN_FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"false", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"no", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"n", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"off", std::to_string(QUDA_BOOLEAN_FALSE)}, {"QUDA_BOOLEAN_TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"true", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"yes", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"y", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"on", std::to_string(QUDA_BOOLEAN_TRUE)}, {"QUDA_BOOLEAN_INVALID", std::to_string(QUDA_BOOLEAN_INVALID)}, {"QUDA_COMPUTE_NULL_VECTOR_NO", std::to_string(QUDA_COMPUTE_NULL_VECTOR_NO)}, {"QUDA_COMPUTE_NULL_VECTOR_YES", 
std::to_string(QUDA_COMPUTE_NULL_VECTOR_YES)}, From ae170fe25f1bd80dec57ae1a39ea5d328f9ec20c Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 8 Dec 2023 14:12:37 +0100 Subject: [PATCH 125/148] in TRLM: n_kr must be >= n_conv+12, else iter_keep may become negatice (ends in a segfault) --- lib/eig_trlm.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/eig_trlm.cpp b/lib/eig_trlm.cpp index 0381f6e6a9..f407972d21 100644 --- a/lib/eig_trlm.cpp +++ b/lib/eig_trlm.cpp @@ -28,7 +28,8 @@ namespace quda beta.resize(n_kr, 0.0); // Thick restart specific checks - if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be greater than n_ev+6=%d\n", n_kr, n_ev + 6); + if (n_kr < n_ev + 6) errorQuda("n_kr=%d must be greater than or equal to n_ev+6=%d\n", n_kr, n_ev + 6); + if (n_kr < n_conv + 12) errorQuda("n_kr=%d must be greater than or equal to n_conv+12=%d\n", n_kr, n_conv + 12); if (!(eig_param->spectrum == QUDA_SPECTRUM_LR_EIG || eig_param->spectrum == QUDA_SPECTRUM_SR_EIG)) { errorQuda("Only real spectrum type (LR or SR) can be passed to the TR Lanczos solver"); From d56921e4e8f14299f0fc57086fc6f4674907f5b9 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 12 Dec 2023 11:04:13 +0100 Subject: [PATCH 126/148] verbose error message if section is not a quda section --- include/quda_openqcd_interface.h | 4 +++- lib/openqcd_interface.cpp | 5 +++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index bba33f8d98..7694aeb1c1 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -315,7 +315,9 @@ void openQCD_qudaSolverDestroy(void *param); * * @param[in] infile Ini-file containing sections about the eigen-solver * @param[in] section The section name of the eigen-solver - * @param[in] inv_section The section name of the solver + * @param[in] inv_section The section name of the solver. 
If NULL, the section + * is not read in and the gauge and clover fields are + * not transfered/updated. * * @return Pointer to the eigen-solver context */ diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index aa830be542..41dce27f0c 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -71,7 +71,7 @@ template void inline qudaopenqcd_called(const char *func, QudaVerbo POP_RANGE; } - #ifdef QUDA_OPENQCD_VERBOSE +#ifdef QUDA_OPENQCD_VERBOSE if (verb >= QUDA_VERBOSE) { if (start) { printfQuda("QUDA_OPENQCD_INTERFACE: %s (called) \n", func); @@ -953,7 +953,8 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) } if (kv.get(section, "solver") != "QUDA") { - errorQuda("Solver section %s is not a quda-solver section\n", section); + errorQuda("Solver section \"%s\" in file %s is not a valid quda-solver section (solver = %s)\n", + section, infile, kv.get(section, "solver").c_str()); } /* both fields reside on the CPU */ From 886c90fee6fb72e6fb4cd58296941b5cf2a14d4f Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 15 Dec 2023 19:33:55 +0200 Subject: [PATCH 127/148] more verbose error msg when solver section is not a QUDA one --- lib/openqcd_interface.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index aa830be542..dfe5da7ae9 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -506,7 +506,11 @@ static QudaInvertParam newOpenQCDParam(void) */ param.dirac_order = QUDA_OPENQCD_DIRAC_ORDER; - /* Gamma basis of the input and output host fields */ + /** + * Gamma basis of the input and output host fields. Specifies the basis change + * into QUDAs internal gamma basis. Note that QUDA applies the basis change U + * to a spinor field when uploading and U^dagger when downloading. 
+ */ param.gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; return param; @@ -953,7 +957,8 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) } if (kv.get(section, "solver") != "QUDA") { - errorQuda("Solver section %s is not a quda-solver section\n", section); + errorQuda("Solver section \"%s\" in file %s is not a valid quda-solver section (solver = %s)\n", + section, infile, kv.get(section, "solver").c_str()); } /* both fields reside on the CPU */ From e6ce72841bbc02dbf16deb05f27e31f5d5b93339 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 15 Dec 2023 19:34:47 +0200 Subject: [PATCH 128/148] added function comm_dim_cstar to query if a dimension is C* --- include/comm_quda.h | 7 +++++++ include/communicator_quda.h | 9 ++++++++- lib/communicator_stack.cpp | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/include/comm_quda.h b/include/comm_quda.h index 8b855f5b0a..c573db5da1 100644 --- a/include/comm_quda.h +++ b/include/comm_quda.h @@ -49,6 +49,13 @@ namespace quda */ int comm_dim(int dim); + /** + Return whether the dimension dim is a C* dimension or not + @param dim Dimension which we are querying + @return C* dimension or nor + */ + bool comm_dim_cstar(int dim); + /** Return the coording of this process in the dimension dim @param dim Dimension which we are querying diff --git a/include/communicator_quda.h b/include/communicator_quda.h index f63bd93f28..6eb7c406ae 100644 --- a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -142,7 +142,8 @@ namespace quda )) { // if we go over the boundary and have a shifted boundary condition, // we shift Nx/2 ranks in x-direction: - // shift_integer in {-1, 0, 1} + // shift_integer in { 0, 1, 2} + // (shift_integer - 1) in {-1, 0, 1} shift_integer = (comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i]) / comm_dims(topo)[i]; Nx_displacement += (shift_integer - 1) * (comm_dims(topo)[0]/2); } @@ -408,6 +409,12 @@ namespace quda return comm_dims(topo)[dim]; } + bool 
comm_dim_cstar(int dim) + { + Topology *topo = comm_default_topology(); + return (topo->cstar >= 2 && dim == 1) || (topo->cstar >= 3 && dim == 2); + } + int comm_coord(int dim) { Topology *topo = comm_default_topology(); diff --git a/lib/communicator_stack.cpp b/lib/communicator_stack.cpp index fe9d1faaba..9351f10a81 100644 --- a/lib/communicator_stack.cpp +++ b/lib/communicator_stack.cpp @@ -62,6 +62,8 @@ namespace quda int comm_dim(int dim) { return get_current_communicator().comm_dim(dim); } + bool comm_dim_cstar(int dim) { return get_current_communicator().comm_dim_cstar(dim); } + int comm_coord(int dim) { return get_current_communicator().comm_coord(dim); } int comm_rank_from_coords(const int *coords) { return get_current_communicator().comm_rank_from_coords(coords); } From ba9ace106072c37b92a490857f9a318f3f5ce66c Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 15 Dec 2023 19:46:49 +0200 Subject: [PATCH 129/148] solved P2P communication bug when C* boundaries are active --- lib/targets/cuda/comm_target.cpp | 4 ++-- lib/targets/hip/comm_target.cpp | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/targets/cuda/comm_target.cpp b/lib/targets/cuda/comm_target.cpp index bd9fa1fd3e..b8bd74d770 100644 --- a/lib/targets/cuda/comm_target.cpp +++ b/lib/targets/cuda/comm_target.cpp @@ -75,8 +75,8 @@ namespace quda // TODO: We maybe can force loopback comms to use the IB path here if (comm_dim(dim) == 1) continue; #endif - // even if comm_dim(2) == 2, we might not have p2p enabled in both directions, so check this - const int num_dir = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2; + // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this + const int num_dir = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 
1 : 2; for (int dir = 0; dir < num_dir; dir++) { remote[dim][dir] = nullptr; #ifndef NVSHMEM_COMMS diff --git a/lib/targets/hip/comm_target.cpp b/lib/targets/hip/comm_target.cpp index cbb4725253..622f73b9ee 100644 --- a/lib/targets/hip/comm_target.cpp +++ b/lib/targets/hip/comm_target.cpp @@ -69,9 +69,8 @@ namespace quda // open the remote memory handles and set the send ghost pointers for (int dim = 0; dim < 4; ++dim) { if (comm_dim(dim) == 1) continue; - // even if comm_dim(2) == 2, we might not have p2p enabled in both directions, so check this - const int num_dir - = (comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2; + // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this + const int num_dir = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2; for (int dir = 0; dir < num_dir; ++dir) { remote[dim][dir] = nullptr; if (!comm_peer2peer_enabled(dir, dim)) continue; From 1b1a55cb2e7f95a049b181d30135574e366c982e Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 5 Feb 2024 17:44:35 +0100 Subject: [PATCH 130/148] added packed formats --- include/quda_openqcd_interface.h | 6 ++++-- lib/check_params.h | 1 + lib/openqcd_interface.cpp | 27 +++++++++++++++------------ 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 7694aeb1c1..eebd8843b8 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -365,8 +365,9 @@ double openQCD_qudaPlaquette(void); * * @param[in] gauge The gauge fields (in openqcd order) * @param[in] prec Precision of the incoming gauge field + * @param[in] rec How the field should be stored internally in QUDA */ -void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec); +void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec); /** @@ -374,8 +375,9 
@@ void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec); * * @param[out] gauge The gauge fields (will be stored in openqcd order) * @param[in] prec Precision of the outgoing gauge field + * @param[in] rec How the field should be stored internally in QUDA */ -void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec); +void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec); /** diff --git a/lib/check_params.h b/lib/check_params.h index 00892741f2..1e7e9263b6 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -210,6 +210,7 @@ void printQudaEigParam(QudaEigParam *param) { P(use_norm_op, QUDA_BOOLEAN_INVALID); P(compute_svd, QUDA_BOOLEAN_INVALID); P(require_convergence, QUDA_BOOLEAN_INVALID); + P(spectrum, QUDA_SPECTRUM_INVALID); P(n_ev, INVALID_INT); P(n_kr, INVALID_INT); P(n_conv, INVALID_INT); diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 41dce27f0c..42412b5fb8 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -517,10 +517,11 @@ static QudaInvertParam newOpenQCDParam(void) * @brief Initialize quda gauge param struct * * @param[in] prec precision + * @param[in] rec QUDA internal gauge field format * * @return The quda gauge parameter struct. */ -static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec) +static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec, QudaReconstructType rec = QUDA_RECONSTRUCT_NO) { QudaGaugeParam param = newQudaGaugeParam(); @@ -528,13 +529,13 @@ static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec) param.cuda_prec_sloppy = param.cpu_prec = param.cuda_prec = prec; param.type = QUDA_SU3_LINKS; - param.reconstruct_sloppy = param.reconstruct = QUDA_RECONSTRUCT_NO; + param.reconstruct_sloppy = param.reconstruct = rec; - /* This make quda to instantiate OpenQCDOrder */ + /* This makes quda to instantiate OpenQCDOrder */ param.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; - /* Seems to have no effect ... 
*/ - param.t_boundary = QUDA_PERIODIC_T; + /* Seems to have only effect if reconstruct != QUDA_RECONSTRUCT_NO ... */ + param.t_boundary = QUDA_ANTI_PERIODIC_T; param.gauge_fix = QUDA_GAUGE_FIXED_NO; param.scale = 1.0; @@ -579,17 +580,17 @@ double openQCD_qudaPlaquette(void) } -void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec) +void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec) { - QudaGaugeParam param = newOpenQCDGaugeParam(prec); + QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec); loadGaugeQuda(gauge, ¶m); qudaState.gauge_loaded = true; } -void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec) +void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec) { - QudaGaugeParam param = newOpenQCDGaugeParam(prec); + QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec); void* buffer = pool_pinned_malloc((4*qudaState.init.volume + 7*qudaState.init.bndry/4)*18*prec); saveGaugeQuda(buffer, ¶m); @@ -1185,13 +1186,15 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) logQuda(QUDA_VERBOSE, "Loading gauge field from openQCD ...\n"); void *h_gauge = qudaState.layout.h_gauge(); PUSH_RANGE("openQCD_qudaGaugeLoad",3); - openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION); + QudaReconstructType rec = qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3 ? QUDA_RECONSTRUCT_8 : QUDA_RECONSTRUCT_9; + openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION, rec); POP_RANGE; } - if (qudaState.layout.dirac_parms().su3csw != 0.0) { + if (param->clover_csw != 0.0) { if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { /** + * SU3 case: * Leaving both h_clover = h_clovinv = NULL allocates the clover field on * the GPU and finally calls @createCloverQuda to calculate the clover * field. @@ -1202,7 +1205,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) POP_RANGE; } else { /** - * Transfer the SW-field from openQCD. + * U3 case: Transfer the SW-field from openQCD. 
*/ logQuda(QUDA_VERBOSE, "Loading clover field from openQCD ...\n"); void *h_sw = qudaState.layout.h_sw(); From dd92262124fc215ae8f3fc2463d04d3f7b8af752 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 7 Feb 2024 18:00:21 +0100 Subject: [PATCH 131/148] dynamic t_boundary --- include/quda_openqcd_interface.h | 18 ++++++++------- lib/openqcd_interface.cpp | 39 +++++++++++++++++++++++--------- 2 files changed, 38 insertions(+), 19 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index eebd8843b8..2c656f0c4e 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -363,21 +363,23 @@ double openQCD_qudaPlaquette(void); * fields have to be up2date; i.e. call copy_bnd_hd(), copy_bnd_ud() * before pass fields into this function. * - * @param[in] gauge The gauge fields (in openqcd order) - * @param[in] prec Precision of the incoming gauge field - * @param[in] rec How the field should be stored internally in QUDA + * @param[in] gauge The gauge fields (in openqcd order) + * @param[in] prec Precision of the incoming gauge field + * @param[in] rec How the field should be stored internally in QUDA + * @param[in] t_boundary Time boundary condition */ -void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec); +void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary); /** * @brief Save the gauge fields from quda to host. 
* - * @param[out] gauge The gauge fields (will be stored in openqcd order) - * @param[in] prec Precision of the outgoing gauge field - * @param[in] rec How the field should be stored internally in QUDA + * @param[out] gauge The gauge fields (will be stored in openqcd order) + * @param[in] prec Precision of the outgoing gauge field + * @param[in] rec How the field should be stored internally in QUDA + * @param[in] t_boundary Time boundary condition */ -void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec); +void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary); /** diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index b1d6fbd524..cf393e6018 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -520,12 +520,13 @@ static QudaInvertParam newOpenQCDParam(void) /** * @brief Initialize quda gauge param struct * - * @param[in] prec precision - * @param[in] rec QUDA internal gauge field format + * @param[in] prec Precision + * @param[in] rec QUDA internal gauge field format + * @param[in] t_boundary Time boundary condition * * @return The quda gauge parameter struct. */ -static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec, QudaReconstructType rec = QUDA_RECONSTRUCT_NO) +static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) { QudaGaugeParam param = newQudaGaugeParam(); @@ -538,9 +539,7 @@ static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec, QudaReconstructTy /* This makes quda to instantiate OpenQCDOrder */ param.gauge_order = QUDA_OPENQCD_GAUGE_ORDER; - /* Seems to have only effect if reconstruct != QUDA_RECONSTRUCT_NO ... 
*/ - param.t_boundary = QUDA_ANTI_PERIODIC_T; - + param.t_boundary = t_boundary; param.gauge_fix = QUDA_GAUGE_FIXED_NO; param.scale = 1.0; param.anisotropy = 1.0; /* 1.0 means not anisotropic */ @@ -584,17 +583,17 @@ double openQCD_qudaPlaquette(void) } -void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec) +void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) { - QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec); + QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary); loadGaugeQuda(gauge, ¶m); qudaState.gauge_loaded = true; } -void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec) +void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) { - QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec); + QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary); void* buffer = pool_pinned_malloc((4*qudaState.init.volume + 7*qudaState.init.bndry/4)*18*prec); saveGaugeQuda(buffer, ¶m); @@ -1191,7 +1190,25 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) void *h_gauge = qudaState.layout.h_gauge(); PUSH_RANGE("openQCD_qudaGaugeLoad",3); QudaReconstructType rec = qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3 ? QUDA_RECONSTRUCT_8 : QUDA_RECONSTRUCT_9; - openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION, rec); + + /** + * We set t_boundary = QUDA_ANTI_PERIODIC_T. This setting is a label that + * tells QUDA the current state of the residing gauge field, that is the + * same state as the one we transfer from openqxd. In openqxd the hdfld + * exhibits phases of -1 for the temporal time boundaries, meaning that the + * gauge fields are explicitly multiplied by -1 on the t=0 time slice, see + * chs_hd0() in hflds.c. 
The QUDA_ANTI_PERIODIC_T flag says that these + * phases are incorporated into the field and that QUDA has to add these + * phases on the t=0 time slice when reconstructing the field from + * QUDA_RECONSTRUCT_8/12, but not from QUDA_RECONSTRUCT_NO. In case of + * QUDA_RECONSTRUCT_NO the value of t_boundary has no effect. + * + * @see https://github.com/lattice/quda/issues/1315 + * @see Reconstruct#Unpack() in gauge_field_order.h + * @see Reconstruct<8,...>#Unpack() in gauge_field_order.h + * @see Reconstruct<12,...>#Unpack() in gauge_field_order.h + */ + openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION, rec, QUDA_ANTI_PERIODIC_T); POP_RANGE; } From 20e59ba5b933a30fe668986df5f037146c86ec40 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 7 Feb 2024 18:01:09 +0100 Subject: [PATCH 132/148] enable RECONSTRUCT_8/9/12/13/NO for openqxd interface --- include/instantiate.h | 10 ++++++++++ include/instantiate_dslash.h | 12 ++++++++++++ lib/copy_gauge_extended.cu | 8 ++++---- lib/copy_gauge_inc.cu | 8 ++++---- lib/instantiate.cpp | 8 ++++++++ 5 files changed, 38 insertions(+), 8 deletions(-) diff --git a/include/instantiate.h b/include/instantiate.h index 5748bba54a..f846ece347 100644 --- a/include/instantiate.h +++ b/include/instantiate.h @@ -112,8 +112,13 @@ namespace quda }; struct ReconstructWilson { +#ifdef BUILD_OPENQCD_INTERFACE + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9}; +#else static constexpr std::array recon = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8}; +#endif }; struct ReconstructStaggered { @@ -498,8 +503,13 @@ namespace quda // these are used in dslash.h struct WilsonReconstruct { +#ifdef BUILD_OPENQCD_INTERFACE + static constexpr std::array recon + = {QUDA_RECONSTRUCT_NO, QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8, QUDA_RECONSTRUCT_13, QUDA_RECONSTRUCT_9}; +#else static constexpr std::array recon = {QUDA_RECONSTRUCT_NO,
QUDA_RECONSTRUCT_12, QUDA_RECONSTRUCT_8}; +#endif }; struct StaggeredReconstruct { diff --git a/include/instantiate_dslash.h b/include/instantiate_dslash.h index d9c8144de4..c7f59b16c7 100644 --- a/include/instantiate_dslash.h +++ b/include/instantiate_dslash.h @@ -37,6 +37,18 @@ namespace quda Apply(out, in, U, args...); #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8/9", QUDA_RECONSTRUCT); +#endif + } else if (Recon::recon.size() > 3 && U.Reconstruct() == Recon::recon[3]) { +#if QUDA_RECONSTRUCT & 2 + Apply(out, in, U, args...); +#else + errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-13", QUDA_RECONSTRUCT); +#endif + } else if (Recon::recon.size() > 4 && U.Reconstruct() == Recon::recon[4]) { +#if QUDA_RECONSTRUCT & 1 + Apply(out, in, U, args...); +#else + errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif } else { errorQuda("Unsupported reconstruct type %d\n", U.Reconstruct()); diff --git a/lib/copy_gauge_extended.cu b/lib/copy_gauge_extended.cu index 01045642fc..94d5390d59 100644 --- a/lib/copy_gauge_extended.cu +++ b/lib/copy_gauge_extended.cu @@ -64,7 +64,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -79,7 +79,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else { errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order()); } @@ -147,7 +147,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if 
defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -162,7 +162,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else { errorQuda("Reconstruction %d and order %d not supported", in.Reconstruct(), in.Order()); } diff --git a/lib/copy_gauge_inc.cu b/lib/copy_gauge_inc.cu index 97af1e6ce2..d41a45de23 100644 --- a/lib/copy_gauge_inc.cu +++ b/lib/copy_gauge_inc.cu @@ -27,7 +27,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else if (out.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -56,7 +56,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else { errorQuda("Reconstruction %d and order %d not supported", out.Reconstruct(), out.Order()); } @@ -170,7 +170,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8", QUDA_RECONSTRUCT); #endif -#ifdef GPU_STAGGERED_DIRAC +#if defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else if (in.Reconstruct() == QUDA_RECONSTRUCT_13) { #if QUDA_RECONSTRUCT & 2 typedef typename gauge_mapper::type G; @@ -192,7 +192,7 @@ namespace quda { #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif -#endif // GPU_STAGGERED_DIRAC +#endif // defined(GPU_STAGGERED_DIRAC) || defined(BUILD_OPENQCD_INTERFACE) } else { errorQuda("Reconstruction %d and order %d 
not supported", in.Reconstruct(), in.Order()); } diff --git a/lib/instantiate.cpp b/lib/instantiate.cpp index 06e95ad1d6..ba63bbf060 100644 --- a/lib/instantiate.cpp +++ b/lib/instantiate.cpp @@ -12,7 +12,11 @@ namespace quda // declared in instantiate.h constexpr std::array ReconstructFull::recon; +#ifdef BUILD_OPENQCD_INTERFACE + constexpr std::array ReconstructWilson::recon; +#else constexpr std::array ReconstructWilson::recon; +#endif constexpr std::array ReconstructStaggered::recon; constexpr std::array ReconstructNo12::recon; constexpr std::array ReconstructNone::recon; @@ -20,7 +24,11 @@ namespace quda constexpr std::array Reconstruct10::recon; // declared in dslash.h +#ifdef BUILD_OPENQCD_INTERFACE + constexpr std::array WilsonReconstruct::recon; +#else constexpr std::array WilsonReconstruct::recon; +#endif constexpr std::array StaggeredReconstruct::recon; } // namespace quda From d40d6e5fb5bc8de5a96825816746d06f521ce4e5 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 13 Feb 2024 19:56:28 +0100 Subject: [PATCH 133/148] fixed eigensolver, when cuda_prec_eigensolver is not set in the infile --- lib/openqcd_interface.cpp | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index cf393e6018..b2a00b831a 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -495,6 +495,7 @@ static QudaInvertParam newOpenQCDParam(void) param.cpu_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the input fermion fields */ param.cuda_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA solver */ + param.cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA eigensolver */ param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; /* The precision used by the QUDA solver */ param.cuda_prec_precondition = QUDA_HALF_PRECISION; /* The precision used by the QUDA solver */ @@ -616,6 +617,7 @@ void openQCD_qudaCloverLoad(void *clover, double 
kappa, double csw) param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param.clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; param.kappa = kappa; param.clover_csw = csw; @@ -657,6 +659,7 @@ static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) param.clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param.clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; param.clover_order = QUDA_FLOAT8_CLOVER_ORDER; /* what implication has this? */ param.compute_clover = true; @@ -910,6 +913,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->verbosity = QUDA_SUMMARIZE; param->cpu_prec = QUDA_DOUBLE_PRECISION; param->cuda_prec = QUDA_DOUBLE_PRECISION; + param->cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; param->cuda_prec_sloppy = QUDA_SINGLE_PRECISION; param->cuda_prec_precondition = QUDA_HALF_PRECISION; param->dirac_order = QUDA_OPENQCD_DIRAC_ORDER; @@ -929,6 +933,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? 
*/ param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param->clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; param->clover_csw = qudaState.layout.dirac_parms().su3csw; param->clover_coeff = 0.0; @@ -1089,7 +1094,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) /* (shallow) copy the struct */ *invert_param_mg = *param; - /* these have to be fixed */ + /* these have to be fixed, and cannot be overwritten by the input file */ invert_param_mg->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; invert_param_mg->dirac_order = QUDA_DIRAC_ORDER; @@ -1179,7 +1184,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) } } - /* transfer of the struct to all the processes */ + /* transfer of the struct to all processes */ MPI_Bcast((void*) param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); MPI_Bcast((void*) invert_param_mg, sizeof(*invert_param_mg), MPI_BYTE, 0, MPI_COMM_WORLD); MPI_Bcast((void*) multigrid_param, sizeof(*multigrid_param), MPI_BYTE, 0, MPI_COMM_WORLD); @@ -1322,6 +1327,11 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio kv.dump(); } + if (kv.get(section, "solver") != "QUDA") { + errorQuda("Eigensolver section \"%s\" in file %s is not a valid quda-eigensolver section (solver = %s)\n", + section, infile, kv.get(section, "solver").c_str()); + } + param->eig_type = kv.get(section, "eig_type", param->eig_type); param->use_poly_acc = kv.get(section, "use_poly_acc", param->use_poly_acc); param->poly_deg = kv.get(section, "poly_deg", param->poly_deg); @@ -1378,10 +1388,12 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio param->invert_param = new QudaInvertParam(newQudaInvertParam()); } - if (inv_section != nullptr && verbosity >= QUDA_DEBUG_VERBOSE) { - printQudaEigParam(param); + param->invert_param->verbosity = std::max(param->invert_param->verbosity, verbosity); + + if (inv_section != nullptr && 
param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { + printQudaInvertParam(param->invert_param); } - if (inv_section != nullptr && verbosity >= QUDA_DEBUG_VERBOSE) { + if (param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { printQudaEigParam(param); } From 6c7bb9d174fcfae516eb235efca28b61f2abff45 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 14 Feb 2024 15:13:11 +0100 Subject: [PATCH 134/148] only transfer/recalc gauge/clover field according to openqxd's flag DB --- include/quda_openqcd_interface.h | 1 + lib/openqcd_interface.cpp | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 2c656f0c4e..0c157f269f 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -118,6 +118,7 @@ typedef struct { dirac_parms_t (*dirac_parms)(void); /** @see dirac_parms() */ void* (*h_gauge)(void); /** function to return a pointer to the gauge fields */ void* (*h_sw)(void); /** function to return a pointer to the clover fields */ + void (*get_gfld_flags)(int *ud, int *ad); /** function pointer to field query function */ } openQCD_QudaLayout_t; diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index b2a00b831a..e24b3529a1 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -900,6 +900,10 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) { int my_rank; void *mgprec; + static int ad_rev = -1; + static int ud_rev = -1; + static int ad_rev_old = -2; + static int ud_rev_old = -2; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); @@ -1190,7 +1194,14 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) MPI_Bcast((void*) multigrid_param, sizeof(*multigrid_param), MPI_BYTE, 0, MPI_COMM_WORLD); multigrid_param->invert_param = invert_param_mg; - if (qudaState.layout.h_gauge != nullptr) { + /* get current residing gauge field revision (residing in openqxd) */ + 
qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev); + + /* whether to update the gauge/clover field or not */ + bool update = ud_rev != ud_rev_old || ad_rev != ad_rev_old; + logQuda(QUDA_VERBOSE, "Gauge/Clover field updated in openQxD according to flag-database: %s\n", update ? "true" : "false"); + + if (qudaState.layout.h_gauge != nullptr && update) { logQuda(QUDA_VERBOSE, "Loading gauge field from openQCD ...\n"); void *h_gauge = qudaState.layout.h_gauge(); PUSH_RANGE("openQCD_qudaGaugeLoad",3); @@ -1214,10 +1225,12 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) * @see Reconstruct<12,...>#Unpack() in gauge_field_order.h */ openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION, rec, QUDA_ANTI_PERIODIC_T); + ud_rev_old = ud_rev; + ad_rev_old = ad_rev; POP_RANGE; } - if (param->clover_csw != 0.0) { + if (param->clover_csw != 0.0 && update) { if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { /** * SU3 case: From 005aed97cace8b190dbad1fd6963296c9e818ac6 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 19 Feb 2024 19:01:58 +0100 Subject: [PATCH 135/148] added void pointer to QudaInvertParam struct for arbitrary additinal properties --- include/quda.h | 3 +++ lib/check_params.h | 4 ++++ lib/quda_fortran.F90 | 3 +++ 3 files changed, 10 insertions(+) diff --git a/include/quda.h b/include/quda.h index dcee3e3189..a9e8ec1cb2 100644 --- a/include/quda.h +++ b/include/quda.h @@ -447,6 +447,9 @@ extern "C" { /** Whether to use fused kernels for mobius */ QudaBoolean use_mobius_fused_kernel; + /** Additional user-defined properties */ + void *additional_prop; + } QudaInvertParam; /* Parameter set for solving eigenvalue problems. 
*/ diff --git a/lib/check_params.h b/lib/check_params.h index 1e7e9263b6..e6115807ff 100644 --- a/lib/check_params.h +++ b/lib/check_params.h @@ -728,6 +728,10 @@ void printQudaInvertParam(QudaInvertParam *param) { P(use_mobius_fused_kernel, QUDA_BOOLEAN_INVALID); #endif +#ifdef INIT_PARAM + P(additional_prop, 0); +#endif + #ifdef INIT_PARAM return ret; #endif diff --git a/lib/quda_fortran.F90 b/lib/quda_fortran.F90 index 01a792558a..02d832d8a8 100644 --- a/lib/quda_fortran.F90 +++ b/lib/quda_fortran.F90 @@ -359,6 +359,9 @@ module quda_fortran ! Whether to use the fused kernels for Mobius/DWF-4D dslash QudaBoolean :: use_mobius_fused_kernel + ! Additional user-defined properties + integer(8) :: additional_prop + end type quda_invert_param end module quda_fortran From ad1ab827dd3f8d7a2ec900bc948046c594527897 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Mon, 19 Feb 2024 19:11:09 +0100 Subject: [PATCH 136/148] changed solver interface to only deal with solver section IDs --- include/quda_openqcd_interface.h | 108 ++++-- lib/openqcd_interface.cpp | 547 +++++++++++++++++++++++-------- 2 files changed, 489 insertions(+), 166 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 0c157f269f..7557bdb69d 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -118,7 +118,8 @@ typedef struct { dirac_parms_t (*dirac_parms)(void); /** @see dirac_parms() */ void* (*h_gauge)(void); /** function to return a pointer to the gauge fields */ void* (*h_sw)(void); /** function to return a pointer to the clover fields */ - void (*get_gfld_flags)(int *ud, int *ad); /** function pointer to field query function */ + void (*get_gfld_flags)(int *ud, int *ad); /** function pointer to gauge field query function */ + void (*get_swfld_flags)(int *uswd, int *aswd); /** function pointer to SW field query function */ } openQCD_QudaLayout_t; @@ -140,11 +141,25 @@ typedef struct { int gauge_loaded; /** Whether 
openQCD_qudaGaugeLoad() was called or not */ int clover_loaded; /** Whether openQCD_qudaCloverLoad() was called or not */ int dslash_setup; /** Whether openQCD_qudaSetDslashOptions() was called or not */ + int ud_rev; /** Revision of ud field from openqxd */ + int ad_rev; /** Revision of ad field from openqxd */ openQCD_QudaInitArgs_t init; openQCD_QudaLayout_t layout; + void* handles[32]; /** Array of void-pointers to QudaInvertParam structs */ + char infile[1024]; /** Path to the input file (if given to quda_init()) */ } openQCD_QudaState_t; +typedef struct openQCD_QudaSolver_s { + char infile[1024]; /** Path to the input file (if given to quda_init()) */ + int id; /** Solver section identifier in the input file */ + QudaMultigridParam* mg_param; /** Pointer to the multigrid param struct */ + int ud_swd_rev; /** Revision of SU3 SW field from openqxd */ + int ad_swd_rev; /** Revision of U1 SW field from openqxd */ + double u1csw; /** Value of u1csw */ +} openQCD_QudaSolver; + + typedef struct { double kappa; /* kappa: hopping parameter */ double mu; /* mu: twisted mass */ @@ -257,52 +272,73 @@ void openQCD_qudaDw2(void *param, double mu, void *src, void *dst); /** * Setup the solver interface to quda. This function parses the file given by - * [infile] as an openQCD ini file. The solver section given by the [section] + * [infile] as an openQCD ini file. The solver section given by the [id] * parameter must have a key-value pair like solver = QUDA and may contain every * member of the struct [QudaInvertParam]. If one sets inv_type_precondition = * QUDA_MG_INVERTER, one can additionally use all the members from the struct - * [QudaMultigridParam] in a section called "{section} Multigrid", where - * {section} is replaced by [section]. For every level given by n_level in the - * above section, one has to provide a subsection called - * "{section} Multigrid Level {level}", where {level} runs from 0 to n_level-1. 
- * All these subsections may have keys given by all the array-valued members of - * QudaMultigridParam, for example smoother_tol may appear in all subsections. + * [QudaMultigridParam] in a section called "Solver {id} Multigrid", where {id} + * is replaced by [id]. For every level given by n_level in the above section, + * one has to provide a subsection called "Solver {id} Multigrid Level {level}", + * where {level} runs from 0 to n_level-1. All these subsections may have keys + * given by all the array-valued members of QudaMultigridParam, for example + * smoother_tol may appear in all subsections. * - * @param[in] infile Ini-file containing sections about the solver - * @param[in] section The section name + * @param[in] id The identifier of the solver section, i.e. "Solver #". The + * input file is taken from the arguments of quda_init(). * * @return Pointer to the solver context */ -void* openQCD_qudaSolverSetup(char *infile, char *section); + +void* openQCD_qudaSolverGetHandle(int id); /** - * @brief Solve Ax=b for an Clover Wilson operator with a multigrid - * solver. All fields are fields passed and returned are host - * (CPU) field in openQCD order. This function requires an - * existing solver context created with openQCD_qudaSolverSetup() - * - * @param[inout] param Pointer returned by openQCD_qudaSolverSetup() - * @param[in] mu Twisted mass - * @param[in] source The source - * @param[out] solution The solution - * @param[out] status If the function is able to solve the Dirac equation - * to the desired accuracy (invert_param->tol), status - * reports the total number of iteration steps. -1 - * indicates that the inversion failed. - * - * @return Residual + * @brief Return a hash from a subset of the settings in the + * QudaInvertParam struct. + * + * @param[in] id The solver identifier + * + * @return Hash value + */ +int openQCD_qudaSolverGetHash(int id); + + +/** + * @brief Prints solver information about the QUDA solver. 
+ * + * @param[in] id The solver identifier + */ +void openQCD_qudaSolverPrintSetup(int id); + + +/** + * @brief Solve Ax=b for a Clover Wilson operator with a multigrid solver. + * All fields passed and returned are host (CPU) fields in openQCD + * order. + * + * @param[in] id The solver identifier in the input file, i.e. + * "Solver #". The input file is the one given by + * quda_init + * @param[in] mu Twisted mass parameter + * @param[in] source The source + * @param[out] solution The solution + * @param[out] status If the function is able to solve the Dirac equation to + * the desired accuracy (invert_param->tol), status + * reports the total number of iteration steps. -1 + * indicates that the inversion failed. + * + * @return Residual */ -double openQCD_qudaInvert(void *param, double mu, void *source, void *solution, int *status); +double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int *status); /** * @brief Destroys an existing solver context and frees all involed * structs. * - * @param param Pointer to the context to destroy + * @param[in] id The solver identifier */ -void openQCD_qudaSolverDestroy(void *param); +void openQCD_qudaSolverDestroy(int id); /** @@ -314,15 +350,17 @@ void openQCD_qudaSolverDestroy(void *param); * section given by the [section] parameter may contain every member of the * struct [QudaEigParam]. * - * @param[in] infile Ini-file containing sections about the eigen-solver - * @param[in] section The section name of the eigen-solver - * @param[in] inv_section The section name of the solver. If NULL, the section - * is not read in and the gauge and clover fields are - * not transfered/updated. + * @param[in] infile Ini-file containing sections about the eigen-solver, + * if null we use the value of qudaState.infile + * @param[in] section The section name of the eigen-solver + * @param[in] solver_id The section id of the solver as in + * openQCD_qudaSolverGetHandle().
If -1, the section is not + * read in and the gauge and clover fields are not + * transfered/updated. * * @return Pointer to the eigen-solver context */ -void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_section); +void* openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id); /** diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index e24b3529a1..fe8332f397 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -16,12 +16,13 @@ #include #include #include +#include #include #define MAX(a, b) ((a) > (b) ? (a) : (b)) -static openQCD_QudaState_t qudaState = {false, false, false, false, {}, {}}; +static openQCD_QudaState_t qudaState = {false, false, false, false, -1, -1, {}, {}, {}, '\0'}; using namespace quda; @@ -458,10 +459,11 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout, char *infile) MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); sprintf(prefix, "QUDA (rank=%d): ", my_rank); - if (my_rank == 0 && infile != nullptr) { + strcpy(qudaState.infile, infile); + if (my_rank == 0) { KeyValueStore kv; kv.set_map(&enum_map); - kv.load(infile); + kv.load(qudaState.infile); qudaState.init.verbosity = kv.get("QUDA", "verbosity", qudaState.init.verbosity); } @@ -563,6 +565,13 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, } void openQCD_qudaFinalize(void) { + + for (int id=0; id<32; ++id) { + if (qudaState.handles[id] != nullptr) { + openQCD_qudaSolverDestroy(id); + } + } + qudaState.initialized = false; endQuda(); } @@ -896,14 +905,267 @@ void openQCD_qudaDw2(void *param, double mu, void *src, void *dst) } -void* openQCD_qudaSolverSetup(char *infile, char *section) + +/** + * @brief Check whether the gauge field from openQCD is in sync with the + * one from QUDA. 
+ * + * @return true/false + */ +inline bool gauge_field_get_up2date(void) +{ + int ud_rev = -2, ad_rev = -2; + + /* get current residing gauge field revision (residing in openqxd) */ + qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev); + logQuda(QUDA_VERBOSE, "Gauge field status according to get_gfld_flags: (ud,ad)=(%d,%d)\n", + ud_rev, ad_rev); + + return ud_rev == qudaState.ud_rev && ad_rev == qudaState.ad_rev; +} + + + +/** + * @brief Check whether the gauge field is not (yet) set in openQCD. + * + * @return true/false + */ +inline bool gauge_field_get_unset(void) +{ + int ud_rev = -2, ad_rev = -2; + qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev); + return ud_rev == 0 && ad_rev == 0; +} + + + +/** + * @brief Check if the revision of the clover field in openQCD coincides + * with the information on the parameter struct. + * + * @param param The parameter struct + * + * @return true/false + */ +inline bool clover_field_get_up2date(QudaInvertParam* param) +{ + int uswd_rev = -2, aswd_rev = -2; + + /* get current residing clover field revision (residing in openqxd) */ + qudaState.layout.get_swfld_flags(&uswd_rev, &aswd_rev); + logQuda(QUDA_VERBOSE, "Clover field status according to get_swfld_flags: (uswd,aswd)=(%d,%d)\n", + uswd_rev, aswd_rev); + + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + return uswd_rev == additional_prop->ud_swd_rev && aswd_rev == additional_prop->ad_swd_rev; +} + + + +/** + * @brief Set the global revisions numners for the SU(3) and U(1) fields. + */ +inline void gauge_field_set_up2date(void) +{ + qudaState.layout.get_gfld_flags(&qudaState.ud_rev, &qudaState.ad_rev); +} + + +/** + * @brief Set the su3csw corfficient and all related properties. 
+ * + * @param param The parameter struct + * @param[in] su3csw The su3csw coefficient + */ +inline void set_su3csw(QudaInvertParam* param, double su3csw) +{ + param->clover_csw = su3csw; + if (su3csw != 0.0) { + param->clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ + param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; + param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param->clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; + + param->clover_coeff = 0.0; + + /* Set to Wilson Dirac operator with Clover term */ + param->dslash_type = QUDA_CLOVER_WILSON_DSLASH; + + if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { + param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; /* what implication has this? */ + param->compute_clover = true; + } else { + param->clover_order = QUDA_OPENQCD_CLOVER_ORDER; + } + } +} + + + +/** + * @brief Check if the solver parameters are in sync with the parameters + * from openQCD. + * + * @param param_ The parameter struct + * + * @return Whether parameters are in sync or not + */ +int openQCD_qudaSolverCheck(void *param_) +{ + QudaInvertParam* param = static_cast(param_); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + bool ret = true; + + if (param->kappa != (1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)))) { + logQuda(QUDA_VERBOSE, "Property m0/kappa does not match in QudaInvertParam struct and openQxD:dirac_parms (%.2e, %.2e)\n", + (1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0))), param->kappa); + ret = false; + } + + if (additional_prop->u1csw != qudaState.layout.dirac_parms().u1csw) { + logQuda(QUDA_VERBOSE, "Property u1csw does not match in QudaInvertParam struct and openQxD:dirac_parms (%.2e, %.2e)\n", + qudaState.layout.dirac_parms().u1csw, additional_prop->u1csw); + ret = false; + } + + if (param->clover_csw != qudaState.layout.dirac_parms().su3csw) { + logQuda(QUDA_VERBOSE, "Property su3csw/clover_csw does not match in QudaInvertParam struct and 
openQxD:dirac_parms (%.2e, %.2e)\n", + qudaState.layout.dirac_parms().su3csw, param->clover_csw); + ret = false; + } + + return ret; +} + + +/** + * @brief Transfer the gauge field, and (re-)calculate or transfer the + * clover field if the gauge field was updated in openQxD. Update + * the settings kappa, su3csw and u1csw in the QudaInvertParam + * struct. Set up or update the multigrid instance if set in + * QudaInvertParam. + * + * @param param_ The parameter struct, where in param->additional_prop a + * pointer to the QudaMultigridParam struct was placed. + */ +void openQCD_qudaSolverUpdate(void *param_) +{ + if (param_ == nullptr) { + errorQuda("Solver handle is NULL."); + } + + QudaInvertParam* param = static_cast(param_); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + + bool do_param_update = !openQCD_qudaSolverCheck(param_); + bool do_gauge_update = !gauge_field_get_up2date() && !gauge_field_get_unset(); + bool do_clover_update = do_gauge_update || do_param_update || !clover_field_get_up2date(param); + bool do_multigrid_update = param->inv_type_precondition == QUDA_MG_INVERTER && (do_gauge_update || do_clover_update); + + if (do_gauge_update) { + if (qudaState.layout.h_gauge == nullptr) { + errorQuda("qudaState.layout.h_gauge is not set."); + } + logQuda(QUDA_VERBOSE, "Loading gauge field from openQCD ...\n"); + void *h_gauge = qudaState.layout.h_gauge(); + PUSH_RANGE("openQCD_qudaGaugeLoad",3); + QudaReconstructType rec = qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3 ? QUDA_RECONSTRUCT_8 : QUDA_RECONSTRUCT_9; + + /** + * We set t_boundary = QUDA_ANTI_PERIODIC_T. This setting is a label that + * tells QUDA the current state of the residing gauge field, that is the + * same state as the one we transfer from openqxd. In openqxd the hdfld + * exhibits phases of -1 for the temporal time boundaries, meaning that the + * gauge fields are explicitly multiplied by -1 on the t=0 time slice, see + * chs_hd0() in hflds.c. 
The QUDA_ANTI_PERIODIC_T flag says that these + * phases are incorporated into the field and that QUDA has to add these + * phases on the t=0 time slice when reconstructing the field from + * QUDA_RECONSTRUCT_8/12, but not from QUDA_RECONSTRUCT_NO. In case of + * QUDA_RECONSTRUCT_NO the value if t_boundary has no effect. + * + * @see https://github.com/lattice/quda/issues/1315 + * @see Reconstruct#Unpack() in gauge_field_order.h + * @see Reconstruct<8,...>#Unpack() in gauge_field_order.h + * @see Reconstruct<12,...>#Unpack() in gauge_field_order.h + */ + openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION, rec, QUDA_ANTI_PERIODIC_T); + gauge_field_set_up2date(); + POP_RANGE; + } + + if (do_param_update) { + param->kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); + additional_prop->u1csw = qudaState.layout.dirac_parms().u1csw; + set_su3csw(param, qudaState.layout.dirac_parms().su3csw); + } + + if (do_clover_update) { + if (param->clover_csw == 0.0) { + freeCloverQuda(); + } else { + if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { + /** + * SU3 case: + * Leaving both h_clover = h_clovinv = NULL allocates the clover field on + * the GPU and finally calls @createCloverQuda to calculate the clover + * field. + */ + logQuda(QUDA_VERBOSE, "Generating clover field in QUDA ...\n"); + PUSH_RANGE("loadCloverQuda",3); + loadCloverQuda(NULL, NULL, param); + POP_RANGE; + } else { + /** + * U3 case: Transfer the SW-field from openQCD. + */ + + if (qudaState.layout.h_sw == nullptr) { + errorQuda("qudaState.layout.h_sw is not set."); + } + + logQuda(QUDA_VERBOSE, "Loading clover field from openQCD ...\n"); + void *h_sw = qudaState.layout.h_sw(); + PUSH_RANGE("openQCD_qudaCloverLoad",3); + openQCD_qudaCloverLoad(h_sw, param->kappa, param->clover_csw); + POP_RANGE; + + /*loadCloverQuda(qudaState.layout.h_sw(), NULL, param);*/ + /* The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why? 
+ QUDA: ERROR: qudaEventRecord_ returned CUDA_ERROR_ILLEGAL_ADDRESS + (timer.h:82 in start()) + (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const char*, const char*, const char*, bool)()) + QUDA: last kernel called was (name=N4quda10CopyCloverINS_6clover11FloatNOrderIdLi72ELi2ELb0ELb1ELb0EEENS1_12OpenQCDOrderIdLi72EEEddEE,volume=32x16x16x64,aux=GPU-offline,vol=524288precision=8Nc=3,compute_diagonal)*/ + } + } + } + + /* setup/update the multigrid instance or do nothing */ + if (do_multigrid_update) { + QudaMultigridParam* mg_param = reinterpret_cast(param->additional_prop)->mg_param; + + if (mg_param == nullptr) { + errorQuda("No multigrid parameter struct set."); + } + + if (param->preconditioner == nullptr) { + logQuda(QUDA_VERBOSE, "Setting up multigrid instance ...\n"); + PUSH_RANGE("newMultigridQuda",4); + param->preconditioner = newMultigridQuda(mg_param); + POP_RANGE; + } else { + logQuda(QUDA_VERBOSE, "Updating up multigrid instance ...\n"); + PUSH_RANGE("updateMultigridQuda",4); + updateMultigridQuda(param->preconditioner, mg_param); + POP_RANGE; + } + } +} + + +void* openQCD_qudaSolverReadIn(int id) { int my_rank; - void *mgprec; - static int ad_rev = -1; - static int ud_rev = -1; - static int ad_rev_old = -2; - static int ud_rev_old = -2; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); @@ -911,6 +1173,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) QudaInvertParam* param = new QudaInvertParam(newQudaInvertParam()); QudaInvertParam* invert_param_mg = new QudaInvertParam(newQudaInvertParam()); QudaMultigridParam* multigrid_param = new QudaMultigridParam(newQudaMultigridParam()); + std::string section = "Solver " + std::to_string(id); /* Some default settings */ /* Some of them should not be changed */ @@ -933,31 +1196,13 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) param->inv_type_precondition = QUDA_INVALID_INVERTER; /* disables any preconditioning */ 
param->mass_normalization = QUDA_MASS_NORMALIZATION; - if (qudaState.layout.dirac_parms().su3csw != 0.0) { - param->clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ - param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; - param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; - param->clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; - - param->clover_csw = qudaState.layout.dirac_parms().su3csw; - param->clover_coeff = 0.0; - - /* Set to Wilson Dirac operator with Clover term */ - param->dslash_type = QUDA_CLOVER_WILSON_DSLASH; - - if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { - param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; /* what implication has this? */ - param->compute_clover = true; - } else { - param->clover_order = QUDA_OPENQCD_CLOVER_ORDER; - } - } + set_su3csw(param, qudaState.layout.dirac_parms().su3csw); if (my_rank == 0) { KeyValueStore kv; kv.set_map(&enum_map); - kv.load(infile); + kv.load(qudaState.infile); param->verbosity = kv.get(section, "verbosity", param->verbosity); @@ -967,7 +1212,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) if (kv.get(section, "solver") != "QUDA") { errorQuda("Solver section \"%s\" in file %s is not a valid quda-solver section (solver = %s)\n", - section, infile, kv.get(section, "solver").c_str()); + section.c_str(), qudaState.infile, kv.get(section, "solver").c_str()); } /* both fields reside on the CPU */ @@ -1093,7 +1338,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) if (param->inv_type_precondition == QUDA_MG_INVERTER) { - std::string mg_section = std::string(section) + " Multigrid"; + std::string mg_section = section + " Multigrid"; /* (shallow) copy the struct */ *invert_param_mg = *param; @@ -1119,7 +1364,7 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) multigrid_param->thin_update_only = kv.get(mg_section, "thin_update_only", multigrid_param->thin_update_only); for (int i=0; in_level; i++) { - std::string subsection = 
std::string(section) + " Multigrid Level " + std::to_string(i); + std::string subsection = section + " Multigrid Level " + std::to_string(i); multigrid_param->geo_block_size[i][0] = kv.get(subsection, "geo_block_size[1]", multigrid_param->geo_block_size[i][0]); multigrid_param->geo_block_size[i][1] = kv.get(subsection, "geo_block_size[2]", multigrid_param->geo_block_size[i][1]); @@ -1194,131 +1439,165 @@ void* openQCD_qudaSolverSetup(char *infile, char *section) MPI_Bcast((void*) multigrid_param, sizeof(*multigrid_param), MPI_BYTE, 0, MPI_COMM_WORLD); multigrid_param->invert_param = invert_param_mg; - /* get current residing gauge field revision (residing in openqxd) */ - qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev); + /** + * We need a void* to store the multigrid_param (QudaMultigridParam) struct, + * such that we can access it and setup multigrid in a later stage, for + * instance right before calling invertQuda() if multigrid was not + * instantiated until then. + */ + openQCD_QudaSolver *additional_prop = new openQCD_QudaSolver(); + strcpy(additional_prop->infile, qudaState.infile); + additional_prop->id = id; + additional_prop->mg_param = multigrid_param; + qudaState.layout.get_swfld_flags(&additional_prop->ud_swd_rev, &additional_prop->ad_swd_rev); + additional_prop->u1csw = qudaState.layout.dirac_parms().u1csw; + param->additional_prop = reinterpret_cast(additional_prop); - /* whether to update the gauge/clover field or not */ - bool update = ud_rev != ud_rev_old || ad_rev != ad_rev_old; - logQuda(QUDA_VERBOSE, "Gauge/Clover field updated in openQxD according to flag-database: %s\n", update ? "true" : "false"); + return (void*) param; +} - if (qudaState.layout.h_gauge != nullptr && update) { - logQuda(QUDA_VERBOSE, "Loading gauge field from openQCD ...\n"); - void *h_gauge = qudaState.layout.h_gauge(); - PUSH_RANGE("openQCD_qudaGaugeLoad",3); - QudaReconstructType rec = qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3 ? 
QUDA_RECONSTRUCT_8 : QUDA_RECONSTRUCT_9; - /** - * We set t_boundary = QUDA_ANTI_PERIODIC_T. This setting is a label that - * tells QUDA the current state of the residing gauge field, that is the - * same state as the one we transfer from openqxd. In openqxd the hdfld - * exhibits phases of -1 for the temporal time boundaries, meaning that the - * gauge fields are explicitly multiplied by -1 on the t=0 time slice, see - * chs_hd0() in hflds.c. The QUDA_ANTI_PERIODIC_T flag says that these - * phases are incorporated into the field and that QUDA has to add these - * phases on the t=0 time slice when reconstructing the field from - * QUDA_RECONSTRUCT_8/12, but not from QUDA_RECONSTRUCT_NO. In case of - * QUDA_RECONSTRUCT_NO the value if t_boundary has no effect. - * - * @see https://github.com/lattice/quda/issues/1315 - * @see Reconstruct#Unpack() in gauge_field_order.h - * @see Reconstruct<8,...>#Unpack() in gauge_field_order.h - * @see Reconstruct<12,...>#Unpack() in gauge_field_order.h - */ - openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION, rec, QUDA_ANTI_PERIODIC_T); - ud_rev_old = ud_rev; - ad_rev_old = ad_rev; - POP_RANGE; +void* openQCD_qudaSolverGetHandle(int id) +{ + if (qudaState.handles[id] == nullptr) { + printfQuda("read in solver parameters from file %s for solver id=%d\n", + qudaState.infile, id); + qudaState.handles[id] = openQCD_qudaSolverReadIn(id); } - if (param->clover_csw != 0.0 && update) { - if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { - /** - * SU3 case: - * Leaving both h_clover = h_clovinv = NULL allocates the clover field on - * the GPU and finally calls @createCloverQuda to calculate the clover - * field. - */ - logQuda(QUDA_VERBOSE, "Generating clover field in QUDA ...\n"); - PUSH_RANGE("loadCloverQuda",3); - loadCloverQuda(NULL, NULL, param); - POP_RANGE; - } else { - /** - * U3 case: Transfer the SW-field from openQCD. 
- */ - logQuda(QUDA_VERBOSE, "Loading clover field from openQCD ...\n"); - void *h_sw = qudaState.layout.h_sw(); - PUSH_RANGE("openQCD_qudaCloverLoad",3); - openQCD_qudaCloverLoad(h_sw, param->kappa, param->clover_csw); - POP_RANGE; + openQCD_qudaSolverUpdate(qudaState.handles[id]); + return qudaState.handles[id]; +} - /*loadCloverQuda(qudaState.layout.h_sw(), NULL, param);*/ - /* The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why? - QUDA: ERROR: qudaEventRecord_ returned CUDA_ERROR_ILLEGAL_ADDRESS - (timer.h:82 in start()) - (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const char*, const char*, const char*, bool)()) - QUDA: last kernel called was (name=N4quda10CopyCloverINS_6clover11FloatNOrderIdLi72ELi2ELb0ELb1ELb0EEENS1_12OpenQCDOrderIdLi72EEEddEE,volume=32x16x16x64,aux=GPU-offline,vol=524288precision=8Nc=3,compute_diagonal)*/ + +/** + * @brief Take the string-hash over a struct using std::hash. 
+ * + * @param[in] in Input struct + * + * @tparam T Type of input struct + * + * @return Hash value + */ +template int hash_struct(T *in) +{ + int hash = 0; + char* cstruct = reinterpret_cast(in); + + for (char* c = cstruct; c < cstruct + sizeof(T); c += strlen(c) + 1) { + if (strlen(c) != 0) { + hash ^= (std::hash{}(std::string(c)) << 1); } } - if (param->inv_type_precondition == QUDA_MG_INVERTER) { - logQuda(QUDA_VERBOSE, "Setting up multigrid solver ...\n"); - PUSH_RANGE("newMultigridQuda",4); - mgprec = newMultigridQuda(multigrid_param); - param->preconditioner = mgprec; - POP_RANGE; - } + return hash; +} + + +int openQCD_qudaSolverGetHash(int id) +{ + QudaInvertParam* param = reinterpret_cast(openQCD_qudaSolverGetHandle(id)); + QudaInvertParam hparam = newQudaInvertParam(); + memset(&hparam, '\0', sizeof(QudaInvertParam)); /* set everything to zero */ + + /* Set some properties we want to take the hash over */ + hparam.inv_type = param->inv_type; + hparam.tol = param->tol; + hparam.tol_restart = param->tol_restart; + hparam.tol_hq = param->tol_hq; + hparam.maxiter = param->maxiter; + hparam.reliable_delta = param->reliable_delta; + hparam.solution_type = param->solution_type; + hparam.solve_type = param->solve_type; + hparam.matpc_type = param->matpc_type; + hparam.dagger = param->dagger; + hparam.mass_normalization = param->mass_normalization; + hparam.solver_normalization = param->solver_normalization; + hparam.cpu_prec = param->cpu_prec; + hparam.cuda_prec = param->cuda_prec; + hparam.use_init_guess = param->use_init_guess; + hparam.gcrNkrylov = param->gcrNkrylov; + + return hash_struct(&hparam); +} + + +void openQCD_qudaSolverPrintSetup(int id) +{ + QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(id)); + + if (param != nullptr) { + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); - if (param->verbosity >= QUDA_DEBUG_VERBOSE) { printQudaInvertParam(param); - } + printfQuda("additional_prop->infile = %s\n", 
additional_prop->infile); + printfQuda("additional_prop->id = %d\n", additional_prop->id); + printfQuda("additional_prop->mg_param = %p\n", additional_prop->mg_param); + printfQuda("additional_prop->ud_swd_rev = %d\n", additional_prop->ud_swd_rev); + printfQuda("additional_prop->ad_swd_rev = %d\n", additional_prop->ad_swd_rev); + printfQuda("additional_prop->u1csw = %.2e\n", additional_prop->u1csw); + printfQuda("handle = %p\n", param); + printfQuda("hash = %d\n", openQCD_qudaSolverGetHash(id)); - if (param->inv_type_precondition == QUDA_MG_INVERTER) { - if (param->verbosity >= QUDA_DEBUG_VERBOSE) { - printQudaMultigridParam(multigrid_param); + if (param->inv_type_precondition == QUDA_MG_INVERTER) { + printQudaMultigridParam(additional_prop->mg_param); } + } else { + printfQuda("\n"); } - - return (void*) param; } -double openQCD_qudaInvert(void *param, double mu, void *source, void *solution, int *status) +double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int *status) { - QudaInvertParam* invert_param = static_cast(param); - invert_param->mu = mu; + if (gauge_field_get_unset()) { + errorQuda("Gauge field not populated in openQxD."); + } + + QudaInvertParam* param = static_cast(openQCD_qudaSolverGetHandle(id)); + param->mu = mu; + + if (!openQCD_qudaSolverCheck(param)) { + errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync."); + } logQuda(QUDA_VERBOSE, "Calling invertQuda() ...\n"); PUSH_RANGE("invertQuda",5); - invertQuda(static_cast(solution), static_cast(source), invert_param); + invertQuda(static_cast(solution), static_cast(source), param); POP_RANGE; - logQuda(QUDA_SUMMARIZE, "openQCD_qudaInvert()\n"); - logQuda(QUDA_SUMMARIZE, " true_res = %.2e\n", invert_param->true_res); - logQuda(QUDA_SUMMARIZE, " true_res_hq = %.2e\n", invert_param->true_res_hq); - logQuda(QUDA_SUMMARIZE, " iter = %d\n", invert_param->iter); - logQuda(QUDA_SUMMARIZE, " gflops = %.2e\n", invert_param->gflops); - 
logQuda(QUDA_SUMMARIZE, " secs = %.2e\n", invert_param->secs); + *status = param->true_res <= param->tol ? param->iter : -1; - *status = invert_param->true_res <= invert_param->tol ? invert_param->iter : -1; + logQuda(QUDA_VERBOSE, "openQCD_qudaInvert()\n"); + logQuda(QUDA_VERBOSE, " true_res = %.2e\n", param->true_res); + logQuda(QUDA_VERBOSE, " true_res_hq = %.2e\n", param->true_res_hq); + logQuda(QUDA_VERBOSE, " iter = %d\n", param->iter); + logQuda(QUDA_VERBOSE, " gflops = %.2e\n", param->gflops); + logQuda(QUDA_VERBOSE, " secs = %.2e\n", param->secs); + logQuda(QUDA_VERBOSE, " status = %d\n", *status); - return invert_param->true_res; + return param->true_res; } -void openQCD_qudaSolverDestroy(void *param) +void openQCD_qudaSolverDestroy(int id) { - QudaInvertParam* invert_param = static_cast(param); + QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(id)); - if (invert_param->inv_type_precondition == QUDA_MG_INVERTER) { - destroyMultigridQuda(invert_param->preconditioner); - } + if (param != nullptr) { + if (param->inv_type_precondition == QUDA_MG_INVERTER) { + destroyMultigridQuda(param->preconditioner); + } - delete invert_param; + delete static_cast(param->additional_prop)->mg_param; + delete static_cast(param->additional_prop); + delete param; + qudaState.handles[id] = nullptr; + } } -void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_section) +void* openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) { int my_rank; @@ -1332,7 +1611,7 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio KeyValueStore kv; kv.set_map(&enum_map); - kv.load(infile); + kv.load(infile == nullptr ? 
qudaState.infile : infile); verbosity = kv.get(section, "verbosity", verbosity); @@ -1342,7 +1621,7 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio if (kv.get(section, "solver") != "QUDA") { errorQuda("Eigensolver section \"%s\" in file %s is not a valid quda-eigensolver section (solver = %s)\n", - section, infile, kv.get(section, "solver").c_str()); + section, infile == nullptr ? qudaState.infile : infile, kv.get(section, "solver").c_str()); } param->eig_type = kv.get(section, "eig_type", param->eig_type); @@ -1394,18 +1673,19 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, char *inv_sectio /* transfer of the struct to all the processes */ MPI_Bcast((void*) param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); - if (inv_section != nullptr) { - void *inv_param = openQCD_qudaSolverSetup(infile, inv_section); - param->invert_param = static_cast(inv_param); - } else { + if (solver_id == -1) { param->invert_param = new QudaInvertParam(newQudaInvertParam()); + } else { + void *inv_param = openQCD_qudaSolverGetHandle(solver_id); + param->invert_param = static_cast(inv_param); } param->invert_param->verbosity = std::max(param->invert_param->verbosity, verbosity); - if (inv_section != nullptr && param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { + if (solver_id != -1 && param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { printQudaInvertParam(param->invert_param); } + if (param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { printQudaEigParam(param); } @@ -1433,6 +1713,11 @@ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) void openQCD_qudaEigensolverDestroy(void *param) { QudaEigParam* eig_param = static_cast(param); - openQCD_qudaSolverDestroy(eig_param->invert_param); + openQCD_QudaSolver *additional_prop = static_cast(eig_param->invert_param->additional_prop); + if (additional_prop == nullptr) { + delete eig_param->invert_param; + } else { + 
openQCD_qudaSolverDestroy(additional_prop->id); + } delete eig_param; } From c1fec52aa3f0e8ef2052fb61caf64ad8465e87db Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 21 Feb 2024 19:04:25 +0100 Subject: [PATCH 137/148] refactored interface --- include/quda_openqcd_interface.h | 64 +++-- lib/openqcd_interface.cpp | 452 ++++++++++++++++++------------- 2 files changed, 312 insertions(+), 204 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 7557bdb69d..9ee41500bb 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -116,10 +116,9 @@ typedef struct { bc_parms_t (*bc_parms)(void); /** @see bc_parms() */ flds_parms_t (*flds_parms)(void); /** @see flds_parms() */ dirac_parms_t (*dirac_parms)(void); /** @see dirac_parms() */ - void* (*h_gauge)(void); /** function to return a pointer to the gauge fields */ - void* (*h_sw)(void); /** function to return a pointer to the clover fields */ - void (*get_gfld_flags)(int *ud, int *ad); /** function pointer to gauge field query function */ - void (*get_swfld_flags)(int *uswd, int *aswd); /** function pointer to SW field query function */ + void* (*h_gauge)(void); /** function to return a pointer to the gauge field */ + void* (*h_sw)(void); /** function to return a pointer to the updated Clover field */ + void (*get_gfld_flags)(int *ud, int *ad); /** function pointer to gauge field revision query function */ } openQCD_QudaLayout_t; @@ -138,14 +137,17 @@ typedef struct { typedef struct { int initialized; /** Whether openQCD_qudaInit() was called or not */ - int gauge_loaded; /** Whether openQCD_qudaGaugeLoad() was called or not */ - int clover_loaded; /** Whether openQCD_qudaCloverLoad() was called or not */ - int dslash_setup; /** Whether openQCD_qudaSetDslashOptions() was called or not */ int ud_rev; /** Revision of ud field from openqxd */ int ad_rev; /** Revision of ad field from openqxd */ + int swd_ud_rev; /** Revision of ud field used to 
calc/transfer the SW field from openqxd */ + int swd_ad_rev; /** Revision of ad field used to calc/transfer the SW field from openqxd */ + double swd_kappa; /** kappa corresponding to the current SW field in QUDA */ + double swd_su3csw; /** SU(3) csw coefficient corresponding to the current SW field in QUDA */ + double swd_u1csw; /** U(1) csw coefficient corresponding to the current SW field in QUDA */ openQCD_QudaInitArgs_t init; openQCD_QudaLayout_t layout; - void* handles[32]; /** Array of void-pointers to QudaInvertParam structs */ + void* handles[32]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ + void* dirac_handle; /** void-pointer to QudaInvertParam struct for the Dirac operator */ char infile[1024]; /** Path to the input file (if given to quda_init()) */ } openQCD_QudaState_t; @@ -154,9 +156,12 @@ typedef struct openQCD_QudaSolver_s { char infile[1024]; /** Path to the input file (if given to quda_init()) */ int id; /** Solver section identifier in the input file */ QudaMultigridParam* mg_param; /** Pointer to the multigrid param struct */ - int ud_swd_rev; /** Revision of SU3 SW field from openqxd */ - int ad_swd_rev; /** Revision of U1 SW field from openqxd */ - double u1csw; /** Value of u1csw */ + double u1csw; /** u1csw property */ + int mg_ud_rev; /** Revision of ud field from openqxd */ + int mg_ad_rev; /** Revision of ad field from openqxd */ + double mg_kappa; /** kappa corresponding to the current mg-instance in QUDA */ + double mg_su3csw; /** SU(3) csw coefficient corresponding to the current mg-instance in QUDA */ + double mg_u1csw; /** U(1) csw coefficient corresponding to the current mg-instance in QUDA */ } openQCD_QudaSolver; @@ -180,7 +185,7 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, /** - * Destroy the QUDA context. + * Destroy the QUDA context and deallocate all solvers. 
*/ void openQCD_qudaFinalize(void); @@ -266,8 +271,19 @@ void openQCD_qudaSpinorFree(void** quda_field); * @param[out] dst Destination spinor field * @param[in] p Dirac parameter struct */ -void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p); -void openQCD_qudaDw2(void *param, double mu, void *src, void *dst); +void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p); + + +/** + * @brief Apply the Dirac operator that corresponds to the current openQxD + * setup to a field. All fields passed and returned are host (CPU) + * fields in openQCD order. + * + * @param[in] mu Twisted mass + * @param in Input spinor + * @param out Output spinor + */ +void openQCD_qudaDw(double mu, void *in, void *out); /** @@ -281,10 +297,13 @@ void openQCD_qudaDw2(void *param, double mu, void *src, void *dst); * one has to provide a subsection called "Solver {id} Multigrid Level {level}", * where {level} runs from 0 to n_level-1. All these subsections may have keys * given by all the array-valued members of QudaMultigridParam, for example - * smoother_tol may appear in all subsections. + * smoother_tol may appear in all subsections. This function must be called on + * all ranks simulaneously. * * @param[in] id The identifier of the solver section, i.e. "Solver #". The - * input file is taken from the arguments of quda_init(). + * input file is taken from the arguments of quda_init(). If + * id is -1, then the section called "Lattice parameters" is + * parsed in the same way. * * @return Pointer to the solver context */ @@ -294,7 +313,8 @@ void* openQCD_qudaSolverGetHandle(int id); /** * @brief Return a hash from a subset of the settings in the - * QudaInvertParam struct. + * QudaInvertParam struct. Return 0 if the struct is not initialized + * yet. * * @param[in] id The solver identifier * @@ -304,7 +324,9 @@ int openQCD_qudaSolverGetHash(int id); /** - * @brief Prints solver information about the QUDA solver. 
+ * @brief Print solver information about the QUDA solver. Print + * "Solver is not initialized yet" is the solver struct is nul + * initialized yet. * * @param[in] id The solver identifier */ @@ -353,10 +375,8 @@ void openQCD_qudaSolverDestroy(int id); * @param[in] infile Ini-file containing sections about the eigen-solver, * if null we use the value of qudaState.infile * @param[in] section The section name of the eigen-solver - * @param[in] solver_id The section id of the solver as in - * openQCD_qudaSolverSetup(). If -1, the section is not - * read in and the gauge and clover fields are not - * transfered/updated. + * @param[in] solver_id The section id of the solver. If -1, the section is + * not read in. * * @return Pointer to the eigen-solver context */ diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index fe8332f397..a13b9abcea 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -22,7 +22,7 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) -static openQCD_QudaState_t qudaState = {false, false, false, false, -1, -1, {}, {}, {}, '\0'}; +static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, {}, {}, {}, nullptr, '\0'}; using namespace quda; @@ -245,6 +245,7 @@ class KeyValueStore { private: std::unordered_map>> store; std::unordered_map *map = nullptr; + std::string filename = ""; public: /** @@ -271,6 +272,10 @@ class KeyValueStore { map = _map; } + bool section_exists(const std::string& section) { + return store.find(section) != store.end(); + } + /** * @brief Gets the specified key. 
* @@ -283,7 +288,7 @@ class KeyValueStore { * @return The corresponding value */ template - T get(const std::string& section, const std::string& key, T default_value = T()) { + T get(const std::string& section, const std::string& key, T default_value = T(), bool fail = false) { int idx; std::string rkey; std::smatch match; @@ -321,17 +326,22 @@ class KeyValueStore { } } } + if (fail) { + errorQuda("Key \"%s\" in section \"%s\" in file %s does not exist.", + key.c_str(), section.c_str(), filename.c_str()); + } return default_value; /* Return default value for non-existent keys */ } /** * @brief Fill the store with entries from an ini-file * - * @param[in] filename The filename + * @param[in] fname The fname */ - void load(const std::string& filename) { + void load(const std::string& fname) { std::string line, section; std::smatch match; + filename = fname; std::ifstream file(filename.c_str()); std::regex p_section("^\\s*\\[([\\w\\ ]+)\\].*$"); /* [section] */ @@ -581,11 +591,6 @@ double openQCD_qudaPlaquette(void) { double plaq[3]; - if (!qudaState.gauge_loaded) { - errorQuda("Gauge field not loaded into QUDA, cannot calculate plaquette. Call openQCD_qudaGaugeLoad() first."); - return 0.0; - } - plaqQuda(plaq); /* Note different Nc normalization wrt openQCD! 
*/ @@ -597,7 +602,6 @@ void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType { QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary); loadGaugeQuda(gauge, ¶m); - qudaState.gauge_loaded = true; } @@ -615,7 +619,6 @@ void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType void openQCD_qudaGaugeFree(void) { freeGaugeQuda(); - qudaState.gauge_loaded = false; } @@ -633,14 +636,42 @@ void openQCD_qudaCloverLoad(void *clover, double kappa, double csw) param.clover_coeff = 0.0; loadCloverQuda(clover, NULL, ¶m); - qudaState.clover_loaded = true; } void openQCD_qudaCloverFree(void) { freeCloverQuda(); - qudaState.clover_loaded = false; +} + + +/** + * @brief Set the su3csw corfficient and all related properties. + * + * @param param The parameter struct + * @param[in] su3csw The su3csw coefficient + */ +inline void set_su3csw(QudaInvertParam* param, double su3csw) +{ + param->clover_csw = su3csw; + if (su3csw != 0.0) { + param->clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ + param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; + param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; + param->clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; + + param->clover_coeff = 0.0; + + /* Set to Wilson Dirac operator with Clover term */ + param->dslash_type = QUDA_CLOVER_WILSON_DSLASH; + + if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { + param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; /* what implication has this? */ + param->compute_clover = true; + } else { + param->clover_order = QUDA_OPENQCD_CLOVER_ORDER; + } + } } @@ -653,10 +684,6 @@ void openQCD_qudaCloverFree(void) */ static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) { - if (!qudaState.gauge_loaded) { - errorQuda("Gauge field not loaded into QUDA, cannot setup Dirac operator / Clover term. 
Call openQCD_qudaGaugeLoad() first."); - } - QudaInvertParam param = newOpenQCDParam(); param.dslash_type = QUDA_WILSON_DSLASH; @@ -664,30 +691,7 @@ static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) param.mu = p.mu; param.dagger = QUDA_DAG_NO; - if (p.su3csw != 0.0) { - param.clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ - param.clover_cpu_prec = QUDA_DOUBLE_PRECISION; - param.clover_cuda_prec = QUDA_DOUBLE_PRECISION; - param.clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; - param.clover_order = QUDA_FLOAT8_CLOVER_ORDER; /* what implication has this? */ - - param.compute_clover = true; - param.clover_csw = p.su3csw; - param.clover_coeff = 0.0; - - /* Set to Wilson Dirac operator with Clover term */ - param.dslash_type = QUDA_CLOVER_WILSON_DSLASH; - - if (!qudaState.clover_loaded) { - /** - * Leaving both h_clover = h_clovinv = NULL allocates the clover field on - * the GPU and finally calls @createCloverQuda to calculate the clover - * field. - */ - loadCloverQuda(NULL, NULL, ¶m); /* Create the clover field */ - qudaState.clover_loaded = true; - } - } + set_su3csw(¶m, p.su3csw); param.inv_type = QUDA_CG_INVERTER; /* just set some, needed? 
*/ @@ -876,36 +880,6 @@ void openQCD_qudaD2H(void *quda_field, void *openQCD_field) } -void openQCD_qudaDw(void *src, void *dst, openQCD_QudaDiracParam_t p) -{ - QudaInvertParam param = newOpenQCDDiracParam(p); - - /* both fields reside on the CPU */ - param.input_location = QUDA_CPU_FIELD_LOCATION; - param.output_location = QUDA_CPU_FIELD_LOCATION; - - MatQuda(static_cast(dst), static_cast(src), ¶m); - - logQuda(QUDA_DEBUG_VERBOSE, "MatQuda()\n"); - logQuda(QUDA_DEBUG_VERBOSE, " gflops = %.2e\n", param.gflops); - logQuda(QUDA_DEBUG_VERBOSE, " secs = %.2e\n", param.secs); -} - - -void openQCD_qudaDw2(void *param, double mu, void *src, void *dst) -{ - QudaInvertParam* inv_param = static_cast(param); - inv_param->mu = mu; - - /* both fields reside on the CPU */ - inv_param->input_location = QUDA_CPU_FIELD_LOCATION; - inv_param->output_location = QUDA_CPU_FIELD_LOCATION; - - MatQuda(static_cast(dst), static_cast(src), inv_param); -} - - - /** * @brief Check whether the gauge field from openQCD is in sync with the * one from QUDA. @@ -918,9 +892,6 @@ inline bool gauge_field_get_up2date(void) /* get current residing gauge field revision (residing in openqxd) */ qudaState.layout.get_gfld_flags(&ud_rev, &ad_rev); - logQuda(QUDA_VERBOSE, "Gauge field status according to get_gfld_flags: (ud,ad)=(%d,%d)\n", - ud_rev, ad_rev); - return ud_rev == qudaState.ud_rev && ad_rev == qudaState.ad_rev; } @@ -941,68 +912,86 @@ inline bool gauge_field_get_unset(void) /** - * @brief Check if the revision of the clover field in openQCD coincides - * with the information on the parameter struct. - * - * @param param The parameter struct + * @brief Check if the current SW field needs to update wrt the parameters from openQCD. 
* * @return true/false */ -inline bool clover_field_get_up2date(QudaInvertParam* param) +inline bool clover_field_get_up2date(void) { - int uswd_rev = -2, aswd_rev = -2; + return (gauge_field_get_up2date() + && qudaState.swd_ud_rev == qudaState.ud_rev + && qudaState.swd_ad_rev == qudaState.ad_rev + && qudaState.swd_kappa == 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)) + && qudaState.swd_su3csw == qudaState.layout.dirac_parms().su3csw + && qudaState.swd_u1csw == qudaState.layout.dirac_parms().u1csw); +} - /* get current residing clover field revision (residing in openqxd) */ - qudaState.layout.get_swfld_flags(&uswd_rev, &aswd_rev); - logQuda(QUDA_VERBOSE, "Clover field status according to get_swfld_flags: (uswd,aswd)=(%d,%d)\n", - uswd_rev, aswd_rev); +/** + * @brief Check whether the multigrid instance associated to the parameter + * struct is up to date with the global gauge field revision, + * parameters are in sync, and clover/gauge fields are up to date. + * + * @param param The parameter struct + * + * @return true/false + */ +inline bool mg_get_up2date(QudaInvertParam *param) +{ openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); - return uswd_rev == additional_prop->ud_swd_rev && aswd_rev == additional_prop->ad_swd_rev; + + return (param->preconditioner != nullptr + && gauge_field_get_up2date() + && clover_field_get_up2date() + && additional_prop->mg_ud_rev == qudaState.ud_rev + && additional_prop->mg_ad_rev == qudaState.ad_rev + && additional_prop->mg_kappa == 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)) + && additional_prop->mg_su3csw == qudaState.layout.dirac_parms().su3csw + && additional_prop->mg_u1csw == qudaState.layout.dirac_parms().u1csw); } /** - * @brief Set the global revisions numners for the SU(3) and U(1) fields. + * @brief Sets the multigrid instance associated to the parameter struct to + * be in sync with openQxD. 
+ * + * @param param The parameter struct */ -inline void gauge_field_set_up2date(void) +inline void mg_set_revision(QudaInvertParam *param) { - qudaState.layout.get_gfld_flags(&qudaState.ud_rev, &qudaState.ad_rev); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + + additional_prop->mg_ud_rev = qudaState.ud_rev; + additional_prop->mg_ad_rev = qudaState.ad_rev; + additional_prop->mg_kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); + additional_prop->mg_su3csw = qudaState.layout.dirac_parms().su3csw; + additional_prop->mg_u1csw = qudaState.layout.dirac_parms().u1csw; } /** - * @brief Set the su3csw corfficient and all related properties. - * - * @param param The parameter struct - * @param[in] su3csw The su3csw coefficient + * @brief Set the global revisions numners for the SW field. */ -inline void set_su3csw(QudaInvertParam* param, double su3csw) +inline void clover_field_set_revision(void) { - param->clover_csw = su3csw; - if (su3csw != 0.0) { - param->clover_location = QUDA_CUDA_FIELD_LOCATION; /* seems to have no effect? */ - param->clover_cpu_prec = QUDA_DOUBLE_PRECISION; - param->clover_cuda_prec = QUDA_DOUBLE_PRECISION; - param->clover_cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; - - param->clover_coeff = 0.0; + qudaState.swd_ud_rev = qudaState.ud_rev; + qudaState.swd_ad_rev = qudaState.ad_rev; + qudaState.swd_kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); + qudaState.swd_su3csw = qudaState.layout.dirac_parms().su3csw; + qudaState.swd_u1csw = qudaState.layout.dirac_parms().u1csw; +} - /* Set to Wilson Dirac operator with Clover term */ - param->dslash_type = QUDA_CLOVER_WILSON_DSLASH; - if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { - param->clover_order = QUDA_FLOAT8_CLOVER_ORDER; /* what implication has this? */ - param->compute_clover = true; - } else { - param->clover_order = QUDA_OPENQCD_CLOVER_ORDER; - } - } +/** + * @brief Set the global revisions numners for the gauge field. 
+ */ +inline void gauge_field_set_revision(void) +{ + qudaState.layout.get_gfld_flags(&qudaState.ud_rev, &qudaState.ad_rev); } - /** * @brief Check if the solver parameters are in sync with the parameters * from openQCD. @@ -1011,26 +1000,26 @@ inline void set_su3csw(QudaInvertParam* param, double su3csw) * * @return Whether parameters are in sync or not */ -int openQCD_qudaSolverCheck(void *param_) +int openQCD_qudaInvertParamCheck(void *param_) { QudaInvertParam* param = static_cast(param_); openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); bool ret = true; if (param->kappa != (1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)))) { - logQuda(QUDA_VERBOSE, "Property m0/kappa does not match in QudaInvertParam struct and openQxD:dirac_parms (%.2e, %.2e)\n", + logQuda(QUDA_VERBOSE, "Property m0/kappa does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", (1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0))), param->kappa); ret = false; } if (additional_prop->u1csw != qudaState.layout.dirac_parms().u1csw) { - logQuda(QUDA_VERBOSE, "Property u1csw does not match in QudaInvertParam struct and openQxD:dirac_parms (%.2e, %.2e)\n", + logQuda(QUDA_VERBOSE, "Property u1csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", qudaState.layout.dirac_parms().u1csw, additional_prop->u1csw); ret = false; } if (param->clover_csw != qudaState.layout.dirac_parms().su3csw) { - logQuda(QUDA_VERBOSE, "Property su3csw/clover_csw does not match in QudaInvertParam struct and openQxD:dirac_parms (%.2e, %.2e)\n", + logQuda(QUDA_VERBOSE, "Property su3csw/clover_csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", qudaState.layout.dirac_parms().su3csw, param->clover_csw); ret = false; } @@ -1040,11 +1029,13 @@ int openQCD_qudaSolverCheck(void *param_) /** - * @brief Transfer the gauge field, and (re-)calculate or transfer the - * clover 
field if the gauge field was updated in openQxD. Update - * the settings kappa, su3csw and u1csw in the QudaInvertParam - * struct. Set up or update the multigrid instance if set in - * QudaInvertParam. + * @brief Transfer the gauge field if the gauge field was updated in + * openQxD. (Re-)calculate or transfer the clover field if + * parameters have changed or gauge field was updated. Update the + * settings kappa, su3csw and u1csw in the QudaInvertParam struct + * such that they are in sync with openQxD. Set up or update the + * multigrid instance if set in QudaInvertParam and if gauge- or + * clover-fields or parameters have changed. * * @param param_ The parameter struct, where in param->additional_prop a * pointer to the QudaMultigridParam struct was placed. @@ -1058,12 +1049,13 @@ void openQCD_qudaSolverUpdate(void *param_) QudaInvertParam* param = static_cast(param_); openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); - bool do_param_update = !openQCD_qudaSolverCheck(param_); - bool do_gauge_update = !gauge_field_get_up2date() && !gauge_field_get_unset(); - bool do_clover_update = do_gauge_update || do_param_update || !clover_field_get_up2date(param); - bool do_multigrid_update = param->inv_type_precondition == QUDA_MG_INVERTER && (do_gauge_update || do_clover_update); + bool do_param_update = !openQCD_qudaInvertParamCheck(param_); + bool do_gauge_transfer = !gauge_field_get_up2date() && !gauge_field_get_unset(); + bool do_clover_update = !clover_field_get_up2date() && !gauge_field_get_unset(); + bool do_multigrid_update = param_ != qudaState.dirac_handle && param->inv_type_precondition == QUDA_MG_INVERTER && !mg_get_up2date(param) && !gauge_field_get_unset(); + bool do_multigrid_fat_update = do_multigrid_update && (do_gauge_transfer || additional_prop->mg_ud_rev != qudaState.ud_rev || additional_prop->mg_ad_rev != qudaState.ad_rev); - if (do_gauge_update) { + if (do_gauge_transfer) { if (qudaState.layout.h_gauge == nullptr) { 
errorQuda("qudaState.layout.h_gauge is not set."); } @@ -1090,19 +1082,30 @@ void openQCD_qudaSolverUpdate(void *param_) * @see Reconstruct<12,...>#Unpack() in gauge_field_order.h */ openQCD_qudaGaugeLoad(h_gauge, QUDA_DOUBLE_PRECISION, rec, QUDA_ANTI_PERIODIC_T); - gauge_field_set_up2date(); + gauge_field_set_revision(); POP_RANGE; } if (do_param_update) { + logQuda(QUDA_VERBOSE, "Syncing kappa, su3csw, u1csw values from openQCD ...\n"); param->kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); additional_prop->u1csw = qudaState.layout.dirac_parms().u1csw; set_su3csw(param, qudaState.layout.dirac_parms().su3csw); + + QudaInvertParam* mg_inv_param = additional_prop->mg_param->invert_param; + mg_inv_param->kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); + set_su3csw(mg_inv_param, qudaState.layout.dirac_parms().su3csw); } if (do_clover_update) { if (param->clover_csw == 0.0) { + logQuda(QUDA_VERBOSE, "Deallocating Clover field in QUDA ...\n"); freeCloverQuda(); + qudaState.swd_ud_rev = 0; + qudaState.swd_ad_rev = 0; + qudaState.swd_kappa = 0.0; + qudaState.swd_su3csw = 0.0; + qudaState.swd_u1csw = 0.0; } else { if (qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3) { /** @@ -1111,10 +1114,11 @@ void openQCD_qudaSolverUpdate(void *param_) * the GPU and finally calls @createCloverQuda to calculate the clover * field. */ - logQuda(QUDA_VERBOSE, "Generating clover field in QUDA ...\n"); + logQuda(QUDA_VERBOSE, "Generating Clover field in QUDA ...\n"); PUSH_RANGE("loadCloverQuda",3); loadCloverQuda(NULL, NULL, param); POP_RANGE; + clover_field_set_revision(); } else { /** * U3 case: Transfer the SW-field from openQCD. 
@@ -1124,14 +1128,15 @@ void openQCD_qudaSolverUpdate(void *param_) errorQuda("qudaState.layout.h_sw is not set."); } - logQuda(QUDA_VERBOSE, "Loading clover field from openQCD ...\n"); + logQuda(QUDA_VERBOSE, "Loading Clover field from openQCD ...\n"); void *h_sw = qudaState.layout.h_sw(); PUSH_RANGE("openQCD_qudaCloverLoad",3); openQCD_qudaCloverLoad(h_sw, param->kappa, param->clover_csw); POP_RANGE; + clover_field_set_revision(); /*loadCloverQuda(qudaState.layout.h_sw(), NULL, param);*/ - /* The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why? + /* TODO: The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why? QUDA: ERROR: qudaEventRecord_ returned CUDA_ERROR_ILLEGAL_ADDRESS (timer.h:82 in start()) (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const char*, const char*, const char*, bool)()) @@ -1142,22 +1147,38 @@ void openQCD_qudaSolverUpdate(void *param_) /* setup/update the multigrid instance or do nothing */ if (do_multigrid_update) { - QudaMultigridParam* mg_param = reinterpret_cast(param->additional_prop)->mg_param; + QudaMultigridParam* mg_param = additional_prop->mg_param; if (mg_param == nullptr) { errorQuda("No multigrid parameter struct set."); } + if (do_multigrid_fat_update && param->preconditioner != nullptr) { + logQuda(QUDA_VERBOSE, "Destroying existing multigrid instance ...\n"); + PUSH_RANGE("destroyMultigridQuda",4); + destroyMultigridQuda(param->preconditioner); + param->preconditioner = nullptr; + POP_RANGE; + + additional_prop->mg_ud_rev = 0; + additional_prop->mg_ad_rev = 0; + additional_prop->mg_kappa = 0.0; + additional_prop->mg_su3csw = 0.0; + additional_prop->mg_u1csw = 0.0; + } + if (param->preconditioner == nullptr) { logQuda(QUDA_VERBOSE, "Setting up multigrid instance ...\n"); PUSH_RANGE("newMultigridQuda",4); param->preconditioner = newMultigridQuda(mg_param); POP_RANGE; + 
mg_set_revision(param); } else { - logQuda(QUDA_VERBOSE, "Updating up multigrid instance ...\n"); + logQuda(QUDA_VERBOSE, "Updating existing multigrid instance ...\n"); PUSH_RANGE("updateMultigridQuda",4); updateMultigridQuda(param->preconditioner, mg_param); POP_RANGE; + mg_set_revision(param); } } } @@ -1198,12 +1219,16 @@ void* openQCD_qudaSolverReadIn(int id) set_su3csw(param, qudaState.layout.dirac_parms().su3csw); - if (my_rank == 0) { + if (my_rank == 0 && id != -1) { KeyValueStore kv; kv.set_map(&enum_map); kv.load(qudaState.infile); + if (!kv.section_exists(section)) { + errorQuda("Solver section \"%s\" in file %s does not exist.", section.c_str(), qudaState.infile); + } + param->verbosity = kv.get(section, "verbosity", param->verbosity); if (param->verbosity >= QUDA_DEBUG_VERBOSE) { @@ -1211,7 +1236,7 @@ void* openQCD_qudaSolverReadIn(int id) } if (kv.get(section, "solver") != "QUDA") { - errorQuda("Solver section \"%s\" in file %s is not a valid quda-solver section (solver = %s)\n", + errorQuda("Solver section \"%s\" in file %s is not a valid quda-solver section (solver = %s).", section.c_str(), qudaState.infile, kv.get(section, "solver").c_str()); } @@ -1340,6 +1365,10 @@ void* openQCD_qudaSolverReadIn(int id) std::string mg_section = section + " Multigrid"; + if (!kv.section_exists(mg_section)) { + errorQuda("Solver section \"%s\" in file %s does not exist.", mg_section.c_str(), qudaState.infile); + } + /* (shallow) copy the struct */ *invert_param_mg = *param; @@ -1347,7 +1376,7 @@ void* openQCD_qudaSolverReadIn(int id) invert_param_mg->gamma_basis = QUDA_DEGRAND_ROSSI_GAMMA_BASIS; invert_param_mg->dirac_order = QUDA_DIRAC_ORDER; - multigrid_param->n_level = kv.get(mg_section, "n_level", multigrid_param->n_level); + multigrid_param->n_level = kv.get(mg_section, "n_level", multigrid_param->n_level, true); multigrid_param->setup_type = kv.get(mg_section, "setup_type", multigrid_param->setup_type); multigrid_param->pre_orthonormalize = kv.get(mg_section, 
"pre_orthonormalize", multigrid_param->pre_orthonormalize); multigrid_param->post_orthonormalize = kv.get(mg_section, "post_orthonormalize", multigrid_param->post_orthonormalize); @@ -1366,6 +1395,10 @@ void* openQCD_qudaSolverReadIn(int id) for (int i=0; in_level; i++) { std::string subsection = section + " Multigrid Level " + std::to_string(i); + if (!kv.section_exists(subsection)) { + errorQuda("Solver section \"%s\" in file %s does not exist.", subsection.c_str(), qudaState.infile); + } + multigrid_param->geo_block_size[i][0] = kv.get(subsection, "geo_block_size[1]", multigrid_param->geo_block_size[i][0]); multigrid_param->geo_block_size[i][1] = kv.get(subsection, "geo_block_size[2]", multigrid_param->geo_block_size[i][1]); multigrid_param->geo_block_size[i][2] = kv.get(subsection, "geo_block_size[3]", multigrid_param->geo_block_size[i][2]); @@ -1449,7 +1482,6 @@ void* openQCD_qudaSolverReadIn(int id) strcpy(additional_prop->infile, qudaState.infile); additional_prop->id = id; additional_prop->mg_param = multigrid_param; - qudaState.layout.get_swfld_flags(&additional_prop->ud_swd_rev, &additional_prop->ad_swd_rev); additional_prop->u1csw = qudaState.layout.dirac_parms().u1csw; param->additional_prop = reinterpret_cast(additional_prop); @@ -1459,14 +1491,55 @@ void* openQCD_qudaSolverReadIn(int id) void* openQCD_qudaSolverGetHandle(int id) { - if (qudaState.handles[id] == nullptr) { - printfQuda("read in solver parameters from file %s for solver id=%d\n", - qudaState.infile, id); - qudaState.handles[id] = openQCD_qudaSolverReadIn(id); + void *ptr = id == -1 ? 
qudaState.dirac_handle : qudaState.handles[id]; + + if (ptr == nullptr) { + if (id != -1) { + logQuda(QUDA_VERBOSE, "Read in solver parameters from file %s for solver (id=%d)\n", + qudaState.infile, id); + } + ptr = openQCD_qudaSolverReadIn(id); } - openQCD_qudaSolverUpdate(qudaState.handles[id]); - return qudaState.handles[id]; + openQCD_qudaSolverUpdate(ptr); + return ptr; +} + + +void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p) +{ + QudaInvertParam param = newOpenQCDDiracParam(p); + + /* both fields reside on the CPU */ + param.input_location = QUDA_CPU_FIELD_LOCATION; + param.output_location = QUDA_CPU_FIELD_LOCATION; + + MatQuda(static_cast(dst), static_cast(src), ¶m); + + logQuda(QUDA_DEBUG_VERBOSE, "MatQuda()\n"); + logQuda(QUDA_DEBUG_VERBOSE, " gflops = %.2e\n", param.gflops); + logQuda(QUDA_DEBUG_VERBOSE, " secs = %.2e\n", param.secs); +} + + +void openQCD_qudaDw(double mu, void *in, void *out) +{ + if (gauge_field_get_unset()) { + errorQuda("Gauge field not populated in openQxD."); + } + + QudaInvertParam* param = static_cast(openQCD_qudaSolverGetHandle(-1)); + param->mu = mu; + + if (!openQCD_qudaInvertParamCheck(param)) { + errorQuda("QudaInvertParam struct check failed, parameters/fields between openQxD and QUDA are not in sync."); + } + + /* both fields reside on the CPU */ + param->input_location = QUDA_CPU_FIELD_LOCATION; + param->output_location = QUDA_CPU_FIELD_LOCATION; + + MatQuda(static_cast(out), static_cast(in), param); } @@ -1496,49 +1569,57 @@ template int hash_struct(T *in) int openQCD_qudaSolverGetHash(int id) { - QudaInvertParam* param = reinterpret_cast(openQCD_qudaSolverGetHandle(id)); - QudaInvertParam hparam = newQudaInvertParam(); - memset(&hparam, '\0', sizeof(QudaInvertParam)); /* set everything to zero */ - - /* Set some properties we want to take the hash over */ - hparam.inv_type = param->inv_type; - hparam.tol = param->tol; - hparam.tol_restart = param->tol_restart; - hparam.tol_hq = 
param->tol_hq; - hparam.maxiter = param->maxiter; - hparam.reliable_delta = param->reliable_delta; - hparam.solution_type = param->solution_type; - hparam.solve_type = param->solve_type; - hparam.matpc_type = param->matpc_type; - hparam.dagger = param->dagger; - hparam.mass_normalization = param->mass_normalization; - hparam.solver_normalization = param->solver_normalization; - hparam.cpu_prec = param->cpu_prec; - hparam.cuda_prec = param->cuda_prec; - hparam.use_init_guess = param->use_init_guess; - hparam.gcrNkrylov = param->gcrNkrylov; - - return hash_struct(&hparam); + if (qudaState.handles[id] != nullptr) { + QudaInvertParam* param = reinterpret_cast(qudaState.handles[id]); + QudaInvertParam hparam = newQudaInvertParam(); + memset(&hparam, '\0', sizeof(QudaInvertParam)); /* set everything to zero */ + + /* Set some properties we want to take the hash over */ + hparam.inv_type = param->inv_type; + hparam.tol = param->tol; + hparam.tol_restart = param->tol_restart; + hparam.tol_hq = param->tol_hq; + hparam.maxiter = param->maxiter; + hparam.reliable_delta = param->reliable_delta; + hparam.solution_type = param->solution_type; + hparam.solve_type = param->solve_type; + hparam.matpc_type = param->matpc_type; + hparam.dagger = param->dagger; + hparam.mass_normalization = param->mass_normalization; + hparam.solver_normalization = param->solver_normalization; + hparam.cpu_prec = param->cpu_prec; + hparam.cuda_prec = param->cuda_prec; + hparam.use_init_guess = param->use_init_guess; + hparam.gcrNkrylov = param->gcrNkrylov; + + return hash_struct(&hparam); + } else { + return 0; + } } void openQCD_qudaSolverPrintSetup(int id) { - QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(id)); - - if (param != nullptr) { + if (qudaState.handles[id] != nullptr) { + QudaInvertParam *param = static_cast(qudaState.handles[id]); openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); printQudaInvertParam(param); - printfQuda("additional_prop->infile 
= %s\n", additional_prop->infile); - printfQuda("additional_prop->id = %d\n", additional_prop->id); - printfQuda("additional_prop->mg_param = %p\n", additional_prop->mg_param); - printfQuda("additional_prop->ud_swd_rev = %d\n", additional_prop->ud_swd_rev); - printfQuda("additional_prop->ad_swd_rev = %d\n", additional_prop->ad_swd_rev); - printfQuda("additional_prop->u1csw = %.2e\n", additional_prop->u1csw); + printfQuda("additional_prop->infile = %s\n", additional_prop->infile); + printfQuda("additional_prop->id = %d\n", additional_prop->id); + printfQuda("additional_prop->mg_param = %p\n", additional_prop->mg_param); + printfQuda("additional_prop->u1csw = %.6e\n", additional_prop->u1csw); + printfQuda("additional_prop->mg_ud_rev = %d\n", additional_prop->mg_ud_rev); + printfQuda("additional_prop->mg_ad_rev = %d\n", additional_prop->mg_ad_rev); + printfQuda("additional_prop->mg_kappa = %.6e\n", additional_prop->mg_kappa); + printfQuda("additional_prop->mg_su3csw = %.6e\n", additional_prop->mg_su3csw); + printfQuda("additional_prop->mg_u1csw = %.6e\n", additional_prop->mg_u1csw); printfQuda("handle = %p\n", param); printfQuda("hash = %d\n", openQCD_qudaSolverGetHash(id)); + printfQuda("inv_type_precondition = %d\n", param->inv_type_precondition); + if (param->inv_type_precondition == QUDA_MG_INVERTER) { printQudaMultigridParam(additional_prop->mg_param); } @@ -1557,10 +1638,21 @@ double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int * QudaInvertParam* param = static_cast(openQCD_qudaSolverGetHandle(id)); param->mu = mu; - if (!openQCD_qudaSolverCheck(param)) { + if (!openQCD_qudaInvertParamCheck(param)) { errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync."); } + /** + * This is to make sure we behave in the same way as openQCDs solvers. We have + * to make sure that the SW-term in openQxD is setup and in sync with QUDAs. 
+ */ + if (qudaState.layout.h_sw != nullptr) { + qudaState.layout.h_sw(); + } else { + errorQuda("qudaState.layout.h_sw is not set."); + } + + logQuda(QUDA_VERBOSE, "Calling invertQuda() ...\n"); PUSH_RANGE("invertQuda",5); invertQuda(static_cast(solution), static_cast(source), param); @@ -1582,9 +1674,9 @@ double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int * void openQCD_qudaSolverDestroy(int id) { - QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(id)); + if (qudaState.handles[id] != nullptr) { + QudaInvertParam *param = static_cast(qudaState.handles[id]); - if (param != nullptr) { if (param->inv_type_precondition == QUDA_MG_INVERTER) { destroyMultigridQuda(param->preconditioner); } @@ -1673,12 +1765,8 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) /* transfer of the struct to all the processes */ MPI_Bcast((void*) param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); - if (solver_id == -1) { - param->invert_param = new QudaInvertParam(newQudaInvertParam()); - } else { - void *inv_param = openQCD_qudaSolverGetHandle(solver_id); - param->invert_param = static_cast(inv_param); - } + void *inv_param = openQCD_qudaSolverGetHandle(solver_id); + param->invert_param = static_cast(inv_param); param->invert_param->verbosity = std::max(param->invert_param->verbosity, verbosity); From d356fa090a590591b4f39b63a928337d7956b004 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Thu, 22 Feb 2024 19:46:36 +0100 Subject: [PATCH 138/148] removed ptr in getHandle --- include/quda_openqcd_interface.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 9ee41500bb..66ab0e4154 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -146,8 +146,11 @@ typedef struct { double swd_u1csw; /** U(1) csw coefficient corresponding to the current SW field in QUDA */ openQCD_QudaInitArgs_t init; 
openQCD_QudaLayout_t layout; + void* dirac_handle; /** void-pointer to QudaInvertParam struct for the Dirac operator. + * Notice that this void pointer HAS to be directly before + * handles[32], because it's possible to call + * openQCD_qudaSolverGetHandle with -1. */ void* handles[32]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ - void* dirac_handle; /** void-pointer to QudaInvertParam struct for the Dirac operator */ char infile[1024]; /** Path to the input file (if given to quda_init()) */ } openQCD_QudaState_t; From a80dd552ed15df1ead1606954ed3d6894582302d Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 23 Feb 2024 11:25:02 +0100 Subject: [PATCH 139/148] removed ptr in getHandle (2) --- lib/openqcd_interface.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index a13b9abcea..bce233d0fd 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -22,7 +22,7 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) -static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, {}, {}, {}, nullptr, '\0'}; +static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, {}, {}, nullptr, {}, '\0'}; using namespace quda; @@ -1491,18 +1491,16 @@ void* openQCD_qudaSolverReadIn(int id) void* openQCD_qudaSolverGetHandle(int id) { - void *ptr = id == -1 ? 
qudaState.dirac_handle : qudaState.handles[id]; - - if (ptr == nullptr) { + if (qudaState.handles[id] == nullptr) { if (id != -1) { logQuda(QUDA_VERBOSE, "Read in solver parameters from file %s for solver (id=%d)\n", qudaState.infile, id); } - ptr = openQCD_qudaSolverReadIn(id); + qudaState.handles[id] = openQCD_qudaSolverReadIn(id); } - openQCD_qudaSolverUpdate(ptr); - return ptr; + openQCD_qudaSolverUpdate(qudaState.handles[id]); + return qudaState.handles[id]; } From 466d702720ec9ef9dd4b87a186d254d2cc79ce32 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 23 Feb 2024 15:49:45 +0100 Subject: [PATCH 140/148] fixed clang complaining "error: suggest braces around initialization of subobject" --- lib/openqcd_interface.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index bce233d0fd..cdb22d8f09 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -22,7 +22,7 @@ #define MAX(a, b) ((a) > (b) ? 
(a) : (b)) -static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, {}, {}, nullptr, {}, '\0'}; +static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, {}, {}, nullptr, {}, ""}; using namespace quda; From c79cb4a6a6f564ec0b513f5df0d48ca2b3ceae8c Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 23 Feb 2024 15:50:34 +0100 Subject: [PATCH 141/148] in openQCD sometimes we test with gauge field all set to unity, then the Sw term is zero, but QUDA should not complain then --- include/clover_field_order.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/clover_field_order.h b/include/clover_field_order.h index 0187228dc5..cc9f5a6666 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -638,7 +638,11 @@ namespace quda { errorQuda("Accessor reconstruct = %d does not match field reconstruct %d", enable_reconstruct, clover.Reconstruct()); if (clover.max_element(is_inverse) == 0.0 && isFixed::value) +#ifdef BUILD_OPENQCD_INTERFACE + warningQuda("%p max_element(%d) appears unset", &clover, is_inverse); /* ignore if the SW-field is zero */ +#else errorQuda("%p max_element(%d) appears unset", &clover, is_inverse); +#endif if (clover.Diagonal() == 0.0 && clover.Reconstruct()) errorQuda("%p diagonal appears unset", &clover); this->clover = clover_ ? 
clover_ : clover.data(is_inverse); } From 6edc8638fc3cb3570a0bbd2ba29d07b6884d84ce Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 23 Feb 2024 15:57:22 +0100 Subject: [PATCH 142/148] remove trailing whitespaces --- include/gamma.cuh | 2 +- include/gauge_field_order.h | 2 +- include/kernels/copy_color_spinor.cuh | 4 ++-- include/quda_openqcd_interface.h | 2 +- lib/openqcd_interface.cpp | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/gamma.cuh b/include/gamma.cuh index 27961e7099..8ddcc0553b 100644 --- a/include/gamma.cuh +++ b/include/gamma.cuh @@ -27,7 +27,7 @@ namespace quda { __device__ __host__ inline int getcol(int row) const { if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS || - basis == QUDA_OPENQCD_GAMMA_BASIS) { + basis == QUDA_OPENQCD_GAMMA_BASIS) { switch(dir) { case 0: case 1: diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index f2c14223cb..ebb3e00f3e 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2464,7 +2464,7 @@ namespace quda { }; // class OpenQCDOrder } // namespace gauge - + template __device__ __host__ inline auto operator*(const gauge::fieldorder_wrapper &a, diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index c3b234e42c..7b53766080 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -112,7 +112,7 @@ namespace quda { } }; - /** Transform from openqcd into non-relativistic basis (a.k.a UKQCD basis): + /** Transform from openqcd into non-relativistic basis (a.k.a UKQCD basis): * gamma_ukqcd = U gamma_openqcd U^dagger with * U = [-1 0 1 0] [ 0 -1 0 1] @@ -142,7 +142,7 @@ namespace quda { } }; - /** Transform from non-relativistic (aka ukqcd) into openqcd basis: + /** Transform from non-relativistic (aka ukqcd) into openqcd basis: * gamma_ukqcd = U gamma_openqcd U^dagger with * U = [-1 0 1 0] * [ 0 -1 0 1] diff --git a/include/quda_openqcd_interface.h 
b/include/quda_openqcd_interface.h index 66ab0e4154..8ce318aea0 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -195,7 +195,7 @@ void openQCD_qudaFinalize(void); /** * Copy a spinor to GPU and back to CPU. - * + * * @param[in] h_in Spinor input field (from openQCD) * @param[out] h_out Spinor output field */ diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index cdb22d8f09..6d726b20fa 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -418,7 +418,7 @@ static lat_dim_t get_local_dims(int *fill = nullptr) * * @param[in] coords coords is the 4D cartesian coordinate of a rank * @param[in] fdata should point to an instance of qudaLayout.ranks, - * @see struct openQCD_QudaLayout_t in + * @see struct openQCD_QudaLayout_t in * @file include/quda_openqcd_interface.h * * @return rank From 568dd3734e4aa20b99e0e18bd1f7d344e5e5bb9a Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Fri, 23 Feb 2024 16:07:45 +0100 Subject: [PATCH 143/148] applied clang-format as described in https://github.com/lattice/quda/wiki/Coding-Conventions-and-Style#coding-style-and-clang-format --- include/clover_field_order.h | 47 +- include/color_spinor_field_order.h | 27 +- include/communicator_quda.h | 13 +- include/dirac_quda.h | 3 +- include/enum_quda.h | 2 +- include/enum_quda_fortran.h | 2 +- include/gamma.cuh | 166 ++-- include/gauge_field_order.h | 54 +- include/index_helper.cuh | 158 ++-- include/kernels/copy_color_spinor.cuh | 50 +- include/quda.h | 2 +- include/quda_openqcd_interface.h | 168 ++-- lib/checksum.cu | 4 +- lib/copy_color_spinor.cuh | 18 +- lib/copy_gauge_extended.cu | 2 - lib/copy_gauge_inc.cu | 1 - lib/dslash_gamma_helper.cu | 8 +- lib/extract_gauge_ghost.in.cu | 3 +- lib/extract_gauge_ghost_extended.cu | 3 +- lib/gauge_field.cpp | 10 +- lib/openqcd_interface.cpp | 1149 ++++++++++++------------- lib/targets/cuda/comm_target.cpp | 5 +- lib/targets/hip/comm_target.cpp | 5 +- 23 files changed, 891 
insertions(+), 1009 deletions(-) diff --git a/include/clover_field_order.h b/include/clover_field_order.h index cc9f5a6666..129227ccf1 100644 --- a/include/clover_field_order.h +++ b/include/clover_field_order.h @@ -1034,8 +1034,8 @@ namespace quda { const double coeff; const double csw; const double kappa; - const int dim[4]; // xyzt convention - const int L[4]; // txyz convention + const int dim[4]; // xyzt convention + const int L[4]; // txyz convention OpenQCDOrder(const CloverField &clover, bool inverse, Float *clover_ = nullptr, void * = nullptr) : volumeCB(clover.Stride()), @@ -1044,17 +1044,15 @@ namespace quda { epsilon2(clover.Epsilon2()), coeff(clover.Coeff()), csw(clover.Csw()), - kappa(clover.Coeff()/clover.Csw()), + kappa(clover.Coeff() / clover.Csw()), dim {clover.X()[0], clover.X()[1], clover.X()[2], clover.X()[3]}, // *local* lattice dimensions, xyzt - L {clover.X()[3], clover.X()[0], clover.X()[1], clover.X()[2]} // *local* lattice dimensions, txyz + L {clover.X()[3], clover.X()[0], clover.X()[1], clover.X()[2]} // *local* lattice dimensions, txyz { if (clover.Order() != QUDA_OPENQCD_CLOVER_ORDER) { errorQuda("Invalid clover order %d for this accessor", clover.Order()); } this->clover = clover_ ? 
clover_ : clover.data(inverse); - if (clover.Coeff() == 0.0 || clover.Csw() == 0.0) { - errorQuda("Neither coeff nor csw may be zero!"); - } + if (clover.Coeff() == 0.0 || clover.Csw() == 0.0) { errorQuda("Neither coeff nor csw may be zero!"); } } QudaTwistFlavorType TwistFlavor() const { return twist_flavor; } @@ -1074,8 +1072,8 @@ namespace quda { { int x_quda[4], x[4]; getCoords(x_quda, x_cb, dim, parity); // x_quda contains xyzt local Carthesian corrdinates - openqcd::rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate - return openqcd::ipt(x, L)*length; + openqcd::rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate + return openqcd::ipt(x, L) * length; } /** @@ -1085,35 +1083,32 @@ namespace quda { * @param x_cb The checkerboarded lattice site * @param parity The parity of the lattice site */ - __device__ __host__ inline void load(RegType v[length], int x_cb, int parity) const { - int sign[36] = {-1,-1,-1,-1,-1,-1, // diagonals (idx 0-5) - -1,+1,-1,+1,-1,-1,-1,-1,-1,-1, // column 0 (idx 6-15) - -1,+1,-1,-1,-1,-1,-1,-1, // column 1 (idx 16-23) - -1,-1,-1,-1,-1,-1, // column 2 (idx 24-29) - -1,+1,-1,+1, // column 3 (idx 30-33) - -1,+1}; // column 4 (idx 34-35) - int map[36] = {0,1,2,3,4,5, - 6,7,8,9,10,11,18,19,24,25, - 16,17,12,13,20,21,26,27, - 14,15,22,23,28,29, - 30,31,32,33, - 34,35}; - const int M = length/2; + __device__ __host__ inline void load(RegType v[length], int x_cb, int parity) const + { + int sign[36] = {-1, -1, -1, -1, -1, -1, // diagonals (idx 0-5) + -1, +1, -1, +1, -1, -1, -1, -1, -1, -1, // column 0 (idx 6-15) + -1, +1, -1, -1, -1, -1, -1, -1, // column 1 (idx 16-23) + -1, -1, -1, -1, -1, -1, // column 2 (idx 24-29) + -1, +1, -1, +1, // column 3 (idx 30-33) + -1, +1}; // column 4 (idx 34-35) + int map[36] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 24, 25, 16, 17, + 12, 13, 20, 21, 26, 27, 14, 15, 22, 23, 28, 29, 30, 31, 32, 33, 34, 35}; + const int M = length 
/ 2; int offset = getCloverOffset(x_cb, parity); auto Ap = &clover[offset]; // A_+ auto Am = &clover[offset + M]; // A_- #pragma unroll for (int i = 0; i < M; i++) { - v[ i] = sign[i]*(kappa*Am[map[i]] - (i<6)); - v[M + i] = sign[i]*(kappa*Ap[map[i]] - (i<6)); + v[i] = sign[i] * (kappa * Am[map[i]] - (i < 6)); + v[M + i] = sign[i] * (kappa * Ap[map[i]] - (i < 6)); } } // FIXME implement the save routine for OpenQCD ordered fields __device__ __host__ inline void save(RegType[length], int, int) const { } - size_t Bytes() const { return length*sizeof(Float); } + size_t Bytes() const { return length * sizeof(Float); } }; } // namespace clover diff --git a/include/color_spinor_field_order.h b/include/color_spinor_field_order.h index 89096e3202..b0d49a67d2 100644 --- a/include/color_spinor_field_order.h +++ b/include/color_spinor_field_order.h @@ -1823,7 +1823,7 @@ namespace quda using real = typename mapper::type; using complex = complex; - static const int length = 2*Ns*Nc; // 12 complex (2 floats) numbers per spinor color field + static const int length = 2 * Ns * Nc; // 12 complex (2 floats) numbers per spinor color field Float *field; size_t offset; Float *ghost[8]; @@ -1835,15 +1835,13 @@ namespace quda OpenQCDDiracOrder(const ColorSpinorField &a, int = 1, Float *field_ = 0, float * = 0) : field(field_ ? field_ : a.data()), - offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for?? + offset(a.Bytes() / (2 * sizeof(Float))), // TODO: What's this for?? 
volumeCB(a.VolumeCB()), nParity(a.SiteSubset()), dim {a.X(0), a.X(1), a.X(2), a.X(3)}, // *local* lattice dimensions, xyzt - L {a.X(3), a.X(0), a.X(1), a.X(2)} // *local* lattice dimensions, txyz + L {a.X(3), a.X(0), a.X(1), a.X(2)} // *local* lattice dimensions, txyz { - if constexpr (length != 24) { - errorQuda("Spinor field length %d not supported", length); - } + if constexpr (length != 24) { errorQuda("Spinor field length %d not supported", length); } } /** @@ -1859,20 +1857,20 @@ namespace quda { int x_quda[4], x[4]; getCoords(x_quda, x_cb, dim, parity); // x_quda contains xyzt local Carthesian corrdinates - openqcd::rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate - return openqcd::ipt(x, L)*length; + openqcd::rotate_coords(x_quda, x); // xyzt -> txyz, x = openQCD local Carthesian lattice coordinate + return openqcd::ipt(x, L) * length; } - __device__ __host__ inline void load(complex v[length/2], int x_cb, int parity = 0) const + __device__ __host__ inline void load(complex v[length / 2], int x_cb, int parity = 0) const { auto in = &field[getSpinorOffset(x_cb, parity)]; - block_load(v, reinterpret_cast(in)); + block_load(v, reinterpret_cast(in)); } - __device__ __host__ inline void save(const complex v[length/2], int x_cb, int parity = 0) const + __device__ __host__ inline void save(const complex v[length / 2], int x_cb, int parity = 0) const { auto out = &field[getSpinorOffset(x_cb, parity)]; - block_store(reinterpret_cast(out), v); + block_store(reinterpret_cast(out), v); } /** @@ -1889,10 +1887,7 @@ namespace quda return colorspinor_wrapper(*this, x_cb, parity); } - size_t Bytes() const - { - return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); - } + size_t Bytes() const { return nParity * volumeCB * Nc * Ns * 2 * sizeof(Float); } }; // openQCDDiracOrder } // namespace colorspinor diff --git a/include/communicator_quda.h b/include/communicator_quda.h index 6eb7c406ae..3facc48be7 100644 --- 
a/include/communicator_quda.h +++ b/include/communicator_quda.h @@ -130,24 +130,23 @@ namespace quda int shift_integer; int Nx_displacement = 0; - for (int i = QUDA_MAX_DIM-1; i >=0; i--) { + for (int i = QUDA_MAX_DIM - 1; i >= 0; i--) { // cstar shift[x] shift[y] shift[z] shift[t] // 0 0 0 0 0 // 1 0 0 0 0 // 2 0 1 0 0 // 3 0 1 1 0 - if(i < topo->ndim && ( - (i==1 && topo->cstar >= 2) || - (i==2 && topo->cstar >= 3) - )) { + if (i < topo->ndim && ((i == 1 && topo->cstar >= 2) || (i == 2 && topo->cstar >= 3))) { // if we go over the boundary and have a shifted boundary condition, // we shift Nx/2 ranks in x-direction: // shift_integer in { 0, 1, 2} // (shift_integer - 1) in {-1, 0, 1} shift_integer = (comm_coords(topo)[i] + displacement[i] + comm_dims(topo)[i]) / comm_dims(topo)[i]; - Nx_displacement += (shift_integer - 1) * (comm_dims(topo)[0]/2); + Nx_displacement += (shift_integer - 1) * (comm_dims(topo)[0] / 2); } - coords[i] = (i < topo->ndim) ? mod(comm_coords(topo)[i] + displacement[i] + (i==0 ? Nx_displacement :0), comm_dims(topo)[i]) : 0; + coords[i] = (i < topo->ndim) ? + mod(comm_coords(topo)[i] + displacement[i] + (i == 0 ? 
Nx_displacement : 0), comm_dims(topo)[i]) : + 0; } return comm_rank_from_coords(topo, coords); diff --git a/include/dirac_quda.h b/include/dirac_quda.h index 225f4e6567..fc84f72895 100644 --- a/include/dirac_quda.h +++ b/include/dirac_quda.h @@ -2606,8 +2606,7 @@ namespace quda { && (pc_type == QUDA_MATPC_EVEN_EVEN || pc_type == QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)) return true; - if (dirac_type == QUDA_WILSON_DIRAC || dirac_type == QUDA_CLOVER_DIRAC) - return true; + if (dirac_type == QUDA_WILSON_DIRAC || dirac_type == QUDA_CLOVER_DIRAC) return true; return false; } diff --git a/include/enum_quda.h b/include/enum_quda.h index f6eb33276f..a9b822da34 100644 --- a/include/enum_quda.h +++ b/include/enum_quda.h @@ -51,7 +51,7 @@ typedef enum QudaGaugeFieldOrder_s { QUDA_BQCD_GAUGE_ORDER, /* expect *gauge, mu, even-odd, spacetime+halos, column-row order */ QUDA_TIFR_GAUGE_ORDER, /* expect *gauge, mu, even-odd, spacetime, column-row order */ QUDA_TIFR_PADDED_GAUGE_ORDER, /* expect *gauge, mu, parity, t, z+halo, y, x/2, column-row order */ - QUDA_OPENQCD_GAUGE_ORDER, /* expect *gauge, spacetime, mu, parity row-column order -- links attached to odd points only */ + QUDA_OPENQCD_GAUGE_ORDER, /* expect *gauge, spacetime, mu, parity row-column order -- links attached to odd points only */ QUDA_INVALID_GAUGE_ORDER = QUDA_INVALID_ENUM } QudaGaugeFieldOrder; diff --git a/include/enum_quda_fortran.h b/include/enum_quda_fortran.h index 5de7760732..802df42fc4 100644 --- a/include/enum_quda_fortran.h +++ b/include/enum_quda_fortran.h @@ -228,7 +228,7 @@ #define QUDA_CPS_WILSON_DIRAC_ORDER 4 // odd-even color inside spin #define QUDA_LEX_DIRAC_ORDER 5 // lexicographical order color inside spin #define QUDA_TIFR_PADDED_DIRAC_ORDER 6 -#define QUDA_OPENQCD_DIRAC_ORDER 7 // openqcd +#define QUDA_OPENQCD_DIRAC_ORDER 7 // openqcd #define QUDA_INVALID_DIRAC_ORDER QUDA_INVALID_ENUM #define QudaCloverFieldOrder integer(4) diff --git a/include/gamma.cuh b/include/gamma.cuh index 
8ddcc0553b..d5c82e54e3 100644 --- a/include/gamma.cuh +++ b/include/gamma.cuh @@ -26,9 +26,8 @@ namespace quda { Gamma(const Gamma &g) = default; __device__ __host__ inline int getcol(int row) const { - if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS || - basis == QUDA_OPENQCD_GAMMA_BASIS) { - switch(dir) { + if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS || basis == QUDA_OPENQCD_GAMMA_BASIS) { + switch(dir) { case 0: case 1: switch(row) { @@ -57,7 +56,7 @@ namespace quda { break; } } else { - switch(dir) { + switch(dir) { case 0: case 1: switch(row) { @@ -203,59 +202,49 @@ namespace quda { break; } } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) { - switch(dir) { - case 0: /* corresponds to gamma1 in OpenQCD convention */ - switch(row) { - case 0: - case 1: - return -I; - case 2: - case 3: - return I; - } - break; - case 1: /* gamma2 in openQCD */ - switch(row) { - case 0: - case 3: - return -1; - case 1: - case 2: - return 1; - } - break; - case 2: /* gamma3 in openQCD */ - switch(row) { - case 0: - case 3: - return -I; - case 1: - case 2: - return I; - } - break; - case 3: /* gamma0 in openQCD */ - switch(row) { - case 0: - case 1: - case 2: - case 3: - return -1; - } - break; - case 4: /* gamma5 in openQCD */ - switch(row) { - case 0: - case 1: - return 1; - case 2: - case 3: - return -1; - } - break; - } - } - + switch (dir) { + case 0: /* corresponds to gamma1 in OpenQCD convention */ + switch (row) { + case 0: + case 1: return -I; + case 2: + case 3: return I; + } + break; + case 1: /* gamma2 in openQCD */ + switch (row) { + case 0: + case 3: return -1; + case 1: + case 2: return 1; + } + break; + case 2: /* gamma3 in openQCD */ + switch (row) { + case 0: + case 3: return -I; + case 1: + case 2: return I; + } + break; + case 3: /* gamma0 in openQCD */ + switch (row) { + case 0: + case 1: + case 2: + case 3: return -1; + } + break; + case 4: /* gamma5 in openQCD */ + switch (row) { + case 0: + case 1: return 1; + case 2: + case 3: return -1; + } + break; + } + } 
return 0; } @@ -339,38 +328,49 @@ namespace quda { break; } } else if (basis == QUDA_OPENQCD_GAMMA_BASIS) { - switch(dir) { - case 0: /* gamma1 in openQCD convention */ - switch(row) { - case 0: case 1: return complex(a.imag(), -a.real()); // I - case 2: case 3: return complex(-a.imag(), a.real()); // -I - } - break; - case 1: /* gamma2 in openQCD */ - switch(row) { - case 0: case 3: return -a; - case 1: case 2: return a; + switch (dir) { + case 0: /* gamma1 in openQCD convention */ + switch (row) { + case 0: + case 1: return complex(a.imag(), -a.real()); // I + case 2: + case 3: return complex(-a.imag(), a.real()); // -I } break; - case 2: /* gamma3 in openQCD */ - switch(row) { - case 0: case 3: return complex(a.imag(), -a.real()); // I - case 1: case 2: return complex(-a.imag(), a.real()); // -I + case 1: /* gamma2 in openQCD */ + switch (row) { + case 0: + case 3: return -a; + case 1: + case 2: return a; } break; - case 3: /* gamma0 in openQCD */ - switch(row) { - case 0: case 1: case 2: case 3: return -a; + case 2: /* gamma3 in openQCD */ + switch (row) { + case 0: + case 3: return complex(a.imag(), -a.real()); // I + case 1: + case 2: return complex(-a.imag(), a.real()); // -I } break; - case 4: /* gamma5 in openQCD */ - switch(row) { - case 0: case 1: return a; - case 2: case 3: return -a; - } - break; - } - } + case 3: /* gamma0 in openQCD */ + switch (row) { + case 0: + case 1: + case 2: + case 3: return -a; + } + break; + case 4: /* gamma5 in openQCD */ + switch (row) { + case 0: + case 1: return a; + case 2: + case 3: return -a; + } + break; + } + } return a; } diff --git a/include/gauge_field_order.h b/include/gauge_field_order.h index ebb3e00f3e..aa97f9e73c 100644 --- a/include/gauge_field_order.h +++ b/include/gauge_field_order.h @@ -2347,7 +2347,6 @@ namespace quda { size_t Bytes() const { return Nc * Nc * 2 * sizeof(Float); } }; - /** * struct to define order of gauge fields in OpenQCD */ @@ -2368,13 +2367,11 @@ namespace quda { LegacyOrder(u, 
ghost_), gauge(gauge_ ? gauge_ : (Float *)u.data()), // pointer to the gauge field on CPU volumeCB(u.VolumeCB()), // Volume and VolumeCB refer to the global lattice, if VolumeLocal, then local lattice - dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]}, // *local* lattice dimensions, xyzt - L {u.X()[3], u.X()[0], u.X()[1], u.X()[2]}, // *local* lattice dimensions, txyz + dim {u.X()[0], u.X()[1], u.X()[2], u.X()[3]}, // *local* lattice dimensions, xyzt + L {u.X()[3], u.X()[0], u.X()[1], u.X()[2]}, // *local* lattice dimensions, txyz nproc {comm_dim(3), comm_dim(0), comm_dim(1), comm_dim(2)} // txyz { - if constexpr (length != 18) { - errorQuda("Gauge field length %d not supported", length); - } + if constexpr (length != 18) { errorQuda("Gauge field length %d not supported", length); } } /** @@ -2389,10 +2386,11 @@ namespace quda { * * @return The offset. */ - __device__ __host__ inline int getGaugeOffset_lexi(int x_cb, int dir, int parity) const { + __device__ __host__ inline int getGaugeOffset_lexi(int x_cb, int dir, int parity) const + { int x[4]; getCoords(x, x_cb, dim, parity); - return (4*openqcd::lexi(x, dim, 4) + dir)*length; + return (4 * openqcd::lexi(x, dim, 4) + dir) * length; } /** @@ -2405,41 +2403,42 @@ namespace quda { * * @return The offset. 
*/ - __device__ __host__ inline int getGaugeOffset(int x_cb, int dir, int parity) const { + __device__ __host__ inline int getGaugeOffset(int x_cb, int dir, int parity) const + { int quda_x[4], x[4]; getCoords(quda_x, x_cb, dim, parity); // x_quda = quda local lattice coordinates - openqcd::rotate_coords(quda_x, x); // x = openQCD local lattice coordinates + openqcd::rotate_coords(quda_x, x); // x = openQCD local lattice coordinates - int mu = (dir+1) % 4; // mu = openQCD direction + int mu = (dir + 1) % 4; // mu = openQCD direction int ix = openqcd::ipt(x, L); int iz = openqcd::iup(x, mu, L, nproc); int ofs = 0; int volume = openqcd::vol(L); - if (ix < volume/2) { // ix even -> iz odd - if (iz < volume) { // iz in interior - ofs = 8*(iz - volume/2) + 2*mu + 1; + if (ix < volume / 2) { // ix even -> iz odd + if (iz < volume) { // iz in interior + ofs = 8 * (iz - volume / 2) + 2 * mu + 1; } else { - int ib = iz - volume - openqcd::ifc(L, nproc, mu) - openqcd::bndry(L, nproc)/2; // iz in exterior - ofs = 4*volume + openqcd::face_offset(L, nproc, mu) + ib; + int ib = iz - volume - openqcd::ifc(L, nproc, mu) - openqcd::bndry(L, nproc) / 2; // iz in exterior + ofs = 4 * volume + openqcd::face_offset(L, nproc, mu) + ib; } - } else if (volume/2 <= ix && ix < volume) { // ix odd - ofs = 8*(ix - volume/2) + 2*mu; + } else if (volume / 2 <= ix && ix < volume) { // ix odd + ofs = 8 * (ix - volume / 2) + 2 * mu; } - return ofs*length; + return ofs * length; } - __device__ __host__ inline void load(complex v[length/2], int x_cb, int dir, int parity, Float = 1.0) const + __device__ __host__ inline void load(complex v[length / 2], int x_cb, int dir, int parity, Float = 1.0) const { auto in = &gauge[getGaugeOffset(x_cb, dir, parity)]; - block_load(v, reinterpret_cast(in)); + block_load(v, reinterpret_cast(in)); } - __device__ __host__ inline void save(const complex v[length/2], int x_cb, int dir, int parity) const + __device__ __host__ inline void save(const complex v[length / 2], 
int x_cb, int dir, int parity) const { auto out = &gauge[getGaugeOffset_lexi(x_cb, dir, parity)]; - block_store(reinterpret_cast(out), v); + block_store(reinterpret_cast(out), v); } /** @@ -2457,10 +2456,7 @@ namespace quda { return gauge_wrapper(const_cast(*this), dim, x_cb, parity); } - size_t Bytes() const - { - return 2*Nc*Nc*sizeof(Float); - } + size_t Bytes() const { return 2 * Nc * Nc * sizeof(Float); } }; // class OpenQCDOrder } // namespace gauge @@ -2607,6 +2603,8 @@ namespace quda { template struct gauge_order_mapper { typedef gauge::TIFROrder type; }; template struct gauge_order_mapper { typedef gauge::TIFRPaddedOrder type; }; template struct gauge_order_mapper { typedef gauge::FloatNOrder type; }; - template struct gauge_order_mapper { typedef gauge::OpenQCDOrder type; }; + template struct gauge_order_mapper { + typedef gauge::OpenQCDOrder type; + }; } // namespace quda diff --git a/include/index_helper.cuh b/include/index_helper.cuh index e3c2293667..b0696177a2 100644 --- a/include/index_helper.cuh +++ b/include/index_helper.cuh @@ -1106,7 +1106,6 @@ namespace quda { return (((x[3]*X[2] + x[2])*X[1] + x[1])*X[0] + x[0]) >> 1; } - /** * These are index helper functions used in the order classes of openQCD, i.e. * @@ -1119,7 +1118,8 @@ namespace quda { * openqcd:include/global.h) that are needed to calculate the correct offsets * of the fields base pointers. 
*/ - namespace openqcd { + namespace openqcd + { /** * @brief Returns the surface in direction mu @@ -1131,17 +1131,16 @@ namespace quda { */ __device__ __host__ inline int surface(const int *X, const int mu) { - if (mu==0) { - return X[1]*X[2]*X[3]; - } else if (mu==1) { - return X[0]*X[2]*X[3]; - } else if (mu==2) { - return X[0]*X[1]*X[3]; + if (mu == 0) { + return X[1] * X[2] * X[3]; + } else if (mu == 1) { + return X[0] * X[2] * X[3]; + } else if (mu == 2) { + return X[0] * X[1] * X[3]; } - return X[0]*X[1]*X[2]; + return X[0] * X[1] * X[2]; } - /** * @brief Return BNDRY (see openqcd:include/global.h) * @@ -1152,14 +1151,11 @@ namespace quda { */ __device__ __host__ inline int bndry(const int *L, const int *nproc) { - return 2*(((1-(nproc[0]%2))*surface(L, 0)) - + ((1-(nproc[1]%2))*surface(L, 1)) - + ((1-(nproc[2]%2))*surface(L, 2)) - + ((1-(nproc[3]%2))*surface(L, 3)) - ); + return 2 + * (((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1)) + + ((1 - (nproc[2] % 2)) * surface(L, 2)) + ((1 - (nproc[3] % 2)) * surface(L, 3))); } - /** * @brief Calculate the offset needed for boundary points in openQCD. 
* @@ -1171,23 +1167,18 @@ namespace quda { */ __device__ __host__ inline int ifc(const int *L, const int *nproc, const int mu) { - if (mu==0) { - return ((1-(nproc[0]%2))*surface(L, 0))/2; - } else if (mu==1) { - return ((1-(nproc[0]%2))*surface(L, 0)) - + (((1-(nproc[1]%2))*surface(L, 1))/2); - } else if (mu==2) { - return ((1-(nproc[0]%2))*surface(L, 0)) - + ((1-(nproc[1]%2))*surface(L, 1)) - + (((1-(nproc[2]%2))*surface(L, 2))/2); + if (mu == 0) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2; + } else if (mu == 1) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) + (((1 - (nproc[1] % 2)) * surface(L, 1)) / 2); + } else if (mu == 2) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1)) + + (((1 - (nproc[2] % 2)) * surface(L, 2)) / 2); } - return ((1-(nproc[0]%2))*surface(L, 0)) - + ((1-(nproc[1]%2))*surface(L, 1)) - + ((1-(nproc[2]%2))*surface(L, 2)) - + (((1-(nproc[3]%2))*surface(L, 3))/2); + return ((1 - (nproc[0] % 2)) * surface(L, 0)) + ((1 - (nproc[1] % 2)) * surface(L, 1)) + + ((1 - (nproc[2] % 2)) * surface(L, 2)) + (((1 - (nproc[3] % 2)) * surface(L, 3)) / 2); } - /** * @brief Calculate the offset of the faces in openQCD. 
* @@ -1199,20 +1190,17 @@ namespace quda { */ __device__ __host__ inline int face_offset(const int *L, const int *nproc, const int mu) { - if (mu==0) { + if (mu == 0) { return 0; - } else if (mu==1) { - return ((1-(nproc[0]%2))*surface(L, 0))/2; - } else if (mu==2) { - return ((1-(nproc[0]%2))*surface(L, 0))/2 - + ((1-(nproc[1]%2))*surface(L, 1))/2; + } else if (mu == 1) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2; + } else if (mu == 2) { + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2 + ((1 - (nproc[1] % 2)) * surface(L, 1)) / 2; } - return ((1-(nproc[0]%2))*surface(L, 0))/2 - + ((1-(nproc[1]%2))*surface(L, 1))/2 - + ((1-(nproc[2]%2))*surface(L, 2))/2; + return ((1 - (nproc[0] % 2)) * surface(L, 0)) / 2 + ((1 - (nproc[1] % 2)) * surface(L, 1)) / 2 + + ((1 - (nproc[2] % 2)) * surface(L, 2)) / 2; } - /** * @brief Rotate coordinates (xyzt -> txyz) * @@ -1229,7 +1217,6 @@ namespace quda { x_openQCD[0] = x_quda[3]; } - /** * @brief Generate a lexicographical index with x[Ndims-1] running * fastest, for example if Ndims=4: @@ -1245,14 +1232,11 @@ namespace quda { { int i, ix = x[0]; - #pragma unroll - for (i=1; i +VOLUME/2 + return (lexi(xb, cbs, 4) / 2 + vol(cbs) * lexi(xn, cbn, 4) / 2 + + ((x[0] + x[1] + x[2] + x[3]) % 2 != 0) * (vol(L) / 2) // odd -> +VOLUME/2 ); } - /** * @brief Determines the number of boundary points in direction mu prior to * the Carthesian index x with dimensions X @@ -1346,22 +1322,22 @@ namespace quda { { int ret = 0; - if (mu==3) { + if (mu == 3) { ret = lexi(x, X, 3); // lexi without x[3] - } else if (mu==2) { - ret = X[3]*lexi(x, X, 2); - if (x[2]==(X[2]-1)) { + } else if (mu == 2) { + ret = X[3] * lexi(x, X, 2); + if (x[2] == (X[2] - 1)) { ret += x[3]; // lexi without x[2] } - } else if (mu==1) { - if (x[1]==(X[1]-1)) { - ret = X[2]*X[3]*x[0] + X[3]*x[2] + x[3]; // lexi without x[1] + } else if (mu == 1) { + if (x[1] == (X[1] - 1)) { + ret = X[2] * X[3] * x[0] + X[3] * x[2] + x[3]; // lexi without x[1] } else { ret = 
surface(X, 1); } - } else if (mu==0) { - if (x[0]==(X[0]-1)) { - ret = lexi(x+1, X+1, 3); // lexi without x[0] + } else if (mu == 0) { + if (x[0] == (X[0] - 1)) { + ret = lexi(x + 1, X + 1, 3); // lexi without x[0] } else { ret = surface(X, 0); } @@ -1370,7 +1346,6 @@ namespace quda { return ret; } - /** * @brief Pure implementation of iup[ix][mu]. Returns neighbouring * point of ix in positive mu direction. @@ -1387,42 +1362,37 @@ namespace quda { { int i, ret, xb[4], xn[4]; - if ((x[mu]==(L[mu]-1))&&(nproc[mu]>1)) { + if ((x[mu] == (L[mu] - 1)) && (nproc[mu] > 1)) { int cbs[4] = {setup_cbs(0, L), setup_cbs(1, L), setup_cbs(2, L), setup_cbs(3, L)}; - int cbn[4] = {L[0]/cbs[0], L[1]/cbs[1], L[2]/cbs[2], L[3]/cbs[3]}; + int cbn[4] = {L[0] / cbs[0], L[1] / cbs[1], L[2] / cbs[2], L[3] / cbs[3]}; xb[0] = x[0] % cbs[0]; xb[1] = x[1] % cbs[1]; xb[2] = x[2] % cbs[2]; xb[3] = x[3] % cbs[3]; - xn[0] = x[0]/cbs[0]; - xn[1] = x[1]/cbs[1]; - xn[2] = x[2]/cbs[2]; - xn[3] = x[3]/cbs[3]; + xn[0] = x[0] / cbs[0]; + xn[1] = x[1] / cbs[1]; + xn[2] = x[2] / cbs[2]; + xn[3] = x[3] / cbs[3]; ret = vol(L) + ifc(L, nproc, mu); - if ((x[0]+x[1]+x[2]+x[3]) % 2 == 0) { - ret += bndry(L, nproc)/2; - } + if ((x[0] + x[1] + x[2] + x[3]) % 2 == 0) { ret += bndry(L, nproc) / 2; } - ret += surface(cbs, mu)*boundary_pts(mu, xn, cbn)/2; - ret += boundary_pts(mu, xb, cbs)/2; + ret += surface(cbs, mu) * boundary_pts(mu, xn, cbn) / 2; + ret += boundary_pts(mu, xb, cbs) / 2; return ret; } else { - #pragma unroll - for (i=0; i<4; i++) { - xb[i] = x[i]; - } +#pragma unroll + for (i = 0; i < 4; i++) { xb[i] = x[i]; } - xb[mu] = (xb[mu] + 1) % (L[mu]*nproc[mu]); + xb[mu] = (xb[mu] + 1) % (L[mu] * nproc[mu]); return ipt(xb, L); } } - } // namespace openqcd } // namespace quda diff --git a/include/kernels/copy_color_spinor.cuh b/include/kernels/copy_color_spinor.cuh index 7b53766080..891f4b7122 100644 --- a/include/kernels/copy_color_spinor.cuh +++ b/include/kernels/copy_color_spinor.cuh @@ -120,23 +120,20 
@@ namespace quda { [ 0 1 0 1] / sqrt(2), * see https://github.com/JeffersonLab/chroma/blob/master/docs/notes/gamma_conventions.tex for further notes. */ - template - struct ReverseOpenqcdBasis { + template struct ReverseOpenqcdBasis { template - __device__ __host__ inline void operator()(complex out[Ns*Nc], const complex in[Ns*Nc]) const { + __device__ __host__ inline void operator()(complex out[Ns * Nc], const complex in[Ns * Nc]) const + { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; - FloatOut K1[4] = {static_cast(-kP), - static_cast(-kP), - static_cast(kP), - static_cast(kP)}; - FloatOut K2[4] = {static_cast(kP), - static_cast(kP), - static_cast(kP), - static_cast(kP)}; - for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); + FloatOut K1[4] + = {static_cast(-kP), static_cast(-kP), static_cast(kP), static_cast(kP)}; + FloatOut K2[4] + = {static_cast(kP), static_cast(kP), static_cast(kP), static_cast(kP)}; + for (int s = 0; s < Ns; s++) { + for (int c = 0; c < Nc; c++) { + out[s * Nc + c] = K1[s] * static_cast>(in[s1[s] * Nc + c]) + + K2[s] * static_cast>(in[s2[s] * Nc + c]); } } } @@ -149,24 +146,21 @@ namespace quda { * [ 1 0 1 0] * [ 0 1 0 1] / sqrt(2) */ - template - struct OpenqcdBasis { + template struct OpenqcdBasis { template - __device__ __host__ inline void operator()(complex out[Ns*Nc], const complex in[Ns*Nc]) const { + __device__ __host__ inline void operator()(complex out[Ns * Nc], const complex in[Ns * Nc]) const + { int s1[4] = {0, 1, 0, 1}; int s2[4] = {2, 3, 2, 3}; - FloatOut K1[4] = {static_cast(-kU), - static_cast(-kU), - static_cast(kU), - static_cast(kU)}; - FloatOut K2[4] = {static_cast(kU), - static_cast(kU), - static_cast(kU), - static_cast(kU)}; - for (int s=0; s >(in[s1[s]*Nc+c]) + K2[s]*static_cast >(in[s2[s]*Nc+c]); + FloatOut K1[4] + = {static_cast(-kU), static_cast(-kU), static_cast(kU), static_cast(kU)}; + FloatOut K2[4] + = {static_cast(kU), static_cast(kU), static_cast(kU), static_cast(kU)}; + for (int 
s = 0; s < Ns; s++) { + for (int c = 0; c < Nc; c++) { + out[s * Nc + c] = K1[s] * static_cast>(in[s1[s] * Nc + c]) + + K2[s] * static_cast>(in[s2[s] * Nc + c]); } } } diff --git a/include/quda.h b/include/quda.h index a9e8ec1cb2..c2f5b9da53 100644 --- a/include/quda.h +++ b/include/quda.h @@ -1744,7 +1744,7 @@ extern "C" { void destroyDeflationQuda(void *df_instance); void setMPICommHandleQuda(void *mycomm); - + /* Parameter set for quark smearing operations */ typedef struct QudaQuarkSmearParam_s { /*-------------------------------------------------*/ diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 8ce318aea0..4db29e04a0 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -12,9 +12,8 @@ * "double _Complex" data types exposed in quda.h. */ -typedef struct -{ - double re,im; +typedef struct { + double re, im; } openqcd_complex_dble; #ifdef __CUDACC_RTC__ @@ -54,38 +53,32 @@ typedef struct extern "C" { #endif - - /** * Copied from flags.h * ############################################# */ #ifndef FLAGS_H -typedef struct -{ - int type; - int cstar; - double phi3[2][3]; - double phi1[2]; +typedef struct { + int type; + int cstar; + double phi3[2][3]; + double phi1[2]; } bc_parms_t; -typedef struct -{ - int qhat; - double m0,su3csw,u1csw,cF[2],theta[3]; +typedef struct { + int qhat; + double m0, su3csw, u1csw, cF[2], theta[3]; } dirac_parms_t; -typedef struct -{ - int gauge; - int nfl; +typedef struct { + int gauge; + int nfl; } flds_parms_t; #endif /** * ############################################# */ - typedef enum OpenQCDGaugeGroup_s { OPENQCD_GAUGE_SU3 = 1, OPENQCD_GAUGE_U1 = 2, @@ -93,90 +86,84 @@ typedef enum OpenQCDGaugeGroup_s { OPENQCD_GAUGE_INVALID = QUDA_INVALID_ENUM } OpenQCDGaugeGroup; - /** * Parameters related to problem size and machine topology. They should hold the * numbers in quda format, i.e. xyzt convention. For example L[0] = L1, L[1] = * L2, ... 
*/ typedef struct { - int L[4]; /** Local lattice dimensions L1, L2, L3, L0 */ - int nproc[4]; /** Machine grid size NPROC1, NPROC2, NPROC3, NPROC0*/ - int nproc_blk[4]; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK, - is assumed to be [1, 1, 1, 1] */ - int N[4]; /** Glocal lattice dimensions N1, N2, N3, N3 */ - int device; /** GPU device number */ - int cstar; /** number of cstar directions, equals bc_cstar() */ - int *data; /** rank topology, length 5 + NPROC1*NPROC2*NPROC3*NPROC0: - data[0] = cstar; - data[1+i] = nproc[i] for 0 <= i < 4 - data[5+lex(ix,iy,iz,it)] returns rank number in - openQCD, where lex stands for lexicographical - indexing (in QUDA order (xyzt)) */ - bc_parms_t (*bc_parms)(void); /** @see bc_parms() */ - flds_parms_t (*flds_parms)(void); /** @see flds_parms() */ - dirac_parms_t (*dirac_parms)(void); /** @see dirac_parms() */ - void* (*h_gauge)(void); /** function to return a pointer to the gauge field */ - void* (*h_sw)(void); /** function to return a pointer to the updated Clover field */ - void (*get_gfld_flags)(int *ud, int *ad); /** function pointer to gauge field revision query function */ + int L[4]; /** Local lattice dimensions L1, L2, L3, L0 */ + int nproc[4]; /** Machine grid size NPROC1, NPROC2, NPROC3, NPROC0*/ + int nproc_blk[4]; /** Blocking size NPROC0_BLK, NPROC1_BLK, NPROC2_BLK, NPROC3_BLK, + is assumed to be [1, 1, 1, 1] */ + int N[4]; /** Glocal lattice dimensions N1, N2, N3, N3 */ + int device; /** GPU device number */ + int cstar; /** number of cstar directions, equals bc_cstar() */ + int *data; /** rank topology, length 5 + NPROC1*NPROC2*NPROC3*NPROC0: + data[0] = cstar; + data[1+i] = nproc[i] for 0 <= i < 4 + data[5+lex(ix,iy,iz,it)] returns rank number in + openQCD, where lex stands for lexicographical + indexing (in QUDA order (xyzt)) */ + bc_parms_t (*bc_parms)(void); /** @see bc_parms() */ + flds_parms_t (*flds_parms)(void); /** @see flds_parms() */ + dirac_parms_t (*dirac_parms)(void); /** @see 
dirac_parms() */ + void *(*h_gauge)(void); /** function to return a pointer to the gauge field */ + void *(*h_sw)(void); /** function to return a pointer to the updated Clover field */ + void (*get_gfld_flags)(int *ud, int *ad); /** function pointer to gauge field revision query function */ } openQCD_QudaLayout_t; - /** * Parameters used to create a QUDA context. */ typedef struct { - QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */ - FILE *logfile; /** log file handler */ - void *gauge; /** base pointer to the gauge fields */ - int volume; /** VOLUME */ - int bndry; /** BNDRY */ + QudaVerbosity verbosity; /** How verbose QUDA should be (QUDA_SILENT, QUDA_VERBOSE or QUDA_SUMMARIZE) */ + FILE *logfile; /** log file handler */ + void *gauge; /** base pointer to the gauge fields */ + int volume; /** VOLUME */ + int bndry; /** BNDRY */ void (*reorder_gauge_quda_to_openqcd)(void *in, void *out); } openQCD_QudaInitArgs_t; - typedef struct { - int initialized; /** Whether openQCD_qudaInit() was called or not */ - int ud_rev; /** Revision of ud field from openqxd */ - int ad_rev; /** Revision of ad field from openqxd */ - int swd_ud_rev; /** Revision of ud field used to calc/transfer the SW field from openqxd */ - int swd_ad_rev; /** Revision of ad field used to calc/transfer the SW field from openqxd */ - double swd_kappa; /** kappa corresponding to the current SW field in QUDA */ - double swd_su3csw; /** SU(3) csw coefficient corresponding to the current SW field in QUDA */ - double swd_u1csw; /** U(1) csw coefficient corresponding to the current SW field in QUDA */ + int initialized; /** Whether openQCD_qudaInit() was called or not */ + int ud_rev; /** Revision of ud field from openqxd */ + int ad_rev; /** Revision of ad field from openqxd */ + int swd_ud_rev; /** Revision of ud field used to calc/transfer the SW field from openqxd */ + int swd_ad_rev; /** Revision of ad field used to calc/transfer the SW field 
from openqxd */ + double swd_kappa; /** kappa corresponding to the current SW field in QUDA */ + double swd_su3csw; /** SU(3) csw coefficient corresponding to the current SW field in QUDA */ + double swd_u1csw; /** U(1) csw coefficient corresponding to the current SW field in QUDA */ openQCD_QudaInitArgs_t init; openQCD_QudaLayout_t layout; - void* dirac_handle; /** void-pointer to QudaInvertParam struct for the Dirac operator. + void *dirac_handle; /** void-pointer to QudaInvertParam struct for the Dirac operator. * Notice that this void pointer HAS to be directly before * handles[32], because it's possible to call * openQCD_qudaSolverGetHandle with -1. */ - void* handles[32]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ + void *handles[32]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ char infile[1024]; /** Path to the input file (if given to quda_init()) */ } openQCD_QudaState_t; - typedef struct openQCD_QudaSolver_s { - char infile[1024]; /** Path to the input file (if given to quda_init()) */ - int id; /** Solver section identifier in the input file */ - QudaMultigridParam* mg_param; /** Pointer to the multigrid param struct */ - double u1csw; /** u1csw property */ - int mg_ud_rev; /** Revision of ud field from openqxd */ - int mg_ad_rev; /** Revision of ad field from openqxd */ - double mg_kappa; /** kappa corresponding to the current mg-instance in QUDA */ - double mg_su3csw; /** SU(3) csw coefficient corresponding to the current mg-instance in QUDA */ - double mg_u1csw; /** U(1) csw coefficient corresponding to the current mg-instance in QUDA */ + char infile[1024]; /** Path to the input file (if given to quda_init()) */ + int id; /** Solver section identifier in the input file */ + QudaMultigridParam *mg_param; /** Pointer to the multigrid param struct */ + double u1csw; /** u1csw property */ + int mg_ud_rev; /** Revision of ud field from openqxd */ + int mg_ad_rev; /** Revision of ad field from 
openqxd */ + double mg_kappa; /** kappa corresponding to the current mg-instance in QUDA */ + double mg_su3csw; /** SU(3) csw coefficient corresponding to the current mg-instance in QUDA */ + double mg_u1csw; /** U(1) csw coefficient corresponding to the current mg-instance in QUDA */ } openQCD_QudaSolver; - typedef struct { - double kappa; /* kappa: hopping parameter */ - double mu; /* mu: twisted mass */ - double su3csw; /* su3csw: csw coefficient for SU(3) fields */ - double u1csw; /* u1csw: csw coefficient for U(1) fields, quda doesn't respect that parameter (yet) */ - int qhat; /* qhat: quda doesn't respect that parameter (yet) */ + double kappa; /* kappa: hopping parameter */ + double mu; /* mu: twisted mass */ + double su3csw; /* su3csw: csw coefficient for SU(3) fields */ + double u1csw; /* u1csw: csw coefficient for U(1) fields, quda doesn't respect that parameter (yet) */ + int qhat; /* qhat: quda doesn't respect that parameter (yet) */ } openQCD_QudaDiracParam_t; - /** * Initialize the QUDA context. * @@ -186,13 +173,11 @@ typedef struct { */ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, char *infile); - /** * Destroy the QUDA context and deallocate all solvers. */ void openQCD_qudaFinalize(void); - /** * Copy a spinor to GPU and back to CPU. * @@ -201,7 +186,6 @@ void openQCD_qudaFinalize(void); */ void openQCD_back_and_forth(void *h_in, void *h_out); - /** * @brief Wrapper around openqcd::ipt * @@ -213,7 +197,6 @@ void openQCD_back_and_forth(void *h_in, void *h_out); */ int openQCD_qudaIndexIpt(const int *x); - /** * @brief Wrapper around openqcd::iup * @@ -226,7 +209,6 @@ int openQCD_qudaIndexIpt(const int *x); */ int openQCD_qudaIndexIup(const int *x, const int mu); - /** * @brief Norm square in QUDA. * @@ -236,7 +218,6 @@ int openQCD_qudaIndexIup(const int *x, const int mu); */ double openQCD_qudaNorm(void *h_in); - /** * @brief Prototype function for the norm-square in QUDA without loading * the field. 
@@ -247,7 +228,6 @@ double openQCD_qudaNorm(void *h_in); */ double openQCD_qudaNorm_NoLoads(void *d_in); - /** * @brief Applies Dirac matrix to spinor. * @@ -260,11 +240,9 @@ double openQCD_qudaNorm_NoLoads(void *d_in); */ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out); - -void* openQCD_qudaH2D(void *openQCD_field); +void *openQCD_qudaH2D(void *openQCD_field); void openQCD_qudaD2H(void *quda_field, void *openQCD_field); -void openQCD_qudaSpinorFree(void** quda_field); - +void openQCD_qudaSpinorFree(void **quda_field); /** * @brief Apply the Wilson-Clover Dirac operator to a field. All fields @@ -276,7 +254,6 @@ void openQCD_qudaSpinorFree(void** quda_field); */ void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p); - /** * @brief Apply the Dirac operator that corresponds to the current openQxD * setup to a field. All fields passed and returned are host (CPU) @@ -288,7 +265,6 @@ void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p) */ void openQCD_qudaDw(double mu, void *in, void *out); - /** * Setup the solver interface to quda. This function parses the file given by * [infile] as an openQCD ini file. The solver section given by the [id] @@ -311,8 +287,7 @@ void openQCD_qudaDw(double mu, void *in, void *out); * @return Pointer to the solver context */ -void* openQCD_qudaSolverGetHandle(int id); - +void *openQCD_qudaSolverGetHandle(int id); /** * @brief Return a hash from a subset of the settings in the @@ -325,7 +300,6 @@ void* openQCD_qudaSolverGetHandle(int id); */ int openQCD_qudaSolverGetHash(int id); - /** * @brief Print solver information about the QUDA solver. Print * "Solver is not initialized yet" is the solver struct is nul @@ -335,7 +309,6 @@ int openQCD_qudaSolverGetHash(int id); */ void openQCD_qudaSolverPrintSetup(int id); - /** * @brief Solve Ax=b for an Clover Wilson operator with a multigrid solver. 
* All fields passed and returned are host (CPU) field in openQCD @@ -356,7 +329,6 @@ void openQCD_qudaSolverPrintSetup(int id); */ double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int *status); - /** * @brief Destroys an existing solver context and frees all involed * structs. @@ -365,7 +337,6 @@ double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int * */ void openQCD_qudaSolverDestroy(int id); - /** * Setup the eigen-solver interface to quda. This function parses the file * given by [infile] as an openQCD ini file. The solver section given by the @@ -383,8 +354,7 @@ void openQCD_qudaSolverDestroy(int id); * * @return Pointer to the eigen-solver context */ -void* openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id); - +void *openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id); /** * @brief Solve Ax=b for an Clover Wilson operator with a multigrid @@ -399,7 +369,6 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id); */ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals); - /** * @brief Destroys an existing eigen-solver context and frees all involed * structs. @@ -408,7 +377,6 @@ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals); */ void openQCD_qudaEigensolverDestroy(void *param); - /** * @brief Wrapper for the plaquette. We could call plaqQuda() directly in * openQCD, but we have to make sure manually that the gauge field @@ -419,7 +387,6 @@ void openQCD_qudaEigensolverDestroy(void *param); */ double openQCD_qudaPlaquette(void); - /** * @brief Load the gauge fields from host to quda. Notice that the boundary * fields have to be up2date; i.e. call copy_bnd_hd(), copy_bnd_ud() @@ -432,7 +399,6 @@ double openQCD_qudaPlaquette(void); */ void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary); - /** * @brief Save the gauge fields from quda to host. 
* @@ -443,13 +409,11 @@ void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType */ void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary); - /** * @brief Free the gauge field allocated in quda. */ void openQCD_qudaGaugeFree(void); - /** * @brief Load the clover fields from host to quda. * @@ -461,13 +425,11 @@ void openQCD_qudaGaugeFree(void); */ void openQCD_qudaCloverLoad(void *clover, double kappa, double csw); - /** * @brief Free the clover field allocated in quda. */ void openQCD_qudaCloverFree(void); - #ifdef __cplusplus } #endif diff --git a/lib/checksum.cu b/lib/checksum.cu index ea83ce946e..af8374e723 100644 --- a/lib/checksum.cu +++ b/lib/checksum.cu @@ -52,11 +52,11 @@ namespace quda { ChecksumArg arg(u,mini); checksum = ChecksumCPU(arg); } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) { - ChecksumArg arg(u,mini); + ChecksumArg arg(u, mini); checksum = ChecksumCPU(arg); } else { errorQuda("Checksum not implemented"); - } + } return checksum; } diff --git a/lib/copy_color_spinor.cuh b/lib/copy_color_spinor.cuh index e8082746a9..cc85feb2e8 100644 --- a/lib/copy_color_spinor.cuh +++ b/lib/copy_color_spinor.cuh @@ -33,8 +33,10 @@ namespace quda { else if (out.GammaBasis() == QUDA_DEGRAND_ROSSI_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",RelBasis"); else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS) strcat(aux, ",ChiralToNonRelBasis"); else if (out.GammaBasis() == QUDA_CHIRAL_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",NonRelToChiralBasis"); - else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS) strcat(aux, ",ReverseOpenqcdBasis"); - else if (out.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) strcat(aux, ",OpenqcdBasis"); + else if (out.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS && 
in.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS) + strcat(aux, ",ReverseOpenqcdBasis"); + else if (out.GammaBasis() == QUDA_OPENQCD_GAMMA_BASIS && in.GammaBasis() == QUDA_UKQCD_GAMMA_BASIS) + strcat(aux, ",OpenqcdBasis"); else errorQuda("Basis change from %d to %d not supported", in.GammaBasis(), out.GammaBasis()); apply(device::get_default_stream()); @@ -108,8 +110,10 @@ namespace quda { else errorQuda("QDPJIT interface has not been built"); } else if (out.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { using O = OpenQCDDiracOrder; - if constexpr (is_enabled()) CopyColorSpinor(out, in, param); - else errorQuda("OpenQCD interface has not been built"); + if constexpr (is_enabled()) + CopyColorSpinor(out, in, param); + else + errorQuda("OpenQCD interface has not been built"); } else { errorQuda("Order %d not defined (Ns = %d, Nc = %d, precision = %d)", out.FieldOrder(), Ns, Nc, out.Precision()); } @@ -140,8 +144,10 @@ namespace quda { else errorQuda("QDPJIT interface has not been built"); } else if (in.FieldOrder() == QUDA_OPENQCD_FIELD_ORDER) { using ColorSpinor = OpenQCDDiracOrder; - if constexpr (is_enabled()) genericCopyColorSpinor(param); - else errorQuda("OpenQCD interface has not been built"); + if constexpr (is_enabled()) + genericCopyColorSpinor(param); + else + errorQuda("OpenQCD interface has not been built"); } else { errorQuda("Order %d not defined (Ns=%d, Nc=%d, precision = %d)", in.FieldOrder(), Ns, Nc, in.Precision()); } diff --git a/lib/copy_gauge_extended.cu b/lib/copy_gauge_extended.cu index 94d5390d59..b9ee436cbf 100644 --- a/lib/copy_gauge_extended.cu +++ b/lib/copy_gauge_extended.cu @@ -122,7 +122,6 @@ namespace quda { } else { errorQuda("Gauge field %d order not supported", out.Order()); } - } template @@ -204,7 +203,6 @@ namespace quda { } else { errorQuda("Gauge field %d order not supported", in.Order()); } - } template diff --git a/lib/copy_gauge_inc.cu b/lib/copy_gauge_inc.cu index d41a45de23..ef83b13f7e 100644 --- a/lib/copy_gauge_inc.cu +++ 
b/lib/copy_gauge_inc.cu @@ -143,7 +143,6 @@ namespace quda { } else { errorQuda("Gauge field %d order not supported", out.Order()); } - } template diff --git a/lib/dslash_gamma_helper.cu b/lib/dslash_gamma_helper.cu index b1668cefa5..4713915607 100644 --- a/lib/dslash_gamma_helper.cu +++ b/lib/dslash_gamma_helper.cu @@ -104,8 +104,8 @@ namespace quda { void gamma5(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,4); } /* RG: I have added these */ - void gamma0(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,0); } - void gamma1(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,1); } - void gamma2(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,2); } - void gamma3(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out,in,3); } + void gamma0(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out, in, 0); } + void gamma1(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out, in, 1); } + void gamma2(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out, in, 2); } + void gamma3(ColorSpinorField &out, const ColorSpinorField &in) { ApplyGamma(out, in, 3); } } diff --git a/lib/extract_gauge_ghost.in.cu b/lib/extract_gauge_ghost.in.cu index 2d2ca281f7..2a569016b8 100644 --- a/lib/extract_gauge_ghost.in.cu +++ b/lib/extract_gauge_ghost.in.cu @@ -113,7 +113,7 @@ namespace quda { } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) { if constexpr (is_enabled()) { - ExtractGhost>(u, Ghost, extract, offset); + ExtractGhost>(u, Ghost, extract, offset); } else { errorQuda("OpenQCD interface has not been built"); } @@ -121,7 +121,6 @@ namespace quda { } else { errorQuda("Gauge field %d order not supported", u.Order()); } - } }; diff --git a/lib/extract_gauge_ghost_extended.cu b/lib/extract_gauge_ghost_extended.cu index bc41dc9d51..11b2661e6f 100644 --- a/lib/extract_gauge_ghost_extended.cu +++ b/lib/extract_gauge_ghost_extended.cu @@ -145,7 
+145,7 @@ namespace quda { } else if (u.Order() == QUDA_OPENQCD_GAUGE_ORDER) { if constexpr (is_enabled()) { - ExtractGhostEx>(u, dim, R, ghost, extract); + ExtractGhostEx>(u, dim, R, ghost, extract); } else { errorQuda("OpenQCD interface has not been built"); } @@ -153,7 +153,6 @@ namespace quda { } else { errorQuda("Gauge field %d order not supported", u.Order()); } - } }; diff --git a/lib/gauge_field.cpp b/lib/gauge_field.cpp index 35b18bf45b..d8976b131b 100644 --- a/lib/gauge_field.cpp +++ b/lib/gauge_field.cpp @@ -162,13 +162,13 @@ namespace quda { /* analogue to BNDRY in openQCD:include/global.h */ long int bndry = 0; - bndry += (1-(comm_dim(0)%2))*x[1]*x[2]*x[3]; - bndry += (1-(comm_dim(1)%2))*x[0]*x[2]*x[3]; - bndry += (1-(comm_dim(2)%2))*x[0]*x[1]*x[3]; - bndry += (1-(comm_dim(3)%2))*x[0]*x[1]*x[2]; + bndry += (1 - (comm_dim(0) % 2)) * x[1] * x[2] * x[3]; + bndry += (1 - (comm_dim(1) % 2)) * x[0] * x[2] * x[3]; + bndry += (1 - (comm_dim(2) % 2)) * x[0] * x[1] * x[3]; + bndry += (1 - (comm_dim(3) % 2)) * x[0] * x[1] * x[2]; bndry *= 2; - length += 18*7*bndry/4; + length += 18 * 7 * bndry / 4; bytes = length * precision; } else { bytes = length * precision; diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 6d726b20fa..6635fa3307 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -19,7 +19,6 @@ #include #include - #define MAX(a, b) ((a) > (b) ? (a) : (b)) static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, {}, {}, nullptr, {}, ""}; @@ -83,315 +82,303 @@ template void inline qudaopenqcd_called(const char *func, QudaVerbo #endif } -template void inline qudaopenqcd_called(const char *func) { qudaopenqcd_called(func, getVerbosity()); } - +template void inline qudaopenqcd_called(const char *func) +{ + qudaopenqcd_called(func, getVerbosity()); +} /** * Mapping of enums to their actual values. 
We have this mapping such that we * can use the named parameters in our input files rather than the number. this * makes reading and writing the configuration more understandable. */ -std::unordered_map enum_map = { - {"QUDA_CG_INVERTER", std::to_string(QUDA_CG_INVERTER)}, - {"QUDA_BICGSTAB_INVERTER", std::to_string(QUDA_BICGSTAB_INVERTER)}, - {"QUDA_GCR_INVERTER", std::to_string(QUDA_GCR_INVERTER)}, - {"QUDA_MR_INVERTER", std::to_string(QUDA_MR_INVERTER)}, - {"QUDA_SD_INVERTER", std::to_string(QUDA_SD_INVERTER)}, - {"QUDA_PCG_INVERTER", std::to_string(QUDA_PCG_INVERTER)}, - {"QUDA_EIGCG_INVERTER", std::to_string(QUDA_EIGCG_INVERTER)}, - {"QUDA_INC_EIGCG_INVERTER", std::to_string(QUDA_INC_EIGCG_INVERTER)}, - {"QUDA_GMRESDR_INVERTER", std::to_string(QUDA_GMRESDR_INVERTER)}, - {"QUDA_GMRESDR_PROJ_INVERTER", std::to_string(QUDA_GMRESDR_PROJ_INVERTER)}, - {"QUDA_GMRESDR_SH_INVERTER", std::to_string(QUDA_GMRESDR_SH_INVERTER)}, - {"QUDA_FGMRESDR_INVERTER", std::to_string(QUDA_FGMRESDR_INVERTER)}, - {"QUDA_MG_INVERTER", std::to_string(QUDA_MG_INVERTER)}, - {"QUDA_BICGSTABL_INVERTER", std::to_string(QUDA_BICGSTABL_INVERTER)}, - {"QUDA_CGNE_INVERTER", std::to_string(QUDA_CGNE_INVERTER)}, - {"QUDA_CGNR_INVERTER", std::to_string(QUDA_CGNR_INVERTER)}, - {"QUDA_CG3_INVERTER", std::to_string(QUDA_CG3_INVERTER)}, - {"QUDA_CG3NE_INVERTER", std::to_string(QUDA_CG3NE_INVERTER)}, - {"QUDA_CG3NR_INVERTER", std::to_string(QUDA_CG3NR_INVERTER)}, - {"QUDA_CA_CG_INVERTER", std::to_string(QUDA_CA_CG_INVERTER)}, - {"QUDA_CA_CGNE_INVERTER", std::to_string(QUDA_CA_CGNE_INVERTER)}, - {"QUDA_CA_CGNR_INVERTER", std::to_string(QUDA_CA_CGNR_INVERTER)}, - {"QUDA_CA_GCR_INVERTER", std::to_string(QUDA_CA_GCR_INVERTER)}, - {"QUDA_INVALID_INVERTER", std::to_string(QUDA_INVALID_INVERTER)}, - {"QUDA_MAT_SOLUTION", std::to_string(QUDA_MAT_SOLUTION)}, - {"QUDA_MATDAG_MAT_SOLUTION", std::to_string(QUDA_MATDAG_MAT_SOLUTION)}, - {"QUDA_MATPC_SOLUTION", std::to_string(QUDA_MATPC_SOLUTION)}, - 
{"QUDA_MATPC_DAG_SOLUTION", std::to_string(QUDA_MATPC_DAG_SOLUTION)}, - {"QUDA_MATPCDAG_MATPC_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SOLUTION)}, - {"QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION)}, - {"QUDA_INVALID_SOLUTION", std::to_string(QUDA_INVALID_SOLUTION)}, - {"QUDA_DIRECT_SOLVE", std::to_string(QUDA_DIRECT_SOLVE)}, - {"QUDA_NORMOP_SOLVE", std::to_string(QUDA_NORMOP_SOLVE)}, - {"QUDA_DIRECT_PC_SOLVE", std::to_string(QUDA_DIRECT_PC_SOLVE)}, - {"QUDA_NORMOP_PC_SOLVE", std::to_string(QUDA_NORMOP_PC_SOLVE)}, - {"QUDA_NORMERR_SOLVE", std::to_string(QUDA_NORMERR_SOLVE)}, - {"QUDA_NORMERR_PC_SOLVE", std::to_string(QUDA_NORMERR_PC_SOLVE)}, - {"QUDA_NORMEQ_SOLVE", std::to_string(QUDA_NORMEQ_SOLVE)}, - {"QUDA_NORMEQ_PC_SOLVE", std::to_string(QUDA_NORMEQ_PC_SOLVE)}, - {"QUDA_INVALID_SOLVE", std::to_string(QUDA_INVALID_SOLVE)}, - {"QUDA_MATPC_EVEN_EVEN", std::to_string(QUDA_MATPC_EVEN_EVEN)}, - {"QUDA_MATPC_ODD_ODD", std::to_string(QUDA_MATPC_ODD_ODD)}, - {"QUDA_MATPC_EVEN_EVEN_ASYMMETRIC", std::to_string(QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)}, - {"QUDA_MATPC_ODD_ODD_ASYMMETRIC", std::to_string(QUDA_MATPC_ODD_ODD_ASYMMETRIC)}, - {"QUDA_MATPC_INVALID", std::to_string(QUDA_MATPC_INVALID)}, - {"QUDA_DEFAULT_NORMALIZATION", std::to_string(QUDA_DEFAULT_NORMALIZATION)}, - {"QUDA_SOURCE_NORMALIZATION", std::to_string(QUDA_SOURCE_NORMALIZATION)}, - {"QUDA_QUARTER_PRECISION", std::to_string(QUDA_QUARTER_PRECISION)}, - {"QUDA_HALF_PRECISION", std::to_string(QUDA_HALF_PRECISION)}, - {"QUDA_SINGLE_PRECISION", std::to_string(QUDA_SINGLE_PRECISION)}, - {"QUDA_DOUBLE_PRECISION", std::to_string(QUDA_DOUBLE_PRECISION)}, - {"QUDA_INVALID_PRECISION", std::to_string(QUDA_INVALID_PRECISION)}, - {"QUDA_BOOLEAN_FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, - {"false", std::to_string(QUDA_BOOLEAN_FALSE)}, - {"FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, - {"no", std::to_string(QUDA_BOOLEAN_FALSE)}, - {"n", std::to_string(QUDA_BOOLEAN_FALSE)}, - 
{"off", std::to_string(QUDA_BOOLEAN_FALSE)}, - {"QUDA_BOOLEAN_TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, - {"true", std::to_string(QUDA_BOOLEAN_TRUE)}, - {"TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, - {"yes", std::to_string(QUDA_BOOLEAN_TRUE)}, - {"y", std::to_string(QUDA_BOOLEAN_TRUE)}, - {"on", std::to_string(QUDA_BOOLEAN_TRUE)}, - {"QUDA_BOOLEAN_INVALID", std::to_string(QUDA_BOOLEAN_INVALID)}, - {"QUDA_COMPUTE_NULL_VECTOR_NO", std::to_string(QUDA_COMPUTE_NULL_VECTOR_NO)}, - {"QUDA_COMPUTE_NULL_VECTOR_YES", std::to_string(QUDA_COMPUTE_NULL_VECTOR_YES)}, - {"QUDA_COMPUTE_NULL_VECTOR_INVALID", std::to_string(QUDA_COMPUTE_NULL_VECTOR_INVALID)}, - {"QUDA_MG_CYCLE_VCYCLE", std::to_string(QUDA_MG_CYCLE_VCYCLE)}, - {"QUDA_MG_CYCLE_FCYCLE", std::to_string(QUDA_MG_CYCLE_FCYCLE)}, - {"QUDA_MG_CYCLE_WCYCLE", std::to_string(QUDA_MG_CYCLE_WCYCLE)}, - {"QUDA_MG_CYCLE_RECURSIVE", std::to_string(QUDA_MG_CYCLE_RECURSIVE)}, - {"QUDA_MG_CYCLE_INVALID", std::to_string(QUDA_MG_CYCLE_INVALID)}, - {"QUDA_CPU_FIELD_LOCATION", std::to_string(QUDA_CPU_FIELD_LOCATION)}, - {"QUDA_CUDA_FIELD_LOCATION", std::to_string(QUDA_CUDA_FIELD_LOCATION)}, - {"QUDA_INVALID_FIELD_LOCATION", std::to_string(QUDA_INVALID_FIELD_LOCATION)}, - {"QUDA_TWIST_SINGLET", std::to_string(QUDA_TWIST_SINGLET)}, - {"QUDA_TWIST_NONDEG_DOUBLET", std::to_string(QUDA_TWIST_NONDEG_DOUBLET)}, - {"QUDA_TWIST_NO", std::to_string(QUDA_TWIST_NO)}, - {"QUDA_TWIST_INVALID", std::to_string(QUDA_TWIST_INVALID)}, - {"QUDA_DAG_NO", std::to_string(QUDA_DAG_NO)}, - {"QUDA_DAG_YES", std::to_string(QUDA_DAG_YES)}, - {"QUDA_DAG_INVALID", std::to_string(QUDA_DAG_INVALID)}, - {"QUDA_KAPPA_NORMALIZATION", std::to_string(QUDA_KAPPA_NORMALIZATION)}, - {"QUDA_MASS_NORMALIZATION", std::to_string(QUDA_MASS_NORMALIZATION)}, - {"QUDA_ASYMMETRIC_MASS_NORMALIZATION", std::to_string(QUDA_ASYMMETRIC_MASS_NORMALIZATION)}, - {"QUDA_INVALID_NORMALIZATION", std::to_string(QUDA_INVALID_NORMALIZATION)}, - {"QUDA_PRESERVE_SOURCE_NO", 
std::to_string(QUDA_PRESERVE_SOURCE_NO)}, - {"QUDA_PRESERVE_SOURCE_YES", std::to_string(QUDA_PRESERVE_SOURCE_YES)}, - {"QUDA_PRESERVE_SOURCE_INVALID", std::to_string(QUDA_PRESERVE_SOURCE_INVALID)}, - {"QUDA_USE_INIT_GUESS_NO", std::to_string(QUDA_USE_INIT_GUESS_NO)}, - {"QUDA_USE_INIT_GUESS_YES", std::to_string(QUDA_USE_INIT_GUESS_YES)}, - {"QUDA_USE_INIT_GUESS_INVALID", std::to_string(QUDA_USE_INIT_GUESS_INVALID)}, - {"QUDA_SILENT", std::to_string(QUDA_SILENT)}, - {"QUDA_SUMMARIZE", std::to_string(QUDA_SUMMARIZE)}, - {"QUDA_VERBOSE", std::to_string(QUDA_VERBOSE)}, - {"QUDA_DEBUG_VERBOSE", std::to_string(QUDA_DEBUG_VERBOSE)}, - {"QUDA_INVALID_VERBOSITY", std::to_string(QUDA_INVALID_VERBOSITY)}, - {"QUDA_TUNE_NO", std::to_string(QUDA_TUNE_NO)}, - {"QUDA_TUNE_YES", std::to_string(QUDA_TUNE_YES)}, - {"QUDA_TUNE_INVALID", std::to_string(QUDA_TUNE_INVALID)}, - {"QUDA_POWER_BASIS", std::to_string(QUDA_POWER_BASIS)}, - {"QUDA_CHEBYSHEV_BASIS", std::to_string(QUDA_CHEBYSHEV_BASIS)}, - {"QUDA_INVALID_BASIS", std::to_string(QUDA_INVALID_BASIS)}, - {"QUDA_ADDITIVE_SCHWARZ", std::to_string(QUDA_ADDITIVE_SCHWARZ)}, - {"QUDA_MULTIPLICATIVE_SCHWARZ", std::to_string(QUDA_MULTIPLICATIVE_SCHWARZ)}, - {"QUDA_INVALID_SCHWARZ", std::to_string(QUDA_INVALID_SCHWARZ)}, - {"QUDA_MADWF_ACCELERATOR", std::to_string(QUDA_MADWF_ACCELERATOR)}, - {"QUDA_INVALID_ACCELERATOR", std::to_string(QUDA_INVALID_ACCELERATOR)}, - {"QUDA_L2_RELATIVE_RESIDUAL", std::to_string(QUDA_L2_RELATIVE_RESIDUAL)}, - {"QUDA_L2_ABSOLUTE_RESIDUAL", std::to_string(QUDA_L2_ABSOLUTE_RESIDUAL)}, - {"QUDA_HEAVY_QUARK_RESIDUAL", std::to_string(QUDA_HEAVY_QUARK_RESIDUAL)}, - {"QUDA_INVALID_RESIDUAL", std::to_string(QUDA_INVALID_RESIDUAL)}, - {"QUDA_NULL_VECTOR_SETUP", std::to_string(QUDA_NULL_VECTOR_SETUP)}, - {"QUDA_TEST_VECTOR_SETUP", std::to_string(QUDA_TEST_VECTOR_SETUP)}, - {"QUDA_INVALID_SETUP_TYPE", std::to_string(QUDA_INVALID_SETUP_TYPE)}, - {"QUDA_TRANSFER_AGGREGATE", std::to_string(QUDA_TRANSFER_AGGREGATE)}, - 
{"QUDA_TRANSFER_COARSE_KD", std::to_string(QUDA_TRANSFER_COARSE_KD)}, - {"QUDA_TRANSFER_OPTIMIZED_KD", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD)}, - {"QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG)}, - {"QUDA_TRANSFER_INVALID", std::to_string(QUDA_TRANSFER_INVALID)}, - {"QUDA_EIG_TR_LANCZOS", std::to_string(QUDA_EIG_TR_LANCZOS)}, - {"QUDA_EIG_BLK_TR_LANCZOS", std::to_string(QUDA_EIG_BLK_TR_LANCZOS)}, - {"QUDA_EIG_IR_ARNOLDI", std::to_string(QUDA_EIG_IR_ARNOLDI)}, - {"QUDA_EIG_BLK_IR_ARNOLDI", std::to_string(QUDA_EIG_BLK_IR_ARNOLDI)}, - {"QUDA_EIG_INVALID", std::to_string(QUDA_EIG_INVALID)}, - {"QUDA_SPECTRUM_LM_EIG", std::to_string(QUDA_SPECTRUM_LM_EIG)}, - {"QUDA_SPECTRUM_SM_EIG", std::to_string(QUDA_SPECTRUM_SM_EIG)}, - {"QUDA_SPECTRUM_LR_EIG", std::to_string(QUDA_SPECTRUM_LR_EIG)}, - {"QUDA_SPECTRUM_SR_EIG", std::to_string(QUDA_SPECTRUM_SR_EIG)}, - {"QUDA_SPECTRUM_LI_EIG", std::to_string(QUDA_SPECTRUM_LI_EIG)}, - {"QUDA_SPECTRUM_SI_EIG", std::to_string(QUDA_SPECTRUM_SI_EIG)}, - {"QUDA_SPECTRUM_INVALID", std::to_string(QUDA_SPECTRUM_INVALID)}, - {"QUDA_MEMORY_DEVICE", std::to_string(QUDA_MEMORY_DEVICE)}, - {"QUDA_MEMORY_DEVICE_PINNED", std::to_string(QUDA_MEMORY_DEVICE_PINNED)}, - {"QUDA_MEMORY_HOST", std::to_string(QUDA_MEMORY_HOST)}, - {"QUDA_MEMORY_HOST_PINNED", std::to_string(QUDA_MEMORY_HOST_PINNED)}, - {"QUDA_MEMORY_MAPPED", std::to_string(QUDA_MEMORY_MAPPED)}, - {"QUDA_MEMORY_MANAGED", std::to_string(QUDA_MEMORY_MANAGED)}, - {"QUDA_MEMORY_INVALID", std::to_string(QUDA_MEMORY_INVALID)}, - {"QUDA_CUSOLVE_EXTLIB", std::to_string(QUDA_CUSOLVE_EXTLIB)}, - {"QUDA_EIGEN_EXTLIB", std::to_string(QUDA_EIGEN_EXTLIB)}, - {"QUDA_EXTLIB_INVALID", std::to_string(QUDA_EXTLIB_INVALID)} -}; - +std::unordered_map enum_map + = {{"QUDA_CG_INVERTER", std::to_string(QUDA_CG_INVERTER)}, + {"QUDA_BICGSTAB_INVERTER", std::to_string(QUDA_BICGSTAB_INVERTER)}, + {"QUDA_GCR_INVERTER", std::to_string(QUDA_GCR_INVERTER)}, + 
{"QUDA_MR_INVERTER", std::to_string(QUDA_MR_INVERTER)}, + {"QUDA_SD_INVERTER", std::to_string(QUDA_SD_INVERTER)}, + {"QUDA_PCG_INVERTER", std::to_string(QUDA_PCG_INVERTER)}, + {"QUDA_EIGCG_INVERTER", std::to_string(QUDA_EIGCG_INVERTER)}, + {"QUDA_INC_EIGCG_INVERTER", std::to_string(QUDA_INC_EIGCG_INVERTER)}, + {"QUDA_GMRESDR_INVERTER", std::to_string(QUDA_GMRESDR_INVERTER)}, + {"QUDA_GMRESDR_PROJ_INVERTER", std::to_string(QUDA_GMRESDR_PROJ_INVERTER)}, + {"QUDA_GMRESDR_SH_INVERTER", std::to_string(QUDA_GMRESDR_SH_INVERTER)}, + {"QUDA_FGMRESDR_INVERTER", std::to_string(QUDA_FGMRESDR_INVERTER)}, + {"QUDA_MG_INVERTER", std::to_string(QUDA_MG_INVERTER)}, + {"QUDA_BICGSTABL_INVERTER", std::to_string(QUDA_BICGSTABL_INVERTER)}, + {"QUDA_CGNE_INVERTER", std::to_string(QUDA_CGNE_INVERTER)}, + {"QUDA_CGNR_INVERTER", std::to_string(QUDA_CGNR_INVERTER)}, + {"QUDA_CG3_INVERTER", std::to_string(QUDA_CG3_INVERTER)}, + {"QUDA_CG3NE_INVERTER", std::to_string(QUDA_CG3NE_INVERTER)}, + {"QUDA_CG3NR_INVERTER", std::to_string(QUDA_CG3NR_INVERTER)}, + {"QUDA_CA_CG_INVERTER", std::to_string(QUDA_CA_CG_INVERTER)}, + {"QUDA_CA_CGNE_INVERTER", std::to_string(QUDA_CA_CGNE_INVERTER)}, + {"QUDA_CA_CGNR_INVERTER", std::to_string(QUDA_CA_CGNR_INVERTER)}, + {"QUDA_CA_GCR_INVERTER", std::to_string(QUDA_CA_GCR_INVERTER)}, + {"QUDA_INVALID_INVERTER", std::to_string(QUDA_INVALID_INVERTER)}, + {"QUDA_MAT_SOLUTION", std::to_string(QUDA_MAT_SOLUTION)}, + {"QUDA_MATDAG_MAT_SOLUTION", std::to_string(QUDA_MATDAG_MAT_SOLUTION)}, + {"QUDA_MATPC_SOLUTION", std::to_string(QUDA_MATPC_SOLUTION)}, + {"QUDA_MATPC_DAG_SOLUTION", std::to_string(QUDA_MATPC_DAG_SOLUTION)}, + {"QUDA_MATPCDAG_MATPC_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SOLUTION)}, + {"QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION", std::to_string(QUDA_MATPCDAG_MATPC_SHIFT_SOLUTION)}, + {"QUDA_INVALID_SOLUTION", std::to_string(QUDA_INVALID_SOLUTION)}, + {"QUDA_DIRECT_SOLVE", std::to_string(QUDA_DIRECT_SOLVE)}, + {"QUDA_NORMOP_SOLVE", 
std::to_string(QUDA_NORMOP_SOLVE)}, + {"QUDA_DIRECT_PC_SOLVE", std::to_string(QUDA_DIRECT_PC_SOLVE)}, + {"QUDA_NORMOP_PC_SOLVE", std::to_string(QUDA_NORMOP_PC_SOLVE)}, + {"QUDA_NORMERR_SOLVE", std::to_string(QUDA_NORMERR_SOLVE)}, + {"QUDA_NORMERR_PC_SOLVE", std::to_string(QUDA_NORMERR_PC_SOLVE)}, + {"QUDA_NORMEQ_SOLVE", std::to_string(QUDA_NORMEQ_SOLVE)}, + {"QUDA_NORMEQ_PC_SOLVE", std::to_string(QUDA_NORMEQ_PC_SOLVE)}, + {"QUDA_INVALID_SOLVE", std::to_string(QUDA_INVALID_SOLVE)}, + {"QUDA_MATPC_EVEN_EVEN", std::to_string(QUDA_MATPC_EVEN_EVEN)}, + {"QUDA_MATPC_ODD_ODD", std::to_string(QUDA_MATPC_ODD_ODD)}, + {"QUDA_MATPC_EVEN_EVEN_ASYMMETRIC", std::to_string(QUDA_MATPC_EVEN_EVEN_ASYMMETRIC)}, + {"QUDA_MATPC_ODD_ODD_ASYMMETRIC", std::to_string(QUDA_MATPC_ODD_ODD_ASYMMETRIC)}, + {"QUDA_MATPC_INVALID", std::to_string(QUDA_MATPC_INVALID)}, + {"QUDA_DEFAULT_NORMALIZATION", std::to_string(QUDA_DEFAULT_NORMALIZATION)}, + {"QUDA_SOURCE_NORMALIZATION", std::to_string(QUDA_SOURCE_NORMALIZATION)}, + {"QUDA_QUARTER_PRECISION", std::to_string(QUDA_QUARTER_PRECISION)}, + {"QUDA_HALF_PRECISION", std::to_string(QUDA_HALF_PRECISION)}, + {"QUDA_SINGLE_PRECISION", std::to_string(QUDA_SINGLE_PRECISION)}, + {"QUDA_DOUBLE_PRECISION", std::to_string(QUDA_DOUBLE_PRECISION)}, + {"QUDA_INVALID_PRECISION", std::to_string(QUDA_INVALID_PRECISION)}, + {"QUDA_BOOLEAN_FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"false", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"FALSE", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"no", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"n", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"off", std::to_string(QUDA_BOOLEAN_FALSE)}, + {"QUDA_BOOLEAN_TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"true", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"TRUE", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"yes", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"y", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"on", std::to_string(QUDA_BOOLEAN_TRUE)}, + {"QUDA_BOOLEAN_INVALID", std::to_string(QUDA_BOOLEAN_INVALID)}, + 
{"QUDA_COMPUTE_NULL_VECTOR_NO", std::to_string(QUDA_COMPUTE_NULL_VECTOR_NO)}, + {"QUDA_COMPUTE_NULL_VECTOR_YES", std::to_string(QUDA_COMPUTE_NULL_VECTOR_YES)}, + {"QUDA_COMPUTE_NULL_VECTOR_INVALID", std::to_string(QUDA_COMPUTE_NULL_VECTOR_INVALID)}, + {"QUDA_MG_CYCLE_VCYCLE", std::to_string(QUDA_MG_CYCLE_VCYCLE)}, + {"QUDA_MG_CYCLE_FCYCLE", std::to_string(QUDA_MG_CYCLE_FCYCLE)}, + {"QUDA_MG_CYCLE_WCYCLE", std::to_string(QUDA_MG_CYCLE_WCYCLE)}, + {"QUDA_MG_CYCLE_RECURSIVE", std::to_string(QUDA_MG_CYCLE_RECURSIVE)}, + {"QUDA_MG_CYCLE_INVALID", std::to_string(QUDA_MG_CYCLE_INVALID)}, + {"QUDA_CPU_FIELD_LOCATION", std::to_string(QUDA_CPU_FIELD_LOCATION)}, + {"QUDA_CUDA_FIELD_LOCATION", std::to_string(QUDA_CUDA_FIELD_LOCATION)}, + {"QUDA_INVALID_FIELD_LOCATION", std::to_string(QUDA_INVALID_FIELD_LOCATION)}, + {"QUDA_TWIST_SINGLET", std::to_string(QUDA_TWIST_SINGLET)}, + {"QUDA_TWIST_NONDEG_DOUBLET", std::to_string(QUDA_TWIST_NONDEG_DOUBLET)}, + {"QUDA_TWIST_NO", std::to_string(QUDA_TWIST_NO)}, + {"QUDA_TWIST_INVALID", std::to_string(QUDA_TWIST_INVALID)}, + {"QUDA_DAG_NO", std::to_string(QUDA_DAG_NO)}, + {"QUDA_DAG_YES", std::to_string(QUDA_DAG_YES)}, + {"QUDA_DAG_INVALID", std::to_string(QUDA_DAG_INVALID)}, + {"QUDA_KAPPA_NORMALIZATION", std::to_string(QUDA_KAPPA_NORMALIZATION)}, + {"QUDA_MASS_NORMALIZATION", std::to_string(QUDA_MASS_NORMALIZATION)}, + {"QUDA_ASYMMETRIC_MASS_NORMALIZATION", std::to_string(QUDA_ASYMMETRIC_MASS_NORMALIZATION)}, + {"QUDA_INVALID_NORMALIZATION", std::to_string(QUDA_INVALID_NORMALIZATION)}, + {"QUDA_PRESERVE_SOURCE_NO", std::to_string(QUDA_PRESERVE_SOURCE_NO)}, + {"QUDA_PRESERVE_SOURCE_YES", std::to_string(QUDA_PRESERVE_SOURCE_YES)}, + {"QUDA_PRESERVE_SOURCE_INVALID", std::to_string(QUDA_PRESERVE_SOURCE_INVALID)}, + {"QUDA_USE_INIT_GUESS_NO", std::to_string(QUDA_USE_INIT_GUESS_NO)}, + {"QUDA_USE_INIT_GUESS_YES", std::to_string(QUDA_USE_INIT_GUESS_YES)}, + {"QUDA_USE_INIT_GUESS_INVALID", std::to_string(QUDA_USE_INIT_GUESS_INVALID)}, + 
{"QUDA_SILENT", std::to_string(QUDA_SILENT)}, + {"QUDA_SUMMARIZE", std::to_string(QUDA_SUMMARIZE)}, + {"QUDA_VERBOSE", std::to_string(QUDA_VERBOSE)}, + {"QUDA_DEBUG_VERBOSE", std::to_string(QUDA_DEBUG_VERBOSE)}, + {"QUDA_INVALID_VERBOSITY", std::to_string(QUDA_INVALID_VERBOSITY)}, + {"QUDA_TUNE_NO", std::to_string(QUDA_TUNE_NO)}, + {"QUDA_TUNE_YES", std::to_string(QUDA_TUNE_YES)}, + {"QUDA_TUNE_INVALID", std::to_string(QUDA_TUNE_INVALID)}, + {"QUDA_POWER_BASIS", std::to_string(QUDA_POWER_BASIS)}, + {"QUDA_CHEBYSHEV_BASIS", std::to_string(QUDA_CHEBYSHEV_BASIS)}, + {"QUDA_INVALID_BASIS", std::to_string(QUDA_INVALID_BASIS)}, + {"QUDA_ADDITIVE_SCHWARZ", std::to_string(QUDA_ADDITIVE_SCHWARZ)}, + {"QUDA_MULTIPLICATIVE_SCHWARZ", std::to_string(QUDA_MULTIPLICATIVE_SCHWARZ)}, + {"QUDA_INVALID_SCHWARZ", std::to_string(QUDA_INVALID_SCHWARZ)}, + {"QUDA_MADWF_ACCELERATOR", std::to_string(QUDA_MADWF_ACCELERATOR)}, + {"QUDA_INVALID_ACCELERATOR", std::to_string(QUDA_INVALID_ACCELERATOR)}, + {"QUDA_L2_RELATIVE_RESIDUAL", std::to_string(QUDA_L2_RELATIVE_RESIDUAL)}, + {"QUDA_L2_ABSOLUTE_RESIDUAL", std::to_string(QUDA_L2_ABSOLUTE_RESIDUAL)}, + {"QUDA_HEAVY_QUARK_RESIDUAL", std::to_string(QUDA_HEAVY_QUARK_RESIDUAL)}, + {"QUDA_INVALID_RESIDUAL", std::to_string(QUDA_INVALID_RESIDUAL)}, + {"QUDA_NULL_VECTOR_SETUP", std::to_string(QUDA_NULL_VECTOR_SETUP)}, + {"QUDA_TEST_VECTOR_SETUP", std::to_string(QUDA_TEST_VECTOR_SETUP)}, + {"QUDA_INVALID_SETUP_TYPE", std::to_string(QUDA_INVALID_SETUP_TYPE)}, + {"QUDA_TRANSFER_AGGREGATE", std::to_string(QUDA_TRANSFER_AGGREGATE)}, + {"QUDA_TRANSFER_COARSE_KD", std::to_string(QUDA_TRANSFER_COARSE_KD)}, + {"QUDA_TRANSFER_OPTIMIZED_KD", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD)}, + {"QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG", std::to_string(QUDA_TRANSFER_OPTIMIZED_KD_DROP_LONG)}, + {"QUDA_TRANSFER_INVALID", std::to_string(QUDA_TRANSFER_INVALID)}, + {"QUDA_EIG_TR_LANCZOS", std::to_string(QUDA_EIG_TR_LANCZOS)}, + {"QUDA_EIG_BLK_TR_LANCZOS", 
std::to_string(QUDA_EIG_BLK_TR_LANCZOS)}, + {"QUDA_EIG_IR_ARNOLDI", std::to_string(QUDA_EIG_IR_ARNOLDI)}, + {"QUDA_EIG_BLK_IR_ARNOLDI", std::to_string(QUDA_EIG_BLK_IR_ARNOLDI)}, + {"QUDA_EIG_INVALID", std::to_string(QUDA_EIG_INVALID)}, + {"QUDA_SPECTRUM_LM_EIG", std::to_string(QUDA_SPECTRUM_LM_EIG)}, + {"QUDA_SPECTRUM_SM_EIG", std::to_string(QUDA_SPECTRUM_SM_EIG)}, + {"QUDA_SPECTRUM_LR_EIG", std::to_string(QUDA_SPECTRUM_LR_EIG)}, + {"QUDA_SPECTRUM_SR_EIG", std::to_string(QUDA_SPECTRUM_SR_EIG)}, + {"QUDA_SPECTRUM_LI_EIG", std::to_string(QUDA_SPECTRUM_LI_EIG)}, + {"QUDA_SPECTRUM_SI_EIG", std::to_string(QUDA_SPECTRUM_SI_EIG)}, + {"QUDA_SPECTRUM_INVALID", std::to_string(QUDA_SPECTRUM_INVALID)}, + {"QUDA_MEMORY_DEVICE", std::to_string(QUDA_MEMORY_DEVICE)}, + {"QUDA_MEMORY_DEVICE_PINNED", std::to_string(QUDA_MEMORY_DEVICE_PINNED)}, + {"QUDA_MEMORY_HOST", std::to_string(QUDA_MEMORY_HOST)}, + {"QUDA_MEMORY_HOST_PINNED", std::to_string(QUDA_MEMORY_HOST_PINNED)}, + {"QUDA_MEMORY_MAPPED", std::to_string(QUDA_MEMORY_MAPPED)}, + {"QUDA_MEMORY_MANAGED", std::to_string(QUDA_MEMORY_MANAGED)}, + {"QUDA_MEMORY_INVALID", std::to_string(QUDA_MEMORY_INVALID)}, + {"QUDA_CUSOLVE_EXTLIB", std::to_string(QUDA_CUSOLVE_EXTLIB)}, + {"QUDA_EIGEN_EXTLIB", std::to_string(QUDA_EIGEN_EXTLIB)}, + {"QUDA_EXTLIB_INVALID", std::to_string(QUDA_EXTLIB_INVALID)}}; /** * @brief Just a simple key-value store */ -class KeyValueStore { +class KeyValueStore +{ private: - std::unordered_map>> store; - std::unordered_map *map = nullptr; - std::string filename = ""; + std::unordered_map>> store; + std::unordered_map *map = nullptr; + std::string filename = ""; public: - /** - * @brief Sets a key value pair - * - * @param[in] section The section - * @param[in] key The key - * @param[in] value The value - */ - void set(const std::string& section, const std::string& key, const std::string& value) { - if (map != nullptr) { - auto mvalue = map->find(value); - if (mvalue != map->end()) { - 
std::get<0>(store[section][key]) = mvalue->second; - std::get<1>(store[section][key]) = value; - return; - } - } - std::get<0>(store[section][key]) = value; + /** + * @brief Sets a key value pair + * + * @param[in] section The section + * @param[in] key The key + * @param[in] value The value + */ + void set(const std::string §ion, const std::string &key, const std::string &value) + { + if (map != nullptr) { + auto mvalue = map->find(value); + if (mvalue != map->end()) { + std::get<0>(store[section][key]) = mvalue->second; std::get<1>(store[section][key]) = value; + return; + } } + std::get<0>(store[section][key]) = value; + std::get<1>(store[section][key]) = value; + } - void set_map(std::unordered_map *_map) { - map = _map; - } + void set_map(std::unordered_map *_map) { map = _map; } - bool section_exists(const std::string& section) { - return store.find(section) != store.end(); - } + bool section_exists(const std::string §ion) { return store.find(section) != store.end(); } - /** - * @brief Gets the specified key. 
- * - * @param[in] section The section - * @param[in] key The key - * @param[in] default_value The default value if section/key is absent - * - * @tparam T Desired return type - * - * @return The corresponding value - */ - template - T get(const std::string& section, const std::string& key, T default_value = T(), bool fail = false) { - int idx; - std::string rkey; - std::smatch match; - std::regex p_key("([^\\[]+)\\[(\\d+)\\]"); /* key[idx] */ - auto sec = store.find(section); - - if (sec != store.end()) { - if (std::regex_search(key, match, p_key)) { - rkey = match[1]; - idx = std::stoi(match[2]); - } else { - rkey = key; - idx = 0; - } - - auto item = sec->second.find(rkey); - if (item != sec->second.end()) { - std::stringstream ss(std::get<0>(item->second)); - if constexpr (std::is_enum_v) { - typename std::underlying_type::type result, dummy; - for (int i=0; i> dummy; - } - if (ss >> result) { - return static_cast(result); - } - } else { - T result, dummy; - for (int i=0; i> dummy; - } - if (ss >> result) { - return result; - } - } - } - } - if (fail) { - errorQuda("Key \"%s\" in section \"%s\" in file %s does not exist.", - key.c_str(), section.c_str(), filename.c_str()); + /** + * @brief Gets the specified key. 
+ * + * @param[in] section The section + * @param[in] key The key + * @param[in] default_value The default value if section/key is absent + * + * @tparam T Desired return type + * + * @return The corresponding value + */ + template + T get(const std::string §ion, const std::string &key, T default_value = T(), bool fail = false) + { + int idx; + std::string rkey; + std::smatch match; + std::regex p_key("([^\\[]+)\\[(\\d+)\\]"); /* key[idx] */ + auto sec = store.find(section); + + if (sec != store.end()) { + if (std::regex_search(key, match, p_key)) { + rkey = match[1]; + idx = std::stoi(match[2]); + } else { + rkey = key; + idx = 0; + } + + auto item = sec->second.find(rkey); + if (item != sec->second.end()) { + std::stringstream ss(std::get<0>(item->second)); + if constexpr (std::is_enum_v) { + typename std::underlying_type::type result, dummy; + for (int i = 0; i < idx; i++) { ss >> dummy; } + if (ss >> result) { return static_cast(result); } + } else { + T result, dummy; + for (int i = 0; i < idx; i++) { ss >> dummy; } + if (ss >> result) { return result; } } - return default_value; /* Return default value for non-existent keys */ + } } + if (fail) { + errorQuda("Key \"%s\" in section \"%s\" in file %s does not exist.", key.c_str(), section.c_str(), + filename.c_str()); + } + return default_value; /* Return default value for non-existent keys */ + } - /** - * @brief Fill the store with entries from an ini-file - * - * @param[in] fname The fname - */ - void load(const std::string& fname) { - std::string line, section; - std::smatch match; - filename = fname; - std::ifstream file(filename.c_str()); - - std::regex p_section("^\\s*\\[([\\w\\ ]+)\\].*$"); /* [section] */ - std::regex p_comment("^[^#]*(\\s*#.*)$"); /* line # comment */ - std::regex p_key_val("^([^\\s]+)\\s+(.*[^\\s]+)\\s*$"); /* key value */ - - if (file.is_open()) { - - while (std::getline(file, line)) { - - /* remove all comments */ - if (std::regex_search(line, match, p_comment)) { - 
line.erase(match.position(1)); - } - - if (std::regex_search(line, match, p_section)) { - section = match[1]; - } else if (std::regex_search(line, match, p_key_val)) { - std::string key = match[1]; - std::string val = match[2]; - this->set(section, key, val); - } - } - - file.close(); - } else { - std::cerr << "Error opening file: " << filename << std::endl; + /** + * @brief Fill the store with entries from an ini-file + * + * @param[in] fname The fname + */ + void load(const std::string &fname) + { + std::string line, section; + std::smatch match; + filename = fname; + std::ifstream file(filename.c_str()); + + std::regex p_section("^\\s*\\[([\\w\\ ]+)\\].*$"); /* [section] */ + std::regex p_comment("^[^#]*(\\s*#.*)$"); /* line # comment */ + std::regex p_key_val("^([^\\s]+)\\s+(.*[^\\s]+)\\s*$"); /* key value */ + + if (file.is_open()) { + + while (std::getline(file, line)) { + + /* remove all comments */ + if (std::regex_search(line, match, p_comment)) { line.erase(match.position(1)); } + + if (std::regex_search(line, match, p_section)) { + section = match[1]; + } else if (std::regex_search(line, match, p_key_val)) { + std::string key = match[1]; + std::string val = match[2]; + this->set(section, key, val); } + } + + file.close(); + } else { + std::cerr << "Error opening file: " << filename << std::endl; } + } - /** - * @brief Dumps all entries in the store. - */ - void dump(std::string _section = "") { - for (const auto& section : store) { - if (_section == "" || _section == section.first) { - std::cout << "[" << section.first << "]" << std::endl; - for (const auto& pair : section.second) { - std::cout << " " << pair.first << " = " << std::get<1>(pair.second); - if (std::get<0>(pair.second) != std::get<1>(pair.second)) { - std::cout << " # " << std::get<0>(pair.second); - } - std::cout << std::endl; - } - } + /** + * @brief Dumps all entries in the store. 
+ */ + void dump(std::string _section = "") + { + for (const auto §ion : store) { + if (_section == "" || _section == section.first) { + std::cout << "[" << section.first << "]" << std::endl; + for (const auto &pair : section.second) { + std::cout << " " << pair.first << " = " << std::get<1>(pair.second); + if (std::get<0>(pair.second) != std::get<1>(pair.second)) { std::cout << " # " << std::get<0>(pair.second); } + std::cout << std::endl; } + } } + } }; - /** * @brief Returns the local lattice dimensions as lat_dim_t * @@ -401,7 +388,7 @@ static lat_dim_t get_local_dims(int *fill = nullptr) { lat_dim_t X; - for (int i=0; i<4; i++) { + for (int i = 0; i < 4; i++) { if (fill) { fill[i] = qudaState.layout.L[i]; } else { @@ -412,7 +399,6 @@ static lat_dim_t get_local_dims(int *fill = nullptr) return X; } - /** * @brief Calculate the rank from coordinates. * @@ -430,11 +416,10 @@ static int rankFromCoords(const int *coords, void *fdata) int *ranks = base + 5; int i; - i = coords[3] + NPROC[3]*(coords[2] + NPROC[2]*(coords[1] + NPROC[1]*(coords[0]))); + i = coords[3] + NPROC[3] * (coords[2] + NPROC[2] * (coords[1] + NPROC[1] * (coords[0]))); return ranks[i]; } - /** * Set set the local dimensions and machine topology for QUDA to use * @@ -477,12 +462,11 @@ void openQCD_qudaSetLayout(openQCD_QudaLayout_t layout, char *infile) qudaState.init.verbosity = kv.get("QUDA", "verbosity", qudaState.init.verbosity); } - MPI_Bcast((void*) &qudaState.init.verbosity, sizeof(qudaState.init.verbosity), MPI_INT, 0, MPI_COMM_WORLD); + MPI_Bcast((void *)&qudaState.init.verbosity, sizeof(qudaState.init.verbosity), MPI_INT, 0, MPI_COMM_WORLD); setVerbosityQuda(qudaState.init.verbosity, prefix, qudaState.init.logfile); initQuda(device); } - static int getLinkPadding(const int dim[4]) { int padding = MAX(dim[1] * dim[2] * dim[3] / 2, dim[0] * dim[2] * dim[3] / 2); @@ -491,7 +475,6 @@ static int getLinkPadding(const int dim[4]) return padding; } - /** * @brief Creates a new quda parameter 
struct * @@ -505,11 +488,11 @@ static QudaInvertParam newOpenQCDParam(void) param.verbosity = verbosity; - param.cpu_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the input fermion fields */ - param.cuda_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA solver */ + param.cpu_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the input fermion fields */ + param.cuda_prec = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA solver */ param.cuda_prec_eigensolver = QUDA_DOUBLE_PRECISION; /* The precision used by the QUDA eigensolver */ - param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; /* The precision used by the QUDA solver */ + param.cuda_prec_sloppy = QUDA_SINGLE_PRECISION; /* The precision used by the QUDA solver */ param.cuda_prec_precondition = QUDA_HALF_PRECISION; /* The precision used by the QUDA solver */ /** @@ -529,7 +512,6 @@ static QudaInvertParam newOpenQCDParam(void) return param; } - /** * @brief Initialize quda gauge param struct * @@ -555,13 +537,12 @@ static QudaGaugeParam newOpenQCDGaugeParam(QudaPrecision prec, QudaReconstructTy param.t_boundary = t_boundary; param.gauge_fix = QUDA_GAUGE_FIXED_NO; param.scale = 1.0; - param.anisotropy = 1.0; /* 1.0 means not anisotropic */ + param.anisotropy = 1.0; /* 1.0 means not anisotropic */ param.ga_pad = getLinkPadding(param.X); /* Why this? 
*/ return param; } - void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, char *infile) { if (qudaState.initialized) return; @@ -574,19 +555,17 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, qudaState.initialized = true; } -void openQCD_qudaFinalize(void) { +void openQCD_qudaFinalize(void) +{ - for (int id=0; id<32; ++id) { - if (qudaState.handles[id] != nullptr) { - openQCD_qudaSolverDestroy(id); - } + for (int id = 0; id < 32; ++id) { + if (qudaState.handles[id] != nullptr) { openQCD_qudaSolverDestroy(id); } } qudaState.initialized = false; endQuda(); } - double openQCD_qudaPlaquette(void) { double plaq[3]; @@ -594,33 +573,26 @@ double openQCD_qudaPlaquette(void) plaqQuda(plaq); /* Note different Nc normalization wrt openQCD! */ - return 3.0*plaq[0]; + return 3.0 * plaq[0]; } - void openQCD_qudaGaugeLoad(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) { QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary); loadGaugeQuda(gauge, ¶m); } - void openQCD_qudaGaugeSave(void *gauge, QudaPrecision prec, QudaReconstructType rec, QudaTboundary t_boundary) { QudaGaugeParam param = newOpenQCDGaugeParam(prec, rec, t_boundary); - void* buffer = pool_pinned_malloc((4*qudaState.init.volume + 7*qudaState.init.bndry/4)*18*prec); + void *buffer = pool_pinned_malloc((4 * qudaState.init.volume + 7 * qudaState.init.bndry / 4) * 18 * prec); saveGaugeQuda(buffer, ¶m); qudaState.init.reorder_gauge_quda_to_openqcd(buffer, gauge); pool_pinned_free(buffer); } - -void openQCD_qudaGaugeFree(void) -{ - freeGaugeQuda(); -} - +void openQCD_qudaGaugeFree(void) { freeGaugeQuda(); } void openQCD_qudaCloverLoad(void *clover, double kappa, double csw) { @@ -638,12 +610,7 @@ void openQCD_qudaCloverLoad(void *clover, double kappa, double csw) loadCloverQuda(clover, NULL, ¶m); } - -void openQCD_qudaCloverFree(void) -{ - freeCloverQuda(); -} - +void openQCD_qudaCloverFree(void) { 
freeCloverQuda(); } /** * @brief Set the su3csw corfficient and all related properties. @@ -651,7 +618,7 @@ void openQCD_qudaCloverFree(void) * @param param The parameter struct * @param[in] su3csw The su3csw coefficient */ -inline void set_su3csw(QudaInvertParam* param, double su3csw) +inline void set_su3csw(QudaInvertParam *param, double su3csw) { param->clover_csw = su3csw; if (su3csw != 0.0) { @@ -674,7 +641,6 @@ inline void set_su3csw(QudaInvertParam* param, double su3csw) } } - /** * @brief Creates a new quda Dirac parameter struct * @@ -704,7 +670,6 @@ static QudaInvertParam newOpenQCDDiracParam(openQCD_QudaDiracParam_t p) return param; } - void openQCD_back_and_forth(void *h_in, void *h_out) { /* sets up the necessary parameters */ @@ -737,7 +702,6 @@ void openQCD_back_and_forth(void *h_in, void *h_out) out_h = out; } - int openQCD_qudaIndexIpt(const int *x) { int L_openqcd[4]; @@ -745,7 +709,6 @@ int openQCD_qudaIndexIpt(const int *x) return openqcd::ipt(x, L_openqcd); } - int openQCD_qudaIndexIup(const int *x, const int mu) { int L_openqcd[4], nproc_openqcd[4]; @@ -754,7 +717,6 @@ int openQCD_qudaIndexIup(const int *x, const int mu) return openqcd::iup(x, mu, L_openqcd, nproc_openqcd); } - double openQCD_qudaNorm(void *h_in) { QudaInvertParam param = newOpenQCDParam(); @@ -770,12 +732,7 @@ double openQCD_qudaNorm(void *h_in) return blas::norm2(in); } - -double openQCD_qudaNorm_NoLoads(void *d_in) -{ - return blas::norm2(*reinterpret_cast(d_in)); -} - +double openQCD_qudaNorm_NoLoads(void *d_in) { return blas::norm2(*reinterpret_cast(d_in)); } void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) { @@ -800,18 +757,10 @@ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) /* gamma_i run within QUDA using QUDA fields */ switch (dir) { - case 0: /* t direction */ - gamma3(out, in); - break; - case 1: /* x direction */ - gamma0(out, in); - break; - case 2: /* y direction */ - gamma1(out, in); - break; - case 3: /* z 
direction */ - gamma2(out, in); - break; + case 0: /* t direction */ gamma3(out, in); break; + case 1: /* x direction */ gamma0(out, in); break; + case 2: /* y direction */ gamma1(out, in); break; + case 3: /* z direction */ gamma2(out, in); break; case 4: case 5: gamma5(out, in); @@ -823,8 +772,7 @@ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) * with U the transformation matrix from OpenQCD to UKQCD. */ blas::ax(-1.0, out); break; - default: - errorQuda("Unknown gamma: %d\n", dir); + default: errorQuda("Unknown gamma: %d\n", dir); } /* creates a field on the CPU */ @@ -836,8 +784,7 @@ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out) out_h = out; } - -void* openQCD_qudaH2D(void *openQCD_field) +void *openQCD_qudaH2D(void *openQCD_field) { /* sets up the necessary parameters */ QudaInvertParam param = newOpenQCDParam(); @@ -855,14 +802,12 @@ void* openQCD_qudaH2D(void *openQCD_field) return in; } - -void openQCD_qudaSpinorFree(void** quda_field) +void openQCD_qudaSpinorFree(void **quda_field) { - delete reinterpret_cast(*quda_field); + delete reinterpret_cast(*quda_field); *quda_field = nullptr; } - void openQCD_qudaD2H(void *quda_field, void *openQCD_field) { int my_rank; @@ -876,10 +821,9 @@ void openQCD_qudaD2H(void *quda_field, void *openQCD_field) ColorSpinorField out_h(cpuParam); /* transfer the GPU field to CPU */ - out_h = *reinterpret_cast(quda_field); + out_h = *reinterpret_cast(quda_field); } - /** * @brief Check whether the gauge field from openQCD is in sync with the * one from QUDA. @@ -895,8 +839,6 @@ inline bool gauge_field_get_up2date(void) return ud_rev == qudaState.ud_rev && ad_rev == qudaState.ad_rev; } - - /** * @brief Check whether the gauge field is not (yet) set in openQCD. * @@ -909,8 +851,6 @@ inline bool gauge_field_get_unset(void) return ud_rev == 0 && ad_rev == 0; } - - /** * @brief Check if the current SW field needs to update wrt the parameters from openQCD. 
* @@ -918,15 +858,12 @@ inline bool gauge_field_get_unset(void) */ inline bool clover_field_get_up2date(void) { - return (gauge_field_get_up2date() - && qudaState.swd_ud_rev == qudaState.ud_rev - && qudaState.swd_ad_rev == qudaState.ad_rev - && qudaState.swd_kappa == 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)) - && qudaState.swd_su3csw == qudaState.layout.dirac_parms().su3csw - && qudaState.swd_u1csw == qudaState.layout.dirac_parms().u1csw); + return (gauge_field_get_up2date() && qudaState.swd_ud_rev == qudaState.ud_rev && qudaState.swd_ad_rev == qudaState.ad_rev + && qudaState.swd_kappa == 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)) + && qudaState.swd_su3csw == qudaState.layout.dirac_parms().su3csw + && qudaState.swd_u1csw == qudaState.layout.dirac_parms().u1csw); } - /** * @brief Check whether the multigrid instance associated to the parameter * struct is up to date with the global gauge field revision, @@ -938,19 +875,14 @@ inline bool clover_field_get_up2date(void) */ inline bool mg_get_up2date(QudaInvertParam *param) { - openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); - - return (param->preconditioner != nullptr - && gauge_field_get_up2date() - && clover_field_get_up2date() - && additional_prop->mg_ud_rev == qudaState.ud_rev - && additional_prop->mg_ad_rev == qudaState.ad_rev - && additional_prop->mg_kappa == 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)) - && additional_prop->mg_su3csw == qudaState.layout.dirac_parms().su3csw - && additional_prop->mg_u1csw == qudaState.layout.dirac_parms().u1csw); -} - + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + return (param->preconditioner != nullptr && gauge_field_get_up2date() && clover_field_get_up2date() + && additional_prop->mg_ud_rev == qudaState.ud_rev && additional_prop->mg_ad_rev == qudaState.ad_rev + && additional_prop->mg_kappa == 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)) + && additional_prop->mg_su3csw == 
qudaState.layout.dirac_parms().su3csw + && additional_prop->mg_u1csw == qudaState.layout.dirac_parms().u1csw); +} /** * @brief Sets the multigrid instance associated to the parameter struct to @@ -960,16 +892,15 @@ inline bool mg_get_up2date(QudaInvertParam *param) */ inline void mg_set_revision(QudaInvertParam *param) { - openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); additional_prop->mg_ud_rev = qudaState.ud_rev; additional_prop->mg_ad_rev = qudaState.ad_rev; - additional_prop->mg_kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); + additional_prop->mg_kappa = 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)); additional_prop->mg_su3csw = qudaState.layout.dirac_parms().su3csw; additional_prop->mg_u1csw = qudaState.layout.dirac_parms().u1csw; } - /** * @brief Set the global revisions numners for the SW field. */ @@ -977,20 +908,15 @@ inline void clover_field_set_revision(void) { qudaState.swd_ud_rev = qudaState.ud_rev; qudaState.swd_ad_rev = qudaState.ad_rev; - qudaState.swd_kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); + qudaState.swd_kappa = 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)); qudaState.swd_su3csw = qudaState.layout.dirac_parms().su3csw; qudaState.swd_u1csw = qudaState.layout.dirac_parms().u1csw; } - /** * @brief Set the global revisions numners for the gauge field. 
*/ -inline void gauge_field_set_revision(void) -{ - qudaState.layout.get_gfld_flags(&qudaState.ud_rev, &qudaState.ad_rev); -} - +inline void gauge_field_set_revision(void) { qudaState.layout.get_gfld_flags(&qudaState.ud_rev, &qudaState.ad_rev); } /** * @brief Check if the solver parameters are in sync with the parameters @@ -1002,24 +928,28 @@ inline void gauge_field_set_revision(void) */ int openQCD_qudaInvertParamCheck(void *param_) { - QudaInvertParam* param = static_cast(param_); - openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + QudaInvertParam *param = static_cast(param_); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); bool ret = true; - if (param->kappa != (1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)))) { - logQuda(QUDA_VERBOSE, "Property m0/kappa does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", - (1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0))), param->kappa); + if (param->kappa != (1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)))) { + logQuda( + QUDA_VERBOSE, "Property m0/kappa does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", + (1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0))), param->kappa); ret = false; } if (additional_prop->u1csw != qudaState.layout.dirac_parms().u1csw) { - logQuda(QUDA_VERBOSE, "Property u1csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", + logQuda( + QUDA_VERBOSE, + "Property u1csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", qudaState.layout.dirac_parms().u1csw, additional_prop->u1csw); ret = false; } if (param->clover_csw != qudaState.layout.dirac_parms().su3csw) { - logQuda(QUDA_VERBOSE, "Property su3csw/clover_csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", + logQuda( + QUDA_VERBOSE, "Property 
su3csw/clover_csw does not match in QudaInvertParam struct and openQxD:dirac_parms (openQxD: %.6e, QUDA: %.6e)\n", qudaState.layout.dirac_parms().su3csw, param->clover_csw); ret = false; } @@ -1027,7 +957,6 @@ int openQCD_qudaInvertParamCheck(void *param_) return ret; } - /** * @brief Transfer the gauge field if the gauge field was updated in * openQxD. (Re-)calculate or transfer the clover field if @@ -1042,27 +971,27 @@ int openQCD_qudaInvertParamCheck(void *param_) */ void openQCD_qudaSolverUpdate(void *param_) { - if (param_ == nullptr) { - errorQuda("Solver handle is NULL."); - } + if (param_ == nullptr) { errorQuda("Solver handle is NULL."); } - QudaInvertParam* param = static_cast(param_); - openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + QudaInvertParam *param = static_cast(param_); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); bool do_param_update = !openQCD_qudaInvertParamCheck(param_); bool do_gauge_transfer = !gauge_field_get_up2date() && !gauge_field_get_unset(); bool do_clover_update = !clover_field_get_up2date() && !gauge_field_get_unset(); - bool do_multigrid_update = param_ != qudaState.dirac_handle && param->inv_type_precondition == QUDA_MG_INVERTER && !mg_get_up2date(param) && !gauge_field_get_unset(); - bool do_multigrid_fat_update = do_multigrid_update && (do_gauge_transfer || additional_prop->mg_ud_rev != qudaState.ud_rev || additional_prop->mg_ad_rev != qudaState.ad_rev); + bool do_multigrid_update = param_ != qudaState.dirac_handle && param->inv_type_precondition == QUDA_MG_INVERTER + && !mg_get_up2date(param) && !gauge_field_get_unset(); + bool do_multigrid_fat_update = do_multigrid_update + && (do_gauge_transfer || additional_prop->mg_ud_rev != qudaState.ud_rev + || additional_prop->mg_ad_rev != qudaState.ad_rev); if (do_gauge_transfer) { - if (qudaState.layout.h_gauge == nullptr) { - errorQuda("qudaState.layout.h_gauge is not set."); - } + if (qudaState.layout.h_gauge == nullptr) 
{ errorQuda("qudaState.layout.h_gauge is not set."); } logQuda(QUDA_VERBOSE, "Loading gauge field from openQCD ...\n"); void *h_gauge = qudaState.layout.h_gauge(); - PUSH_RANGE("openQCD_qudaGaugeLoad",3); - QudaReconstructType rec = qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3 ? QUDA_RECONSTRUCT_8 : QUDA_RECONSTRUCT_9; + PUSH_RANGE("openQCD_qudaGaugeLoad", 3); + QudaReconstructType rec + = qudaState.layout.flds_parms().gauge == OPENQCD_GAUGE_SU3 ? QUDA_RECONSTRUCT_8 : QUDA_RECONSTRUCT_9; /** * We set t_boundary = QUDA_ANTI_PERIODIC_T. This setting is a label that @@ -1088,12 +1017,12 @@ void openQCD_qudaSolverUpdate(void *param_) if (do_param_update) { logQuda(QUDA_VERBOSE, "Syncing kappa, su3csw, u1csw values from openQCD ...\n"); - param->kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); + param->kappa = 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)); additional_prop->u1csw = qudaState.layout.dirac_parms().u1csw; set_su3csw(param, qudaState.layout.dirac_parms().su3csw); - QudaInvertParam* mg_inv_param = additional_prop->mg_param->invert_param; - mg_inv_param->kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); + QudaInvertParam *mg_inv_param = additional_prop->mg_param->invert_param; + mg_inv_param->kappa = 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)); set_su3csw(mg_inv_param, qudaState.layout.dirac_parms().su3csw); } @@ -1115,7 +1044,7 @@ void openQCD_qudaSolverUpdate(void *param_) * field. */ logQuda(QUDA_VERBOSE, "Generating Clover field in QUDA ...\n"); - PUSH_RANGE("loadCloverQuda",3); + PUSH_RANGE("loadCloverQuda", 3); loadCloverQuda(NULL, NULL, param); POP_RANGE; clover_field_set_revision(); @@ -1124,13 +1053,11 @@ void openQCD_qudaSolverUpdate(void *param_) * U3 case: Transfer the SW-field from openQCD. 
*/ - if (qudaState.layout.h_sw == nullptr) { - errorQuda("qudaState.layout.h_sw is not set."); - } + if (qudaState.layout.h_sw == nullptr) { errorQuda("qudaState.layout.h_sw is not set."); } logQuda(QUDA_VERBOSE, "Loading Clover field from openQCD ...\n"); void *h_sw = qudaState.layout.h_sw(); - PUSH_RANGE("openQCD_qudaCloverLoad",3); + PUSH_RANGE("openQCD_qudaCloverLoad", 3); openQCD_qudaCloverLoad(h_sw, param->kappa, param->clover_csw); POP_RANGE; clover_field_set_revision(); @@ -1139,23 +1066,22 @@ void openQCD_qudaSolverUpdate(void *param_) /* TODO: The above line would be prefered over openQCD_qudaCloverLoad, but throws this error, no idea why? QUDA: ERROR: qudaEventRecord_ returned CUDA_ERROR_ILLEGAL_ADDRESS (timer.h:82 in start()) - (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const char*, const char*, const char*, bool)()) - QUDA: last kernel called was (name=N4quda10CopyCloverINS_6clover11FloatNOrderIdLi72ELi2ELb0ELb1ELb0EEENS1_12OpenQCDOrderIdLi72EEEddEE,volume=32x16x16x64,aux=GPU-offline,vol=524288precision=8Nc=3,compute_diagonal)*/ + (rank 0, host yoshi, quda_api.cpp:72 in void quda::target::cuda::set_driver_error(CUresult, const char*, const + char*, const char*, const char*, bool)()) QUDA: last kernel called was + (name=N4quda10CopyCloverINS_6clover11FloatNOrderIdLi72ELi2ELb0ELb1ELb0EEENS1_12OpenQCDOrderIdLi72EEEddEE,volume=32x16x16x64,aux=GPU-offline,vol=524288precision=8Nc=3,compute_diagonal)*/ } } } /* setup/update the multigrid instance or do nothing */ if (do_multigrid_update) { - QudaMultigridParam* mg_param = additional_prop->mg_param; + QudaMultigridParam *mg_param = additional_prop->mg_param; - if (mg_param == nullptr) { - errorQuda("No multigrid parameter struct set."); - } + if (mg_param == nullptr) { errorQuda("No multigrid parameter struct set."); } if (do_multigrid_fat_update && param->preconditioner != nullptr) { logQuda(QUDA_VERBOSE, "Destroying existing multigrid instance 
...\n"); - PUSH_RANGE("destroyMultigridQuda",4); + PUSH_RANGE("destroyMultigridQuda", 4); destroyMultigridQuda(param->preconditioner); param->preconditioner = nullptr; POP_RANGE; @@ -1169,13 +1095,13 @@ void openQCD_qudaSolverUpdate(void *param_) if (param->preconditioner == nullptr) { logQuda(QUDA_VERBOSE, "Setting up multigrid instance ...\n"); - PUSH_RANGE("newMultigridQuda",4); + PUSH_RANGE("newMultigridQuda", 4); param->preconditioner = newMultigridQuda(mg_param); POP_RANGE; mg_set_revision(param); } else { logQuda(QUDA_VERBOSE, "Updating existing multigrid instance ...\n"); - PUSH_RANGE("updateMultigridQuda",4); + PUSH_RANGE("updateMultigridQuda", 4); updateMultigridQuda(param->preconditioner, mg_param); POP_RANGE; mg_set_revision(param); @@ -1183,17 +1109,16 @@ void openQCD_qudaSolverUpdate(void *param_) } } - -void* openQCD_qudaSolverReadIn(int id) +void *openQCD_qudaSolverReadIn(int id) { int my_rank; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); /* Allocate on the heap */ - QudaInvertParam* param = new QudaInvertParam(newQudaInvertParam()); - QudaInvertParam* invert_param_mg = new QudaInvertParam(newQudaInvertParam()); - QudaMultigridParam* multigrid_param = new QudaMultigridParam(newQudaMultigridParam()); + QudaInvertParam *param = new QudaInvertParam(newQudaInvertParam()); + QudaInvertParam *invert_param_mg = new QudaInvertParam(newQudaInvertParam()); + QudaMultigridParam *multigrid_param = new QudaMultigridParam(newQudaMultigridParam()); std::string section = "Solver " + std::to_string(id); /* Some default settings */ @@ -1207,7 +1132,7 @@ void* openQCD_qudaSolverReadIn(int id) param->dirac_order = QUDA_OPENQCD_DIRAC_ORDER; param->gamma_basis = QUDA_OPENQCD_GAMMA_BASIS; param->dslash_type = QUDA_WILSON_DSLASH; - param->kappa = 1.0/(2.0*(qudaState.layout.dirac_parms().m0+4.0)); + param->kappa = 1.0 / (2.0 * (qudaState.layout.dirac_parms().m0 + 4.0)); param->mu = 0.0; param->dagger = QUDA_DAG_NO; param->solution_type = QUDA_MAT_SOLUTION; @@ -1231,13 +1156,11 
@@ void* openQCD_qudaSolverReadIn(int id) param->verbosity = kv.get(section, "verbosity", param->verbosity); - if (param->verbosity >= QUDA_DEBUG_VERBOSE) { - kv.dump(); - } + if (param->verbosity >= QUDA_DEBUG_VERBOSE) { kv.dump(); } if (kv.get(section, "solver") != "QUDA") { - errorQuda("Solver section \"%s\" in file %s is not a valid quda-solver section (solver = %s).", - section.c_str(), qudaState.infile, kv.get(section, "solver").c_str()); + errorQuda("Solver section \"%s\" in file %s is not a valid quda-solver section (solver = %s).", section.c_str(), + qudaState.infile, kv.get(section, "solver").c_str()); } /* both fields reside on the CPU */ @@ -1262,11 +1185,14 @@ void* openQCD_qudaSolverReadIn(int id) param->true_res_hq = kv.get(section, "true_res_hq", param->true_res_hq); param->maxiter = kv.get(section, "maxiter", param->maxiter); param->reliable_delta = kv.get(section, "reliable_delta", param->reliable_delta); - param->reliable_delta_refinement = kv.get(section, "reliable_delta_refinement", param->reliable_delta_refinement); + param->reliable_delta_refinement + = kv.get(section, "reliable_delta_refinement", param->reliable_delta_refinement); param->use_alternative_reliable = kv.get(section, "use_alternative_reliable", param->use_alternative_reliable); - param->use_sloppy_partial_accumulator = kv.get(section, "use_sloppy_partial_accumulator", param->use_sloppy_partial_accumulator); + param->use_sloppy_partial_accumulator + = kv.get(section, "use_sloppy_partial_accumulator", param->use_sloppy_partial_accumulator); - param->solution_accumulator_pipeline = kv.get(section, "solution_accumulator_pipeline", param->solution_accumulator_pipeline); + param->solution_accumulator_pipeline + = kv.get(section, "solution_accumulator_pipeline", param->solution_accumulator_pipeline); param->max_res_increase = kv.get(section, "max_res_increase", param->max_res_increase); param->max_res_increase_total = kv.get(section, "max_res_increase_total", 
param->max_res_increase_total); @@ -1278,7 +1204,8 @@ void* openQCD_qudaSolverReadIn(int id) param->pipeline = kv.get(section, "pipeline", param->pipeline); param->num_offset = kv.get(section, "num_offset", param->num_offset); param->num_src = kv.get(section, "num_src", param->num_src); - param->num_src_per_sub_partition = kv.get(section, "num_src_per_sub_partition", param->num_src_per_sub_partition); + param->num_src_per_sub_partition + = kv.get(section, "num_src_per_sub_partition", param->num_src_per_sub_partition); param->split_grid[0] = kv.get(section, "split_grid[1]", param->split_grid[0]); param->split_grid[1] = kv.get(section, "split_grid[2]", param->split_grid[1]); @@ -1287,7 +1214,7 @@ void* openQCD_qudaSolverReadIn(int id) param->overlap = kv.get(section, "overlap", param->overlap); - for(int i=0; inum_offset; i++) { + for (int i = 0; i < param->num_offset; i++) { std::string sub_key = "offset[" + std::to_string(i) + "]"; param->offset[i] = kv.get(section, sub_key, param->offset[i]); sub_key = "tol_offset[" + std::to_string(i) + "]"; @@ -1303,24 +1230,31 @@ void* openQCD_qudaSolverReadIn(int id) param->matpc_type = kv.get(section, "matpc_type", param->matpc_type); param->dagger = kv.get(section, "dagger", param->dagger); param->mass_normalization = kv.get(section, "mass_normalization", param->mass_normalization); - param->solver_normalization = kv.get(section, "solver_normalization", param->solver_normalization); + param->solver_normalization + = kv.get(section, "solver_normalization", param->solver_normalization); param->preserve_source = kv.get(section, "preserve_source", param->preserve_source); param->cpu_prec = kv.get(section, "cpu_prec", param->cpu_prec); param->cuda_prec = kv.get(section, "cuda_prec", param->cuda_prec); param->cuda_prec_sloppy = kv.get(section, "cuda_prec_sloppy", param->cuda_prec_sloppy); - param->cuda_prec_refinement_sloppy = kv.get(section, "cuda_prec_refinement_sloppy", param->cuda_prec_refinement_sloppy); - 
param->cuda_prec_precondition = kv.get(section, "cuda_prec_precondition", param->cuda_prec_precondition); + param->cuda_prec_refinement_sloppy + = kv.get(section, "cuda_prec_refinement_sloppy", param->cuda_prec_refinement_sloppy); + param->cuda_prec_precondition + = kv.get(section, "cuda_prec_precondition", param->cuda_prec_precondition); param->cuda_prec_eigensolver = kv.get(section, "cuda_prec_eigensolver", param->cuda_prec_eigensolver); param->clover_location = kv.get(section, "clover_location", param->clover_location); param->clover_cpu_prec = kv.get(section, "clover_cpu_prec", param->clover_cpu_prec); param->clover_cuda_prec = kv.get(section, "clover_cuda_prec", param->clover_cuda_prec); - param->clover_cuda_prec_sloppy = kv.get(section, "clover_cuda_prec_sloppy", param->clover_cuda_prec_sloppy); - param->clover_cuda_prec_refinement_sloppy = kv.get(section, "clover_cuda_prec_refinement_sloppy", param->clover_cuda_prec_refinement_sloppy); - param->clover_cuda_prec_precondition = kv.get(section, "clover_cuda_prec_precondition", param->clover_cuda_prec_precondition); - param->clover_cuda_prec_eigensolver = kv.get(section, "clover_cuda_prec_eigensolver", param->clover_cuda_prec_eigensolver); + param->clover_cuda_prec_sloppy + = kv.get(section, "clover_cuda_prec_sloppy", param->clover_cuda_prec_sloppy); + param->clover_cuda_prec_refinement_sloppy + = kv.get(section, "clover_cuda_prec_refinement_sloppy", param->clover_cuda_prec_refinement_sloppy); + param->clover_cuda_prec_precondition + = kv.get(section, "clover_cuda_prec_precondition", param->clover_cuda_prec_precondition); + param->clover_cuda_prec_eigensolver + = kv.get(section, "clover_cuda_prec_eigensolver", param->clover_cuda_prec_eigensolver); param->use_init_guess = kv.get(section, "use_init_guess", param->use_init_guess); @@ -1332,9 +1266,11 @@ void* openQCD_qudaSolverReadIn(int id) param->Nsteps = kv.get(section, "Nsteps", param->Nsteps); param->gcrNkrylov = kv.get(section, "gcrNkrylov", 
param->gcrNkrylov); - param->inv_type_precondition = kv.get(section, "inv_type_precondition", param->inv_type_precondition); + param->inv_type_precondition + = kv.get(section, "inv_type_precondition", param->inv_type_precondition); param->deflate = kv.get(section, "deflate", param->deflate); - param->verbosity_precondition = kv.get(section, "verbosity_precondition", param->verbosity_precondition); + param->verbosity_precondition + = kv.get(section, "verbosity_precondition", param->verbosity_precondition); param->tol_precondition = kv.get(section, "tol_precondition", param->tol_precondition); param->maxiter_precondition = kv.get(section, "maxiter_precondition", param->maxiter_precondition); param->omega = kv.get(section, "omega", param->omega); @@ -1342,13 +1278,17 @@ void* openQCD_qudaSolverReadIn(int id) param->ca_lambda_min = kv.get(section, "ca_lambda_min", param->ca_lambda_min); param->ca_lambda_max = kv.get(section, "ca_lambda_max", param->ca_lambda_max); param->ca_basis_precondition = kv.get(section, "ca_basis_precondition", param->ca_basis_precondition); - param->ca_lambda_min_precondition = kv.get(section, "ca_lambda_min_precondition", param->ca_lambda_min_precondition); - param->ca_lambda_max_precondition = kv.get(section, "ca_lambda_max_precondition", param->ca_lambda_max_precondition); + param->ca_lambda_min_precondition + = kv.get(section, "ca_lambda_min_precondition", param->ca_lambda_min_precondition); + param->ca_lambda_max_precondition + = kv.get(section, "ca_lambda_max_precondition", param->ca_lambda_max_precondition); param->precondition_cycle = kv.get(section, "precondition_cycle", param->precondition_cycle); param->schwarz_type = kv.get(section, "schwarz_type", param->schwarz_type); - param->accelerator_type_precondition = kv.get(section, "accelerator_type_precondition", param->accelerator_type_precondition); + param->accelerator_type_precondition + = kv.get(section, "accelerator_type_precondition", param->accelerator_type_precondition); - 
param->madwf_diagonal_suppressor = kv.get(section, "madwf_diagonal_suppressor", param->madwf_diagonal_suppressor); + param->madwf_diagonal_suppressor + = kv.get(section, "madwf_diagonal_suppressor", param->madwf_diagonal_suppressor); param->madwf_ls = kv.get(section, "madwf_ls", param->madwf_ls); param->madwf_null_miniter = kv.get(section, "madwf_null_miniter", param->madwf_null_miniter); param->madwf_null_tol = kv.get(section, "madwf_null_tol", param->madwf_null_tol); @@ -1356,8 +1296,10 @@ void* openQCD_qudaSolverReadIn(int id) param->madwf_param_load = kv.get(section, "madwf_param_load", param->madwf_param_load); param->madwf_param_save = kv.get(section, "madwf_param_save", param->madwf_param_save); - strcpy(param->madwf_param_infile, kv.get(section, "madwf_param_infile", param->madwf_param_infile).c_str()); - strcpy(param->madwf_param_outfile, kv.get(section, "madwf_param_outfile", param->madwf_param_outfile).c_str()); + strcpy(param->madwf_param_infile, + kv.get(section, "madwf_param_infile", param->madwf_param_infile).c_str()); + strcpy(param->madwf_param_outfile, + kv.get(section, "madwf_param_outfile", param->madwf_param_outfile).c_str()); param->residual_type = kv.get(section, "residual_type", param->residual_type); @@ -1378,98 +1320,146 @@ void* openQCD_qudaSolverReadIn(int id) multigrid_param->n_level = kv.get(mg_section, "n_level", multigrid_param->n_level, true); multigrid_param->setup_type = kv.get(mg_section, "setup_type", multigrid_param->setup_type); - multigrid_param->pre_orthonormalize = kv.get(mg_section, "pre_orthonormalize", multigrid_param->pre_orthonormalize); - multigrid_param->post_orthonormalize = kv.get(mg_section, "post_orthonormalize", multigrid_param->post_orthonormalize); - multigrid_param->setup_minimize_memory = kv.get(mg_section, "setup_minimize_memory", multigrid_param->setup_minimize_memory); - multigrid_param->compute_null_vector = kv.get(mg_section, "compute_null_vector", multigrid_param->compute_null_vector); - 
multigrid_param->generate_all_levels = kv.get(mg_section, "generate_all_levels", multigrid_param->generate_all_levels); + multigrid_param->pre_orthonormalize + = kv.get(mg_section, "pre_orthonormalize", multigrid_param->pre_orthonormalize); + multigrid_param->post_orthonormalize + = kv.get(mg_section, "post_orthonormalize", multigrid_param->post_orthonormalize); + multigrid_param->setup_minimize_memory + = kv.get(mg_section, "setup_minimize_memory", multigrid_param->setup_minimize_memory); + multigrid_param->compute_null_vector + = kv.get(mg_section, "compute_null_vector", multigrid_param->compute_null_vector); + multigrid_param->generate_all_levels + = kv.get(mg_section, "generate_all_levels", multigrid_param->generate_all_levels); multigrid_param->run_verify = kv.get(mg_section, "run_verify", multigrid_param->run_verify); - multigrid_param->run_low_mode_check = kv.get(mg_section, "run_low_mode_check", multigrid_param->run_low_mode_check); - multigrid_param->run_oblique_proj_check = kv.get(mg_section, "run_oblique_proj_check", multigrid_param->run_oblique_proj_check); + multigrid_param->run_low_mode_check + = kv.get(mg_section, "run_low_mode_check", multigrid_param->run_low_mode_check); + multigrid_param->run_oblique_proj_check + = kv.get(mg_section, "run_oblique_proj_check", multigrid_param->run_oblique_proj_check); multigrid_param->coarse_guess = kv.get(mg_section, "coarse_guess", multigrid_param->coarse_guess); - multigrid_param->preserve_deflation = kv.get(mg_section, "preserve_deflation", multigrid_param->preserve_deflation); - multigrid_param->allow_truncation = kv.get(mg_section, "allow_truncation", multigrid_param->allow_truncation); - multigrid_param->staggered_kd_dagger_approximation = kv.get(mg_section, "staggered_kd_dagger_approximation", multigrid_param->staggered_kd_dagger_approximation); - multigrid_param->thin_update_only = kv.get(mg_section, "thin_update_only", multigrid_param->thin_update_only); - - for (int i=0; in_level; i++) { + 
multigrid_param->preserve_deflation + = kv.get(mg_section, "preserve_deflation", multigrid_param->preserve_deflation); + multigrid_param->allow_truncation + = kv.get(mg_section, "allow_truncation", multigrid_param->allow_truncation); + multigrid_param->staggered_kd_dagger_approximation = kv.get( + mg_section, "staggered_kd_dagger_approximation", multigrid_param->staggered_kd_dagger_approximation); + multigrid_param->thin_update_only + = kv.get(mg_section, "thin_update_only", multigrid_param->thin_update_only); + + for (int i = 0; i < multigrid_param->n_level; i++) { std::string subsection = section + " Multigrid Level " + std::to_string(i); if (!kv.section_exists(subsection)) { errorQuda("Solver section \"%s\" in file %s does not exist.", subsection.c_str(), qudaState.infile); } - multigrid_param->geo_block_size[i][0] = kv.get(subsection, "geo_block_size[1]", multigrid_param->geo_block_size[i][0]); - multigrid_param->geo_block_size[i][1] = kv.get(subsection, "geo_block_size[2]", multigrid_param->geo_block_size[i][1]); - multigrid_param->geo_block_size[i][2] = kv.get(subsection, "geo_block_size[3]", multigrid_param->geo_block_size[i][2]); - multigrid_param->geo_block_size[i][3] = kv.get(subsection, "geo_block_size[0]", multigrid_param->geo_block_size[i][3]); + multigrid_param->geo_block_size[i][0] + = kv.get(subsection, "geo_block_size[1]", multigrid_param->geo_block_size[i][0]); + multigrid_param->geo_block_size[i][1] + = kv.get(subsection, "geo_block_size[2]", multigrid_param->geo_block_size[i][1]); + multigrid_param->geo_block_size[i][2] + = kv.get(subsection, "geo_block_size[3]", multigrid_param->geo_block_size[i][2]); + multigrid_param->geo_block_size[i][3] + = kv.get(subsection, "geo_block_size[0]", multigrid_param->geo_block_size[i][3]); - if (i==0) { + if (i == 0) { multigrid_param->geo_block_size[i][0] = 4; multigrid_param->geo_block_size[i][1] = 4; multigrid_param->geo_block_size[i][2] = 4; multigrid_param->geo_block_size[i][3] = 4; } - 
multigrid_param->spin_block_size[i] = kv.get(subsection, "spin_block_size", multigrid_param->spin_block_size[i]); + multigrid_param->spin_block_size[i] + = kv.get(subsection, "spin_block_size", multigrid_param->spin_block_size[i]); multigrid_param->n_vec[i] = kv.get(subsection, "n_vec", multigrid_param->n_vec[i]); - multigrid_param->precision_null[i] = kv.get(subsection, "precision_null", multigrid_param->precision_null[i]); + multigrid_param->precision_null[i] + = kv.get(subsection, "precision_null", multigrid_param->precision_null[i]); multigrid_param->n_block_ortho[i] = kv.get(subsection, "n_block_ortho", multigrid_param->n_block_ortho[i]); - multigrid_param->block_ortho_two_pass[i] = kv.get(subsection, "block_ortho_two_pass", multigrid_param->block_ortho_two_pass[i]); + multigrid_param->block_ortho_two_pass[i] + = kv.get(subsection, "block_ortho_two_pass", multigrid_param->block_ortho_two_pass[i]); multigrid_param->verbosity[i] = kv.get(subsection, "verbosity", multigrid_param->verbosity[i]); - multigrid_param->setup_inv_type[i] = kv.get(subsection, "setup_inv_type", multigrid_param->setup_inv_type[i]); - multigrid_param->setup_use_mma[i] = kv.get(subsection, "setup_use_mma", multigrid_param->setup_use_mma[i]); - multigrid_param->dslash_use_mma[i] = kv.get(subsection, "dslash_use_mma", multigrid_param->dslash_use_mma[i]); - multigrid_param->num_setup_iter[i] = kv.get(subsection, "num_setup_iter", multigrid_param->num_setup_iter[i]); + multigrid_param->setup_inv_type[i] + = kv.get(subsection, "setup_inv_type", multigrid_param->setup_inv_type[i]); + multigrid_param->setup_use_mma[i] + = kv.get(subsection, "setup_use_mma", multigrid_param->setup_use_mma[i]); + multigrid_param->dslash_use_mma[i] + = kv.get(subsection, "dslash_use_mma", multigrid_param->dslash_use_mma[i]); + multigrid_param->num_setup_iter[i] + = kv.get(subsection, "num_setup_iter", multigrid_param->num_setup_iter[i]); multigrid_param->setup_tol[i] = kv.get(subsection, "setup_tol", 
multigrid_param->setup_tol[i]); multigrid_param->setup_maxiter[i] = kv.get(subsection, "setup_maxiter", multigrid_param->setup_maxiter[i]); - multigrid_param->setup_maxiter_refresh[i] = kv.get(subsection, "setup_maxiter_refresh", multigrid_param->setup_maxiter_refresh[i]); - multigrid_param->setup_ca_basis[i] = kv.get(subsection, "setup_ca_basis", multigrid_param->setup_ca_basis[i]); - multigrid_param->setup_ca_basis_size[i] = kv.get(subsection, "setup_ca_basis_size", multigrid_param->setup_ca_basis_size[i]); - multigrid_param->setup_ca_lambda_min[i] = kv.get(subsection, "setup_ca_lambda_min", multigrid_param->setup_ca_lambda_min[i]); - multigrid_param->setup_ca_lambda_max[i] = kv.get(subsection, "setup_ca_lambda_max", multigrid_param->setup_ca_lambda_max[i]); - - multigrid_param->coarse_solver[i] = kv.get(subsection, "coarse_solver", multigrid_param->coarse_solver[i]); - multigrid_param->coarse_solver_tol[i] = kv.get(subsection, "coarse_solver_tol", multigrid_param->coarse_solver_tol[i]); - multigrid_param->coarse_solver_maxiter[i] = kv.get(subsection, "coarse_solver_maxiter", multigrid_param->coarse_solver_maxiter[i]); - multigrid_param->coarse_solver_ca_basis[i] = kv.get(subsection, "coarse_solver_ca_basis", multigrid_param->coarse_solver_ca_basis[i]); - multigrid_param->coarse_solver_ca_basis_size[i] = kv.get(subsection, "coarse_solver_ca_basis_size", multigrid_param->coarse_solver_ca_basis_size[i]); - multigrid_param->coarse_solver_ca_lambda_min[i] = kv.get(subsection, "coarse_solver_ca_lambda_min", multigrid_param->coarse_solver_ca_lambda_min[i]); - multigrid_param->coarse_solver_ca_lambda_max[i] = kv.get(subsection, "coarse_solver_ca_lambda_max", multigrid_param->coarse_solver_ca_lambda_max[i]); + multigrid_param->setup_maxiter_refresh[i] + = kv.get(subsection, "setup_maxiter_refresh", multigrid_param->setup_maxiter_refresh[i]); + multigrid_param->setup_ca_basis[i] + = kv.get(subsection, "setup_ca_basis", multigrid_param->setup_ca_basis[i]); + 
multigrid_param->setup_ca_basis_size[i] + = kv.get(subsection, "setup_ca_basis_size", multigrid_param->setup_ca_basis_size[i]); + multigrid_param->setup_ca_lambda_min[i] + = kv.get(subsection, "setup_ca_lambda_min", multigrid_param->setup_ca_lambda_min[i]); + multigrid_param->setup_ca_lambda_max[i] + = kv.get(subsection, "setup_ca_lambda_max", multigrid_param->setup_ca_lambda_max[i]); + + multigrid_param->coarse_solver[i] + = kv.get(subsection, "coarse_solver", multigrid_param->coarse_solver[i]); + multigrid_param->coarse_solver_tol[i] + = kv.get(subsection, "coarse_solver_tol", multigrid_param->coarse_solver_tol[i]); + multigrid_param->coarse_solver_maxiter[i] + = kv.get(subsection, "coarse_solver_maxiter", multigrid_param->coarse_solver_maxiter[i]); + multigrid_param->coarse_solver_ca_basis[i] + = kv.get(subsection, "coarse_solver_ca_basis", multigrid_param->coarse_solver_ca_basis[i]); + multigrid_param->coarse_solver_ca_basis_size[i] + = kv.get(subsection, "coarse_solver_ca_basis_size", multigrid_param->coarse_solver_ca_basis_size[i]); + multigrid_param->coarse_solver_ca_lambda_min[i] + = kv.get(subsection, "coarse_solver_ca_lambda_min", multigrid_param->coarse_solver_ca_lambda_min[i]); + multigrid_param->coarse_solver_ca_lambda_max[i] + = kv.get(subsection, "coarse_solver_ca_lambda_max", multigrid_param->coarse_solver_ca_lambda_max[i]); multigrid_param->smoother[i] = kv.get(subsection, "smoother", multigrid_param->smoother[i]); multigrid_param->smoother_tol[i] = kv.get(subsection, "smoother_tol", multigrid_param->smoother_tol[i]); multigrid_param->nu_pre[i] = kv.get(subsection, "nu_pre", multigrid_param->nu_pre[i]); multigrid_param->nu_post[i] = kv.get(subsection, "nu_post", multigrid_param->nu_post[i]); - multigrid_param->smoother_solver_ca_basis[i] = kv.get(subsection, "smoother_solver_ca_basis", multigrid_param->smoother_solver_ca_basis[i]); - multigrid_param->smoother_solver_ca_lambda_min[i] = kv.get(subsection, "smoother_solver_ca_lambda_min", 
multigrid_param->smoother_solver_ca_lambda_min[i]); - multigrid_param->smoother_solver_ca_lambda_max[i] = kv.get(subsection, "smoother_solver_ca_lambda_max", multigrid_param->smoother_solver_ca_lambda_max[i]); + multigrid_param->smoother_solver_ca_basis[i] + = kv.get(subsection, "smoother_solver_ca_basis", multigrid_param->smoother_solver_ca_basis[i]); + multigrid_param->smoother_solver_ca_lambda_min[i] = kv.get( + subsection, "smoother_solver_ca_lambda_min", multigrid_param->smoother_solver_ca_lambda_min[i]); + multigrid_param->smoother_solver_ca_lambda_max[i] = kv.get( + subsection, "smoother_solver_ca_lambda_max", multigrid_param->smoother_solver_ca_lambda_max[i]); multigrid_param->omega[i] = kv.get(subsection, "omega", multigrid_param->omega[i]); - multigrid_param->smoother_halo_precision[i] = kv.get(subsection, "smoother_halo_precision", multigrid_param->smoother_halo_precision[i]); - multigrid_param->smoother_schwarz_type[i] = kv.get(subsection, "smoother_schwarz_type", multigrid_param->smoother_schwarz_type[i]); - multigrid_param->smoother_schwarz_cycle[i] = kv.get(subsection, "smoother_schwarz_cycle", multigrid_param->smoother_schwarz_cycle[i]); - multigrid_param->coarse_grid_solution_type[i] = kv.get(subsection, "coarse_grid_solution_type", multigrid_param->coarse_grid_solution_type[i]); - multigrid_param->smoother_solve_type[i] = kv.get(subsection, "smoother_solve_type", multigrid_param->smoother_solve_type[i]); - multigrid_param->cycle_type[i] = kv.get(subsection, "cycle_type", multigrid_param->cycle_type[i]); - multigrid_param->global_reduction[i] = kv.get(subsection, "global_reduction", multigrid_param->global_reduction[i]); + multigrid_param->smoother_halo_precision[i] + = kv.get(subsection, "smoother_halo_precision", multigrid_param->smoother_halo_precision[i]); + multigrid_param->smoother_schwarz_type[i] + = kv.get(subsection, "smoother_schwarz_type", multigrid_param->smoother_schwarz_type[i]); + multigrid_param->smoother_schwarz_cycle[i] + = 
kv.get(subsection, "smoother_schwarz_cycle", multigrid_param->smoother_schwarz_cycle[i]); + multigrid_param->coarse_grid_solution_type[i] = kv.get( + subsection, "coarse_grid_solution_type", multigrid_param->coarse_grid_solution_type[i]); + multigrid_param->smoother_solve_type[i] + = kv.get(subsection, "smoother_solve_type", multigrid_param->smoother_solve_type[i]); + multigrid_param->cycle_type[i] + = kv.get(subsection, "cycle_type", multigrid_param->cycle_type[i]); + multigrid_param->global_reduction[i] + = kv.get(subsection, "global_reduction", multigrid_param->global_reduction[i]); multigrid_param->location[i] = kv.get(subsection, "location", multigrid_param->location[i]); - multigrid_param->setup_location[i] = kv.get(subsection, "setup_location", multigrid_param->setup_location[i]); - multigrid_param->use_eig_solver[i] = kv.get(subsection, "use_eig_solver", multigrid_param->use_eig_solver[i]); + multigrid_param->setup_location[i] + = kv.get(subsection, "setup_location", multigrid_param->setup_location[i]); + multigrid_param->use_eig_solver[i] + = kv.get(subsection, "use_eig_solver", multigrid_param->use_eig_solver[i]); multigrid_param->vec_load[i] = kv.get(subsection, "vec_load", multigrid_param->vec_load[i]); multigrid_param->vec_store[i] = kv.get(subsection, "vec_store", multigrid_param->vec_store[i]); - /*strcpy(multigrid_param->vec_infile[i], kv.get(subsection, "vec_infile", multigrid_param->vec_infile[i]).c_str()); - strcpy(multigrid_param->vec_outfile[i], kv.get(subsection, "vec_outfile", multigrid_param->vec_outfile[i]).c_str());*/ + /*strcpy(multigrid_param->vec_infile[i], kv.get(subsection, "vec_infile", + multigrid_param->vec_infile[i]).c_str()); strcpy(multigrid_param->vec_outfile[i], + kv.get(subsection, "vec_outfile", multigrid_param->vec_outfile[i]).c_str());*/ multigrid_param->mu_factor[i] = kv.get(subsection, "mu_factor", multigrid_param->mu_factor[i]); - multigrid_param->transfer_type[i] = kv.get(subsection, "transfer_type", 
multigrid_param->transfer_type[i]); + multigrid_param->transfer_type[i] + = kv.get(subsection, "transfer_type", multigrid_param->transfer_type[i]); } } } /* transfer of the struct to all processes */ - MPI_Bcast((void*) param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); - MPI_Bcast((void*) invert_param_mg, sizeof(*invert_param_mg), MPI_BYTE, 0, MPI_COMM_WORLD); - MPI_Bcast((void*) multigrid_param, sizeof(*multigrid_param), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast((void *)param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast((void *)invert_param_mg, sizeof(*invert_param_mg), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast((void *)multigrid_param, sizeof(*multigrid_param), MPI_BYTE, 0, MPI_COMM_WORLD); multigrid_param->invert_param = invert_param_mg; /** @@ -1483,18 +1473,16 @@ void* openQCD_qudaSolverReadIn(int id) additional_prop->id = id; additional_prop->mg_param = multigrid_param; additional_prop->u1csw = qudaState.layout.dirac_parms().u1csw; - param->additional_prop = reinterpret_cast(additional_prop); + param->additional_prop = reinterpret_cast(additional_prop); - return (void*) param; + return (void *)param; } - -void* openQCD_qudaSolverGetHandle(int id) +void *openQCD_qudaSolverGetHandle(int id) { if (qudaState.handles[id] == nullptr) { if (id != -1) { - logQuda(QUDA_VERBOSE, "Read in solver parameters from file %s for solver (id=%d)\n", - qudaState.infile, id); + logQuda(QUDA_VERBOSE, "Read in solver parameters from file %s for solver (id=%d)\n", qudaState.infile, id); } qudaState.handles[id] = openQCD_qudaSolverReadIn(id); } @@ -1503,7 +1491,6 @@ void* openQCD_qudaSolverGetHandle(int id) return qudaState.handles[id]; } - void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p) { QudaInvertParam param = newOpenQCDDiracParam(p); @@ -1519,14 +1506,11 @@ void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p) logQuda(QUDA_DEBUG_VERBOSE, " secs = %.2e\n", param.secs); } - void openQCD_qudaDw(double mu, 
void *in, void *out) { - if (gauge_field_get_unset()) { - errorQuda("Gauge field not populated in openQxD."); - } + if (gauge_field_get_unset()) { errorQuda("Gauge field not populated in openQxD."); } - QudaInvertParam* param = static_cast(openQCD_qudaSolverGetHandle(-1)); + QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(-1)); param->mu = mu; if (!openQCD_qudaInvertParamCheck(param)) { @@ -1540,7 +1524,6 @@ void openQCD_qudaDw(double mu, void *in, void *out) MatQuda(static_cast(out), static_cast(in), param); } - /** * @brief Take the string-hash over a struct using std::hash. * @@ -1553,42 +1536,39 @@ void openQCD_qudaDw(double mu, void *in, void *out) template int hash_struct(T *in) { int hash = 0; - char* cstruct = reinterpret_cast(in); + char *cstruct = reinterpret_cast(in); - for (char* c = cstruct; c < cstruct + sizeof(T); c += strlen(c) + 1) { - if (strlen(c) != 0) { - hash ^= (std::hash{}(std::string(c)) << 1); - } + for (char *c = cstruct; c < cstruct + sizeof(T); c += strlen(c) + 1) { + if (strlen(c) != 0) { hash ^= (std::hash {}(std::string(c)) << 1); } } return hash; } - int openQCD_qudaSolverGetHash(int id) { if (qudaState.handles[id] != nullptr) { - QudaInvertParam* param = reinterpret_cast(qudaState.handles[id]); + QudaInvertParam *param = reinterpret_cast(qudaState.handles[id]); QudaInvertParam hparam = newQudaInvertParam(); memset(&hparam, '\0', sizeof(QudaInvertParam)); /* set everything to zero */ /* Set some properties we want to take the hash over */ - hparam.inv_type = param->inv_type; - hparam.tol = param->tol; - hparam.tol_restart = param->tol_restart; - hparam.tol_hq = param->tol_hq; - hparam.maxiter = param->maxiter; - hparam.reliable_delta = param->reliable_delta; - hparam.solution_type = param->solution_type; - hparam.solve_type = param->solve_type; - hparam.matpc_type = param->matpc_type; - hparam.dagger = param->dagger; - hparam.mass_normalization = param->mass_normalization; + hparam.inv_type = param->inv_type; + 
hparam.tol = param->tol; + hparam.tol_restart = param->tol_restart; + hparam.tol_hq = param->tol_hq; + hparam.maxiter = param->maxiter; + hparam.reliable_delta = param->reliable_delta; + hparam.solution_type = param->solution_type; + hparam.solve_type = param->solve_type; + hparam.matpc_type = param->matpc_type; + hparam.dagger = param->dagger; + hparam.mass_normalization = param->mass_normalization; hparam.solver_normalization = param->solver_normalization; - hparam.cpu_prec = param->cpu_prec; - hparam.cuda_prec = param->cuda_prec; - hparam.use_init_guess = param->use_init_guess; - hparam.gcrNkrylov = param->gcrNkrylov; + hparam.cpu_prec = param->cpu_prec; + hparam.cuda_prec = param->cuda_prec; + hparam.use_init_guess = param->use_init_guess; + hparam.gcrNkrylov = param->gcrNkrylov; return hash_struct(&hparam); } else { @@ -1596,44 +1576,38 @@ int openQCD_qudaSolverGetHash(int id) } } - void openQCD_qudaSolverPrintSetup(int id) { if (qudaState.handles[id] != nullptr) { - QudaInvertParam *param = static_cast(qudaState.handles[id]); - openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); + QudaInvertParam *param = static_cast(qudaState.handles[id]); + openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); printQudaInvertParam(param); - printfQuda("additional_prop->infile = %s\n", additional_prop->infile); - printfQuda("additional_prop->id = %d\n", additional_prop->id); - printfQuda("additional_prop->mg_param = %p\n", additional_prop->mg_param); - printfQuda("additional_prop->u1csw = %.6e\n", additional_prop->u1csw); - printfQuda("additional_prop->mg_ud_rev = %d\n", additional_prop->mg_ud_rev); - printfQuda("additional_prop->mg_ad_rev = %d\n", additional_prop->mg_ad_rev); - printfQuda("additional_prop->mg_kappa = %.6e\n", additional_prop->mg_kappa); + printfQuda("additional_prop->infile = %s\n", additional_prop->infile); + printfQuda("additional_prop->id = %d\n", additional_prop->id); + printfQuda("additional_prop->mg_param = 
%p\n", additional_prop->mg_param); + printfQuda("additional_prop->u1csw = %.6e\n", additional_prop->u1csw); + printfQuda("additional_prop->mg_ud_rev = %d\n", additional_prop->mg_ud_rev); + printfQuda("additional_prop->mg_ad_rev = %d\n", additional_prop->mg_ad_rev); + printfQuda("additional_prop->mg_kappa = %.6e\n", additional_prop->mg_kappa); printfQuda("additional_prop->mg_su3csw = %.6e\n", additional_prop->mg_su3csw); - printfQuda("additional_prop->mg_u1csw = %.6e\n", additional_prop->mg_u1csw); + printfQuda("additional_prop->mg_u1csw = %.6e\n", additional_prop->mg_u1csw); printfQuda("handle = %p\n", param); printfQuda("hash = %d\n", openQCD_qudaSolverGetHash(id)); printfQuda("inv_type_precondition = %d\n", param->inv_type_precondition); - if (param->inv_type_precondition == QUDA_MG_INVERTER) { - printQudaMultigridParam(additional_prop->mg_param); - } + if (param->inv_type_precondition == QUDA_MG_INVERTER) { printQudaMultigridParam(additional_prop->mg_param); } } else { printfQuda("\n"); } } - double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int *status) { - if (gauge_field_get_unset()) { - errorQuda("Gauge field not populated in openQxD."); - } + if (gauge_field_get_unset()) { errorQuda("Gauge field not populated in openQxD."); } - QudaInvertParam* param = static_cast(openQCD_qudaSolverGetHandle(id)); + QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(id)); param->mu = mu; if (!openQCD_qudaInvertParamCheck(param)) { @@ -1650,9 +1624,8 @@ double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int * errorQuda("qudaState.layout.h_sw is not set."); } - logQuda(QUDA_VERBOSE, "Calling invertQuda() ...\n"); - PUSH_RANGE("invertQuda",5); + PUSH_RANGE("invertQuda", 5); invertQuda(static_cast(solution), static_cast(source), param); POP_RANGE; @@ -1661,33 +1634,29 @@ double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int * logQuda(QUDA_VERBOSE, "openQCD_qudaInvert()\n"); 
logQuda(QUDA_VERBOSE, " true_res = %.2e\n", param->true_res); logQuda(QUDA_VERBOSE, " true_res_hq = %.2e\n", param->true_res_hq); - logQuda(QUDA_VERBOSE, " iter = %d\n", param->iter); + logQuda(QUDA_VERBOSE, " iter = %d\n", param->iter); logQuda(QUDA_VERBOSE, " gflops = %.2e\n", param->gflops); logQuda(QUDA_VERBOSE, " secs = %.2e\n", param->secs); - logQuda(QUDA_VERBOSE, " status = %d\n", *status); + logQuda(QUDA_VERBOSE, " status = %d\n", *status); return param->true_res; } - void openQCD_qudaSolverDestroy(int id) { if (qudaState.handles[id] != nullptr) { - QudaInvertParam *param = static_cast(qudaState.handles[id]); + QudaInvertParam *param = static_cast(qudaState.handles[id]); - if (param->inv_type_precondition == QUDA_MG_INVERTER) { - destroyMultigridQuda(param->preconditioner); - } + if (param->inv_type_precondition == QUDA_MG_INVERTER) { destroyMultigridQuda(param->preconditioner); } - delete static_cast(param->additional_prop)->mg_param; - delete static_cast(param->additional_prop); + delete static_cast(param->additional_prop)->mg_param; + delete static_cast(param->additional_prop); delete param; qudaState.handles[id] = nullptr; } } - -void* openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) +void *openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) { int my_rank; @@ -1695,7 +1664,7 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) QudaVerbosity verbosity = QUDA_SUMMARIZE; /* Allocate on the heap */ - QudaEigParam* param = new QudaEigParam(newQudaEigParam()); + QudaEigParam *param = new QudaEigParam(newQudaEigParam()); if (my_rank == 0) { @@ -1705,13 +1674,11 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) verbosity = kv.get(section, "verbosity", verbosity); - if (verbosity >= QUDA_DEBUG_VERBOSE) { - kv.dump(); - } + if (verbosity >= QUDA_DEBUG_VERBOSE) { kv.dump(); } if (kv.get(section, "solver") != "QUDA") { errorQuda("Eigensolver section \"%s\" in 
file %s is not a valid quda-eigensolver section (solver = %s)\n", - section, infile == nullptr ? qudaState.infile : infile, kv.get(section, "solver").c_str()); + section, infile == nullptr ? qudaState.infile : infile, kv.get(section, "solver").c_str()); } param->eig_type = kv.get(section, "eig_type", param->eig_type); @@ -1761,10 +1728,10 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) } /* transfer of the struct to all the processes */ - MPI_Bcast((void*) param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); + MPI_Bcast((void *)param, sizeof(*param), MPI_BYTE, 0, MPI_COMM_WORLD); void *inv_param = openQCD_qudaSolverGetHandle(solver_id); - param->invert_param = static_cast(inv_param); + param->invert_param = static_cast(inv_param); param->invert_param->verbosity = std::max(param->invert_param->verbosity, verbosity); @@ -1772,21 +1739,18 @@ void* openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) printQudaInvertParam(param->invert_param); } - if (param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { - printQudaEigParam(param); - } + if (param->invert_param->verbosity >= QUDA_DEBUG_VERBOSE) { printQudaEigParam(param); } - return (void*) param; + return (void *)param; } - void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) { - QudaEigParam* eig_param = static_cast(param); + QudaEigParam *eig_param = static_cast(param); logQuda(QUDA_VERBOSE, "Calling eigensolveQuda() ...\n"); - PUSH_RANGE("eigensolveQuda",6); - eigensolveQuda(h_evecs, static_cast(h_evals), eig_param); + PUSH_RANGE("eigensolveQuda", 6); + eigensolveQuda(h_evecs, static_cast(h_evals), eig_param); POP_RANGE; logQuda(QUDA_SUMMARIZE, "openQCD_qudaEigensolve()\n"); @@ -1795,11 +1759,10 @@ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) logQuda(QUDA_SUMMARIZE, " iter = %d\n", eig_param->invert_param->iter); } - void openQCD_qudaEigensolverDestroy(void *param) { - QudaEigParam* eig_param = static_cast(param); - 
openQCD_QudaSolver *additional_prop = static_cast(eig_param->invert_param->additional_prop); + QudaEigParam *eig_param = static_cast(param); + openQCD_QudaSolver *additional_prop = static_cast(eig_param->invert_param->additional_prop); if (additional_prop == nullptr) { delete eig_param->invert_param; } else { diff --git a/lib/targets/cuda/comm_target.cpp b/lib/targets/cuda/comm_target.cpp index b8bd74d770..89bc7f2a38 100644 --- a/lib/targets/cuda/comm_target.cpp +++ b/lib/targets/cuda/comm_target.cpp @@ -76,7 +76,10 @@ namespace quda if (comm_dim(dim) == 1) continue; #endif // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this - const int num_dir = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2; + const int num_dir + = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? + 1 : + 2; for (int dir = 0; dir < num_dir; dir++) { remote[dim][dir] = nullptr; #ifndef NVSHMEM_COMMS diff --git a/lib/targets/hip/comm_target.cpp b/lib/targets/hip/comm_target.cpp index 622f73b9ee..30812d4440 100644 --- a/lib/targets/hip/comm_target.cpp +++ b/lib/targets/hip/comm_target.cpp @@ -70,7 +70,10 @@ namespace quda for (int dim = 0; dim < 4; ++dim) { if (comm_dim(dim) == 1) continue; // even if comm_dim(dim) == 2, we might not have p2p enabled in both directions, so check this - const int num_dir = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) && comm_peer2peer_enabled(1, dim)) ? 1 : 2; + const int num_dir = (!comm_dim_cstar(dim) && comm_dim(dim) == 2 && comm_peer2peer_enabled(0, dim) + && comm_peer2peer_enabled(1, dim)) ? 
+ 1 : + 2; for (int dir = 0; dir < num_dir; ++dir) { remote[dim][dir] = nullptr; if (!comm_peer2peer_enabled(dir, dim)) continue; From 3150761387c6a4be123cd6255286aa91cc336fc1 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 28 Feb 2024 12:43:12 +0100 Subject: [PATCH 144/148] eigensolver iterface rewritten to use identifiers too (as inverter interface) --- include/quda_openqcd_interface.h | 35 ++++--- lib/openqcd_interface.cpp | 169 ++++++++++++++++++++++++------- 2 files changed, 153 insertions(+), 51 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 4db29e04a0..8c01a92bcf 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -1,5 +1,8 @@ #pragma once +#define OPENQCD_MAX_INVERTERS 32 +#define OPENQCD_MAX_EIGENSOLVERS 32 + /** * The macro battle below is to trick quda.h to think that double_complex is * defined to be the struct below. For this we need to set the __CUDACC_RTC__, @@ -140,7 +143,8 @@ typedef struct { * Notice that this void pointer HAS to be directly before * handles[32], because it's possible to call * openQCD_qudaSolverGetHandle with -1. */ - void *handles[32]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ + void *inv_handles[OPENQCD_MAX_INVERTERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ + void *eig_handles[OPENQCD_MAX_EIGENSOLVERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ char infile[1024]; /** Path to the input file (if given to quda_init()) */ } openQCD_QudaState_t; @@ -346,15 +350,21 @@ void openQCD_qudaSolverDestroy(int id); * section given by the [section] parameter may contain every member of the * struct [QudaEigParam]. 
* - * @param[in] infile Ini-file containing sections about the eigen-solver, - * if null we use the value of qudaState.infile - * @param[in] section The section name of the eigen-solver + * @param[in] id The section id of the eigensolver * @param[in] solver_id The section id of the solver. If -1, the section is * not read in. * * @return Pointer to the eigen-solver context */ -void *openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id); +void *openQCD_qudaEigensolverGetHandle(int id, int solver_id); + +/** + * @brief Print the eigensolver setup. + * + * @param[in] id The eigensolver identifier + * @param[in] solver_id The solver identifier + */ +void openQCD_qudaEigensolverPrintSetup(int id, int solver_id); /** * @brief Solve Ax=b for an Clover Wilson operator with a multigrid @@ -362,20 +372,21 @@ void *openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id); * (CPU) field in openQCD order. This function requires an * existing solver context created with openQCD_qudaSolverSetup(). * - * @param[inout] param Pointer returned by openQCD_qudaEigensolverSetup() - * @param[inout] h_evecs Allocated array of void-pointers to param->n_conf - * fields - * @param[out] h_evals Allocated array of param->n_conf complex_dbles + * @param[in] id The eigensolver section identifier + * @param[in] solver_id The solver section identifier + * @param[inout] h_evecs Allocated array of void-pointers to param->n_conf + * fields + * @param[out] h_evals Allocated array of param->n_conf complex_dbles */ -void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals); +void openQCD_qudaEigensolve(int id, int solver_id, void **h_evecs, void *h_evals); /** * @brief Destroys an existing eigen-solver context and frees all involed * structs. 
* - * @param param Pointer to the context to destroy + * @param[in] id The section id of the eigensolver */ -void openQCD_qudaEigensolverDestroy(void *param); +void openQCD_qudaEigensolverDestroy(int id); /** * @brief Wrapper for the plaquette. We could call plaqQuda() directly in diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 6635fa3307..36300aed40 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -21,7 +21,7 @@ #define MAX(a, b) ((a) > (b) ? (a) : (b)) -static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, {}, {}, nullptr, {}, ""}; +static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, {}, {}, nullptr, {}, {}, ""}; using namespace quda; @@ -558,8 +558,12 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, void openQCD_qudaFinalize(void) { - for (int id = 0; id < 32; ++id) { - if (qudaState.handles[id] != nullptr) { openQCD_qudaSolverDestroy(id); } + for (int id = 0; id < OPENQCD_MAX_INVERTERS; ++id) { + if (qudaState.inv_handles[id] != nullptr) { openQCD_qudaSolverDestroy(id); } + } + + for (int id = 0; id < OPENQCD_MAX_EIGENSOLVERS; ++id) { + if (qudaState.eig_handles[id] != nullptr) { openQCD_qudaEigensolverDestroy(id); } } qudaState.initialized = false; @@ -957,6 +961,30 @@ int openQCD_qudaInvertParamCheck(void *param_) return ret; } +/** + * @brief Check if the solver identifier is in bounds + * + * @param[in] id The identifier + */ +void inline check_solver_id(int id) +{ + if (id < -1 || id > OPENQCD_MAX_INVERTERS-1) { + errorQuda("Solver id %d is out of range [%d, %d).", id, -1, OPENQCD_MAX_INVERTERS); + } +} + +/** + * @brief Check if the eigen-solver identifier is in bounds + * + * @param[in] id The identifier + */ +void inline check_eigensolver_id(int id) +{ + if (id < 0 || id > OPENQCD_MAX_EIGENSOLVERS-1) { + errorQuda("Eigensolver id %d is out of range [%d, %d).", id, 0, OPENQCD_MAX_EIGENSOLVERS); + } +} + /** * @brief 
Transfer the gauge field if the gauge field was updated in * openQxD. (Re-)calculate or transfer the clover field if @@ -1480,15 +1508,16 @@ void *openQCD_qudaSolverReadIn(int id) void *openQCD_qudaSolverGetHandle(int id) { - if (qudaState.handles[id] == nullptr) { + check_solver_id(id); + if (qudaState.inv_handles[id] == nullptr) { if (id != -1) { logQuda(QUDA_VERBOSE, "Read in solver parameters from file %s for solver (id=%d)\n", qudaState.infile, id); } - qudaState.handles[id] = openQCD_qudaSolverReadIn(id); + qudaState.inv_handles[id] = openQCD_qudaSolverReadIn(id); } - openQCD_qudaSolverUpdate(qudaState.handles[id]); - return qudaState.handles[id]; + openQCD_qudaSolverUpdate(qudaState.inv_handles[id]); + return qudaState.inv_handles[id]; } void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p) @@ -1547,8 +1576,9 @@ template int hash_struct(T *in) int openQCD_qudaSolverGetHash(int id) { - if (qudaState.handles[id] != nullptr) { - QudaInvertParam *param = reinterpret_cast(qudaState.handles[id]); + check_solver_id(id); + if (qudaState.inv_handles[id] != nullptr) { + QudaInvertParam *param = reinterpret_cast(qudaState.inv_handles[id]); QudaInvertParam hparam = newQudaInvertParam(); memset(&hparam, '\0', sizeof(QudaInvertParam)); /* set everything to zero */ @@ -1578,8 +1608,9 @@ int openQCD_qudaSolverGetHash(int id) void openQCD_qudaSolverPrintSetup(int id) { - if (qudaState.handles[id] != nullptr) { - QudaInvertParam *param = static_cast(qudaState.handles[id]); + check_solver_id(id); + if (qudaState.inv_handles[id] != nullptr) { + QudaInvertParam *param = static_cast(qudaState.inv_handles[id]); openQCD_QudaSolver *additional_prop = static_cast(param->additional_prop); printQudaInvertParam(param); @@ -1607,16 +1638,10 @@ double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int * { if (gauge_field_get_unset()) { errorQuda("Gauge field not populated in openQxD."); } - QudaInvertParam *param = 
static_cast(openQCD_qudaSolverGetHandle(id)); - param->mu = mu; - - if (!openQCD_qudaInvertParamCheck(param)) { - errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync."); - } - /** - * This is to make sure we behave in the same way as openQCDs solvers. We have - * to make sure that the SW-term in openQxD is setup and in sync with QUDAs. + * This is to make sure we behave in the same way as openQCDs solvers, we call + * h_sw() which in turn calls sw_term(). We have to make sure that the SW-term + * in openQxD is setup and in sync with QUDAs. */ if (qudaState.layout.h_sw != nullptr) { qudaState.layout.h_sw(); @@ -1624,6 +1649,13 @@ double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int * errorQuda("qudaState.layout.h_sw is not set."); } + QudaInvertParam *param = static_cast(openQCD_qudaSolverGetHandle(id)); + param->mu = mu; + + if (!openQCD_qudaInvertParamCheck(param)) { + errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync."); + } + logQuda(QUDA_VERBOSE, "Calling invertQuda() ...\n"); PUSH_RANGE("invertQuda", 5); invertQuda(static_cast(solution), static_cast(source), param); @@ -1644,33 +1676,41 @@ double openQCD_qudaInvert(int id, double mu, void *source, void *solution, int * void openQCD_qudaSolverDestroy(int id) { - if (qudaState.handles[id] != nullptr) { - QudaInvertParam *param = static_cast(qudaState.handles[id]); + check_solver_id(id); + if (qudaState.inv_handles[id] != nullptr) { + QudaInvertParam *param = static_cast(qudaState.inv_handles[id]); if (param->inv_type_precondition == QUDA_MG_INVERTER) { destroyMultigridQuda(param->preconditioner); } delete static_cast(param->additional_prop)->mg_param; delete static_cast(param->additional_prop); delete param; - qudaState.handles[id] = nullptr; + qudaState.inv_handles[id] = nullptr; } } -void *openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) +void *openQCD_qudaEigensolverReadIn(int id, 
int solver_id) { int my_rank; + QudaEigParam *param; MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); QudaVerbosity verbosity = QUDA_SUMMARIZE; /* Allocate on the heap */ - QudaEigParam *param = new QudaEigParam(newQudaEigParam()); + if (qudaState.eig_handles[id] == nullptr) { + param = new QudaEigParam(newQudaEigParam()); + } else { + param = static_cast(qudaState.eig_handles[id]); + } if (my_rank == 0) { KeyValueStore kv; kv.set_map(&enum_map); - kv.load(infile == nullptr ? qudaState.infile : infile); + kv.load(qudaState.infile); + + std::string section = "Eigensolver " + std::to_string(id); verbosity = kv.get(section, "verbosity", verbosity); @@ -1678,7 +1718,7 @@ void *openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) if (kv.get(section, "solver") != "QUDA") { errorQuda("Eigensolver section \"%s\" in file %s is not a valid quda-eigensolver section (solver = %s)\n", - section, infile == nullptr ? qudaState.infile : infile, kv.get(section, "solver").c_str()); + section.c_str(), qudaState.infile, kv.get(section, "solver").c_str()); } param->eig_type = kv.get(section, "eig_type", param->eig_type); @@ -1744,9 +1784,53 @@ void *openQCD_qudaEigensolverSetup(char *infile, char *section, int solver_id) return (void *)param; } -void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) + +void *openQCD_qudaEigensolverGetHandle(int id, int solver_id) +{ + check_eigensolver_id(id); + check_solver_id(solver_id); + + if (qudaState.eig_handles[id] == nullptr) { + logQuda(QUDA_VERBOSE, "Read in eigensolver parameters from file %s for eigensolver (id=%d)\n", qudaState.infile, id); + qudaState.eig_handles[id] = openQCD_qudaEigensolverReadIn(id, solver_id); + } + + openQCD_qudaSolverUpdate(static_cast(qudaState.eig_handles[id])->invert_param); + return qudaState.eig_handles[id]; +} + + +void openQCD_qudaEigensolverPrintSetup(int id, int solver_id) +{ + check_eigensolver_id(id); + check_solver_id(solver_id); + + if (qudaState.eig_handles[id] != nullptr) 
{ + QudaEigParam *param = static_cast(qudaState.eig_handles[id]); + printQudaEigParam(param); + printfQuda("\n"); + openQCD_qudaSolverPrintSetup(solver_id); + } else { + printfQuda("\n"); + } +} + + +void openQCD_qudaEigensolve(int id, int solver_id, void **h_evecs, void *h_evals) { - QudaEigParam *eig_param = static_cast(param); + if (gauge_field_get_unset()) { errorQuda("Gauge field not populated in openQxD."); } + + if (qudaState.layout.h_sw != nullptr) { + qudaState.layout.h_sw(); + } else { + errorQuda("qudaState.layout.h_sw is not set."); + } + + QudaEigParam *eig_param = static_cast(openQCD_qudaEigensolverGetHandle(id, solver_id)); + + if (!openQCD_qudaInvertParamCheck(eig_param->invert_param)) { + errorQuda("Solver check failed, parameters/fields between openQxD and QUDA are not in sync."); + } logQuda(QUDA_VERBOSE, "Calling eigensolveQuda() ...\n"); PUSH_RANGE("eigensolveQuda", 6); @@ -1754,19 +1838,26 @@ void openQCD_qudaEigensolve(void *param, void **h_evecs, void *h_evals) POP_RANGE; logQuda(QUDA_SUMMARIZE, "openQCD_qudaEigensolve()\n"); - logQuda(QUDA_SUMMARIZE, " gflops = %.2e\n", eig_param->gflops); - logQuda(QUDA_SUMMARIZE, " secs = %.2e\n", eig_param->secs); + logQuda(QUDA_SUMMARIZE, " gflops = %.2e\n", eig_param->invert_param->gflops); + logQuda(QUDA_SUMMARIZE, " secs = %.2e\n", eig_param->invert_param->secs); logQuda(QUDA_SUMMARIZE, " iter = %d\n", eig_param->invert_param->iter); } -void openQCD_qudaEigensolverDestroy(void *param) +void openQCD_qudaEigensolverDestroy(int id) { - QudaEigParam *eig_param = static_cast(param); - openQCD_QudaSolver *additional_prop = static_cast(eig_param->invert_param->additional_prop); - if (additional_prop == nullptr) { - delete eig_param->invert_param; - } else { - openQCD_qudaSolverDestroy(additional_prop->id); + check_eigensolver_id(id); + + if (qudaState.eig_handles[id] != nullptr) { + QudaEigParam *eig_param = static_cast(qudaState.eig_handles[id]); + openQCD_QudaSolver *additional_prop = 
static_cast(eig_param->invert_param->additional_prop); + + if (additional_prop == nullptr) { + delete eig_param->invert_param; + } else { + openQCD_qudaSolverDestroy(additional_prop->id); + } + + delete eig_param; + qudaState.eig_handles[id] = nullptr; } - delete eig_param; } From 0752a23426015792c926542fc937bfdc52aac35c Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 28 Feb 2024 14:20:15 +0100 Subject: [PATCH 145/148] fixed compiler error in CI/CD --- include/instantiate_dslash.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/instantiate_dslash.h b/include/instantiate_dslash.h index c7f59b16c7..ab3c31f342 100644 --- a/include/instantiate_dslash.h +++ b/include/instantiate_dslash.h @@ -38,6 +38,7 @@ namespace quda #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-8/9", QUDA_RECONSTRUCT); #endif +#ifdef BUILD_OPENQCD_INTERFACE } else if (Recon::recon.size() > 3 && U.Reconstruct() == Recon::recon[3]) { #if QUDA_RECONSTRUCT & 2 Apply(out, in, U, args...); @@ -50,6 +51,7 @@ namespace quda #else errorQuda("QUDA_RECONSTRUCT=%d does not enable reconstruct-9", QUDA_RECONSTRUCT); #endif +#endif /* BUILD_OPENQCD_INTERFACE */ } else { errorQuda("Unsupported reconstruct type %d\n", U.Reconstruct()); } From ca54f7ea45c9fba3a9c075b1172f5bbce3c428da Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 28 Feb 2024 16:06:03 +0100 Subject: [PATCH 146/148] clang-format --- include/gamma.cuh | 12 ++++++------ include/quda_openqcd_interface.h | 14 +++++++------- lib/openqcd_interface.cpp | 7 ++----- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/include/gamma.cuh b/include/gamma.cuh index d5c82e54e3..542070d36a 100644 --- a/include/gamma.cuh +++ b/include/gamma.cuh @@ -27,8 +27,8 @@ namespace quda { __device__ __host__ inline int getcol(int row) const { if (basis == QUDA_DEGRAND_ROSSI_GAMMA_BASIS || basis == QUDA_OPENQCD_GAMMA_BASIS) { - switch(dir) { - case 0: + switch (dir) { + case 0: case 1: switch(row) { case 0: return 3; @@ 
-54,10 +54,10 @@ namespace quda { case 3: return 3; } break; - } + } } else { - switch(dir) { - case 0: + switch (dir) { + case 0: case 1: switch(row) { case 0: return 3; @@ -90,7 +90,7 @@ namespace quda { case 3: return 1; } break; - } + } } return 0; } diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 8c01a92bcf..7470d8545f 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -139,13 +139,13 @@ typedef struct { double swd_u1csw; /** U(1) csw coefficient corresponding to the current SW field in QUDA */ openQCD_QudaInitArgs_t init; openQCD_QudaLayout_t layout; - void *dirac_handle; /** void-pointer to QudaInvertParam struct for the Dirac operator. - * Notice that this void pointer HAS to be directly before - * handles[32], because it's possible to call - * openQCD_qudaSolverGetHandle with -1. */ - void *inv_handles[OPENQCD_MAX_INVERTERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ - void *eig_handles[OPENQCD_MAX_EIGENSOLVERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ - char infile[1024]; /** Path to the input file (if given to quda_init()) */ + void *dirac_handle; /** void-pointer to QudaInvertParam struct for the Dirac operator. + * Notice that this void pointer HAS to be directly before + * handles[32], because it's possible to call + * openQCD_qudaSolverGetHandle with -1. 
*/ + void *inv_handles[OPENQCD_MAX_INVERTERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ + void *eig_handles[OPENQCD_MAX_EIGENSOLVERS]; /** Array of void-pointers to QudaInvertParam structs for the solver(s) */ + char infile[1024]; /** Path to the input file (if given to quda_init()) */ } openQCD_QudaState_t; typedef struct openQCD_QudaSolver_s { diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 36300aed40..16e8464406 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -968,7 +968,7 @@ int openQCD_qudaInvertParamCheck(void *param_) */ void inline check_solver_id(int id) { - if (id < -1 || id > OPENQCD_MAX_INVERTERS-1) { + if (id < -1 || id > OPENQCD_MAX_INVERTERS - 1) { errorQuda("Solver id %d is out of range [%d, %d).", id, -1, OPENQCD_MAX_INVERTERS); } } @@ -980,7 +980,7 @@ void inline check_solver_id(int id) */ void inline check_eigensolver_id(int id) { - if (id < 0 || id > OPENQCD_MAX_EIGENSOLVERS-1) { + if (id < 0 || id > OPENQCD_MAX_EIGENSOLVERS - 1) { errorQuda("Eigensolver id %d is out of range [%d, %d).", id, 0, OPENQCD_MAX_EIGENSOLVERS); } } @@ -1784,7 +1784,6 @@ void *openQCD_qudaEigensolverReadIn(int id, int solver_id) return (void *)param; } - void *openQCD_qudaEigensolverGetHandle(int id, int solver_id) { check_eigensolver_id(id); @@ -1799,7 +1798,6 @@ void *openQCD_qudaEigensolverGetHandle(int id, int solver_id) return qudaState.eig_handles[id]; } - void openQCD_qudaEigensolverPrintSetup(int id, int solver_id) { check_eigensolver_id(id); @@ -1815,7 +1813,6 @@ void openQCD_qudaEigensolverPrintSetup(int id, int solver_id) } } - void openQCD_qudaEigensolve(int id, int solver_id, void **h_evecs, void *h_evals) { if (gauge_field_get_unset()) { errorQuda("Gauge field not populated in openQxD."); } From 9d9ace7eb01d0eaa34445fd0e694cdae659847f0 Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Tue, 12 Mar 2024 19:32:09 +0100 Subject: [PATCH 147/148] fixed eigensolver deallocation 
--- lib/openqcd_interface.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index 16e8464406..ecf9ccec3d 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -409,7 +409,7 @@ static lat_dim_t get_local_dims(int *fill = nullptr) * * @return rank */ -static int rankFromCoords(const int *coords, void *fdata) +int rankFromCoords(const int *coords, void *fdata) { int *base = static_cast(fdata); int *NPROC = base + 1; @@ -557,7 +557,6 @@ void openQCD_qudaInit(openQCD_QudaInitArgs_t init, openQCD_QudaLayout_t layout, void openQCD_qudaFinalize(void) { - for (int id = 0; id < OPENQCD_MAX_INVERTERS; ++id) { if (qudaState.inv_handles[id] != nullptr) { openQCD_qudaSolverDestroy(id); } } @@ -1497,7 +1496,7 @@ void *openQCD_qudaSolverReadIn(int id) * instantiated until then. */ openQCD_QudaSolver *additional_prop = new openQCD_QudaSolver(); - strcpy(additional_prop->infile, qudaState.infile); + sprintf(additional_prop->infile, "%s", qudaState.infile); additional_prop->id = id; additional_prop->mg_param = multigrid_param; additional_prop->u1csw = qudaState.layout.dirac_parms().u1csw; @@ -1846,13 +1845,6 @@ void openQCD_qudaEigensolverDestroy(int id) if (qudaState.eig_handles[id] != nullptr) { QudaEigParam *eig_param = static_cast(qudaState.eig_handles[id]); - openQCD_QudaSolver *additional_prop = static_cast(eig_param->invert_param->additional_prop); - - if (additional_prop == nullptr) { - delete eig_param->invert_param; - } else { - openQCD_qudaSolverDestroy(additional_prop->id); - } delete eig_param; qudaState.eig_handles[id] = nullptr; From f63a507d1ec604464fe5e70f441ad2f4e121a71a Mon Sep 17 00:00:00 2001 From: Roman Gruber Date: Wed, 3 Jul 2024 19:32:05 +0200 Subject: [PATCH 148/148] added openQCD_qudaDw_NoLoads (wrapper around Dirac operator that does not do any transfer of fields) --- include/quda_openqcd_interface.h | 43 ++++++++++++++++++++++++---- 
lib/openqcd_interface.cpp | 49 ++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 5 deletions(-) diff --git a/include/quda_openqcd_interface.h b/include/quda_openqcd_interface.h index 7470d8545f..d90ad42748 100644 --- a/include/quda_openqcd_interface.h +++ b/include/quda_openqcd_interface.h @@ -223,8 +223,8 @@ int openQCD_qudaIndexIup(const int *x, const int mu); double openQCD_qudaNorm(void *h_in); /** - * @brief Prototype function for the norm-square in QUDA without loading - * the field. + * @brief Prototype function for the norm-square in QUDA without + * transfering the field. Should serve as an example. * * @param[in] d_in Spinor input field (device pointer) * @@ -244,8 +244,28 @@ double openQCD_qudaNorm_NoLoads(void *d_in); */ void openQCD_qudaGamma(const int dir, void *openQCD_in, void *openQCD_out); +/** + * @brief Explicit transfer of an openQCD field from host to device + * + * @param openQCD_field Input openQCD spinor (host pointer) + * + * @return Device pointer + */ void *openQCD_qudaH2D(void *openQCD_field); + +/** + * @brief Explicit transfer of a QUDA field from device to host + * + * @param quda_field Input quda spinor field (device pointer) + * @param openQCD_field Output openQCD spinor (host pointer) + */ void openQCD_qudaD2H(void *quda_field, void *openQCD_field); + +/** + * @brief Free a device field allocated by openQCD_qudaH2D() + * + * @param quda_field Pointer to device pointer + */ void openQCD_qudaSpinorFree(void **quda_field); /** @@ -255,6 +275,8 @@ void openQCD_qudaSpinorFree(void **quda_field); * @param[in] src Source spinor field * @param[out] dst Destination spinor field * @param[in] p Dirac parameter struct + * + * @deprecated Replaced by openQCD_qudaDw() */ void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p); @@ -263,12 +285,23 @@ void openQCD_qudaDw_deprecated(void *src, void *dst, openQCD_QudaDiracParam_t p) * setup to a field. 
All fields passed and returned are host (CPU) * fields in openQCD order. * - * @param[in] mu Twisted mass - * @param in Input spinor - * @param out Output spinor + * @param[in] mu Twisted mass parameter + * @param[in] in Input spinor (host pointer) + * @param[out] out Output spinor (host pointer) */ void openQCD_qudaDw(double mu, void *in, void *out); +/** + * @brief Apply the Dirac operator that corresponds to the current openQxD + * setup to a field. All fields passed and returned are devicde + * (GPU) fields returned by openQCD_qudaH2D(). + * + * @param[in] mu Twisted mass parameter + * @param[in] d_in Input spinor (device pointer) + * @param[out] d_out Output spinor (device pointer) + */ +void openQCD_qudaDw_NoLoads(double mu, void *d_in, void *d_out); + /** * Setup the solver interface to quda. This function parses the file given by * [infile] as an openQCD ini file. The solver section given by the [id] diff --git a/lib/openqcd_interface.cpp b/lib/openqcd_interface.cpp index ecf9ccec3d..c020f6dbb6 100644 --- a/lib/openqcd_interface.cpp +++ b/lib/openqcd_interface.cpp @@ -23,6 +23,7 @@ static openQCD_QudaState_t qudaState = {false, -1, -1, -1, -1, 0.0, 0.0, 0.0, {}, {}, nullptr, {}, {}, ""}; + using namespace quda; /** @@ -1552,6 +1553,54 @@ void openQCD_qudaDw(double mu, void *in, void *out) MatQuda(static_cast(out), static_cast(in), param); } +void openQCD_qudaDw_NoLoads(double mu, void *d_in, void *d_out) +{ + if (gauge_field_get_unset()) { errorQuda("Gauge field not populated in openQxD."); } + + QudaInvertParam *inv_param = static_cast(openQCD_qudaSolverGetHandle(-1)); + inv_param->mu = mu; + + if (!openQCD_qudaInvertParamCheck(inv_param)) { + errorQuda("QudaInvertParam struct check failed, parameters/fields between openQxD and QUDA are not in sync."); + } + + /* both fields reside on the GPU */ + inv_param->input_location = QUDA_CUDA_FIELD_LOCATION; + inv_param->output_location = QUDA_CUDA_FIELD_LOCATION; + + ColorSpinorField *in = reinterpret_cast(d_in); 
+ ColorSpinorField *out = reinterpret_cast(d_out); + + /* truncated version of what MatQuda does */ + pushVerbosity(inv_param->verbosity); + + bool pc = (inv_param->solution_type == QUDA_MATPC_SOLUTION || + inv_param->solution_type == QUDA_MATPCDAG_MATPC_SOLUTION); + + DiracParam diracParam; + setDiracParam(diracParam, inv_param, pc); + + Dirac *dirac = Dirac::create(diracParam); // create the Dirac operator + dirac->M(*out, *in); // apply the operator + delete dirac; // clean up + + if (pc) { + if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION) { + blas::ax(0.25/(inv_param->kappa*inv_param->kappa), *out); + } else if (inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(0.5/inv_param->kappa, *out); + } + } else { + if (inv_param->mass_normalization == QUDA_MASS_NORMALIZATION || + inv_param->mass_normalization == QUDA_ASYMMETRIC_MASS_NORMALIZATION) { + blas::ax(0.5/inv_param->kappa, *out); + } + } + + popVerbosity(); +} + + /** * @brief Take the string-hash over a struct using std::hash. *