Improving the developer comments in the C++ implementation (#54)

* Improving the developer comments in the C++ implementation * More detailed description of the templated interface. Not using doxygen syntax given that this is not exposed in the public API. This is a dev documentation
lab-cosmo · Jun 21, 2023 · ec04903 · ec04903
1 parent d075504
commit ec04903
Show file tree

Hide file tree

Showing 6 changed files with 160 additions and 33 deletions.
diff --git a/sphericart/include/macros.hpp b/sphericart/include/macros.hpp
@@ -8,14 +8,24 @@
     values are computed for one point at a time, and that the spherical harmonics are
     stored in a contiguous section that "flattens" the (l,m) dimensions, e.g.
     [ (0,0), (1,-1), (1,0), (1,1), (2,-2), ...]
+
     Functions get pointers to the beginning of the storage space for the current sample,
     x,y,z and, for l>1, x^2, y^2 and z^2, which can be reused.
+    
     Each macro computes one l, and macros should be called in order as the higher l
     reuse calculations at lower angular momentum. The expressions here are derived
     with computer assisted algebra by attempting all possible polynomial decompositions
-    and selecting that with the smallest number of operations.
+    and selecting that with the smallest number of operations. One should call
+    COMPUTE_SPH_L* or COMPUTE_SPH_DERIVATIVE_L* depending on whether only Ylm are needed
+    or if one also want to evbaluate Cartesian derivatives
+
+    Every macro takes an agument SPH_IDX that is an indexing function, that can be used to 
+    map the consecutive indices of the Ylm to a different memory layout (this is e.g. used
+    to optimize threads in CUDA code)
 */
 
+// this is used thoughout to indicate the maximum l channel for which we provide a hard-coded macro.
+// this should be modified if further macros are added
 #define SPHERICART_LMAX_HARDCODED 6
 
 // we need this monstruosity to make sure that literals are not treated as 
@@ -297,8 +307,8 @@
     }
 
 /*
-Combines the macro hard-coded Ylm calculators to get all the terms
-up to a given value. Macro version.
+Combines the macro hard-coded Ylm calculators to get all the terms up to a given value. Macro version.
+This uses if constexpr to decide at compile time which macro(s) should be called
 */
 #define HARDCODED_SPH_MACRO(HARDCODED_LMAX, x, y, z, x2, y2, z2, sph_i, SPH_IDX) \
     static_assert(HARDCODED_LMAX <= SPHERICART_LMAX_HARDCODED, "Computing hardcoded sph beyond what is currently implemented."); \

diff --git a/sphericart/include/sphericart.hpp b/sphericart/include/sphericart.hpp
@@ -184,19 +184,21 @@ class SphericalHarmonics{
 
 /* @cond */
 private:
-    size_t l_max;
-    size_t size_y;
-    size_t size_q;
-    bool normalized;
-    int omp_num_threads;
-    T *prefactors;
+    size_t l_max; // maximum l value computed by this class
+    size_t size_y; // size of the Ylm rows (l_max+1)**2
+    size_t size_q; // size of the prefactor-like arrays (l_max+1)*(l_max+2)/2
+    bool normalized; // should we normalize the input vectors?
+    int omp_num_threads; // number of openmp thread
+    T *prefactors; // storage space for prefactor and buffers
     T *buffers;
 
     // function pointers are used to set up the right functions to be called
+    // these are set in the constructor, so that the public compute functions can 
+    // be redirected to the right implementation
     void (*_array_no_derivatives)(const T*, T*, T*, int, int, const T*, T*);
     void (*_array_with_derivatives)(const T*, T*, T*, int, int, const T*, T*);
-    // these compute a single sample
 
+    // these compute a single sample
     void (*_sample_no_derivatives)(const T*, T*, T*, int, int, const T*, const T*, T*, T*, T*);
     void (*_sample_with_derivatives)(const T*, T*, T*, int, int, const T*, const T*, T*, T*, T*);
 /* @endcond */

diff --git a/sphericart/include/templates.hpp b/sphericart/include/templates.hpp
@@ -90,17 +90,36 @@ void compute_sph_prefactors(int l_max, T *factors) {
 }
 
 template <typename T, bool DO_DERIVATIVES, bool NORMALIZED, int HARDCODED_LMAX>
-inline void hardcoded_sph_sample(const T *xyz_i, T *sph_i, [[maybe_unused]] T *dsph_i,
-    [[maybe_unused]] int l_max_dummy=0,  // dummy variables to have a uniform interface
+inline void hardcoded_sph_sample(const T *xyz_i, T *sph_i, [[maybe_unused]] T *dsph_i, 
+    [[maybe_unused]] int l_max_dummy=0,  // dummy variables to have a uniform interface with generic_sph_ functions
     [[maybe_unused]] int size_y=1,
     [[maybe_unused]] const T *py_dummy=nullptr,
     [[maybe_unused]] const T *qy_dummy=nullptr,
     [[maybe_unused]] T *c_dummy=nullptr,
     [[maybe_unused]] T *s_dummy=nullptr,
     [[maybe_unused]] T *z_dummy=nullptr) {
 /*
-    Wrapper for the hardcoded derivatives that also allows to apply normalization. Computes a single
-    sample, and uses a template to avoid branching.
+    Wrapper for the hardcoded macros that also allows to apply normalization. 
+    Computes a single sample, and uses a template to avoid branching. 
+    
+    Template parameters:
+    typename T: float type (e.g. single/double precision)
+    bool DO_DERIVATIVES: should se evaluate the derivatives?
+    bool NORMALIZED: should we normalize the input positions?
+    int HARDCODED_LMAX: which lmax value will be computed
+
+    NB: this is meant to be computed for a maximum LMAX value defined at compile time. 
+    the l_max_dummy parameter (that correspond to l_max in the generic implementation)
+    is ignored 
+
+    Actual parameters:
+    const T *xyz_i: a T array containing the x,y,z coordinates of a single sample (x,y,z)
+    T *sph_i: pointer to the storage location for the Ylm (stored as l,m= (0,0),(1,-1),(1,0),(1,1),...
+    [[maybe_unused]] T *dsph_i : pointer to the storage location for the dYlm/dx,dy,dz. 
+                    stored as for sph_i, with three consecutive blocks associated to d/dx,d/dy,d/dz 
+    [[maybe_unused]] int size_y: size of storage for the y, (HARDCODED_LMAX+1)**2
+    
+    ALL XXX_dummy variables are defined to match the interface of generic_sph_sample and are ignored
 */
 
     auto x = xyz_i[0];
@@ -109,17 +128,19 @@ inline void hardcoded_sph_sample(const T *xyz_i, T *sph_i, [[maybe_unused]] T *d
     [[maybe_unused]] auto x2 = x * x;
     [[maybe_unused]] auto y2 = y * y;
     [[maybe_unused]] auto z2 = z * z;
-    [[maybe_unused]] T ir=0.0;
+    [[maybe_unused]] T ir=0.0; // 1/r, it is only computed and used if we normalize the input vector
 
     if constexpr(NORMALIZED) {
         ir = 1/std::sqrt(x2+y2+z2);
         x*=ir; y*=ir; z*=ir;
         x2 = x*x; y2=y*y; z2=z*z;
     }
 
+    // nb: asserting that HARDCODED_LMAX is not too large is done statically inside the macro
     HARDCODED_SPH_MACRO(HARDCODED_LMAX, x, y, z, x2, y2, z2, sph_i, DUMMY_SPH_IDX);
 
     if constexpr (DO_DERIVATIVES) {
+        // computes the derivatives 
         T *dxsph_i = dsph_i;
         T *dysph_i = dxsph_i + size_y;
         T *dzsph_i = dysph_i + size_y;
@@ -144,9 +165,19 @@ void hardcoded_sph(const T *xyz, T *sph, [[maybe_unused]] T *dsph,
     [[maybe_unused]] const T *prefactors_dummy=nullptr,
     [[maybe_unused]] T *buffers_dummy=nullptr) {
     /*
-        Cartesian Ylm calculator using the hardcoded expressions.
-        Templated version, just calls _compute_sph_templated and
-        _compute_dsph_templated functions within a loop.
+        Cartesian Ylm calculator using the hardcoded expressions. 
+        Templated version, just calls hardcoded_sph_sample within a loop.
+        XXX_dummy variables are ignored
+
+        Template parameters: see hardcoded_sph_sample
+
+        Actual parameters:
+        const T *xyz: a T array containing th n_samplex*3 x,y,z coordinates of multiple 3D points
+        T *sph: pointer to the storage location for the Ylm (stored as l,m= (0,0),(1,-1),(1,0),(1,1),...
+        [[maybe_unused]] T *dsph : pointer to the storage location for the dYlm/dx,dy,dz. 
+                        stored as for sph_i, with three consecutive blocks associated to d/dx,d/dy,d/dz 
+        int n_samples: number of samples that have to be computed
+
     */
     constexpr auto size_y = (HARDCODED_LMAX + 1) * (HARDCODED_LMAX + 1);
 
@@ -158,6 +189,7 @@ void hardcoded_sph(const T *xyz, T *sph, [[maybe_unused]] T *dsph,
 
         #pragma omp for
         for (int i_sample = 0; i_sample < n_samples; i_sample++) {
+            // gets pointers to the current sample input and output arrays
             xyz_i = xyz + i_sample * 3;
             sph_i = sph + i_sample * size_y;
             if constexpr (DO_DERIVATIVES) {
@@ -184,7 +216,45 @@ static inline void generic_sph_l_channel(int l,
     [[maybe_unused]] T *dzsph_i
 )
 {
-    // working space for the recursive evaluation of Qlm and Q(l-1)m
+    /* 
+    This is the main low-level code to compute sph and dsph for an arbitrary l. 
+    The code is a bit hard to follow because of (too?) many micro-optimizations. 
+    Sine and cosine terms are precomputed. Here the Qlm modifield Legendre polynomials
+    are evaluated, and combined with the other terms and the prefactors. 
+    The core iteration is an iteration down to lower values of m,
+
+    Qlm = A z Ql(m+1) + B rxy^2 Ql(m+2)
+
+    1. the special cases with l=+-m and +-1 are done separately, also because they 
+    initialize the recursive expression
+    2. we assume that some lower l are done with hard-coding, and HARDCODED_LMAX is
+    passed as a template parameter. this is used to split the loops over m in a part
+    with fixed size, known at compile time, and one with variable length. 
+    3. we do the recursion using stack variables and never store the full Qlm array
+    4. we compute separately Qlm and Q(l-1) - the latter needed for derivatives
+    rather than reuse the calculation from another l channel. It appears that 
+    the simplification in memory access makes this beneficial, with the added advantage
+    that each l channel can be computed independently 
+
+    Template parameters:
+    typename T: float type (e.g. single/double precision)
+    bool DO_DERIVATIVES: should se evaluate the derivatives?
+    bool NORMALIZED: should we normalize the input positions?
+
+    Actual parameters:
+    l: which l we are computing
+    x,y,z: the Cartesian coordinates of the point
+    rxy: sqrt(x^2+y^2), precomputed because it's used for all l
+    pk, qlmk: prefactors used in the calculation of Ylm and Qlm, respectively
+    c,s: the c_k and s_k cos-like and sin-like terms combined with the Qlm to compute Ylm
+    twomz: 2*m*z, these are also computed once and reused for all l
+    sph_i, d[x,y,z]sph_i: storage locations of the output arrays for Ylm and dYlm/d[x,y,z]
+
+
+    */
+    // working space for the recursive evaluation of Qlm and Q(l-1)m. 
+    // qlm_[0,1,2] correspond to the current Qlm, Ql(m+1) and Ql(m+2), and the 
+    // ql1m_[0,1,2] hold the same but for l-1
     [[maybe_unused]] T qlm_2, qlm_1, qlm_0;
     [[maybe_unused]] T ql1m_2, ql1m_1, ql1m_0;
 
@@ -216,7 +286,7 @@ static inline void generic_sph_l_channel(int l,
         dysph_i[-l + 1] = dxsph_i[l - 1] = pq * c[l - 2];
         dysph_i[l - 1] = -dxsph_i[-l + 1];
 
-        // uses Q(l-1)(l-1) to initialize the other recursion
+        // uses Q(l-1)(l-1) to initialize the Qlm  recursion
         ql1m_1 = qlmk[-1];
         auto pdq = pk[l - 1] * (l + l - 1) * ql1m_1;
         dzsph_i[-l + 1] = pdq * s[l - 1];
@@ -277,7 +347,7 @@ static inline void generic_sph_l_channel(int l,
         }
     }
 
-    // m=0
+    // m=0 is also a special case
     qlm_0 = qlmk[0] * (twomz[0] * qlm_1 + rxy * qlm_2);
     sph_i[0] = qlm_0 * pk[0];
 
@@ -302,7 +372,17 @@ static inline void generic_sph_sample(const T *xyz_i,
     T *s,
     T *twomz
 ) {
-    [[maybe_unused]] T ir = 0.0;
+    /*
+    This is a low-level function that combines all the pieces to evaluate the sph for a single sample.
+    It calls both the hardcoded macros and the generic l-channel calculator, as well as the sine and cosine
+    terms that are combined with the Qlm, that are needed in the generic calculator. 
+    There is a lot of pointer algebra used to address the correct part of the various arrays, that 
+    turned out to be more efficient than explicit indexing in early tests. 
+
+    The parameters correspond to those described in generic_sph_l_channel.
+    */
+
+    [[maybe_unused]] T ir = 0.0;  // storage for computing 1/r, which is reused when NORMALIZED=true
     [[maybe_unused]] T* dxsph_i = nullptr;
     [[maybe_unused]] T* dysph_i = nullptr;
     [[maybe_unused]] T* dzsph_i = nullptr;
@@ -344,7 +424,7 @@ static inline void generic_sph_sample(const T *xyz_i,
         so that they are just plain polynomials of x,y,z.    */
     // help the compiler unroll the first part of the loop
     int m = 0;
-    auto twoz = z+z;
+    auto twoz = z+z; // twomz actually holds 2*(m+1)*z
     twomz[0] = twoz;
     // also initialize the sine and cosine, even if these never change
     c[0] = 1.0;
@@ -381,7 +461,7 @@ static inline void generic_sph_sample(const T *xyz_i,
     }
 
     auto pk = pylm+k;
-    auto qlmk = pqlm+k;
+    auto qlmk = pqlm+k; // starts at HARDCODED_LMAX+1
     for (int l = HARDCODED_LMAX + 1; l < l_max + 1; l++) {
         generic_sph_l_channel<T, DO_DERIVATIVES, HARDCODED_LMAX>(
             l,
@@ -438,12 +518,14 @@ void generic_sph(
     T *buffers
 ) {
     /*
-        Implementation of the general case, but start at HARDCODED_LMAX and use
-        hard-coding before that.
+        Implementation of the general Ylm calculator case. Starts at HARDCODED_LMAX 
+        and uses hard-coding before that.
 
         Some general implementation remarks:
         - we use an alternative iteration for the Qlm that avoids computing the
           low-l section
+        - there is OMP parallelism threading over the samples (there is probably lots
+          to optimize on this front)
         - we compute at the same time Qlm and the corresponding Ylm, to reuse
           more of the pieces and stay local in memory. we use `if constexpr`
           to avoid runtime branching in the DO_DERIVATIVES=true/false cases
@@ -452,14 +534,30 @@ void generic_sph(
           the compiler can choose to unroll if it makes sense
         - there's a bit of pointer gymnastics because of the contiguous storage
           of the irregularly-shaped Q[l,m] and Y[l,m] arrays
+
+        Template parameters:
+        typename T: float type (e.g. single/double precision)
+        bool DO_DERIVATIVES: should se evaluate the derivatives?
+        bool NORMALIZED: should we normalize the input positions?
+        int HARDCODED_LMAX: which lmax value will be computed        
+
+        Actual parameters:        
+        const T *xyz: a T array containing th n_samplex*3 x,y,z coordinates of multiple 3D points
+        T *sph: pointer to the storage location for the Ylm (stored as l,m= (0,0),(1,-1),(1,0),(1,1),...
+        [[maybe_unused]] T *dsph : pointer to the storage location for the dYlm/dx,dy,dz. 
+                        stored as for sph_i, with three consecutive blocks associated to d/dx,d/dy,d/dz 
+        int n_samples: number of samples that have to be computed  
+        int l_max: maximum l to compute
+        prefactors: pointer to an array that contains the prefactors used for Ylm and Qlm calculation
+        buffers: buffer space to compute cosine, sine and 2*m*z terms
     */
 
     // implementation assumes to use hardcoded expressions for at least l=0,1
     static_assert(HARDCODED_LMAX>=1, "Cannot call the generic Ylm calculator for l<=1.");
 
-    const auto size_y = (l_max + 1) * (l_max + 1);
-    const auto size_q = (l_max + 1) * (l_max + 2) / 2;
-    const T *qlmfactors = prefactors + size_q;
+    const auto size_y = (l_max + 1) * (l_max + 1);  // size of Ylm blocks
+    const auto size_q = (l_max + 1) * (l_max + 2) / 2;  // size of Qlm blocks
+    const T *qlmfactors = prefactors + size_q; // the coeffs. used to compute Qlm are just stored contiguously after the Ylm prefactors
 
     #pragma omp parallel
     {

diff --git a/sphericart/src/sphericart.cpp b/sphericart/src/sphericart.cpp
@@ -6,7 +6,9 @@
 
 using namespace sphericart;
 
-// macro to define different possible hardcoded function calls
+// This macro to define different possible hardcoded function calls. It is used to 
+// initialize the function pointers that are used by the `compute_` calls in the
+// SphericalHarmonics class
 #define _HARCODED_SWITCH_CASE(L_MAX) \
     if (this->normalized) { \
         this->_array_no_derivatives = &hardcoded_sph<T, false, true, L_MAX>; \
@@ -22,11 +24,17 @@ using namespace sphericart;
 
 template<typename T>
 SphericalHarmonics<T>::SphericalHarmonics(size_t l_max, bool normalized) {
+    /* 
+        This is the constructor of the SphericalHarmonics class. It initizlizes buffer         
+        space, compute prefactors, and sets the function pointers that are used for 
+        the actual calls 
+    */
+
     this->l_max = (int) l_max;
     this->size_y = (int) (l_max + 1) * (l_max + 1);
     this->size_q = (int) (l_max + 1) * (l_max + 2) / 2;
     this->normalized = normalized;
-    this->prefactors = new T[(l_max+1)*(l_max+2)];
+    this->prefactors = new T[this->size_q*2];
     this ->omp_num_threads = omp_get_max_threads();
 
     // buffers for cos, sin, 2mz arrays
@@ -37,6 +45,8 @@ SphericalHarmonics<T>::SphericalHarmonics(size_t l_max, bool normalized) {
 
     // sets the correct function pointers for the compute functions
     if (this->l_max<=SPHERICART_LMAX_HARDCODED) {
+        // If we only need hard-coded calls, we set them up at this point
+        // using a macro to avoid even more code duplication. 
         switch (this->l_max) {
         case 0:
             _HARCODED_SWITCH_CASE(0);
@@ -77,12 +87,16 @@ SphericalHarmonics<T>::SphericalHarmonics(size_t l_max, bool normalized) {
 
 template<typename T>
 SphericalHarmonics<T>::~SphericalHarmonics() {
+    // Destructor, simply frees buffers
     delete [] this->prefactors;
     delete [] this->buffers;
 }
 
+// The compute/compute_with_gradient functions decide which function to call based on the size of the input vectors
+
 template<typename T>
-void SphericalHarmonics<T>::compute(const std::vector<T>& xyz, std::vector<T>& sph) {
+void SphericalHarmonics<T>::compute(const std::vector<T>& xyz, std::vector<T>& sph) {    
+
     auto n_samples = xyz.size() / 3;
     sph.resize(n_samples * (l_max + 1) * (l_max + 1));
 
@@ -106,6 +120,8 @@ void SphericalHarmonics<T>::compute_with_gradients(const std::vector<T>& xyz, st
     }
 }
 
+// Lower-level functions are simply wrappers over function-pointer-style calls that invoke the right (hardcoded or generic)
+// C++-library call
 template<typename T>
 void SphericalHarmonics<T>::compute_array(const T* xyz, size_t xyz_length, T* sph, size_t sph_length) {
     if (xyz_length % 3 != 0) {

diff --git a/sphericart/tests/test_hardcoding.cpp b/sphericart/tests/test_hardcoding.cpp
@@ -57,6 +57,7 @@ int main(int argc, char *argv[]) {
     auto prefactors = std::vector<DTYPE>((l_max+1)*(l_max+2), 0.0);
     compute_sph_prefactors(l_max, prefactors.data());
 
+    // random values
     auto xyz = std::vector<DTYPE>(n_samples*3, 0.0);
     for (size_t i=0; i<n_samples*3; ++i) {
         xyz[i] = (DTYPE)rand()/ (DTYPE) RAND_MAX *2.0-1.0;

diff --git a/sphericart/tests/test_samples.cpp b/sphericart/tests/test_samples.cpp
@@ -1,5 +1,5 @@
 /** @file test_hardcoding.cpp
- *  @brief Checks consistency of generic and hardcoded implementations
+ *  @brief Checks consistency of array and sample calls
 */
 
 #include <unistd.h>