Skip to content

Commit

Permalink
gh-221: add mxm for matrix
Browse files Browse the repository at this point in the history
  • Loading branch information
EgorOrachyov committed Sep 1, 2023
1 parent 1a0d6a1 commit 6369d1c
Show file tree
Hide file tree
Showing 18 changed files with 484 additions and 38 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,7 @@ add_library(spla SHARED
src/cpu/cpu_m_reduce.hpp
src/cpu/cpu_m_reduce_by_column.hpp
src/cpu/cpu_m_reduce_by_row.hpp
src/cpu/cpu_mxm.hpp
src/cpu/cpu_mxmT_masked.hpp
src/cpu/cpu_mxv.hpp
src/cpu/cpu_vxm.hpp
Expand Down
1 change: 1 addition & 0 deletions include/spla.h
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ SPLA_API spla_Status spla_Algorithm_tc(int* ntrins, spla_Matrix A, spla_Matrix B

/* Scheduling and operations execution */

SPLA_API spla_Status spla_Exec_mxm(spla_Matrix R, spla_Matrix A, spla_Matrix B, spla_OpBinary op_multiply, spla_OpBinary op_add, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task);
SPLA_API spla_Status spla_Exec_mxmT_masked(spla_Matrix R, spla_Matrix mask, spla_Matrix A, spla_Matrix B, spla_OpBinary op_multiply, spla_OpBinary op_add, spla_OpSelect op_select, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task);
SPLA_API spla_Status spla_Exec_mxv_masked(spla_Vector r, spla_Vector mask, spla_Matrix M, spla_Vector v, spla_OpBinary op_multiply, spla_OpBinary op_add, spla_OpSelect op_select, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task);
SPLA_API spla_Status spla_Exec_vxm_masked(spla_Vector r, spla_Vector mask, spla_Vector v, spla_Matrix M, spla_OpBinary op_multiply, spla_OpBinary op_add, spla_OpSelect op_select, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task);
Expand Down
27 changes: 27 additions & 0 deletions include/spla/exec.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,33 @@ namespace spla {
ref_ptr<Descriptor> desc = ref_ptr<Descriptor>(),
ref_ptr<ScheduleTask>* task_hnd = nullptr);

/**
* @brief Execute (schedule) sparse-matrix sparse-matrix product
*
* @note Operation equivalent semantic is `R = AB`
* @note Pass valid `task_hnd` to store as a task, rather than execute immediately.
*
* @param R Matrix to store result of the operation
* @param A Left matrix for product
* @param B Right matrix for product
* @param op_multiply Element-wise binary operator for matrices elements product
* @param op_add Element-wise binary operator for matrices elements products sum
* @param init Init value for the matrix row and column products reduction
* @param desc Scheduled task descriptor; default is null
* @param task_hnd Optional task hnd; pass not-null pointer to store task
*
* @return Status on task execution or status on hnd creation
*/
SPLA_API Status exec_mxm(
ref_ptr<Matrix> R,
ref_ptr<Matrix> A,
ref_ptr<Matrix> B,
ref_ptr<OpBinary> op_multiply,
ref_ptr<OpBinary> op_add,
ref_ptr<Scalar> init,
ref_ptr<Descriptor> desc = ref_ptr<Descriptor>(),
ref_ptr<ScheduleTask>* task_hnd = nullptr);

/**
* @brief Execute (schedule) sparse masked matrix matrix-transposed product
*
Expand Down
37 changes: 6 additions & 31 deletions python/example.py
Original file line number Diff line number Diff line change
@@ -1,35 +1,10 @@
from pyspla import *

M = Matrix.from_lists([0, 1, 2, 2], [1, 2, 0, 4], [1, 2, 3, 4], (3, 5), INT)
print(M)

def bfs(s: int, A: Matrix):
v = Vector(A.n_rows, INT) # to store depths

front = Vector.from_lists([s], [1], A.n_rows, INT) # front of new vertices to study
front_size = 1 # current front size
depth = Scalar(INT, 0) # depth of search
count = 0 # num of reached vertices

while front_size > 0: # while have something to study
depth += 1
count += front_size
v.assign(front, depth, op_assign=INT.SECOND, op_select=INT.NQZERO) # assign depths
front = front.vxm(v, A, op_mult=INT.LAND, op_add=INT.LOR, op_select=INT.EQZERO) # do traversal
front_size = front.reduce(op_reduce=INT.PLUS).get() # update front count to end algorithm

return v, count, depth.get()


I = [0, 1, 2, 2, 3]
J = [1, 2, 0, 3, 2]
V = [1, 1, 1, 1, 1]
A = Matrix.from_lists(I, J, V, shape=(4, 4), dtype=INT)
print(A)

v, c, d = bfs(0, A)
print(v)
print(c)
print(d)
N = Matrix.from_lists([0, 1, 2, 3], [2, 0, 1, 3], [2, 3, 4, 5], (5, 4), INT)
print(N)

M = Matrix.from_lists([0, 1, 2, 3], [0, 3, 3, 2], [1, 2, 3, 4], (4, 4), INT)
print(M)
print(M.reduce_by_column(INT.PLUS))
R = M.mxm(N, INT.MULT, INT.PLUS)
print(R)
9 changes: 5 additions & 4 deletions python/pyspla/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@
automatically select GPU-based acceleration backed for computations.
>>> v, c, d = bfs(0, A)
Outhput the result vector with distances of reached vertices.
Output the result vector with distances of reached vertices.
>>> print(v)
'
0| 1
Expand Down Expand Up @@ -205,9 +205,10 @@
Library provides a set of high-level linear algebra operations over matrices and vectors with
parametrization by binary, unary and select `ops`. There is available implementation for
masked `mxmT` matrix-matrix, `mxv` matrix-vector, `vxm` vector-matrix products, matrix and
vector reductions, assignment, and so on. Most operations have both CPU and GPU implementation.
Thus, you will have GPU performance in computations out of the box.
general `mxm` matrix-matrix, masked `mxmT` matrix-matrix, masked `mxv` matrix-vector,
masked `vxm` vector-matrix products, matrix and vector reductions, assignment, and so on.
Most operations have both CPU and GPU implementation. Thus, you will have GPU performance
in computations out of the box.
Details
-------
Expand Down
3 changes: 3 additions & 0 deletions python/pyspla/bridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,7 @@ def load_library(lib_path):
_spla.spla_Algorithm_pr.argtypes = [_p_object_t, _object_t, _float, _float, _object_t]
_spla.spla_Algorithm_tc.argtypes = [_p_int, _object_t, _object_t, _object_t]

_spla.spla_Exec_mxm.restype = _status_t
_spla.spla_Exec_mxmT_masked.restype = _status_t
_spla.spla_Exec_mxv_masked.restype = _status_t
_spla.spla_Exec_vxm_masked.restype = _status_t
Expand All @@ -560,6 +561,8 @@ def load_library(lib_path):
_spla.spla_Exec_v_reduce.restype = _status_t
_spla.spla_Exec_v_count_mf.restype = _status_t

_spla.spla_Exec_mxm.argtypes = \
[_object_t, _object_t, _object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
_spla.spla_Exec_mxmT_masked.argtypes = \
[_object_t, _object_t, _object_t, _object_t, _object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
_spla.spla_Exec_mxv_masked.argtypes = \
Expand Down
85 changes: 83 additions & 2 deletions python/pyspla/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,87 @@ def dense(cls, shape, dtype=INT, fill_value=0):

return M

def mxm(self, M, op_mult, op_add, out=None, init=None, desc=None):
    """
    General sparse-matrix by sparse-matrix product.

    Generate left operand matrix of shape 3x5 for product.
    >>> M = Matrix.from_lists([0, 1, 2, 2], [1, 2, 0, 4], [1, 2, 3, 4], (3, 5), INT)
    >>> print(M)
    '
     0 1 2 3 4
    0| . 1 . . .| 0
    1| . . 2 . .| 1
    2| 3 . . . 4| 2
     0 1 2 3 4
    '

    Generate right operand matrix of shape 5x4 for product, num of rows must match.
    >>> N = Matrix.from_lists([0, 1, 2, 3], [2, 0, 1, 3], [2, 3, 4, 5], (5, 4), INT)
    >>> print(N)
    '
     0 1 2 3
    0| . . 2 .| 0
    1| 3 . . .| 1
    2| . 4 . .| 2
    3| . . . 5| 3
    4| . . . .| 4
     0 1 2 3
    '

    Evaluate product using respective element-wise operations.
    >>> R = M.mxm(N, INT.MULT, INT.PLUS)
    >>> print(R)
    '
     0 1 2 3
    0| 3 . . .| 0
    1| . 8 . .| 1
    2| . . 6 .| 2
     0 1 2 3
    '

    :param M: Matrix.
        Right matrix for a product.

    :param op_mult: OpBinary.
        Element-wise binary operator for matrices elements product.

    :param op_add: OpBinary.
        Element-wise binary operator for matrices elements products sum.

    :param out: optional: Matrix: default: None.
        Optional matrix to store result of product.

    :param init: optional: Scalar: default: 0.
        Optional neutral init value for reduction.

    :param desc: optional: Descriptor. default: None.
        Optional descriptor object to configure the execution.

    :return: Matrix with result.
    """

    # Allocate result and neutral init value lazily when the caller did not provide them
    if out is None:
        out = Matrix(shape=(self.n_rows, M.n_cols), dtype=self.dtype)
    if init is None:
        init = Scalar(dtype=self.dtype, value=0)

    assert M
    assert out
    assert init
    # All operands must share a single data type
    assert out.dtype == self.dtype
    assert M.dtype == self.dtype
    assert init.dtype == self.dtype
    # Shape compatibility: R(n x k) = A(n x m) * B(m x k)
    assert out.n_rows == self.n_rows
    assert out.n_cols == M.n_cols
    assert self.n_cols == M.n_rows

    check(backend().spla_Exec_mxm(out.hnd, self.hnd, M.hnd,
                                  op_mult.hnd, op_add.hnd,
                                  init.hnd, self._get_desc(desc), self._get_task(None)))

    return out

def mxmT(self, mask, M, op_mult, op_add, op_select, out=None, init=None, desc=None):
"""
Masked sparse-matrix by sparse-matrix^T (transposed) product with sparse-mask.
Expand Down Expand Up @@ -610,8 +691,8 @@ def mxmT(self, mask, M, op_mult, op_add, op_select, out=None, init=None, desc=No
:param op_select: OpSelect.
Selection op to filter mask.
:param out: optional: Vector: default: None.
Optional vector to store result of product.
:param out: optional: Matrix: default: None.
Optional matrix to store result of product.
:param init: optional: Scalar: default: 0.
Optional neutral init value for reduction.
Expand Down
3 changes: 3 additions & 0 deletions src/binding/c_exec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@
#define AS_OU(x) as_ref<spla::OpUnary>(x)
#define AS_OS(x) as_ref<spla::OpSelect>(x)

// Thin C-API binding for general sparse-matrix sparse-matrix product R = AB.
// Unwraps the opaque C handles into typed spla references and forwards to exec_mxm.
// NOTE(review): `desc` and `task` are not passed explicitly, so they are presumably
// referenced by name inside SPLA_WRAP_EXEC (schedule vs immediate execution) --
// confirm against the macro definition.
spla_Status spla_Exec_mxm(spla_Matrix R, spla_Matrix A, spla_Matrix B, spla_OpBinary op_multiply, spla_OpBinary op_add, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task) {
SPLA_WRAP_EXEC(exec_mxm, AS_M(R), AS_M(A), AS_M(B), AS_OB(op_multiply), AS_OB(op_add), AS_S(init));
}
spla_Status spla_Exec_mxmT_masked(spla_Matrix R, spla_Matrix mask, spla_Matrix A, spla_Matrix B, spla_OpBinary op_multiply, spla_OpBinary op_add, spla_OpSelect op_select, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task) {
SPLA_WRAP_EXEC(exec_mxmT_masked, AS_M(R), AS_M(mask), AS_M(A), AS_M(B), AS_OB(op_multiply), AS_OB(op_add), AS_OS(op_select), AS_S(init));
}
Expand Down
6 changes: 6 additions & 0 deletions src/cpu/cpu_algo_registry.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <cpu/cpu_m_reduce.hpp>
#include <cpu/cpu_m_reduce_by_column.hpp>
#include <cpu/cpu_m_reduce_by_row.hpp>
#include <cpu/cpu_mxm.hpp>
#include <cpu/cpu_mxmT_masked.hpp>
#include <cpu/cpu_mxv.hpp>
#include <cpu/cpu_v_assign.hpp>
Expand Down Expand Up @@ -109,6 +110,11 @@ namespace spla {
g_registry->add(MAKE_KEY_CPU_0("mxmT_masked", INT), std::make_shared<Algo_mxmT_masked_cpu<T_INT>>());
g_registry->add(MAKE_KEY_CPU_0("mxmT_masked", UINT), std::make_shared<Algo_mxmT_masked_cpu<T_UINT>>());
g_registry->add(MAKE_KEY_CPU_0("mxmT_masked", FLOAT), std::make_shared<Algo_mxmT_masked_cpu<T_FLOAT>>());

// algorithm mxm
g_registry->add(MAKE_KEY_CPU_0("mxm", INT), std::make_shared<Algo_mxm_cpu<T_INT>>());
g_registry->add(MAKE_KEY_CPU_0("mxm", UINT), std::make_shared<Algo_mxm_cpu<T_UINT>>());
g_registry->add(MAKE_KEY_CPU_0("mxm", FLOAT), std::make_shared<Algo_mxm_cpu<T_FLOAT>>());
}

}// namespace spla
21 changes: 21 additions & 0 deletions src/cpu/cpu_format_coo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,27 @@ namespace spla {
in.values = 0;
}

/**
 * Convert a matrix from COO (triplet) storage into LIL (list-of-rows) storage.
 *
 * Scatters every (i, j, x) triplet of `in` into the row list `out.Ar[i]`.
 * The output row array must already be sized to `n_rows`; row lists keep the
 * triplets in the order they appear in the COO arrays.
 */
template<typename T>
void cpu_coo_to_lil(uint n_rows,
                    const CpuCoo<T>& in,
                    CpuLil<T>& out) {
    auto& rows = out.Ar;

    assert(rows.size() == n_rows);

    for (uint t = 0; t < in.values; ++t) {
        // Append (column, value) to the list of row in.Ai[t]
        rows[in.Ai[t]].emplace_back(in.Aj[t], in.Ax[t]);
    }
}

template<typename T>
void cpu_coo_to_dok(const CpuCoo<T>& in,
CpuDok<T>& out) {
Expand Down
Loading

0 comments on commit 6369d1c

Please sign in to comment.