gh-224: add matrix transpose
EgorOrachyov committed Sep 2, 2023
1 parent 2e2d856 commit 649f2bc
Showing 17 changed files with 454 additions and 35 deletions.
14 changes: 0 additions & 14 deletions CMakeLists.txt
@@ -320,20 +320,6 @@ add_library(spla SHARED
src/cpu/cpu_format_dok_vec.hpp
src/cpu/cpu_format_lil.hpp
src/cpu/cpu_formats.hpp
src/cpu/cpu_m_reduce.hpp
src/cpu/cpu_m_reduce_by_column.hpp
src/cpu/cpu_m_reduce_by_row.hpp
src/cpu/cpu_mxm.hpp
src/cpu/cpu_mxmT_masked.hpp
src/cpu/cpu_mxv.hpp
src/cpu/cpu_vxm.hpp
src/cpu/cpu_v_assign.hpp
src/cpu/cpu_v_count_mf.hpp
src/cpu/cpu_v_eadd.hpp
src/cpu/cpu_v_eadd_fdb.hpp
src/cpu/cpu_v_emult.hpp
src/cpu/cpu_v_map.hpp
src/cpu/cpu_v_reduce.hpp
src/util/pair_hash.hpp
src/profiling/time_profiler.cpp
src/profiling/time_profiler.hpp
1 change: 1 addition & 0 deletions include/spla.h
@@ -373,6 +373,7 @@ SPLA_API spla_Status spla_Exec_vxm_masked(spla_Vector r, spla_Vector mask, spla_
SPLA_API spla_Status spla_Exec_m_reduce_by_row(spla_Vector r, spla_Matrix M, spla_OpBinary op_reduce, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task);
SPLA_API spla_Status spla_Exec_m_reduce_by_column(spla_Vector r, spla_Matrix M, spla_OpBinary op_reduce, spla_Scalar init, spla_Descriptor desc, spla_ScheduleTask* task);
SPLA_API spla_Status spla_Exec_m_reduce(spla_Scalar r, spla_Scalar s, spla_Matrix M, spla_OpBinary op_reduce, spla_Descriptor desc, spla_ScheduleTask* task);
SPLA_API spla_Status spla_Exec_m_transpose(spla_Matrix R, spla_Matrix M, spla_OpUnary op_apply, spla_Descriptor desc, spla_ScheduleTask* task);
SPLA_API spla_Status spla_Exec_v_eadd(spla_Vector r, spla_Vector u, spla_Vector v, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task);
SPLA_API spla_Status spla_Exec_v_emult(spla_Vector r, spla_Vector u, spla_Vector v, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task);
SPLA_API spla_Status spla_Exec_v_eadd_fdb(spla_Vector r, spla_Vector v, spla_Vector fdb, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task);
20 changes: 20 additions & 0 deletions include/spla/exec.hpp
@@ -239,6 +239,26 @@ namespace spla {
ref_ptr<Descriptor> desc = ref_ptr<Descriptor>(),
ref_ptr<ScheduleTask>* task_hnd = nullptr);

/**
* @brief Execute (schedule) matrix transpose operation
*
* @note Pass a valid `task_hnd` to store as a task, rather than execute immediately.
*
* @param R Matrix to store result
* @param M Matrix to transpose
* @param op_apply Unary op to transform values
* @param desc Scheduled task descriptor; default is null
* @param task_hnd Optional task handle; pass a non-null pointer to store the task
*
* @return Status of task execution, or status of handle creation
*/
SPLA_API Status exec_m_transpose(
ref_ptr<Matrix> R,
ref_ptr<Matrix> M,
ref_ptr<OpUnary> op_apply,
ref_ptr<Descriptor> desc = ref_ptr<Descriptor>(),
ref_ptr<ScheduleTask>* task_hnd = nullptr);

/**
* @brief Execute (schedule) element-wise addition by structure of two vectors
*
2 changes: 1 addition & 1 deletion include/spla/type.hpp
@@ -54,10 +54,10 @@ namespace spla {
SPLA_API virtual int get_id() = 0;
};

-using T_BOOL = bool;
 using T_INT = std::int32_t;
 using T_UINT = std::uint32_t;
 using T_FLOAT = float;
+using T_BOOL = bool;

SPLA_API extern ref_ptr<Type> BOOL;
SPLA_API extern ref_ptr<Type> INT;
11 changes: 8 additions & 3 deletions python/example.py
@@ -1,5 +1,10 @@
from pyspla import *

u = Vector.from_lists([0, 1], [10, 20], 4, INT)
v = Vector.from_lists([1, 3], [-5, 12], 4, INT)
print(u.emult(INT.PLUS, v))
M = Matrix.from_lists([0, 1, 2], [3, 2, 0], [-5, 3, 9], (3, 4), INT)
print(M)

print(M.transpose())

print(M.transpose(op_apply=INT.UONE))

print(M.transpose(op_apply=INT.AINV))
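
For reference, a minimal sketch of what these three calls yield; `n_rows`/`n_cols` follow the attributes
used by `Matrix.transpose` in matrix.py below, and the printed grids appear verbatim in its docstring:

    from pyspla import *

    M = Matrix.from_lists([0, 1, 2], [3, 2, 0], [-5, 3, 9], (3, 4), INT)

    T = M.transpose()                    # shape flips from (3, 4) to (4, 3)
    assert T.n_rows == 4 and T.n_cols == 3

    T1 = M.transpose(op_apply=INT.UONE)  # every stored value becomes 1
    Tn = M.transpose(op_apply=INT.AINV)  # every stored value changes sign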
28 changes: 17 additions & 11 deletions python/pyspla/__init__.py
@@ -129,6 +129,14 @@
Performance
-----------
Spla shows great performance compared to the Nvidia CUDA-based optimized GraphBLAST library, processing large graphs,
in extreme cases counting 1 BILLION edges, with speed and without memory issues. Spla also shows outstanding performance
in PageRank algorithms, outperforming the low-level, highly-optimized Nvidia CUDA Gunrock library. Spla scales
across Intel, Nvidia and AMD GPUs with acceptable performance, and can be run even on integrated GPUs. There you can
still get a good speedup, much faster than `scipy` or `networkx`.
More details of the performance study are given below.
**Comparison on an Nvidia GPU**
![stats](../../docs/stats/rq1_rel_compact.png)
@@ -174,23 +182,21 @@
Containers
----------
Library provides fundamental generalized linear algebra containers for data storage and mathematical computations.
These containers are generalized, so any of the built-in types may be used to parametrize the type of data. Containers
have sparse formats by default, so it is possible to create large-dimension but low-data containers. Containers
are storage-invariant, so the best format for the storage is automatically managed by the container internally.
All required format conversions are done in the context of particular primitive usage.
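
For example, a minimal sketch of a container whose dimension far exceeds its stored data
(`Vector.from_lists` as in example.py above; the dimension here is an arbitrary illustration):

    from pyspla import *

    # A 1,000,000-dimensional vector with only two stored entries; the sparse
    # default format keeps memory proportional to the entries, not the dimension.
    v = Vector.from_lists([0, 999999], [10, 20], 1000000, INT)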
Types
-----
Library provides a set of standard and common built-in data types. Library value types
differ a bit from a classic type definition. In the `spla` library a type is essentially a
storage characteristic, which defines the count and layout of bytes per element. The user
can interpret the stored data as he/she wants. The spla type set is limited due to the nature
of GPU acceleration, where an arbitrary layout of data causes significant performance penalties.
Types such as `int`, `float`, `uint` and `bool` are supported; more types can be added on demand.
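
For instance, a sketch contrasting two of these types (`INT` is used throughout this commit; `FLOAT` is
assumed to be exported alongside it, matching `T_FLOAT` in type.hpp above):

    from pyspla import *

    # Same sparsity structure, different storage characteristic per element.
    vi = Vector.from_lists([0, 1], [10, 20], 4, INT)        # 4-byte signed integers
    vf = Vector.from_lists([0, 1], [10.5, 20.5], 4, FLOAT)  # 4-byte floats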
Ops
---
@@ -203,8 +209,8 @@
Math operations
---------------
Library provides a set of high-level linear algebra operations over matrices and vectors with
parametrization by binary, unary and select `ops`. There are implementations available for
general `mxm` matrix-matrix, masked `mxmT` matrix-matrix, masked `mxv` matrix-vector,
masked `vxm` vector-matrix products, matrix and vector reductions, assignment, and so on.
Most operations have both CPU and GPU implementations. Thus, you will have GPU performance
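
A short sketch of one such operation follows; `mxm` matches the signature added to matrix.py in this commit,
while `INT.MULT` is an assumed op name alongside the `INT.PLUS` used in example.py:

    from pyspla import *

    A  = Matrix.from_lists([0, 1], [1, 0], [2, 3], (2, 2), INT)
    Id = Matrix.diag((2, 2), INT, 1)  # identity, via the diag() added in this commit

    # General matrix-matrix product parametrized by a (mult, add) op pair;
    # multiplying by the identity leaves the values of A unchanged.
    print(A.mxm(Id, INT.MULT, INT.PLUS))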
3 changes: 3 additions & 0 deletions python/pyspla/bridge.py
@@ -554,6 +554,7 @@ def load_library(lib_path):
_spla.spla_Exec_m_reduce_by_row.restype = _status_t
_spla.spla_Exec_m_reduce_by_column.restype = _status_t
_spla.spla_Exec_m_reduce.restype = _status_t
_spla.spla_Exec_m_transpose.restype = _status_t
_spla.spla_Exec_v_eadd.restype = _status_t
_spla.spla_Exec_v_emult.restype = _status_t
_spla.spla_Exec_v_eadd_fdb.restype = _status_t
@@ -576,6 +577,8 @@ def load_library(lib_path):
[_object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
_spla.spla_Exec_m_reduce.argtypes = \
[_object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
_spla.spla_Exec_m_transpose.argtypes = \
[_object_t, _object_t, _object_t, _object_t, _p_object_t]
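# With the restype and argtypes above registered, the new entry point is callable
# following the convention used by Matrix.transpose in matrix.py (a sketch):
#
#   status = _spla.spla_Exec_m_transpose(R.hnd, M.hnd, op.hnd, desc, task)
#
# i.e. four object handles (result, source, unary op, descriptor) plus a
# pointer to an optional schedule task handle.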
_spla.spla_Exec_v_eadd.argtypes = \
[_object_t, _object_t, _object_t, _object_t, _object_t, _p_object_t]
_spla.spla_Exec_v_emult.argtypes = \
112 changes: 112 additions & 0 deletions python/pyspla/matrix.py
@@ -535,6 +535,42 @@ def dense(cls, shape, dtype=INT, fill_value=0):

return M

@classmethod
def diag(cls, shape, dtype=INT, diag_value=1):
"""
Diagonal matrix of a desired shape, with a desired fill value on the diagonal.
>>> M = Matrix.diag((5, 5), INT, -1)
>>> print(M)
'
0 1 2 3 4
0|-1 . . . .| 0
1| .-1 . . .| 1
2| . .-1 . .| 2
3| . . .-1 .| 3
4| . . . .-1| 4
0 1 2 3 4
'
:param shape: 2-tuple.
Size of the matrix.
:param dtype: optional: Type. default: INT.
Type of values matrix will have.
:param diag_value: optional: any. default: 1.
Optional value to fill the diagonal with.
:return: Matrix with main diagonal filled with value.
"""

M = Matrix(shape, dtype)

for i in range(min(shape[0], shape[1])):
M.set(i, i, diag_value)

return M
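
# A usage sketch pairing diag() with the transpose() added below: a diagonal
# matrix is symmetric, so both sides print identically (str() routes through
# to_string(), see __str__ below).
#
#   D = Matrix.diag((3, 3), INT, 7)
#   assert str(D) == str(D.transpose())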

def mxm(self, M, op_mult, op_add, out=None, init=None, desc=None):
"""
General sparse-matrix by sparse-matrix product.
@@ -935,6 +971,82 @@ def reduce(self, op_reduce, out=None, init=None, desc=None):

return out

def transpose(self, out=None, op_apply=None, desc=None):
"""
Transpose matrix.
Generate a 3x4 matrix with int source data.
>>> M = Matrix.from_lists([0, 1, 2], [3, 2, 0], [-5, 3, 9], (3, 4), INT)
>>> print(M)
'
0 1 2 3
0| . . .-5| 0
1| . . 3 .| 1
2| 9 . . .| 2
0 1 2 3
'
Transpose matrix `M` as usual and print the result.
>>> print(M.transpose())
'
0 1 2
0| . . 9| 0
1| . . .| 1
2| . 3 .| 2
3|-5 . .| 3
0 1 2
'
Transpose while mapping each value to `1`, discarding the previous values.
>>> print(M.transpose(op_apply=INT.UONE))
'
0 1 2
0| . . 1| 0
1| . . .| 1
2| . 1 .| 2
3| 1 . .| 3
0 1 2
'
Transpose and apply the additive inverse to each value, effectively changing the sign of the values.
>>> print(M.transpose(op_apply=INT.AINV))
'
0 1 2
0| . .-9| 0
1| . . .| 1
2| .-3 .| 2
3| 5 . .| 3
0 1 2
'
:param out: optional: Matrix. default: None.
Optional matrix to store result.
:param op_apply: optional: OpUnary. default: None.
Optional unary function to apply on transposition.
:param desc: optional: Descriptor. default: None.
Optional descriptor object to configure the execution.
:return: Transposed matrix.
"""

if out is None:
out = Matrix(shape=(self.n_cols, self.n_rows), dtype=self.dtype)
if op_apply is None:
op_apply = self.dtype.IDENTITY

assert out
assert op_apply
assert out.n_rows == self.n_cols
assert out.n_cols == self.n_rows
assert out.dtype == self.dtype

check(backend().spla_Exec_m_transpose(out.hnd, self.hnd, op_apply.hnd,
self._get_desc(desc), self._get_task(None)))

return out
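
# A sketch of reusing a preallocated `out` container (the shape must be swapped
# and the dtype must match, per the asserts above):
#
#   M = Matrix.from_lists([0, 1, 2], [3, 2, 0], [-5, 3, 9], (3, 4), INT)
#   R = Matrix(shape=(4, 3), dtype=INT)
#   M.transpose(out=R)   # fills R and also returns it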

def __str__(self):
return self.to_string()

3 changes: 3 additions & 0 deletions src/binding/c_exec.cpp
@@ -64,6 +64,9 @@ spla_Status spla_Exec_m_reduce_by_column(spla_Vector r, spla_Matrix M, spla_OpBi
spla_Status spla_Exec_m_reduce(spla_Scalar r, spla_Scalar s, spla_Matrix M, spla_OpBinary op_reduce, spla_Descriptor desc, spla_ScheduleTask* task) {
SPLA_WRAP_EXEC(exec_m_reduce, AS_S(r), AS_S(s), AS_M(M), AS_OB(op_reduce));
}
spla_Status spla_Exec_m_transpose(spla_Matrix R, spla_Matrix M, spla_OpUnary op_apply, spla_Descriptor desc, spla_ScheduleTask* task) {
SPLA_WRAP_EXEC(exec_m_transpose, AS_M(R), AS_M(M), AS_OU(op_apply));
}
spla_Status spla_Exec_v_eadd(spla_Vector r, spla_Vector u, spla_Vector v, spla_OpBinary op, spla_Descriptor desc, spla_ScheduleTask* task) {
SPLA_WRAP_EXEC(exec_v_eadd, AS_V(r), AS_V(u), AS_V(v), AS_OB(op));
}
6 changes: 6 additions & 0 deletions src/cpu/cpu_algo_registry.cpp
@@ -34,6 +34,7 @@
#include <cpu/cpu_m_reduce.hpp>
#include <cpu/cpu_m_reduce_by_column.hpp>
#include <cpu/cpu_m_reduce_by_row.hpp>
#include <cpu/cpu_m_transpose.hpp>
#include <cpu/cpu_mxm.hpp>
#include <cpu/cpu_mxmT_masked.hpp>
#include <cpu/cpu_mxv.hpp>
@@ -102,6 +103,11 @@ namespace spla {
g_registry->add(MAKE_KEY_CPU_0("m_reduce", UINT), std::make_shared<Algo_m_reduce_cpu<T_UINT>>());
g_registry->add(MAKE_KEY_CPU_0("m_reduce", FLOAT), std::make_shared<Algo_m_reduce_cpu<T_FLOAT>>());

// algorithm m_transpose
g_registry->add(MAKE_KEY_CPU_0("m_transpose", INT), std::make_shared<Algo_m_transpose_cpu<T_INT>>());
g_registry->add(MAKE_KEY_CPU_0("m_transpose", UINT), std::make_shared<Algo_m_transpose_cpu<T_UINT>>());
g_registry->add(MAKE_KEY_CPU_0("m_transpose", FLOAT), std::make_shared<Algo_m_transpose_cpu<T_FLOAT>>());

// algorithm mxv_masked
g_registry->add(MAKE_KEY_CPU_0("mxv_masked", INT), std::make_shared<Algo_mxv_masked_cpu<T_INT>>());
g_registry->add(MAKE_KEY_CPU_0("mxv_masked", UINT), std::make_shared<Algo_mxv_masked_cpu<T_UINT>>());
10 changes: 5 additions & 5 deletions src/cpu/cpu_m_reduce.hpp
@@ -59,17 +59,17 @@ namespace spla {
auto t = ctx.task.template cast_safe<ScheduleTask_m_reduce>();
auto M = t->M.template cast_safe<TMatrix<T>>();

-if (M->is_valid(FormatMatrix::CpuDok)) {
-    return execute_dok(ctx);
+if (M->is_valid(FormatMatrix::CpuCsr)) {
+    return execute_csr(ctx);
 }
 if (M->is_valid(FormatMatrix::CpuLil)) {
     return execute_lil(ctx);
 }
-if (M->is_valid(FormatMatrix::CpuCsr)) {
-    return execute_csr(ctx);
+if (M->is_valid(FormatMatrix::CpuDok)) {
+    return execute_dok(ctx);
 }

-return execute_dok(ctx);
+return execute_csr(ctx);
}

private: