Loongarch64 small matrix #4710

Closed — wants to merge 25 commits

Commits:
- 91ce6dc: Expanding the scop of 2D thread distribution (yamazakimitsufumi, Apr 18, 2024)
- 2bfb0cb: Expose whether locking is enabled in get_config (theAeon, May 7, 2024)
- f85afaa: loongarch64: Update dgemm_kernel_16x4 to dgemm_kernel_16x6 (XiWeiGu, Apr 28, 2024)
- 95fb386: remove stray comma (martin-frbg, May 9, 2024)
- 995cbcf: loongarch64: Add buffer offset for target LOONGSON3R5 (XiWeiGu, May 8, 2024)
- b0d990a: loongarch64: Fixed utest fork:safety (XiWeiGu, May 9, 2024)
- b77a239: loongarch64: Fixed icamax_lsx (XiWeiGu, May 9, 2024)
- cb1b6bb: loongarch64: Fixed GCC14 compilation issue (XiWeiGu, May 11, 2024)
- 8f6ccfa: forward NO_CFLAGS to the CFLAGS, if set (martin-frbg, May 11, 2024)
- 0393348: Support compilation without CBLAS (martin-frbg, May 11, 2024)
- a1d1240: use blasint instead of int to quiet warnings (mattip, May 12, 2024)
- d82494f: Fix regression SAXPY when compiler with OpenXL compiler. (amritahs-ibm, May 7, 2024)
- ab05a25: fix zdotu argument passing in utest_ext on windows (#4691) (martin-frbg, May 13, 2024)
- 8e1e458: Introduce a lower limit for multithreading (martin-frbg, May 14, 2024)
- 775ab94: Introduce a lower limit for multithreading (martin-frbg, May 14, 2024)
- 7ab11bb: Revert "resolve second_ conflict which breaks xlf timef" (frjohnst, May 15, 2024)
- 05e4940: Revert "fix conlict between PR 4515 and AIX shared obj support" (frjohnst, May 15, 2024)
- a6d17da: Fix CMAKE syntax in kernel file parsing of IFNEQ conditionals (#4695) (martin-frbg, May 15, 2024)
- cb38cce: Use CMAKE_C_COMPILER_VERSION instead of dumpversion calls (#4698) (martin-frbg, May 15, 2024)
- 0dac9ae: POWER: Fixing endianness issue in cswap/zswap kernel for AIX (ChipKerchner, May 16, 2024)
- c22c4e3: remove spurious brace (martin-frbg, May 16, 2024)
- e15a90a: BENCH: add benchmarks using codspeed.io (ev-br, May 6, 2024)
- af1e6fa: DOC: add a readme for benchmarks/pybench (ev-br, May 18, 2024)
- 1e49771: Fix INTERFACE64 builds on Loongarch64 (martin-frbg, May 18, 2024)
- f077419: LoongArch: Small matrix opt (XiWeiGu, Sep 16, 2023)
150 changes: 150 additions & 0 deletions .github/workflows/codspeed-bench.yml
@@ -0,0 +1,150 @@
name: Run codspeed benchmarks

on: [push, pull_request]

concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true

permissions:
contents: read # to fetch code (actions/checkout)

jobs:
benchmarks:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest]
fortran: [gfortran]
build: [make]
pyver: ["3.12"]
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- uses: actions/setup-python@v3
with:
python-version: ${{ matrix.pyver }}

- name: Print system information
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
cat /proc/cpuinfo
fi

- name: Install Dependencies
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get update
sudo apt-get install -y gfortran cmake ccache libtinfo5
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi

- name: Compilation cache
uses: actions/cache@v3
with:
path: ~/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
# GNU make and cmake call the compilers differently. It looks like
# that causes the cache to mismatch. Keep the ccache for both build
# tools separate to avoid polluting each other.
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
restore-keys: |
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}
ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}
ccache-${{ runner.os }}-${{ matrix.build }}

- name: Write out the .pc
run: |
cd benchmark/pybench
cat > openblas.pc << EOF
libdir=${{ github.workspace }}
includedir= ${{ github.workspace }}
openblas_config= OpenBLAS 0.3.27 DYNAMIC_ARCH NO_AFFINITY Haswell MAX_THREADS=64
version=0.0.99
extralib=-lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas
Name: openblas
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: ${version}
URL: https://github.com/xianyi/OpenBLAS
Libs: ${{ github.workspace }}/libopenblas.so -Wl,-rpath,${{ github.workspace }}
Libs.private: -lm -lpthread -lgfortran -lquadmath -L${{ github.workspace }} -lopenblas
Cflags: -I${{ github.workspace}}
EOF
cat openblas.pc

- name: Configure ccache
run: |
if [ "${{ matrix.build }}" = "make" ]; then
# Add ccache to path
if [ "$RUNNER_OS" = "Linux" ]; then
echo "/usr/lib/ccache" >> $GITHUB_PATH
elif [ "$RUNNER_OS" = "macOS" ]; then
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi
fi
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 300M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s

- name: Build OpenBLAS
run: |
case "${{ matrix.build }}" in
"make")
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC="ccache ${{ matrix.fortran }}"
;;
"cmake")
mkdir build && cd build
cmake -DDYNAMIC_ARCH=1 \
-DNOFORTRAN=0 \
-DBUILD_WITHOUT_LAPACK=0 \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
..
cmake --build .
;;
*)
echo "::error::Configuration not supported"
exit 1
;;
esac

- name: Show ccache status
continue-on-error: true
run: ccache -s

- name: Install benchmark dependencies
run: pip install meson ninja numpy pytest pytest-codspeed --user

- name: Build the wrapper
run: |
cd benchmark/pybench
export PKG_CONFIG_PATH=$PWD
meson setup build --prefix=$PWD/build-install
meson install -C build
#
# sanity check
cd build/openblas_wrap
python -c'import _flapack; print(dir(_flapack))'

- name: Run benchmarks
uses: CodSpeedHQ/action@v2
with:
token: ${{ secrets.CODSPEED_TOKEN }}
run: |
cd benchmark/pybench
export PYTHONPATH=$PWD/build-install/lib/python${{matrix.pyver}}/site-packages/
OPENBLAS_NUM_THREADS=1 pytest benchmarks/bench_blas.py --codspeed

1 change: 1 addition & 0 deletions .gitignore
@@ -109,3 +109,4 @@ benchmark/smallscaling
CMakeCache.txt
CMakeFiles/*
.vscode
**/__pycache__
1 change: 1 addition & 0 deletions Makefile.system
@@ -269,6 +269,7 @@ else ifeq ($(ARCH), power)
SMALL_MATRIX_OPT = 1
BUILD_BFLOAT16 = 1
endif
SMALL_MATRIX_OPT = 1
ifeq ($(SMALL_MATRIX_OPT), 1)
CCOMMON_OPT += -DSMALL_MATRIX_OPT
endif
49 changes: 49 additions & 0 deletions benchmark/pybench/README.md
@@ -0,0 +1,49 @@
# Continuous benchmarking of OpenBLAS performance

We run a set of benchmarks on a subset of OpenBLAS functionality.

## Benchmark runner

[![CodSpeed Badge](https://img.shields.io/endpoint?url=https://codspeed.io/badge.json)](https://codspeed.io/OpenMathLib/OpenBLAS/)

Click on [benchmarks](https://codspeed.io/OpenMathLib/OpenBLAS/benchmarks) to see the performance of a particular benchmark over time.
Click on [branches](https://codspeed.io/OpenMathLib/OpenBLAS/branches/) and then on the last PR link to see the flamegraphs.

## What are the benchmarks

We run raw BLAS/LAPACK subroutines via f2py-generated Python wrappers. The wrappers themselves are equivalent to [those from SciPy](https://docs.scipy.org/doc/scipy/reference/linalg.lapack.html).
In fact, the wrappers _are_ from SciPy; we take a small subset simply to avoid having to build the whole of SciPy for each CI run.


## Adding a new benchmark

`.github/workflows/codspeed-bench.yml` does all the orchestration on CI.

Benchmarks live in the `benchmark/pybench` directory. It is organized as follows:

- benchmarks themselves live in the `benchmarks` folder. Note that the LAPACK routines are imported from the `openblas_wrap` package.
- the `openblas_wrap` package is a simple trampoline: it contains an f2py extension, `_flapack`, which talks to OpenBLAS, and exports the python names in its `__init__.py`.
This way, the `openblas_wrap` package shields the benchmarks from the details of where a particular LAPACK function comes from. If desired, you may for instance swap the `_flapack` extension for `scipy.linalg.blas` and `scipy.linalg.lapack`.
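The trampoline pattern described above can be sketched as follows. This is a minimal, self-contained illustration, not the actual `openblas_wrap/__init__.py`: the NumPy fallbacks are hypothetical stand-ins that only mimic the call shape of the real f2py-backed routines.

```python
# Sketch of the openblas_wrap trampoline pattern: re-export names from the
# f2py extension when it is built, otherwise fall back to NumPy stand-ins.
# The fallbacks below are hypothetical and exist only for illustration.
import numpy as np

try:
    from openblas_wrap import dnrm2, ddot   # real f2py-backed names, if built
except ImportError:
    def dnrm2(x, n=None, incx=1):
        # Euclidean norm of the strided view, like BLAS dnrm2
        xs = np.asarray(x, dtype=float)[::incx]
        return float(np.sqrt(np.dot(xs, xs)))

    def ddot(x, y):
        # dot product, like BLAS ddot
        return float(np.dot(np.asarray(x, dtype=float),
                            np.asarray(y, dtype=float)))

x = np.array([3.0, 4.0])
print(dnrm2(x))     # 5.0
print(ddot(x, x))   # 25.0
```

Because callers only ever import from `openblas_wrap`, swapping the backing implementation requires no changes to the benchmarks themselves.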

To change parameters of an existing benchmark, edit python files in the `benchmark/pybench/benchmarks` directory.

To add a benchmark for a new BLAS or LAPACK function, you need to:

- add an f2py wrapper for the bare LAPACK function. You can simply copy a wrapper from SciPy (look for `*.pyf.src` files in https://github.com/scipy/scipy/tree/main/scipy/linalg)
- add an import to `benchmark/pybench/openblas_wrap/__init__.py`


## Running benchmarks locally

This benchmarking layer is orchestrated from Python, so you will need everything
required to build OpenBLAS from source, plus `python` and the following packages:

```
$ python -mpip install numpy meson ninja pytest pytest-benchmark
```

The benchmark syntax is consistent with that of the `pytest-benchmark` framework. The incantation to run the suite locally is `$ pytest benchmark/pybench/benchmarks/bench_blas.py`.

An ASV-compatible benchmark suite is planned but not currently implemented.

185 changes: 185 additions & 0 deletions benchmark/pybench/benchmarks/bench_blas.py
@@ -0,0 +1,185 @@
import pytest
import numpy as np
from openblas_wrap import (
# level 1
dnrm2, ddot, daxpy,
# level 3
dgemm, dsyrk,
# lapack
dgesv, # linalg.solve
dgesdd, dgesdd_lwork, # linalg.svd
dsyev, dsyev_lwork, # linalg.eigh
)

# ### BLAS level 1 ###

# dnrm2

dnrm2_sizes = [100, 1000]

def run_dnrm2(n, x, incx):
res = dnrm2(x, n, incx=incx)
return res


@pytest.mark.parametrize('n', dnrm2_sizes)
def test_nrm2(benchmark, n):
rndm = np.random.RandomState(1234)
x = np.array(rndm.uniform(size=(n,)), dtype=float)
result = benchmark(run_dnrm2, n, x, 1)


# ddot

ddot_sizes = [100, 1000]

def run_ddot(x, y):
res = ddot(x, y)
return res


@pytest.mark.parametrize('n', ddot_sizes)
def test_dot(benchmark, n):
rndm = np.random.RandomState(1234)
x = np.array(rndm.uniform(size=(n,)), dtype=float)
y = np.array(rndm.uniform(size=(n,)), dtype=float)
result = benchmark(run_ddot, x, y)


# daxpy

daxpy_sizes = [100, 1000]

def run_daxpy(x, y):
res = daxpy(x, y, a=2.0)
return res


@pytest.mark.parametrize('n', daxpy_sizes)
def test_daxpy(benchmark, n):
rndm = np.random.RandomState(1234)
x = np.array(rndm.uniform(size=(n,)), dtype=float)
y = np.array(rndm.uniform(size=(n,)), dtype=float)
result = benchmark(run_daxpy, x, y)




# ### BLAS level 3 ###

# dgemm

gemm_sizes = [100, 1000]

def run_gemm(a, b, c):
alpha = 1.0
res = dgemm(alpha, a, b, c=c, overwrite_c=True)
return res


@pytest.mark.parametrize('n', gemm_sizes)
def test_gemm(benchmark, n):
rndm = np.random.RandomState(1234)
a = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F')
b = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F')
c = np.empty((n, n), dtype=float, order='F')
result = benchmark(run_gemm, a, b, c)
assert result is c


# dsyrk

syrk_sizes = [100, 1000]


def run_syrk(a, c):
res = dsyrk(1.0, a, c=c, overwrite_c=True)
return res


@pytest.mark.parametrize('n', syrk_sizes)
def test_syrk(benchmark, n):
rndm = np.random.RandomState(1234)
a = np.array(rndm.uniform(size=(n, n)), dtype=float, order='F')
c = np.empty((n, n), dtype=float, order='F')
result = benchmark(run_syrk, a, c)
assert result is c


# ### LAPACK ###

# linalg.solve

gesv_sizes = [100, 1000]


def run_gesv(a, b):
res = dgesv(a, b, overwrite_a=True, overwrite_b=True)
return res


@pytest.mark.parametrize('n', gesv_sizes)
def test_gesv(benchmark, n):
rndm = np.random.RandomState(1234)
a = (np.array(rndm.uniform(size=(n, n)), dtype=float, order='F') +
np.eye(n, order='F'))
b = np.array(rndm.uniform(size=(n, 1)), order='F')
lu, piv, x, info = benchmark(run_gesv, a, b)
assert lu is a
assert x is b
assert info == 0


# linalg.svd

gesdd_sizes = [(100, 5), (1000, 222)]


def run_gesdd(a, lwork):
res = dgesdd(a, lwork=lwork, full_matrices=False, overwrite_a=False)
return res


@pytest.mark.parametrize('mn', gesdd_sizes)
def test_gesdd(benchmark, mn):
m, n = mn
rndm = np.random.RandomState(1234)
a = np.array(rndm.uniform(size=(m, n)), dtype=float, order='F')

lwork, info = dgesdd_lwork(m, n)
lwork = int(lwork)
assert info == 0

u, s, vt, info = benchmark(run_gesdd, a, lwork)

assert info == 0
np.testing.assert_allclose(u @ np.diag(s) @ vt, a, atol=1e-13)


# linalg.eigh

syev_sizes = [50, 200]


def run_syev(a, lwork):
res = dsyev(a, lwork=lwork, overwrite_a=True)
return res


@pytest.mark.parametrize('n', syev_sizes)
def test_syev(benchmark, n):
rndm = np.random.RandomState(1234)
a = rndm.uniform(size=(n, n))
a = np.asarray(a + a.T, dtype=float, order='F')
a_ = a.copy()

lwork, info = dsyev_lwork(n)
lwork = int(lwork)
assert info == 0

w, v, info = benchmark(run_syev, a, lwork)

assert info == 0
assert a is v # overwrite_a=True

