Merge pull request LeelaChessZero#1 from Ergodice/dag-update-pr1791
Ergodice authored May 27, 2023
2 parents 166a28b + 1f90473 commit 4a93916
Showing 92 changed files with 7,398 additions and 3,502 deletions.
60 changes: 38 additions & 22 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
@@ -13,38 +13,54 @@ jobs:
       - run:
           name: Update Meson
           command: pip3 install --upgrade meson==0.58.1
-      - run:
-          name: Create Meson build dirs
-          command: mkdir build-gcc && mkdir build-clang
-      - run:
-          name: Meson Clang
-          environment:
-            CC: clang
-            CXX: clang++
-          command: meson build-clang
       - run:
           name: Meson GCC
           environment:
             CC: gcc-8
             CXX: g++-8
-          command: meson build-gcc
-      - run:
-          name: Build Clang
-          command: |
-            cd build-clang
-            ninja
+          command: meson build-gcc -Dgtest=false
       - run:
           name: Build GCC
           command: |
             cd build-gcc
             ninja -j 4
+  "mac":
+    macos:
+      xcode: 13.4.1
+    steps:
+      - checkout
       - run:
-          command: cp build-clang/lc0 /tmp/lc0-clang
+          name: "Pull Submodules"
+          command: |
+            git submodule init
+            git submodule update --remote
       - run:
-          command: cp build-gcc/lc0 /tmp/lc0-g++
-      - store_artifacts:
-          path: /tmp/lc0-clang
-          destination: lc0-ubuntu-18-04-clang
+          name: Install build tools
+          command: |
+            pip3 install meson==0.63
+            pip3 install ninja
+            brew install ispc
+      - run:
+          name: Build lc0
+          command: |
+            meson build --buildtype=release -Dgtest=false -Dopencl=false
+            cd build
+            ninja
+      - run:
+          name: Build lc0 arm
+          command: |
+            meson build-arm --buildtype=release -Dgtest=false -Dopencl=false --cross-file cross-files/aarch64-darwin
+            cd build-arm
+            ninja
+      - run:
+          name: Make universal binary
+          command: lipo -create -o /tmp/lc0 build/lc0 build-arm/lc0
       - store_artifacts:
-          path: /tmp/lc0-g++
-          destination: lc0-ubuntu-18-04-g++
+          path: /tmp/lc0
+          destination: lc0-macos_12.3.1
 workflows:
   version: 2
   builds:
     jobs:
       - build
+      - "mac"
3 changes: 2 additions & 1 deletion .gitignore
@@ -16,4 +16,5 @@ testdata/
 xcuserdata
 .clang-tidy
 compile_flags.txt
-.vscode
+.vscode
+.mesonpy*
1 change: 0 additions & 1 deletion FLAGS.md
@@ -41,7 +41,6 @@ List of command line flags:
| --slowmover=NUM | Scale thinking time | Parameter value `X` means that the whole remaining time is split in such a way that the current move gets `X × Y` seconds, and next moves will get `1 × Y` seconds. However, due to smart pruning, the engine usually doesn't use all allocated time.<br>Default: `2.2`|
| <nobr>--move-overhead=NUM</nobr> | Move time overhead in milliseconds | How much overhead should the engine allocate for every move (to counteract things like slow connection, interprocess communication, etc.).<br>Default: `100` ms. |
| <nobr>--minibatch-size=NUM</nobr> | Minibatch size for NN inference | How many positions the engine tries to batch together for computation. Theoretically larger batches may reduce strength a bit, especially on a small number of playouts.<br>Default is `256`. Every backend/hardware has a different optimal value (e.g., `1` if batching is not supported). |
-| <nobr>--max-prefetch=NUM</nobr> | Maximum prefetch nodes per NN call | When the engine can't gather a large enough batch for immediate use, try to prefetch up to `X` positions, which are likely to be useful soon, and put them in the cache.<br>Default: `32`. |
| <nobr>--cpuct=NUM</nobr> | Cpuct MCTS option | C_puct constant from Upper Confidence Tree search algorithm. Higher values promote more exploration/wider search, lower values promote more confidence/deeper search.<br>Default: `1.2`. |
| <nobr>--temperature=NUM</nobr> | Initial temperature | Tau value from softmax formula. If equal to 0, the engine also picks the best move to make. Larger values increase randomness while making the move.<br>Default: `0` |
| <nobr>--tempdecay-moves=NUM</nobr> | Moves with temperature decay | Reduce temperature for every move linearly from initial temperature to `0`, during this number of moves since the game started. `0` disables temperature decay.<br>Default: `0` |
38 changes: 25 additions & 13 deletions README.md
@@ -9,27 +9,27 @@ Lc0 is a UCI-compliant chess engine designed to play chess via neural network, s

Lc0 can be acquired either via a git clone or an archive download from GitHub. Be aware that there is a required submodule which isn't included in source archives.

-For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.28`), which is equivalent to using the latest version tag.
+For essentially all purposes, including selfplay game generation and match play, we highly recommend using the latest `release/version` branch (for example `release/0.29`), which is equivalent to using the latest version tag.

Versioning follows the Semantic Versioning guidelines, with major, minor and patch sections. The training server enforces game quality using the versions output by the client and engine.
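As a small illustration of the major/minor/patch sections mentioned above (the tag value is hypothetical, not tied to any particular release):

```shell
# Split a semantic version tag into its three sections using
# plain POSIX parameter expansion.
VERSION=0.29.0
MAJOR=${VERSION%%.*}      # text before the first dot
REST=${VERSION#*.}        # text after the first dot
MINOR=${REST%%.*}
PATCH=${REST#*.}
echo "major=$MAJOR minor=$MINOR patch=$PATCH"
```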


Download using git:

```
git clone -b release/0.28 --recurse-submodules https://github.com/LeelaChessZero/lc0.git
```shell
git clone -b release/0.29 --recurse-submodules https://github.com/LeelaChessZero/lc0.git
```

 If you have already cloned an old version, fetch, view and check out a new branch:
-```
+```shell
 git fetch --all
 git branch --all
-git checkout -t remotes/origin/release/0.28
+git checkout -t remotes/origin/release/0.29
 ```


If you prefer to download an archive, you need to also download and place the submodule:
-* Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.28) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.28) archive is also available)
+* Download the [.zip](https://api.github.com/repos/LeelaChessZero/lc0/zipball/release/0.29) file ([.tar.gz](https://api.github.com/repos/LeelaChessZero/lc0/tarball/release/0.29) archive is also available)
* Extract
* Download https://github.com/LeelaChessZero/lczero-common/archive/master.zip (also available as [.tar.gz](https://github.com/LeelaChessZero/lczero-common/archive/master.tar.gz))
* Move the second archive into the first archive's `libs/lczero-common/` folder and extract
@@ -48,7 +48,7 @@ Backend support includes (in theory) any CBLAS-compatible library for CPU usage,

Finally, lc0 requires a compiler supporting C++17. Minimal versions seem to be g++ v8.0, clang v5.0 (with C++17 stdlib) or Visual Studio 2017.

-*Note* that cuda checks the compiler version and stops even with newer compilers, and to work around this we have added the `nvcc_ccbin` build option. This is more of an issue with new Linux versions, where we recommend to install `g++-7` and add `-Dnvcc_ccbin=g++-7` to the `build.sh` command.
+*Note* that cuda checks the compiler version and stops even with newer compilers; to work around this we added the `nvcc_ccbin` build option. This is more of an issue on new Linux versions, but you can get around it by using an earlier version of gcc just for cuda. As an example, adding `-Dnvcc_ccbin=g++-9` to the `build.sh` command line will use g++-9 with cuda instead of the system compiler.

Given those basics, the OS and backend specific instructions are below.

@@ -179,7 +179,7 @@ You'll need to be running the latest Raspberry Pi OS "buster".

1. Install OpenBLAS

-```
+```shell
git clone https://github.com/xianyi/OpenBLAS.git
cd OpenBLAS/
make
@@ -189,20 +189,20 @@ cd ..

2. Install Meson

-```
-pip3 install meson
-pip3 install ninja
+```shell
+pip install meson
+pip install ninja
 ```

3. Install compiler and standard libraries

-```
+```shell
sudo apt install clang-6.0 libstdc++-8-dev
```

4. Clone lc0 and compile

-```
+```shell
git clone https://github.com/LeelaChessZero/lc0.git
cd lc0
git submodule update --init --recursive
@@ -211,6 +211,18 @@ CC=clang-6.0 CXX=clang++-6.0 ./build.sh -Ddefault_library=static

5. The resulting binary will be in build/release

## Python bindings

Python bindings can be built and installed as follows.

```shell
pip install --user git+https://github.com/LeelaChessZero/lc0.git
```

This will build the package `lczero-bindings` and install it to your Python user install directory.
All the `lc0` functionality related to position evaluation is now available in the module `lczero.backends`.
An example interactive session can be found [here](https://github.com/LeelaChessZero/lc0/pull/1261#issuecomment-622951248).

## License

Leela Chess is free software: you can redistribute it and/or modify
11 changes: 8 additions & 3 deletions appveyor.yml
@@ -109,17 +109,22 @@ cache:
- C:\ndk\android-ndk-r19c\toolchains\llvm\prebuilt\windows-x86_64
before_build:
- cmd: git submodule update --init --recursive
-  - cmd: IF %BLAS%==true (echo.#define DEFAULT_MINIBATCH_SIZE 7 & echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h
-  - cmd: IF %ANDROID%==true (echo.#define DEFAULT_MINIBATCH_SIZE 7 & echo.#define DEFAULT_MAX_PREFETCH 0 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h
+  - cmd: IF %BLAS%==true (echo.#define DEFAULT_MINIBATCH_SIZE 7 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h
+  - cmd: IF %ANDROID%==true (echo.#define DEFAULT_MINIBATCH_SIZE 7 & echo.#define DEFAULT_TASK_WORKERS 0) > params_override.h
- cmd: SET BUILD_BLAS=%BLAS%
- cmd: IF %OPENCL%==true SET BUILD_BLAS=true
- cmd: IF %DX%==true SET BUILD_BLAS=true
- cmd: SET EMBED=false
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %ANDROID%==true SET EMBED=true
- cmd: SET POPCNT=true
- cmd: IF %NAME%==cpu-openblas SET POPCNT=false
- cmd: SET F16C=true
- cmd: IF %NAME%==cpu-openblas SET F16C=false
- cmd: IF %CUDA%==true SET F16C=false
- cmd: SET EXTRA=
- cmd: IF %ANDROID%==false SET EXTRA=-Db_vscrt=md
- cmd: IF %ONNX_DML%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\lib -Donnx_include=C:\cache\%ONNX_NAME%\include
-  - cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dpopcnt=false -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA%
+  - cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dpopcnt=%POPCNT% -Df16c=%F16C% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA%
- cmd: IF %ANDROID%==true meson arm64-v8a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-aarch64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-aarch64\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-aarch64
- cmd: IF %ANDROID%==true meson armeabi-v7a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-armv7a\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-armv7a\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-armv7a -Dispc=false -Dneon=false
build_script:
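The `POPCNT`/`F16C` toggles above follow one simple pattern: derive meson feature flags from the build's name and backend. A POSIX-shell sketch of that pattern (variable names mirror the batch script; the values are hypothetical, not the real CI matrix):

```shell
# Sketch only: derive feature flags from the build name, mirroring the
# cmd logic above. openblas CPU builds disable POPCNT and F16C; cuda
# builds disable F16C only.
NAME=cpu-openblas
CUDA=false
POPCNT=true
F16C=true
if [ "$NAME" = "cpu-openblas" ]; then POPCNT=false; F16C=false; fi
if [ "$CUDA" = "true" ]; then F16C=false; fi
echo "popcnt=$POPCNT f16c=$F16C"
```

The derived values would then be passed to meson as `-Dpopcnt=$POPCNT -Df16c=$F16C`, just as the batch script does with `%POPCNT%` and `%F16C%`.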
6 changes: 5 additions & 1 deletion build.cmd
@@ -30,7 +30,11 @@ set CXX=cl
set CC_LD=link
set CXX_LD=link

-if exist "C:\Program Files (x86)\Microsoft Visual Studio\2019" (
+if exist "C:\Program Files\Microsoft Visual Studio\2022" (
+  where /q cl
+  if errorlevel 1 call "C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64
+  set backend=vs2022
+) else if exist "C:\Program Files (x86)\Microsoft Visual Studio\2019" (
where /q cl
if errorlevel 1 call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64
set backend=vs2019
31 changes: 10 additions & 21 deletions build.sh
@@ -1,9 +1,10 @@
 #!/usr/bin/env bash
 
-pushd "$(dirname "$0")"
-
 set -e
 
+# Move to this script's directory.
+CDPATH= cd -- "$(dirname -- "$0")"
+
 case $1 in
 plain|debug|debugoptimized|release|minsize)
   BUILDTYPE=$1
@@ -16,27 +17,15 @@ esac
 
 BUILDDIR=build/${BUILDTYPE}
 
-if ! hash meson 2>/dev/null && [ -x ${HOME}/.local/bin/meson ]
-then
-  export PATH=${PATH}:${HOME}/.local/bin
-fi
-
-if [ -f ${BUILDDIR}/build.ninja ]
-then
-  meson configure ${BUILDDIR} -Dbuildtype=${BUILDTYPE} -Dprefix=${INSTALL_PREFIX:-/usr/local} "$@"
-else
-  meson ${BUILDDIR} --buildtype ${BUILDTYPE} --prefix ${INSTALL_PREFIX:-/usr/local} "$@"
-fi
-
-cd ${BUILDDIR}
-
-NINJA=$(awk '/ninja/ {ninja=$4} END {print ninja}' meson-logs/meson-log.txt)
+MESON=$(PATH="${PATH}:${HOME}/.local/bin" command -v meson || :)
+MESON=${MESON:?"Could not find meson. Is it installed and in PATH?"}
 
-if [ -n "${INSTALL_PREFIX}" ]
+if [ -f "${BUILDDIR}/build.ninja" ]
 then
-  ${NINJA} install
+  "${MESON}" configure "${BUILDDIR}" -Dbuildtype="${BUILDTYPE}" -Dprefix="${INSTALL_PREFIX:-/usr/local}" "$@"
 else
-  ${NINJA}
+  "${MESON}" "${BUILDDIR}" --buildtype "${BUILDTYPE}" --prefix "${INSTALL_PREFIX:-/usr/local}" "$@"
 fi
 
-popd
+"${MESON}" compile -C "${BUILDDIR}"
+[ -n "${INSTALL_PREFIX}" ] && "${MESON}" install -C "${BUILDDIR}"
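The new build.sh resolves meson once up front instead of exporting a modified `PATH`: it checks the regular `PATH` first, then `~/.local/bin` (where `pip install --user meson` lands). The lookup pattern can be exercised standalone; the stub binary and scratch directory below are illustrative only:

```shell
# Demonstrate the PATH-fallback lookup from the new build.sh using a
# stub meson in a scratch directory (standing in for ~/.local/bin).
tmpdir=$(mktemp -d)
printf '#!/bin/sh\necho stub-meson\n' > "$tmpdir/meson"
chmod +x "$tmpdir/meson"

# Prefer whatever is already on PATH, falling back to the extra
# directory; '|| :' keeps 'set -e' scripts alive when nothing is found.
MESON=$(PATH="${PATH}:${tmpdir}" command -v meson || :)
MESON=${MESON:?"Could not find meson. Is it installed and in PATH?"}
echo "using: ${MESON}"
```

The `${MESON:?...}` expansion aborts the script with the given message when the lookup comes back empty, which is what produces build.sh's "Could not find meson" error.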
50 changes: 49 additions & 1 deletion changelog.txt
@@ -1,4 +1,52 @@
-v0.29.0-rc0 (2022-04-03)
+v0.30.0-rc1 (2023-04-24)
~~~~~~~
* Support for networks with attention body and smolgen added to blas, cuda,
metal and onnx backends.
* Persistent L2 cache optimization for the cuda backend. Use the
`cache_opt=true` backend option to turn it on.
* Some performance improvements for the cuda, onnx and blas backends.
* Added the `threads` backend option to onnx; it defaults to 0 (let the
  onnxruntime decide), except for onnx-cpu where it defaults to 1.
* The onnx-dml package now includes a `directml.dll` installation script.
* Some users experienced memory issues with onnx-dml, so the defaults were
changed. This may affect performance, in which case you can use the `steps=8`
backend option to get the old behavior.
* The Python bindings are available as a package, see the README for
instructions.
* Some assorted fixes and code cleanups.

v0.29.0 (2022-12-13)
~~~~~~~
* Updated onednn version to the latest one.

v0.29.0-rc1 (2022-12-09)
~~~~~~~
* New metal backend for apple systems. This is now the default backend for
macos builds.
* New onnx-dml backend to use DirectML under windows; it has better net
  compatibility than dx12 and is faster than opencl. See the README for usage
  instructions; a separate download of the DirectML dll is required.
* Full attention policy support in cuda, cudnn, metal, onnx, blas, dnnl, and
eigen backends.
* Partial attention policy support in onednn backend (good enough for T79).
* The onnx backends can now use fp16 when running with a network file (not with
  .onnx model files). This is the default for onnx-cuda and onnx-dml; it can be
  switched on or off by setting the `fp16` backend option to `true` or `false`.
* The onednn package comes with a dnnl compiled to allow running on an intel gpu
by adding `gpu=0` to the backend options.
* The default net is now 791556 for most backends, except opencl and dx12,
  which get 753723 (as they lack attention policy support).
* Support for using a pgn book with long lines in training: selfplay can start
  at a random point in the book.
* New "simple" time manager.
* Support for double Fischer random chess (dfrc).
* Added TC-dependent output to the backendbench assistant.
* Starting with this version, the check backend compares policy for valid moves
after softmax.
* Some assorted fixes and code cleanups.
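The check-backend change above means policies are compared after softmax normalization rather than on raw network outputs. As a generic illustration of that normalization (made-up numbers; awk used only for the arithmetic, this is not lc0 code):

```shell
# Softmax over three made-up raw policy values: exponentiate each,
# then divide by the sum so the results form a probability distribution.
out=$(printf '1.0 2.0 3.0\n' | awk '{
  s = 0
  for (i = 1; i <= NF; i++) { e[i] = exp($i); s += e[i] }
  for (i = 1; i <= NF; i++) printf "%.4f ", e[i] / s
}')
echo "$out"   # three values summing to 1
```

Comparing after softmax makes the check tolerant of backends that differ by a constant offset in raw policy logits, since softmax is invariant under such shifts.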

v0.29.0-rc0 (2022-04-03)
~~~~~~~
* Initial support for attention policy, only cuda backend and partially in
blas/dnnl/eigen (good enough for T79).
27 changes: 27 additions & 0 deletions cross-files/aarch64-darwin
@@ -0,0 +1,27 @@

[host_machine]
system = 'darwin'
cpu_family = 'aarch64'
cpu = 'aarch64'
endian = 'little'

[properties]


[binaries]
c = 'clang'
cpp = 'clang++'
objc = 'clang'
objcpp = 'clang++'
ar = 'ar'
ld = 'ld'

[built-in options]
c_args = ['-arch', 'arm64']
cpp_args = ['-arch', 'arm64']
objc_args = ['-arch', 'arm64']
objcpp_args = ['-arch', 'arm64']
c_link_args = ['-arch', 'arm64']
cpp_link_args = ['-arch', 'arm64']
objc_link_args = ['-arch', 'arm64']
objcpp_link_args = ['-arch', 'arm64']
2 changes: 1 addition & 1 deletion cross-files/aarch64-linux-android
@@ -7,7 +7,7 @@

[host_machine]
system = 'android'
-cpu_family = 'arm'
+cpu_family = 'aarch64'
cpu = 'aarch64'
endian = 'little'

5 changes: 3 additions & 2 deletions dist/README-onnx-dml.txt
@@ -4,8 +4,9 @@ Lc0 is a UCI-compliant chess engine designed to play chess via
neural network, specifically those of the LeelaChessZero project
(https://lczero.org).

-To run this version you will most likely need a very recent DirectML dll.
-You can download the currently latest nuget installer package from
+To run this version you will most likely need a very recent DirectML dll,
+which you can get by running the included `install.cmd` script. Alternatively,
+you can download the currently latest nuget installer package from
<https://www.nuget.org/api/v2/package/Microsoft.AI.DirectML/1.10.0>.
If you don't know how to use nuget installer packages, you can change the
extension to .zip and open it as a normal zip file, the dll you need is