From d12eec7521aaa26f49ca0c11c94ea42879a8e71d Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Mon, 23 Oct 2023 19:54:42 +0800
Subject: [PATCH] Support torch 2.1.0 (#1249)

---
 .github/workflows/build-cpu-macos.yml         |   1 +
 .github/workflows/build-cpu-ubuntu.yml        |   1 +
 .github/workflows/build-cpu-windows.yml       |   1 +
 .github/workflows/build-cuda-ubuntu.yml       |   1 +
 .github/workflows/macos-cpu-m1-wheels.yml     |  94 -----------
 .github/workflows/macos-cpu-wheels.yml        |   6 +-
 .github/workflows/run-tests-cpu.yml           |  11 +-
 .github/workflows/run-tests.yml               | 154 ++++++++----------
 .github/workflows/style_check.yml             |   1 +
 .../test-k2-as-third-party-lib-cpu-macos.yml  |   1 +
 .../test-k2-as-third-party-lib-cpu-ubuntu.yml |   1 +
 ...test-k2-as-third-party-lib-cpu-windows.yml |   1 +
 ...test-k2-as-third-party-lib-cuda-ubuntu.yml |   1 +
 .github/workflows/ubuntu-cpu-wheels.yml       |   6 +-
 .github/workflows/ubuntu-cuda-wheels.yml      |   6 +-
 .github/workflows/windows-x64-cpu-wheels.yml  |   6 +-
 CMakeLists.txt                                |  19 ++-
 cmake/select_compute_arch.cmake               |   5 +
 cmake/torch.cmake                             |  16 +-
 docs/source/installation/cuda-cudnn.rst       |  77 +++++++++
 k2/torch/bin/CMakeLists.txt                   |  14 +-
 scripts/github_actions/build-ubuntu-cuda.sh   |  25 ++-
 .../github_actions/generate_build_matrix.py   |   9 +-
 scripts/github_actions/install_cuda.sh        |  16 +-
 scripts/github_actions/install_cudnn.sh       |   3 +
 scripts/github_actions/install_torch.sh       |  15 +-
 setup.py                                      |   6 +
 27 files changed, 275 insertions(+), 222 deletions(-)
 delete mode 100644 .github/workflows/macos-cpu-m1-wheels.yml

diff --git a/.github/workflows/build-cpu-macos.yml b/.github/workflows/build-cpu-macos.yml
index fe3e999ba..16edcb44c 100644
--- a/.github/workflows/build-cpu-macos.yml
+++ b/.github/workflows/build-cpu-macos.yml
@@ -36,6 +36,7 @@ on:
       - 'cmake/**'
       - 'k2/csrc/**'
       - 'k2/python/**'
+  workflow_dispatch:
 
 concurrency:
   group: build-cpu-macos-${{ github.ref }}
diff --git a/.github/workflows/build-cpu-ubuntu.yml b/.github/workflows/build-cpu-ubuntu.yml
index 5e14d130d..b6d82cef4 100644
--- a/.github/workflows/build-cpu-ubuntu.yml
+++ b/.github/workflows/build-cpu-ubuntu.yml
@@ -36,6 +36,7 @@ on:
       - 'cmake/**'
       - 'k2/csrc/**'
       - 'k2/python/**'
+  workflow_dispatch:
 
 concurrency:
   group: build-cpu-ubuntu-${{ github.ref }}
diff --git a/.github/workflows/build-cpu-windows.yml b/.github/workflows/build-cpu-windows.yml
index 779de1940..165c65e22 100644
--- a/.github/workflows/build-cpu-windows.yml
+++ b/.github/workflows/build-cpu-windows.yml
@@ -35,6 +35,7 @@ on:
       - 'cmake/**'
       - 'k2/csrc/**'
       - 'k2/python/**'
+  workflow_dispatch:
 
 concurrency:
   group: build-cpu-windows-${{ github.ref }}
diff --git a/.github/workflows/build-cuda-ubuntu.yml b/.github/workflows/build-cuda-ubuntu.yml
index a76685763..5ef0f1e13 100644
--- a/.github/workflows/build-cuda-ubuntu.yml
+++ b/.github/workflows/build-cuda-ubuntu.yml
@@ -36,6 +36,7 @@ on:
       - 'cmake/**'
       - 'k2/csrc/**'
       - 'k2/python/**'
+  workflow_dispatch:
 
 concurrency:
   group: build-cuda-ubuntu-${{ github.ref }}
diff --git a/.github/workflows/macos-cpu-m1-wheels.yml b/.github/workflows/macos-cpu-m1-wheels.yml
deleted file mode 100644
index 6ea0a5e2d..000000000
--- a/.github/workflows/macos-cpu-m1-wheels.yml
+++ /dev/null
@@ -1,94 +0,0 @@
-name: build-wheels-cpu-macos-m1
-
-on:
-  push:
-    tags:
-      - '*'
-  workflow_dispatch:
-
-concurrency:
-  group: build-wheels-cpu-macos-m1-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  generate_build_matrix:
-    if: github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa'
-    # see https://github.com/pytorch/pytorch/pull/50633
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          fetch-depth: 0
-      - name: Generating build matrix
-        id: set-matrix
-        run: |
-          # outputting for debugging purposes
-          python ./scripts/github_actions/generate_build_matrix.py --for-macos-m1
-          MATRIX=$(python ./scripts/github_actions/generate_build_matrix.py --for-macos-m1)
-          echo "::set-output name=matrix::${MATRIX}"
-
-  build_wheels_macos_cpu_m1:
-    needs: generate_build_matrix
-    name: ${{ matrix.torch }} ${{ matrix.python-version }}
-    runs-on: macos-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
-
-    steps:
-      - uses: actions/checkout@v2
-        with:
-          fetch-depth: 0
-
-      # see https://cibuildwheel.readthedocs.io/en/stable/changelog/
-      # for a list of versions
-      - name: ${{ matrix.torch }} ${{ matrix.python-version }}
-        uses: pypa/cibuildwheel@v2.11.4
-        env:
-          CIBW_BEFORE_BUILD: pip install torch==${{ matrix.torch}} cmake numpy
-          CIBW_BUILD: ${{ matrix.python-version }}-*
-          CIBW_REPAIR_WHEEL_COMMAND_MACOS: ""
-          CIBW_BUILD_VERBOSITY: 3
-          CIBW_ARCHS_MACOS: arm64
-
-      - name: Display wheels
-        shell: bash
-        run: |
-          ls -lh ./wheelhouse/
-
-      - name: Upload Wheel
-        uses: actions/upload-artifact@v2
-        with:
-          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-macos-latest-cpu-m1
-          path: wheelhouse/*.whl
-
-      # https://huggingface.co/docs/hub/spaces-github-actions
-      - name: Publish to huggingface
-        env:
-          HF_TOKEN: ${{ secrets.HF_TOKEN }}
-        uses: nick-fields/retry@v2
-        with:
-          max_attempts: 20
-          timeout_seconds: 200
-          shell: bash
-          command: |
-            git config --global user.email "csukuangfj@gmail.com"
-            git config --global user.name "Fangjun Kuang"
-
-            rm -rf huggingface
-            export GIT_LFS_SKIP_SMUDGE=1
-
-            git clone https://huggingface.co/csukuangfj/k2 huggingface
-            cd huggingface
-            git pull
-
-            mkdir -p macos
-            cp -v ../wheelhouse/*.whl ./macos
-            git status
-            git lfs track "*.whl"
-            git add .
-            git commit -m "upload macos m1 wheel for torch ${{ matrix.torch }} python ${{ matrix.python-version }}"
-            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/k2 main
diff --git a/.github/workflows/macos-cpu-wheels.yml b/.github/workflows/macos-cpu-wheels.yml
index f4fd94847..9f249ad97 100644
--- a/.github/workflows/macos-cpu-wheels.yml
+++ b/.github/workflows/macos-cpu-wheels.yml
@@ -18,7 +18,7 @@ jobs:
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
       - name: Generating build matrix
@@ -39,7 +39,7 @@ jobs:
         ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -59,7 +59,7 @@ jobs:
           ls -lh ./wheelhouse/
 
       - name: Upload Wheel
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         with:
           name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-macos-latest-cpu
           path: wheelhouse/*.whl
diff --git a/.github/workflows/run-tests-cpu.yml b/.github/workflows/run-tests-cpu.yml
index e55f1fbbd..3cf8fa30f 100644
--- a/.github/workflows/run-tests-cpu.yml
+++ b/.github/workflows/run-tests-cpu.yml
@@ -38,6 +38,7 @@ on:
       - 'cmake/**'
       - 'k2/csrc/**'
       - 'k2/python/**'
+  workflow_dispatch:
 
 concurrency:
   group: run-tests-cpu-${{ github.ref }}
@@ -47,16 +48,14 @@ jobs:
   run-tests-cpu:
     if: github.event.label.name == 'ready' || github.event.label.name == 'cpp-test' || github.event_name == 'push'
     runs-on: ${{ matrix.os }}
+    name: ${{ matrix.python-version }} ${{ matrix.build_type }}
     strategy:
       fail-fast: false
       matrix:
         os: [ubuntu-latest, macos-latest]
-        torch: ["1.13.1"]
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
+        torch: ["2.1.0"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
         build_type: ["Release", "Debug"]
-        exclude:
-          - os: macos-latest
-            python-version: "3.11"
 
     steps:
       # refer to https://github.com/actions/checkout
@@ -114,7 +113,7 @@ jobs:
           ./scripts/github_actions/fix_torch.sh
           mkdir build
           cd build
-          cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DK2_WITH_CUDA=OFF ..
+          cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DK2_WITH_CUDA=OFF -DCMAKE_CXX_STANDARD=17 ..
           cat k2/csrc/version.h
 
       - name: ${{ matrix.build_type }} Build
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index 47bcfd464..f1f4baf79 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -26,6 +26,7 @@ on:
       - '.github/workflows/run-tests.yml'
       - 'CMakeLists.txt'
       - 'cmake/**'
+      - 'scripts/github_actions/**'
       - 'k2/csrc/**'
       - 'k2/python/**'
   pull_request:
@@ -34,8 +35,10 @@ on:
       - '.github/workflows/run-tests.yml'
       - 'CMakeLists.txt'
       - 'cmake/**'
+      - 'scripts/github_actions/**'
       - 'k2/csrc/**'
       - 'k2/python/**'
+  workflow_dispatch:
 
 concurrency:
   group: run-tests-${{ github.ref }}
@@ -45,107 +48,80 @@ jobs:
   run-tests:
     if: github.event.label.name == 'ready' || github.event_name == 'push'
     runs-on: ${{ matrix.os }}
+    name: ${{ matrix.build_type }}
     strategy:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        cuda: ["11.7"]
-        torch: ["1.13.1"]
+        cuda: ["12.1"]
+        torch: ["2.1.0"]
         python-version: ["3.11"]
-        build_type: ["Release", "Debug"]
 
     steps:
       # refer to https://github.com/actions/checkout
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
-      - name: Install CUDA Toolkit ${{ matrix.cuda }}
-        env:
-          cuda: ${{ matrix.cuda }}
-        run: |
-          source ./scripts/github_actions/install_cuda.sh
-          echo "CUDA_HOME=${CUDA_HOME}" >> $GITHUB_ENV
-          echo "${CUDA_HOME}/bin" >> $GITHUB_PATH
-          echo "LD_LIBRARY_PATH=${CUDA_HOME}/lib:${CUDA_HOME}/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV
-        shell: bash
-
-      - name: Display NVCC version
-        run: |
-          which nvcc
-          nvcc --version
-
-      - name: Display GCC version
-        run: |
-          gcc --version
-
-      - name: Setup Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
+      - name: Run the build process with Docker
+        uses: addnab/docker-run-action@v3
         with:
-          python-version: ${{ matrix.python-version }}
-
-      - name: Display Python version
-        run: python -c "import sys; print(sys.version)"
-
-      - name: Install PyTorch ${{ matrix.torch }}
-        env:
-          cuda: ${{ matrix.cuda }}
-          torch: ${{ matrix.torch }}
+            image: "pytorch/manylinux-builder:cuda12.1"
+            options: -v ${{ github.workspace }}:/var/www -e PYTHON_VERSION=${{ matrix.python-version }} -e TORCH_VERSION=${{ matrix.torch }} -e CUDA_VERSION=${{ matrix.cuda }}
+            run: |
+              echo "pwd: $PWD"
+              uname -a
+              id
+              cat /etc/*release
+              gcc --version
+              python3 --version
+              which python3
+
+              pushd /usr/local
+              rm cuda
+              ln -s cuda-$CUDA_VERSION cuda
+              popd
+              which nvcc
+              nvcc --version
+
+              cp /var/www/scripts/github_actions/install_torch.sh .
+              chmod +x install_torch.sh
+
+              /var/www/scripts/github_actions/build-ubuntu-cuda.sh
+
+      - name: Display wheels
         shell: bash
         run: |
-          python3 -m pip install -qq --upgrade pip six
-          python3 -m pip install -qq bs4 requests tqdm typing_extensions
-          python3 -m pip install -qq dataclasses graphviz
-          sudo apt-get -qq install graphviz
-
-          ./scripts/github_actions/install_torch.sh
-          python3 -c "import torch; print('torch version:', torch.__version__)"
-
-      - name: Install git lfs
-        run: |
-          sudo apt-get install -y git-lfs
+          ls -lh ./wheelhouse/
 
-      - name: Download cudnn 8.0
-        env:
-          cuda: ${{ matrix.cuda }}
-        run: |
-          ./scripts/github_actions/install_cudnn.sh
+      - name: Upload Wheel
+        uses: actions/upload-artifact@v3
+        with:
+          name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cuda
+          path: wheelhouse/*.whl
 
-      - name: Configure CMake
-        shell: bash
+      # https://huggingface.co/docs/hub/spaces-github-actions
+      - name: Publish to huggingface
         env:
-          torch: ${{ matrix.torch }}
-        run: |
-          pwd
-          ./scripts/github_actions/fix_torch.sh
-          mkdir build
-          cd build
-          cmake -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ..
-          cat k2/csrc/version.h
-
-      - name: ${{ matrix.build_type }} Build
-        shell: bash
-        run: |
-          echo "number of cores: $(nproc)"
-          cd build
-          # we cannot use -j here because of limited RAM
-          # of the VM provided by GitHub actions
-          make VERBOSE=1 -j2
-
-      - name: Display Build Information
-        shell: bash
-        run: |
-          export PYTHONPATH=$PWD/k2/python:$PWD/build/lib:$PYTHONPATH
-          python3 -m k2.version
-
-      - name: Run Tests
-        shell: bash
-        run: |
-          cd build
-          ctest --output-on-failure
-          # default log level is INFO
-          ./bin/cu_log_test --gtest_filter="Log.Cpu"
-          K2_LOG_LEVEL=TRACE ./bin/cu_log_test --gtest_filter="Log.Cpu"
-          K2_LOG_LEVEL=DEBUG ./bin/cu_log_test --gtest_filter="Log.Cpu"
-          K2_LOG_LEVEL=INFO ./bin/cu_log_test --gtest_filter="Log.Cpu"
-          K2_LOG_LEVEL=WARNING ./bin/cu_log_test --gtest_filter="Log.Cpu"
-          K2_LOG_LEVEL=ERROR ./bin/cu_log_test --gtest_filter="Log.Cpu"
-          K2_LOG_LEVEL=FATAL ./bin/cu_log_test --gtest_filter="Log.Cpu"
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        uses: nick-fields/retry@v2
+        with:
+          max_attempts: 20
+          timeout_seconds: 200
+          shell: bash
+          command: |
+            git config --global user.email "csukuangfj@gmail.com"
+            git config --global user.name "Fangjun Kuang"
+
+            rm -rf huggingface
+            export GIT_LFS_SKIP_SMUDGE=1
+
+            git clone https://huggingface.co/csukuangfj/k2 huggingface
+            cd huggingface
+            git pull
+
+            mkdir -p ubuntu-cuda
+            cp -v ../wheelhouse/*.whl ./ubuntu-cuda
+            git status
+            git lfs track "*.whl"
+            git add .
+            git commit -m "upload ubuntu-cuda wheel for torch ${{ matrix.torch }} python ${{ matrix.python-version }}"
+            git push https://csukuangfj:$HF_TOKEN@huggingface.co/csukuangfj/k2 main
diff --git a/.github/workflows/style_check.yml b/.github/workflows/style_check.yml
index c6046594d..1dd6a8de6 100644
--- a/.github/workflows/style_check.yml
+++ b/.github/workflows/style_check.yml
@@ -31,6 +31,7 @@ on:
       - '.github/workflows/style_check.yml'
       - 'k2/csrc/**'
       - 'k2/python/**'
+  workflow_dispatch:
 
 concurrency:
   group: style_check-${{ github.ref }}
diff --git a/.github/workflows/test-k2-as-third-party-lib-cpu-macos.yml b/.github/workflows/test-k2-as-third-party-lib-cpu-macos.yml
index 2019ef695..7c9f17903 100644
--- a/.github/workflows/test-k2-as-third-party-lib-cpu-macos.yml
+++ b/.github/workflows/test-k2-as-third-party-lib-cpu-macos.yml
@@ -38,6 +38,7 @@ on:
       - 'k2/csrc/**'
       - 'k2/python/**'
       - 'scripts/github_actions/k2-torch-api-test/**'
+  workflow_dispatch:
 
 concurrency:
   group: test-k2-as-third-party-lib-cpu-macos-${{ github.ref }}
diff --git a/.github/workflows/test-k2-as-third-party-lib-cpu-ubuntu.yml b/.github/workflows/test-k2-as-third-party-lib-cpu-ubuntu.yml
index d45e757db..8119cc812 100644
--- a/.github/workflows/test-k2-as-third-party-lib-cpu-ubuntu.yml
+++ b/.github/workflows/test-k2-as-third-party-lib-cpu-ubuntu.yml
@@ -38,6 +38,7 @@ on:
       - 'k2/csrc/**'
       - 'k2/python/**'
       - 'scripts/github_actions/k2-torch-api-test/**'
+  workflow_dispatch:
 
 concurrency:
   group: test-k2-as-third-party-lib-cpu-ubuntu-${{ github.ref }}
diff --git a/.github/workflows/test-k2-as-third-party-lib-cpu-windows.yml b/.github/workflows/test-k2-as-third-party-lib-cpu-windows.yml
index bee01d455..7c6ea68a1 100644
--- a/.github/workflows/test-k2-as-third-party-lib-cpu-windows.yml
+++ b/.github/workflows/test-k2-as-third-party-lib-cpu-windows.yml
@@ -38,6 +38,7 @@ on:
       - 'k2/csrc/**'
       - 'k2/python/**'
       - 'scripts/github_actions/k2-torch-api-test/**'
+  workflow_dispatch:
 
 concurrency:
   group: test-k2-as-third-party-lib-cpu-windows-${{ github.ref }}
diff --git a/.github/workflows/test-k2-as-third-party-lib-cuda-ubuntu.yml b/.github/workflows/test-k2-as-third-party-lib-cuda-ubuntu.yml
index b79f4ac70..f20056e82 100644
--- a/.github/workflows/test-k2-as-third-party-lib-cuda-ubuntu.yml
+++ b/.github/workflows/test-k2-as-third-party-lib-cuda-ubuntu.yml
@@ -38,6 +38,7 @@ on:
       - 'k2/csrc/**'
       - 'k2/python/**'
       - 'scripts/github_actions/k2-torch-api-test/**'
+  workflow_dispatch:
 
 concurrency:
   group: test-k2-as-third-party-lib-cuda-ubuntu-${{ github.ref }}
diff --git a/.github/workflows/ubuntu-cpu-wheels.yml b/.github/workflows/ubuntu-cpu-wheels.yml
index 0a2da29b2..e813bb12d 100644
--- a/.github/workflows/ubuntu-cpu-wheels.yml
+++ b/.github/workflows/ubuntu-cpu-wheels.yml
@@ -18,7 +18,7 @@ jobs:
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
       - name: Generating build matrix
@@ -39,7 +39,7 @@ jobs:
         ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -65,7 +65,7 @@ jobs:
               /var/www/scripts/github_actions/build-ubuntu-cpu.sh
 
       - name: Upload Wheel
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         with:
           name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cpu
           path: wheelhouse/*.whl
diff --git a/.github/workflows/ubuntu-cuda-wheels.yml b/.github/workflows/ubuntu-cuda-wheels.yml
index 6f272fa02..06166ab8a 100644
--- a/.github/workflows/ubuntu-cuda-wheels.yml
+++ b/.github/workflows/ubuntu-cuda-wheels.yml
@@ -18,7 +18,7 @@ jobs:
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
       - name: Generating build matrix
@@ -39,7 +39,7 @@ jobs:
         ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -83,7 +83,7 @@ jobs:
           ls -lh ./wheelhouse/
 
       - name: Upload Wheel
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         with:
           name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-ubuntu-latest-cuda
           path: wheelhouse/*.whl
diff --git a/.github/workflows/windows-x64-cpu-wheels.yml b/.github/workflows/windows-x64-cpu-wheels.yml
index 9f3c37468..60fcdea2f 100644
--- a/.github/workflows/windows-x64-cpu-wheels.yml
+++ b/.github/workflows/windows-x64-cpu-wheels.yml
@@ -18,7 +18,7 @@ jobs:
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
       - name: Generating build matrix
@@ -39,7 +39,7 @@ jobs:
         ${{ fromJson(needs.generate_build_matrix.outputs.matrix) }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
         with:
           fetch-depth: 0
 
@@ -59,7 +59,7 @@ jobs:
           ls -lh ./wheelhouse/
 
       - name: Upload Wheel
-        uses: actions/upload-artifact@v2
+        uses: actions/upload-artifact@v3
         with:
           name: torch-${{ matrix.torch }}-python-${{ matrix.python-version }}-windows-latest-cpu
           path: wheelhouse/*.whl
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7808bd33e..07ce545bf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -178,30 +178,33 @@ execute_process(COMMAND
   ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE
 )
 
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
+endif()
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
+
 include(CheckIncludeFileCXX)
 check_include_file_cxx(cxxabi.h K2_HAVE_CXXABI_H)
 check_include_file_cxx(execinfo.h K2_HAVE_EXECINFO_H)
 
 include(CheckCXXCompilerFlag)
 if(NOT WIN32)
-  check_cxx_compiler_flag("-std=c++14" K2_COMPILER_SUPPORTS_CXX14)
+  check_cxx_compiler_flag("-std=c++${CMAKE_CXX_STANDARD}" K2_COMPILER_SUPPORTS_CXX${CMAKE_CXX_STANDARD})
 else()
   # windows x86 or x86_64
-  check_cxx_compiler_flag("/std:c++14" K2_COMPILER_SUPPORTS_CXX14)
+  check_cxx_compiler_flag("/std:c++${CMAKE_CXX_STANDARD}" K2_COMPILER_SUPPORTS_CXX${CMAKE_CXX_STANDARD})
 endif()
-if(NOT K2_COMPILER_SUPPORTS_CXX14)
+if(NOT K2_COMPILER_SUPPORTS_CXX${CMAKE_CXX_STANDARD})
   message(FATAL_ERROR "
-    k2 requires a compiler supporting at least C++14.
+    k2 requires a compiler supporting at least C++${CMAKE_CXX_STANDARD}.
     If you are using GCC, please upgrade it to at least version 7.0.
     If you are using Clang, please upgrade it to at least version 3.4.")
 endif()
 
 # ========= Settings for CUB begin =========
 # the following settings are modified from cub/CMakeLists.txt
-set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
-set(CMAKE_CXX_EXTENSIONS OFF)
-
-message(STATUS "C++ Standard version: ${CMAKE_CXX_STANDARD}")
 
 if(K2_WITH_CUDA)
   # Force CUDA C++ standard to be the same as the C++ standard used.
diff --git a/cmake/select_compute_arch.cmake b/cmake/select_compute_arch.cmake
index 43fa5056d..ffa2c4aa6 100644
--- a/cmake/select_compute_arch.cmake
+++ b/cmake/select_compute_arch.cmake
@@ -44,6 +44,11 @@ endif()
 # This list is used to filter CUDA archs when autodetecting
 set(CUDA_ALL_GPU_ARCHITECTURES "3.5" "5.0")
 
+if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
+  list(REMOVE_ITEM CUDA_COMMON_GPU_ARCHITECTURES "3.5")
+  list(REMOVE_ITEM CUDA_ALL_GPU_ARCHITECTURES "3.5")
+endif()
+
 if(CUDA_VERSION VERSION_GREATER "6.5")
   list(APPEND CUDA_KNOWN_GPU_ARCHITECTURES "Kepler+Tegra" "Kepler+Tesla" "Maxwell+Tegra")
   list(APPEND CUDA_COMMON_GPU_ARCHITECTURES "5.2")
diff --git a/cmake/torch.cmake b/cmake/torch.cmake
index 69fdf0c13..5f5a8c8f7 100644
--- a/cmake/torch.cmake
+++ b/cmake/torch.cmake
@@ -14,7 +14,22 @@ find_package(Torch REQUIRED)
 # k2 uses the same abi flag as PyTorch
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
 if(K2_WITH_CUDA)
+  if(CUDA_VERSION VERSION_GREATER_EQUAL "12.0")
+    string(REPLACE " " ";" MY_LIST ${CMAKE_CUDA_FLAGS})
+    set(TEMP_LIST)
+    foreach(f IN LISTS MY_LIST)
+      if(f STREQUAL arch=compute_35,code=sm_35)
+        list(REMOVE_AT TEMP_LIST -1)
+        continue()
+      endif()
+      list(APPEND TEMP_LIST ${f})
+    endforeach()
+
+    string(REPLACE ";" " " CMAKE_CUDA_FLAGS "${TEMP_LIST}")
+  endif()
+
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} ${TORCH_CXX_FLAGS}")
+  message(WARNING " CMAKE_CUDA_FLAGS is ${CMAKE_CUDA_FLAGS}")
 endif()
 
 
@@ -42,7 +57,6 @@ execute_process(
 message(STATUS "PyTorch version: ${TORCH_VERSION}")
 
 if(K2_WITH_CUDA)
-
   execute_process(
     COMMAND "${PYTHON_EXECUTABLE}" -c "import torch; print(torch.version.cuda)"
     OUTPUT_STRIP_TRAILING_WHITESPACE
diff --git a/docs/source/installation/cuda-cudnn.rst b/docs/source/installation/cuda-cudnn.rst
index c9b6cb3f8..5dc78b4d5 100644
--- a/docs/source/installation/cuda-cudnn.rst
+++ b/docs/source/installation/cuda-cudnn.rst
@@ -615,3 +615,80 @@ The output should look like the following:
   Built on Wed_Sep_21_10:33:58_PDT_2022
   Cuda compilation tools, release 11.8, V11.8.89
   Build cuda_11.8.r11.8/compiler.31833905_0
+
+CUDA 12.1
+---------
+
+You can use the following commands to install CUDA 12.1. We install it
+into ``/star-fj/fangjun/software/cuda-12.1.0``. You can replace it
+if needed.
+
+.. code-block:: bash
+
+  wget https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
+
+  chmod +x cuda_12.1.0_530.30.02_linux.run
+
+  ./cuda_12.1.0_530.30.02_linux.run \
+    --silent \
+    --toolkit \
+    --installpath=/star-fj/fangjun/software/cuda-12.1.0 \
+    --no-opengl-libs \
+    --no-drm \
+    --no-man-page
+
+Install cuDNN for CUDA 12.1
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Now, install ``cuDNN`` for CUDA 12.1.
+
+.. code-block:: bash
+
+  wget https://huggingface.co/csukuangfj/cudnn/resolve/main/cudnn-linux-x86_64-8.9.5.29_cuda12-archive.tar.xz
+
+  tar xvf cudnn-linux-x86_64-8.9.5.29_cuda12-archive.tar.xz --strip-components=1 -C /star-fj/fangjun/software/cuda-12.1.0
+
+Set environment variables for CUDA 12.1
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Note that we have to set the following environment variables after installing
+CUDA 12.1. You can save the following code to ``activate-cuda-12.1.sh``
+and use ``source activate-cuda-12.1.sh`` if you want to activate CUDA 12.1.
+
+.. code-block:: bash
+
+  export CUDA_HOME=/star-fj/fangjun/software/cuda-12.1.0
+  export PATH=$CUDA_HOME/bin:$PATH
+  export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+  export LD_LIBRARY_PATH=$CUDA_HOME/lib:$LD_LIBRARY_PATH
+  export LD_LIBRARY_PATH=$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+  export CUDAToolkit_ROOT_DIR=$CUDA_HOME
+  export CUDAToolkit_ROOT=$CUDA_HOME
+
+  export CUDA_TOOLKIT_ROOT_DIR=$CUDA_HOME
+  export CUDA_TOOLKIT_ROOT=$CUDA_HOME
+  export CUDA_BIN_PATH=$CUDA_HOME
+  export CUDA_PATH=$CUDA_HOME
+  export CUDA_INC_PATH=$CUDA_HOME/targets/x86_64-linux
+  export CFLAGS=-I$CUDA_HOME/targets/x86_64-linux/include:$CFLAGS
+  export CUDAToolkit_TARGET_DIR=$CUDA_HOME/targets/x86_64-linux
+
+To check that you have installed CUDA 12.1 successfully, please run:
+
+.. code-block:: bash
+
+  which nvcc
+
+  nvcc --version
+
+The output should look like the following:
+
+.. code-block:: bash
+
+  /star-fj/fangjun/software/cuda-12.1.0/bin/nvcc
+
+  nvcc: NVIDIA (R) Cuda compiler driver
+  Copyright (c) 2005-2023 NVIDIA Corporation
+  Built on Tue_Feb__7_19:32:13_PST_2023
+  Cuda compilation tools, release 12.1, V12.1.66
+  Build cuda_12.1.r12.1/compiler.32415258_0
diff --git a/k2/torch/bin/CMakeLists.txt b/k2/torch/bin/CMakeLists.txt
index 54399ecdb..4b0e2dede 100644
--- a/k2/torch/bin/CMakeLists.txt
+++ b/k2/torch/bin/CMakeLists.txt
@@ -15,7 +15,7 @@ if(NOT K2_WITH_CUDA)
   transform(OUTPUT_VARIABLE ctc_decode_srcs SRCS ${ctc_decode_srcs})
 endif()
 add_executable(ctc_decode ${ctc_decode_srcs})
-set_property(TARGET ctc_decode PROPERTY CXX_STANDARD 14)
+set_property(TARGET ctc_decode PROPERTY CXX_STANDARD ${CMAKE_CXX_STANDARD})
 target_link_libraries(ctc_decode ${bin_dep_libs})
 
 #----------------------------------------
@@ -26,7 +26,7 @@ if(NOT K2_WITH_CUDA)
   transform(OUTPUT_VARIABLE hlg_decode_srcs SRCS ${hlg_decode_srcs})
 endif()
 add_executable(hlg_decode ${hlg_decode_srcs})
-set_property(TARGET hlg_decode PROPERTY CXX_STANDARD 14)
+set_property(TARGET hlg_decode PROPERTY CXX_STANDARD ${CMAKE_CXX_STANDARD})
 target_link_libraries(hlg_decode ${bin_dep_libs})
 
 #-------------------------------------------
@@ -37,7 +37,7 @@ if(NOT K2_WITH_CUDA)
   transform(OUTPUT_VARIABLE ngram_lm_rescore_srcs SRCS ${ngram_lm_rescore_srcs})
 endif()
 add_executable(ngram_lm_rescore ${ngram_lm_rescore_srcs})
-set_property(TARGET ngram_lm_rescore PROPERTY CXX_STANDARD 14)
+set_property(TARGET ngram_lm_rescore PROPERTY CXX_STANDARD ${CMAKE_CXX_STANDARD})
 target_link_libraries(ngram_lm_rescore ${bin_dep_libs})
 
 #---------------------------------------------------------------
@@ -48,7 +48,7 @@ if(NOT K2_WITH_CUDA)
   transform(OUTPUT_VARIABLE attention_rescore_srcs SRCS ${attention_rescore_srcs})
 endif()
 add_executable(attention_rescore ${attention_rescore_srcs})
-set_property(TARGET attention_rescore PROPERTY CXX_STANDARD 14)
+set_property(TARGET attention_rescore PROPERTY CXX_STANDARD ${CMAKE_CXX_STANDARD})
 target_link_libraries(attention_rescore ${bin_dep_libs})
 
 
@@ -61,7 +61,7 @@ if(NOT K2_WITH_CUDA)
 endif()
 
 add_executable(online_decode ${online_decode_srcs})
-set_property(TARGET online_decode PROPERTY CXX_STANDARD 14)
+set_property(TARGET online_decode PROPERTY CXX_STANDARD ${CMAKE_CXX_STANDARD})
 target_link_libraries(online_decode ${bin_dep_libs})
 
 #-------------------------------------------
@@ -73,7 +73,7 @@ if(NOT K2_WITH_CUDA)
 endif()
 
 add_executable(rnnt_demo ${rnnt_demo_srcs})
-set_property(TARGET rnnt_demo PROPERTY CXX_STANDARD 14)
+set_property(TARGET rnnt_demo PROPERTY CXX_STANDARD ${CMAKE_CXX_STANDARD})
 target_link_libraries(rnnt_demo ${bin_dep_libs})
 
 #-------------------------------------------
@@ -85,5 +85,5 @@ if(NOT K2_WITH_CUDA)
 endif()
 
 add_executable(pruned_stateless_transducer ${pruned_stateless_transducer_srcs})
-set_property(TARGET pruned_stateless_transducer PROPERTY CXX_STANDARD 14)
+set_property(TARGET pruned_stateless_transducer PROPERTY CXX_STANDARD ${CMAKE_CXX_STANDARD})
 target_link_libraries(pruned_stateless_transducer ${bin_dep_libs})
diff --git a/scripts/github_actions/build-ubuntu-cuda.sh b/scripts/github_actions/build-ubuntu-cuda.sh
index 033cba837..544b151b4 100755
--- a/scripts/github_actions/build-ubuntu-cuda.sh
+++ b/scripts/github_actions/build-ubuntu-cuda.sh
@@ -18,7 +18,7 @@ fi
 if [ -z $CUDA_VERSION ]; then
   echo "Please set the environment variable CUDA_VERSION"
   echo "Example: export CUDA_VERSION=10.2"
-  # valid values: 10.2, 11.1, 11.3, 11.6, 11.7, 11.8
+  # valid values: 10.2, 11.1, 11.3, 11.6, 11.7, 11.8, 12.1
   exit 1
 fi
 
@@ -69,7 +69,8 @@ yum clean all >/dev/null 2>&1
 cd /var/www
 
 export CMAKE_CUDA_COMPILER_LAUNCHER=
-export K2_CMAKE_ARGS=" -DPYTHON_EXECUTABLE=$PYTHON_INSTALL_DIR/bin/python3 "
+# export K2_CMAKE_ARGS="-DCUDAToolkit_TARGET_DIR=/usr/local/cuda/targets/x86_64-linux -DPYTHON_EXECUTABLE=$PYTHON_INSTALL_DIR/bin/python3 "
+export K2_CMAKE_ARGS="-DPYTHON_EXECUTABLE=$PYTHON_INSTALL_DIR/bin/python3 "
 export K2_MAKE_ARGS=" -j2 "
 
 python3 setup.py bdist_wheel
@@ -89,8 +90,28 @@ auditwheel --verbose repair \
   --exclude libcudnn.so.8 \
   --exclude libcublas.so.11 \
   --exclude libcublasLt.so.11 \
+  --exclude libcublas.so.12 \
+  --exclude libcublas.so \
+  --exclude libcublasLt.so.12 \
+  --exclude libcublasLt.so \
   --exclude libcudart.so.11.0 \
+  --exclude libcudart.so.12 \
   --exclude libnvrtc.so.11.2 \
+  --exclude libnvrtc.so.12 \
+  --exclude libnvrtc.so \
+  --exclude libcupti.so.12 \
+  --exclude libcupti.so \
+  --exclude libcusparse.so.12 \
+  --exclude libcusparse.so \
+  --exclude libnvJitLink.so.12 \
+  --exclude libnvJitLink.so \
+  --exclude libcurand.so.10 \
+  --exclude libcurand.so \
+  --exclude libcufft.so.11 \
+  --exclude libcufft.so \
+  --exclude libnccl.so.2 \
+  --exclude libnccl.so \
+  --exclude libshm.so \
   --exclude libtorch_cuda_cu.so \
   --exclude libtorch_cuda_cpp.so \
   --plat manylinux_2_17_x86_64 \
diff --git a/scripts/github_actions/generate_build_matrix.py b/scripts/github_actions/generate_build_matrix.py
index fc04f6572..e9dfabdab 100755
--- a/scripts/github_actions/generate_build_matrix.py
+++ b/scripts/github_actions/generate_build_matrix.py
@@ -160,9 +160,16 @@ def generate_build_matrix(
             if not for_windows
             else ["11.7.1", "11.8.0"],
         },
+        "2.1.0": {
+            "python-version": ["3.8", "3.9", "3.10", "3.11"],
+            "cuda": ["11.8", "12.1"]  # default 12.1
+            if not for_windows
+            else ["11.8.0", "12.1.0"],
+        },
+        # https://github.com/Jimver/cuda-toolkit/blob/master/src/links/windows-links.ts
     }
     if test_only_latest_torch:
-        latest = "2.0.1"
+        latest = "2.1.0"
         matrix = {latest: matrix[latest]}
 
     if for_windows or for_macos:
diff --git a/scripts/github_actions/install_cuda.sh b/scripts/github_actions/install_cuda.sh
index f94e7d869..3d47f4c63 100755
--- a/scripts/github_actions/install_cuda.sh
+++ b/scripts/github_actions/install_cuda.sh
@@ -52,6 +52,9 @@ case "$cuda" in
   11.8)
     url=https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
     ;;
+  12.1)
+    url=https://developer.download.nvidia.com/compute/cuda/12.1.0/local_installers/cuda_12.1.0_530.30.02_linux.run
+    ;;
   *)
     echo "Unknown cuda version: $cuda"
     exit 1
@@ -66,10 +69,21 @@ retry curl -LSs -O $url
 filename=$(basename $url)
 echo "filename: $filename"
 chmod +x ./$filename
-sudo ./$filename --toolkit --silent
+
+ls -lh
+ls -lh /usr/local
+
+sudo ./$filename \
+  --silent \
+  --toolkit \
+  --no-opengl-libs \
+  --no-drm \
+  --no-man-page
+
 rm -fv ./$filename
 
 export CUDA_HOME=/usr/local/cuda
 export PATH=$CUDA_HOME/bin:$PATH
 export LD_LIBRARY_PATH=$CUDA_HOME/lib:$LD_LIBRARY_PATH
 export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
+ls -lh $CUDA_HOME
diff --git a/scripts/github_actions/install_cudnn.sh b/scripts/github_actions/install_cudnn.sh
index 7bfe681e4..527e9a7a1 100755
--- a/scripts/github_actions/install_cudnn.sh
+++ b/scripts/github_actions/install_cudnn.sh
@@ -45,6 +45,9 @@ case $cuda in
   11.8)
     filename=cudnn-11.3-linux-x64-v8.2.0.53.tgz
     ;;
+  12.1)
+    filename=cudnn-linux-x86_64-8.9.5.29_cuda12-archive.tar.xz
+    ;;
   *)
     echo "Unsupported cuda version: $cuda"
     exit 1
diff --git a/scripts/github_actions/install_torch.sh b/scripts/github_actions/install_torch.sh
index 689ff58e8..1209ed616 100755
--- a/scripts/github_actions/install_torch.sh
+++ b/scripts/github_actions/install_torch.sh
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-if [ $TORCH_VERSION != "" ] && [ $CUDA_VERSION != "" ]; then
+if [ x"$TORCH_VERSION" != x"" ] && [ x"$CUDA_VERSION" != x"" ]; then
     torch=$TORCH_VERSION
     cuda=$CUDA_VERSION
 fi
@@ -172,6 +172,24 @@ case ${torch} in
         ;;
     esac
     ;;
+  2.1.*)
+    case ${cuda} in
+      11.8)
+        package="torch==${torch}+cu118"
+        url=https://download.pytorch.org/whl/torch_stable.html
+        ;;
+      12.1)
+        package="torch==${torch}"
+        # Leave it empty to use PyPI.
+        url=
+        ;;
+      *)
+        # Fail early instead of leaving this branch without selecting a package.
+        echo "Unsupported CUDA version ${cuda} for PyTorch ${torch}"
+        exit 1
+        ;;
+    esac
+    ;;
   *)
     echo "Unsupported PyTorch version: ${torch}"
     exit 1
diff --git a/setup.py b/setup.py
index 006093522..32e63c2eb 100644
--- a/setup.py
+++ b/setup.py
@@ -150,6 +150,12 @@ def build_extension(self, ext: setuptools.extension.Extension):
             f" -DCMAKE_INSTALL_PREFIX={Path(self.build_lib).resolve()}/k2 "  # noqa
         )
 
+        # Request C++17 from CMake when the installed PyTorch is 2.1 or newer;
+        # for older versions no standard is appended here.
+        major, minor = map(int, get_pytorch_version().split(".")[:2])
+        if (major, minor) >= (2, 1):
+            extra_cmake_args += " -DCMAKE_CXX_STANDARD=17 "
+
         if cmake_args == "":
             cmake_args = "-DCMAKE_BUILD_TYPE=Release"