From ee5bbbaed6ccb1810bb2c909d284bd523b5d3473 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 8 Jan 2023 22:35:50 -0600 Subject: [PATCH 01/17] [ci] speed up if-else, swig, and lint conda setup --- .ci/test.sh | 107 +++++++++++++++++++++++++++------------------------- 1 file changed, 55 insertions(+), 52 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index f18198b5924f..137f3c58a0b5 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -34,7 +34,61 @@ if [[ "$TASK" == "cpp-tests" ]]; then exit 0 fi -conda create -q -y -n $CONDA_ENV "python=$PYTHON_VERSION[build=*cpython]" +CONDA_PYTHON_REQUIREMENT="python=$PYTHON_VERSION[build=*cpython]" + +if [[ $TASK == "if-else" ]]; then + conda create -q -y -n $CONDA_ENV ${CONDA_PYTHON_REQUIREMENT} numpy + mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. && make lightgbm -j4 || exit -1 + cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 + cd $BUILD_DIRECTORY/build && make lightgbm -j4 || exit -1 + cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=predict.conf output_result=ifelse.pred && python test.py || exit -1 + exit 0 +fi + +if [[ $TASK == "swig" ]]; then + mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build + if [[ $OS_NAME == "macos" ]]; then + cmake -DUSE_SWIG=ON -DAPPLE_OUTPUT_DYLIB=ON .. + else + cmake -DUSE_SWIG=ON .. + fi + make -j4 || exit -1 + if [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "gcc" ]]; then + objdump -T $BUILD_DIRECTORY/lib_lightgbm.so > $BUILD_DIRECTORY/objdump.log || exit -1 + objdump -T $BUILD_DIRECTORY/lib_lightgbm_swig.so >> $BUILD_DIRECTORY/objdump.log || exit -1 + python $BUILD_DIRECTORY/helpers/check_dynamic_dependencies.py $BUILD_DIRECTORY/objdump.log || exit -1 + fi + if [[ $PRODUCES_ARTIFACTS == "true" ]]; then + cp $BUILD_DIRECTORY/build/lightgbmlib.jar $BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_$OS_NAME.jar + fi + exit 0 +fi + +if [[ $TASK == "lint" ]]; then + conda create -q -y -n $CONDA_ENV \ + ${CONDA_PYTHON_REQUIREMENT} \ + cmakelint \ + cpplint \ + isort \ + mypy \ + pycodestyle \ + pydocstyle \ + "r-lintr>=3.0" + echo "Linting Python code" + pycodestyle --ignore=E501,W503 --exclude=./.nuget,./external_libs . || exit -1 + pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 + isort . --check-only || exit -1 + mypy --ignore-missing-imports python-package/ || true + echo "Linting R code" + Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 + echo "Linting C++ code" + cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig ./tests || exit -1 + cmake_files=$(find . 
-name CMakeLists.txt -o -path "*/cmake/*.cmake") + cmakelint --linelength=120 --filter=-convention/filename,-package/stdargs,-readability/wonkycase ${cmake_files} || exit -1 + exit 0 +fi + +conda create -q -y -n $CONDA_ENV "${CONDA_PYTHON_REQUIREMENT}" source activate $CONDA_ENV cd $BUILD_DIRECTORY @@ -72,57 +126,6 @@ if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then exit 0 fi -if [[ $TASK == "lint" ]]; then - conda install -q -y -n $CONDA_ENV \ - cmakelint \ - cpplint \ - isort \ - mypy \ - pycodestyle \ - pydocstyle \ - "r-lintr>=3.0" - echo "Linting Python code" - pycodestyle --ignore=E501,W503 --exclude=./.nuget,./external_libs . || exit -1 - pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 - isort . --check-only || exit -1 - mypy --ignore-missing-imports python-package/ || true - echo "Linting R code" - Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 - echo "Linting C++ code" - cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig ./tests || exit -1 - cmake_files=$(find . -name CMakeLists.txt -o -path "*/cmake/*.cmake") - cmakelint --linelength=120 --filter=-convention/filename,-package/stdargs,-readability/wonkycase ${cmake_files} || exit -1 - exit 0 -fi - -if [[ $TASK == "if-else" ]]; then - conda install -q -y -n $CONDA_ENV numpy - mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. && make lightgbm -j4 || exit -1 - cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 - cd $BUILD_DIRECTORY/build && make lightgbm -j4 || exit -1 - cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=predict.conf output_result=ifelse.pred && python test.py || exit -1 - exit 0 -fi - -if [[ $TASK == "swig" ]]; then - mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build - if [[ $OS_NAME == "macos" ]]; then - cmake -DUSE_SWIG=ON -DAPPLE_OUTPUT_DYLIB=ON .. - else - cmake -DUSE_SWIG=ON .. - fi - make -j4 || exit -1 - if [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "gcc" ]]; then - objdump -T $BUILD_DIRECTORY/lib_lightgbm.so > $BUILD_DIRECTORY/objdump.log || exit -1 - objdump -T $BUILD_DIRECTORY/lib_lightgbm_swig.so >> $BUILD_DIRECTORY/objdump.log || exit -1 - python $BUILD_DIRECTORY/helpers/check_dynamic_dependencies.py $BUILD_DIRECTORY/objdump.log || exit -1 - fi - if [[ $PRODUCES_ARTIFACTS == "true" ]]; then - cp $BUILD_DIRECTORY/build/lightgbmlib.jar $BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_$OS_NAME.jar - fi - exit 0 -fi - # re-including python=version[build=*cpython] to ensure that conda doesn't fall back to pypy conda install -q -y -n $CONDA_ENV \ cloudpickle \ From 12f8f555b7b8d67793ab3bbf9eadd047c827d249 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 8 Jan 2023 22:47:14 -0600 Subject: [PATCH 02/17] add 'source activate' --- .ci/test.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.ci/test.sh b/.ci/test.sh index 137f3c58a0b5..6d7ecfc6635d 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -38,6 +38,7 @@ CONDA_PYTHON_REQUIREMENT="python=$PYTHON_VERSION[build=*cpython]" if [[ $TASK == "if-else" ]]; then conda create -q -y -n $CONDA_ENV ${CONDA_PYTHON_REQUIREMENT} numpy + source activate $CONDA_ENV mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. 
&& make lightgbm -j4 || exit -1 cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 cd $BUILD_DIRECTORY/build && make lightgbm -j4 || exit -1 @@ -74,6 +75,7 @@ if [[ $TASK == "lint" ]]; then pycodestyle \ pydocstyle \ "r-lintr>=3.0" + source activate $CONDA_ENV echo "Linting Python code" pycodestyle --ignore=E501,W503 --exclude=./.nuget,./external_libs . || exit -1 pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 From a981a0741db8dddf2f9cd9fbbac61f5369626455 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 8 Jan 2023 22:47:56 -0600 Subject: [PATCH 03/17] python constraint --- .ci/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/test.sh b/.ci/test.sh index 6d7ecfc6635d..c66f24abc74c 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -139,7 +139,7 @@ conda install -q -y -n $CONDA_ENV \ pandas \ psutil \ pytest \ - "python=$PYTHON_VERSION[build=*cpython]" \ + ${CONDA_PYTHON_REQUIREMENT} \ python-graphviz \ scikit-learn \ scipy || exit -1 From 05ee8a124dfee077db61143a1cf7f292fa5de16f Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 16 Jan 2023 22:58:44 -0600 Subject: [PATCH 04/17] start removing cuda v1 --- .ci/test.sh | 35 +- .github/workflows/cuda.yml | 20 +- CMakeLists.txt | 33 +- docs/Installation-Guide.rst | 4 +- docs/Parameters.rst | 6 +- include/LightGBM/config.h | 4 +- python-package/setup.py | 7 +- src/objective/objective_function.cpp | 24 +- src/treelearner/cuda_kernel_launcher.cu | 171 ---- src/treelearner/cuda_kernel_launcher.h | 70 -- src/treelearner/cuda_tree_learner.cpp | 1031 ----------------------- src/treelearner/cuda_tree_learner.h | 261 ------ src/treelearner/serial_tree_learner.cpp | 8 - 13 files changed, 47 insertions(+), 1627 deletions(-) delete mode 100644 src/treelearner/cuda_kernel_launcher.cu delete mode 100644 src/treelearner/cuda_kernel_launcher.h delete mode 100644 src/treelearner/cuda_tree_learner.cpp delete mode 100644 src/treelearner/cuda_tree_learner.h diff --git a/.ci/test.sh b/.ci/test.sh index c66f24abc74c..12b4ae027993 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -201,41 +201,24 @@ if [[ $TASK == "gpu" ]]; then elif [[ $METHOD == "source" ]]; then cmake -DUSE_GPU=ON .. fi -elif [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then - if [[ $TASK == "cuda" ]]; then - sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h - grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done - else - sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda_exp";/' $BUILD_DIRECTORY/include/LightGBM/config.h - grep -q 'std::string device_type = "cuda_exp"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done - # by default ``gpu_use_dp=false`` for efficiency. 
change to ``true`` here for exact results in ci tests - sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h - grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done - fi +elif [[ $TASK == "cuda" ]]; then + sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done + # by default ``gpu_use_dp=false`` for efficiency. change to ``true`` here for exact results in ci tests + sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h + grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit -1 # make sure that changes were really done if [[ $METHOD == "pip" ]]; then cd $BUILD_DIRECTORY/python-package && python setup.py sdist || exit -1 - if [[ $TASK == "cuda" ]]; then - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 - else - pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda-exp || exit -1 - fi + pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER.tar.gz -v --install-option=--cuda || exit -1 pytest $BUILD_DIRECTORY/tests/python_package_test || exit -1 exit 0 elif [[ $METHOD == "wheel" ]]; then - if [[ $TASK == "cuda" ]]; then - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1 - else - cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda-exp || exit -1 - fi + cd $BUILD_DIRECTORY/python-package && python setup.py bdist_wheel --cuda || exit -1 pip install --user $BUILD_DIRECTORY/python-package/dist/lightgbm-$LGB_VER*.whl -v || exit -1 pytest $BUILD_DIRECTORY/tests || exit -1 exit 0 elif [[ $METHOD == "source" ]]; then - if [[ $TASK == "cuda" ]]; then - cmake -DUSE_CUDA=ON .. - else - cmake -DUSE_CUDA_EXP=ON .. - fi + cmake -DUSE_CUDA=ON .. 
fi elif [[ $TASK == "mpi" ]]; then if [[ $METHOD == "pip" ]]; then diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 946f548784a6..ca17ff2ca587 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -28,31 +28,21 @@ jobs: fail-fast: false matrix: include: - - method: source - compiler: gcc - python_version: "3.8" - cuda_version: "11.7.1" - task: cuda - - method: pip - compiler: clang - python_version: "3.9" - cuda_version: "10.0" - task: cuda - method: wheel compiler: gcc python_version: "3.10" - cuda_version: "9.0" + cuda_version: "11.7.1" task: cuda - method: source compiler: gcc python_version: "3.8" - cuda_version: "11.7.1" - task: cuda_exp + cuda_version: "10.0" + task: cuda - method: pip compiler: clang python_version: "3.9" - cuda_version: "10.0" - task: cuda_exp + cuda_version: "11.7.1" + task: cuda steps: - name: Setup or update software on host machine run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index b7fa5dc8f330..8d005662f37a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,8 +4,8 @@ option(USE_GPU "Enable GPU-accelerated training" OFF) option(USE_SWIG "Enable SWIG to generate Java API" OFF) option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) option(USE_TIMETAG "Set to ON to output time costs" OFF) -option(USE_CUDA "Enable CUDA-accelerated training (EXPERIMENTAL)" OFF) -option(USE_CUDA_EXP "Enable CUDA-accelerated training with more acceleration (EXPERIMENTAL)" OFF) +option(USE_CUDA "Enable CUDA-accelerated training " OFF) +option(USE_CUDA_EXP "(DEPRECATED) Alias for 'USE_CUDA'. Use 'USE_CUDA' instead." OFF) option(USE_DEBUG "Set to ON for Debug mode" OFF) option(USE_SANITIZER "Use santizer flags" OFF) set( @@ -31,7 +31,7 @@ elseif(USE_SWIG) cmake_minimum_required(VERSION 3.8) elseif(USE_GPU OR APPLE) cmake_minimum_required(VERSION 3.2) -elseif(USE_CUDA OR USE_CUDA_EXP) +elseif(USE_CUDA) cmake_minimum_required(VERSION 3.16) else() cmake_minimum_required(VERSION 3.0) @@ -137,7 +137,12 @@ else() add_definitions(-DUSE_SOCKET) endif() -if(USE_CUDA OR USE_CUDA_EXP) +if(USE_CUDA_EXP) + message(WARNING "Option -DUSE_CUDA_EXP=ON is deprecated. 
Use -DUSE_CUDA=ON instead.") + set(USE_CUDA ON CACHE BOOL "Building CUDA-enabled version" FORCE) +endif() + +if(USE_CUDA) set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") enable_language(CUDA) set(USE_OPENMP ON CACHE BOOL "CUDA requires OpenMP" FORCE) @@ -192,12 +197,8 @@ if(__INTEGRATE_OPENCL) endif() endif() -if(USE_CUDA OR USE_CUDA_EXP) - if(USE_CUDA) - find_package(CUDA 9.0 REQUIRED) - else() - find_package(CUDA 10.0 REQUIRED) - endif() +if(USE_CUDA) + find_package(CUDA 10.0 REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall") @@ -224,11 +225,7 @@ if(USE_CUDA OR USE_CUDA_EXP) endif() message(STATUS "CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}") - if(USE_CUDA) - add_definitions(-DUSE_CUDA) - elseif(USE_CUDA_EXP) - add_definitions(-DUSE_CUDA_EXP) - endif() + add_definitions(-DUSE_CUDA) if(NOT DEFINED CMAKE_CUDA_STANDARD) set(CMAKE_CUDA_STANDARD 11) @@ -411,10 +408,8 @@ file( src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp -if(USE_CUDA OR USE_CUDA_EXP) +if(USE_CUDA) src/treelearner/*.cu -endif() -if(USE_CUDA_EXP) src/boosting/cuda/*.cpp src/boosting/cuda/*.cu src/metric/cuda/*.cpp @@ -549,7 +544,7 @@ if(__INTEGRATE_OPENCL) target_link_libraries(lightgbm_objs PUBLIC ${INTEGRATED_OPENCL_LIBRARIES} ${CMAKE_DL_LIBS}) endif() -if(USE_CUDA OR USE_CUDA_EXP) +if(USE_CUDA) # Disable cmake warning about policy CMP0104. Refer to issue #3754 and PR #4268. # Custom target properties does not propagate, thus we need to specify for # each target that contains or depends on cuda source. diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index 3f8cc45f2160..6ce3f2f0c987 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -621,7 +621,7 @@ On Linux a CUDA version of LightGBM can be built using **CUDA**, **CMake** and * The following dependencies should be installed before compilation: -- **CUDA** 9.0 or later libraries. Please refer to `this detailed guide`_. Pay great attention to the minimum required versions of host compilers listed in the table from that guide and use only recommended versions of compilers. +- **CUDA** 10.0 or later libraries. Please refer to `this detailed guide`_. Pay great attention to the minimum required versions of host compilers listed in the table from that guide and use only recommended versions of compilers. - **CMake** 3.16 or later. @@ -636,8 +636,6 @@ To build LightGBM CUDA version, run the following commands: cmake -DUSE_CUDA=1 .. make -j4 -Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``-DUSE_CUDA`` with ``-DUSE_CUDA_EXP`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries. - **Note**: glibc >= 2.14 is required. **Note**: In some rare cases you may need to install OpenMP runtime library separately (use your package manager and search for ``lib[g|i]omp`` for doing this). 
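For reference, after this series the CUDA build is driven by the single ``-DUSE_CUDA`` switch, with ``-DUSE_CUDA_EXP`` retained only as a deprecated alias. A minimal sketch of the documented build follows; the toolchain version checks are illustrative additions (not part of the patch), while the version requirements (CUDA >= 10.0, CMake >= 3.16) and build commands come from the Installation Guide changes above:

    nvcc --version        # confirm CUDA 10.0 or later is installed (illustrative check)
    cmake --version       # confirm CMake 3.16 or later is installed (illustrative check)
    mkdir build
    cd build
    cmake -DUSE_CUDA=1 ..
    make -j4
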
diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 4ac77d407ba6..d2cb9acdaf6d 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -205,7 +205,7 @@ Core Parameters - **Note**: please **don't** change this during training, especially when running multiple jobs simultaneously by external packages, otherwise it may cause undesirable errors -- ``device_type`` :raw-html:`🔗︎`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, ``cuda_exp``, aliases: ``device`` +- ``device_type`` :raw-html:`🔗︎`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, aliases: ``device`` - device for the tree learning, you can use GPU to achieve the faster learning @@ -215,10 +215,6 @@ Core Parameters - **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support - - **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical with ``cuda`` - - - **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future - - ``seed`` :raw-html:`🔗︎`, default = ``None``, type = int, aliases: ``random_seed``, ``random_state`` - this seed is used to generate other seeds, e.g. ``data_random_seed``, ``feature_fraction_seed``, etc. diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 4456d19b4da3..c2de146b3854 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -223,14 +223,12 @@ struct Config { // [doc-only] // type = enum - // options = cpu, gpu, cuda, cuda_exp + // options = cpu, gpu, cuda // alias = device // desc = device for the tree learning, you can use GPU to achieve the faster learning // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. 
You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support - // desc = **Note**: ``cuda_exp`` is an experimental CUDA version, the installation guide for ``cuda_exp`` is identical with ``cuda`` - // desc = **Note**: ``cuda_exp`` is faster than ``cuda`` and will replace ``cuda`` in the future std::string device_type = "cpu"; // [doc-only] diff --git a/python-package/setup.py b/python-package/setup.py index f40229f2c430..af07bfbdd5b4 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -128,6 +128,9 @@ def compile_cpp( logger.info("Starting to compile the library.") + if use_cuda_exp: + use_cuda = True + cmake_cmd = ["cmake", str(CURRENT_DIR / "compile")] if integrated_opencl: use_gpu = False @@ -148,8 +151,6 @@ def compile_cpp( cmake_cmd.append(f"-DOpenCL_LIBRARY={opencl_library}") elif use_cuda: cmake_cmd.append("-DUSE_CUDA=ON") - elif use_cuda_exp: - cmake_cmd.append("-DUSE_CUDA_EXP=ON") if use_mpi: cmake_cmd.append("-DUSE_MPI=ON") if nomp: @@ -171,7 +172,7 @@ def compile_cpp( else: status = 1 lib_path = CURRENT_DIR / "compile" / "windows" / "x64" / "DLL" / "lib_lightgbm.dll" - if not any((use_gpu, use_cuda, use_cuda_exp, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)): + if not any((use_gpu, use_cuda, use_mpi, use_hdfs, nomp, bit32, integrated_opencl)): logger.info("Starting to compile with MSBuild from existing solution file.") platform_toolsets = ("v143", "v142", "v141", "v140") for pt in platform_toolsets: diff --git a/src/objective/objective_function.cpp b/src/objective/objective_function.cpp index 79749570d672..3ce424721f86 100644 --- a/src/objective/objective_function.cpp +++ b/src/objective/objective_function.cpp @@ -18,8 +18,8 @@ namespace LightGBM { ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& type, const Config& config) { - #ifdef USE_CUDA_EXP - if (config.device_type == std::string("cuda_exp") && + #ifdef USE_CUDA + if (config.device_type == std::string("cuda") && config.data_sample_strategy != std::string("goss") && config.boosting != std::string("rf")) { if (type == std::string("regression")) { @@ -27,7 +27,7 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& } else if (type == std::string("regression_l1")) { return new CUDARegressionL1loss(config); } else if (type == std::string("quantile")) { - Log::Warning("Objective quantile is not implemented in cuda_exp version. Fall back to boosting on CPU."); + Log::Warning("Objective quantile is not implemented in cuda version. Fall back to boosting on CPU."); return new RegressionQuantileloss(config); } else if (type == std::string("huber")) { return new CUDARegressionHuberLoss(config); @@ -46,26 +46,26 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& } else if (type == std::string("multiclassova")) { return new CUDAMulticlassOVA(config); } else if (type == std::string("cross_entropy")) { - Log::Warning("Objective cross_entropy is not implemented in cuda_exp version. Fall back to boosting on CPU."); + Log::Warning("Objective cross_entropy is not implemented in cuda version. Fall back to boosting on CPU."); return new CrossEntropy(config); } else if (type == std::string("cross_entropy_lambda")) { - Log::Warning("Objective cross_entropy_lambda is not implemented in cuda_exp version. 
Fall back to boosting on CPU."); + Log::Warning("Objective cross_entropy_lambda is not implemented in cuda version. Fall back to boosting on CPU."); return new CrossEntropyLambda(config); } else if (type == std::string("mape")) { - Log::Warning("Objective mape is not implemented in cuda_exp version. Fall back to boosting on CPU."); + Log::Warning("Objective mape is not implemented in cuda version. Fall back to boosting on CPU."); return new RegressionMAPELOSS(config); } else if (type == std::string("gamma")) { - Log::Warning("Objective gamma is not implemented in cuda_exp version. Fall back to boosting on CPU."); + Log::Warning("Objective gamma is not implemented in cuda version. Fall back to boosting on CPU."); return new RegressionGammaLoss(config); } else if (type == std::string("tweedie")) { - Log::Warning("Objective tweedie is not implemented in cuda_exp version. Fall back to boosting on CPU."); + Log::Warning("Objective tweedie is not implemented in cuda version. Fall back to boosting on CPU."); return new RegressionTweedieLoss(config); } else if (type == std::string("custom")) { - Log::Warning("Using customized objective with cuda_exp. This requires copying gradients from CPU to GPU, which can be slow."); + Log::Warning("Using customized objective with cuda. This requires copying gradients from CPU to GPU, which can be slow."); return nullptr; } } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA if (type == std::string("regression")) { return new RegressionL2loss(config); } else if (type == std::string("regression_l1")) { @@ -101,9 +101,9 @@ ObjectiveFunction* ObjectiveFunction::CreateObjectiveFunction(const std::string& } else if (type == std::string("custom")) { return nullptr; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA Log::Fatal("Unknown objective type name: %s", type.c_str()); return nullptr; } diff --git a/src/treelearner/cuda_kernel_launcher.cu b/src/treelearner/cuda_kernel_launcher.cu deleted file mode 100644 index 05724695100e..000000000000 --- a/src/treelearner/cuda_kernel_launcher.cu +++ /dev/null @@ -1,171 +0,0 @@ -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. 
- */ -#ifdef USE_CUDA - -#include "cuda_kernel_launcher.h" - -#include - -#include - -#include - -namespace LightGBM { - -void cuda_histogram( - int histogram_size, - data_size_t leaf_num_data, - data_size_t num_data, - bool use_all_features, - bool is_constant_hessian, - int num_workgroups, - cudaStream_t stream, - uint8_t* arg0, - uint8_t* arg1, - data_size_t arg2, - data_size_t* arg3, - data_size_t arg4, - score_t* arg5, - score_t* arg6, - score_t arg6_const, - char* arg7, - volatile int* arg8, - void* arg9, - size_t exp_workgroups_per_feature) { - if (histogram_size == 16) { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram16_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram16<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } - } else if (histogram_size == 64) { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram64_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram64<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } - } else { - if (leaf_num_data == num_data) { - if (use_all_features) { - if (!is_constant_hessian) - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, 
arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram256_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256_fulldata<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } else { - if (use_all_features) { - // seems all features is always enabled, so this should be the same as fulldata - if (!is_constant_hessian) - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } else { - if (!is_constant_hessian) - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - else - histogram256<<>>(arg0, arg1, arg2, - arg3, arg4, arg5, - arg6_const, arg7, arg8, static_cast(arg9), exp_workgroups_per_feature); - } - } - } -} - -} // namespace LightGBM - -#endif // USE_CUDA diff --git a/src/treelearner/cuda_kernel_launcher.h b/src/treelearner/cuda_kernel_launcher.h deleted file mode 100644 index 0714e05b2f2d..000000000000 --- a/src/treelearner/cuda_kernel_launcher.h +++ /dev/null @@ -1,70 +0,0 @@ -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ -#define LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ - -#ifdef USE_CUDA -#include -#include "kernels/histogram_16_64_256.hu" // kernel, acc_type, data_size_t, uchar, score_t - -namespace LightGBM { - -struct ThreadData { - // device id - int device_id; - // parameters for cuda_histogram - int histogram_size; - data_size_t leaf_num_data; - data_size_t num_data; - bool use_all_features; - bool is_constant_hessian; - int num_workgroups; - cudaStream_t stream; - uint8_t* device_features; - uint8_t* device_feature_masks; - data_size_t* device_data_indices; - score_t* device_gradients; - score_t* device_hessians; - score_t hessians_const; - char* device_subhistograms; - volatile int* sync_counters; - void* device_histogram_outputs; - size_t exp_workgroups_per_feature; - // cuda events - cudaEvent_t* kernel_start; - cudaEvent_t* kernel_wait_obj; - std::chrono::duration* kernel_input_wait_time; - // copy histogram - size_t output_size; - char* host_histogram_output; - cudaEvent_t* histograms_wait_obj; -}; - - -void cuda_histogram( - int histogram_size, - data_size_t leaf_num_data, - data_size_t num_data, - bool use_all_features, - bool is_constant_hessian, - int num_workgroups, - cudaStream_t stream, - uint8_t* arg0, - uint8_t* arg1, - data_size_t arg2, - data_size_t* arg3, - data_size_t arg4, - score_t* arg5, - score_t* arg6, - score_t arg6_const, - char* arg7, - volatile int* arg8, - void* arg9, - size_t exp_workgroups_per_feature); - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_TREELEARNER_CUDA_KERNEL_LAUNCHER_H_ diff --git a/src/treelearner/cuda_tree_learner.cpp b/src/treelearner/cuda_tree_learner.cpp deleted file mode 100644 index a6bd4c47ae06..000000000000 --- a/src/treelearner/cuda_tree_learner.cpp +++ /dev/null @@ -1,1031 +0,0 @@ -/*! 
- * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifdef USE_CUDA -#include "cuda_tree_learner.h" - -#include -#include -#include -#include -#include - -#include - -#include -#include -#include - -#include "../io/dense_bin.hpp" - -namespace LightGBM { - -#define cudaMemcpy_DEBUG 0 // 1: DEBUG cudaMemcpy -#define ResetTrainingData_DEBUG 0 // 1: Debug ResetTrainingData - -#define CUDA_DEBUG 0 - -static void *launch_cuda_histogram(void *thread_data) { - ThreadData td = *(reinterpret_cast(thread_data)); - int device_id = td.device_id; - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - - // launch cuda kernel - cuda_histogram(td.histogram_size, - td.leaf_num_data, td.num_data, td.use_all_features, - td.is_constant_hessian, td.num_workgroups, td.stream, - td.device_features, - td.device_feature_masks, - td.num_data, - td.device_data_indices, - td.leaf_num_data, - td.device_gradients, - td.device_hessians, td.hessians_const, - td.device_subhistograms, td.sync_counters, - td.device_histogram_outputs, - td.exp_workgroups_per_feature); - - CUDASUCCESS_OR_FATAL(cudaGetLastError()); - - return NULL; -} - -CUDATreeLearner::CUDATreeLearner(const Config* config) - :SerialTreeLearner(config) { - use_bagging_ = false; - nthreads_ = 0; - if (config->gpu_use_dp && USE_DP_FLOAT) { - Log::Info("LightGBM using CUDA trainer with DP float!!"); - } else { - Log::Info("LightGBM using CUDA trainer with SP float!!"); - } -} - -CUDATreeLearner::~CUDATreeLearner() { - #pragma omp parallel for schedule(static, num_gpu_) - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - - if (device_features_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); - } - - if (device_gradients_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); - } - - if (device_hessians_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); - } - - if (device_feature_masks_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); - } - - if (device_data_indices_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); - } - - if (sync_counters_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); - } - - if (device_subhistograms_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - } - - if (device_histogram_outputs_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); - } - } -} - - -void CUDATreeLearner::Init(const Dataset* train_data, bool is_constant_hessian) { - // initialize SerialTreeLearner - SerialTreeLearner::Init(train_data, is_constant_hessian); - - // some additional variables needed for GPU trainer - num_feature_groups_ = train_data_->num_feature_groups(); - - // Initialize GPU buffers and kernels: get device info - InitGPU(config_->num_gpu); -} - -// some functions used for debugging the GPU histogram construction -#if CUDA_DEBUG > 0 - -void PrintHistograms(hist_t* h, size_t size) { - double total_hess = 0; - for (size_t i = 0; i < size; ++i) { - printf("%03lu=%9.3g,%9.3g\t", i, GET_GRAD(h, i), GET_HESS(h, i)); - if ((i & 3) == 3) - printf("\n"); - total_hess += GET_HESS(h, i); - } - printf("\nSum hessians: %9.3g\n", total_hess); -} - -union Float_t { - int64_t i; - double f; - static int64_t 
ulp_diff(Float_t a, Float_t b) { - return abs(a.i - b.i); - } -}; - -int CompareHistograms(hist_t* h1, hist_t* h2, size_t size, int feature_id, int dp_flag, int const_flag) { - int i; - int retval = 0; - printf("Comparing Histograms, feature_id = %d, size = %d\n", feature_id, static_cast(size)); - if (dp_flag) { // double precision - double af, bf; - int64_t ai, bi; - for (i = 0; i < static_cast(size); ++i) { - af = GET_GRAD(h1, i); - bf = GET_GRAD(h2, i); - if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { - printf("i = %5d, h1.grad %13.6lf, h2.grad %13.6lf\n", i, af, bf); - ++retval; - } - if (const_flag) { - ai = GET_HESS((reinterpret_cast(h1)), i); - bi = GET_HESS((reinterpret_cast(h2)), i); - if (ai != bi) { - printf("i = %5d, h1.hess %" PRId64 ", h2.hess %" PRId64 "\n", i, ai, bi); - ++retval; - } - } else { - af = GET_HESS(h1, i); - bf = GET_HESS(h2, i); - if (((std::fabs(af - bf))/af) >= 1e-6) { - printf("i = %5d, h1.hess %13.6lf, h2.hess %13.6lf\n", i, af, bf); - ++retval; - } - } - } - } else { // single precision - float af, bf; - int ai, bi; - for (i = 0; i < static_cast(size); ++i) { - af = GET_GRAD(h1, i); - bf = GET_GRAD(h2, i); - if ((((std::fabs(af - bf))/af) >= 1e-6) && ((std::fabs(af - bf)) >= 1e-6)) { - printf("i = %5d, h1.grad %13.6f, h2.grad %13.6f\n", i, af, bf); - ++retval; - } - if (const_flag) { - ai = GET_HESS(h1, i); - bi = GET_HESS(h2, i); - if (ai != bi) { - printf("i = %5d, h1.hess %d, h2.hess %d\n", i, ai, bi); - ++retval; - } - } else { - af = GET_HESS(h1, i); - bf = GET_HESS(h2, i); - if (((std::fabs(af - bf))/af) >= 1e-5) { - printf("i = %5d, h1.hess %13.6f, h2.hess %13.6f\n", i, af, bf); - ++retval; - } - } - } - } - printf("DONE Comparing Histograms...\n"); - return retval; -} -#endif - -int CUDATreeLearner::GetNumWorkgroupsPerFeature(data_size_t leaf_num_data) { - // we roughly want 256 workgroups per device, and we have num_dense_feature4_ feature tuples. 
- // also guarantee that there are at least 2K examples per workgroup - double x = 256.0 / num_dense_feature_groups_; - - int exp_workgroups_per_feature = static_cast(ceil(log2(x))); - double t = leaf_num_data / 1024.0; - - Log::Debug("We can have at most %d workgroups per feature4 for efficiency reasons\n" - "Best workgroup size per feature for full utilization is %d\n", static_cast(ceil(t)), (1 << exp_workgroups_per_feature)); - - exp_workgroups_per_feature = std::min(exp_workgroups_per_feature, static_cast(ceil(log(static_cast(t))/log(2.0)))); - if (exp_workgroups_per_feature < 0) - exp_workgroups_per_feature = 0; - if (exp_workgroups_per_feature > kMaxLogWorkgroupsPerFeature) - exp_workgroups_per_feature = kMaxLogWorkgroupsPerFeature; - - return exp_workgroups_per_feature; -} - -void CUDATreeLearner::GPUHistogram(data_size_t leaf_num_data, bool use_all_features) { - // we have already copied ordered gradients, ordered hessians and indices to GPU - // decide the best number of workgroups working on one feature4 tuple - // set work group size based on feature size - // each 2^exp_workgroups_per_feature workgroups work on a feature4 tuple - int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(leaf_num_data); - std::vector num_gpu_workgroups; - ThreadData *thread_data = reinterpret_cast(_mm_malloc(sizeof(ThreadData) * num_gpu_, 16)); - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; - int num_workgroups = (1 << exp_workgroups_per_feature) * num_gpu_feature_groups; - num_gpu_workgroups.push_back(num_workgroups); - if (num_workgroups > preallocd_max_num_wg_[device_id]) { - preallocd_max_num_wg_.at(device_id) = num_workgroups; - CUDASUCCESS_OR_FATAL(cudaFree(device_subhistograms_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), static_cast(num_workgroups * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)))); - } - // set thread_data - SetThreadData(thread_data, device_id, histogram_size_, leaf_num_data, use_all_features, - num_workgroups, exp_workgroups_per_feature); - } - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - if (pthread_create(cpu_threads_[device_id], NULL, launch_cuda_histogram, reinterpret_cast(&thread_data[device_id]))) { - Log::Fatal("Error in creating threads."); - } - } - - /* Wait for the threads to finish */ - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - if (pthread_join(*(cpu_threads_[device_id]), NULL)) { - Log::Fatal("Error in joining threads."); - } - } - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - // copy the results asynchronously. 
Size depends on if double precision is used - - size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(reinterpret_cast(host_histogram_outputs_) + host_output_offset, device_histogram_outputs_[device_id], output_size, cudaMemcpyDeviceToHost, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(histograms_wait_obj_[device_id], stream_[device_id])); - } -} - - -template -void CUDATreeLearner::WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array) { - HistType* hist_outputs = reinterpret_cast(host_histogram_outputs_); - - #pragma omp parallel for schedule(static, num_gpu_) - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - // when the output is ready, the computation is done - CUDASUCCESS_OR_FATAL(cudaEventSynchronize(histograms_wait_obj_[device_id])); - } - - HistType* histograms = reinterpret_cast(leaf_histogram_array[0].RawData() - kHistOffset); - #pragma omp parallel for schedule(static) - for (int i = 0; i < num_dense_feature_groups_; ++i) { - if (!feature_masks_[i]) { - continue; - } - int dense_group_index = dense_feature_group_map_[i]; - auto old_histogram_array = histograms + train_data_->GroupBinBoundary(dense_group_index) * 2; - int bin_size = train_data_->FeatureGroupNumBin(dense_group_index); - - for (int j = 0; j < bin_size; ++j) { - GET_GRAD(old_histogram_array, j) = GET_GRAD(hist_outputs, i * device_bin_size_+ j); - GET_HESS(old_histogram_array, j) = GET_HESS(hist_outputs, i * device_bin_size_+ j); - } - } -} - -void CUDATreeLearner::CountDenseFeatureGroups() { - num_dense_feature_groups_ = 0; - - for (int i = 0; i < num_feature_groups_; ++i) { - if (!train_data_->IsMultiGroup(i)) { - num_dense_feature_groups_++; - } - } - if (!num_dense_feature_groups_) { - Log::Warning("GPU acceleration is disabled because no non-trivial dense features can be found"); - } -} - -void CUDATreeLearner::prevAllocateGPUMemory() { - // how many feature-group tuples we have - // leave some safe margin for prefetching - // 256 work-items per workgroup. Each work-item prefetches one tuple for that feature - - allocated_num_data_ = std::max(num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature), allocated_num_data_); - - // clear sparse/dense maps - dense_feature_group_map_.clear(); - sparse_feature_group_map_.clear(); - - // do nothing it there is no dense feature - if (!num_dense_feature_groups_) { - return; - } - - // calculate number of feature groups per gpu - num_gpu_feature_groups_.resize(num_gpu_); - offset_gpu_feature_groups_.resize(num_gpu_); - int num_features_per_gpu = num_dense_feature_groups_ / num_gpu_; - int remain_features = num_dense_feature_groups_ - num_features_per_gpu * num_gpu_; - - int offset = 0; - - for (int i = 0; i < num_gpu_; ++i) { - offset_gpu_feature_groups_.at(i) = offset; - num_gpu_feature_groups_.at(i) = (i < remain_features) ? num_features_per_gpu + 1 : num_features_per_gpu; - offset += num_gpu_feature_groups_.at(i); - } - - feature_masks_.resize(num_dense_feature_groups_); - Log::Debug("Resized feature masks"); - - ptr_pinned_feature_masks_ = feature_masks_.data(); - Log::Debug("Memset pinned_feature_masks_"); - memset(ptr_pinned_feature_masks_, 0, num_dense_feature_groups_); - - // histogram bin entry size depends on the precision (single/double) - hist_bin_entry_sz_ = 2 * (config_->gpu_use_dp ? 
sizeof(hist_t) : sizeof(gpu_hist_t)); // two elements in this "size" - - CUDASUCCESS_OR_FATAL(cudaHostAlloc(reinterpret_cast(&host_histogram_outputs_), static_cast(num_dense_feature_groups_ * device_bin_size_ * hist_bin_entry_sz_), cudaHostAllocPortable)); - - nthreads_ = std::min(omp_get_max_threads(), num_dense_feature_groups_ / dword_features_); - nthreads_ = std::max(nthreads_, 1); -} - -// allocate GPU memory for each GPU -void CUDATreeLearner::AllocateGPUMemory() { - #pragma omp parallel for schedule(static, num_gpu_) - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - // do nothing it there is no gpu feature - int num_gpu_feature_groups = num_gpu_feature_groups_[device_id]; - if (num_gpu_feature_groups) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - - // allocate memory for all features - if (device_features_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_features_[device_id])); - } - - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_features_[device_id]), static_cast(num_gpu_feature_groups * num_data_ * sizeof(uint8_t)))); - Log::Debug("Allocated device_features_ addr=%p sz=%lu", device_features_[device_id], num_gpu_feature_groups * num_data_); - - // allocate space for gradients and hessians on device - // we will copy gradients and hessians in after ordered_gradients_ and ordered_hessians_ are constructed - if (device_gradients_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_gradients_[device_id])); - } - - if (device_hessians_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_hessians_[device_id])); - } - - if (device_feature_masks_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_feature_masks_[device_id])); - } - - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_gradients_[device_id]), static_cast(allocated_num_data_ * sizeof(score_t)))); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_hessians_[device_id]), static_cast(allocated_num_data_ * sizeof(score_t)))); - - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_feature_masks_[device_id]), static_cast(num_gpu_feature_groups))); - - // copy indices to the device - if (device_data_indices_[device_id] != NULL) { - CUDASUCCESS_OR_FATAL(cudaFree(device_data_indices_[device_id])); - } - - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_data_indices_[device_id]), static_cast(allocated_num_data_ * sizeof(data_size_t)))); - CUDASUCCESS_OR_FATAL(cudaMemsetAsync(device_data_indices_[device_id], 0, allocated_num_data_ * sizeof(data_size_t), stream_[device_id])); - - Log::Debug("Memset device_data_indices_"); - - // create output buffer, each feature has a histogram with device_bin_size_ bins, - // each work group generates a sub-histogram of dword_features_ features. 
- if (!device_subhistograms_[device_id]) { - // only initialize once here, as this will not need to change when ResetTrainingData() is called - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_subhistograms_[device_id]), static_cast(preallocd_max_num_wg_[device_id] * dword_features_ * device_bin_size_ * (3 * hist_bin_entry_sz_ / 2)))); - - Log::Debug("created device_subhistograms_: %p", device_subhistograms_[device_id]); - } - - // create atomic counters for inter-group coordination - CUDASUCCESS_OR_FATAL(cudaFree(sync_counters_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(sync_counters_[device_id]), static_cast(num_gpu_feature_groups * sizeof(int)))); - CUDASUCCESS_OR_FATAL(cudaMemsetAsync(sync_counters_[device_id], 0, num_gpu_feature_groups * sizeof(int), stream_[device_id])); - - // The output buffer is allocated to host directly, to overlap compute and data transfer - CUDASUCCESS_OR_FATAL(cudaFree(device_histogram_outputs_[device_id])); - CUDASUCCESS_OR_FATAL(cudaMalloc(&(device_histogram_outputs_[device_id]), static_cast(num_gpu_feature_groups * device_bin_size_ * hist_bin_entry_sz_))); - } - } -} - -void CUDATreeLearner::ResetGPUMemory() { - // clear sparse/dense maps - dense_feature_group_map_.clear(); - sparse_feature_group_map_.clear(); -} - -void CUDATreeLearner::copyDenseFeature() { - if (num_feature_groups_ == 0) { - LGBM_config_::current_learner = use_cpu_learner; - return; - } - - Log::Debug("Started copying dense features from CPU to GPU"); - // find the dense feature-groups and group then into Feature4 data structure (several feature-groups packed into 4 bytes) - size_t copied_feature = 0; - // set device info - int device_id = 0; - uint8_t* device_features = device_features_[device_id]; - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - Log::Debug("Started copying dense features from CPU to GPU - 1"); - - for (int i = 0; i < num_feature_groups_; ++i) { - // looking for dword_features_ non-sparse feature-groups - if (!train_data_->IsMultiGroup(i)) { - dense_feature_group_map_.push_back(i); - auto sizes_in_byte = std::min(train_data_->FeatureGroupSizesInByte(i), static_cast(num_data_)); - void* tmp_data = train_data_->FeatureGroupData(i); - Log::Debug("Started copying dense features from CPU to GPU - 2"); - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(&device_features[copied_feature * num_data_], tmp_data, sizes_in_byte, cudaMemcpyHostToDevice, stream_[device_id])); - Log::Debug("Started copying dense features from CPU to GPU - 3"); - copied_feature++; - // reset device info - if (copied_feature == static_cast(num_gpu_feature_groups_[device_id])) { - CUDASUCCESS_OR_FATAL(cudaEventRecord(features_future_[device_id], stream_[device_id])); - device_id += 1; - copied_feature = 0; - if (device_id < num_gpu_) { - device_features = device_features_[device_id]; - CUDASUCCESS_OR_FATAL(cudaSetDevice(device_id)); - } - } - } else { - sparse_feature_group_map_.push_back(i); - } - } -} - - - -// InitGPU w/ num_gpu -void CUDATreeLearner::InitGPU(int num_gpu) { - // Get the max bin size, used for selecting best GPU kernel - max_num_bin_ = 0; - - #if CUDA_DEBUG >= 1 - printf("bin_size: "); - #endif - for (int i = 0; i < num_feature_groups_; ++i) { - if (train_data_->IsMultiGroup(i)) { - continue; - } - #if CUDA_DEBUG >= 1 - printf("%d, ", train_data_->FeatureGroupNumBin(i)); - #endif - max_num_bin_ = std::max(max_num_bin_, train_data_->FeatureGroupNumBin(i)); - } - #if CUDA_DEBUG >= 1 - printf("\n"); - #endif - - if (max_num_bin_ <= 16) { - device_bin_size_ = 16; - histogram_size_ = 16; - 
dword_features_ = 1; - } else if (max_num_bin_ <= 64) { - device_bin_size_ = 64; - histogram_size_ = 64; - dword_features_ = 1; - } else if (max_num_bin_ <= 256) { - Log::Debug("device_bin_size_ = 256"); - device_bin_size_ = 256; - histogram_size_ = 256; - dword_features_ = 1; - } else { - Log::Fatal("bin size %d cannot run on GPU", max_num_bin_); - } - - // ignore the feature groups that contain categorical features when producing warnings about max_bin. - // these groups may contain larger number of bins due to categorical features, but not due to the setting of max_bin. - int max_num_bin_no_categorical = 0; - int cur_feature_group = 0; - bool categorical_feature_found = false; - for (int inner_feature_index = 0; inner_feature_index < num_features_; ++inner_feature_index) { - const int feature_group = train_data_->Feature2Group(inner_feature_index); - const BinMapper* feature_bin_mapper = train_data_->FeatureBinMapper(inner_feature_index); - if (feature_bin_mapper->bin_type() == BinType::CategoricalBin) { - categorical_feature_found = true; - } - if (feature_group != cur_feature_group || inner_feature_index == num_features_ - 1) { - if (!categorical_feature_found) { - max_num_bin_no_categorical = std::max(max_num_bin_no_categorical, train_data_->FeatureGroupNumBin(cur_feature_group)); - } - categorical_feature_found = false; - cur_feature_group = feature_group; - } - } - if (max_num_bin_no_categorical == 65) { - Log::Warning("Setting max_bin to 63 is suggested for best performance"); - } - if (max_num_bin_no_categorical == 17) { - Log::Warning("Setting max_bin to 15 is suggested for best performance"); - } - - // get num_dense_feature_groups_ - CountDenseFeatureGroups(); - - if (num_gpu > num_dense_feature_groups_) num_gpu = num_dense_feature_groups_; - - // initialize GPU - int gpu_count; - - CUDASUCCESS_OR_FATAL(cudaGetDeviceCount(&gpu_count)); - num_gpu_ = (gpu_count < num_gpu) ? 
gpu_count : num_gpu; - - // set cpu threads - cpu_threads_ = reinterpret_cast(_mm_malloc(sizeof(pthread_t *)*num_gpu_, 16)); - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - cpu_threads_[device_id] = reinterpret_cast(_mm_malloc(sizeof(pthread_t), 16)); - } - - // resize device memory pointers - device_features_.resize(num_gpu_); - device_gradients_.resize(num_gpu_); - device_hessians_.resize(num_gpu_); - device_feature_masks_.resize(num_gpu_); - device_data_indices_.resize(num_gpu_); - sync_counters_.resize(num_gpu_); - device_subhistograms_.resize(num_gpu_); - device_histogram_outputs_.resize(num_gpu_); - - // create stream & events to handle multiple GPUs - preallocd_max_num_wg_.resize(num_gpu_, 1024); - stream_.resize(num_gpu_); - hessians_future_.resize(num_gpu_); - gradients_future_.resize(num_gpu_); - indices_future_.resize(num_gpu_); - features_future_.resize(num_gpu_); - kernel_start_.resize(num_gpu_); - kernel_wait_obj_.resize(num_gpu_); - histograms_wait_obj_.resize(num_gpu_); - - for (int i = 0; i < num_gpu_; ++i) { - CUDASUCCESS_OR_FATAL(cudaSetDevice(i)); - CUDASUCCESS_OR_FATAL(cudaStreamCreate(&(stream_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(hessians_future_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(gradients_future_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(indices_future_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(features_future_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(kernel_start_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(kernel_wait_obj_[i]))); - CUDASUCCESS_OR_FATAL(cudaEventCreate(&(histograms_wait_obj_[i]))); - } - - allocated_num_data_ = 0; - prevAllocateGPUMemory(); - - AllocateGPUMemory(); - - copyDenseFeature(); -} - -Tree* CUDATreeLearner::Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) { - Tree *ret = SerialTreeLearner::Train(gradients, hessians, is_first_tree); - return ret; -} - -void CUDATreeLearner::ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) { - // check data size - data_size_t old_allocated_num_data = allocated_num_data_; - - SerialTreeLearner::ResetTrainingDataInner(train_data, is_constant_hessian, reset_multi_val_bin); - - #if ResetTrainingData_DEBUG == 1 - serial_time = std::chrono::steady_clock::now() - start_serial_time; - #endif - - num_feature_groups_ = train_data_->num_feature_groups(); - - // GPU memory has to been reallocated because data may have been changed - #if ResetTrainingData_DEBUG == 1 - auto start_alloc_gpu_time = std::chrono::steady_clock::now(); - #endif - - // AllocateGPUMemory only when the number of data increased - int old_num_feature_groups = num_dense_feature_groups_; - CountDenseFeatureGroups(); - if ((old_allocated_num_data < (num_data_ + 256 * (1 << kMaxLogWorkgroupsPerFeature))) || (old_num_feature_groups < num_dense_feature_groups_)) { - prevAllocateGPUMemory(); - AllocateGPUMemory(); - } else { - ResetGPUMemory(); - } - - copyDenseFeature(); - - #if ResetTrainingData_DEBUG == 1 - alloc_gpu_time = std::chrono::steady_clock::now() - start_alloc_gpu_time; - #endif - - // setup GPU kernel arguments after we allocating all the buffers - #if ResetTrainingData_DEBUG == 1 - auto start_set_arg_time = std::chrono::steady_clock::now(); - #endif - - #if ResetTrainingData_DEBUG == 1 - set_arg_time = std::chrono::steady_clock::now() - start_set_arg_time; - reset_training_data_time = std::chrono::steady_clock::now() - start_reset_training_data_time; - Log::Info("reset_training_data_time: %f 
secs.", reset_training_data_time.count() * 1e-3); - Log::Info("serial_time: %f secs.", serial_time.count() * 1e-3); - Log::Info("alloc_gpu_time: %f secs.", alloc_gpu_time.count() * 1e-3); - Log::Info("set_arg_time: %f secs.", set_arg_time.count() * 1e-3); - #endif -} - -void CUDATreeLearner::BeforeTrain() { - #if cudaMemcpy_DEBUG == 1 - std::chrono::duration device_hessians_time = std::chrono::milliseconds(0); - std::chrono::duration device_gradients_time = std::chrono::milliseconds(0); - #endif - - SerialTreeLearner::BeforeTrain(); - - #if CUDA_DEBUG >= 2 - printf("CUDATreeLearner::BeforeTrain() Copying initial full gradients and hessians to device\n"); - #endif - - // Copy initial full hessians and gradients to GPU. - // We start copying as early as possible, instead of at ConstructHistogram(). - if ((hessians_ != NULL) && (gradients_ != NULL)) { - if (!use_bagging_ && num_dense_feature_groups_) { - Log::Debug("CudaTreeLearner::BeforeTrain() No baggings, dense_feature_groups_=%d", num_dense_feature_groups_); - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - if (!(share_state_->is_constant_hessian)) { - Log::Debug("CUDATreeLearner::BeforeTrain(): Starting hessians_ -> device_hessians_"); - - #if cudaMemcpy_DEBUG == 1 - auto start_device_hessians_time = std::chrono::steady_clock::now(); - #endif - - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], hessians_, num_data_*sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - - CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - - #if cudaMemcpy_DEBUG == 1 - device_hessians_time = std::chrono::steady_clock::now() - start_device_hessians_time; - #endif - - Log::Debug("queued copy of device_hessians_"); - } - - #if cudaMemcpy_DEBUG == 1 - auto start_device_gradients_time = std::chrono::steady_clock::now(); - #endif - - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], gradients_, num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); - - #if cudaMemcpy_DEBUG == 1 - device_gradients_time = std::chrono::steady_clock::now() - start_device_gradients_time; - #endif - - Log::Debug("CUDATreeLearner::BeforeTrain: issued gradients_ -> device_gradients_"); - } - } - } - - // use bagging - if ((hessians_ != NULL) && (gradients_ != NULL)) { - if (data_partition_->leaf_count(0) != num_data_ && num_dense_feature_groups_) { - // On GPU, we start copying indices, gradients and hessians now, instead at ConstructHistogram() - // copy used gradients and hessians to ordered buffer - const data_size_t* indices = data_partition_->indices(); - data_size_t cnt = data_partition_->leaf_count(0); - - // transfer the indices to GPU - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], indices, cnt * sizeof(*indices), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - - if (!(share_state_->is_constant_hessian)) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_hessians_[device_id], const_cast(reinterpret_cast(&(hessians_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(hessians_future_[device_id], stream_[device_id])); - } - - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_gradients_[device_id], 
const_cast(reinterpret_cast(&(gradients_[0]))), num_data_ * sizeof(score_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(gradients_future_[device_id], stream_[device_id])); - } - } - } -} - -bool CUDATreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) { - int smaller_leaf; - - data_size_t num_data_in_left_child = GetGlobalDataCountInLeaf(left_leaf); - data_size_t num_data_in_right_child = GetGlobalDataCountInLeaf(right_leaf); - - // only have root - if (right_leaf < 0) { - smaller_leaf = -1; - } else if (num_data_in_left_child < num_data_in_right_child) { - smaller_leaf = left_leaf; - } else { - smaller_leaf = right_leaf; - } - - // Copy indices, gradients and hessians as early as possible - if (smaller_leaf >= 0 && num_dense_feature_groups_) { - // only need to initialize for smaller leaf - // Get leaf boundary - const data_size_t* indices = data_partition_->indices(); - data_size_t begin = data_partition_->leaf_begin(smaller_leaf); - data_size_t end = begin + data_partition_->leaf_count(smaller_leaf); - - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], &indices[begin], (end-begin) * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - } - } - - const bool ret = SerialTreeLearner::BeforeFindBestSplit(tree, left_leaf, right_leaf); - - return ret; -} - -bool CUDATreeLearner::ConstructGPUHistogramsAsync( - const std::vector& is_feature_used, - const data_size_t* data_indices, data_size_t num_data) { - if (num_data <= 0) { - return false; - } - - // do nothing if no features can be processed on GPU - if (!num_dense_feature_groups_) { - Log::Debug("no dense feature groups, returning"); - return false; - } - - // copy data indices if it is not null - if (data_indices != nullptr && num_data != num_data_) { - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_data_indices_[device_id], data_indices, num_data * sizeof(data_size_t), cudaMemcpyHostToDevice, stream_[device_id])); - CUDASUCCESS_OR_FATAL(cudaEventRecord(indices_future_[device_id], stream_[device_id])); - } - } - - // converted indices in is_feature_used to feature-group indices - std::vector is_feature_group_used(num_feature_groups_, 0); - - #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) - for (int i = 0; i < num_features_; ++i) { - if (is_feature_used[i]) { - int feature_group = train_data_->Feature2Group(i); - is_feature_group_used[feature_group] = (train_data_->FeatureGroupNumBin(feature_group) <= 16) ? 
2 : 1; - } - } - - // construct the feature masks for dense feature-groups - int used_dense_feature_groups = 0; - #pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) - for (int i = 0; i < num_dense_feature_groups_; ++i) { - if (is_feature_group_used[dense_feature_group_map_[i]]) { - feature_masks_[i] = is_feature_group_used[dense_feature_group_map_[i]]; - ++used_dense_feature_groups; - } else { - feature_masks_[i] = 0; - } - } - bool use_all_features = ((used_dense_feature_groups == num_dense_feature_groups_) && (data_indices != nullptr)); - // if no feature group is used, just return and do not use GPU - if (used_dense_feature_groups == 0) { - return false; - } - - // if not all feature groups are used, we need to transfer the feature mask to GPU - // otherwise, we will use a specialized GPU kernel with all feature groups enabled - - // We now copy even if all features are used. - #pragma omp parallel for schedule(static, num_gpu_) - for (int device_id = 0; device_id < num_gpu_; ++device_id) { - int offset = offset_gpu_feature_groups_[device_id]; - CUDASUCCESS_OR_FATAL(cudaMemcpyAsync(device_feature_masks_[device_id], ptr_pinned_feature_masks_ + offset, num_gpu_feature_groups_[device_id] , cudaMemcpyHostToDevice, stream_[device_id])); - } - - // All data have been prepared, now run the GPU kernel - GPUHistogram(num_data, use_all_features); - - return true; -} - -void CUDATreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { - std::vector is_sparse_feature_used(num_features_, 0); - std::vector is_dense_feature_used(num_features_, 0); - int num_dense_features = 0, num_sparse_features = 0; - - #pragma omp parallel for schedule(static) - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; - if (!is_feature_used[feature_index]) continue; - if (train_data_->IsMultiGroup(train_data_->Feature2Group(feature_index))) { - is_sparse_feature_used[feature_index] = 1; - num_sparse_features++; - } else { - is_dense_feature_used[feature_index] = 1; - num_dense_features++; - } - } - - // construct smaller leaf - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; - - // Check workgroups per feature4 tuple.. 
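// Editor's sketch, not part of the removed learner: the call just below uses
// GetNumWorkgroupsPerFeature(), which (per the removed header's doc comment)
// returned the log2 of the number of workgroups to launch per feature, so the
// caller could fall back to the CPU path when the leaf was too small. A minimal,
// self-contained illustration of that idea follows; kMaxLogWorkgroupsPerFeature
// mirrors the constant declared in the removed header, while the function name
// and kRowsPerWorkgroup tile size are purely illustrative assumptions.
static int ExampleLogWorkgroupsPerFeature(int leaf_num_data) {
  const int kMaxLogWorkgroupsPerFeature = 10;  // at most 2^10 workgroups, as in the removed header
  const int kRowsPerWorkgroup = 1024;          // assumed rows covered by one workgroup (illustrative)
  int exp = 0;
  while ((kRowsPerWorkgroup << exp) < leaf_num_data && exp < kMaxLogWorkgroupsPerFeature) {
    ++exp;  // double the workgroup count until the leaf is covered or the cap is reached
  }
  return exp;  // 0 signals "too little work for the GPU, use the serial CPU learner"
}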
- int exp_workgroups_per_feature = GetNumWorkgroupsPerFeature(smaller_leaf_splits_->num_data_in_leaf()); - - // if the workgroup per feature is 1 (2^0), return as the work is too small for a GPU - if (exp_workgroups_per_feature == 0) { - return SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); - } - - // ConstructGPUHistogramsAsync will return true if there are availabe feature groups dispatched to GPU - bool is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, - nullptr, smaller_leaf_splits_->num_data_in_leaf()); - - // then construct sparse features on CPU - // We set data_indices to null to avoid rebuilding ordered gradients/hessians - if (num_sparse_features > 0) { - train_data_->ConstructHistograms(is_sparse_feature_used, - smaller_leaf_splits_->data_indices(), smaller_leaf_splits_->num_data_in_leaf(), - gradients_, hessians_, - ordered_gradients_.data(), ordered_hessians_.data(), - share_state_.get(), - ptr_smaller_leaf_hist_data); - } - - // wait for GPU to finish, only if GPU is actually used - if (is_gpu_used) { - if (config_->gpu_use_dp) { - // use double precision - WaitAndGetHistograms(smaller_leaf_histogram_array_); - } else { - // use single precision - WaitAndGetHistograms(smaller_leaf_histogram_array_); - } - } - - // Compare GPU histogram with CPU histogram, useful for debuggin GPU code problem - // #define CUDA_DEBUG_COMPARE -#ifdef CUDA_DEBUG_COMPARE - printf("Start Comparing_Histogram between GPU and CPU, num_dense_feature_groups_ = %d\n", num_dense_feature_groups_); - bool compare = true; - for (int i = 0; i < num_dense_feature_groups_; ++i) { - if (!feature_masks_[i]) - continue; - int dense_feature_group_index = dense_feature_group_map_[i]; - size_t size = train_data_->FeatureGroupNumBin(dense_feature_group_index); - hist_t* ptr_smaller_leaf_hist_data = smaller_leaf_histogram_array_[0].RawData() - kHistOffset; - hist_t* current_histogram = ptr_smaller_leaf_hist_data + train_data_->GroupBinBoundary(dense_feature_group_index) * 2; - hist_t* gpu_histogram = new hist_t[size * 2]; - data_size_t num_data = smaller_leaf_splits_->num_data_in_leaf(); - printf("Comparing histogram for feature %d, num_data %d, num_data_ = %d, %lu bins\n", dense_feature_group_index, num_data, num_data_, size); - std::copy(current_histogram, current_histogram + size * 2, gpu_histogram); - std::memset(current_histogram, 0, size * sizeof(hist_t) * 2); - if (train_data_->FeatureGroupBin(dense_feature_group_index) == nullptr) { - continue; - } - if (num_data == num_data_) { - if (share_state_->is_constant_hessian) { - printf("ConstructHistogram(): num_data == num_data_ is_constant_hessian\n"); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - 0, - num_data, - gradients_, - current_histogram); - } else { - printf("ConstructHistogram(): num_data == num_data_\n"); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - 0, - num_data, - gradients_, hessians_, - current_histogram); - } - } else { - if (share_state_->is_constant_hessian) { - printf("ConstructHistogram(): is_constant_hessian\n"); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - smaller_leaf_splits_->data_indices(), - 0, - num_data, - gradients_, - current_histogram); - } else { - printf("ConstructHistogram(): 4, num_data = %d, num_data_ = %d\n", num_data, num_data_); - train_data_->FeatureGroupBin(dense_feature_group_index)->ConstructHistogram( - smaller_leaf_splits_->data_indices(), - 0, - num_data, - gradients_, hessians_, - 
current_histogram); - } - } - int retval; - if ((num_data != num_data_) && compare) { - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian); - printf("CompareHistograms reports %d errors\n", retval); - compare = false; - } - retval = CompareHistograms(gpu_histogram, current_histogram, size, dense_feature_group_index, config_->gpu_use_dp, share_state_->is_constant_hessian); - if (num_data == num_data_) { - printf("CompareHistograms reports %d errors\n", retval); - } else { - printf("CompareHistograms reports %d errors\n", retval); - } - std::copy(gpu_histogram, gpu_histogram + size * 2, current_histogram); - delete [] gpu_histogram; - } - printf("End Comparing Histogram between GPU and CPU\n"); - fflush(stderr); - fflush(stdout); -#endif - - if (larger_leaf_histogram_array_ != nullptr && !use_subtract) { - // construct larger leaf - hist_t* ptr_larger_leaf_hist_data = larger_leaf_histogram_array_[0].RawData() - kHistOffset; - - is_gpu_used = ConstructGPUHistogramsAsync(is_feature_used, - larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf()); - - // then construct sparse features on CPU - // We set data_indices to null to avoid rebuilding ordered gradients/hessians - if (num_sparse_features > 0) { - train_data_->ConstructHistograms(is_sparse_feature_used, - larger_leaf_splits_->data_indices(), larger_leaf_splits_->num_data_in_leaf(), - gradients_, hessians_, - ordered_gradients_.data(), ordered_hessians_.data(), - share_state_.get(), - ptr_larger_leaf_hist_data); - } - - // wait for GPU to finish, only if GPU is actually used - if (is_gpu_used) { - if (config_->gpu_use_dp) { - // use double precision - WaitAndGetHistograms(larger_leaf_histogram_array_); - } else { - // use single precision - WaitAndGetHistograms(larger_leaf_histogram_array_); - } - } - } -} - -void CUDATreeLearner::FindBestSplits(const Tree* tree) { - SerialTreeLearner::FindBestSplits(tree); - -#if CUDA_DEBUG >= 3 - for (int feature_index = 0; feature_index < num_features_; ++feature_index) { - if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; - if (parent_leaf_histogram_array_ != nullptr - && !parent_leaf_histogram_array_[feature_index].is_splittable()) { - smaller_leaf_histogram_array_[feature_index].set_is_splittable(false); - continue; - } - size_t bin_size = train_data_->FeatureNumBin(feature_index) + 1; - printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd smaller leaf:\n", feature_index, bin_size); - PrintHistograms(smaller_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); - if (larger_leaf_splits_ == nullptr || larger_leaf_splits_->leaf_index() < 0) { continue; } - printf("CUDATreeLearner::FindBestSplits() Feature %d bin_size=%zd larger leaf:\n", feature_index, bin_size); - - PrintHistograms(larger_leaf_histogram_array_[feature_index].RawData() - kHistOffset, bin_size); - } -#endif -} - -void CUDATreeLearner::Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) { - const SplitInfo& best_split_info = best_split_per_leaf_[best_Leaf]; -#if CUDA_DEBUG >= 2 - printf("Splitting leaf %d with feature %d thresh %d gain %f stat %f %f %f %f\n", best_Leaf, best_split_info.feature, best_split_info.threshold, best_split_info.gain, best_split_info.left_sum_gradient, best_split_info.right_sum_gradient, best_split_info.left_sum_hessian, best_split_info.right_sum_hessian); -#endif - SerialTreeLearner::Split(tree, best_Leaf, left_leaf, right_leaf); 
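// Editor's sketch relating to the CUDA_DEBUG_COMPARE block above: the removed
// CompareHistograms() helper counted mismatched bins between the GPU and CPU
// histograms, which are stored as interleaved (sum_gradient, sum_hessian) pairs.
// The simplified signature, the double element type, and the tolerance value
// below are assumptions for illustration only, not the removed implementation.
#include <algorithm>
#include <cmath>
#include <cstdio>
static int ExampleCompareHistograms(const double* gpu_hist, const double* cpu_hist,
                                    size_t num_bins, double rel_tol = 1e-4) {
  int errors = 0;
  for (size_t i = 0; i < 2 * num_bins; ++i) {  // two entries (gradient, hessian) per bin
    const double denom = std::max(std::fabs(cpu_hist[i]), 1.0);
    if (std::fabs(gpu_hist[i] - cpu_hist[i]) / denom > rel_tol) {
      std::printf("bin %zu: gpu=%f cpu=%f\n", i / 2, gpu_hist[i], cpu_hist[i]);
      ++errors;
    }
  }
  return errors;  // number of bins whose GPU and CPU results disagree
}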
- if (Network::num_machines() == 1) { - // do some sanity check for the GPU algorithm - if (best_split_info.left_count < best_split_info.right_count) { - if ((best_split_info.left_count != smaller_leaf_splits_->num_data_in_leaf()) || - (best_split_info.right_count!= larger_leaf_splits_->num_data_in_leaf())) { - Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); - } - } else { - if ((best_split_info.left_count != larger_leaf_splits_->num_data_in_leaf()) || - (best_split_info.right_count!= smaller_leaf_splits_->num_data_in_leaf())) { - Log::Fatal("Bug in GPU histogram! split %d: %d, smaller_leaf: %d, larger_leaf: %d\n", best_split_info.left_count, best_split_info.right_count, smaller_leaf_splits_->num_data_in_leaf(), larger_leaf_splits_->num_data_in_leaf()); - } - } - } -} - -} // namespace LightGBM -#undef cudaMemcpy_DEBUG -#endif // USE_CUDA diff --git a/src/treelearner/cuda_tree_learner.h b/src/treelearner/cuda_tree_learner.h deleted file mode 100644 index b0e6fb3b4628..000000000000 --- a/src/treelearner/cuda_tree_learner.h +++ /dev/null @@ -1,261 +0,0 @@ -/*! - * Copyright (c) 2020 IBM Corporation. All rights reserved. - * Licensed under the MIT License. See LICENSE file in the project root for license information. - */ -#ifndef LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ -#define LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#ifdef USE_CUDA -#include -#endif - -#include "feature_histogram.hpp" -#include "serial_tree_learner.h" -#include "data_partition.hpp" -#include "split_info.hpp" -#include "leaf_splits.hpp" - -#ifdef USE_CUDA -#include -#include "cuda_kernel_launcher.h" - - -using json11::Json; - -namespace LightGBM { - -/*! -* \brief CUDA-based parallel learning algorithm. -*/ -class CUDATreeLearner: public SerialTreeLearner { - public: - explicit CUDATreeLearner(const Config* tree_config); - ~CUDATreeLearner(); - void Init(const Dataset* train_data, bool is_constant_hessian) override; - void ResetTrainingDataInner(const Dataset* train_data, bool is_constant_hessian, bool reset_multi_val_bin) override; - Tree* Train(const score_t* gradients, const score_t *hessians, bool is_first_tree) override; - void SetBaggingData(const Dataset* subset, const data_size_t* used_indices, data_size_t num_data) override { - SerialTreeLearner::SetBaggingData(subset, used_indices, num_data); - if (subset == nullptr && used_indices != nullptr) { - if (num_data != num_data_) { - use_bagging_ = true; - return; - } - } - use_bagging_ = false; - } - - protected: - void BeforeTrain() override; - bool BeforeFindBestSplit(const Tree* tree, int left_leaf, int right_leaf) override; - void FindBestSplits(const Tree* tree) override; - void Split(Tree* tree, int best_Leaf, int* left_leaf, int* right_leaf) override; - void ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) override; - - private: - typedef float gpu_hist_t; - - /*! - * \brief Find the best number of workgroups processing one feature for maximizing efficiency - * \param leaf_num_data The number of data examples on the current leaf being processed - * \return Log2 of the best number for workgroups per feature, in range 0...kMaxLogWorkgroupsPerFeature - */ - int GetNumWorkgroupsPerFeature(data_size_t leaf_num_data); - - /*! 
- * \brief Initialize GPU device - * \param num_gpu: number of maximum gpus - */ - void InitGPU(int num_gpu); - - /*! - * \brief Allocate memory for GPU computation // alloc only - */ - void CountDenseFeatureGroups(); // compute num_dense_feature_group - void prevAllocateGPUMemory(); // compute CPU-side param calculation & Pin HostMemory - void AllocateGPUMemory(); - - /*! - * \ ResetGPUMemory - */ - void ResetGPUMemory(); - - /*! - * \ copy dense feature from CPU to GPU - */ - void copyDenseFeature(); - - /*! - * \brief Compute GPU feature histogram for the current leaf. - * Indices, gradients and Hessians have been copied to the device. - * \param leaf_num_data Number of data on current leaf - * \param use_all_features Set to true to not use feature masks, with a faster kernel - */ - void GPUHistogram(data_size_t leaf_num_data, bool use_all_features); - - void SetThreadData(ThreadData* thread_data, int device_id, int histogram_size, - int leaf_num_data, bool use_all_features, - int num_workgroups, int exp_workgroups_per_feature) { - ThreadData* td = &thread_data[device_id]; - td->device_id = device_id; - td->histogram_size = histogram_size; - td->leaf_num_data = leaf_num_data; - td->num_data = num_data_; - td->use_all_features = use_all_features; - td->is_constant_hessian = share_state_->is_constant_hessian; - td->num_workgroups = num_workgroups; - td->stream = stream_[device_id]; - td->device_features = device_features_[device_id]; - td->device_feature_masks = reinterpret_cast(device_feature_masks_[device_id]); - td->device_data_indices = device_data_indices_[device_id]; - td->device_gradients = device_gradients_[device_id]; - td->device_hessians = device_hessians_[device_id]; - td->hessians_const = hessians_[0]; - td->device_subhistograms = device_subhistograms_[device_id]; - td->sync_counters = sync_counters_[device_id]; - td->device_histogram_outputs = device_histogram_outputs_[device_id]; - td->exp_workgroups_per_feature = exp_workgroups_per_feature; - - td->kernel_start = &(kernel_start_[device_id]); - td->kernel_wait_obj = &(kernel_wait_obj_[device_id]); - td->kernel_input_wait_time = &(kernel_input_wait_time_[device_id]); - - size_t output_size = num_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - size_t host_output_offset = offset_gpu_feature_groups_[device_id] * dword_features_ * device_bin_size_ * hist_bin_entry_sz_; - td->output_size = output_size; - td->host_histogram_output = reinterpret_cast(host_histogram_outputs_) + host_output_offset; - td->histograms_wait_obj = &(histograms_wait_obj_[device_id]); - } - - /*! - * \brief Wait for GPU kernel execution and read histogram - * \param histograms Destination of histogram results from GPU. - */ - template - void WaitAndGetHistograms(FeatureHistogram* leaf_histogram_array); - - /*! - * \brief Construct GPU histogram asynchronously. - * Interface is similar to Dataset::ConstructHistograms(). - * \param is_feature_used A predicate vector for enabling each feature - * \param data_indices Array of data example IDs to be included in histogram, will be copied to GPU. - * Set to nullptr to skip copy to GPU. - * \param num_data Number of data examples to be included in histogram - * \return true if GPU kernel is launched, false if GPU is not used - */ - bool ConstructGPUHistogramsAsync( - const std::vector& is_feature_used, - const data_size_t* data_indices, data_size_t num_data); - - /*! 
brief Log2 of max number of workgroups per feature*/ - const int kMaxLogWorkgroupsPerFeature = 10; // 2^10 - /*! brief Max total number of workgroups with preallocated workspace. - * If we use more than this number of workgroups, we have to reallocate subhistograms */ - std::vector preallocd_max_num_wg_; - - /*! \brief True if bagging is used */ - bool use_bagging_; - - /*! \brief GPU command queue object */ - std::vector stream_; - - /*! \brief total number of feature-groups */ - int num_feature_groups_; - /*! \brief total number of dense feature-groups, which will be processed on GPU */ - int num_dense_feature_groups_; - std::vector num_gpu_feature_groups_; - std::vector offset_gpu_feature_groups_; - /*! \brief On GPU we read one DWORD (4-byte) of features of one example once. - * With bin size > 16, there are 4 features per DWORD. - * With bin size <=16, there are 8 features per DWORD. - */ - int dword_features_; - /*! \brief Max number of bins of training data, used to determine - * which GPU kernel to use */ - int max_num_bin_; - /*! \brief Used GPU kernel bin size (64, 256) */ - int histogram_size_; - int device_bin_size_; - /*! \brief Size of histogram bin entry, depending if single or double precision is used */ - size_t hist_bin_entry_sz_; - /*! \brief Indices of all dense feature-groups */ - std::vector dense_feature_group_map_; - /*! \brief Indices of all sparse feature-groups */ - std::vector sparse_feature_group_map_; - /*! \brief GPU memory object holding the training data */ - std::vector device_features_; - /*! \brief GPU memory object holding the ordered gradient */ - std::vector device_gradients_; - /*! \brief GPU memory object holding the ordered hessian */ - std::vector device_hessians_; - /*! \brief A vector of feature mask. 1 = feature used, 0 = feature not used */ - std::vector feature_masks_; - /*! \brief GPU memory object holding the feature masks */ - std::vector device_feature_masks_; - /*! \brief Pointer to pinned memory of feature masks */ - char* ptr_pinned_feature_masks_ = nullptr; - /*! \brief GPU memory object holding indices of the leaf being processed */ - std::vector device_data_indices_; - /*! \brief GPU memory object holding counters for workgroup coordination */ - std::vector sync_counters_; - /*! \brief GPU memory object holding temporary sub-histograms per workgroup */ - std::vector device_subhistograms_; - /*! \brief Host memory object for histogram output (GPU will write to Host memory directly) */ - std::vector device_histogram_outputs_; - /*! \brief Host memory pointer for histogram outputs */ - void *host_histogram_outputs_; - /*! CUDA waitlist object for waiting for data transfer before kernel execution */ - std::vector kernel_wait_obj_; - /*! CUDA waitlist object for reading output histograms after kernel execution */ - std::vector histograms_wait_obj_; - /*! CUDA Asynchronous waiting object for copying indices */ - std::vector indices_future_; - /*! Asynchronous waiting object for copying gradients */ - std::vector gradients_future_; - /*! Asynchronous waiting object for copying Hessians */ - std::vector hessians_future_; - /*! 
Asynchronous waiting object for copying dense features */ - std::vector features_future_; - - // host-side buffer for converting feature data into featre4 data - int nthreads_; // number of Feature4* vector on host4_vecs_ - std::vector kernel_start_; - std::vector kernel_time_; // measure histogram kernel time - std::vector> kernel_input_wait_time_; - int num_gpu_; - int allocated_num_data_; // allocated data instances - pthread_t **cpu_threads_; // pthread, 1 cpu thread / gpu -}; - -} // namespace LightGBM -#else // USE_CUDA - -// When GPU support is not compiled in, quit with an error message - -namespace LightGBM { - -class CUDATreeLearner: public SerialTreeLearner { - public: - #pragma warning(disable : 4702) - explicit CUDATreeLearner(const Config* tree_config) : SerialTreeLearner(tree_config) { - Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" - "Please recompile with CMake option -DUSE_CUDA=1"); - } -}; - -} // namespace LightGBM - -#endif // USE_CUDA -#endif // LIGHTGBM_TREELEARNER_CUDA_TREE_LEARNER_H_ diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 7a21ed0691c3..5ca8a3f047f6 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -344,15 +344,7 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set* fo } bool use_subtract = parent_leaf_histogram_array_ != nullptr; -#ifdef USE_CUDA - if (LGBM_config_::current_learner == use_cpu_learner) { - SerialTreeLearner::ConstructHistograms(is_feature_used, use_subtract); - } else { - ConstructHistograms(is_feature_used, use_subtract); - } -#else ConstructHistograms(is_feature_used, use_subtract); -#endif FindBestSplitsFromHistograms(is_feature_used, use_subtract, tree); } From a9103a35b42f466b28aaa1d7b281abf3a0a34ebd Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 16 Jan 2023 23:00:22 -0600 Subject: [PATCH 05/17] comment out CI --- .appveyor.yml | 44 ---- .github/workflows/r_package.yml | 350 -------------------------------- .vsts-ci.yml | 138 ++++++------- 3 files changed, 69 insertions(+), 463 deletions(-) delete mode 100644 .appveyor.yml delete mode 100644 .github/workflows/r_package.yml diff --git a/.appveyor.yml b/.appveyor.yml deleted file mode 100644 index 2d279b0f33e3..000000000000 --- a/.appveyor.yml +++ /dev/null @@ -1,44 +0,0 @@ -version: 3.3.3.99.{build} - -image: Visual Studio 2015 -platform: x64 -configuration: # a trick to construct a build matrix with multiple Python versions - - '3.7' - -# only build pull requests and -# commits to 'master' or any branch starting with 'release' -branches: - only: - - master - - /^release/ - -environment: - matrix: - - COMPILER: MSVC - TASK: python - - COMPILER: MINGW - TASK: python - -clone_depth: 5 - -install: - - git submodule update --init --recursive # get `external_libs` folder - - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) - - set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH% - - set PYTHON_VERSION=%CONFIGURATION% - - set CONDA_ENV="test-env" - - ps: | - $env:MINICONDA = "C:\Miniconda3-x64" - $env:PATH = "$env:MINICONDA;$env:MINICONDA\Scripts;$env:PATH" - $env:BUILD_SOURCESDIRECTORY = "$env:APPVEYOR_BUILD_FOLDER" - $env:LGB_VER = (Get-Content $env:APPVEYOR_BUILD_FOLDER\VERSION.txt).trim() - -build: false - -test_script: - - conda config --remove channels defaults - - conda config --add channels nodefaults - - conda config --add channels conda-forge - - conda config --set 
channel_priority strict - - conda init powershell - - powershell.exe -ExecutionPolicy Bypass -File %APPVEYOR_BUILD_FOLDER%\.ci\test_windows.ps1 diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml deleted file mode 100644 index 1574786cac6f..000000000000 --- a/.github/workflows/r_package.yml +++ /dev/null @@ -1,350 +0,0 @@ -name: R-package - -on: - push: - branches: - - master - pull_request: - branches: - - master - - release/* - -# automatically cancel in-progress builds if another commit is pushed -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -env: - # hack to get around this: - # https://stat.ethz.ch/pipermail/r-package-devel/2020q3/005930.html - _R_CHECK_SYSTEM_CLOCK_: 0 - # ignore R CMD CHECK NOTE checking how long it has - # been since the last submission - _R_CHECK_CRAN_INCOMING_REMOTE_: 0 - # CRAN ignores the "installed size is too large" NOTE, - # so our CI can too. Setting to a large value here just - # to catch extreme problems - _R_CHECK_PKG_SIZES_THRESHOLD_: 100 - -jobs: - test: - name: ${{ matrix.task }} (${{ matrix.os }}, ${{ matrix.compiler }}, R ${{ matrix.r_version }}, ${{ matrix.build_type }}) - runs-on: ${{ matrix.os }} - container: ${{ matrix.container }} - timeout-minutes: 60 - strategy: - fail-fast: false - matrix: - include: - ################ - # CMake builds # - ################ - - os: ubuntu-latest - task: r-package - compiler: gcc - r_version: 3.6 - build_type: cmake - container: 'ubuntu:18.04' - - os: ubuntu-latest - task: r-package - compiler: gcc - r_version: 4.2 - build_type: cmake - container: 'ubuntu:22.04' - - os: ubuntu-latest - task: r-package - compiler: clang - r_version: 3.6 - build_type: cmake - container: 'ubuntu:18.04' - - os: ubuntu-latest - task: r-package - compiler: clang - r_version: 4.2 - build_type: cmake - container: 'ubuntu:22.04' - - os: macOS-latest - task: r-package - compiler: gcc - r_version: 3.6 - build_type: cmake - container: null - - os: macOS-latest - task: r-package - compiler: gcc - r_version: 4.2 - build_type: cmake - container: null - - os: macOS-latest - task: r-package - compiler: clang - r_version: 3.6 - build_type: cmake - container: null - - os: macOS-latest - task: r-package - compiler: clang - r_version: 4.2 - build_type: cmake - container: null - - os: windows-latest - task: r-package - compiler: MINGW - toolchain: MINGW - r_version: 3.6 - build_type: cmake - container: null - - os: windows-latest - task: r-package - compiler: MINGW - toolchain: MSYS - r_version: 4.2 - build_type: cmake - container: null - # Visual Studio 2019 - - os: windows-2019 - task: r-package - compiler: MSVC - toolchain: MSVC - r_version: 3.6 - build_type: cmake - container: null - # Visual Studio 2022 - - os: windows-2022 - task: r-package - compiler: MSVC - toolchain: MSVC - r_version: 4.2 - build_type: cmake - container: null - ############### - # CRAN builds # - ############### - - os: windows-latest - task: r-package - compiler: MINGW - toolchain: MINGW - r_version: 3.6 - build_type: cran - container: null - - os: windows-latest - task: r-package - compiler: MINGW - toolchain: MSYS - r_version: 4.2 - build_type: cran - container: null - - os: ubuntu-latest - task: r-package - compiler: gcc - r_version: 4.2 - build_type: cran - container: 'ubuntu:22.04' - - os: macOS-latest - task: r-package - compiler: clang - r_version: 4.2 - build_type: cran - container: null - ################ - # Other checks # - ################ - - os: ubuntu-latest - task: r-rchk - compiler: 
gcc - r_version: 4.2 - build_type: cran - container: 'ubuntu:22.04' - steps: - - name: Prevent conversion of line endings on Windows - if: startsWith(matrix.os, 'windows') - shell: pwsh - run: git config --global core.autocrlf false - - name: Install packages used by third-party actions - if: startsWith(matrix.os, 'ubuntu') - shell: bash - run: | - apt-get update -y - apt-get install --no-install-recommends -y \ - ca-certificates \ - dirmngr \ - gpg \ - gpg-agent \ - software-properties-common \ - sudo - # install newest version of git - # ref: - # - https://unix.stackexchange.com/a/170831/550004 - # - https://git-scm.com/download/linux - add-apt-repository ppa:git-core/ppa -y - apt-get update -y - apt-get install --no-install-recommends -y \ - git - - name: Trust git cloning LightGBM - if: startsWith(matrix.os, 'ubuntu') - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 5 - submodules: true - - name: Install pandoc - uses: r-lib/actions/setup-pandoc@v2 - - name: install tinytex - if: startsWith(matrix.os, 'windows') - uses: r-lib/actions/setup-tinytex@v2 - env: - CTAN_MIRROR: https://ctan.math.illinois.edu/systems/win32/miktex - TINYTEX_INSTALLER: TinyTeX - - name: Setup and run tests on Linux and macOS - if: matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest' - shell: bash - run: | - export TASK="${{ matrix.task }}" - export COMPILER="${{ matrix.compiler }}" - export GITHUB_ACTIONS="true" - if [[ "${{ matrix.os }}" == "macOS-latest" ]]; then - export OS_NAME="macos" - elif [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then - export OS_NAME="linux" - export IN_UBUNTU_BASE_CONTAINER="true" - # the default version of cmake provided on Ubuntu 18.04 (v3.10.2), is not supported by LightGBM - # see https://github.com/microsoft/LightGBM/issues/5642 - if [[ "${{ matrix.container }}" == "ubuntu:18.04" ]]; then - export INSTALL_CMAKE_FROM_RELEASES="true" - fi - fi - export BUILD_DIRECTORY="$GITHUB_WORKSPACE" - export R_VERSION="${{ matrix.r_version }}" - export R_BUILD_TYPE="${{ matrix.build_type }}" - $GITHUB_WORKSPACE/.ci/setup.sh - $GITHUB_WORKSPACE/.ci/test.sh - - name: Setup and run tests on Windows - if: startsWith(matrix.os, 'windows') - shell: pwsh -command ". 
{0}" - run: | - $env:BUILD_SOURCESDIRECTORY = $env:GITHUB_WORKSPACE - $env:LGB_VER = (Get-Content -TotalCount 1 $env:BUILD_SOURCESDIRECTORY\VERSION.txt).trim().replace('rc', '-') - $env:TOOLCHAIN = "${{ matrix.toolchain }}" - $env:R_VERSION = "${{ matrix.r_version }}" - $env:R_BUILD_TYPE = "${{ matrix.build_type }}" - $env:COMPILER = "${{ matrix.compiler }}" - $env:GITHUB_ACTIONS = "true" - $env:TASK = "${{ matrix.task }}" - & "$env:GITHUB_WORKSPACE/.ci/test_windows.ps1" - test-r-sanitizers: - name: r-sanitizers (ubuntu-latest, R-devel, ${{ matrix.compiler }} ASAN/UBSAN) - timeout-minutes: 60 - runs-on: ubuntu-latest - container: wch1/r-debug - strategy: - fail-fast: false - matrix: - include: - - r_customization: san - compiler: gcc - - r_customization: csan - compiler: clang - steps: - - name: Trust git cloning LightGBM - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 5 - submodules: true - - name: Install packages - shell: bash - run: | - RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" - sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }} - RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1 - - name: Run tests with sanitizers - shell: bash - run: | - cd R-package/tests - exit_code=0 - RDscript${{ matrix.r_customization }} testthat.R >> tests.log 2>&1 || exit_code=-1 - cat ./tests.log - exit ${exit_code} - test-r-debian-clang: - name: r-package (debian, R-devel, clang) - timeout-minutes: 60 - runs-on: ubuntu-latest - container: rhub/debian-clang-devel - steps: - - name: Install Git before checkout - shell: bash - run: | - apt-get update --allow-releaseinfo-change - apt-get install --no-install-recommends -y git - - name: Trust git cloning LightGBM - run: | - git config --global --add safe.directory "${GITHUB_WORKSPACE}" - - name: Checkout repository - uses: actions/checkout@v3 - with: - fetch-depth: 5 - submodules: true - - name: update to clang 15 - shell: bash - run: | - # remove clang stuff that comes installed in the image - apt-get autoremove -y --purge \ - clang-* \ - libclang-* \ - libunwind-* \ - llvm-* - # - # replace it all with clang-15 - apt-get update -y - apt-get install --no-install-recommends -y \ - gnupg \ - lsb-release \ - software-properties-common \ - wget - # - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - - # - add-apt-repository "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-15 main" - apt-get install -y --no-install-recommends \ - clang-15 \ - clangd-15 \ - clang-format-15 \ - clang-tidy-15 \ - clang-tools-15 \ - lldb-15 \ - lld-15 \ - llvm-15-dev \ - llvm-15-tools \ - libomp-15-dev \ - libc++-15-dev \ - libc++abi-15-dev \ - libclang-common-15-dev \ - libclang-15-dev \ - libclang-cpp15-dev \ - libunwind-15-dev - # overwrite everything in /usr/bin with the new v15 versions - cp --remove-destination /usr/lib/llvm-15/bin/* /usr/bin/ - - name: Install packages and run tests - shell: bash - run: | - export PATH=/opt/R-devel/bin/:${PATH} - Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" - sh build-cran-package.sh - R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit 
-1 - if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then - echo "NOTEs, WARNINGs, or ERRORs have been found by R CMD check" - exit -1 - fi - all-r-package-jobs-successful: - if: always() - runs-on: ubuntu-latest - needs: [test, test-r-sanitizers, test-r-debian-clang] - steps: - - name: Note that all tests succeeded - uses: re-actors/alls-green@v1.2.2 - with: - jobs: ${{ toJSON(needs) }} diff --git a/.vsts-ci.yml b/.vsts-ci.yml index 194aa5471131..26c687b0a20f 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -143,74 +143,74 @@ jobs: displayName: Setup - bash: $(Build.SourcesDirectory)/.ci/test.sh displayName: Test -########################################### -- job: QEMU_multiarch -########################################### - variables: - COMPILER: gcc - OS_NAME: 'linux' - PRODUCES_ARTIFACTS: 'true' - pool: - vmImage: ubuntu-22.04 - timeoutInMinutes: 180 - strategy: - matrix: - bdist: - TASK: bdist - ARCH: aarch64 - steps: - - script: | - sudo apt-get update - sudo apt-get install --no-install-recommends -y \ - binfmt-support \ - qemu \ - qemu-user \ - qemu-user-static - displayName: 'Install QEMU' - - script: | - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes - displayName: 'Enable Docker multi-architecture support' - - script: | - export ROOT_DOCKER_FOLDER=/LightGBM - cat > docker.env < docker-script.sh < docker.env < docker-script.sh < Date: Mon, 16 Jan 2023 23:50:17 -0600 Subject: [PATCH 06/17] remove more references --- include/LightGBM/bin.h | 4 +- include/LightGBM/cuda/cuda_algorithms.hpp | 4 +- include/LightGBM/cuda/cuda_column_data.hpp | 4 +- include/LightGBM/cuda/cuda_metadata.hpp | 4 +- include/LightGBM/cuda/cuda_metric.hpp | 4 +- .../LightGBM/cuda/cuda_objective_function.hpp | 4 +- include/LightGBM/cuda/cuda_random.hpp | 4 +- include/LightGBM/cuda/cuda_row_data.hpp | 4 +- include/LightGBM/cuda/cuda_split_info.hpp | 4 +- include/LightGBM/cuda/cuda_tree.hpp | 4 +- include/LightGBM/cuda/cuda_utils.h | 13 +--- include/LightGBM/cuda/vector_cudahost.h | 36 ++++----- include/LightGBM/dataset.h | 16 ++-- include/LightGBM/objective_function.h | 4 +- include/LightGBM/sample_strategy.h | 10 +-- include/LightGBM/train_share_states.h | 16 ++-- include/LightGBM/tree.h | 8 +- python-package/README.rst | 2 +- src/application/application.cpp | 2 +- src/boosting/bagging.hpp | 32 ++++---- src/boosting/cuda/cuda_score_updater.cpp | 4 +- src/boosting/cuda/cuda_score_updater.cu | 4 +- src/boosting/cuda/cuda_score_updater.hpp | 4 +- src/boosting/gbdt.cpp | 78 +++++++++---------- src/boosting/gbdt.h | 12 +-- src/boosting/goss.hpp | 20 ++--- src/cuda/cuda_algorithms.cu | 4 +- src/cuda/cuda_utils.cpp | 4 +- src/io/bin.cpp | 4 +- src/io/config.cpp | 21 ++--- src/io/cuda/cuda_column_data.cpp | 4 +- src/io/cuda/cuda_column_data.cu | 4 +- src/io/cuda/cuda_metadata.cpp | 4 +- src/io/cuda/cuda_row_data.cpp | 4 +- src/io/cuda/cuda_tree.cpp | 4 +- src/io/cuda/cuda_tree.cu | 4 +- src/io/dataset.cpp | 47 +++++------ src/io/dataset_loader.cpp | 12 +-- src/io/dense_bin.hpp | 2 +- src/io/metadata.cpp | 24 +++--- src/io/multi_val_dense_bin.hpp | 4 +- src/io/multi_val_sparse_bin.hpp | 4 +- src/io/train_share_states.cpp | 4 +- src/io/tree.cpp | 8 +- src/metric/cuda/cuda_binary_metric.cpp | 4 +- src/metric/cuda/cuda_binary_metric.hpp | 4 +- src/metric/cuda/cuda_pointwise_metric.cpp | 4 +- src/metric/cuda/cuda_pointwise_metric.cu | 4 +- src/metric/cuda/cuda_pointwise_metric.hpp | 4 +- src/metric/cuda/cuda_regression_metric.cpp | 4 +- src/metric/cuda/cuda_regression_metric.hpp | 4 
+- src/metric/metric.cpp | 50 ++++++------ src/objective/cuda/cuda_binary_objective.cpp | 4 +- src/objective/cuda/cuda_binary_objective.cu | 4 +- src/objective/cuda/cuda_binary_objective.hpp | 4 +- .../cuda/cuda_multiclass_objective.cpp | 4 +- .../cuda/cuda_multiclass_objective.cu | 4 +- .../cuda/cuda_multiclass_objective.hpp | 4 +- src/objective/cuda/cuda_rank_objective.cpp | 4 +- src/objective/cuda/cuda_rank_objective.cu | 4 +- src/objective/cuda/cuda_rank_objective.hpp | 4 +- .../cuda/cuda_regression_objective.cpp | 4 +- .../cuda/cuda_regression_objective.cu | 4 +- .../cuda/cuda_regression_objective.hpp | 4 +- .../cuda/cuda_best_split_finder.cpp | 4 +- .../cuda/cuda_best_split_finder.cu | 4 +- .../cuda/cuda_best_split_finder.hpp | 4 +- src/treelearner/cuda/cuda_data_partition.cpp | 4 +- src/treelearner/cuda/cuda_data_partition.cu | 4 +- src/treelearner/cuda/cuda_data_partition.hpp | 4 +- .../cuda/cuda_histogram_constructor.cpp | 4 +- .../cuda/cuda_histogram_constructor.cu | 4 +- .../cuda/cuda_histogram_constructor.hpp | 4 +- src/treelearner/cuda/cuda_leaf_splits.cpp | 4 +- src/treelearner/cuda/cuda_leaf_splits.cu | 4 +- src/treelearner/cuda/cuda_leaf_splits.hpp | 4 +- .../cuda/cuda_single_gpu_tree_learner.cpp | 4 +- .../cuda/cuda_single_gpu_tree_learner.cu | 4 +- .../cuda/cuda_single_gpu_tree_learner.hpp | 10 +-- src/treelearner/serial_tree_learner.h | 2 +- src/treelearner/tree_learner.cpp | 14 +--- tests/python_package_test/test_basic.py | 2 +- tests/python_package_test/test_dask.py | 2 +- tests/python_package_test/test_engine.py | 14 ++-- tests/python_package_test/test_sklearn.py | 6 +- tests/python_package_test/test_utilities.py | 8 +- 86 files changed, 331 insertions(+), 372 deletions(-) diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index 71b60c493504..705d83f29e0c 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -480,13 +480,13 @@ class MultiValBin { virtual MultiValBin* Clone() = 0; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA virtual const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const = 0; - #endif // USE_CUDA_EXP + #endif // USE_CUDA }; inline uint32_t BinMapper::ValueToBin(double value) const { diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index b302af8b63c1..ab3328bb5561 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ #define LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -577,5 +577,5 @@ __device__ VAL_T PercentileDevice(const VAL_T* values, } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_CUDA_CUDA_ALGORITHMS_HPP_ diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp index 0252669fe457..6668c92f2921 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #ifndef LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_ #define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_ @@ -137,4 +137,4 @@ class CUDAColumnData { #endif // LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_ -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index 6919d9723fb1..bc7339a84bf7 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_ #define LIGHTGBM_CUDA_CUDA_METADATA_HPP_ @@ -55,4 +55,4 @@ class CUDAMetadata { #endif // LIGHTGBM_CUDA_CUDA_METADATA_HPP_ -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_metric.hpp b/include/LightGBM/cuda/cuda_metric.hpp index caeff267e8ef..5eb04c81c777 100644 --- a/include/LightGBM/cuda/cuda_metric.hpp +++ b/include/LightGBM/cuda/cuda_metric.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_METRIC_HPP_ #define LIGHTGBM_CUDA_CUDA_METRIC_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -36,6 +36,6 @@ class CUDAMetricInterface: public HOST_METRIC { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_CUDA_CUDA_METRIC_HPP_ diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index 1010895e9d7a..dacaf252f8e6 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_ #define LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -73,6 +73,6 @@ class CUDAObjectiveInterface: public HOST_OBJECTIVE { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_CUDA_CUDA_OBJECTIVE_FUNCTION_HPP_ diff --git a/include/LightGBM/cuda/cuda_random.hpp b/include/LightGBM/cuda/cuda_random.hpp index 1f07d64452da..6c28e44bc2d3 100644 --- a/include/LightGBM/cuda/cuda_random.hpp +++ b/include/LightGBM/cuda/cuda_random.hpp @@ -5,7 +5,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ #define LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -69,6 +69,6 @@ class CUDARandom { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_CUDA_CUDA_RANDOM_HPP_ diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 3950a5328553..a7e487dae9f0 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #ifndef LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_ #define LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_ @@ -176,4 +176,4 @@ class CUDARowData { } // namespace LightGBM #endif // LIGHTGBM_CUDA_CUDA_ROW_DATA_HPP_ -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp index 5c525b431548..46b35ca37a59 100644 --- a/include/LightGBM/cuda/cuda_split_info.hpp +++ b/include/LightGBM/cuda/cuda_split_info.hpp @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #ifndef LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ #define LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ @@ -102,4 +102,4 @@ class CUDASplitInfo { #endif // LIGHTGBM_CUDA_CUDA_SPLIT_INFO_HPP_ -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp index d557798270e0..e2836baa2be5 100644 --- a/include/LightGBM/cuda/cuda_tree.hpp +++ b/include/LightGBM/cuda/cuda_tree.hpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #ifndef LIGHTGBM_CUDA_CUDA_TREE_HPP_ #define LIGHTGBM_CUDA_CUDA_TREE_HPP_ @@ -170,4 +170,4 @@ class CUDATree : public Tree { #endif // LIGHTGBM_CUDA_CUDA_TREE_HPP_ -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.h index d5b94bc89e4a..771e1561f767 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.h @@ -6,20 +6,15 @@ #ifndef LIGHTGBM_CUDA_CUDA_UTILS_H_ #define LIGHTGBM_CUDA_CUDA_UTILS_H_ -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) +#ifdef USE_CUDA #include #include #include #include -#endif // USE_CUDA || USE_CUDA_EXP - -#ifdef USE_CUDA_EXP #include -#endif // USE_CUDA_EXP namespace LightGBM { -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { @@ -27,9 +22,7 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = if (abort) exit(code); } } -#endif // USE_CUDA || USE_CUDA_EXP -#ifdef USE_CUDA_EXP #define CUDASUCCESS_OR_FATAL_OUTER(ans) { gpuAssert((ans), file, line); } void SetCUDADevice(int gpu_device_id, const char* file, int line); @@ -184,8 +177,8 @@ class CUDAVector { size_t size_; }; -#endif // USE_CUDA_EXP - } // namespace LightGBM +#endif // USE_CUDA + #endif // LIGHTGBM_CUDA_CUDA_UTILS_H_ diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 7c6e219cbbd9..6262a3bc9113 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -5,12 +5,11 @@ #ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ #define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ +#ifdef USE_CUDA #include -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) #include #include -#endif #include enum LGBM_Device { @@ -43,44 +42,36 @@ struct CHAllocator { T* ptr; if (n == 0) return NULL; n = SIZE_ALIGNED(n); - #if defined(USE_CUDA) || defined(USE_CUDA_EXP) - if (LGBM_config_::current_device == lgbm_device_cuda) { + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { - Log::Warning("Defaulting to malloc in CHAllocator!!!"); - ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); + Log::Warning("Defaulting to malloc in CHAllocator!!!"); + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); } - } else { + } else { ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); - } - #else - ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); - #endif + } return ptr; } void deallocate(T* p, std::size_t n) { (void)n; // UNUSED if (p == NULL) return; - #if defined(USE_CUDA) || defined(USE_CUDA_EXP) - if (LGBM_config_::current_device == lgbm_device_cuda) { + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaPointerAttributes 
attributes; cudaPointerGetAttributes(&attributes, p); #if CUDA_VERSION >= 10000 - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { cudaFreeHost(p); - } + } #else - if ((attributes.memoryType == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + if ((attributes.memoryType == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { cudaFreeHost(p); - } + } #endif - } else { + } else { _mm_free(p); - } - #else - _mm_free(p); - #endif + } } }; template @@ -90,4 +81,5 @@ bool operator!=(const CHAllocator&, const CHAllocator&); } // namespace LightGBM +#endif // USE_CUDA #endif // LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index 74e3e9c1dad4..97bc146c9a9e 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -277,13 +277,13 @@ class Metadata { /*! \brief Disable copy */ Metadata(const Metadata&) = delete; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA CUDAMetadata* cuda_metadata() const { return cuda_metadata_.get(); } void CreateCUDAMetadata(const int gpu_device_id); - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: /*! \brief Load wights from file */ @@ -329,9 +329,9 @@ class Metadata { bool weight_load_from_file_; bool query_load_from_file_; bool init_score_load_from_file_; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA std::unique_ptr cuda_metadata_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA }; @@ -910,13 +910,13 @@ class Dataset { return feature_groups_[feature_group_index]->feature_min_bin(sub_feature_index); } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const CUDAColumnData* cuda_column_data() const { return cuda_column_data_.get(); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: void CreateCUDAColumnData(); @@ -968,9 +968,9 @@ class Dataset { /*! \brief mutex for threading safe call */ std::mutex mutex_; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA std::unique_ptr cuda_column_data_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA std::string parser_config_str_; }; diff --git a/include/LightGBM/objective_function.h b/include/LightGBM/objective_function.h index 376a6f1a071d..ad188dc39676 100644 --- a/include/LightGBM/objective_function.h +++ b/include/LightGBM/objective_function.h @@ -97,7 +97,7 @@ class ObjectiveFunction { */ virtual bool IsCUDAObjective() const { return false; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA /*! * \brief Convert output for CUDA version */ @@ -107,7 +107,7 @@ class ObjectiveFunction { virtual bool NeedConvertOutputCUDA () const { return false; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA }; } // namespace LightGBM diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index 765632f7ecbf..51d3cbc16f23 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -38,9 +38,9 @@ class SampleStrategy { std::vector>& bag_data_indices() { return bag_data_indices_; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA CUDAVector& cuda_bag_data_indices() { return cuda_bag_data_indices_; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA void UpdateObjectiveFunction(const ObjectiveFunction* objective_function) { objective_function_ = objective_function; @@ -72,10 +72,10 @@ class SampleStrategy { /*! \brief whether need to resize the gradient vectors */ bool need_resize_gradients_; - #ifdef USE_CUDA_EXP - /*! \brief Buffer for bag_data_indices_ on GPU, used only with cuda_exp */ + #ifdef USE_CUDA + /*! 
\brief Buffer for bag_data_indices_ on GPU, used only with cuda */ CUDAVector cuda_bag_data_indices_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA }; } // namespace LightGBM diff --git a/include/LightGBM/train_share_states.h b/include/LightGBM/train_share_states.h index 5c14c7d51a47..8c50734695b2 100644 --- a/include/LightGBM/train_share_states.h +++ b/include/LightGBM/train_share_states.h @@ -126,7 +126,7 @@ class MultiValBinWrapper { } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const void* GetRowWiseData( uint8_t* bit_type, size_t* total_size, @@ -142,7 +142,7 @@ class MultiValBinWrapper { return multi_val_bin_->GetRowWiseData(bit_type, total_size, is_sparse, out_data_ptr, data_ptr_bit_type); } } - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: bool is_use_subcol_ = false; @@ -183,9 +183,9 @@ struct TrainingShareStates { const std::vector& feature_hist_offsets() const { return feature_hist_offsets_; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const std::vector& column_hist_offsets() const { return column_hist_offsets_; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA bool IsSparseRowwise() { return (multi_val_bin_wrapper_ != nullptr && multi_val_bin_wrapper_->IsSparse()); @@ -235,7 +235,7 @@ struct TrainingShareStates { } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, @@ -250,13 +250,13 @@ struct TrainingShareStates { return nullptr; } } - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: std::vector feature_hist_offsets_; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA std::vector column_hist_offsets_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA int num_hist_total_bin_ = 0; std::unique_ptr multi_val_bin_wrapper_; std::vector> hist_buf_; diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 3e403b16e89b..13b3c41a2309 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -319,9 +319,9 @@ class Tree { inline bool is_linear() const { return is_linear_; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA inline bool is_cuda_tree() const { return is_cuda_tree_; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA inline void SetIsLinear(bool is_linear) { is_linear_ = is_linear; @@ -532,10 +532,10 @@ class Tree { std::vector> leaf_features_; /* \brief features used in leaf linear models; indexing is relative to used_features_ */ std::vector> leaf_features_inner_; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA /*! \brief Marks whether this tree is a CUDATree */ bool is_cuda_tree_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA }; inline void Tree::Split(int leaf, int feature, int real_feature, diff --git a/python-package/README.rst b/python-package/README.rst index 4494eb43a172..8021add44c63 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -125,7 +125,7 @@ All requirements from `Build from Sources section <#build-from-sources>`__ apply Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``--cuda`` with ``--cuda-exp`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries. Note that this new version uses twice the memory, since it stores data row-wise as well as column-wise in memory to improve performance (see this `issue `__ for discussion). -To use the regular or experimental CUDA versions within Python, pass ``{"device": "cuda"}`` or ``{"device": "cuda_exp"}`` respectively as parameters. 
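As an editor's illustration only (not text from the README), the same ``device=cuda`` key-value pair also reaches the native library when training through the C API. The function names below are taken from LightGBM's ``c_api.h`` as the editor recalls them, so treat the exact signatures as an assumption; the data file name and the remaining parameters are placeholders::

    #include <LightGBM/c_api.h>
    #include <cstdio>

    int main() {
      DatasetHandle train_data = nullptr;
      BoosterHandle booster = nullptr;
      // "device=cuda" selects the CUDA learner after this rename (formerly "cuda_exp").
      const char* params = "objective=binary device=cuda num_leaves=31";
      if (LGBM_DatasetCreateFromFile("train.bin", params, nullptr, &train_data) != 0) {
        std::fprintf(stderr, "failed to load dataset\n");
        return 1;
      }
      LGBM_BoosterCreate(train_data, params, &booster);
      int is_finished = 0;
      for (int iter = 0; iter < 10; ++iter) {
        LGBM_BoosterUpdateOneIter(booster, &is_finished);
      }
      LGBM_BoosterFree(booster);
      LGBM_DatasetFree(train_data);
      return 0;
    }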
+To use the CUDA version within Python, pass ``{"device": "cuda"}`` respectively in parameters. Build HDFS Version ~~~~~~~~~~~~~~~~~~ diff --git a/src/application/application.cpp b/src/application/application.cpp index d69efb5017ae..3e51136afc96 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -36,7 +36,7 @@ Application::Application(int argc, char** argv) { Log::Fatal("No training/prediction data, application quit"); } - if (config_.device_type == std::string("cuda") || config_.device_type == std::string("cuda_exp")) { + if (config_.device_type == std::string("cuda")) { LGBM_config_::current_device = lgbm_device_cuda; } } diff --git a/src/boosting/bagging.hpp b/src/boosting/bagging.hpp index 65a937435105..4c2c81553e7c 100644 --- a/src/boosting/bagging.hpp +++ b/src/boosting/bagging.hpp @@ -47,33 +47,33 @@ class BaggingSampleStrategy : public SampleStrategy { Log::Debug("Re-bagging, using %d data to train", bag_data_cnt_); // set bagging data to tree learner if (!is_use_subset_) { - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } else { // get subset tmp_subset_->ReSize(bag_data_cnt_); tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), bag_data_cnt_); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } } } @@ -103,11 +103,11 @@ class BaggingSampleStrategy : public SampleStrategy { bag_data_cnt_ = static_cast(config_->bagging_fraction * num_data_); } bag_data_indices_.resize(num_data_); - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { cuda_bag_data_indices_.Resize(num_data_); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA bagging_runner_.ReSize(num_data_); bagging_rands_.clear(); for (int i = 0; @@ -118,7 +118,7 @@ class BaggingSampleStrategy : public SampleStrategy { double average_bag_rate = (static_cast(bag_data_cnt_) / num_data_) / config_->bagging_freq; is_use_subset_ = false; - if (config_->device_type != std::string("cuda_exp")) { + if (config_->device_type != std::string("cuda")) { const int group_threshold_usesubset = 100; const double average_bag_rate_threshold = 0.5; if (average_bag_rate <= average_bag_rate_threshold @@ -141,9 +141,9 @@ class BaggingSampleStrategy : public SampleStrategy { } else { bag_data_cnt_ = num_data_; bag_data_indices_.clear(); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA cuda_bag_data_indices_.Clear(); - #endif // 
USE_CUDA_EXP + #endif // USE_CUDA bagging_runner_.ReSize(0); is_use_subset_ = false; } diff --git a/src/boosting/cuda/cuda_score_updater.cpp b/src/boosting/cuda/cuda_score_updater.cpp index 9c514265ee40..d29fe9f4e394 100644 --- a/src/boosting/cuda/cuda_score_updater.cpp +++ b/src/boosting/cuda/cuda_score_updater.cpp @@ -5,7 +5,7 @@ #include "cuda_score_updater.hpp" -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA namespace LightGBM { @@ -91,4 +91,4 @@ inline void CUDAScoreUpdater::MultiplyScore(double val, int cur_tree_id) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/boosting/cuda/cuda_score_updater.cu b/src/boosting/cuda/cuda_score_updater.cu index c2138957f199..a1fd3638d20a 100644 --- a/src/boosting/cuda/cuda_score_updater.cu +++ b/src/boosting/cuda/cuda_score_updater.cu @@ -5,7 +5,7 @@ #include "cuda_score_updater.hpp" -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA namespace LightGBM { @@ -42,4 +42,4 @@ void CUDAScoreUpdater::LaunchMultiplyScoreConstantKernel(const double val, const } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/boosting/cuda/cuda_score_updater.hpp b/src/boosting/cuda/cuda_score_updater.hpp index 01326ab1d354..ec728777e66c 100644 --- a/src/boosting/cuda/cuda_score_updater.hpp +++ b/src/boosting/cuda/cuda_score_updater.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_ #define LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -60,6 +60,6 @@ class CUDAScoreUpdater: public ScoreUpdater { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_BOOSTING_CUDA_CUDA_SCORE_UPDATER_HPP_ diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 72e16ee7e707..fbaf337eea60 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -68,14 +68,12 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective es_first_metric_only_ = config_->first_metric_only; shrinkage_rate_ = config_->learning_rate; - if (config_->device_type == std::string("cuda") || config_->device_type == std::string("cuda_exp")) { + if (config_->device_type == std::string("cuda")) { LGBM_config_::current_learner = use_cuda_learner; - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { - const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); - } - #endif // USE_CUDA_EXP + #ifdef USE_CUDA + const int gpu_device_id = config_->gpu_device_id >= 0 ? 
config_->gpu_device_id : 0; + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + #endif // USE_CUDA } // load forced_splits file @@ -116,15 +114,15 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective } training_metrics_.shrink_to_fit(); - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu_)); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA num_data_ = train_data_->num_data(); @@ -186,11 +184,11 @@ void GBDT::AddValidDataset(const Dataset* valid_data, } // for a validation dataset, we need its score and metric auto new_score_updater = - #ifdef USE_CUDA_EXP - config_->device_type == std::string("cuda_exp") ? + #ifdef USE_CUDA + config_->device_type == std::string("cuda") ? std::unique_ptr(new CUDAScoreUpdater(valid_data, num_tree_per_iteration_, objective_function_ != nullptr && objective_function_->IsCUDAObjective())) : - #endif // USE_CUDA_EXP + #endif // USE_CUDA std::unique_ptr(new ScoreUpdater(valid_data, num_tree_per_iteration_)); // update score for (int i = 0; i < iter_; ++i) { @@ -481,15 +479,15 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); // we need to predict out-of-bag scores of data for boosting if (num_data_ - bag_data_cnt > 0) { - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { train_score_updater_->AddScore(tree, data_sample_strategy_->cuda_bag_data_indices().RawData() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA train_score_updater_->AddScore(tree, data_sample_strategy_->bag_data_indices().data() + bag_data_cnt, num_data_ - bag_data_cnt, cur_tree_id); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } } else { @@ -503,17 +501,17 @@ void GBDT::UpdateScore(const Tree* tree, const int cur_tree_id) { } } -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t num_data) const { #else std::vector GBDT::EvalOneMetric(const Metric* metric, const double* score, const data_size_t /*num_data*/) const { -#endif // USE_CUDA_EXP - #ifdef USE_CUDA_EXP +#endif // USE_CUDA + #ifdef USE_CUDA const bool evaluation_on_cuda = metric->IsCUDAMetric(); if ((boosting_on_gpu_ && evaluation_on_cuda) || (!boosting_on_gpu_ && !evaluation_on_cuda)) { - #endif // USE_CUDA_EXP + #endif // USE_CUDA return metric->Eval(score, objective_function_); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } else if (boosting_on_gpu_ && !evaluation_on_cuda) { const size_t total_size = static_cast(num_data) * static_cast(num_tree_per_iteration_); if (total_size > host_score_.size()) { @@ -529,7 +527,7 @@ std::vector GBDT::EvalOneMetric(const Metric* metric, const double* scor CopyFromHostToCUDADevice(cuda_score_.RawData(), score, total_size, __FILE__, __LINE__); return metric->Eval(cuda_score_.RawData(), objective_function_); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } std::string GBDT::OutputMetric(int iter) { @@ -660,14 +658,14 @@ void 
GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) { num_data = valid_score_updater_[used_idx]->num_data(); *out_len = static_cast(num_data) * num_class_; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA std::vector host_raw_scores; if (boosting_on_gpu_) { host_raw_scores.resize(static_cast(*out_len), 0.0); CopyFromCUDADeviceToHost(host_raw_scores.data(), raw_scores, static_cast(*out_len), __FILE__, __LINE__); raw_scores = host_raw_scores.data(); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA if (objective_function_ != nullptr) { #pragma omp parallel for schedule(static) for (data_size_t i = 0; i < num_data; ++i) { @@ -730,26 +728,26 @@ void GBDT::ResetTrainingData(const Dataset* train_data, const ObjectiveFunction* } training_metrics_.shrink_to_fit(); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA boosting_on_gpu_ = objective_function_ != nullptr && objective_function_->IsCUDAObjective() && !data_sample_strategy_->IsHessianChange(); // for sample strategy with Hessian change, fall back to boosting on CPU tree_learner_->ResetBoostingOnGPU(boosting_on_gpu_); - #endif // USE_CUDA_EXP + #endif // USE_CUDA if (train_data != train_data_) { train_data_ = train_data; data_sample_strategy_->UpdateTrainingData(train_data); // not same training data, need reset score and others // create score tracker - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { train_score_updater_.reset(new CUDAScoreUpdater(train_data_, num_tree_per_iteration_, boosting_on_gpu_)); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA train_score_updater_.reset(new ScoreUpdater(train_data_, num_tree_per_iteration_)); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA // update score for (int i = 0; i < iter_; ++i) { @@ -827,8 +825,8 @@ void GBDT::ResetGradientBuffers() { const bool is_use_subset = data_sample_strategy_->is_use_subset(); const data_size_t bag_data_cnt = data_sample_strategy_->bag_data_cnt(); if (objective_function_ != nullptr) { - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp") && boosting_on_gpu_) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda") && boosting_on_gpu_) { if (cuda_gradients_.Size() < total_size) { cuda_gradients_.Resize(total_size); cuda_hessians_.Resize(total_size); @@ -836,16 +834,16 @@ void GBDT::ResetGradientBuffers() { gradients_pointer_ = cuda_gradients_.RawData(); hessians_pointer_ = cuda_hessians_.RawData(); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA if (gradients_.size() < total_size) { gradients_.resize(total_size); hessians_.resize(total_size); } gradients_pointer_ = gradients_.data(); hessians_pointer_ = hessians_.data(); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } else if (data_sample_strategy_->IsHessianChange() || (is_use_subset && bag_data_cnt < num_data_ && !boosting_on_gpu_)) { if (gradients_.size() < total_size) { gradients_.resize(total_size); diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index d71245980b36..1f784b94e593 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -542,7 +542,7 @@ class GBDT : public GBDTBase { /*! \brief Parser config file content */ std::string parser_config_str_ = ""; -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) +#ifdef USE_CUDA /*! \brief First order derivative of training data */ std::vector> gradients_; /*! 
\brief Second order derivative of training data */ @@ -557,18 +557,18 @@ class GBDT : public GBDTBase { score_t* gradients_pointer_; /*! \brief Pointer to hessian vector, can be on CPU or GPU */ score_t* hessians_pointer_; - /*! \brief Whether boosting is done on GPU, used for cuda_exp */ + /*! \brief Whether boosting is done on GPU, used for cuda */ bool boosting_on_gpu_; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA /*! \brief Gradient vector on GPU */ CUDAVector cuda_gradients_; /*! \brief Hessian vector on GPU */ CUDAVector cuda_hessians_; - /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda_exp */ + /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda */ mutable std::vector host_score_; - /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda_exp */ + /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda */ mutable CUDAVector cuda_score_; - #endif // USE_CUDA_EXP + #endif // USE_CUDA /*! \brief Number of training data */ data_size_t num_data_; diff --git a/src/boosting/goss.hpp b/src/boosting/goss.hpp index 34b099e051bb..9d55d89ee097 100644 --- a/src/boosting/goss.hpp +++ b/src/boosting/goss.hpp @@ -43,33 +43,33 @@ class GOSSStrategy : public SampleStrategy { bag_data_cnt_ = left_cnt; // set bagging data to tree learner if (!is_use_subset_) { - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); tree_learner->SetBaggingData(nullptr, cuda_bag_data_indices_.RawData(), bag_data_cnt_); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA tree_learner->SetBaggingData(nullptr, bag_data_indices_.data(), bag_data_cnt_); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } else { // get subset tmp_subset_->ReSize(bag_data_cnt_); tmp_subset_->CopySubrow(train_data_, bag_data_indices_.data(), bag_data_cnt_, false); - #ifdef USE_CUDA_EXP - if (config_->device_type == std::string("cuda_exp")) { + #ifdef USE_CUDA + if (config_->device_type == std::string("cuda")) { CopyFromHostToCUDADevice(cuda_bag_data_indices_.RawData(), bag_data_indices_.data(), static_cast(num_data_), __FILE__, __LINE__); tree_learner->SetBaggingData(tmp_subset_.get(), cuda_bag_data_indices_.RawData(), bag_data_cnt_); } else { - #endif // USE_CUDA_EXP + #endif // USE_CUDA tree_learner->SetBaggingData(tmp_subset_.get(), bag_data_indices_.data(), bag_data_cnt_); - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } } diff --git a/src/cuda/cuda_algorithms.cu b/src/cuda/cuda_algorithms.cu index 5a6b3eb74ef0..19c1507419e9 100644 --- a/src/cuda/cuda_algorithms.cu +++ b/src/cuda/cuda_algorithms.cu @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -509,4 +509,4 @@ template __device__ double PercentileDevice @@ -28,4 +28,4 @@ void SetCUDADevice(int gpu_device_id, const char* file, int line) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/bin.cpp b/src/io/bin.cpp index a5430c483d3b..652b874c94d8 100644 --- a/src/io/bin.cpp +++ b/src/io/bin.cpp @@ -886,7 +886,7 @@ namespace LightGBM { return nullptr; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA template <> const void* MultiValDenseBin::GetRowWiseData(uint8_t* bit_type, size_t* total_size, @@ -1081,6 +1081,6 @@ namespace LightGBM { return to_return; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } // namespace LightGBM diff --git a/src/io/config.cpp b/src/io/config.cpp index 8827414c2e99..ba2672fcaef4 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -178,7 +178,13 @@ void GetDeviceType(const std::unordered_map& params, s } else if (value == std::string("cuda")) { *device_type = "cuda"; } else if (value == std::string("cuda_exp")) { - *device_type = "cuda_exp"; + Log::Warning( + "Found device_type='cuda_exp' passed through params. " + "That is an alias for device_type='cuda'. " + "Use device_type='cuda' to suppress this warning. " + "In the future, this warning will become an error. " + ); + *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); } @@ -260,7 +266,7 @@ void Config::Set(const std::unordered_map& params) { GetObjectiveType(params, &objective); GetMetricType(params, objective, &metric); GetDeviceType(params, &device_type); - if (device_type == std::string("cuda") || device_type == std::string("cuda_exp")) { + if (device_type == std::string("cuda")) { LGBM_config_::current_device = lgbm_device_cuda; } GetTreeLearnerType(params, &tree_learner); @@ -373,26 +379,21 @@ void Config::CheckParamConflict() { num_leaves = static_cast(full_num_leaves); } } - if (device_type == std::string("gpu") || device_type == std::string("cuda")) { + if (device_type == std::string("gpu")) { // force col-wise for gpu, and cuda version force_col_wise = true; force_row_wise = false; if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } - } else if (device_type == std::string("cuda_exp")) { - // force row-wise for cuda_exp version + } else if (device_type == std::string("cuda")) { + // force row-wise for cuda version force_col_wise = false; force_row_wise = true; if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } } - // force gpu_use_dp for CUDA - if (device_type == std::string("cuda") && !gpu_use_dp) { - Log::Warning("CUDA currently requires double precision calculations."); - gpu_use_dp = true; - } // linear tree learner must be serial type and run on CPU device if (linear_tree) { if (device_type != std::string("cpu")) { diff --git a/src/io/cuda/cuda_column_data.cpp b/src/io/cuda/cuda_column_data.cpp index c4b0bb62e584..a1080cb2b902 100644 --- a/src/io/cuda/cuda_column_data.cpp +++ b/src/io/cuda/cuda_column_data.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -308,4 +308,4 @@ void CUDAColumnData::InitColumnMetaInfo() { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_column_data.cu b/src/io/cuda/cuda_column_data.cu index 3ab70e9a5758..75ff6234e09e 100644 --- a/src/io/cuda/cuda_column_data.cu +++ b/src/io/cuda/cuda_column_data.cu @@ -4,7 +4,7 @@ */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -58,4 +58,4 @@ void CUDAColumnData::LaunchCopySubrowKernel(void* const* in_cuda_data_by_column) } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_metadata.cpp b/src/io/cuda/cuda_metadata.cpp index 2a3dd380254a..02cd42cf805f 100644 --- a/src/io/cuda/cuda_metadata.cpp +++ b/src/io/cuda/cuda_metadata.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -89,4 +89,4 @@ void CUDAMetadata::SetInitScore(const double* init_score, data_size_t len) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_row_data.cpp b/src/io/cuda/cuda_row_data.cpp index 3c66a164d9d4..30bbb273193a 100644 --- a/src/io/cuda/cuda_row_data.cpp +++ b/src/io/cuda/cuda_row_data.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -474,4 +474,4 @@ template const uint64_t* CUDARowData::GetPartitionPtr() const; } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_tree.cpp b/src/io/cuda/cuda_tree.cpp index 196563340ae5..923e51961e0b 100644 --- a/src/io/cuda/cuda_tree.cpp +++ b/src/io/cuda/cuda_tree.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -337,4 +337,4 @@ void CUDATree::AsConstantTree(double val) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/cuda/cuda_tree.cu b/src/io/cuda/cuda_tree.cu index 2a6448259d7f..62020c3a09ae 100644 --- a/src/io/cuda/cuda_tree.cu +++ b/src/io/cuda/cuda_tree.cu @@ -4,7 +4,7 @@ */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -456,4 +456,4 @@ void CUDATree::LaunchAddPredictionToScoreKernel( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index a237e79b2680..061bdd2464bd 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -345,9 +345,9 @@ void Dataset::Construct(std::vector>* bin_mappers, auto features_in_group = OneFeaturePerGroup(used_features); auto is_sparse = io_config.is_enable_sparse; - if (io_config.device_type == std::string("cuda") || io_config.device_type == std::string("cuda_exp")) { + if (io_config.device_type == std::string("cuda")) { LGBM_config_::current_device = lgbm_device_cuda; - if ((io_config.device_type == std::string("cuda") || io_config.device_type == std::string("cuda_exp")) && is_sparse) { + if ((io_config.device_type == std::string("cuda")) && is_sparse) { Log::Warning("Using sparse features with CUDA is currently not supported."); is_sparse = false; } @@ -355,8 +355,7 @@ void Dataset::Construct(std::vector>* bin_mappers, std::vector group_is_multi_val(used_features.size(), 0); if (io_config.enable_bundle && !used_features.empty()) { - bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda") - || io_config.device_type == std::string("cuda_exp"); + bool lgbm_is_gpu_used = io_config.device_type == std::string("gpu") || io_config.device_type == std::string("cuda"); features_in_group = FastFeatureBundling( *bin_mappers, sample_non_zero_indices, sample_values, num_per_col, num_sample_col, static_cast(total_sample_cnt), @@ -447,14 +446,10 @@ void Dataset::FinishLoad() { } metadata_.FinishLoad(); - #ifdef USE_CUDA_EXP - if (device_type_ == std::string("cuda_exp")) { - CreateCUDAColumnData(); - metadata_.CreateCUDAMetadata(gpu_device_id_); - } else { - cuda_column_data_.reset(nullptr); - } - #endif // USE_CUDA_EXP + #ifdef USE_CUDA + CreateCUDAColumnData(); + metadata_.CreateCUDAMetadata(gpu_device_id_); + #endif // USE_CUDA is_finish_load_ = true; } @@ -862,15 +857,13 @@ void Dataset::CopySubrow(const Dataset* fullset, device_type_ = fullset->device_type_; gpu_device_id_ = fullset->gpu_device_id_; - #ifdef USE_CUDA_EXP - if (device_type_ == std::string("cuda_exp")) { - if (cuda_column_data_ == nullptr) { - cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); - metadata_.CreateCUDAMetadata(gpu_device_id_); - } - cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); + #ifdef USE_CUDA + if (cuda_column_data_ == nullptr) { + cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); + metadata_.CreateCUDAMetadata(gpu_device_id_); } - #endif // USE_CUDA_EXP + cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); + #endif // USE_CUDA } bool Dataset::SetFloatField(const char* field_name, const float* field_data, @@ -1508,13 +1501,9 @@ void Dataset::AddFeaturesFrom(Dataset* other) { raw_data_.push_back(other->raw_data_[i]); } } - #ifdef USE_CUDA_EXP - if (device_type_ == std::string("cuda_exp")) { - CreateCUDAColumnData(); - } 
else { - cuda_column_data_ = nullptr; - } - #endif // USE_CUDA_EXP + #ifdef USE_CUDA + CreateCUDAColumnData(); + #endif // USE_CUDA } const void* Dataset::GetColWiseData( @@ -1536,7 +1525,7 @@ const void* Dataset::GetColWiseData( return feature_groups_[feature_group_index]->GetColWiseData(sub_feature_index, bit_type, is_sparse, bin_iterator); } -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA void Dataset::CreateCUDAColumnData() { cuda_column_data_.reset(new CUDAColumnData(num_data_, gpu_device_id_)); int num_columns = 0; @@ -1671,6 +1660,6 @@ void Dataset::CreateCUDAColumnData() { feature_to_column); } -#endif // USE_CUDA_EXP +#endif // USE_CUDA } // namespace LightGBM diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 246424600b03..e384af9e0792 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -279,14 +279,10 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac dataset->device_type_ = config_.device_type; dataset->gpu_device_id_ = config_.gpu_device_id; - #ifdef USE_CUDA_EXP - if (config_.device_type == std::string("cuda_exp")) { - dataset->CreateCUDAColumnData(); - dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); - } else { - dataset->cuda_column_data_ = nullptr; - } - #endif // USE_CUDA_EXP + #ifdef USE_CUDA + dataset->CreateCUDAColumnData(); + dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); + #endif // USE_CUDA } // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); diff --git a/src/io/dense_bin.hpp b/src/io/dense_bin.hpp index 5d95d9dc6073..0ebcdc1a6181 100644 --- a/src/io/dense_bin.hpp +++ b/src/io/dense_bin.hpp @@ -467,7 +467,7 @@ class DenseBin : public Bin { private: data_size_t num_data_; -#if defined(USE_CUDA) || defined(USE_CUDA_EXP) +#ifdef USE_CUDA std::vector> data_; #else std::vector> data_; diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 053d1b43c104..07e9701b1ca6 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -18,9 +18,9 @@ Metadata::Metadata() { weight_load_from_file_ = false; query_load_from_file_ = false; init_score_load_from_file_ = false; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA cuda_metadata_ = nullptr; - #endif // USE_CUDA_EXP + #endif // USE_CUDA } void Metadata::Init(const char* data_filename) { @@ -344,11 +344,11 @@ void Metadata::SetInitScore(const double* init_score, data_size_t len) { init_score_[i] = Common::AvoidInf(init_score[i]); } init_score_load_from_file_ = false; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { cuda_metadata_->SetInitScore(init_score_.data(), len); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) { @@ -387,11 +387,11 @@ void Metadata::SetLabel(const label_t* label, data_size_t len) { for (data_size_t i = 0; i < num_data_; ++i) { label_[i] = Common::AvoidInf(label[i]); } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { cuda_metadata_->SetLabel(label_.data(), len); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } void Metadata::InsertLabels(const label_t* labels, data_size_t start_index, data_size_t len) { @@ -428,11 +428,11 @@ void Metadata::SetWeights(const label_t* weights, data_size_t len) { } CalculateQueryWeights(); weight_load_from_file_ = false; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { cuda_metadata_->SetWeights(weights_.data(), len); } - #endif // USE_CUDA_EXP + 
#endif // USE_CUDA } void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, data_size_t len) { @@ -477,7 +477,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { } CalculateQueryWeights(); query_load_from_file_ = false; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA if (cuda_metadata_ != nullptr) { if (query_weights_.size() > 0) { CHECK_EQ(query_weights_.size(), static_cast(num_queries_)); @@ -486,7 +486,7 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) { cuda_metadata_->SetQuery(query_boundaries_.data(), nullptr, num_queries_); } } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } void Metadata::InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len) { @@ -635,12 +635,12 @@ void Metadata::FinishLoad() { CalculateQueryBoundaries(); } -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA void Metadata::CreateCUDAMetadata(const int gpu_device_id) { cuda_metadata_.reset(new CUDAMetadata(gpu_device_id)); cuda_metadata_->Init(label_, weights_, query_boundaries_, query_weights_, init_score_); } -#endif // USE_CUDA_EXP +#endif // USE_CUDA void Metadata::LoadFromMemory(const void* memory) { const char* mem_ptr = reinterpret_cast(memory); diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 8de9cf305952..b4fbfbe673aa 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -211,13 +211,13 @@ class MultiValDenseBin : public MultiValBin { MultiValDenseBin* Clone() override; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const override; - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: data_size_t num_data_; diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index 80acbb681ab6..eaa30ef0a0cc 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -292,13 +292,13 @@ class MultiValSparseBin : public MultiValBin { MultiValSparseBin* Clone() override; - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA const void* GetRowWiseData(uint8_t* bit_type, size_t* total_size, bool* is_sparse, const void** out_data_ptr, uint8_t* data_ptr_bit_type) const override; - #endif // USE_CUDA_EXP + #endif // USE_CUDA private: data_size_t num_data_; diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 199424733f80..f6462697a93d 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -382,9 +382,9 @@ void TrainingShareStates::CalcBinOffsets(const std::vector(feature_hist_offsets_.back()); } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA column_hist_offsets_ = *offsets; - #endif // USE_CUDA_EXP + #endif // USE_CUDA } void TrainingShareStates::SetMultiValBin(MultiValBin* bin, data_size_t num_data, diff --git a/src/io/tree.cpp b/src/io/tree.cpp index 39b5c23d4d1c..ce45d20cf454 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -53,9 +53,9 @@ Tree::Tree(int max_leaves, bool track_branch_features, bool is_linear) leaf_features_.resize(max_leaves_); leaf_features_inner_.resize(max_leaves_); } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA is_cuda_tree_ = false; - #endif // USE_CUDA_EXP + #endif // USE_CUDA } int Tree::Split(int leaf, int feature, int real_feature, uint32_t threshold_bin, @@ -731,9 +731,9 @@ Tree::Tree(const char* str, size_t* used_len) { is_linear_ = false; } - #ifdef USE_CUDA_EXP + #ifdef USE_CUDA is_cuda_tree_ = false; - #endif // USE_CUDA_EXP + #endif // USE_CUDA if 
((num_leaves_ <= 1) && !is_linear_) { return; diff --git a/src/metric/cuda/cuda_binary_metric.cpp b/src/metric/cuda/cuda_binary_metric.cpp index d526fddeecb2..cedf909b9892 100644 --- a/src/metric/cuda/cuda_binary_metric.cpp +++ b/src/metric/cuda/cuda_binary_metric.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_binary_metric.hpp" @@ -28,4 +28,4 @@ std::vector CUDABinaryMetricInterface::Eval(co } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/metric/cuda/cuda_binary_metric.hpp b/src/metric/cuda/cuda_binary_metric.hpp index ae50dac381dd..72d9edc8b40e 100644 --- a/src/metric/cuda/cuda_binary_metric.hpp +++ b/src/metric/cuda/cuda_binary_metric.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ #define LIGHTGBM_METRIC_CUDA_CUDA_BINARY_METRIC_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -52,6 +52,6 @@ class CUDABinaryLoglossMetric: public CUDABinaryMetricInterface @@ -66,4 +66,4 @@ template void CUDAPointwiseMetricInterface #include @@ -38,6 +38,6 @@ class CUDAPointwiseMetricInterface: public CUDAMetricInterface { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_METRIC_CUDA_CUDA_POINTWISE_METRIC_HPP_ diff --git a/src/metric/cuda/cuda_regression_metric.cpp b/src/metric/cuda/cuda_regression_metric.cpp index 15b219160a30..5e61214ad1e8 100644 --- a/src/metric/cuda/cuda_regression_metric.cpp +++ b/src/metric/cuda/cuda_regression_metric.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -31,4 +31,4 @@ CUDAL2Metric::CUDAL2Metric(const Config& config): CUDARegressionMetricInterface< } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/metric/cuda/cuda_regression_metric.hpp b/src/metric/cuda/cuda_regression_metric.hpp index 342e49542eb4..6e9d44a6b046 100644 --- a/src/metric/cuda/cuda_regression_metric.hpp +++ b/src/metric/cuda/cuda_regression_metric.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ #define LIGHTGBM_METRIC_CUDA_CUDA_REGRESSION_METRIC_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -54,6 +54,6 @@ class CUDAL2Metric : public CUDARegressionMetricInterface @@ -206,4 +206,4 @@ void CUDABinaryLogloss::LaunchResetOVACUDALabelKernel() const { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_binary_objective.hpp b/src/objective/cuda/cuda_binary_objective.hpp index 77f58d8318f1..c87654921cad 100644 --- a/src/objective/cuda/cuda_binary_objective.hpp +++ b/src/objective/cuda/cuda_binary_objective.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #define GET_GRADIENTS_BLOCK_SIZE_BINARY (1024) #define CALC_INIT_SCORE_BLOCK_SIZE_BINARY (1024) @@ -58,6 +58,6 @@ class CUDABinaryLogloss : public CUDAObjectiveInterface { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_BINARY_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_multiclass_objective.cpp b/src/objective/cuda/cuda_multiclass_objective.cpp index 2ea3de870e99..55ff694bf4f7 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cpp +++ b/src/objective/cuda/cuda_multiclass_objective.cpp @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_multiclass_objective.hpp" @@ -59,4 +59,4 @@ const double* CUDAMulticlassOVA::ConvertOutputCUDA(const data_size_t num_data, c } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_multiclass_objective.cu b/src/objective/cuda/cuda_multiclass_objective.cu index 797c7cec7bf0..b9687ed430e2 100644 --- a/src/objective/cuda/cuda_multiclass_objective.cu +++ b/src/objective/cuda/cuda_multiclass_objective.cu @@ -3,7 +3,7 @@ * Licensed under the MIT License. See LICENSE file in the project root for license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -105,4 +105,4 @@ const double* CUDAMulticlassSoftmax::LaunchConvertOutputCUDAKernel( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_multiclass_objective.hpp b/src/objective/cuda/cuda_multiclass_objective.hpp index e6e326306e31..328ae6515516 100644 --- a/src/objective/cuda/cuda_multiclass_objective.hpp +++ b/src/objective/cuda/cuda_multiclass_objective.hpp @@ -5,7 +5,7 @@ #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -74,5 +74,5 @@ class CUDAMulticlassOVA: public CUDAObjectiveInterface { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_MULTICLASS_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_rank_objective.cpp b/src/objective/cuda/cuda_rank_objective.cpp index 50ea16e80799..227498ed39af 100644 --- a/src/objective/cuda/cuda_rank_objective.cpp +++ b/src/objective/cuda/cuda_rank_objective.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -64,4 +64,4 @@ void CUDARankXENDCG::GenerateItemRands() const { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_rank_objective.cu b/src/objective/cuda/cuda_rank_objective.cu index 43a39fdcbaf8..af9f595f1aed 100644 --- a/src/objective/cuda/cuda_rank_objective.cu +++ b/src/objective/cuda/cuda_rank_objective.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_rank_objective.hpp" @@ -658,4 +658,4 @@ void CUDARankXENDCG::LaunchGetGradientsKernel(const double* score, score_t* grad } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_rank_objective.hpp b/src/objective/cuda/cuda_rank_objective.hpp index f922c240bd28..a7f047017d88 100644 --- a/src/objective/cuda/cuda_rank_objective.hpp +++ b/src/objective/cuda/cuda_rank_objective.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #define NUM_QUERY_PER_BLOCK (10) @@ -118,5 +118,5 @@ class CUDARankXENDCG : public CUDALambdaRankObjectiveInterface { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_OBJECTIVE_CUDA_CUDA_RANK_OBJECTIVE_HPP_ diff --git a/src/objective/cuda/cuda_regression_objective.cpp b/src/objective/cuda/cuda_regression_objective.cpp index 91d7b8059da6..beb030721ae3 100644 --- a/src/objective/cuda/cuda_regression_objective.cpp +++ b/src/objective/cuda/cuda_regression_objective.cpp @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_regression_objective.hpp" @@ -85,4 +85,4 @@ double CUDARegressionPoissonLoss::LaunchCalcInitScoreKernel(const int class_id) } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.cu b/src/objective/cuda/cuda_regression_objective.cu index 99feec132508..28fc3168e41a 100644 --- a/src/objective/cuda/cuda_regression_objective.cu +++ b/src/objective/cuda/cuda_regression_objective.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_regression_objective.hpp" #include @@ -353,4 +353,4 @@ const double* CUDARegressionPoissonLoss::LaunchConvertOutputCUDAKernel(const dat } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/objective/cuda/cuda_regression_objective.hpp b/src/objective/cuda/cuda_regression_objective.hpp index 593fcf1cfcb6..e2eb1abbd006 100644 --- a/src/objective/cuda/cuda_regression_objective.hpp +++ b/src/objective/cuda/cuda_regression_objective.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_OBJECTIVE_CUDA_CUDA_REGRESSION_OBJECTIVE_HPP_ #define LIGHTGBM_OBJECTIVE_CUDA_CUDA_REGRESSION_OBJECTIVE_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #define GET_GRADIENTS_BLOCK_SIZE_REGRESSION (1024) @@ -135,5 +135,5 @@ class CUDARegressionPoissonLoss : public CUDARegressionObjectiveInterface @@ -383,4 +383,4 @@ void CUDABestSplitFinder::SetUsedFeatureByNode(const std::vector& is_fea } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 04896c40e7a9..3fee5562953c 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -1802,4 +1802,4 @@ void CUDABestSplitFinder::LaunchInitCUDARandomKernel() { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index e9c12922cde6..69f8169f8d85 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -7,7 +7,7 @@ #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ #define LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -211,5 +211,5 @@ class CUDABestSplitFinder { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_BEST_SPLIT_FINDER_HPP_ diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 271b9d97e5f0..3ad157ef0105 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -370,4 +370,4 @@ void CUDADataPartition::ResetByLeafPred(const std::vector& leaf_pred, int n } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index 4f88d3034acc..b1d3fa496ab9 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_data_partition.hpp" @@ -1071,4 +1071,4 @@ void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double* leaf_valu } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 3f02977a615a..84050565c085 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ #define LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -384,5 +384,5 @@ class CUDADataPartition { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_DATA_PARTITION_HPP_ diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 83227165af19..7e6be1c1069c 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_histogram_constructor.hpp" @@ -193,4 +193,4 @@ void CUDAHistogramConstructor::ResetConfig(const Config* config) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index e1888f0c4b66..c884383304a4 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_histogram_constructor.hpp" @@ -429,4 +429,4 @@ void CUDAHistogramConstructor::LaunchSubtractHistogramKernel( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index e364003ed934..7e600e7c01b4 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ #define LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -165,5 +165,5 @@ class CUDAHistogramConstructor { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_HISTOGRAM_CONSTRUCTOR_HPP_ diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 9d093f0f164b..6aa020d9ea0d 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_leaf_splits.hpp" @@ -68,4 +68,4 @@ void CUDALeafSplits::Resize(const data_size_t num_data) { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 15c2983ef1d2..29e42f67ead9 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -5,7 +5,7 @@ */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_leaf_splits.hpp" #include @@ -126,4 +126,4 @@ void CUDALeafSplits::LaunchInitValuesKernal( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index fe04cf5bcace..769f956b95c3 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ #define LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include #include @@ -156,5 +156,5 @@ class CUDALeafSplits { } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_LEAF_SPLITS_HPP_ diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index f8e6fbfec725..28c5562aa01c 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -4,7 +4,7 @@ * license information. */ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_single_gpu_tree_learner.hpp" @@ -515,4 +515,4 @@ void CUDASingleGPUTreeLearner::CheckSplitValid( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index f4a87de499cb..8a558ddc43d1 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -4,7 +4,7 @@ * license information. 
*/ -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include @@ -258,4 +258,4 @@ void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel( } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index b1922f5f28c5..bfad5e81c952 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -9,7 +9,7 @@ #include #include -#ifdef USE_CUDA_EXP +#ifdef USE_CUDA #include "cuda_leaf_splits.hpp" #include "cuda_histogram_constructor.hpp" @@ -137,7 +137,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { } // namespace LightGBM -#else // USE_CUDA_EXP +#else // USE_CUDA // When GPU support is not compiled in, quit with an error message @@ -147,12 +147,12 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { public: #pragma warning(disable : 4702) explicit CUDASingleGPUTreeLearner(const Config* tree_config, const bool /*boosting_on_cuda*/) : SerialTreeLearner(tree_config) { - Log::Fatal("CUDA Tree Learner experimental version was not enabled in this build.\n" - "Please recompile with CMake option -DUSE_CUDA_EXP=1"); + Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" + "Please recompile with CMake option -DUSE_CUDAP=1"); } }; } // namespace LightGBM -#endif // USE_CUDA_EXP +#endif // USE_CUDA #endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_SINGLE_GPU_TREE_LEARNER_HPP_ diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 0409821850b1..14b78eb6a577 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -211,7 +211,7 @@ class SerialTreeLearner: public TreeLearner { std::vector> ordered_gradients_; /*! \brief hessians of current iteration, ordered for cache optimized, aligned to 4K page */ std::vector> ordered_hessians_; -#elif defined(USE_CUDA) || defined(USE_CUDA_EXP) +#elif defined(USE_CUDA) /*! \brief gradients of current iteration, ordered for cache optimized */ std::vector> ordered_gradients_; /*! 
\brief hessians of current iteration, ordered for cache optimized */ diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index 40a29e33a531..e637660c1425 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -40,24 +40,14 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con return new VotingParallelTreeLearner(config); } } else if (device_type == std::string("cuda")) { - if (learner_type == std::string("serial")) { - return new CUDATreeLearner(config); - } else if (learner_type == std::string("feature")) { - return new FeatureParallelTreeLearner(config); - } else if (learner_type == std::string("data")) { - return new DataParallelTreeLearner(config); - } else if (learner_type == std::string("voting")) { - return new VotingParallelTreeLearner(config); - } - } else if (device_type == std::string("cuda_exp")) { if (learner_type == std::string("serial")) { if (config->num_gpu == 1) { return new CUDASingleGPUTreeLearner(config, boosting_on_cuda); } else { - Log::Fatal("cuda_exp only supports training on a single GPU."); + Log::Fatal("cuda only supports training on a single GPU."); } } else { - Log::Fatal("cuda_exp only supports training on a single machine."); + Log::Fatal("cuda only supports training on a single machine."); } } return nullptr; diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index f3fc65d3e4c1..b3a00c319c90 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -48,7 +48,7 @@ def test_basic(tmp_path): assert bst.current_iteration() == 20 assert bst.num_trees() == 20 assert bst.num_model_per_iteration() == 1 - if getenv('TASK', '') != 'cuda_exp': + if getenv('TASK', '') != 'cuda': assert bst.lower_bound() == pytest.approx(-2.9040190126976606) assert bst.upper_bound() == pytest.approx(3.3182142872462883) diff --git a/tests/python_package_test/test_dask.py b/tests/python_package_test/test_dask.py index 748824cef451..068fa4a6f388 100644 --- a/tests/python_package_test/test_dask.py +++ b/tests/python_package_test/test_dask.py @@ -57,7 +57,7 @@ pytestmark = [ pytest.mark.skipif(getenv('TASK', '') == 'mpi', reason='Fails to run with MPI interface'), - pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Fails to run with CUDA Experimental interface') + pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Fails to run with CUDA interface') ] diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index 9f91ffcbeb46..4691120184aa 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -588,7 +588,7 @@ def test_multi_class_error(): assert results['training']['multi_error@2'][-1] == pytest.approx(0) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_auc_mu(): # should give same result as binary auc for 2 classes X, y = load_digits(n_class=10, return_X_y=True) @@ -1661,7 +1661,7 @@ def generate_trainset_for_monotone_constraints_tests(x3_to_category=True): return trainset -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Monotone 
constraints are not yet supported by CUDA version') @pytest.mark.parametrize("test_with_categorical_variable", [True, False]) def test_monotone_constraints(test_with_categorical_variable): def is_increasing(y): @@ -1751,7 +1751,7 @@ def has_interaction(treef): assert are_interactions_enforced(constrained_model, feature_sets) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Monotone constraints are not yet supported by CUDA version') def test_monotone_penalty(): def are_first_splits_non_monotone(tree, n, monotone_constraints): if n <= 0: @@ -1791,7 +1791,7 @@ def are_there_monotone_splits(tree, monotone_constraints): # test if a penalty as high as the depth indeed prohibits all monotone splits -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Monotone constraints are not yet supported by CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Monotone constraints are not yet supported by CUDA version') def test_monotone_penalty_max(): max_depth = 5 monotone_constraints = [1, -1, 0] @@ -2652,7 +2652,7 @@ def test_model_size(): pytest.skipTest('not enough RAM') -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_get_split_value_histogram(): X, y = make_synthetic_regression() X = np.repeat(X, 3, axis=0) @@ -2735,7 +2735,7 @@ def test_get_split_value_histogram(): gbm.get_split_value_histogram(2) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_early_stopping_for_only_first_metric(): def metrics_combination_train_regression(valid_sets, metric_list, assumed_iteration, @@ -3573,7 +3573,7 @@ def hook(obj): assert "LV" in dumped_model_str -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Forced splits are not yet supported by CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Forced splits are not yet supported by CUDA version') def test_force_split_with_feature_fraction(tmp_path): X, y = make_synthetic_regression() X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py index c6d8a5bcaf41..746c958a7304 100644 --- a/tests/python_package_test/test_sklearn.py +++ b/tests/python_package_test/test_sklearn.py @@ -121,7 +121,7 @@ def test_regression(): assert gbm.evals_result_['valid_0']['l2'][gbm.best_iteration_ - 1] == pytest.approx(ret) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_multiclass(): X, y = load_digits(n_class=10, return_X_y=True) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42) @@ -134,7 +134,7 @@ def test_multiclass(): assert gbm.evals_result_['valid_0']['multi_logloss'][gbm.best_iteration_ - 1] == 
pytest.approx(ret) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_lambdarank(): rank_example_dir = Path(__file__).absolute().parents[2] / 'examples' / 'lambdarank' X_train, y_train = load_svmlight_file(str(rank_example_dir / 'rank.train')) @@ -1091,7 +1091,7 @@ def test_nan_handle(): np.testing.assert_allclose(gbm.evals_result_['training']['l2'], np.nan) -@pytest.mark.skipif(getenv('TASK', '') == 'cuda_exp', reason='Skip due to differences in implementation details of CUDA Experimental version') +@pytest.mark.skipif(getenv('TASK', '') == 'cuda', reason='Skip due to differences in implementation details of CUDA version') def test_first_metric_only(): def fit_and_check(eval_set_names, metric_names, assumed_iteration, first_metric_only): diff --git a/tests/python_package_test/test_utilities.py b/tests/python_package_test/test_utilities.py index e75198bb1214..01c45884a578 100644 --- a/tests/python_package_test/test_utilities.py +++ b/tests/python_package_test/test_utilities.py @@ -91,15 +91,15 @@ def dummy_metric(_, __): "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.", "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!" ] - cuda_exp_lines = [ - "INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda_exp version. Fall back to evaluation on CPU.", - "INFO | [LightGBM] [Warning] Metric binary_error is not implemented in cuda_exp version. Fall back to evaluation on CPU.", + cuda_ines = [ + "INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda version. Fall back to evaluation on CPU.", + "INFO | [LightGBM] [Warning] Metric binary_error is not implemented in cuda version. 
Fall back to evaluation on CPU.", ] with open(log_filename, "rt", encoding="utf-8") as f: actual_log = f.read().strip() actual_log_wo_gpu_stuff = [] for line in actual_log.split("\n"): - if not any(line.startswith(gpu_or_cuda_exp_line) for gpu_or_cuda_exp_line in gpu_lines + cuda_exp_lines): + if not any(line.startswith(gpu_or_cuda_line) for gpu_or_cuda_line in gpu_lines + cuda_lines): actual_log_wo_gpu_stuff.append(line) assert "\n".join(actual_log_wo_gpu_stuff) == expected_log From 01822059b8948f3c997d21f626faf2ddee0bb9a0 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 19:39:02 -0600 Subject: [PATCH 07/17] revert some unnecessaary changes --- include/LightGBM/cuda/vector_cudahost.h | 31 ++++++++++++++++--------- src/io/config.cpp | 3 +-- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 6262a3bc9113..45adfd67858b 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -42,35 +42,44 @@ struct CHAllocator { T* ptr; if (n == 0) return NULL; n = SIZE_ALIGNED(n); - if (LGBM_config_::current_device == lgbm_device_cuda) { + #ifdef USE_CUDA + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaError_t ret = cudaHostAlloc(&ptr, n*sizeof(T), cudaHostAllocPortable); if (ret != cudaSuccess) { - Log::Warning("Defaulting to malloc in CHAllocator!!!"); - ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); + Log::Warning("Defaulting to malloc in CHAllocator!!!"); + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); } - } else { + } else { ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); - } + } + #else + ptr = reinterpret_cast(_mm_malloc(n*sizeof(T), 16)); + #endif return ptr; } void deallocate(T* p, std::size_t n) { (void)n; // UNUSED if (p == NULL) return; - if (LGBM_config_::current_device == lgbm_device_cuda) { + #ifdef USE_CUDA + if (LGBM_config_::current_device == lgbm_device_cuda) { cudaPointerAttributes attributes; cudaPointerGetAttributes(&attributes, p); #if CUDA_VERSION >= 10000 - if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + if ((attributes.type == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { cudaFreeHost(p); - } + } #else - if ((attributes.memoryType == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { + if ((attributes.memoryType == cudaMemoryTypeHost) && (attributes.devicePointer != NULL)) { cudaFreeHost(p); - } + } #endif - } else { + } else { _mm_free(p); + } + #else + _mm_free(p); + #endif } } }; diff --git a/src/io/config.cpp b/src/io/config.cpp index ba2672fcaef4..2d891f769ee1 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -182,8 +182,7 @@ void GetDeviceType(const std::unordered_map& params, s "Found device_type='cuda_exp' passed through params. " "That is an alias for device_type='cuda'. " "Use device_type='cuda' to suppress this warning. " - "In the future, this warning will become an error. " - ); + "In the future, this warning will become an error. 
"); *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); From 07a4a926142a3d95cc39165d9801284a065857e4 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 19:51:04 -0600 Subject: [PATCH 08/17] revert a few more mistakes --- include/LightGBM/cuda/vector_cudahost.h | 1 - src/io/dataset.cpp | 20 +++++++++++++------- src/io/dataset_loader.cpp | 10 +++++++--- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 45adfd67858b..8df88c1c48cf 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -80,7 +80,6 @@ struct CHAllocator { #else _mm_free(p); #endif - } } }; template diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 061bdd2464bd..551683a81197 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -858,12 +858,14 @@ void Dataset::CopySubrow(const Dataset* fullset, gpu_device_id_ = fullset->gpu_device_id_; #ifdef USE_CUDA - if (cuda_column_data_ == nullptr) { - cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); - metadata_.CreateCUDAMetadata(gpu_device_id_); + if (device_type_ == std::string("cuda")) { + if (cuda_column_data_ == nullptr) { + cuda_column_data_.reset(new CUDAColumnData(fullset->num_data(), gpu_device_id_)); + metadata_.CreateCUDAMetadata(gpu_device_id_); + } + cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); } - cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); - #endif // USE_CUDA + #endif // USE_CUDA_EXP } bool Dataset::SetFloatField(const char* field_name, const float* field_data, @@ -1502,8 +1504,12 @@ void Dataset::AddFeaturesFrom(Dataset* other) { } } #ifdef USE_CUDA - CreateCUDAColumnData(); - #endif // USE_CUDA + if (device_type_ == std::string("cuda")) { + CreateCUDAColumnData(); + } else { + cuda_column_data_ = nullptr; + } + #endif // USE_CUDA_EXP } const void* Dataset::GetColWiseData( diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index e384af9e0792..76d6fad794cc 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -280,9 +280,13 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac dataset->device_type_ = config_.device_type; dataset->gpu_device_id_ = config_.gpu_device_id; #ifdef USE_CUDA - dataset->CreateCUDAColumnData(); - dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); - #endif // USE_CUDA + if (config_.device_type == std::string("cuda")) { + dataset->CreateCUDAColumnData(); + dataset->metadata_.CreateCUDAMetadata(dataset->gpu_device_id_); + } else { + dataset->cuda_column_data_ = nullptr; + } + #endif // USE_CUDA_EXP } // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); From 0c60b71726e17de75b6bc507fb669a19d44c437b Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 20:30:07 -0600 Subject: [PATCH 09/17] revert another change that ignored params --- src/io/dataset.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 551683a81197..68d01ec11920 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -447,9 +447,13 @@ void Dataset::FinishLoad() { metadata_.FinishLoad(); #ifdef USE_CUDA - CreateCUDAColumnData(); - metadata_.CreateCUDAMetadata(gpu_device_id_); - #endif // USE_CUDA + if (device_type_ == std::string("cuda")) { + 
CreateCUDAColumnData(); + metadata_.CreateCUDAMetadata(gpu_device_id_); + } else { + cuda_column_data_.reset(nullptr); + } + #endif // USE_CUDA_EXP is_finish_load_ = true; } From 118d32a4a8a69577db64ef304ce8a32ad1a06145 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 20:47:40 -0600 Subject: [PATCH 10/17] sigh --- src/boosting/gbdt.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index fbaf337eea60..b1db41621067 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -71,9 +71,11 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective if (config_->device_type == std::string("cuda")) { LGBM_config_::current_learner = use_cuda_learner; #ifdef USE_CUDA - const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; - CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); - #endif // USE_CUDA + if (config_->device_type == std::string("cuda")) { + const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; + CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); + } + #endif // USE_CUDA_EXP } // load forced_splits file From e4cc9d05b56472145bbe72b698ada6e1e0d0ddcc Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 21:55:32 -0600 Subject: [PATCH 11/17] remove CUDATreeLearner --- include/LightGBM/cuda/vector_cudahost.h | 4 ++-- src/treelearner/data_parallel_tree_learner.cpp | 1 - src/treelearner/feature_parallel_tree_learner.cpp | 1 - src/treelearner/parallel_tree_learner.h | 1 - src/treelearner/tree_learner.cpp | 1 - src/treelearner/voting_parallel_tree_learner.cpp | 1 - 6 files changed, 2 insertions(+), 7 deletions(-) diff --git a/include/LightGBM/cuda/vector_cudahost.h b/include/LightGBM/cuda/vector_cudahost.h index 8df88c1c48cf..83fbe5cda9b7 100644 --- a/include/LightGBM/cuda/vector_cudahost.h +++ b/include/LightGBM/cuda/vector_cudahost.h @@ -5,11 +5,12 @@ #ifndef LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ #define LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ -#ifdef USE_CUDA #include +#ifdef USE_CUDA #include #include +#endif #include enum LGBM_Device { @@ -89,5 +90,4 @@ bool operator!=(const CHAllocator&, const CHAllocator&); } // namespace LightGBM -#endif // USE_CUDA #endif // LIGHTGBM_CUDA_VECTOR_CUDAHOST_H_ diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index c9ff36da9f44..677b7dc6eb82 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -276,7 +276,6 @@ void DataParallelTreeLearner::Split(Tree* tree, int best_Leaf, in } // instantiate template classes, otherwise linker cannot find the code -template class DataParallelTreeLearner; template class DataParallelTreeLearner; template class DataParallelTreeLearner; diff --git a/src/treelearner/feature_parallel_tree_learner.cpp b/src/treelearner/feature_parallel_tree_learner.cpp index f4edfe03dc16..c5202f3d706d 100644 --- a/src/treelearner/feature_parallel_tree_learner.cpp +++ b/src/treelearner/feature_parallel_tree_learner.cpp @@ -77,7 +77,6 @@ void FeatureParallelTreeLearner::FindBestSplitsFromHistograms( } // instantiate template classes, otherwise linker cannot find the code -template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; template class FeatureParallelTreeLearner; } // namespace LightGBM diff --git a/src/treelearner/parallel_tree_learner.h b/src/treelearner/parallel_tree_learner.h index 03b60aca6329..29f4e1688b99 100644 
--- a/src/treelearner/parallel_tree_learner.h +++ b/src/treelearner/parallel_tree_learner.h @@ -12,7 +12,6 @@ #include #include -#include "cuda_tree_learner.h" #include "gpu_tree_learner.h" #include "serial_tree_learner.h" diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index e637660c1425..aaa0ce72ecd9 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -4,7 +4,6 @@ */ #include -#include "cuda_tree_learner.h" #include "gpu_tree_learner.h" #include "linear_tree_learner.h" #include "parallel_tree_learner.h" diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index aacd5caa4412..0eee032839cd 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -501,7 +501,6 @@ void VotingParallelTreeLearner::Split(Tree* tree, int best_Leaf, } // instantiate template classes, otherwise linker cannot find the code -template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; template class VotingParallelTreeLearner; } // namespace LightGBM From e734d6f02a418b7e4b2ee261230418a9a64e39be Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 22:08:02 -0600 Subject: [PATCH 12/17] fix tests, docs --- .ci/test.sh | 114 ++++++++++---------- docs/Installation-Guide.rst | 4 +- python-package/README.rst | 6 +- python-package/setup.py | 2 +- src/boosting/gbdt.cpp | 2 +- src/io/dataset.cpp | 6 +- src/io/dataset_loader.cpp | 2 +- tests/python_package_test/test_utilities.py | 2 +- 8 files changed, 66 insertions(+), 72 deletions(-) diff --git a/.ci/test.sh b/.ci/test.sh index 12b4ae027993..a45cb7ae40ef 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -34,63 +34,7 @@ if [[ "$TASK" == "cpp-tests" ]]; then exit 0 fi -CONDA_PYTHON_REQUIREMENT="python=$PYTHON_VERSION[build=*cpython]" - -if [[ $TASK == "if-else" ]]; then - conda create -q -y -n $CONDA_ENV ${CONDA_PYTHON_REQUIREMENT} numpy - source activate $CONDA_ENV - mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. && make lightgbm -j4 || exit -1 - cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 - cd $BUILD_DIRECTORY/build && make lightgbm -j4 || exit -1 - cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=predict.conf output_result=ifelse.pred && python test.py || exit -1 - exit 0 -fi - -if [[ $TASK == "swig" ]]; then - mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build - if [[ $OS_NAME == "macos" ]]; then - cmake -DUSE_SWIG=ON -DAPPLE_OUTPUT_DYLIB=ON .. - else - cmake -DUSE_SWIG=ON .. 
- fi - make -j4 || exit -1 - if [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "gcc" ]]; then - objdump -T $BUILD_DIRECTORY/lib_lightgbm.so > $BUILD_DIRECTORY/objdump.log || exit -1 - objdump -T $BUILD_DIRECTORY/lib_lightgbm_swig.so >> $BUILD_DIRECTORY/objdump.log || exit -1 - python $BUILD_DIRECTORY/helpers/check_dynamic_dependencies.py $BUILD_DIRECTORY/objdump.log || exit -1 - fi - if [[ $PRODUCES_ARTIFACTS == "true" ]]; then - cp $BUILD_DIRECTORY/build/lightgbmlib.jar $BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_$OS_NAME.jar - fi - exit 0 -fi - -if [[ $TASK == "lint" ]]; then - conda create -q -y -n $CONDA_ENV \ - ${CONDA_PYTHON_REQUIREMENT} \ - cmakelint \ - cpplint \ - isort \ - mypy \ - pycodestyle \ - pydocstyle \ - "r-lintr>=3.0" - source activate $CONDA_ENV - echo "Linting Python code" - pycodestyle --ignore=E501,W503 --exclude=./.nuget,./external_libs . || exit -1 - pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 - isort . --check-only || exit -1 - mypy --ignore-missing-imports python-package/ || true - echo "Linting R code" - Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 - echo "Linting C++ code" - cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig ./tests || exit -1 - cmake_files=$(find . -name CMakeLists.txt -o -path "*/cmake/*.cmake") - cmakelint --linelength=120 --filter=-convention/filename,-package/stdargs,-readability/wonkycase ${cmake_files} || exit -1 - exit 0 -fi - -conda create -q -y -n $CONDA_ENV "${CONDA_PYTHON_REQUIREMENT}" +conda create -q -y -n $CONDA_ENV "python=$PYTHON_VERSION[build=*cpython]" source activate $CONDA_ENV cd $BUILD_DIRECTORY @@ -128,6 +72,60 @@ if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then exit 0 fi +if [[ $TASK == "lint" ]]; then + conda install -q -y -n $CONDA_ENV \ + cmakelint \ + cpplint \ + flake8 \ + isort \ + mypy \ + pydocstyle \ + "r-lintr>=3.0" + echo "Linting Python code" + flake8 \ + --ignore=E501,W503 \ + --exclude=./.nuget,./external_libs,./python-package/build \ + . || exit -1 + pydocstyle --convention=numpy --add-ignore=D105 --match-dir="^(?!^external_libs|test|example).*" --match="(?!^test_|setup).*\.py" . || exit -1 + isort . --check-only || exit -1 + mypy --ignore-missing-imports python-package/ || true + echo "Linting R code" + Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit -1 + echo "Linting C++ code" + cpplint --filter=-build/c++11,-build/include_subdir,-build/header_guard,-whitespace/line_length --recursive ./src ./include ./R-package ./swig ./tests || exit -1 + cmake_files=$(find . -name CMakeLists.txt -o -path "*/cmake/*.cmake") + cmakelint --linelength=120 --filter=-convention/filename,-package/stdargs,-readability/wonkycase ${cmake_files} || exit -1 + exit 0 +fi + +if [[ $TASK == "if-else" ]]; then + conda install -q -y -n $CONDA_ENV numpy + mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. 
&& make lightgbm -j4 || exit -1 + cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 + cd $BUILD_DIRECTORY/build && make lightgbm -j4 || exit -1 + cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=predict.conf output_result=ifelse.pred && python test.py || exit -1 + exit 0 +fi + +if [[ $TASK == "swig" ]]; then + mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build + if [[ $OS_NAME == "macos" ]]; then + cmake -DUSE_SWIG=ON -DAPPLE_OUTPUT_DYLIB=ON .. + else + cmake -DUSE_SWIG=ON .. + fi + make -j4 || exit -1 + if [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "gcc" ]]; then + objdump -T $BUILD_DIRECTORY/lib_lightgbm.so > $BUILD_DIRECTORY/objdump.log || exit -1 + objdump -T $BUILD_DIRECTORY/lib_lightgbm_swig.so >> $BUILD_DIRECTORY/objdump.log || exit -1 + python $BUILD_DIRECTORY/helpers/check_dynamic_dependencies.py $BUILD_DIRECTORY/objdump.log || exit -1 + fi + if [[ $PRODUCES_ARTIFACTS == "true" ]]; then + cp $BUILD_DIRECTORY/build/lightgbmlib.jar $BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_$OS_NAME.jar + fi + exit 0 +fi + # re-including python=version[build=*cpython] to ensure that conda doesn't fall back to pypy conda install -q -y -n $CONDA_ENV \ cloudpickle \ @@ -139,7 +137,7 @@ conda install -q -y -n $CONDA_ENV \ pandas \ psutil \ pytest \ - ${CONDA_PYTHON_REQUIREMENT} \ + "python=$PYTHON_VERSION[build=*cpython]" \ python-graphviz \ scikit-learn \ scipy || exit -1 diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index 6ce3f2f0c987..46dd0559074e 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -605,8 +605,8 @@ Docker Refer to `GPU Docker folder `__. -Build CUDA Version (Experimental) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Build CUDA Version +~~~~~~~~~~~~~~~~~~ The `original GPU build <#build-gpu-version>`__ of LightGBM (``device_type=gpu``) is based on OpenCL. diff --git a/python-package/README.rst b/python-package/README.rst index 8021add44c63..6cabf33ab29c 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -121,9 +121,7 @@ Build CUDA Version All requirements from `Build from Sources section <#build-from-sources>`__ apply for this installation option as well, and `CMake`_ (version 3.16 or higher) is strongly required. -**CUDA** library (version 9.0 or higher) is needed: details for installation can be found in `Installation Guide `__. - -Recently, a new CUDA version with better efficiency is implemented as an experimental feature. To build the new CUDA version, replace ``--cuda`` with ``--cuda-exp`` in the above commands. Please note that new version requires **CUDA** 10.0 or later libraries. Note that this new version uses twice the memory, since it stores data row-wise as well as column-wise in memory to improve performance (see this `issue `__ for discussion). +**CUDA** library (version 10.0 or higher) is needed: details for installation can be found in `Installation Guide `__. To use the CUDA version within Python, pass ``{"device": "cuda"}`` respectively in parameters. @@ -211,8 +209,6 @@ Run ``python setup.py install --gpu`` to enable GPU support. All requirements fr Run ``python setup.py install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. 
-Run ``python setup.py install --cuda-exp`` to enable the new experimental version of CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. - Run ``python setup.py install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well. Run ``python setup.py install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well. diff --git a/python-package/setup.py b/python-package/setup.py index af07bfbdd5b4..012dd0f8577b 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -21,7 +21,7 @@ ('integrated-opencl', None, 'Compile integrated OpenCL version'), ('gpu', 'g', 'Compile GPU version'), ('cuda', None, 'Compile CUDA version'), - ('cuda-exp', None, 'Compile CUDA Experimental version'), + ('cuda-exp', None, '(deprecated) Alias for 'cuda'. Use 'cuda' instead.'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index b1db41621067..1f0a5405bf49 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -75,7 +75,7 @@ void GBDT::Init(const Config* config, const Dataset* train_data, const Objective const int gpu_device_id = config_->gpu_device_id >= 0 ? config_->gpu_device_id : 0; CUDASUCCESS_OR_FATAL(cudaSetDevice(gpu_device_id)); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } // load forced_splits file diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index 68d01ec11920..de368d3036db 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -453,7 +453,7 @@ void Dataset::FinishLoad() { } else { cuda_column_data_.reset(nullptr); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA is_finish_load_ = true; } @@ -869,7 +869,7 @@ void Dataset::CopySubrow(const Dataset* fullset, } cuda_column_data_->CopySubrow(fullset->cuda_column_data(), used_indices, num_used_indices); } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } bool Dataset::SetFloatField(const char* field_name, const float* field_data, @@ -1513,7 +1513,7 @@ void Dataset::AddFeaturesFrom(Dataset* other) { } else { cuda_column_data_ = nullptr; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } const void* Dataset::GetColWiseData( diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 76d6fad794cc..621176db2c59 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -286,7 +286,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac } else { dataset->cuda_column_data_ = nullptr; } - #endif // USE_CUDA_EXP + #endif // USE_CUDA } // check meta data dataset->metadata_.CheckOrPartition(num_global_data, used_data_indices); diff --git a/tests/python_package_test/test_utilities.py b/tests/python_package_test/test_utilities.py index 01c45884a578..cfd5b133b6e2 100644 --- a/tests/python_package_test/test_utilities.py +++ b/tests/python_package_test/test_utilities.py @@ -91,7 +91,7 @@ def dummy_metric(_, __): "INFO | [LightGBM] [Warning] CUDA currently requires double precision calculations.", "INFO | [LightGBM] [Info] LightGBM using CUDA trainer with DP float!!" ] - cuda_ines = [ + cuda_lines = [ "INFO | [LightGBM] [Warning] Metric auc is not implemented in cuda version. 
Fall back to evaluation on CPU.", "INFO | [LightGBM] [Warning] Metric binary_error is not implemented in cuda version. Fall back to evaluation on CPU.", ] From 0b4df93323c898abd60ac392202a78ca19c495a1 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 22:17:07 -0600 Subject: [PATCH 13/17] fix quoting in setup.py --- python-package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/setup.py b/python-package/setup.py index 012dd0f8577b..617d3b33d6ef 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -21,7 +21,7 @@ ('integrated-opencl', None, 'Compile integrated OpenCL version'), ('gpu', 'g', 'Compile GPU version'), ('cuda', None, 'Compile CUDA version'), - ('cuda-exp', None, '(deprecated) Alias for 'cuda'. Use 'cuda' instead.'), + ('cuda-exp', None, '(deprecated) Alias for "cuda". Use "cuda" instead.'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), From 967c005f7c8a2ea2cc8a2049c19c3e6a94eb68b8 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Tue, 17 Jan 2023 23:07:21 -0600 Subject: [PATCH 14/17] restore all CI --- .appveyor.yml | 44 ++++ .github/workflows/r_package.yml | 350 ++++++++++++++++++++++++++++++++ .vsts-ci.yml | 138 ++++++------- 3 files changed, 463 insertions(+), 69 deletions(-) create mode 100644 .appveyor.yml create mode 100644 .github/workflows/r_package.yml diff --git a/.appveyor.yml b/.appveyor.yml new file mode 100644 index 000000000000..2d279b0f33e3 --- /dev/null +++ b/.appveyor.yml @@ -0,0 +1,44 @@ +version: 3.3.3.99.{build} + +image: Visual Studio 2015 +platform: x64 +configuration: # a trick to construct a build matrix with multiple Python versions + - '3.7' + +# only build pull requests and +# commits to 'master' or any branch starting with 'release' +branches: + only: + - master + - /^release/ + +environment: + matrix: + - COMPILER: MSVC + TASK: python + - COMPILER: MINGW + TASK: python + +clone_depth: 5 + +install: + - git submodule update --init --recursive # get `external_libs` folder + - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% # delete sh.exe from PATH (mingw32-make fix) + - set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH% + - set PYTHON_VERSION=%CONFIGURATION% + - set CONDA_ENV="test-env" + - ps: | + $env:MINICONDA = "C:\Miniconda3-x64" + $env:PATH = "$env:MINICONDA;$env:MINICONDA\Scripts;$env:PATH" + $env:BUILD_SOURCESDIRECTORY = "$env:APPVEYOR_BUILD_FOLDER" + $env:LGB_VER = (Get-Content $env:APPVEYOR_BUILD_FOLDER\VERSION.txt).trim() + +build: false + +test_script: + - conda config --remove channels defaults + - conda config --add channels nodefaults + - conda config --add channels conda-forge + - conda config --set channel_priority strict + - conda init powershell + - powershell.exe -ExecutionPolicy Bypass -File %APPVEYOR_BUILD_FOLDER%\.ci\test_windows.ps1 diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml new file mode 100644 index 000000000000..1574786cac6f --- /dev/null +++ b/.github/workflows/r_package.yml @@ -0,0 +1,350 @@ +name: R-package + +on: + push: + branches: + - master + pull_request: + branches: + - master + - release/* + +# automatically cancel in-progress builds if another commit is pushed +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +env: + # hack to get around this: + # https://stat.ethz.ch/pipermail/r-package-devel/2020q3/005930.html + _R_CHECK_SYSTEM_CLOCK_: 0 + # ignore R CMD 
CHECK NOTE checking how long it has + # been since the last submission + _R_CHECK_CRAN_INCOMING_REMOTE_: 0 + # CRAN ignores the "installed size is too large" NOTE, + # so our CI can too. Setting to a large value here just + # to catch extreme problems + _R_CHECK_PKG_SIZES_THRESHOLD_: 100 + +jobs: + test: + name: ${{ matrix.task }} (${{ matrix.os }}, ${{ matrix.compiler }}, R ${{ matrix.r_version }}, ${{ matrix.build_type }}) + runs-on: ${{ matrix.os }} + container: ${{ matrix.container }} + timeout-minutes: 60 + strategy: + fail-fast: false + matrix: + include: + ################ + # CMake builds # + ################ + - os: ubuntu-latest + task: r-package + compiler: gcc + r_version: 3.6 + build_type: cmake + container: 'ubuntu:18.04' + - os: ubuntu-latest + task: r-package + compiler: gcc + r_version: 4.2 + build_type: cmake + container: 'ubuntu:22.04' + - os: ubuntu-latest + task: r-package + compiler: clang + r_version: 3.6 + build_type: cmake + container: 'ubuntu:18.04' + - os: ubuntu-latest + task: r-package + compiler: clang + r_version: 4.2 + build_type: cmake + container: 'ubuntu:22.04' + - os: macOS-latest + task: r-package + compiler: gcc + r_version: 3.6 + build_type: cmake + container: null + - os: macOS-latest + task: r-package + compiler: gcc + r_version: 4.2 + build_type: cmake + container: null + - os: macOS-latest + task: r-package + compiler: clang + r_version: 3.6 + build_type: cmake + container: null + - os: macOS-latest + task: r-package + compiler: clang + r_version: 4.2 + build_type: cmake + container: null + - os: windows-latest + task: r-package + compiler: MINGW + toolchain: MINGW + r_version: 3.6 + build_type: cmake + container: null + - os: windows-latest + task: r-package + compiler: MINGW + toolchain: MSYS + r_version: 4.2 + build_type: cmake + container: null + # Visual Studio 2019 + - os: windows-2019 + task: r-package + compiler: MSVC + toolchain: MSVC + r_version: 3.6 + build_type: cmake + container: null + # Visual Studio 2022 + - os: windows-2022 + task: r-package + compiler: MSVC + toolchain: MSVC + r_version: 4.2 + build_type: cmake + container: null + ############### + # CRAN builds # + ############### + - os: windows-latest + task: r-package + compiler: MINGW + toolchain: MINGW + r_version: 3.6 + build_type: cran + container: null + - os: windows-latest + task: r-package + compiler: MINGW + toolchain: MSYS + r_version: 4.2 + build_type: cran + container: null + - os: ubuntu-latest + task: r-package + compiler: gcc + r_version: 4.2 + build_type: cran + container: 'ubuntu:22.04' + - os: macOS-latest + task: r-package + compiler: clang + r_version: 4.2 + build_type: cran + container: null + ################ + # Other checks # + ################ + - os: ubuntu-latest + task: r-rchk + compiler: gcc + r_version: 4.2 + build_type: cran + container: 'ubuntu:22.04' + steps: + - name: Prevent conversion of line endings on Windows + if: startsWith(matrix.os, 'windows') + shell: pwsh + run: git config --global core.autocrlf false + - name: Install packages used by third-party actions + if: startsWith(matrix.os, 'ubuntu') + shell: bash + run: | + apt-get update -y + apt-get install --no-install-recommends -y \ + ca-certificates \ + dirmngr \ + gpg \ + gpg-agent \ + software-properties-common \ + sudo + # install newest version of git + # ref: + # - https://unix.stackexchange.com/a/170831/550004 + # - https://git-scm.com/download/linux + add-apt-repository ppa:git-core/ppa -y + apt-get update -y + apt-get install --no-install-recommends -y \ + git + - name: Trust 
git cloning LightGBM + if: startsWith(matrix.os, 'ubuntu') + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 5 + submodules: true + - name: Install pandoc + uses: r-lib/actions/setup-pandoc@v2 + - name: install tinytex + if: startsWith(matrix.os, 'windows') + uses: r-lib/actions/setup-tinytex@v2 + env: + CTAN_MIRROR: https://ctan.math.illinois.edu/systems/win32/miktex + TINYTEX_INSTALLER: TinyTeX + - name: Setup and run tests on Linux and macOS + if: matrix.os == 'macOS-latest' || matrix.os == 'ubuntu-latest' + shell: bash + run: | + export TASK="${{ matrix.task }}" + export COMPILER="${{ matrix.compiler }}" + export GITHUB_ACTIONS="true" + if [[ "${{ matrix.os }}" == "macOS-latest" ]]; then + export OS_NAME="macos" + elif [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then + export OS_NAME="linux" + export IN_UBUNTU_BASE_CONTAINER="true" + # the default version of cmake provided on Ubuntu 18.04 (v3.10.2), is not supported by LightGBM + # see https://github.com/microsoft/LightGBM/issues/5642 + if [[ "${{ matrix.container }}" == "ubuntu:18.04" ]]; then + export INSTALL_CMAKE_FROM_RELEASES="true" + fi + fi + export BUILD_DIRECTORY="$GITHUB_WORKSPACE" + export R_VERSION="${{ matrix.r_version }}" + export R_BUILD_TYPE="${{ matrix.build_type }}" + $GITHUB_WORKSPACE/.ci/setup.sh + $GITHUB_WORKSPACE/.ci/test.sh + - name: Setup and run tests on Windows + if: startsWith(matrix.os, 'windows') + shell: pwsh -command ". {0}" + run: | + $env:BUILD_SOURCESDIRECTORY = $env:GITHUB_WORKSPACE + $env:LGB_VER = (Get-Content -TotalCount 1 $env:BUILD_SOURCESDIRECTORY\VERSION.txt).trim().replace('rc', '-') + $env:TOOLCHAIN = "${{ matrix.toolchain }}" + $env:R_VERSION = "${{ matrix.r_version }}" + $env:R_BUILD_TYPE = "${{ matrix.build_type }}" + $env:COMPILER = "${{ matrix.compiler }}" + $env:GITHUB_ACTIONS = "true" + $env:TASK = "${{ matrix.task }}" + & "$env:GITHUB_WORKSPACE/.ci/test_windows.ps1" + test-r-sanitizers: + name: r-sanitizers (ubuntu-latest, R-devel, ${{ matrix.compiler }} ASAN/UBSAN) + timeout-minutes: 60 + runs-on: ubuntu-latest + container: wch1/r-debug + strategy: + fail-fast: false + matrix: + include: + - r_customization: san + compiler: gcc + - r_customization: csan + compiler: clang + steps: + - name: Trust git cloning LightGBM + run: | + git config --global --add safe.directory "${GITHUB_WORKSPACE}" + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 5 + submodules: true + - name: Install packages + shell: bash + run: | + RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" + sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }} + RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1 + - name: Run tests with sanitizers + shell: bash + run: | + cd R-package/tests + exit_code=0 + RDscript${{ matrix.r_customization }} testthat.R >> tests.log 2>&1 || exit_code=-1 + cat ./tests.log + exit ${exit_code} + test-r-debian-clang: + name: r-package (debian, R-devel, clang) + timeout-minutes: 60 + runs-on: ubuntu-latest + container: rhub/debian-clang-devel + steps: + - name: Install Git before checkout + shell: bash + run: | + apt-get update --allow-releaseinfo-change + apt-get install --no-install-recommends -y git + - name: Trust git cloning LightGBM + run: | + git 
config --global --add safe.directory "${GITHUB_WORKSPACE}" + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 5 + submodules: true + - name: update to clang 15 + shell: bash + run: | + # remove clang stuff that comes installed in the image + apt-get autoremove -y --purge \ + clang-* \ + libclang-* \ + libunwind-* \ + llvm-* + # + # replace it all with clang-15 + apt-get update -y + apt-get install --no-install-recommends -y \ + gnupg \ + lsb-release \ + software-properties-common \ + wget + # + wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | apt-key add - + # + add-apt-repository "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-15 main" + apt-get install -y --no-install-recommends \ + clang-15 \ + clangd-15 \ + clang-format-15 \ + clang-tidy-15 \ + clang-tools-15 \ + lldb-15 \ + lld-15 \ + llvm-15-dev \ + llvm-15-tools \ + libomp-15-dev \ + libc++-15-dev \ + libc++abi-15-dev \ + libclang-common-15-dev \ + libclang-15-dev \ + libclang-cpp15-dev \ + libunwind-15-dev + # overwrite everything in /usr/bin with the new v15 versions + cp --remove-destination /usr/lib/llvm-15/bin/* /usr/bin/ + - name: Install packages and run tests + shell: bash + run: | + export PATH=/opt/R-devel/bin/:${PATH} + Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" + sh build-cran-package.sh + R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1 + if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then + echo "NOTEs, WARNINGs, or ERRORs have been found by R CMD check" + exit -1 + fi + all-r-package-jobs-successful: + if: always() + runs-on: ubuntu-latest + needs: [test, test-r-sanitizers, test-r-debian-clang] + steps: + - name: Note that all tests succeeded + uses: re-actors/alls-green@v1.2.2 + with: + jobs: ${{ toJSON(needs) }} diff --git a/.vsts-ci.yml b/.vsts-ci.yml index 26c687b0a20f..194aa5471131 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -143,74 +143,74 @@ jobs: displayName: Setup - bash: $(Build.SourcesDirectory)/.ci/test.sh displayName: Test -# ########################################### -# - job: QEMU_multiarch -# ########################################### -# variables: -# COMPILER: gcc -# OS_NAME: 'linux' -# PRODUCES_ARTIFACTS: 'true' -# pool: -# vmImage: ubuntu-22.04 -# timeoutInMinutes: 180 -# strategy: -# matrix: -# bdist: -# TASK: bdist -# ARCH: aarch64 -# steps: -# - script: | -# sudo apt-get update -# sudo apt-get install --no-install-recommends -y \ -# binfmt-support \ -# qemu \ -# qemu-user \ -# qemu-user-static -# displayName: 'Install QEMU' -# - script: | -# docker run --rm --privileged multiarch/qemu-user-static --reset -p yes -# displayName: 'Enable Docker multi-architecture support' -# - script: | -# export ROOT_DOCKER_FOLDER=/LightGBM -# cat > docker.env < docker-script.sh < docker.env < docker-script.sh < Date: Sun, 29 Jan 2023 20:46:30 -0600 Subject: [PATCH 15/17] Apply suggestions from code review Co-authored-by: shiyu1994 --- src/treelearner/tree_learner.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/treelearner/tree_learner.cpp b/src/treelearner/tree_learner.cpp index aaa0ce72ecd9..0018bdaf8b94 100644 --- a/src/treelearner/tree_learner.cpp +++ b/src/treelearner/tree_learner.cpp @@ -43,10 +43,10 @@ TreeLearner* TreeLearner::CreateTreeLearner(const std::string& learner_type, con if (config->num_gpu == 1) { return new 
CUDASingleGPUTreeLearner(config, boosting_on_cuda); } else { - Log::Fatal("cuda only supports training on a single GPU."); + Log::Fatal("Currently cuda version only supports training on a single GPU."); } } else { - Log::Fatal("cuda only supports training on a single machine."); + Log::Fatal("Currently cuda version only supports training on a single machine."); } } return nullptr; From b111ab62fec11e6bf237f12dade6a5947c7bf693 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 29 Jan 2023 20:47:36 -0600 Subject: [PATCH 16/17] Apply suggestions from code review --- src/boosting/gbdt.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index 1f784b94e593..288c345a4f1f 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -557,16 +557,16 @@ class GBDT : public GBDTBase { score_t* gradients_pointer_; /*! \brief Pointer to hessian vector, can be on CPU or GPU */ score_t* hessians_pointer_; - /*! \brief Whether boosting is done on GPU, used for cuda */ + /*! \brief Whether boosting is done on GPU, used for device_type=cuda */ bool boosting_on_gpu_; #ifdef USE_CUDA /*! \brief Gradient vector on GPU */ CUDAVector cuda_gradients_; /*! \brief Hessian vector on GPU */ CUDAVector cuda_hessians_; - /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with cuda */ + /*! \brief Buffer for scores when boosting is on GPU but evaluation is not, used only with device_type=cuda */ mutable std::vector host_score_; - /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with cuda */ + /*! \brief Buffer for scores when boosting is not on GPU but evaluation is, used only with device_type=cuda */ mutable CUDAVector cuda_score_; #endif // USE_CUDA From ac7ab774d5152478aa660028c2ee0d866a008daa Mon Sep 17 00:00:00 2001 From: James Lamb Date: Sun, 29 Jan 2023 21:06:05 -0600 Subject: [PATCH 17/17] completely remove cuda_exp, update docs --- .ci/setup.sh | 2 +- CMakeLists.txt | 6 ------ docs/Parameters.rst | 8 +++++++- include/LightGBM/config.h | 5 ++++- python-package/setup.py | 10 +--------- src/io/config.cpp | 7 ------- 6 files changed, 13 insertions(+), 25 deletions(-) diff --git a/.ci/setup.sh b/.ci/setup.sh index 89fbed442ed1..f7da21286d7d 100755 --- a/.ci/setup.sh +++ b/.ci/setup.sh @@ -106,7 +106,7 @@ else # Linux || exit -1 fi fi - if [[ $TASK == "cuda" || $TASK == "cuda_exp" ]]; then + if [[ $TASK == "cuda" ]]; then echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections apt-get update apt-get install --no-install-recommends -y \ diff --git a/CMakeLists.txt b/CMakeLists.txt index 8d005662f37a..750b41ab8164 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,6 @@ option(USE_SWIG "Enable SWIG to generate Java API" OFF) option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) option(USE_TIMETAG "Set to ON to output time costs" OFF) option(USE_CUDA "Enable CUDA-accelerated training " OFF) -option(USE_CUDA_EXP "(DEPRECATED) Alias for 'USE_CUDA'. Use 'USE_CUDA' instead." OFF) option(USE_DEBUG "Set to ON for Debug mode" OFF) option(USE_SANITIZER "Use santizer flags" OFF) set( @@ -137,11 +136,6 @@ else() add_definitions(-DUSE_SOCKET) endif() -if(USE_CUDA_EXP) - message(WARNING "Option -DUSE_CUDA_EXP=ON is deprecated. 
Use -DUSE_CUDA=ON instead.") - set(USE_CUDA ON CACHE BOOL "Building CUDA-enabled version" FORCE) -endif() - if(USE_CUDA) set(CMAKE_CUDA_HOST_COMPILER "${CMAKE_CXX_COMPILER}") enable_language(CUDA) diff --git a/docs/Parameters.rst b/docs/Parameters.rst index d2cb9acdaf6d..45f0837d7c89 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -207,7 +207,13 @@ Core Parameters - ``device_type`` :raw-html:`🔗︎`, default = ``cpu``, type = enum, options: ``cpu``, ``gpu``, ``cuda``, aliases: ``device`` - - device for the tree learning, you can use GPU to achieve the faster learning + - device for the tree learning + + - ``cpu`` supports all LightGBM functionality and is portable across the widest range of operating systems and hardware + + - ``cuda`` offers faster training than ``gpu`` or ``cpu``, but only works on GPUs supporting CUDA + + - ``gpu`` can be faster than ``cpu`` and works on a wider range of GPUs than CUDA - **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index c2de146b3854..b2a7112d612e 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -225,7 +225,10 @@ struct Config { // type = enum // options = cpu, gpu, cuda // alias = device - // desc = device for the tree learning, you can use GPU to achieve the faster learning + // desc = device for the tree learning + // desc = ``cpu`` supports all LightGBM functionality and is portable across the widest range of operating systems and hardware + // desc = ``cuda`` offers faster training than ``gpu`` or ``cpu``, but only works on GPUs supporting CUDA + // desc = ``gpu`` can be faster than ``cpu`` and works on a wider range of GPUs than CUDA // desc = **Note**: it is recommended to use the smaller ``max_bin`` (e.g. 63) to get the better speed up // desc = **Note**: for the faster speed, GPU uses 32-bit float point to sum up by default, so this may affect the accuracy for some tasks. You can set ``gpu_use_dp=true`` to enable 64-bit float point, but it will slow down the training // desc = **Note**: refer to `Installation Guide <./Installation-Guide.rst#build-gpu-version>`__ to build LightGBM with GPU support diff --git a/python-package/setup.py b/python-package/setup.py index 617d3b33d6ef..243dd5cb0af5 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -21,7 +21,6 @@ ('integrated-opencl', None, 'Compile integrated OpenCL version'), ('gpu', 'g', 'Compile GPU version'), ('cuda', None, 'Compile CUDA version'), - ('cuda-exp', None, '(deprecated) Alias for "cuda". 
Use "cuda" instead.'), ('mpi', None, 'Compile MPI version'), ('nomp', None, 'Compile version without OpenMP support'), ('hdfs', 'h', 'Compile HDFS version'), @@ -106,7 +105,6 @@ def compile_cpp( use_mingw: bool = False, use_gpu: bool = False, use_cuda: bool = False, - use_cuda_exp: bool = False, use_mpi: bool = False, use_hdfs: bool = False, boost_root: Optional[str] = None, @@ -128,9 +126,6 @@ def compile_cpp( logger.info("Starting to compile the library.") - if use_cuda_exp: - use_cuda = True - cmake_cmd = ["cmake", str(CURRENT_DIR / "compile")] if integrated_opencl: use_gpu = False @@ -236,7 +231,6 @@ def initialize_options(self) -> None: self.integrated_opencl = False self.gpu = False self.cuda = False - self.cuda_exp = False self.boost_root = None self.boost_dir = None self.boost_include_dir = None @@ -261,7 +255,7 @@ def run(self) -> None: LOG_PATH.touch() if not self.precompile: copy_files(integrated_opencl=self.integrated_opencl, use_gpu=self.gpu) - compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_cuda_exp=self.cuda_exp, use_mpi=self.mpi, + compile_cpp(use_mingw=self.mingw, use_gpu=self.gpu, use_cuda=self.cuda, use_mpi=self.mpi, use_hdfs=self.hdfs, boost_root=self.boost_root, boost_dir=self.boost_dir, boost_include_dir=self.boost_include_dir, boost_librarydir=self.boost_librarydir, opencl_include_dir=self.opencl_include_dir, opencl_library=self.opencl_library, @@ -282,7 +276,6 @@ def initialize_options(self) -> None: self.integrated_opencl = False self.gpu = False self.cuda = False - self.cuda_exp = False self.boost_root = None self.boost_dir = None self.boost_include_dir = None @@ -305,7 +298,6 @@ def finalize_options(self) -> None: install.integrated_opencl = self.integrated_opencl install.gpu = self.gpu install.cuda = self.cuda - install.cuda_exp = self.cuda_exp install.boost_root = self.boost_root install.boost_dir = self.boost_dir install.boost_include_dir = self.boost_include_dir diff --git a/src/io/config.cpp b/src/io/config.cpp index 2d891f769ee1..86b64a52d105 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -177,13 +177,6 @@ void GetDeviceType(const std::unordered_map& params, s *device_type = "gpu"; } else if (value == std::string("cuda")) { *device_type = "cuda"; - } else if (value == std::string("cuda_exp")) { - Log::Warning( - "Found device_type='cuda_exp' passed through params. " - "That is an alias for device_type='cuda'. " - "Use device_type='cuda' to suppress this warning. " - "In the future, this warning will become an error. "); - *device_type = "cuda"; } else { Log::Fatal("Unknown device type %s", value.c_str()); }