Merge branch 'master' into 6021-fix_single_row_contention

microsoft · Aug 15, 2023 · 601316b · 601316b
2 parents c52a7d7 + e0d63b5
commit 601316b
Show file tree

Hide file tree

Showing 64 changed files with 804 additions and 540 deletions.
diff --git a/.appveyor.yml b/.appveyor.yml
@@ -1,4 +1,4 @@
-version: 4.0.0.{build}
+version: 4.0.0.99.{build}
 
 image: Visual Studio 2015
 platform: x64

diff --git a/.ci/lint_r_code.R b/.ci/lint_r_code.R
@@ -33,29 +33,37 @@ LINTERS_TO_USE <- list(
     , "any_duplicated"       = lintr::any_duplicated_linter()
     , "any_is_na"            = lintr::any_is_na_linter()
     , "assignment"           = lintr::assignment_linter()
+    , "boolean_arithmetic"   = lintr::boolean_arithmetic_linter()
     , "braces"               = lintr::brace_linter()
     , "class_equals"         = lintr::class_equals_linter()
     , "commas"               = lintr::commas_linter()
     , "duplicate_argument"   = lintr::duplicate_argument_linter()
+    , "empty_assignment"     = lintr::empty_assignment_linter()
     , "equals_na"            = lintr::equals_na_linter()
+    , "for_loop_index"       = lintr::for_loop_index_linter()
     , "function_left"        = lintr::function_left_parentheses_linter()
     , "implicit_integers"    = lintr::implicit_integer_linter()
     , "infix_spaces"         = lintr::infix_spaces_linter()
     , "inner_combine"        = lintr::inner_combine_linter()
+    , "is_numeric"           = lintr::is_numeric_linter()
     , "fixed_regex"          = lintr::fixed_regex_linter()
+    , "function_return"      = lintr::function_return_linter()
+    , "lengths"              = lintr::lengths_linter()
     , "literal_coercion"     = lintr::literal_coercion_linter()
     , "long_lines"           = lintr::line_length_linter(length = 120L)
+    , "matrix"               = lintr::matrix_apply_linter()
     , "missing_argument"     = lintr::missing_argument_linter()
-    , "no_tabs"              = lintr::no_tab_linter()
     , "non_portable_path"    = lintr::nonportable_path_linter()
     , "numeric_leading_zero" = lintr::numeric_leading_zero_linter()
     , "outer_negation"       = lintr::outer_negation_linter()
     , "package_hooks"        = lintr::package_hooks_linter()
     , "paste"                = lintr::paste_linter()
+    , "quotes"               = lintr::quotes_linter()
+    , "redundant_equals"     = lintr::redundant_equals_linter()
     , "regex_subset"         = lintr::regex_subset_linter()
+    , "routine_registration" = lintr::routine_registration_linter()
     , "semicolon"            = lintr::semicolon_linter()
     , "seq"                  = lintr::seq_linter()
-    , "single_quotes"        = lintr::single_quotes_linter()
     , "spaces_inside"        = lintr::spaces_inside_linter()
     , "spaces_left_parens"   = lintr::spaces_left_parentheses_linter()
     , "sprintf"              = lintr::sprintf_linter()
@@ -96,9 +104,11 @@ LINTERS_TO_USE <- list(
             , "??" = interactive_text
         )
     )
-    , "unneeded_concatenation" = lintr::unneeded_concatenation_linter()
-    , "unreachable_code"       = lintr::unreachable_code_linter()
-    , "vector_logic"           = lintr::vector_logic_linter()
+    , "unnecessary_concatenation" = lintr::unnecessary_concatenation_linter()
+    , "unnecessary_lambda"        = lintr::unnecessary_lambda_linter()
+    , "unreachable_code"          = lintr::unreachable_code_linter()
+    , "vector_logic"              = lintr::vector_logic_linter()
+    , "whitespace"                = lintr::whitespace_linter()
 )
 
 noquote(paste0(length(FILES_TO_LINT), " R files need linting"))

diff --git a/.ci/test.sh b/.ci/test.sh
@@ -73,7 +73,7 @@ if [[ $TASK == "lint" ]]; then
         cpplint \
         isort \
         mypy \
-        'r-lintr>=3.0' \
+        'r-lintr>=3.1' \
         ruff
     source activate $CONDA_ENV
     echo "Linting Python code"
@@ -119,15 +119,21 @@ if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then
     exit 0
 fi
 
+# older versions of Dask are incompatible with pandas>=2.0, but not all conda packages' metadata accurately reflects that
+#
+# ref: https://github.com/microsoft/LightGBM/issues/6030
+CONSTRAINED_DEPENDENCIES="'dask-core>=2023.5.0' 'distributed>=2023.5.0' 'pandas>=2.0'"
+if [[ $PYTHON_VERSION == "3.7" ]]; then
+    CONSTRAINED_DEPENDENCIES="'dask-core' 'distributed' 'pandas<2.0'"
+fi
+
 # including python=version[build=*cpython] to ensure that conda doesn't fall back to pypy
 conda create -q -y -n $CONDA_ENV \
+    ${CONSTRAINED_DEPENDENCIES} \
     cloudpickle \
-    dask-core \
-    distributed \
     joblib \
     matplotlib \
     numpy \
-    pandas \
     psutil \
     pytest \
     ${CONDA_PYTHON_REQUIREMENT} \

diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh
@@ -36,7 +36,10 @@ fi
 #
 # `devscripts` is required for 'checkbashisms' (https://github.com/r-lib/actions/issues/111)
 if [[ $OS_NAME == "linux" ]]; then
+    mkdir -p ~/.gnupg
+    echo "disable-ipv6" >> ~/.gnupg/dirmngr.conf
     sudo apt-key adv \
+        --homedir ~/.gnupg \
         --keyserver keyserver.ubuntu.com \
         --recv-keys E298A3A825C0D65DFD57CBB651716619E084DAB9 || exit -1
     sudo add-apt-repository \

diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1
@@ -124,7 +124,7 @@ if (($env:TASK -eq "regular") -or (($env:APPVEYOR -eq "true") -and ($env:TASK -e
   cd $env:BUILD_SOURCESDIRECTORY/examples/python-guide
   @("import matplotlib", "matplotlib.use('Agg')") + (Get-Content "plot_example.py") | Set-Content "plot_example.py"
   (Get-Content "plot_example.py").replace('graph.render(view=True)', 'graph.render(view=False)') | Set-Content "plot_example.py"  # prevent interactive window mode
-  conda install -q -y -n $env:CONDA_ENV h5py ipywidgets notebook
+  conda install -q -y -n $env:CONDA_ENV "h5py>3.0" ipywidgets notebook
   foreach ($file in @(Get-ChildItem *.py)) {
     @("import sys, warnings", "warnings.showwarning = lambda message, category, filename, lineno, file=None, line=None: sys.stdout.write(warnings.formatwarning(message, category, filename, lineno, line))") + (Get-Content $file) | Set-Content $file
     python $file ; Check-Output $?

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
@@ -7,56 +7,4 @@
 # offer a reasonable automatic best-guess
 
 # catch-all rule (this only gets matched if no rules below match)
-*    @guolinke @StrikerRUS @jameslamb @shiyu1994
-
-# other catch-alls that will get matched if specific rules below are not matched
-*.R    @jameslamb @jmoralez
-*.py    @StrikerRUS @jmoralez @jameslamb @shiyu1994
-*.cpp    @guolinke @shiyu1994
-*.h    @guolinke @shiyu1994
-
-# main C++ code
-include/    @guolinke @shiyu1994
-src/    @guolinke @shiyu1994
-CMakeLists.txt    @guolinke @jameslamb @StrikerRUS @shiyu1994
-tests/c_api_test/    @guolinke @shiyu1994
-tests/cpp_tests/    @guolinke @shiyu1994
-tests/data/    @guolinke @shiyu1994
-windows/    @guolinke @StrikerRUS @shiyu1994
-
-# R code
-build_r.R    @jameslamb @StrikerRUS @jmoralez
-build-cran-package.sh    @jameslamb @StrikerRUS @jmoralez
-R-package/    @jameslamb @jmoralez
-
-# Python code
-python-package/    @StrikerRUS @shiyu1994 @jameslamb @jmoralez
-
-# Dask integration
-python-package/lightgbm/dask.py    @jameslamb @jmoralez
-tests/python_package_test/test_dask.py    @jameslamb @jmoralez
-
-# helpers
-helpers/    @StrikerRUS @guolinke
-
-# CI administrative stuff
-.ci/    @StrikerRUS @jameslamb
-docs/    @StrikerRUS @jameslamb
-examples/     @StrikerRUS @jameslamb @guolinke @jmoralez
-*.yml    @StrikerRUS @jameslamb
-.vsts-ci.yml    @StrikerRUS @jameslamb
-
-# docker setup
-docker/    @StrikerRUS @jameslamb
-docker/dockerfile-cli    @guolinke @shiyu1994 @StrikerRUS @jameslamb
-docker/gpu/    @StrikerRUS @jameslamb
-docker/dockerfile-python    @StrikerRUS @shiyu1994 @jameslamb @jmoralez
-docker/dockerfile-r    @jameslamb @jmoralez
-
-# GPU code
-docs/GPU-*.rst    @shiyu1994 @guolinke
-src/treelearner/gpu_tree_learner.cpp    @guolinke @shiyu1994
-src/treelearner/tree_learner.cpp    @guolinke @shiyu1994
-
-# JAVA code
-swig/    @guolinke @shiyu1994
+*    @guolinke @jameslamb @shiyu1994 @jmoralez
diff --git a/R-package/R/callback.R b/R-package/R/callback.R
@@ -24,7 +24,7 @@ CB_ENV <- R6::R6Class(
 )
 
 # Format the evaluation metric string
-format.eval.string <- function(eval_res, eval_err) {
+.format_eval_string <- function(eval_res, eval_err) {
 
   # Check for empty evaluation string
   if (is.null(eval_res) || length(eval_res) == 0L) {
@@ -40,7 +40,7 @@ format.eval.string <- function(eval_res, eval_err) {
 
 }
 
-merge.eval.string <- function(env) {
+.merge_eval_string <- function(env) {
 
   # Check length of evaluation list
   if (length(env$eval_list) <= 0L) {
@@ -63,7 +63,7 @@ merge.eval.string <- function(env) {
     }
 
     # Set error message
-    msg <- c(msg, format.eval.string(eval_res = env$eval_list[[j]], eval_err = eval_err))
+    msg <- c(msg, .format_eval_string(eval_res = env$eval_list[[j]], eval_err = eval_err))
 
   }
 
@@ -86,11 +86,11 @@ cb_print_evaluation <- function(period) {
       if ((i - 1L) %% period == 0L || is.element(i, c(env$begin_iteration, env$end_iteration))) {
 
         # Merge evaluation string
-        msg <- merge.eval.string(env = env)
+        msg <- .merge_eval_string(env = env)
 
         # Check if message is existing
         if (nchar(msg) > 0L) {
-          print(merge.eval.string(env = env))
+          print(.merge_eval_string(env = env))
         }
 
       }
@@ -270,7 +270,7 @@ cb_early_stop <- function(stopping_rounds, first_metric_only, verbose) {
 
           # Prepare to print if verbose
           if (verbose) {
-            best_msg[[i]] <<- as.character(merge.eval.string(env = env))
+            best_msg[[i]] <<- as.character(.merge_eval_string(env = env))
           }
 
         } else {

diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R
@@ -928,6 +928,7 @@ NULL
 #'   , metric = "l2"
 #'   , min_data = 1L
 #'   , learning_rate = 1.0
+#'   , num_threads = 2L
 #' )
 #' valids <- list(test = dtest)
 #' model <- lgb.train(
@@ -1086,7 +1087,10 @@ predict.lgb.Booster <- function(object,
 #' X <- as.matrix(mtcars[, -1L])
 #' y <- mtcars[, 1L]
 #' dtrain <- lgb.Dataset(X, label = y, params = list(max_bin = 5L))
-#' params <- list(min_data_in_leaf = 2L)
+#' params <- list(
+#'   min_data_in_leaf = 2L
+#'   , num_threads = 2L
+#' )
 #' model <- lgb.train(
 #'   params = params
 #'  , data = dtrain
@@ -1231,6 +1235,7 @@ summary.lgb.Booster <- function(object, ...) {
 #'   , metric = "l2"
 #'   , min_data = 1L
 #'   , learning_rate = 1.0
+#'   , num_threads = 2L
 #' )
 #' valids <- list(test = dtest)
 #' model <- lgb.train(
@@ -1296,6 +1301,7 @@ lgb.load <- function(filename = NULL, model_str = NULL) {
 #'   , metric = "l2"
 #'   , min_data = 1L
 #'   , learning_rate = 1.0
+#'   , num_threads = 2L
 #' )
 #' valids <- list(test = dtest)
 #' model <- lgb.train(
@@ -1351,6 +1357,7 @@ lgb.save <- function(booster, filename, num_iteration = NULL) {
 #'   , metric = "l2"
 #'   , min_data = 1L
 #'   , learning_rate = 1.0
+#'   , num_threads = 2L
 #' )
 #' valids <- list(test = dtest)
 #' model <- lgb.train(
@@ -1401,6 +1408,7 @@ lgb.dump <- function(booster, num_iteration = NULL) {
 #'   , metric = "l2"
 #'   , min_data = 1L
 #'   , learning_rate = 1.0
+#'   , num_threads = 2L
 #' )
 #' valids <- list(test = dtest)
 #' model <- lgb.train(

diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R
@@ -494,11 +494,10 @@ Dataset <- R6::R6Class(
         if (info_len > 0L) {
 
           # Get back fields
-          ret <- NULL
-          ret <- if (field_name == "group") {
-            integer(info_len)
+          if (field_name == "group") {
+            ret <- integer(info_len)
           } else {
-            numeric(info_len)
+            ret <- numeric(info_len)
           }
 
           .Call(

diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R
@@ -98,8 +98,6 @@ Predictor <- R6::R6Class(
         start_iteration <- 0L
       }
 
-      num_row <- 0L
-
       # Check if data is a file name and not a matrix
       if (identical(class(data), "character") && length(data) == 1L) {
 

diff --git a/R-package/R/lgb.convert_with_rules.R b/R-package/R/lgb.convert_with_rules.R
@@ -116,10 +116,6 @@ lgb.convert_with_rules <- function(data, rules = NULL) {
 
     column_classes <- .get_column_classes(df = data)
 
-    is_char <- which(column_classes == "character")
-    is_factor <- which(column_classes == "factor")
-    is_logical <- which(column_classes == "logical")
-
     is_data_table <- data.table::is.data.table(x = data)
     is_data_frame <- is.data.frame(data)
 

diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R
@@ -59,6 +59,7 @@ CVBooster <- R6::R6Class(
 #'   , metric = "l2"
 #'   , min_data = 1L
 #'   , learning_rate = 1.0
+#'   , num_threads = 2L
 #' )
 #' model <- lgb.cv(
 #'   params = params
@@ -224,8 +225,6 @@ lgb.cv <- function(params = list()
       stop(sQuote("folds"), " must be a list with 2 or more elements that are vectors of indices for each CV-fold")
     }
 
-    nfold <- length(folds)
-
   } else {
 
     if (nfold <= 1L) {

diff --git a/R-package/R/lgb.importance.R b/R-package/R/lgb.importance.R
@@ -24,6 +24,7 @@
 #'   , max_depth = -1L
 #'   , min_data_in_leaf = 1L
 #'   , min_sum_hessian_in_leaf = 1.0
+#'   , num_threads = 2L
 #' )
 #' model <- lgb.train(
 #'     params = params

diff --git a/R-package/R/lgb.interprete.R b/R-package/R/lgb.interprete.R
@@ -35,6 +35,7 @@
 #'     , max_depth = -1L
 #'     , min_data_in_leaf = 1L
 #'     , min_sum_hessian_in_leaf = 1.0
+#'     , num_threads = 2L
 #' )
 #' model <- lgb.train(
 #'     params = params
@@ -71,7 +72,9 @@ lgb.interprete <- function(model,
   leaf_index_dt <- data.table::as.data.table(x = pred_mat)
   leaf_index_mat_list <- lapply(
     X = leaf_index_dt
-    , FUN = function(x) matrix(x, ncol = num_class, byrow = TRUE)
+    , FUN = matrix
+    , ncol = num_class
+    , byrow = TRUE
   )
 
   # Get list of trees

diff --git a/R-package/R/lgb.model.dt.tree.R b/R-package/R/lgb.model.dt.tree.R
@@ -40,6 +40,7 @@
 #'   , max_depth = -1L
 #'   , min_data_in_leaf = 1L
 #'   , min_sum_hessian_in_leaf = 1.0
+#'   , num_threads = 2L
 #' )
 #' model <- lgb.train(params, dtrain, 10L)
 #'

diff --git a/R-package/R/lgb.plot.importance.R b/R-package/R/lgb.plot.importance.R
@@ -28,6 +28,7 @@
 #'     , learning_rate = 0.1
 #'     , min_data_in_leaf = 1L
 #'     , min_sum_hessian_in_leaf = 1.0
+#'     , num_threads = 2L
 #' )
 #'
 #' model <- lgb.train(

diff --git a/R-package/R/lgb.plot.interpretation.R b/R-package/R/lgb.plot.interpretation.R
@@ -39,6 +39,7 @@
 #'   , max_depth = -1L
 #'   , min_data_in_leaf = 1L
 #'   , min_sum_hessian_in_leaf = 1.0
+#'   , num_threads = 2L
 #' )
 #' model <- lgb.train(
 #'   params = params

diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R
@@ -23,7 +23,9 @@
 #'   , agaricus.train$label
 #'   , params = list(objective = "binary")
 #'   , nrounds = 5L
-#'   , verbose = 0)
+#'   , verbose = 0
+#'   , num_threads = 2L
+#' )
 #' fname <- tempfile(fileext="rds")
 #' saveRDS(model, fname)
 #'