From e8c316f46bf1b0e44fa12cb597c76ce70bf371c7 Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Mon, 7 Aug 2023 01:02:17 +0200
Subject: [PATCH 01/13] Fix single row prediction performance in a
 multi-threaded environment

Fixes #6021

- Store all resources that are reused across single-row predictions in the dedicated `SingleRowPredictor` (aka `FastConfig`)
- Use that instead of resources in the `Booster` when doing single row predictions to avoid having to lock the `Booster` exclusively.
- A FastConfig being alive now takes a shared lock on the booster (it was likely very incorrect to mutate the booster while this object was already built anyway)
---
 src/c_api.cpp | 145 +++++++++++++++++++++++++++++---------------------
 1 file changed, 85 insertions(+), 60 deletions(-)

diff --git a/src/c_api.cpp b/src/c_api.cpp
index 442247d7a9dd..a89fd99108d5 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -58,12 +58,12 @@ yamc::shared_lock<yamc::alternate::shared_mutex> lock(&mtx);
 const int PREDICTOR_TYPES = 4;
 
 // Single row predictor to abstract away caching logic
-class SingleRowPredictor {
+class SingleRowPredictorInner {
  public:
   PredictFunction predict_function;
   int64_t num_pred_in_one_row;
 
-  SingleRowPredictor(int predict_type, Boosting* boosting, const Config& config, int start_iter, int num_iter) {
+  SingleRowPredictorInner(int predict_type, Boosting* boosting, const Config& config, int start_iter, int num_iter) {
     bool is_predict_leaf = false;
     bool is_raw_score = false;
     bool predict_contrib = false;
@@ -85,7 +85,7 @@ class SingleRowPredictor {
     num_total_model_ = boosting->NumberOfTotalModel();
   }
 
-  ~SingleRowPredictor() {}
+  ~SingleRowPredictorInner() {}
 
   bool IsPredictorEqual(const Config& config, int iter, Boosting* boosting) {
     return early_stop_ == config.pred_early_stop &&
@@ -104,6 +104,59 @@ class SingleRowPredictor {
   int num_total_model_;
 };
 
+/*!
+ * \brief Object to store resources meant for single-row Fast Predict methods.
+ * 
+ * For legacy reasons this is called `FastConfig` in the public C API.
+ *
+ * Meant to be used by the *Fast* predict methods only.
+ * It stores the configuration and prediction resources for reuse across predictions.
+ */
+struct SingleRowPredictor {
+ public:
+  SingleRowPredictor(yamc::alternate::shared_mutex *booster_mutex,
+             const char *parameters,
+             const int data_type,
+             const int32_t num_cols,
+             int predict_type,
+             Boosting *boosting,
+             int start_iter,
+             int num_iter) : booster_shared_lock(booster_mutex), config(Config::Str2Map(parameters)), data_type(data_type), num_cols(num_cols), single_row_predictor_inner(predict_type, boosting, config, start_iter, num_iter) {
+    if (!config.predict_disable_shape_check && num_cols != boosting->MaxFeatureIdx() + 1) {
+      Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n"\
+                 "You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", num_cols, boosting->MaxFeatureIdx() + 1);
+    }
+  }
+
+  void Predict(std::function<std::vector<std::pair<int, double>>(int row_idx)> get_row_fun,
+               double* out_result, int64_t* out_len) const {
+    UNIQUE_LOCK(single_row_predictor_mutex)
+
+    auto one_row = get_row_fun(0);
+    single_row_predictor_inner.predict_function(one_row, out_result);
+
+    *out_len = single_row_predictor_inner.num_pred_in_one_row;
+  }
+
+ private:
+  // Prevent the booster from being modified while we have a predictor relying on it
+  yamc::shared_lock<yamc::alternate::shared_mutex> booster_shared_lock;
+
+ public:
+  Config config;
+  const int data_type;
+  const int32_t num_cols;
+ private:
+  SingleRowPredictorInner single_row_predictor_inner;
+
+  // If several threads try to predict at the same time using the same SingleRowPredictor
+  // we want them to still provide correct values, so the mutex is necessary due to the shared
+  // resources in the predictor.
+  // However the recommended approach is to instantiate one SingleRowPredictor per thread,
+  // to avoid contention here.
+  mutable yamc::alternate::shared_mutex single_row_predictor_mutex;
+};
+
 class Booster {
  public:
   explicit Booster(const char* filename) {
@@ -373,15 +426,21 @@ class Booster {
     boosting_->RollbackOneIter();
   }
 
-  void SetSingleRowPredictor(int start_iteration, int num_iteration, int predict_type, const Config& config) {
+  void SetSingleRowPredictorInner(int start_iteration, int num_iteration, int predict_type, const Config& config) {
       UNIQUE_LOCK(mutex_)
       if (single_row_predictor_[predict_type].get() == nullptr ||
           !single_row_predictor_[predict_type]->IsPredictorEqual(config, num_iteration, boosting_.get())) {
-        single_row_predictor_[predict_type].reset(new SingleRowPredictor(predict_type, boosting_.get(),
+        single_row_predictor_[predict_type].reset(new SingleRowPredictorInner(predict_type, boosting_.get(),
                                                                          config, start_iteration, num_iteration));
       }
   }
 
+  std::unique_ptr<SingleRowPredictor> InitSingleRowPredictor(int predict_type, int start_iteration, int num_iteration, int data_type, int32_t num_cols, const char *parameters) {
+    return std::unique_ptr<SingleRowPredictor>(new SingleRowPredictor(
+      &mutex_, parameters, data_type, num_cols, predict_type, boosting_.get(), start_iteration, num_iteration
+    ));
+  }
+
   void PredictSingleRow(int predict_type, int ncol,
                std::function<std::vector<std::pair<int, double>>(int row_idx)> get_row_fun,
                const Config& config,
@@ -814,7 +873,7 @@ class Booster {
  private:
   const Dataset* train_data_;
   std::unique_ptr<Boosting> boosting_;
-  std::unique_ptr<SingleRowPredictor> single_row_predictor_[PREDICTOR_TYPES];
+  std::unique_ptr<SingleRowPredictorInner> single_row_predictor_[PREDICTOR_TYPES];
 
   /*! \brief All configs */
   Config config_;
@@ -847,6 +906,7 @@ using LightGBM::Log;
 using LightGBM::Network;
 using LightGBM::Random;
 using LightGBM::ReduceScatterFunction;
+using LightGBM::SingleRowPredictor;
 
 // some help functions used to convert data
 
@@ -2053,35 +2113,11 @@ int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
   API_END();
 }
 
-/*!
- * \brief Object to store resources meant for single-row Fast Predict methods.
- *
- * Meant to be used as a basic struct by the *Fast* predict methods only.
- * It stores the configuration resources for reuse during prediction.
- *
- * Even the row function is stored. We score the instance at the same memory
- * address all the time. One just replaces the feature values at that address
- * and scores again with the *Fast* methods.
- */
-struct FastConfig {
-  FastConfig(Booster *const booster_ptr,
-             const char *parameter,
-             const int predict_type_,
-             const int data_type_,
-             const int32_t num_cols) : booster(booster_ptr), predict_type(predict_type_), data_type(data_type_), ncol(num_cols) {
-    config.Set(Config::Str2Map(parameter));
-  }
 
-  Booster* const booster;
-  Config config;
-  const int predict_type;
-  const int data_type;
-  const int32_t ncol;
-};
 
 int LGBM_FastConfigFree(FastConfigHandle fastConfig) {
   API_BEGIN();
-  delete reinterpret_cast<FastConfig*>(fastConfig);
+  delete reinterpret_cast<SingleRowPredictor*>(fastConfig);
   API_END();
 }
 
@@ -2229,7 +2265,7 @@ int LGBM_BoosterPredictForCSRSingleRow(BoosterHandle handle,
   OMP_SET_NUM_THREADS(config.num_threads);
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowFunctionFromCSR<int>(indptr, indptr_type, indices, data, data_type, nindptr, nelem);
-  ref_booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, config);
+  ref_booster->SetSingleRowPredictorInner(start_iteration, num_iteration, predict_type, config);
   ref_booster->PredictSingleRow(predict_type, static_cast<int32_t>(num_col), get_row_fun, config, out_result, out_len);
   API_END();
 }
@@ -2249,18 +2285,14 @@ int LGBM_BoosterPredictForCSRSingleRowFastInit(BoosterHandle handle,
     Log::Fatal("The number of columns should be smaller than INT32_MAX.");
   }
 
-  auto fastConfig_ptr = std::unique_ptr<FastConfig>(new FastConfig(
-    reinterpret_cast<Booster*>(handle),
-    parameter,
-    predict_type,
-    data_type,
-    static_cast<int32_t>(num_col)));
+  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
 
-  OMP_SET_NUM_THREADS(fastConfig_ptr->config.num_threads);
+  std::unique_ptr<SingleRowPredictor> single_row_predictor =
+    ref_booster->InitSingleRowPredictor(start_iteration, num_iteration, predict_type, data_type, static_cast<int32_t>(num_col), parameter);
 
-  fastConfig_ptr->booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, fastConfig_ptr->config);
+  OMP_SET_NUM_THREADS(single_row_predictor->config.num_threads);
 
-  *out_fastConfig = fastConfig_ptr.release();
+  *out_fastConfig = single_row_predictor.release();
   API_END();
 }
 
@@ -2274,10 +2306,9 @@ int LGBM_BoosterPredictForCSRSingleRowFast(FastConfigHandle fastConfig_handle,
                                            int64_t* out_len,
                                            double* out_result) {
   API_BEGIN();
-  FastConfig *fastConfig = reinterpret_cast<FastConfig*>(fastConfig_handle);
-  auto get_row_fun = RowFunctionFromCSR<int>(indptr, indptr_type, indices, data, fastConfig->data_type, nindptr, nelem);
-  fastConfig->booster->PredictSingleRow(fastConfig->predict_type, fastConfig->ncol,
-                                        get_row_fun, fastConfig->config, out_result, out_len);
+  SingleRowPredictor *single_row_predictor = reinterpret_cast<SingleRowPredictor*>(fastConfig_handle);
+  auto get_row_fun = RowFunctionFromCSR<int>(indptr, indptr_type, indices, data, single_row_predictor->data_type, nindptr, nelem);
+  single_row_predictor->Predict(get_row_fun, out_result, out_len);
   API_END();
 }
 
@@ -2392,7 +2423,7 @@ int LGBM_BoosterPredictForMatSingleRow(BoosterHandle handle,
   OMP_SET_NUM_THREADS(config.num_threads);
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
   auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, ncol, data_type, is_row_major);
-  ref_booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, config);
+  ref_booster->SetSingleRowPredictorInner(start_iteration, num_iteration, predict_type, config);
   ref_booster->PredictSingleRow(predict_type, ncol, get_row_fun, config, out_result, out_len);
   API_END();
 }
@@ -2406,18 +2437,14 @@ int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
                                                const char* parameter,
                                                FastConfigHandle *out_fastConfig) {
   API_BEGIN();
-  auto fastConfig_ptr = std::unique_ptr<FastConfig>(new FastConfig(
-    reinterpret_cast<Booster*>(handle),
-    parameter,
-    predict_type,
-    data_type,
-    ncol));
+  Booster* ref_booster = reinterpret_cast<Booster*>(handle);
 
-  OMP_SET_NUM_THREADS(fastConfig_ptr->config.num_threads);
+  std::unique_ptr<SingleRowPredictor> single_row_predictor =
+    ref_booster->InitSingleRowPredictor(start_iteration, num_iteration, predict_type, data_type, ncol, parameter);
 
-  fastConfig_ptr->booster->SetSingleRowPredictor(start_iteration, num_iteration, predict_type, fastConfig_ptr->config);
+  OMP_SET_NUM_THREADS(single_row_predictor->config.num_threads);
 
-  *out_fastConfig = fastConfig_ptr.release();
+  *out_fastConfig = single_row_predictor.release();
   API_END();
 }
 
@@ -2426,12 +2453,10 @@ int LGBM_BoosterPredictForMatSingleRowFast(FastConfigHandle fastConfig_handle,
                                            int64_t* out_len,
                                            double* out_result) {
   API_BEGIN();
-  FastConfig *fastConfig = reinterpret_cast<FastConfig*>(fastConfig_handle);
+  SingleRowPredictor *single_row_predictor = reinterpret_cast<SingleRowPredictor*>(fastConfig_handle);
   // Single row in row-major format:
-  auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, fastConfig->ncol, fastConfig->data_type, 1);
-  fastConfig->booster->PredictSingleRow(fastConfig->predict_type, fastConfig->ncol,
-                                        get_row_fun, fastConfig->config,
-                                        out_result, out_len);
+  auto get_row_fun = RowPairFunctionFromDenseMatric(data, 1, single_row_predictor->num_cols, single_row_predictor->data_type, 1);
+  single_row_predictor->Predict(get_row_fun, out_result, out_len);
   API_END();
 }
 

From fe31d4ecda2c857128128688c3dbd2eaa0020d04 Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Mon, 7 Aug 2023 01:47:43 +0200
Subject: [PATCH 02/13] fix missing file change

---
 include/LightGBM/config.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index e01578396259..3749cd464843 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -33,6 +33,10 @@ const int kDefaultNumLeaves = 31;
 
 struct Config {
  public:
+  Config() {}
+  Config(std::unordered_map<std::string, std::string> parameters_map) {
+    Set(parameters_map);
+  }
   std::string ToString() const;
   /*!
   * \brief Get string value by specific name of key

From feaf3dc1981b8ec8dd98d35ac22dcb40a89acadb Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Mon, 7 Aug 2023 12:06:02 +0200
Subject: [PATCH 03/13] fix lint

---
 include/LightGBM/config.h | 2 +-
 src/c_api.cpp             | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h
index 3749cd464843..801921764271 100644
--- a/include/LightGBM/config.h
+++ b/include/LightGBM/config.h
@@ -34,7 +34,7 @@ const int kDefaultNumLeaves = 31;
 struct Config {
  public:
   Config() {}
-  Config(std::unordered_map<std::string, std::string> parameters_map) {
+  explicit Config(std::unordered_map<std::string, std::string> parameters_map) {
     Set(parameters_map);
   }
   std::string ToString() const;
diff --git a/src/c_api.cpp b/src/c_api.cpp
index a89fd99108d5..fa42adab4072 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -146,6 +146,7 @@ struct SingleRowPredictor {
   Config config;
   const int data_type;
   const int32_t num_cols;
+
  private:
   SingleRowPredictorInner single_row_predictor_inner;
 
@@ -437,8 +438,7 @@ class Booster {
 
   std::unique_ptr<SingleRowPredictor> InitSingleRowPredictor(int predict_type, int start_iteration, int num_iteration, int data_type, int32_t num_cols, const char *parameters) {
     return std::unique_ptr<SingleRowPredictor>(new SingleRowPredictor(
-      &mutex_, parameters, data_type, num_cols, predict_type, boosting_.get(), start_iteration, num_iteration
-    ));
+      &mutex_, parameters, data_type, num_cols, predict_type, boosting_.get(), start_iteration, num_iteration));
   }
 
   void PredictSingleRow(int predict_type, int ncol,

From f41755b42b37f16c5dfc43c55cdc78ef41c463db Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Mon, 7 Aug 2023 13:25:44 +0200
Subject: [PATCH 04/13] check whether freeze is due to booster shared lock
 being held by the SingleRowPredictor

---
 src/c_api.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/c_api.cpp b/src/c_api.cpp
index fa42adab4072..1da7d40cb557 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -121,7 +121,7 @@ struct SingleRowPredictor {
              int predict_type,
              Boosting *boosting,
              int start_iter,
-             int num_iter) : booster_shared_lock(booster_mutex), config(Config::Str2Map(parameters)), data_type(data_type), num_cols(num_cols), single_row_predictor_inner(predict_type, boosting, config, start_iter, num_iter) {
+             int num_iter) : config(Config::Str2Map(parameters)), data_type(data_type), num_cols(num_cols), single_row_predictor_inner(predict_type, boosting, config, start_iter, num_iter), booster_mutex(booster_mutex) {
     if (!config.predict_disable_shape_check && num_cols != boosting->MaxFeatureIdx() + 1) {
       Log::Fatal("The number of features in data (%d) is not the same as it was in training data (%d).\n"\
                  "You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.", num_cols, boosting->MaxFeatureIdx() + 1);
@@ -131,6 +131,7 @@ struct SingleRowPredictor {
   void Predict(std::function<std::vector<std::pair<int, double>>(int row_idx)> get_row_fun,
                double* out_result, int64_t* out_len) const {
     UNIQUE_LOCK(single_row_predictor_mutex)
+    yamc::shared_lock<yamc::alternate::shared_mutex> booster_shared_lock(booster_mutex);
 
     auto one_row = get_row_fun(0);
     single_row_predictor_inner.predict_function(one_row, out_result);
@@ -138,10 +139,6 @@ struct SingleRowPredictor {
     *out_len = single_row_predictor_inner.num_pred_in_one_row;
   }
 
- private:
-  // Prevent the booster from being modified while we have a predictor relying on it
-  yamc::shared_lock<yamc::alternate::shared_mutex> booster_shared_lock;
-
  public:
   Config config;
   const int data_type;
@@ -150,6 +147,9 @@ struct SingleRowPredictor {
  private:
   SingleRowPredictorInner single_row_predictor_inner;
 
+  // Prevent the booster from being modified while we have a predictor relying on it during prediction
+  yamc::alternate::shared_mutex *booster_mutex;
+
   // If several threads try to predict at the same time using the same SingleRowPredictor
   // we want them to still provide correct values, so the mutex is necessary due to the shared
   // resources in the predictor.

From c52a7d79e7d9871177b8717f0e7c1ccac8be6e30 Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Tue, 8 Aug 2023 00:46:49 +0200
Subject: [PATCH 05/13] what you get for having everything as an int

---
 src/c_api.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/c_api.cpp b/src/c_api.cpp
index 1da7d40cb557..124e4343e15a 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -2440,7 +2440,7 @@ int LGBM_BoosterPredictForMatSingleRowFastInit(BoosterHandle handle,
   Booster* ref_booster = reinterpret_cast<Booster*>(handle);
 
   std::unique_ptr<SingleRowPredictor> single_row_predictor =
-    ref_booster->InitSingleRowPredictor(start_iteration, num_iteration, predict_type, data_type, ncol, parameter);
+    ref_booster->InitSingleRowPredictor(predict_type, start_iteration, num_iteration, data_type, ncol, parameter);
 
   OMP_SET_NUM_THREADS(single_row_predictor->config.num_threads);
 

From 6d949a0b910b1a6b633b228cc7947c2332d2bb95 Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Thu, 11 Jan 2024 22:17:16 +0100
Subject: [PATCH 06/13] Add TODO about FastConfig naming needing to be updated

As discussed here:
https://github.com/microsoft/LightGBM/pull/6024#discussion_r1332536566
---
 src/c_api.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/c_api.cpp b/src/c_api.cpp
index 124e4343e15a..f74bffe4bc92 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -2113,8 +2113,8 @@ int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
   API_END();
 }
 
-
-
+// TODO in future versions of LightGBM, public API named around `FastConfig` should be made named around
+// `SingleRowPredictor`, because it is specific to single row prediction, and doesn't actually hold only config.
 int LGBM_FastConfigFree(FastConfigHandle fastConfig) {
   API_BEGIN();
   delete reinterpret_cast<SingleRowPredictor*>(fastConfig);

From 1fcbc3fd7fed762e1b753bdccbabebf644d63667 Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Fri, 12 Jan 2024 00:49:55 +0100
Subject: [PATCH 07/13] Add a test showcasing fast single row prediction in
 multi-thread

Ensures consistency between that and LGBM_BoosterPredictForMat
---
 tests/cpp_tests/test_single_row.cpp | 156 ++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 tests/cpp_tests/test_single_row.cpp

diff --git a/tests/cpp_tests/test_single_row.cpp b/tests/cpp_tests/test_single_row.cpp
new file mode 100644
index 000000000000..14dc1aa55488
--- /dev/null
+++ b/tests/cpp_tests/test_single_row.cpp
@@ -0,0 +1,156 @@
+/*!
+ * Copyright (c) 2022 Microsoft Corporation. All rights reserved.
+ * Licensed under the MIT License. See LICENSE file in the project root for license information.
+ */
+
+#include <gtest/gtest.h>
+#include <testutils.h>
+#include <LightGBM/utils/byte_buffer.h>
+#include <LightGBM/utils/log.h>
+#include <LightGBM/c_api.h>
+#include <LightGBM/dataset.h>
+
+#include <iostream>
+#include <fstream>
+
+using LightGBM::ByteBuffer;
+using LightGBM::Dataset;
+using LightGBM::Log;
+using LightGBM::TestUtils;
+
+TEST(SingleRow, JustWorks) {
+    // Load some test data
+    int result;
+
+    DatasetHandle train_dataset;
+    result = TestUtils::LoadDatasetFromExamples("binary_classification/binary.train", "max_bin=15", &train_dataset);
+    EXPECT_EQ(0, result) << "LoadDatasetFromExamples train result code: " << result;
+
+    DatasetHandle test_dataset;
+    result = TestUtils::LoadDatasetFromExamples("binary_classification/binary.test", "max_bin=15", &test_dataset);
+    EXPECT_EQ(0, result) << "LoadDatasetFromExamples test result code: " << result;
+
+    BoosterHandle booster_handle;
+    result = LGBM_BoosterCreate(train_dataset, "app=binary metric=auc num_leaves=31 verbose=0", &booster_handle);
+    EXPECT_EQ(0, result) << "LGBM_BoosterCreate result code: " << result;
+
+    for (int i = 0; i < 51; i++) {
+        int is_finished;
+        result = LGBM_BoosterUpdateOneIter(
+            booster_handle,
+            &is_finished);
+        EXPECT_EQ(0, result) << "LGBM_BoosterUpdateOneIter result code: " << result;
+    }
+
+    int n_features;
+    result = LGBM_BoosterGetNumFeature(
+        booster_handle,
+        &n_features);
+    EXPECT_EQ(0, result) << "LGBM_BoosterGetNumFeature result code: " << result;
+
+    // Run a single row prediction and compare with regular Mat prediction:
+    int64_t output_size;
+    result = LGBM_BoosterCalcNumPredict(
+        booster_handle,
+        1,
+        C_API_PREDICT_NORMAL, // predict_type
+        0,                    // start_iteration
+        -1,                   // num_iteration
+        &output_size);
+    EXPECT_EQ(0, result) << "LGBM_BoosterCalcNumPredict result code: " << result;
+
+    std::ifstream test_file("../examples/binary_classification/binary.test");
+    std::vector<double> test;
+    double x;
+    int test_set_size = 0;
+    while (test_file >> x) {
+        if (test_set_size % (n_features + 1) == 0) {
+            // Drop the result from the dataset, we only care about checking that prediction results are equal
+            // in both cases
+            test_file >> x;
+            test_set_size++;
+        }
+        test.push_back(x);
+        test_set_size++;
+    }
+    EXPECT_EQ(test_set_size % (n_features + 1), 0) << "Test size mismatch with dataset size (%)";
+    test_set_size /= (n_features + 1);
+    EXPECT_EQ(test_set_size, 500) << "Improperly parsed test file (test_set_size)";
+    EXPECT_EQ(test.size(), test_set_size * n_features) << "Improperly parsed test file (test len)";
+
+    std::vector<double> mat_output(output_size * test_set_size, -1);
+    int64_t written;
+    result = LGBM_BoosterPredictForMat(
+        booster_handle,
+        &test[0],
+        C_API_DTYPE_FLOAT64,
+        test_set_size,        // nrow
+        n_features,           // ncol
+        1,                    // is_row_major
+        C_API_PREDICT_NORMAL, // predict_type
+        0,                    // start_iteration
+        -1,                   // num_iteration
+        "",
+        &written,
+        &mat_output[0]);
+    EXPECT_EQ(0, result) << "LGBM_BoosterPredictForMat result code: " << result;
+
+    // Now let's run with the single row fast prediction API:
+    int n_threads = 10;
+    FastConfigHandle fast_configs[n_threads];
+    for (int i = 0; i < n_threads; i++) {
+        result = LGBM_BoosterPredictForMatSingleRowFastInit(
+            booster_handle,
+            C_API_PREDICT_NORMAL, // predict_type
+            0,                    // start_iteration
+            -1,                   // num_iteration
+            C_API_DTYPE_FLOAT64,
+            n_features,
+            "",
+            &fast_configs[i]);
+        EXPECT_EQ(0, result) << "LGBM_BoosterPredictForMatSingleRowFastInit result code: " << result;
+    }
+
+    std::vector<double> single_row_output(output_size * test_set_size, -1);
+    std::vector<std::thread> threads(n_threads);
+    int batch_size = (test_set_size + n_threads - 1) / n_threads; // round up
+    for (int i = 0; i < n_threads; i++) {
+        threads[i] = std::thread(
+            [
+                i, batch_size, test_set_size, output_size, n_features,
+                test = &test[0], fast_configs = &fast_configs[0], single_row_output = &single_row_output[0]
+            ](){
+                int result;
+                int64_t written;
+                for (int j = i * batch_size; j < std::min((i + 1) * batch_size, test_set_size); j++) {
+                    result = LGBM_BoosterPredictForMatSingleRowFast(
+                        fast_configs[i],
+                        &test[j * n_features],
+                        &written,
+                        &single_row_output[j * output_size]);
+                    EXPECT_EQ(0, result) << "LGBM_BoosterPredictForMatSingleRowFast result code: " << result;
+                    EXPECT_EQ(written, output_size) << "LGBM_BoosterPredictForMatSingleRowFast unexpected written output size";
+                }
+            });
+    }
+    for (std::thread &t : threads) {
+        t.join();
+    }
+
+    EXPECT_EQ(single_row_output, mat_output) << "LGBM_BoosterPredictForMatSingleRowFast output mismatch with LGBM_BoosterPredictForMat";
+
+    // Free all:
+    for (int i = 0; i < n_threads; i++) {
+        result = LGBM_FastConfigFree(fast_configs[i]);
+        EXPECT_EQ(0, result) << "LGBM_FastConfigFree result code: " << result;
+    }
+
+    result = LGBM_BoosterFree(booster_handle);
+    EXPECT_EQ(0, result) << "LGBM_BoosterFree result code: " << result;
+
+    result = LGBM_DatasetFree(train_dataset);
+    EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
+
+    result = LGBM_DatasetFree(test_dataset);
+    EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
+}

From 4a80c8c378db1be7413fbedf142136f7241ce8b7 Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Fri, 12 Jan 2024 01:04:32 +0100
Subject: [PATCH 08/13] Add a workaround for #6142

---
 src/c_api.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/c_api.cpp b/src/c_api.cpp
index f74bffe4bc92..8562815432ef 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -437,6 +437,12 @@ class Booster {
   }
 
   std::unique_ptr<SingleRowPredictor> InitSingleRowPredictor(int predict_type, int start_iteration, int num_iteration, int data_type, int32_t num_cols, const char *parameters) {
+    // Workaround https://github.com/microsoft/LightGBM/issues/6142 by locking here
+    // This is only a workaround because if predictors are initialized differently it may still behave incorrectly,
+    // and because multiple racing Predictor initializations through LGBM_BoosterPredictForMat suffers from that same issue of Predictor init writing things in the booster.
+    // Once #6142 is fixed (predictor doesn't write in the Booster as should have been the case since 1c35c3b9ede9adab8ccc5fd7b4b2b6af188a79f0), this line can be removed.
+    UNIQUE_LOCK(mutex_)
+
     return std::unique_ptr<SingleRowPredictor>(new SingleRowPredictor(
       &mutex_, parameters, data_type, num_cols, predict_type, boosting_.get(), start_iteration, num_iteration));
   }

From 49d721741f726cc9f9ddcfd3b260d6190505f476 Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Fri, 12 Jan 2024 01:27:13 +0100
Subject: [PATCH 09/13] add more cleanup comment

---
 src/c_api.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/c_api.cpp b/src/c_api.cpp
index 8562815432ef..725323132372 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -2121,6 +2121,9 @@ int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
 
 // TODO in future versions of LightGBM, public API named around `FastConfig` should be made named around
 // `SingleRowPredictor`, because it is specific to single row prediction, and doesn't actually hold only config.
+// At the same time, one should consider removing the old non-fast single row public API that stores its Predictor
+// in the Booster, because that will enable removing these Predictors from the Booster, and associated initialization
+// code.
 int LGBM_FastConfigFree(FastConfigHandle fastConfig) {
   API_BEGIN();
   delete reinterpret_cast<SingleRowPredictor*>(fastConfig);

From 1a4f1aaa24614618cca2ed1080f49c655a1f4988 Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Fri, 12 Jan 2024 01:58:12 +0100
Subject: [PATCH 10/13] cleanup some unnecessary includes in the test

---
 tests/cpp_tests/test_single_row.cpp | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/tests/cpp_tests/test_single_row.cpp b/tests/cpp_tests/test_single_row.cpp
index 14dc1aa55488..ade8d9c82a5f 100644
--- a/tests/cpp_tests/test_single_row.cpp
+++ b/tests/cpp_tests/test_single_row.cpp
@@ -5,17 +5,11 @@
 
 #include <gtest/gtest.h>
 #include <testutils.h>
-#include <LightGBM/utils/byte_buffer.h>
-#include <LightGBM/utils/log.h>
 #include <LightGBM/c_api.h>
-#include <LightGBM/dataset.h>
 
 #include <iostream>
 #include <fstream>
 
-using LightGBM::ByteBuffer;
-using LightGBM::Dataset;
-using LightGBM::Log;
 using LightGBM::TestUtils;
 
 TEST(SingleRow, JustWorks) {
@@ -26,10 +20,6 @@ TEST(SingleRow, JustWorks) {
     result = TestUtils::LoadDatasetFromExamples("binary_classification/binary.train", "max_bin=15", &train_dataset);
     EXPECT_EQ(0, result) << "LoadDatasetFromExamples train result code: " << result;
 
-    DatasetHandle test_dataset;
-    result = TestUtils::LoadDatasetFromExamples("binary_classification/binary.test", "max_bin=15", &test_dataset);
-    EXPECT_EQ(0, result) << "LoadDatasetFromExamples test result code: " << result;
-
     BoosterHandle booster_handle;
     result = LGBM_BoosterCreate(train_dataset, "app=binary metric=auc num_leaves=31 verbose=0", &booster_handle);
     EXPECT_EQ(0, result) << "LGBM_BoosterCreate result code: " << result;
@@ -150,7 +140,4 @@ TEST(SingleRow, JustWorks) {
 
     result = LGBM_DatasetFree(train_dataset);
     EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
-
-    result = LGBM_DatasetFree(test_dataset);
-    EXPECT_EQ(0, result) << "LGBM_DatasetFree result code: " << result;
 }

From e00191a37e021c26530508c4a16d4d2bd6543c3a Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Fri, 12 Jan 2024 11:18:19 +0100
Subject: [PATCH 11/13] make windows cpp compiler happy

---
 tests/cpp_tests/test_single_row.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/cpp_tests/test_single_row.cpp b/tests/cpp_tests/test_single_row.cpp
index ade8d9c82a5f..c94262d71309 100644
--- a/tests/cpp_tests/test_single_row.cpp
+++ b/tests/cpp_tests/test_single_row.cpp
@@ -86,7 +86,7 @@ TEST(SingleRow, JustWorks) {
     EXPECT_EQ(0, result) << "LGBM_BoosterPredictForMat result code: " << result;
 
     // Now let's run with the single row fast prediction API:
-    int n_threads = 10;
+    const int n_threads = 10;
     FastConfigHandle fast_configs[n_threads];
     for (int i = 0; i < n_threads; i++) {
         result = LGBM_BoosterPredictForMatSingleRowFastInit(

From bfefc204e77ad12224e488c04c42a1ba23393713 Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Mon, 22 Jan 2024 19:02:51 +0100
Subject: [PATCH 12/13] hopefully make static analysis happy

---
 tests/cpp_tests/test_single_row.cpp | 38 ++++++++++++++---------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/tests/cpp_tests/test_single_row.cpp b/tests/cpp_tests/test_single_row.cpp
index c94262d71309..c14a681bea68 100644
--- a/tests/cpp_tests/test_single_row.cpp
+++ b/tests/cpp_tests/test_single_row.cpp
@@ -43,9 +43,9 @@ TEST(SingleRow, JustWorks) {
     result = LGBM_BoosterCalcNumPredict(
         booster_handle,
         1,
-        C_API_PREDICT_NORMAL, // predict_type
-        0,                    // start_iteration
-        -1,                   // num_iteration
+        C_API_PREDICT_NORMAL,  // predict_type
+        0,                     // start_iteration
+        -1,                    // num_iteration
         &output_size);
     EXPECT_EQ(0, result) << "LGBM_BoosterCalcNumPredict result code: " << result;
 
@@ -74,26 +74,26 @@ TEST(SingleRow, JustWorks) {
         booster_handle,
         &test[0],
         C_API_DTYPE_FLOAT64,
-        test_set_size,        // nrow
-        n_features,           // ncol
-        1,                    // is_row_major
-        C_API_PREDICT_NORMAL, // predict_type
-        0,                    // start_iteration
-        -1,                   // num_iteration
+        test_set_size,         // nrow
+        n_features,            // ncol
+        1,                     // is_row_major
+        C_API_PREDICT_NORMAL,  // predict_type
+        0,                     // start_iteration
+        -1,                    // num_iteration
         "",
         &written,
         &mat_output[0]);
     EXPECT_EQ(0, result) << "LGBM_BoosterPredictForMat result code: " << result;
 
     // Now let's run with the single row fast prediction API:
-    const int n_threads = 10;
-    FastConfigHandle fast_configs[n_threads];
-    for (int i = 0; i < n_threads; i++) {
+    const int kNThreads = 10;
+    FastConfigHandle fast_configs[kNThreads];
+    for (int i = 0; i < kNThreads; i++) {
         result = LGBM_BoosterPredictForMatSingleRowFastInit(
             booster_handle,
-            C_API_PREDICT_NORMAL, // predict_type
-            0,                    // start_iteration
-            -1,                   // num_iteration
+            C_API_PREDICT_NORMAL,  // predict_type
+            0,                     // start_iteration
+            -1,                    // num_iteration
             C_API_DTYPE_FLOAT64,
             n_features,
             "",
@@ -102,9 +102,9 @@ TEST(SingleRow, JustWorks) {
     }
 
     std::vector<double> single_row_output(output_size * test_set_size, -1);
-    std::vector<std::thread> threads(n_threads);
-    int batch_size = (test_set_size + n_threads - 1) / n_threads; // round up
-    for (int i = 0; i < n_threads; i++) {
+    std::vector<std::thread> threads(kNThreads);
+    int batch_size = (test_set_size + kNThreads - 1) / kNThreads;  // round up
+    for (int i = 0; i < kNThreads; i++) {
         threads[i] = std::thread(
             [
                 i, batch_size, test_set_size, output_size, n_features,
@@ -130,7 +130,7 @@ TEST(SingleRow, JustWorks) {
     EXPECT_EQ(single_row_output, mat_output) << "LGBM_BoosterPredictForMatSingleRowFast output mismatch with LGBM_BoosterPredictForMat";
 
     // Free all:
-    for (int i = 0; i < n_threads; i++) {
+    for (int i = 0; i < kNThreads; i++) {
         result = LGBM_FastConfigFree(fast_configs[i]);
         EXPECT_EQ(0, result) << "LGBM_FastConfigFree result code: " << result;
     }

From ff613adbae52f7b8b885559908423b04960a0cbf Mon Sep 17 00:00:00 2001
From: Thomas BESSOU <thomas.bessou@hotmail.fr>
Date: Wed, 24 Jan 2024 23:32:43 +0100
Subject: [PATCH 13/13] Turn TODO into a regular comment

---
 src/c_api.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/c_api.cpp b/src/c_api.cpp
index 725323132372..f2d709f9a798 100644
--- a/src/c_api.cpp
+++ b/src/c_api.cpp
@@ -2119,8 +2119,9 @@ int LGBM_BoosterCalcNumPredict(BoosterHandle handle,
   API_END();
 }
 
-// TODO in future versions of LightGBM, public API named around `FastConfig` should be made named around
+// Naming: In future versions of LightGBM, public API named around `FastConfig` should be made named around
 // `SingleRowPredictor`, because it is specific to single row prediction, and doesn't actually hold only config.
+// For now this is kept as `FastConfig` for backwards compatibility.
 // At the same time, one should consider removing the old non-fast single row public API that stores its Predictor
 // in the Booster, because that will enable removing these Predictors from the Booster, and associated initialization
 // code.