adjust buffer size, thread count

ggerganov · Sep 15, 2024 · cf0f60e
1 parent d813691
commit cf0f60e
Show file tree

Hide file tree

Showing 4 changed files with 13 additions and 10 deletions.
diff --git a/examples/mnist/mnist-common.cpp b/examples/mnist/mnist-common.cpp
@@ -530,7 +530,7 @@ mnist_eval_result mnist_model_eval(mnist_model & model, const float * images, co
 void mnist_model_train(mnist_model & model, const float * images, const float * labels, const int nex, const int nepoch, const float val_split) {
     const int64_t t_start_us = ggml_time_us();
 
-    struct ggml_cgraph * gf = ggml_new_graph_custom(model.ctx_compute, 16384, true); // Forward pass.
+    struct ggml_cgraph * gf = ggml_new_graph_custom(model.ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
     ggml_build_forward_expand(gf, model.loss);
 
     struct ggml_cgraph * gb_grad = ggml_graph_dup(model.ctx_compute, gf); // Backward pass, gradients.
@@ -634,7 +634,7 @@ void mnist_model_save(mnist_model & model, const std::string & fname) {
     struct ggml_context * ggml_ctx;
     {
         struct ggml_init_params params = {
-            /*.mem_size   =*/ model.size_weight,
+            /*.mem_size   =*/ 100 * 1024*1024,
             /*.mem_buffer =*/ NULL,
             /*.no_alloc   =*/ false,
         };

diff --git a/examples/mnist/mnist-common.h b/examples/mnist/mnist-common.h
@@ -49,9 +49,6 @@ struct mnist_model {
     struct ggml_tensor * dense_weight = nullptr;
     struct ggml_tensor * dense_bias   = nullptr;
 
-    static const size_t size_weight  = 100 *      1024*1024;
-    static const size_t size_compute =   1 * 1024*1024*1024;
-
     struct ggml_context * ctx_weight  = nullptr;
     struct ggml_context * ctx_compute = nullptr;
     ggml_backend_buffer_t buf_weight  = nullptr;
@@ -70,21 +67,25 @@ struct mnist_model {
         fprintf(stderr, "%s: using %s backend\n", __func__, backend_name.c_str());
         backend = ggml_backend_reg_init_backend(backend_index, nullptr);
         if (ggml_backend_is_cpu(backend)) {
-            ggml_backend_cpu_set_n_threads(backend, std::thread::hardware_concurrency());
+            const int ncores_logical = std::thread::hardware_concurrency();
+            ggml_backend_cpu_set_n_threads(backend, std::min(ncores_logical, (ncores_logical + 4)/2));
         }
 
         {
+            const size_t size_meta = 1024*ggml_tensor_overhead();
             struct ggml_init_params params = {
-                /*.mem_size   =*/ size_weight,
+                /*.mem_size   =*/ size_meta,
                 /*.mem_buffer =*/ nullptr,
                 /*.no_alloc   =*/ true,
             };
             ctx_weight = ggml_init(params);
         }
 
         {
+            // The compute context needs a total of 3 compute graphs: 1 forward pass + 2 backward passes (with and without the optimizer step).
+            const size_t size_meta = GGML_DEFAULT_GRAPH_SIZE*ggml_tensor_overhead() + 3*ggml_graph_overhead();
             struct ggml_init_params params = {
-                /*.mem_size   =*/ size_compute,
+                /*.mem_size   =*/ size_meta,
                 /*.mem_buffer =*/ nullptr,
                 /*.no_alloc   =*/ true,
             };

diff --git a/examples/mnist/mnist-eval.cpp b/examples/mnist/mnist-eval.cpp
@@ -46,7 +46,9 @@ int main(int argc, char ** argv) {
     mnist_eval_result result_eval;
 
     if (backend == "CPU") {
-        result_eval = mnist_graph_eval(argv[1], images.data(), labels.data(), MNIST_NTEST, std::thread::hardware_concurrency());
+        const int ncores_logical = std::thread::hardware_concurrency();
+        result_eval = mnist_graph_eval(
+            argv[1], images.data(), labels.data(), MNIST_NTEST, std::min(ncores_logical, (ncores_logical + 4)/2));
         if (result_eval.success) {
             fprintf(stdout, "%s: predicted digit is %d\n", __func__, result_eval.pred[iex]);
 

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -3229,7 +3229,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {3,1,2,1}, 1, 0, 1));
     test_cases.emplace_back(new test_conv_transpose_1d({2,1,1,1}, {3,1,1,1}, 1, 0, 1));
 
-    for (const int64_t & ne3 : {1, 3}) { // CUDA only supports ne3 == 1
+    for (const int64_t & ne3 : {1, 3}) { // CUDA backwards pass only supports ne3 == 1
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 1, 1, 1}));
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {2, 1, 1, 1}));
         test_cases.emplace_back(new test_repeat(GGML_TYPE_F32, {10, 5, 4, ne3}, {1, 2, 1, 1}));