ggml : remove ggml_cplan + rework ggml_cgraph #9431

Closed · wants to merge 2 commits
28 changes: 12 additions & 16 deletions examples/baby-llama/baby-llama.cpp
@@ -17,17 +17,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 constexpr float rms_norm_eps = 5e-6f;
 #endif
 
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 static struct ggml_tensor * randomize_tensor(
     struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
 ) {
@@ -1514,8 +1503,6 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
 
-    std::vector<uint8_t> work_buffer;
-
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size =*/ compute_size,
@@ -1542,7 +1529,10 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
 
         ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
 
@@ -1553,7 +1543,10 @@ int main(int argc, char ** argv) {
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
         ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
 
@@ -1607,7 +1600,10 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
 
         ggml_build_forward_expand(gf, logits);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
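For readers migrating similar call sites: the four calls above replace the removed helper one-for-one. A minimal sketch of an equivalent wrapper under the new API (the name compute_graph_once is hypothetical, not part of this PR):

    #include "ggml.h"

    // Sketch: one-shot computation via the reworked API. Uses the dynamic
    // allocation path (ctx == nullptr), so the work buffer must be freed.
    static enum ggml_status compute_graph_once(struct ggml_cgraph * graph, int n_threads) {
        ggml_graph_prepare(graph, n_threads, /*threadpool*/ nullptr); // work size + thread count
        ggml_graph_work_init(graph, /*ctx*/ nullptr);                 // nullptr -> malloc'd buffer
        const enum ggml_status status = ggml_graph_compute(graph);
        ggml_graph_work_free(graph);                                  // required on the nullptr path
        return status;
    }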
23 changes: 7 additions & 16 deletions examples/benchmark/benchmark-matmult.cpp
@@ -20,17 +20,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 static float tensor_sum_elements(const ggml_tensor * tensor) {
     double sum = 0;
     if (tensor->type == GGML_TYPE_F32) {
@@ -179,9 +168,8 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
-    std::vector<uint8_t> work_buffer;
-
-    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
+    ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr);
+    ggml_graph_work_init(gf, nullptr);
 
     TENSOR_DUMP(ggml_graph_node(gf, 0));
 
@@ -234,7 +222,7 @@
 
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
+        ggml_graph_compute(gf31);
 
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
@@ -267,8 +255,11 @@ int main(int argc, char ** argv) {
         }
 
        // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
+        ggml_graph_compute(gf32);
     }
 
+    ggml_graph_work_free(gf);
+
     printf("\n");
     printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
     printf("=====================================================================================\n");
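The benchmark also shows the intended reuse pattern: prepare and initialize the work buffer once, then call ggml_graph_compute() repeatedly, and free only at the end. Condensed into one place — a sketch, with gf standing in for the graphs built in the file above:

    // Sketch: prepare/init once, compute many times, free once.
    ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr);
    ggml_graph_work_init(gf, nullptr);                 // buffer reused across iterations
    for (int i = 0; i < benchmark_params.n_iterations; i++) {
        ggml_graph_compute(gf);                        // no per-iteration allocation
    }
    ggml_graph_work_free(gf);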
4 changes: 3 additions & 1 deletion examples/llava/llava.cpp
@@ -183,7 +183,9 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
-    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+    ggml_graph_prepare(gf, 1, nullptr);
+    ggml_graph_work_init(gf, model.ctx);
+    ggml_graph_compute(gf);
     struct ggml_tensor* result = ggml_graph_node(gf, -1);
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
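Passing model.ctx to ggml_graph_work_init() is the replacement for the removed ggml_graph_compute_with_ctx(): the work buffer is carved out of the context, so there is nothing to free, but the context must have enough spare memory. A defensive sketch of that trade-off — the size check and fallback are an illustration assuming the usual ggml context accessors, not part of this PR:

    // Sketch: fall back to a malloc'd work buffer when the context is too small.
    ggml_graph_prepare(gf, /*n_threads*/ 1, nullptr);
    const size_t need = ggml_graph_work_size(gf);
    const size_t have = ggml_get_mem_size(model.ctx) - ggml_used_mem(model.ctx);
    if (need > have) {
        ggml_graph_work_init(gf, nullptr);     // dynamic; needs ggml_graph_work_free() later
    } else {
        ggml_graph_work_init(gf, model.ctx);   // lives and dies with the context
    }
    ggml_graph_compute(gf);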
96 changes: 64 additions & 32 deletions ggml/include/ggml.h
@@ -644,20 +644,6 @@ extern "C" {
 
     typedef struct ggml_threadpool * ggml_threadpool_t;
 
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-        struct ggml_threadpool * threadpool;
-
-        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-    };
-
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
@@ -2047,7 +2033,6 @@ extern "C" {
     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
 
-    // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
     GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
     GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
@@ -2065,26 +2050,72 @@ extern "C" {
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
-    // TODO: move these declarations above before the ggml_graph API and reorder the implementation order in ggml.c
-    // (unless the code has been moved to a separate source file)
     GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
     GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
     GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params * params);
-    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
-
-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
-                  const struct ggml_cgraph * cgraph,
-                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
-                  struct ggml_threadpool   * threadpool /* = NULL */ );
-    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    GGML_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params * params);
+    GGML_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
+    GGML_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
+
+    // =================================================================================================
+    // CPU-only API for ggml_cgraph
+    //
+    // TODO: move to the CPU backend
+    // NOTE: avoid using, will be removed
+    //
+
+    // loops through the graph and determines:
+    //
+    //   - work size needed for CPU computation
+    //   - number of threads to start
+    //
+    GGML_API enum ggml_status ggml_graph_prepare(
+                  struct ggml_cgraph     * cgraph,
+                                     int   n_threads, /* = GGML_DEFAULT_N_THREADS */
+                  struct ggml_threadpool * threadpool /* = NULL */ );
+
+    // get the estimated work size for the graph from ggml_graph_prepare()
+    GGML_API size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph);
+
+    // if ctx is NULL, the work buffer will be dynamically allocated. in this case, call ggml_graph_work_free() to free the buffer
+    // otherwise, the work buffer will be allocated in the context. no need to free it
+    GGML_API enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx);
+    GGML_API void             ggml_graph_work_free(struct ggml_cgraph * cgraph);
+
+    // note: call ggml_graph_prepare() and ggml_graph_work_init() first
+    //
+    // sample usages:
+    //
+    //  - no dynamic allocations:
+    //
+    //      ... prepare ggml_context ctx ...
+    //
+    //      ggml_graph_prepare  (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, ctx);
+    //
+    //      ggml_graph_compute  (cgraph); // can call many times
+    //
+    //      // no need to call ggml_graph_work_free() because it is allocated in ctx
+    //
+    //  - dynamic allocations:
+    //
+    //      ggml_graph_prepare  (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, NULL); // will allocate memory
+    //
+    //      ggml_graph_compute  (cgraph); // can call many times
+    //
+    //      ggml_graph_work_free(cgraph);
+    //
+    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph);
+
+    // end of CPU-only API
+    // =================================================================================================
+
+    GGML_API void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
@@ -2107,6 +2138,7 @@ extern "C" {
             struct ggml_cgraph    * gb_tmp,
             struct ggml_tensor  * * checkpoints,
             int                     n_checkpoints);
+
     //
     // optimization
     //
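With the threadpool declarations now sitting next to the CPU-only graph API, the two halves compose as follows — a sketch based only on the declarations in this diff (cgraph is assumed to be an already-built graph):

    // Sketch: share one explicit threadpool across graph computations.
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
    struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);

    ggml_graph_prepare(cgraph, 8, tp);      // records threads/threadpool on the graph
    ggml_graph_work_init(cgraph, NULL);     // dynamic work buffer
    ggml_graph_compute(cgraph);             // can be called repeatedly
    ggml_graph_work_free(cgraph);

    ggml_threadpool_free(tp);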
36 changes: 19 additions & 17 deletions ggml/src/ggml-backend.c
@@ -752,7 +752,8 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_
 }
 
 struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
+    // TODO: move member from ggml_cgraph here when the public CPU-only API is removed
+
     struct ggml_cgraph cgraph;
 };
 

@@ -761,27 +762,27 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
-    cpu_plan->cplan  = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+    ggml_graph_prepare(&cpu_plan->cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
-        if (cpu_plan->cplan.work_data == NULL) {
+    if (cpu_plan->cgraph.work_size > 0) {
+        cpu_plan->cgraph.work_data = malloc(cpu_plan->cgraph.work_size);
+        if (cpu_plan->cgraph.work_data == NULL) {
             free(cpu_plan);
             return NULL;
         }
     }
 
-    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cpu_plan->cgraph.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cgraph.abort_callback_data = cpu_ctx->abort_callback_data;
 
     return cpu_plan;
 }
 
 GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    free(cpu_plan->cplan.work_data);
+    free(cpu_plan->cgraph.work_data);
     free(cpu_plan);
 
     GGML_UNUSED(backend);
@@ -790,31 +791,32 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+    return ggml_graph_compute(&cpu_plan->cgraph);
 
     GGML_UNUSED(backend);
 }
 
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+    ggml_graph_prepare(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
-    if (cpu_ctx->work_size < cplan.work_size) {
+    if (cpu_ctx->work_size < cgraph->work_size) {
         free(cpu_ctx->work_data);
-        cpu_ctx->work_data = malloc(cplan.work_size);
+        cpu_ctx->work_data = malloc(cgraph->work_size);
         if (cpu_ctx->work_data == NULL) {
             cpu_ctx->work_size = 0;
             return GGML_STATUS_ALLOC_FAILED;
         }
-        cpu_ctx->work_size = cplan.work_size;
+        cpu_ctx->work_size = cgraph->work_size;
     }
-    cplan.work_data = cpu_ctx->work_data;
+    cgraph->work_data = cpu_ctx->work_data;
+    cgraph->work_own  = false; // always freed by ggml_backend_cpu_graph_plan_free
 
-    cplan.abort_callback      = cpu_ctx->abort_callback;
-    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cgraph->abort_callback      = cpu_ctx->abort_callback;
+    cgraph->abort_callback_data = cpu_ctx->abort_callback_data;
 
-    return ggml_graph_compute(cgraph, &cplan);
+    return ggml_graph_compute(cgraph);
 }
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
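Since the abort state now lives on the graph rather than on a cplan, callers outside the backend go through ggml_graph_set_abort_callback() (declared in the ggml.h hunk above). A sketch of cooperative cancellation; should_abort and the stop flag are illustrative, not from this PR:

    // Sketch: the callback returning true aborts ggml_graph_compute().
    static bool should_abort(void * data) {
        return *(bool *) data;            // polled by the compute workers
    }

    // at the call site:
    bool stop = false;
    ggml_graph_set_abort_callback(cgraph, should_abort, &stop);
    // setting `stop = true` (e.g. from another thread) is expected to make
    // ggml_graph_compute(cgraph) return GGML_STATUS_ABORTED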
13 changes: 13 additions & 0 deletions ggml/src/ggml-impl.h
@@ -773,6 +773,19 @@ struct ggml_cgraph {
     struct ggml_hash_set visited_hash_set;
 
     enum ggml_cgraph_eval_order order;
+
+    // TODO: after the CPU-only API is removed, we can move the members below to ggml_backend_plan_cpu
+
+    bool      work_own;
+    size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+    uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+    int n_threads;
+    struct ggml_threadpool * threadpool;
+
+    // abort ggml_graph_compute when true
+    ggml_abort_callback abort_callback;
+    void * abort_callback_data;
 };
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
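The new work_own flag records whether the graph owns its work buffer (the dynamic path, where ggml_graph_work_init() is called with a NULL context). A plausible shape of ggml_graph_work_free() consistent with these fields — a hypothetical sketch, not code taken from this PR:

    // Hypothetical sketch: free only what the graph owns, then reset the fields.
    void ggml_graph_work_free(struct ggml_cgraph * cgraph) {
        if (cgraph->work_own) {
            free(cgraph->work_data);   // malloc'd by ggml_graph_work_init(cgraph, NULL)
        }
        cgraph->work_data = NULL;
        cgraph->work_size = 0;
        cgraph->work_own  = false;
    }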