ggml : remove ggml_cplan + rework ggml_cgraph #9431

Closed · wants to merge 2 commits
28 changes: 12 additions & 16 deletions examples/baby-llama/baby-llama.cpp
@@ -17,17 +17,6 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
 constexpr float rms_norm_eps = 5e-6f;
 #endif
 
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 static struct ggml_tensor * randomize_tensor(
     struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
 ) {
@@ -1514,8 +1503,6 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
 
-    std::vector<uint8_t> work_buffer;
-
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size =*/ compute_size,
@@ -1542,7 +1529,10 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
 
         ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
 
@@ -1553,7 +1543,10 @@ int main(int argc, char ** argv) {
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
         ggml_build_forward_expand(gf, e);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
 
@@ -1607,7 +1600,10 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, gf, tokens_input, sample_ctx, n_past);
 
         ggml_build_forward_expand(gf, logits);
-        ggml_graph_compute_helper(work_buffer, gf, /*n_threads*/ 1);
+        ggml_graph_prepare(gf, 1, nullptr);
+        ggml_graph_work_init(gf, nullptr);
+        ggml_graph_compute(gf);
+        ggml_graph_work_free(gf);
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         struct ggml_tensor * probs        = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
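For readers migrating similar call sites: the four calls above replace the removed helper one-for-one. A minimal sketch of an equivalent wrapper under the new API (the name compute_graph_once is hypothetical, not part of this PR):

    #include "ggml.h"

    // Sketch: one-shot computation via the reworked API. Uses the dynamic
    // allocation path (ctx == nullptr), so the work buffer must be freed.
    static enum ggml_status compute_graph_once(struct ggml_cgraph * graph, int n_threads) {
        ggml_graph_prepare(graph, n_threads, /*threadpool*/ nullptr); // work size + thread count
        ggml_graph_work_init(graph, /*ctx*/ nullptr);                 // nullptr -> malloc'd buffer
        const enum ggml_status status = ggml_graph_compute(graph);
        ggml_graph_work_free(graph);                                  // required on the nullptr path
        return status;
    }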
23 changes: 7 additions & 16 deletions examples/benchmark/benchmark-matmult.cpp
@@ -20,17 +20,6 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
 static float tensor_sum_elements(const ggml_tensor * tensor) {
     double sum = 0;
     if (tensor->type == GGML_TYPE_F32) {
@@ -179,9 +168,8 @@ int main(int argc, char ** argv) {
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
-    std::vector<uint8_t> work_buffer;
-
-    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
+    ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr);
+    ggml_graph_work_init(gf, nullptr);
 
     TENSOR_DUMP(ggml_graph_node(gf, 0));
 
@@ -234,7 +222,7 @@
 
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute_helper(work_buffer, gf31, benchmark_params.n_threads);
+        ggml_graph_compute(gf31);
 
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
@@ -267,8 +255,11 @@ int main(int argc, char ** argv) {
         }
 
        // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
+        ggml_graph_compute(gf32);
     }
 
+    ggml_graph_work_free(gf);
+
     printf("\n");
     printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
     printf("=====================================================================================\n");
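The benchmark also shows the intended reuse pattern: prepare and initialize the work buffer once, then call ggml_graph_compute() repeatedly, and free only at the end. Condensed into one place — a sketch, with gf standing in for the graphs built in the file above:

    // Sketch: prepare/init once, compute many times, free once.
    ggml_graph_prepare(gf, benchmark_params.n_threads, nullptr);
    ggml_graph_work_init(gf, nullptr);                 // buffer reused across iterations
    for (int i = 0; i < benchmark_params.n_iterations; i++) {
        ggml_graph_compute(gf);                        // no per-iteration allocation
    }
    ggml_graph_work_free(gf);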
4 changes: 3 additions & 1 deletion examples/llava/llava.cpp
@@ -183,7 +183,9 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
     struct ggml_tensor *flatten = ggml_view_2d(model.ctx, permuted_cont, clip_n_mmproj_embd(ctx_clip), num_patches_height * num_patches_width * num_patches_per_side * num_patches_per_side, size_ele * clip_n_mmproj_embd(ctx_clip), 0);
     // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
     ggml_build_forward_expand(gf, flatten);
-    ggml_graph_compute_with_ctx(model.ctx, gf, 1);
+    ggml_graph_prepare(gf, 1, nullptr);
+    ggml_graph_work_init(gf, model.ctx);
+    ggml_graph_compute(gf);
     struct ggml_tensor* result = ggml_graph_node(gf, -1);
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
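Passing model.ctx to ggml_graph_work_init() is the replacement for the removed ggml_graph_compute_with_ctx(): the work buffer is carved out of the context, so there is nothing to free, but the context must have enough spare memory. A defensive sketch of that trade-off — the size check and fallback are an illustration assuming the usual ggml context accessors, not part of this PR:

    // Sketch: fall back to a malloc'd work buffer when the context is too small.
    ggml_graph_prepare(gf, /*n_threads*/ 1, nullptr);
    const size_t need = ggml_graph_work_size(gf);
    const size_t have = ggml_get_mem_size(model.ctx) - ggml_used_mem(model.ctx);
    if (need > have) {
        ggml_graph_work_init(gf, nullptr);     // dynamic; needs ggml_graph_work_free() later
    } else {
        ggml_graph_work_init(gf, model.ctx);   // lives and dies with the context
    }
    ggml_graph_compute(gf);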
96 changes: 64 additions & 32 deletions ggml/include/ggml.h
@@ -644,20 +644,6 @@ extern "C" {
 
     typedef struct ggml_threadpool * ggml_threadpool_t;
 
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-        struct ggml_threadpool * threadpool;
-
-        // abort ggml_graph_compute when true
-        ggml_abort_callback abort_callback;
-        void *              abort_callback_data;
-    };
-
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
@@ -2047,7 +2033,6 @@ extern "C" {
     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
 
-    // graph allocation in a context
     GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
     GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
     GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
@@ -2065,26 +2050,72 @@ extern "C" {
     GGML_API size_t ggml_graph_overhead(void);
     GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
 
-    // TODO: move these declarations above before the ggml_graph API and reorder the implementation order in ggml.c
-    // (unless the code has been moved to a separate source file)
     GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
     GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
     GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
-    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params * params);
-    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
-
-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan(
-                  const struct ggml_cgraph * cgraph,
-                                       int   n_threads, /* = GGML_DEFAULT_N_THREADS */
-                  struct ggml_threadpool   * threadpool /* = NULL */ );
-    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+    GGML_API struct ggml_threadpool *      ggml_threadpool_new           (struct ggml_threadpool_params * params);
+    GGML_API void                          ggml_threadpool_free          (struct ggml_threadpool * threadpool);
+    GGML_API int                           ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_pause         (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_resume        (struct ggml_threadpool * threadpool);
+
+    // =================================================================================================
+    // CPU-only API for ggml_cgraph
+    //
+    // TODO: move to the CPU backend
+    // NOTE: avoid using, will be removed
+    //
+
+    // loops through the graph and determines:
+    //
+    //   - work size needed for CPU computation
+    //   - number of threads to start
+    //
+    GGML_API enum ggml_status ggml_graph_prepare(
+                  struct ggml_cgraph     * cgraph,
+                                     int   n_threads, /* = GGML_DEFAULT_N_THREADS */
+                  struct ggml_threadpool * threadpool /* = NULL */ );
+
+    // get the estimated work size for the graph from ggml_graph_prepare()
+    GGML_API size_t ggml_graph_work_size(const struct ggml_cgraph * cgraph);
+
+    // if ctx is NULL, the work buffer will be dynamically allocated. in this case, call ggml_graph_work_free() to free the buffer
+    // otherwise, the work buffer will be allocated in the context. no need to free it
+    GGML_API enum ggml_status ggml_graph_work_init(struct ggml_cgraph * cgraph, struct ggml_context * ctx);
+    GGML_API void             ggml_graph_work_free(struct ggml_cgraph * cgraph);
+
+    // note: call ggml_graph_prepare() and ggml_graph_work_init() first
+    //
+    // sample usages:
+    //
+    //  - no dynamic allocations:
+    //
+    //      ... prepare ggml_context ctx ...
+    //
+    //      ggml_graph_prepare  (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, ctx);
+    //
+    //      ggml_graph_compute  (cgraph); // can call many times
+    //
+    //      // no need to call ggml_graph_work_free() because it is allocated in ctx
+    //
+    //  - dynamic allocations:
+    //
+    //      ggml_graph_prepare  (cgraph, n_threads, threadpool);
+    //      ggml_graph_work_init(cgraph, NULL); // will allocate memory
+    //
+    //      ggml_graph_compute  (cgraph); // can call many times
+    //
+    //      ggml_graph_work_free(cgraph);
+    //
+    GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph);
+
+    // end of CPU-only API
+    // =================================================================================================
+
+    GGML_API void ggml_graph_set_abort_callback(struct ggml_cgraph * cgraph, ggml_abort_callback abort_callback, void * abort_data);
 
     GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
 
@@ -2107,6 +2138,7 @@ extern "C" {
             struct ggml_cgraph    * gb_tmp,
             struct ggml_tensor  * * checkpoints,
             int                     n_checkpoints);
+
     //
     // optimization
     //
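With the threadpool declarations now sitting next to the CPU-only graph API, the two halves compose as follows — a sketch based only on the declarations in this diff (cgraph is assumed to be an already-built graph):

    // Sketch: share one explicit threadpool across graph computations.
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(8);
    struct ggml_threadpool * tp = ggml_threadpool_new(&tpp);

    ggml_graph_prepare(cgraph, 8, tp);      // records threads/threadpool on the graph
    ggml_graph_work_init(cgraph, NULL);     // dynamic work buffer
    ggml_graph_compute(cgraph);             // can be called repeatedly
    ggml_graph_work_free(cgraph);

    ggml_threadpool_free(tp);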
36 changes: 19 additions & 17 deletions ggml/src/ggml-backend.c
@@ -752,7 +752,8 @@ GGML_CALL static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_
 }
 
 struct ggml_backend_plan_cpu {
-    struct ggml_cplan cplan;
+    // TODO: move member from ggml_cgraph here when the public CPU-only API is removed
+
     struct ggml_cgraph cgraph;
 };
 

@@ -761,27 +762,27 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg
 
     struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
 
-    cpu_plan->cplan  = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
+    ggml_graph_prepare(&cpu_plan->cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
-    if (cpu_plan->cplan.work_size > 0) {
-        cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
-        if (cpu_plan->cplan.work_data == NULL) {
+    if (cpu_plan->cgraph.work_size > 0) {
+        cpu_plan->cgraph.work_data = malloc(cpu_plan->cgraph.work_size);
+        if (cpu_plan->cgraph.work_data == NULL) {
             free(cpu_plan);
             return NULL;
         }
     }
 
-    cpu_plan->cplan.abort_callback      = cpu_ctx->abort_callback;
-    cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cpu_plan->cgraph.abort_callback      = cpu_ctx->abort_callback;
+    cpu_plan->cgraph.abort_callback_data = cpu_ctx->abort_callback_data;
 
     return cpu_plan;
 }
 
 GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    free(cpu_plan->cplan.work_data);
+    free(cpu_plan->cgraph.work_data);
     free(cpu_plan);
 
     GGML_UNUSED(backend);
@@ -790,31 +791,32 @@ GGML_CALL static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, g
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
 
-    return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+    return ggml_graph_compute(&cpu_plan->cgraph);
 
     GGML_UNUSED(backend);
 }
 
 GGML_CALL static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
 
-    struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
+    ggml_graph_prepare(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
-    if (cpu_ctx->work_size < cplan.work_size) {
+    if (cpu_ctx->work_size < cgraph->work_size) {
         free(cpu_ctx->work_data);
-        cpu_ctx->work_data = malloc(cplan.work_size);
+        cpu_ctx->work_data = malloc(cgraph->work_size);
         if (cpu_ctx->work_data == NULL) {
             cpu_ctx->work_size = 0;
             return GGML_STATUS_ALLOC_FAILED;
         }
-        cpu_ctx->work_size = cplan.work_size;
+        cpu_ctx->work_size = cgraph->work_size;
     }
-    cplan.work_data = cpu_ctx->work_data;
+    cgraph->work_data = cpu_ctx->work_data;
+    cgraph->work_own  = false; // always freed by ggml_backend_cpu_graph_plan_free
 
-    cplan.abort_callback      = cpu_ctx->abort_callback;
-    cplan.abort_callback_data = cpu_ctx->abort_callback_data;
+    cgraph->abort_callback      = cpu_ctx->abort_callback;
+    cgraph->abort_callback_data = cpu_ctx->abort_callback_data;
 
-    return ggml_graph_compute(cgraph, &cplan);
+    return ggml_graph_compute(cgraph);
 }
 
 GGML_CALL static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
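Since the abort state now lives on the graph rather than on a cplan, callers outside the backend go through ggml_graph_set_abort_callback() (declared in the ggml.h hunk above). A sketch of cooperative cancellation; should_abort and the stop flag are illustrative, not from this PR:

    // Sketch: the callback returning true aborts ggml_graph_compute().
    static bool should_abort(void * data) {
        return *(bool *) data;            // polled by the compute workers
    }

    // at the call site:
    bool stop = false;
    ggml_graph_set_abort_callback(cgraph, should_abort, &stop);
    // setting `stop = true` (e.g. from another thread) is expected to make
    // ggml_graph_compute(cgraph) return GGML_STATUS_ABORTED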
13 changes: 13 additions & 0 deletions ggml/src/ggml-impl.h
@@ -773,6 +773,19 @@ struct ggml_cgraph {
     struct ggml_hash_set visited_hash_set;
 
     enum ggml_cgraph_eval_order order;
+
+    // TODO: after the CPU-only API is removed, we can move the members below to ggml_backend_plan_cpu
+
+    bool      work_own;
+    size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+    uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+    int n_threads;
+    struct ggml_threadpool * threadpool;
+
+    // abort ggml_graph_compute when true
+    ggml_abort_callback abort_callback;
+    void * abort_callback_data;
 };
 
 struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
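The new work_own flag records whether the graph owns its work buffer (the dynamic path, where ggml_graph_work_init() is called with a NULL context). A plausible shape of ggml_graph_work_free() consistent with these fields — a hypothetical sketch, not code taken from this PR:

    // Hypothetical sketch: free only what the graph owns, then reset the fields.
    void ggml_graph_work_free(struct ggml_cgraph * cgraph) {
        if (cgraph->work_own) {
            free(cgraph->work_data);   // malloc'd by ggml_graph_work_init(cgraph, NULL)
        }
        cgraph->work_data = NULL;
        cgraph->work_size = 0;
        cgraph->work_own  = false;
    }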