diff --git a/examples/mnist/mnist-common.cpp b/examples/mnist/mnist-common.cpp
index 83fc0b763..cc56d2736 100644
--- a/examples/mnist/mnist-common.cpp
+++ b/examples/mnist/mnist-common.cpp
@@ -530,13 +530,16 @@ mnist_eval_result mnist_model_eval(mnist_model & model, const float * images, co
 void mnist_model_train(mnist_model & model, const float * images, const float * labels, const int nex, const int nepoch, const float val_split) {
     const int64_t t_start_us = ggml_time_us();
 
+    // gf == graph forward, forward pass only.
     struct ggml_cgraph * gf = ggml_new_graph_custom(model.ctx_compute, GGML_DEFAULT_GRAPH_SIZE, /*grads =*/ true); // Forward pass.
     ggml_build_forward_expand(gf, model.loss);
 
-    struct ggml_cgraph * gb_grad = ggml_graph_dup(model.ctx_compute, gf); // Backward pass, gradients.
+    // gb_grad == graph backward gradients, forward pass, then backward pass to calculate gradients.
+    struct ggml_cgraph * gb_grad = ggml_graph_dup(model.ctx_compute, gf);
     ggml_build_backward_expand(model.ctx_compute, gf, gb_grad, /*accumulate =*/ true, false);
 
-    struct ggml_cgraph * gb_opt = ggml_graph_dup(model.ctx_compute, gf); // Backward pass, gradients + optimizer.
+    // gb_opt == graph backward optimize, forward pass, then backward pass to calculate gradients, then optimizer step.
+    struct ggml_cgraph * gb_opt = ggml_graph_dup(model.ctx_compute, gb_grad);
     ggml_build_opt_adamw(model.ctx_compute, gf, gb_opt, 1e-3f, 0.9f, 0.999f, 1e-8f, 0.0f);
 
     model.buf_compute = ggml_backend_alloc_ctx_tensors(model.ctx_compute, model.backend);
@@ -557,8 +560,6 @@ void mnist_model_train(mnist_model & model, const float * images, const float *
             ggml_backend_tensor_set(model.images, images + iex0*MNIST_NINPUT,   0, ggml_nbytes(model.images));
             ggml_backend_tensor_set(model.labels, labels + iex0*MNIST_NCLASSES, 0, ggml_nbytes(model.labels));
 
-            ggml_backend_graph_compute(model.backend, gf); // Always compute forward pass.
-
             // With a period of nbatch_logical/nbatch_physical iterations:
             if ((iex0 + model.nbatch_physical) % model.nbatch_logical != 0) {
                 // For the first nbatch_logical/nbatch_physical - 1 iterations, only calculate gradients and accumulate them:
diff --git a/include/ggml.h b/include/ggml.h
index 779d0017c..a78bcf011 100644
--- a/include/ggml.h
+++ b/include/ggml.h
@@ -570,11 +570,13 @@ extern "C" {
         GGML_LOG_LEVEL_DEBUG = 5
     };
 
+    // this tensor...
     enum ggml_tensor_flag {
-        GGML_TENSOR_FLAG_INPUT  = 1,
-        GGML_TENSOR_FLAG_OUTPUT = 2,
-        GGML_TENSOR_FLAG_PARAM  = 4,
-        GGML_TENSOR_FLAG_LOSS   = 8,
+        GGML_TENSOR_FLAG_INPUT    =  1, // ...is an input for the GGML compute graph
+        GGML_TENSOR_FLAG_OUTPUT   =  2, // ...is an output for the GGML compute graph
+        GGML_TENSOR_FLAG_PARAM    =  4, // ...contains trainable parameters
+        GGML_TENSOR_FLAG_GRAD_ACC =  8, // ...is an accumulator for gradients
+        GGML_TENSOR_FLAG_LOSS     = 16, // ...defines loss for numerical optimization (multiple loss tensors add up)
     };
 
     // ggml object
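Aside, not part of the patch: ggml_tensor::flags is a plain bit field, which is why the new GGML_TENSOR_FLAG_GRAD_ACC takes the next free power of two and GGML_TENSOR_FLAG_LOSS moves from 8 to 16. A minimal usage sketch in C, assuming t is a struct ggml_tensor *:

    t->flags |= GGML_TENSOR_FLAG_PARAM;                           // flags combine with bitwise OR
    if (t->flags & GGML_TENSOR_FLAG_GRAD_ACC) {                   // and are tested with bitwise AND
        // gradient updates for t must accumulate in place instead of replacing the gradient
    }
    const bool is_loss = (t->flags & GGML_TENSOR_FLAG_LOSS) != 0;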
diff --git a/src/ggml.c b/src/ggml.c
index de61438ac..c65545e55 100644
--- a/src/ggml.c
+++ b/src/ggml.c
@@ -18123,11 +18123,23 @@ void ggml_build_backward_gradient_checkpointing(
     ggml_hash_map_free(replacements);
 }
 
-// functions to change gradients considering the case that input a might be initial gradient with zero value
+// utility functions to change gradients
+// by default, just add/subtract/etc. the gradients
+// if a is in zero_table and not a gradient accumulator, replace a
+// if a is in zero_table and a gradient accumulator, modify gradients in-place and mark result as gradient accumulator
 
 static struct ggml_tensor * ggml_add_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
-        return b;
+        if (a->flags & GGML_TENSOR_FLAG_GRAD_ACC) {
+            struct ggml_tensor * ret = ggml_add_impl(ctx, a, b, true);
+            ret->flags |= GGML_TENSOR_FLAG_GRAD_ACC;
+            const size_t insert_result = ggml_hash_insert(zero_table, ret);
+            GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
+            GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
+            return ret;
+        } else {
+            return b;
+        }
     } else {
         return ggml_add_impl(ctx, a, b, false);
     }
@@ -18135,8 +18147,17 @@ static struct ggml_tensor * ggml_add_or_set(struct gg
 
 static struct ggml_tensor * ggml_acc_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, size_t nb1, size_t nb2, size_t nb3, size_t offset, struct ggml_hash_set * zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
-        struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f);
-        return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
+        if (a->flags & GGML_TENSOR_FLAG_GRAD_ACC) {
+            struct ggml_tensor * ret = ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, true);
+            ret->flags |= GGML_TENSOR_FLAG_GRAD_ACC;
+            const size_t insert_result = ggml_hash_insert(zero_table, ret);
+            GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
+            GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
+            return ret;
+        } else {
+            struct ggml_tensor * a_zero = ggml_scale(ctx, a, 0.0f); // FIXME this is going to produce NaN if a contains inf/NaN
+            return ggml_acc_impl(ctx, a_zero, b, nb1, nb2, nb3, offset, false);
+        }
     } else {
         return ggml_acc_impl(ctx, a, b, nb1, nb2, nb3, offset, false);
     }
@@ -18144,7 +18165,16 @@ static struct ggml_tensor * ggml_acc_or_set(struct gg
 
 static struct ggml_tensor * ggml_add1_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
-        return ggml_repeat(ctx, b, a);
+        if (a->flags & GGML_TENSOR_FLAG_GRAD_ACC) {
+            struct ggml_tensor * ret = ggml_add1_impl(ctx, a, b, true);
+            ret->flags |= GGML_TENSOR_FLAG_GRAD_ACC;
+            const size_t insert_result = ggml_hash_insert(zero_table, ret);
+            GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
+            GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
+            return ret;
+        } else {
+            return ggml_repeat(ctx, b, a);
+        }
     } else {
         return ggml_add1_impl(ctx, a, b, false);
     }
@@ -18152,7 +18182,16 @@ static struct ggml_tensor * ggml_add1_or_set(struct g
 
 static struct ggml_tensor * ggml_sub_or_set(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_hash_set * zero_table) {
     if (ggml_hash_contains(zero_table, a)) {
-        return ggml_neg(ctx, b);
+        if (a->flags & GGML_TENSOR_FLAG_GRAD_ACC) {
+            struct ggml_tensor * ret = ggml_sub_impl(ctx, a, b, true);
+            ret->flags |= GGML_TENSOR_FLAG_GRAD_ACC;
+            const size_t insert_result = ggml_hash_insert(zero_table, ret);
+            GGML_ASSERT(insert_result != GGML_HASHSET_FULL);
+            GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS);
+            return ret;
+        } else {
+            return ggml_neg(ctx, b);
+        }
     } else {
         return ggml_sub_impl(ctx, a, b, false);
     }
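Aside, not part of the patch: the four helpers above repeat the same accumulator branch, i.e. apply the operation in place, re-tag the result, and re-register it in zero_table. Purely as an illustration of that shared logic, the tail of the branch could be written as the hypothetical helper below (ggml itself does not define ggml_set_grad_acc, the name is made up here):

    // Hypothetical sketch of the common tail of the GGML_TENSOR_FLAG_GRAD_ACC branches
    // in ggml_add_or_set / ggml_acc_or_set / ggml_add1_or_set / ggml_sub_or_set.
    static struct ggml_tensor * ggml_set_grad_acc(struct ggml_tensor * ret, struct ggml_hash_set * zero_table) {
        ret->flags |= GGML_TENSOR_FLAG_GRAD_ACC;                   // the in-place result is still the accumulator
        const size_t insert_result = ggml_hash_insert(zero_table, ret);
        GGML_ASSERT(insert_result != GGML_HASHSET_FULL);           // the table was sized from the graph, so it should fit
        GGML_ASSERT(insert_result != GGML_HASHSET_ALREADY_EXISTS); // each in-place op creates a new tensor object
        return ret;
    }

With such a helper, the accumulator branch of ggml_add_or_set, for example, would reduce to return ggml_set_grad_acc(ggml_add_impl(ctx, a, b, true), zero_table);.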
@@ -19136,22 +19175,25 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph *
         }
     }
 
-    // hash table of original gradients that should be overwritten instead of incremented
+    // keep table of original gradients for replacement/accumulation logic
     struct ggml_hash_set zero_table = ggml_hash_set_new(gf->size);
+    for (int i = 0; i < gf->n_nodes; i++) {
+        struct ggml_tensor * node = gf->nodes[i];
 
-    // when accumulating gradients the table is empty -> gradients always incremented
-    if (!accumulate) {
-        for (int i = 0; i < gf->n_nodes; i++) {
-            if (gf->grads[i]) {
-                ggml_hash_insert(&zero_table, gf->grads[i]);
+        if (node->grad) {
+            // only gradients of trainable parameters should be accumulated
+            if (accumulate && (node->flags & GGML_TENSOR_FLAG_PARAM)) {
+                node->grad->flags |= GGML_TENSOR_FLAG_GRAD_ACC;
             }
+
+            ggml_hash_insert(&zero_table, node->grad);
         }
     }
 
     for (int i = gf->n_nodes - 1; i >= 0; i--) {
         struct ggml_tensor * node = gf->nodes[i];
 
-        // inplace operations to add gradients are not created by ggml_compute_backward
+        // inplace operations to add gradients are not created by ggml_compute_backward except for gradient accumulation
         // use allocator to automatically make inplace operations
         if (node->grad) {
             ggml_compute_backward(ctx, node, &zero_table);
@@ -19319,19 +19361,18 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) {
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
-        struct ggml_tensor * grad = cgraph->grads[i];
 
         // initial gradients of loss should be 1, 0 otherwise
-        if (grad) {
+        if (node->grad) {
             if (node->flags & GGML_TENSOR_FLAG_LOSS) {
-                GGML_ASSERT(grad->buffer);
+                GGML_ASSERT(node->grad->buffer);
                 GGML_ASSERT(node->type == GGML_TYPE_F32);
                 GGML_ASSERT(ggml_is_scalar(node));
 
                 const float onef = 1.0f;
-                ggml_backend_tensor_set(grad, &onef, 0, ggml_nbytes(grad));
+                ggml_backend_tensor_set(node->grad, &onef, 0, ggml_nbytes(node->grad));
             } else {
-                ggml_set_zero(grad);
+                ggml_set_zero(node->grad);
             }
         }
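Aside, not part of the patch: taken together, these changes let gradients accumulate across several physical batches before a single optimizer step. A condensed sketch in C of how the three graphs from mnist_model_train are then driven per logical batch; the loop bound iex_split and the exact placement and argument of the reset call are simplifying assumptions here, not verbatim code from the example:

    // Sketch only: nbatch_logical/nbatch_physical physical batches form one logical batch.
    for (int iex0 = 0; iex0 < iex_split; iex0 += model.nbatch_physical) {
        // ... upload the images/labels of this physical batch ...

        if ((iex0 + model.nbatch_physical) % model.nbatch_logical != 0) {
            // first iterations of the logical batch: forward + backward only;
            // parameter gradients carry GGML_TENSOR_FLAG_GRAD_ACC, so they add up in place
            ggml_backend_graph_compute(model.backend, gb_grad);
        } else {
            // last iteration: forward + backward + AdamW step on the accumulated gradients
            ggml_backend_graph_compute(model.backend, gb_opt);
            // zero the gradients and set the loss gradient back to 1 (see ggml_graph_reset above)
            ggml_graph_reset(gb_grad);
        }
    }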