Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sync : llama.cpp #955

Merged
merged 13 commits into from
Sep 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ option(GGML_VULKAN "ggml: use Vulkan"
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
Expand Down
2 changes: 1 addition & 1 deletion examples/gpt-2/main-alloc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@ bool gpt2_eval(
}

// run the computation
struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
struct ggml_cplan plan = ggml_graph_plan(gf, n_threads, nullptr);
static std::vector<uint8_t> work_buffer;
work_buffer.resize(plan.work_size);
plan.work_data = work_buffer.data();
Expand Down
2 changes: 1 addition & 1 deletion examples/sam/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ static void ggml_disconnect_node_from_graph(ggml_tensor * t) {
}

static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);

if (plan.work_size > 0) {
buf.resize(plan.work_size);
Expand Down
4 changes: 2 additions & 2 deletions include/ggml-alloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ extern "C" {
#endif

typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;

// Tensor allocator
struct ggml_tallocr {
Expand Down
1 change: 1 addition & 0 deletions include/ggml-backend.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ extern "C" {

GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);

// Create a backend buffer from an existing pointer
Expand Down
64 changes: 62 additions & 2 deletions include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,8 @@
#define GGML_MAX_SRC 10
#ifndef GGML_MAX_NAME
#define GGML_MAX_NAME 64
#define GGML_MAX_N_THREADS 512

#endif
#define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4
Expand Down Expand Up @@ -393,6 +395,8 @@ extern "C" {
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_COUNT,
};

Expand Down Expand Up @@ -512,6 +516,7 @@ extern "C" {
GGML_OP_WIN_UNPART,
GGML_OP_GET_REL_POS,
GGML_OP_ADD_REL_POS,
GGML_OP_RWKV_WKV,

GGML_OP_UNARY,

Expand Down Expand Up @@ -546,6 +551,7 @@ extern "C" {
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_EXP,

GGML_UNARY_OP_COUNT,
};
Expand Down Expand Up @@ -628,13 +634,37 @@ extern "C" {
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);

// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};

// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};

struct ggml_threadpool; // forward declaration, see ggml.c

typedef struct ggml_threadpool * ggml_threadpool_t;

// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`

int n_threads;
struct ggml_threadpool * threadpool;

// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
Expand Down Expand Up @@ -1139,6 +1169,14 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_exp(
struct ggml_context * ctx,
struct ggml_tensor * a);

GGML_API struct ggml_tensor * ggml_exp_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);

// normalize along rows
GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
Expand Down Expand Up @@ -1887,6 +1925,15 @@ extern "C" {
struct ggml_tensor * pw,
struct ggml_tensor * ph);

GGML_API struct ggml_tensor * ggml_rwkv_wkv(
struct ggml_context * ctx,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * r,
struct ggml_tensor * tf,
struct ggml_tensor * td,
struct ggml_tensor * state);

// custom operators

typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
Expand Down Expand Up @@ -2057,10 +2104,23 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);

GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params *p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
GGML_API struct ggml_threadpool* ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);

// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);

// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
Expand Down
2 changes: 1 addition & 1 deletion scripts/sync-llama.last
Original file line number Diff line number Diff line change
@@ -1 +1 @@
66b039a5011522b6a61495eea2a9862601e169f7
a8768614558262b6762fc0feeddcc6f7ce4bc5fc
6 changes: 5 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -612,6 +612,10 @@ if (GGML_VULKAN)
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
endif()

if (GGML_VULKAN_SHADER_DEBUG_INFO)
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
endif()

if (GGML_VULKAN_PERF)
add_compile_definitions(GGML_VULKAN_PERF)
endif()
Expand Down Expand Up @@ -1247,7 +1251,7 @@ endif()

# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
add_compile_definitions(_GNU_SOURCE)
endif()

Expand Down
Loading
Loading