WIP: Implement GGUF #2397

Merged: 2 commits, Jul 26, 2023
32 changes: 32 additions & 0 deletions constants.py
@@ -0,0 +1,32 @@
GGUF_MAGIC = 0x47475546
GGUF_VERSION = 1

# general
KEY_GENERAL_ARCHITECTURE = "general.architecture"
KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
KEY_GENERAL_NAME = "general.name"
KEY_GENERAL_AUTHOR = "general.author"
KEY_GENERAL_URL = "general.url"
KEY_GENERAL_DESCRIPTION = "general.description"
KEY_GENERAL_FILE_TYPE = "general.file_type"
KEY_GENERAL_LICENSE = "general.license"
KEY_GENERAL_SOURCE_URL = "general.source.url"
KEY_GENERAL_SOURCE_HF_REPO = "general.source.huggingface.repository"

# LLM
KEY_LLM_CONTEXT_LENGTH = "{llm}.context_length"
KEY_LLM_EMBEDDING_LENGTH = "{llm}.embedding_length"
KEY_LLM_LAYER_COUNT = "{llm}.layer_count"
KEY_LLM_FEED_FORWARD_LENGTH = "{llm}.feed_forward_length"
KEY_LLM_USE_PARALLEL_RESIDUAL = "{llm}.use_parallel_residual"
KEY_LLM_TENSOR_DATA_LAYOUT = "{llm}.tensor_data_layout"

# attention
KEY_ATTENTION_HEAD_COUNT = "{llm}.attention.head_count"
KEY_ATTENTION_HEAD_COUNT_KV = "{llm}.attention.head_count_kv"
KEY_ATTENTION_MAX_ALIBI_BIAS = "{llm}.attention.max_alibi_bias"
KEY_ATTENTION_CLAMP_KQV = "{llm}.attention.clamp_kqv"

# RoPE
KEY_ROPE_DIMENSION_COUNT = "{llm}.rope.dimension_count"
KEY_ROPE_SCALE = "{llm}.rope.scale"
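
The "{llm}" segment in these keys is a template placeholder for the model architecture name. A minimal sketch of how a writer might expand the templates; the arch value and the str.format usage are illustrative, not part of this PR:

import constants

arch = "llama"  # hypothetical architecture name
print(constants.KEY_LLM_CONTEXT_LENGTH.format(llm=arch))    # llama.context_length
print(constants.KEY_ATTENTION_HEAD_COUNT.format(llm=arch))  # llama.attention.head_count
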
6 changes: 5 additions & 1 deletion examples/baby-llama/baby-llama.cpp
@@ -8,7 +8,11 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

-static const float rms_norm_eps = 1e-6f;
+#ifdef LLAMA_DEFAULT_RMS_EPS
+static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
+#else
+static const float rms_norm_eps = 5e-6f;
+#endif

float frand() {
return (float)rand()/(float)RAND_MAX;
2 changes: 1 addition & 1 deletion examples/common.h
@@ -34,7 +34,7 @@ struct gpt_params {
int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
int32_t n_probs = 0; // if greater than 0, output the probabilities of top n_probs tokens.
-float rms_norm_eps = 1e-6; // rms norm epsilon
+float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS; // rms norm epsilon
float rope_freq_base = 10000.0f; // RoPE base frequency
float rope_freq_scale = 1.0f; // RoPE frequency scaling factor

@@ -16,7 +16,7 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

-static const float rms_norm_eps = 1e-6f;
+static const float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;

struct random_normal_distribution {
std::mt19937 gen;
192 changes: 192 additions & 0 deletions gguf.c
@@ -0,0 +1,192 @@
// TODO: convert to a proper gguf.h / gguf.c structure; for now I'm trying to move as fast as
// possible, so everything is in this file for quick debugging.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdbool.h>

#define GGUF_MAGIC   0x47475546 // ASCII "GGUF"
#define GGUF_VERSION 1

enum ggml_type {
GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3,
// GGML_TYPE_Q4_2 = 4, support has been removed
// GGML_TYPE_Q4_3 (5) support has been removed
GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9,
// k-quantizations
GGML_TYPE_Q2_K = 10,
GGML_TYPE_Q3_K = 11,
GGML_TYPE_Q4_K = 12,
GGML_TYPE_Q5_K = 13,
GGML_TYPE_Q6_K = 14,
GGML_TYPE_Q8_K = 15,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
GGML_TYPE_COUNT,
};

enum gguf_metadata_value_type {
GGUF_METADATA_VALUE_TYPE_UINT8 = 0,
GGUF_METADATA_VALUE_TYPE_INT8 = 1,
GGUF_METADATA_VALUE_TYPE_UINT16 = 2,
GGUF_METADATA_VALUE_TYPE_INT16 = 3,
GGUF_METADATA_VALUE_TYPE_UINT32 = 4,
GGUF_METADATA_VALUE_TYPE_INT32 = 5,
GGUF_METADATA_VALUE_TYPE_FLOAT32 = 6,
GGUF_METADATA_VALUE_TYPE_BOOL = 7,
GGUF_METADATA_VALUE_TYPE_STRING = 8,
GGUF_METADATA_VALUE_TYPE_ARRAY = 9,
};

struct gguf_string_t {
uint32_t len;
char * string;
};
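// Note: strings are length-prefixed on disk and not guaranteed to be
// null-terminated; readers must terminate their own copies before using %s.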

// Value storage for a metadata entry; the array variant nests recursively.
union gguf_metadata_value_t {
uint8_t uint8;
int8_t int8;
uint16_t uint16;
int16_t int16;
uint32_t uint32;
int32_t int32;
float float32;
bool bool_;
struct gguf_string_t string;
struct {
uint32_t len;
enum gguf_metadata_value_type type;
union gguf_metadata_value_t * array;
} array;
};


struct gguf_metadata_kv_t {
struct gguf_string_t key;
uint32_t value_len;
enum gguf_metadata_value_type value_type;
union gguf_metadata_value_t* value;
};

struct gguf_header_t {
uint32_t magic;
uint32_t version;
uint32_t tensor_count;
uint32_t metadata_kv_count;
struct gguf_metadata_kv_t * metadata_kv;
};

struct gguf_tensor_info_t {
struct gguf_string_t name;
uint32_t n_dimensions;
uint32_t dimensions[];
};

struct gguf_file_t {
struct gguf_header_t header;
uint8_t tensor_data[];
};
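
// File layout implied by the structs above: the header fields (magic, version,
// tensor_count, metadata_kv_count) come first, followed by metadata_kv_count
// key/value pairs; per the GGUF spec, tensor_count tensor info records and the
// raw tensor data follow, which this quick reader does not parse yet (see TODO below).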

void read_gguf_file(const char * file_path, struct gguf_file_t * gguf_file) {
FILE* file = fopen(file_path, "rb");
if (file == NULL) {
printf("Error opening the file.\n");
return;
}

fread(&gguf_file->header.magic, sizeof(uint32_t), 1, file);

// Verify magic and version
if (gguf_file->header.magic != GGUF_MAGIC) {
printf("Invalid magic number. Not a valid GGUF file.\n");
fclose(file);
return;
}

fread(&gguf_file->header.version, sizeof(uint32_t), 1, file);

if (gguf_file->header.version != GGUF_VERSION) {
printf("Unsupported version. Expected version 1.\n");
fclose(file);
return;
}

fread(&gguf_file->header.tensor_count, sizeof(uint32_t), 1, file);
fread(&gguf_file->header.metadata_kv_count, sizeof(uint32_t), 1, file);

printf("Magic: %x\n", gguf_file->header.magic);
printf("Version: %d\n", gguf_file->header.version);
printf("Tensor Count: %d\n", gguf_file->header.tensor_count);
printf("Metadata Key-Value Count: %d\n", gguf_file->header.metadata_kv_count);

// calloc so entries that are never read stay zeroed, keeping gguf_free safe on early exit
gguf_file->header.metadata_kv = (struct gguf_metadata_kv_t*)calloc(gguf_file->header.metadata_kv_count, sizeof(struct gguf_metadata_kv_t));

for (uint32_t i = 0; i < gguf_file->header.metadata_kv_count; i++) {
struct gguf_metadata_kv_t* kv = &gguf_file->header.metadata_kv[i];
fread(&kv->key.len, sizeof(uint32_t), 1, file);
kv->key.string = (char*)malloc(kv->key.len + 1); // +1 so the key can be null-terminated
fread(kv->key.string, sizeof(char), kv->key.len, file);
kv->key.string[kv->key.len] = '\0'; // on-disk strings are not null-terminated

fread(&kv->value_type, sizeof(uint32_t), 1, file);

printf("Metadata Value Type: %d\n", kv->value_type);
printf("Metadata Key: %s\n", kv->key.string);

// Read the metadata value according to its type
switch (kv->value_type) {
case GGUF_METADATA_VALUE_TYPE_UINT32:
kv->value = (union gguf_metadata_value_t *)malloc(sizeof(uint32_t));
fread(kv->value, sizeof(uint32_t), 1, file);
printf("value: %u\n", kv->value->uint32);
break;
case GGUF_METADATA_VALUE_TYPE_FLOAT32:
kv->value = (union gguf_metadata_value_t *)malloc(sizeof(float));
fread(kv->value, sizeof(float), 1, file);
printf("value: %f\n", kv->value->float32);
break;
case GGUF_METADATA_VALUE_TYPE_STRING:
fread(&kv->value_len, sizeof(uint32_t), 1, file);
printf("value len: %u\n", kv->value_len);
// quick hack: store the raw bytes directly rather than wrapping them in a gguf_string_t
kv->value = (union gguf_metadata_value_t *)malloc(kv->value_len + 1);
fread(kv->value, sizeof(char), kv->value_len, file);
((char *)kv->value)[kv->value_len] = '\0'; // null-terminate before printing
printf("value: %s\n", (char *)kv->value);
break;
// ... (handle other types in a similar manner)
default:
printf("Unsupported metadata value type.\n");
fclose(file);
return;
}
}

// TODO: handle reading tensor data

fclose(file);
}

void gguf_free(struct gguf_file_t * gguf_file) {
// Free allocated memory for key strings and values (free(NULL) is a no-op for entries never read)
for (uint32_t i = 0; i < gguf_file->header.metadata_kv_count; i++) {
free(gguf_file->header.metadata_kv[i].key.string);
free(gguf_file->header.metadata_kv[i].value);
}
free(gguf_file->header.metadata_kv);
}

int main() {
const char* file_path = "example.gguf";
struct gguf_file_t gguf_file = {0}; // zero-init so gguf_free is safe even if reading fails early
read_gguf_file(file_path, &gguf_file);
gguf_free(&gguf_file);
return 0;
}
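
For cross-checking the C reader above, here is a minimal Python sketch that parses the same v1 header and the three metadata value types the C code handles. The "<" struct prefix assumes a little-endian file, matching what the C code reads on a little-endian host; the file name is illustrative:

import struct

def read_gguf_header(path):
    # Mirrors the C reader above: magic, version, tensor count, metadata KV count.
    with open(path, "rb") as f:
        magic, version, n_tensors, n_kv = struct.unpack("<4I", f.read(16))
        if magic != 0x47475546:
            raise ValueError("not a GGUF file")
        if version != 1:
            raise ValueError("unsupported GGUF version %d" % version)
        metadata = {}
        for _ in range(n_kv):
            key_len, = struct.unpack("<I", f.read(4))
            key = f.read(key_len).decode("utf-8")
            value_type, = struct.unpack("<I", f.read(4))
            if value_type == 4:    # GGUF_METADATA_VALUE_TYPE_UINT32
                value, = struct.unpack("<I", f.read(4))
            elif value_type == 6:  # GGUF_METADATA_VALUE_TYPE_FLOAT32
                value, = struct.unpack("<f", f.read(4))
            elif value_type == 8:  # GGUF_METADATA_VALUE_TYPE_STRING
                str_len, = struct.unpack("<I", f.read(4))
                value = f.read(str_len).decode("utf-8")
            else:
                raise NotImplementedError("value type %d" % value_type)
            metadata[key] = value
        return n_tensors, metadata

if __name__ == "__main__":
    n_tensors, metadata = read_gguf_header("example.gguf")
    print(n_tensors, metadata)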