How to implement Infinite Context and Attention? #1021
calebnwokocha asked this question in Q&A (unanswered, 0 replies)
Could anyone who understands the method in this paper (https://arxiv.org/abs/2404.07143) help me implement it in my code below for the GPT-2 774M model? I think I am close to a full implementation, but I am still missing a few things.
```cpp
#include "ggml.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
// default hparams (GPT-2 774M)
struct gpt_hparams {
    int32_t n_vocab = 50257; // vocabulary size
    int32_t n_embd  = 1024;  // embedding dimensionality
    int32_t n_head  = 16;    // number of attention heads
    int32_t n_layer = 24;    // number of transformer layers
    int32_t ftype   = 1;     // set to 1 for FP16 precision (optional)
    float   eps     = 1e-5f; // small constant for numerical stability

    // runtime/sampling parameters referenced by gpt_print_usage and
    // gpt_params_parse below (the default values here are placeholders)
    int32_t n_threads = 4, n_predict = 200, n_batch = 8, n_ctx = 1024, n_gpu_layers = 0;
    int32_t top_k = 40, repeat_last_n = 64;
    float   top_p = 0.9f, temp = 0.9f, repeat_penalty = 1.0f;
    std::string model; // model file path
};
struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
};
struct gpt_layer {
// normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;
};
struct gpt_model {
gpt_hparams hparams;
};
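// Note: for Infini-attention (https://arxiv.org/abs/2404.07143) the model (or a
// separate per-evaluation state) would also need, per layer and per head, a
// persistent compressive memory: an associative matrix M (d_k x d_v), a
// normalization vector z (d_k), and a learned gating scalar beta. They are not
// declared in the structs above yet; see the sketch after the code listing.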
// load the model's weights from a file
bool gpt_model_load(const std::string & fname, gpt_model & model, gpt_vocab & vocab) {
printf("%s: loading model from '%s'\n", func, fname.c_str());
}
void gpt_split_words(std::string str, std::vector<std::string> & words) {
const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
const std::regex re(pattern);
std::smatch m;
}
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
}
static std::vector<gpt_vocab::id> parse_tokens_from_string(const std::string& input, char delimiter) {
std::vector<gpt_vocab::id> output;
std::stringstream ss(input);
std::string token;
}
static std::map<std::string, std::vector<gpt_vocab::id>> extract_tests_from_file(const std::string & fpath_test){
if (fpath_test.empty()){
fprintf(stderr, "%s : No test file found.\n", func);
return std::map<std::string, std::vector<gpt_vocab::id>>();
}
}
void test_gpt_tokenizer(gpt_vocab & vocab, const std::string & fpath_test){
std::map<std::string, std::vector<gpt_vocab::id>> tests = extract_tests_from_file(fpath_test);
}
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double temp,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
}
struct ggml_tensor* load_compressed_memory(
struct ggml_context* ctx,
int layer,
const gpt_model & model,
int n_past,
int n_embd) {
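    // Sketch of what this function would need to do for Infini-attention
    // (my reading of https://arxiv.org/abs/2404.07143 -- not tested code):
    //   1. look up the persistent per-layer compressive memory M (d_k x d_v)
    //      and the normalization vector z (d_k) accumulated over past segments;
    //   2. expose them as ggml tensors in `ctx` so the graph can compute the
    //      memory readout A_mem = sigma(Q) M / (sigma(Q) z);
    //   3. after the forward pass, update M <- M + sigma(K)^T V and
    //      z <- z + sum_t sigma(K_t) so the next segment sees the new state.
    // A standalone float-buffer version of this math is sketched after the
    // code listing below.
    return nullptr; // placeholder until the memory tensors are wired in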
}
// evaluate the transformer
//
// - model: the model
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
bool gpt_eval(
const gpt_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector & embd_w,
size_t & mem_per_token) {
const int N = embd_inp.size();
}
void gpt_print_usage(int argc, char ** argv, const gpt_hparams & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: random)\n");
fprintf(stderr, " -f FNAME, --file FNAME\n");
fprintf(stderr, " load prompt from a file\n");
fprintf(stderr, " -tt TOKEN_TEST, --token_test TOKEN_TEST\n");
fprintf(stderr, " test tokenization\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled)\n", params.repeat_last_n);
fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.2f, 1.0 = disabled)\n", (double)params.repeat_penalty);
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -c N, --context N context / KV cache size (default: %d)\n", params.n_ctx);
fprintf(stderr, " --ignore-eos ignore EOS token during generation\n");
fprintf(stderr, " -ngl N, --gpu-layers N number of layers to offload to GPU on supported models (default: %d)\n", params.n_gpu_layers);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
}
// Function to check if the next argument exists
static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_hparams& params) {
if (i + 1 < argc && argv[i + 1][0] != '-') {
return argv[++i];
} else {
fprintf(stderr, "error: %s requires one argument.\n", flag.c_str());
gpt_print_usage(argc, argv, params);
exit(0);
}
}
bool gpt_params_parse(int argc, char ** argv, gpt_hparams & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
    }
    return true;
}
std::string gpt_random_prompt(std::mt19937 & rng) {
const int r = rng() % 10;
switch (r) {
case 0: return "So";
case 1: return "Once upon a time";
case 2: return "When";
case 3: return "The";
case 4: return "After";
case 5: return "If";
case 6: return "import";
case 7: return "He";
case 8: return "She";
case 9: return "They";
}
}
//// Class for Infinite Context Handling
//std::vector<gpt_vocab::id> tokens; // store tokens dynamically
//
//// Add new tokens to the context
//void add_tokens(const std::vector<gpt_vocab::id> & new_tokens) {
//    tokens.insert(tokens.end(), new_tokens.begin(), new_tokens.end());
//}
//
//// Retrieve the most recent tokens, up to max_length
//std::vector<gpt_vocab::id> get_context(int max_length) const {
//    if ((int) tokens.size() <= max_length) {
//        return tokens;
//    }
//    return std::vector<gpt_vocab::id>(tokens.end() - max_length, tokens.end());
//}
int main(int argc, char ** argv) {
ggml_time_init();
}
```
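In case it helps, here is a minimal, self-contained sketch of the per-head compressive memory the paper describes: retrieval A_mem = σ(Q) M / (σ(Q) z), the linear update M ← M + σ(K)ᵀ V with z ← z + Σₜ σ(Kₜ), and the learned gate β that blends the memory readout with ordinary dot-product attention. It works on plain float buffers rather than ggml tensors, and every name in it (infini_memory, elu_plus_one, retrieve_memory, update_memory, blend_outputs) is mine, not from your code or from ggml, so treat it as an illustration of the math rather than a drop-in implementation.

```cpp
#include <cmath>
#include <vector>

// Per-head compressive memory state from the Infini-attention paper:
// M is a d_k x d_v associative matrix, z is a d_k normalization vector.
struct infini_memory {
    std::vector<float> M; // d_k * d_v entries, initialized to 0
    std::vector<float> z; // d_k entries,       initialized to 0
    int d_k, d_v;
    infini_memory(int dk, int dv) : M(dk * dv, 0.0f), z(dk, 0.0f), d_k(dk), d_v(dv) {}
};

// sigma(x) = ELU(x) + 1, the nonlinearity the paper applies to queries and keys.
static inline float elu_plus_one(float x) {
    return x > 0.0f ? x + 1.0f : std::exp(x);
}

// Memory retrieval for one query row q (length d_k):
//   A_mem = sigma(q) M / (sigma(q) . z + eps)
static void retrieve_memory(const infini_memory & mem, const float * q, float * out, float eps = 1e-6f) {
    std::vector<float> sq(mem.d_k);
    float denom = eps;
    for (int i = 0; i < mem.d_k; ++i) {
        sq[i] = elu_plus_one(q[i]);
        denom += sq[i] * mem.z[i];
    }
    for (int j = 0; j < mem.d_v; ++j) {
        float acc = 0.0f;
        for (int i = 0; i < mem.d_k; ++i) {
            acc += sq[i] * mem.M[i * mem.d_v + j];
        }
        out[j] = acc / denom;
    }
}

// Linear memory update after a segment of n_tokens key/value rows:
//   M <- M + sigma(K)^T V,   z <- z + sum_t sigma(K_t)
static void update_memory(infini_memory & mem, const float * K, const float * V, int n_tokens) {
    for (int t = 0; t < n_tokens; ++t) {
        const float * k = K + t * mem.d_k;
        const float * v = V + t * mem.d_v;
        for (int i = 0; i < mem.d_k; ++i) {
            const float sk = elu_plus_one(k[i]);
            mem.z[i] += sk;
            for (int j = 0; j < mem.d_v; ++j) {
                mem.M[i * mem.d_v + j] += sk * v[j];
            }
        }
    }
}

// Blend the memory readout with the usual local dot-product attention output
// using the learned per-head gate beta:
//   A = sigmoid(beta) * A_mem + (1 - sigmoid(beta)) * A_dot
static void blend_outputs(float beta, const float * a_mem, const float * a_dot, float * out, int d_v) {
    const float g = 1.0f / (1.0f + std::exp(-beta));
    for (int j = 0; j < d_v; ++j) {
        out[j] = g * a_mem[j] + (1.0f - g) * a_dot[j];
    }
}
```

The paper also describes a delta-rule variant of the update, which first retrieves the value currently associated with each key and only adds the difference, so repeated keys do not keep inflating the memory; the plumbing is the same and only update_memory changes. The remaining work in your listing is mostly wiring: keeping one such memory per layer and per head across calls to gpt_eval, filling in load_compressed_memory to expose that state to the ggml graph, and applying the gated blend inside the attention block.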