From 7e2f2378b9621fb9d9eb71d4cce440c27fbd8c89 Mon Sep 17 00:00:00 2001 From: William Casarin Date: Wed, 27 Dec 2023 12:43:20 -0800 Subject: [PATCH] Inital embedded content parser This adds some initial code for nostrdb content parsing. We still need to write tests for encoding and decoding, so this is likely not working yet. --- Makefile | 4 +- src/bolt11/bech32.c | 7 +- src/bolt11/bech32.h | 3 +- src/bolt11/bolt11.h | 1 + src/content_parser.c | 328 ++++++++++++++++++++++++++++++------------- src/invoice.h | 3 + src/nostrdb.c | 2 +- src/nostrdb.h | 7 + test.c | 11 ++ 9 files changed, 264 insertions(+), 102 deletions(-) diff --git a/Makefile b/Makefile index edd403c..9be718c 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ CFLAGS = -Wall -Wno-misleading-indentation -Wno-unused-function -Werror -O2 -g - HEADERS = deps/lmdb/lmdb.h deps/secp256k1/include/secp256k1.h src/sha256.h src/nostrdb.h src/cursor.h src/hex.h src/jsmn.h src/config.h src/sha256.h src/random.h src/memchr.h src/cpu.h $(C_BINDINGS) FLATCC_SRCS=deps/flatcc/src/runtime/json_parser.c deps/flatcc/src/runtime/verifier.c deps/flatcc/src/runtime/builder.c deps/flatcc/src/runtime/emitter.c deps/flatcc/src/runtime/refmap.c BOLT11_SRCS = src/bolt11/bolt11.c src/bolt11/bech32.c src/bolt11/tal.c src/bolt11/talstr.c src/bolt11/take.c src/bolt11/list.c src/bolt11/utf8.c src/bolt11/amount.c src/bolt11/hash_u5.c -SRCS = src/nostrdb.c src/sha256.c src/invoice.c $(BOLT11_SRCS) $(FLATCC_SRCS) +SRCS = src/nostrdb.c src/sha256.c src/invoice.c src/nostr_bech32.c src/content_parser.c $(BOLT11_SRCS) $(FLATCC_SRCS) LDS = $(OBJS) $(ARS) OBJS = $(SRCS:.c=.o) DEPS = $(OBJS) $(HEADERS) $(ARS) @@ -41,7 +41,7 @@ check: test rm -rf testdata/db/*.mdb clean: - rm -rf test bench bench-ingest bench-ingest-many + rm -rf test bench bench-ingest bench-ingest-many $(OBJS) distclean: clean rm -rf deps diff --git a/src/bolt11/bech32.c b/src/bolt11/bech32.c index 321ca53..e1c9e18 100644 --- a/src/bolt11/bech32.c +++ b/src/bolt11/bech32.c @@ -91,7 +91,7 @@ int bech32_encode(char *output, const char *hrp, const uint8_t *data, size_t dat return 1; } -bech32_encoding bech32_decode_len(char* hrp, uint8_t *data, size_t *data_len, const char *input, size_t input_len) { +bech32_encoding bech32_decode_len(char* hrp, uint8_t *data, size_t *data_len, const char *input, size_t input_len, int max_hrp_len) { uint32_t chk = 1; size_t i; size_t hrp_len; @@ -104,6 +104,8 @@ bech32_encoding bech32_decode_len(char* hrp, uint8_t *data, size_t *data_len, co ++(*data_len); } hrp_len = input_len - (1 + *data_len); + if (hrp_len > max_hrp_len) + return BECH32_ENCODING_NONE; if (1 + *data_len >= input_len || *data_len < 6) { return BECH32_ENCODING_NONE; } @@ -158,13 +160,14 @@ bech32_encoding bech32_decode(char* hrp, uint8_t *data, size_t *data_len, const if (len > max_input_len) { return BECH32_ENCODING_NONE; } - return bech32_decode_len(hrp, data, data_len, input, len); + return bech32_decode_len(hrp, data, data_len, input, len, 8); } int bech32_convert_bits(uint8_t* out, size_t* outlen, int outbits, const uint8_t* in, size_t inlen, int inbits, int pad) { uint32_t val = 0; int bits = 0; uint32_t maxv = (((uint32_t)1) << outbits) - 1; + *outlen = 0; while (inlen--) { val = (val << inbits) | *(in++); bits += inbits; diff --git a/src/bolt11/bech32.h b/src/bolt11/bech32.h index 0650131..f69eb4a 100644 --- a/src/bolt11/bech32.h +++ b/src/bolt11/bech32.h @@ -123,7 +123,8 @@ bech32_encoding bech32_decode_len( uint8_t *data, size_t *data_len, const char *input, - size_t input_len + size_t input_len, + int max_prefix_len ); /* Helper from bech32: translates inbits-bit bytes to outbits-bit bytes. diff --git a/src/bolt11/bolt11.h b/src/bolt11/bolt11.h index 9d368c5..aabbb8a 100644 --- a/src/bolt11/bolt11.h +++ b/src/bolt11/bolt11.h @@ -3,6 +3,7 @@ #include "short_types.h" #include "hash_u5.h" +#include "amount.h" #include "list.h" #include "amount.h" #include "node_id.h" diff --git a/src/content_parser.c b/src/content_parser.c index ecc46c3..741e73a 100644 --- a/src/content_parser.c +++ b/src/content_parser.c @@ -1,11 +1,22 @@ -#include "damus.h" #include "cursor.h" -#include "bolt11.h" -#include "bech32.h" +#include "nostr_bech32.h" +#include "block.h" +#include "invoice.h" +#include "bolt11/bolt11.h" +#include "bolt11/bech32.h" #include #include #include "cursor.h" +#include "block.h" + +struct ndb_content_parser { + int bech32_strs; + struct cursor buffer; + struct cursor content; + struct ndb_note_blocks *blocks; +}; + static int parse_digit(struct cursor *cur, int *digit) { int c; @@ -25,7 +36,7 @@ static int parse_digit(struct cursor *cur, int *digit) { static int parse_mention_index(struct cursor *cur, struct note_block *block) { int d1, d2, d3, ind; - u8 *start = cur->p; + unsigned char *start = cur->p; if (!parse_str(cur, "#[")) return 0; @@ -56,7 +67,7 @@ static int parse_mention_index(struct cursor *cur, struct note_block *block) { static int parse_hashtag(struct cursor *cur, struct note_block *block) { int c; - u8 *start = cur->p; + unsigned char *start = cur->p; if (!parse_char(cur, '#')) return 0; @@ -70,22 +81,158 @@ static int parse_hashtag(struct cursor *cur, struct note_block *block) { consume_until_boundary(cur); block->type = BLOCK_HASHTAG; - block->block.str.start = (const char*)(start + 1); - block->block.str.end = (const char*)cur->p; + block->block.str.str = (const char*)(start + 1); + block->block.str.len = cur->p - (start + 1); return 1; } -static int add_block(struct note_blocks *blocks, struct note_block block) -{ - if (blocks->num_blocks + 1 >= MAX_BLOCKS) +static int push_str_block(struct cursor *buf, const char *content, struct str_block *block) { + return cursor_push_varint(buf, block->str - content) && + cursor_push_varint(buf, block->len); +} + +static int pull_str_block(struct cursor *buf, const char *content, struct str_block *block) { + uint32_t start; + if (!cursor_pull_varint_u32(buf, &start)) return 0; + + block->str = content + start; + + return cursor_pull_varint_u32(buf, &block->len); +} + +// +// decode and push a bech32 string into our blocks output buffer. +// +// bech32 blocks are stored as: +// +// nostr_bech32_type : varint +// bech32_buffer_size : u16 +// bech32_data : [u8] +// +// The TLV form is compact already, so we just use it directly +// +// This allows us to not duplicate all of the TLV encoding and decoding code +// for our on-disk nostrdb format. +// +static int push_bech32_str(struct ndb_content_parser *p, struct str_block *bech32) +{ + // we decode the raw bech32 directly into the output buffer + struct cursor u8, u5; + unsigned char *start; + uint16_t *u8_size; + enum nostr_bech32_type type; + size_t u5_out_len, u8_out_len; + static const int MAX_PREFIX = 8; + char prefix[9] = {0}; + + start = p->buffer.p; + + if (!parse_nostr_bech32_type(bech32->str, &type)) + goto fail; + + if (!cursor_push_varint(&p->buffer, type)) + goto fail; + + // save a spot for the raw bech32 buffer size + u8_size = (uint16_t*)p->buffer.p; + if (!cursor_skip(&p->buffer, 2)) + goto fail; + + if (!cursor_malloc_slice(&p->buffer, &u8, bech32->len)) + goto fail; + + if (!cursor_malloc_slice(&p->buffer, &u5, bech32->len)) + goto fail; - blocks->blocks[blocks->num_blocks++] = block; + if (bech32_decode_len(prefix, u5.p, &u5_out_len, bech32->str, + bech32->len, MAX_PREFIX) == BECH32_ENCODING_NONE) { + goto fail; + } + + u5.p += u5_out_len; + + if (!bech32_convert_bits(u8.p, &u8_out_len, 8, u5.start, u5.p - u5.start, 5, 0)) + goto fail; + + u8.p += u8_out_len; + + // move the out cursor to the end of the 8-bit buffer + p->buffer.p = u8.p; + + if (u8_out_len > UINT16_MAX) + goto fail; + + // mark the size of the bech32 buffer + *u8_size = (uint16_t)u8_out_len; + + return 1; + +fail: + p->buffer.p = start; + return 0; +} + +static int push_invoice_str(struct cursor *buf, struct str_block *str) +{ + unsigned char *start; + struct bolt11 *bolt11; + char *fail; + + if (!(bolt11 = bolt11_decode(NULL, str->str, &fail))) + return 0; + + start = buf->p; + if (!ndb_encode_invoice(buf, bolt11)) { + buf->p = start; + tal_free(bolt11); + return 0; + } + + tal_free(bolt11); + return 1; +} + +static int push_block(struct ndb_content_parser *p, struct note_block *block) +{ + // push the tag + if (!cursor_push_varint(&p->buffer, block->type)) + return 0; + + switch (block->type) { + case BLOCK_HASHTAG: + case BLOCK_TEXT: + case BLOCK_URL: + if (!push_str_block(&p->buffer, (const char*)p->content.start, + &block->block.str)) + return 0; + break; + + case BLOCK_MENTION_INDEX: + if (!cursor_push_varint(&p->buffer, block->block.mention_index)) + return 0; + break; + case BLOCK_MENTION_BECH32: + // we only push bech32 strs here + if (!push_bech32_str(p, &block->block.str)) + return 0; + break; + + case BLOCK_INVOICE: + // we only push invoice strs here + if (!push_invoice_str(&p->buffer, &block->block.str)) + return 0; + break; + } + + p->blocks->num_blocks++; + return 1; } -static int add_text_block(struct note_blocks *blocks, const u8 *start, const u8 *end) +static int add_text_block(struct ndb_content_parser *p, + const unsigned char *start, const unsigned char *end) { struct note_block b; @@ -93,10 +240,10 @@ static int add_text_block(struct note_blocks *blocks, const u8 *start, const u8 return 1; b.type = BLOCK_TEXT; - b.block.str.start = (const char*)start; - b.block.str.end = (const char*)end; + b.block.str.str = (const char*)start; + b.block.str.len = end - start; - return add_block(blocks, b); + return push_block(p, &b); } static int consume_url_fragment(struct cursor *cur) @@ -163,10 +310,12 @@ static int consume_url_host(struct cursor *cur) } static int parse_url(struct cursor *cur, struct note_block *block) { - u8 *start = cur->p; - u8 *host; + unsigned char *start = cur->p; + unsigned char *host; + unsigned char tmp[4096]; int host_len; - struct cursor path_cur; + struct cursor path_cur, tmp_cur; + make_cursor(tmp, tmp + sizeof(tmp), &tmp_cur); if (!parse_str(cur, "http")) return 0; @@ -222,29 +371,28 @@ static int parse_url(struct cursor *cur, struct note_block *block) { } // save the bech32 string pos in case we hit a damus.io link - block->block.str.start = (const char *)path_cur.p; + block->block.str.str = (const char *)path_cur.p; // if we have a damus link, make it a mention if (host_len == 8 && !strncmp((const char *)host, "damus.io", 8) - && parse_nostr_bech32(&path_cur, &block->block.mention_bech32.bech32)) + && parse_nostr_bech32_str(&path_cur)) { - block->block.str.end = (const char *)path_cur.p; + block->block.str.len = path_cur.p - path_cur.start; block->type = BLOCK_MENTION_BECH32; return 1; } block->type = BLOCK_URL; - block->block.str.start = (const char *)start; - block->block.str.end = (const char *)cur->p; + block->block.str.str = (const char *)start; + block->block.str.len = cur->p - start; return 1; } static int parse_invoice(struct cursor *cur, struct note_block *block) { - u8 *start, *end; - char *fail; - struct bolt11 *bolt11; + unsigned char *start, *end; + // optional parse_str(cur, "lightning:"); @@ -260,20 +408,10 @@ static int parse_invoice(struct cursor *cur, struct note_block *block) { end = cur->p; - char str[end - start + 1]; - str[end - start] = 0; - memcpy(str, start, end - start); - - if (!(bolt11 = bolt11_decode(NULL, str, &fail))) { - cur->p = start; - return 0; - } - block->type = BLOCK_INVOICE; - block->block.invoice.invstr.start = (const char*)start; - block->block.invoice.invstr.end = (const char*)end; - block->block.invoice.bolt11 = bolt11; + block->block.str.str = (const char*)start; + block->block.str.len = end - start; cur->p = end; @@ -282,107 +420,105 @@ static int parse_invoice(struct cursor *cur, struct note_block *block) { static int parse_mention_bech32(struct cursor *cur, struct note_block *block) { - u8 *start = cur->p; + unsigned char *start = cur->p; parse_char(cur, '@'); parse_str(cur, "nostr:"); - block->block.str.start = (const char *)cur->p; + block->block.str.str = (const char *)cur->p; - if (!parse_nostr_bech32(cur, &block->block.mention_bech32.bech32)) { + if (!parse_nostr_bech32_str(cur)) { cur->p = start; return 0; } - block->block.str.end = (const char *)cur->p; - + block->block.str.len = cur->p - start; block->type = BLOCK_MENTION_BECH32; return 1; } -static int add_text_then_block(struct cursor *cur, struct note_blocks *blocks, struct note_block block, u8 **start, const u8 *pre_mention) +static int add_text_then_block(struct ndb_content_parser *p, + struct note_block *block, + unsigned char **start, + const unsigned char *pre_mention) { - if (!add_text_block(blocks, *start, pre_mention)) + if (!add_text_block(p, *start, pre_mention)) return 0; - *start = (u8*)cur->p; - - if (!add_block(blocks, block)) - return 0; + *start = (unsigned char*)p->content.p; - return 1; + return push_block(p, block); } -int ndb_parse_content(struct note_blocks *blocks, const char *content) { +int ndb_parse_content(unsigned char *buf, int buf_size, + const char *content, int content_len, + struct ndb_note_blocks **blocks_p) +{ int cp, c; - struct cursor cur; + struct ndb_content_parser parser; + struct note_block block; - u8 *start, *pre_mention; - - blocks->words = 0; - blocks->num_blocks = 0; - make_cursor((u8*)content, (u8*)content + strlen(content), &cur); + unsigned char *start, *pre_mention; - start = cur.p; - while (cur.p < cur.end && blocks->num_blocks < MAX_BLOCKS) { - cp = peek_char(&cur, -1); - c = peek_char(&cur, 0); + make_cursor(buf, buf + buf_size, &parser.buffer); + + // allocate some space for the blocks header + *blocks_p = parser.blocks = (struct ndb_note_blocks *)buf; + parser.buffer.p += sizeof(struct ndb_note_blocks); + + make_cursor((unsigned char *)content, + (unsigned char*)content + content_len, &parser.content); + + parser.blocks->words = 0; + parser.blocks->num_blocks = 0; + parser.blocks->blocks_size = 0; + + start = parser.content.p; + while (parser.content.p < parser.content.end) { + cp = peek_char(&parser.content, -1); + c = peek_char(&parser.content, 0); // new word - if (is_whitespace(cp) && !is_whitespace(c)) { - blocks->words++; - } + if (is_whitespace(cp) && !is_whitespace(c)) + parser.blocks->words++; - pre_mention = cur.p; + pre_mention = parser.content.p; if (cp == -1 || is_left_boundary(cp) || c == '#') { - if (c == '#' && (parse_mention_index(&cur, &block) || parse_hashtag(&cur, &block))) { - if (!add_text_then_block(&cur, blocks, block, &start, pre_mention)) + if (c == '#' && (parse_mention_index(&parser.content, &block) || parse_hashtag(&parser.content, &block))) { + if (!add_text_then_block(&parser, &block, &start, pre_mention)) return 0; continue; - } else if ((c == 'h' || c == 'H') && parse_url(&cur, &block)) { - if (!add_text_then_block(&cur, blocks, block, &start, pre_mention)) + } else if ((c == 'h' || c == 'H') && parse_url(&parser.content, &block)) { + if (!add_text_then_block(&parser, &block, &start, pre_mention)) return 0; continue; - } else if ((c == 'l' || c == 'L') && parse_invoice(&cur, &block)) { - if (!add_text_then_block(&cur, blocks, block, &start, pre_mention)) + } else if ((c == 'l' || c == 'L') && parse_invoice(&parser.content, &block)) { + if (!add_text_then_block(&parser, &block, &start, pre_mention)) return 0; continue; - } else if ((c == 'n' || c == '@') && parse_mention_bech32(&cur, &block)) { - if (!add_text_then_block(&cur, blocks, block, &start, pre_mention)) + } else if ((c == 'n' || c == '@') && parse_mention_bech32(&parser.content, &block)) { + if (!add_text_then_block(&parser, &block, &start, pre_mention)) return 0; continue; } } - cur.p++; + parser.content.p++; } - if (cur.p - start > 0) { - if (!add_text_block(blocks, start, cur.p)) + if (parser.content.p - start > 0) { + if (!add_text_block(&parser, start, parser.content.p)) return 0; } - - return 1; -} -void blocks_init(struct note_blocks *blocks) { - blocks->blocks = malloc(sizeof(struct note_block) * MAX_BLOCKS); - blocks->num_blocks = 0; -} + // pad to 8-byte alignment + if (!cursor_align(&parser.buffer, 8)) + return 0; + assert((parser.buffer.p - parser.buffer.start) % 8 == 0); -void blocks_free(struct note_blocks *blocks) { - if (!blocks->blocks) { - return; - } + parser.blocks->blocks_size = parser.buffer.p - parser.buffer.start; - for (int i = 0; i < blocks->num_blocks; ++i) { - if (blocks->blocks[i].type == BLOCK_MENTION_BECH32) { - free(blocks->blocks[i].block.mention_bech32.bech32.buffer); - blocks->blocks[i].block.mention_bech32.bech32.buffer = NULL; - } - } - - free(blocks->blocks); - blocks->num_blocks = 0; + return 1; } + diff --git a/src/invoice.h b/src/invoice.h index acbb221..32c8adf 100644 --- a/src/invoice.h +++ b/src/invoice.h @@ -2,6 +2,9 @@ #ifndef NDB_INVOICE_H #define NDB_INVOICE_H +#include +#include "cursor.h" + struct bolt11; struct ndb_invoice { diff --git a/src/nostrdb.c b/src/nostrdb.c index 65facf4..747dc63 100644 --- a/src/nostrdb.c +++ b/src/nostrdb.c @@ -57,7 +57,6 @@ typedef int (*ndb_word_parser_fn)(void *, const char *word, int word_len, // representation #pragma pack(push, 1) - union ndb_packed_str { struct { char str[3]; @@ -99,6 +98,7 @@ struct ndb_note { #pragma pack(pop) + struct ndb_migration { ndb_migrate_fn fn; }; diff --git a/src/nostrdb.h b/src/nostrdb.h index bf90e2b..b66191f 100644 --- a/src/nostrdb.h +++ b/src/nostrdb.h @@ -20,11 +20,13 @@ struct ndb_json_parser; struct ndb; +struct ndb_note_blocks; struct ndb_note; struct ndb_tag; struct ndb_tags; struct ndb_lmdb; union ndb_packed_str; +struct bolt11; // sorry, swift needs help with forward declared pointers like this struct ndb_t { @@ -383,4 +385,9 @@ const char *ndb_db_name(enum ndb_dbs db); const char *ndb_kind_name(enum ndb_common_kind ck); enum ndb_common_kind ndb_kind_to_common_kind(int kind); +// CONTENT PARSER +int ndb_parse_content(unsigned char *buf, int buf_size, + const char *content, int content_len, + struct ndb_note_blocks **blocks_p); + #endif diff --git a/test.c b/test.c index 677f00b..d57b1aa 100644 --- a/test.c +++ b/test.c @@ -4,6 +4,7 @@ #include "io.h" #include "bolt11/bolt11.h" #include "invoice.h" +#include "block.h" #include "protected_queue.h" #include "memchr.h" #include "print_util.h" @@ -772,6 +773,15 @@ static void test_strings_work_before_finalization() { assert(!strcmp(ndb_note_content(note), "hello")); } +static void test_parse_content() { + const char *content = "hello,#world,https://github.com/damus-io"; + struct ndb_note_blocks *blocks; + unsigned char buf[4096]; + + assert(ndb_parse_content(buf, sizeof(buf), content, strlen(content), &blocks)); + assert(blocks->num_blocks == 4); +} + static void test_tce_eose() { unsigned char buf[1024]; const char json[] = "[\"EOSE\",\"s\"]"; @@ -1059,6 +1069,7 @@ static int test_varints() { } int main(int argc, const char *argv[]) { + test_parse_content(); test_varints(); test_encode_decode_invoice(); test_filters();