From 7e2f2378b9621fb9d9eb71d4cce440c27fbd8c89 Mon Sep 17 00:00:00 2001
From: William Casarin <jb55@jb55.com>
Date: Wed, 27 Dec 2023 12:43:20 -0800
Subject: [PATCH] Inital embedded content parser

This adds some initial code for nostrdb content parsing.

We still need to write tests for encoding and decoding, so this is
likely not working yet.
---
 Makefile             |   4 +-
 src/bolt11/bech32.c  |   7 +-
 src/bolt11/bech32.h  |   3 +-
 src/bolt11/bolt11.h  |   1 +
 src/content_parser.c | 328 ++++++++++++++++++++++++++++++-------------
 src/invoice.h        |   3 +
 src/nostrdb.c        |   2 +-
 src/nostrdb.h        |   7 +
 test.c               |  11 ++
 9 files changed, 264 insertions(+), 102 deletions(-)

diff --git a/Makefile b/Makefile
index edd403c..9be718c 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ CFLAGS = -Wall -Wno-misleading-indentation -Wno-unused-function -Werror -O2 -g -
 HEADERS = deps/lmdb/lmdb.h deps/secp256k1/include/secp256k1.h src/sha256.h src/nostrdb.h src/cursor.h src/hex.h src/jsmn.h src/config.h src/sha256.h src/random.h src/memchr.h src/cpu.h $(C_BINDINGS) 
 FLATCC_SRCS=deps/flatcc/src/runtime/json_parser.c deps/flatcc/src/runtime/verifier.c deps/flatcc/src/runtime/builder.c deps/flatcc/src/runtime/emitter.c deps/flatcc/src/runtime/refmap.c
 BOLT11_SRCS = src/bolt11/bolt11.c src/bolt11/bech32.c src/bolt11/tal.c src/bolt11/talstr.c src/bolt11/take.c src/bolt11/list.c src/bolt11/utf8.c src/bolt11/amount.c src/bolt11/hash_u5.c
-SRCS = src/nostrdb.c src/sha256.c src/invoice.c $(BOLT11_SRCS) $(FLATCC_SRCS)
+SRCS = src/nostrdb.c src/sha256.c src/invoice.c src/nostr_bech32.c src/content_parser.c $(BOLT11_SRCS) $(FLATCC_SRCS)
 LDS = $(OBJS) $(ARS) 
 OBJS = $(SRCS:.c=.o)
 DEPS = $(OBJS) $(HEADERS) $(ARS)
@@ -41,7 +41,7 @@ check: test
 	rm -rf testdata/db/*.mdb
 
 clean:
-	rm -rf test bench bench-ingest bench-ingest-many
+	rm -rf test bench bench-ingest bench-ingest-many $(OBJS)
 
 distclean: clean
 	rm -rf deps
diff --git a/src/bolt11/bech32.c b/src/bolt11/bech32.c
index 321ca53..e1c9e18 100644
--- a/src/bolt11/bech32.c
+++ b/src/bolt11/bech32.c
@@ -91,7 +91,7 @@ int bech32_encode(char *output, const char *hrp, const uint8_t *data, size_t dat
     return 1;
 }
 
-bech32_encoding bech32_decode_len(char* hrp, uint8_t *data, size_t *data_len, const char *input, size_t input_len) {
+bech32_encoding bech32_decode_len(char* hrp, uint8_t *data, size_t *data_len, const char *input, size_t input_len, int max_hrp_len) {
     uint32_t chk = 1;
     size_t i;
     size_t hrp_len;
@@ -104,6 +104,8 @@ bech32_encoding bech32_decode_len(char* hrp, uint8_t *data, size_t *data_len, co
         ++(*data_len);
     }
     hrp_len = input_len - (1 + *data_len);
+    if (hrp_len > max_hrp_len)
+	    return BECH32_ENCODING_NONE;
     if (1 + *data_len >= input_len || *data_len < 6) {
         return BECH32_ENCODING_NONE;
     }
@@ -158,13 +160,14 @@ bech32_encoding bech32_decode(char* hrp, uint8_t *data, size_t *data_len, const
     if (len > max_input_len) {
         return BECH32_ENCODING_NONE;
     }
-    return bech32_decode_len(hrp, data, data_len, input, len);
+    return bech32_decode_len(hrp, data, data_len, input, len, 8);
 }
 
 int bech32_convert_bits(uint8_t* out, size_t* outlen, int outbits, const uint8_t* in, size_t inlen, int inbits, int pad) {
     uint32_t val = 0;
     int bits = 0;
     uint32_t maxv = (((uint32_t)1) << outbits) - 1;
+    *outlen = 0;
     while (inlen--) {
         val = (val << inbits) | *(in++);
         bits += inbits;
diff --git a/src/bolt11/bech32.h b/src/bolt11/bech32.h
index 0650131..f69eb4a 100644
--- a/src/bolt11/bech32.h
+++ b/src/bolt11/bech32.h
@@ -123,7 +123,8 @@ bech32_encoding bech32_decode_len(
     uint8_t *data,
     size_t *data_len,
     const char *input,
-    size_t input_len
+    size_t input_len,
+    int max_prefix_len
 );
 
 /* Helper from bech32: translates inbits-bit bytes to outbits-bit bytes.
diff --git a/src/bolt11/bolt11.h b/src/bolt11/bolt11.h
index 9d368c5..aabbb8a 100644
--- a/src/bolt11/bolt11.h
+++ b/src/bolt11/bolt11.h
@@ -3,6 +3,7 @@
 
 #include "short_types.h"
 #include "hash_u5.h"
+#include "amount.h"
 #include "list.h"
 #include "amount.h"
 #include "node_id.h"
diff --git a/src/content_parser.c b/src/content_parser.c
index ecc46c3..741e73a 100644
--- a/src/content_parser.c
+++ b/src/content_parser.c
@@ -1,11 +1,22 @@
-#include "damus.h"
 #include "cursor.h"
-#include "bolt11.h"
-#include "bech32.h"
+#include "nostr_bech32.h"
+#include "block.h"
+#include "invoice.h"
+#include "bolt11/bolt11.h"
+#include "bolt11/bech32.h"
 #include <stdlib.h>
 #include <string.h>
 
 #include "cursor.h"
+#include "block.h"
+
+struct ndb_content_parser {
+	int bech32_strs;
+	struct cursor buffer;
+	struct cursor content;
+	struct ndb_note_blocks *blocks;
+};
+
 
 static int parse_digit(struct cursor *cur, int *digit) {
 	int c;
@@ -25,7 +36,7 @@ static int parse_digit(struct cursor *cur, int *digit) {
 
 static int parse_mention_index(struct cursor *cur, struct note_block *block) {
 	int d1, d2, d3, ind;
-	u8 *start = cur->p;
+	unsigned char *start = cur->p;
 	
 	if (!parse_str(cur, "#["))
 		return 0;
@@ -56,7 +67,7 @@ static int parse_mention_index(struct cursor *cur, struct note_block *block) {
 
 static int parse_hashtag(struct cursor *cur, struct note_block *block) {
 	int c;
-	u8 *start = cur->p;
+	unsigned char *start = cur->p;
 	
 	if (!parse_char(cur, '#'))
 		return 0;
@@ -70,22 +81,158 @@ static int parse_hashtag(struct cursor *cur, struct note_block *block) {
 	consume_until_boundary(cur);
 	
 	block->type = BLOCK_HASHTAG;
-	block->block.str.start = (const char*)(start + 1);
-	block->block.str.end = (const char*)cur->p;
+	block->block.str.str = (const char*)(start + 1);
+	block->block.str.len = cur->p - (start + 1);
 	
 	return 1;
 }
 
-static int add_block(struct note_blocks *blocks, struct note_block block)
-{
-	if (blocks->num_blocks + 1 >= MAX_BLOCKS)
+static int push_str_block(struct cursor *buf, const char *content, struct str_block *block) {
+	return cursor_push_varint(buf, block->str - content) &&
+	       cursor_push_varint(buf, block->len);
+}
+
+static int pull_str_block(struct cursor *buf, const char *content, struct str_block *block) {
+	uint32_t start;
+	if (!cursor_pull_varint_u32(buf, &start))
 		return 0;
+
+	block->str = content + start;
+
+	return cursor_pull_varint_u32(buf, &block->len);
+}
+
+//
+// decode and push a bech32 string into our blocks output buffer.
+//
+// bech32 blocks are stored as:
+//
+//     nostr_bech32_type  : varint
+//     bech32_buffer_size : u16
+//     bech32_data        : [u8]
+//
+// The TLV form is compact already, so we just use it directly
+//
+// This allows us to not duplicate all of the TLV encoding and decoding code
+// for our on-disk nostrdb format.
+//
+static int push_bech32_str(struct ndb_content_parser *p, struct str_block *bech32)
+{
+	// we decode the raw bech32 directly into the output buffer
+	struct cursor u8, u5;
+	unsigned char *start;
+	uint16_t *u8_size;
+	enum nostr_bech32_type type;
+	size_t u5_out_len, u8_out_len;
+	static const int MAX_PREFIX = 8;
+	char prefix[9] = {0};
+
+	start = p->buffer.p;
+
+	if (!parse_nostr_bech32_type(bech32->str, &type))
+		goto fail;
+
+	if (!cursor_push_varint(&p->buffer, type))
+		goto fail;
+
+	// save a spot for the raw bech32 buffer size
+	u8_size = (uint16_t*)p->buffer.p;
+	if (!cursor_skip(&p->buffer, 2))
+		goto fail;
+
+	if (!cursor_malloc_slice(&p->buffer, &u8, bech32->len))
+		goto fail;
+
+	if (!cursor_malloc_slice(&p->buffer, &u5, bech32->len))
+		goto fail;
 	
-	blocks->blocks[blocks->num_blocks++] = block;
+	if (bech32_decode_len(prefix, u5.p, &u5_out_len, bech32->str,
+			      bech32->len, MAX_PREFIX) == BECH32_ENCODING_NONE) {
+		goto fail;
+	}
+
+	u5.p += u5_out_len;
+
+	if (!bech32_convert_bits(u8.p, &u8_out_len, 8, u5.start, u5.p - u5.start, 5, 0))
+		goto fail;
+
+	u8.p += u8_out_len;
+
+	// move the out cursor to the end of the 8-bit buffer
+	p->buffer.p = u8.p;
+
+	if (u8_out_len > UINT16_MAX)
+		goto fail;
+
+	// mark the size of the bech32 buffer
+	*u8_size = (uint16_t)u8_out_len;
+
+	return 1;
+
+fail:
+	p->buffer.p = start;
+	return 0;
+}
+
+static int push_invoice_str(struct cursor *buf, struct str_block *str)
+{
+	unsigned char *start;
+	struct bolt11 *bolt11;
+	char *fail;
+
+	if (!(bolt11 = bolt11_decode(NULL, str->str, &fail)))
+		return 0;
+
+	start = buf->p;
+	if (!ndb_encode_invoice(buf, bolt11)) {
+		buf->p = start;
+		tal_free(bolt11);
+		return 0;
+	}
+
+	tal_free(bolt11);
+	return 1;
+}
+
+static int push_block(struct ndb_content_parser *p, struct note_block *block)
+{
+	// push the tag
+	if (!cursor_push_varint(&p->buffer, block->type))
+		return 0;
+
+	switch (block->type) {
+	case BLOCK_HASHTAG:
+	case BLOCK_TEXT:
+	case BLOCK_URL:
+		if (!push_str_block(&p->buffer, (const char*)p->content.start,
+			       &block->block.str))
+			return 0;
+		break;
+
+	case BLOCK_MENTION_INDEX:
+		if (!cursor_push_varint(&p->buffer, block->block.mention_index))
+			return 0;
+		break;
+	case BLOCK_MENTION_BECH32:
+		// we only push bech32 strs here
+		if (!push_bech32_str(p, &block->block.str))
+			return 0;
+		break;
+		
+	case BLOCK_INVOICE:
+		// we only push invoice strs here
+		if (!push_invoice_str(&p->buffer, &block->block.str))
+			return 0;
+		break;
+	}
+
+	p->blocks->num_blocks++;
+
 	return 1;
 }
 
-static int add_text_block(struct note_blocks *blocks, const u8 *start, const u8 *end)
+static int add_text_block(struct ndb_content_parser *p,
+			  const unsigned char *start, const unsigned char *end)
 {
 	struct note_block b;
 	
@@ -93,10 +240,10 @@ static int add_text_block(struct note_blocks *blocks, const u8 *start, const u8
 		return 1;
 	
 	b.type = BLOCK_TEXT;
-	b.block.str.start = (const char*)start;
-	b.block.str.end = (const char*)end;
+	b.block.str.str = (const char*)start;
+	b.block.str.len = end - start;
 	
-	return add_block(blocks, b);
+	return push_block(p, &b);
 }
 
 static int consume_url_fragment(struct cursor *cur)
@@ -163,10 +310,12 @@ static int consume_url_host(struct cursor *cur)
 }
 
 static int parse_url(struct cursor *cur, struct note_block *block) {
-	u8 *start = cur->p;
-	u8 *host;
+	unsigned char *start = cur->p;
+	unsigned char *host;
+	unsigned char tmp[4096];
 	int host_len;
-	struct cursor path_cur;
+	struct cursor path_cur, tmp_cur;
+	make_cursor(tmp, tmp + sizeof(tmp), &tmp_cur);
 	
 	if (!parse_str(cur, "http"))
 		return 0;
@@ -222,29 +371,28 @@ static int parse_url(struct cursor *cur, struct note_block *block) {
 	}
 
 	// save the bech32 string pos in case we hit a damus.io link
-	block->block.str.start = (const char *)path_cur.p;
+	block->block.str.str = (const char *)path_cur.p;
 
 	// if we have a damus link, make it a mention
 	if (host_len == 8
 	&& !strncmp((const char *)host, "damus.io", 8)
-	&& parse_nostr_bech32(&path_cur, &block->block.mention_bech32.bech32))
+	&& parse_nostr_bech32_str(&path_cur))
 	{
-		block->block.str.end = (const char *)path_cur.p;
+		block->block.str.len = path_cur.p - path_cur.start;
 		block->type = BLOCK_MENTION_BECH32;
 		return 1;
 	}
 
 	block->type = BLOCK_URL;
-	block->block.str.start = (const char *)start;
-	block->block.str.end = (const char *)cur->p;
+	block->block.str.str = (const char *)start;
+	block->block.str.len = cur->p - start;
 	
 	return 1;
 }
 
 static int parse_invoice(struct cursor *cur, struct note_block *block) {
-	u8 *start, *end;
-	char *fail;
-	struct bolt11 *bolt11;
+	unsigned char *start, *end;
+
 	// optional
 	parse_str(cur, "lightning:");
 	
@@ -260,20 +408,10 @@ static int parse_invoice(struct cursor *cur, struct note_block *block) {
 	
 	end = cur->p;
 	
-	char str[end - start + 1];
-	str[end - start] = 0;
-	memcpy(str, start, end - start);
-	
-	if (!(bolt11 = bolt11_decode(NULL, str, &fail))) {
-		cur->p = start;
-		return 0;
-	}
-	
 	block->type = BLOCK_INVOICE;
 	
-	block->block.invoice.invstr.start = (const char*)start;
-	block->block.invoice.invstr.end = (const char*)end;
-	block->block.invoice.bolt11 = bolt11;
+	block->block.str.str = (const char*)start;
+	block->block.str.len = end - start;
 	
 	cur->p = end;
 	
@@ -282,107 +420,105 @@ static int parse_invoice(struct cursor *cur, struct note_block *block) {
 
 
 static int parse_mention_bech32(struct cursor *cur, struct note_block *block) {
-	u8 *start = cur->p;
+	unsigned char *start = cur->p;
 	
 	parse_char(cur, '@');
 	parse_str(cur, "nostr:");
 
-	block->block.str.start = (const char *)cur->p;
+	block->block.str.str = (const char *)cur->p;
 	
-	if (!parse_nostr_bech32(cur, &block->block.mention_bech32.bech32)) {
+	if (!parse_nostr_bech32_str(cur)) {
 		cur->p = start;
 		return 0;
 	}
 	
-	block->block.str.end = (const char *)cur->p;
-	
+	block->block.str.len = cur->p - start;
 	block->type = BLOCK_MENTION_BECH32;
 
 	return 1;
 }
 
-static int add_text_then_block(struct cursor *cur, struct note_blocks *blocks, struct note_block block, u8 **start, const u8 *pre_mention)
+static int add_text_then_block(struct ndb_content_parser *p,
+			       struct note_block *block,
+			       unsigned char **start,
+			       const unsigned char *pre_mention)
 {
-	if (!add_text_block(blocks, *start, pre_mention))
+	if (!add_text_block(p, *start, pre_mention))
 		return 0;
 	
-	*start = (u8*)cur->p;
-	
-	if (!add_block(blocks, block))
-		return 0;
+	*start = (unsigned char*)p->content.p;
 	
-	return 1;
+	return push_block(p, block);
 }
 
-int ndb_parse_content(struct note_blocks *blocks, const char *content) {
+int ndb_parse_content(unsigned char *buf, int buf_size,
+		      const char *content, int content_len,
+		      struct ndb_note_blocks **blocks_p)
+{
 	int cp, c;
-	struct cursor cur;
+	struct ndb_content_parser parser;
+
 	struct note_block block;
-	u8 *start, *pre_mention;
-	
-	blocks->words = 0;
-	blocks->num_blocks = 0;
-	make_cursor((u8*)content, (u8*)content + strlen(content), &cur);
+	unsigned char *start, *pre_mention;
 	
-	start = cur.p;
-	while (cur.p < cur.end && blocks->num_blocks < MAX_BLOCKS) {
-		cp = peek_char(&cur, -1);
-		c  = peek_char(&cur, 0);
+	make_cursor(buf, buf + buf_size, &parser.buffer);
+
+	// allocate some space for the blocks header
+	*blocks_p = parser.blocks = (struct ndb_note_blocks *)buf;
+	parser.buffer.p += sizeof(struct ndb_note_blocks);
+
+	make_cursor((unsigned char *)content,
+		    (unsigned char*)content + content_len, &parser.content);
+
+	parser.blocks->words = 0;
+	parser.blocks->num_blocks = 0;
+	parser.blocks->blocks_size = 0;
+
+	start = parser.content.p;
+	while (parser.content.p < parser.content.end) {
+		cp = peek_char(&parser.content, -1);
+		c  = peek_char(&parser.content, 0);
 		
 		// new word
-		if (is_whitespace(cp) && !is_whitespace(c)) {
-			blocks->words++;
-		}
+		if (is_whitespace(cp) && !is_whitespace(c))
+			parser.blocks->words++;
 		
-		pre_mention = cur.p;
+		pre_mention = parser.content.p;
 		if (cp == -1 || is_left_boundary(cp) || c == '#') {
-			if (c == '#' && (parse_mention_index(&cur, &block) || parse_hashtag(&cur, &block))) {
-				if (!add_text_then_block(&cur, blocks, block, &start, pre_mention))
+			if (c == '#' && (parse_mention_index(&parser.content, &block) || parse_hashtag(&parser.content, &block))) {
+				if (!add_text_then_block(&parser, &block, &start, pre_mention))
 					return 0;
 				continue;
-			} else if ((c == 'h' || c == 'H') && parse_url(&cur, &block)) {
-				if (!add_text_then_block(&cur, blocks, block, &start, pre_mention))
+			} else if ((c == 'h' || c == 'H') && parse_url(&parser.content, &block)) {
+				if (!add_text_then_block(&parser, &block, &start, pre_mention))
 					return 0;
 				continue;
-			} else if ((c == 'l' || c == 'L') && parse_invoice(&cur, &block)) {
-				if (!add_text_then_block(&cur, blocks, block, &start, pre_mention))
+			} else if ((c == 'l' || c == 'L') && parse_invoice(&parser.content, &block)) {
+				if (!add_text_then_block(&parser, &block, &start, pre_mention))
 					return 0;
 				continue;
-			} else if ((c == 'n' || c == '@') && parse_mention_bech32(&cur, &block)) {
-				if (!add_text_then_block(&cur, blocks, block, &start, pre_mention))
+			} else if ((c == 'n' || c == '@') && parse_mention_bech32(&parser.content, &block)) {
+				if (!add_text_then_block(&parser, &block, &start, pre_mention))
 					return 0;
 				continue;
 			}
 		}
 		
-		cur.p++;
+		parser.content.p++;
 	}
 	
-	if (cur.p - start > 0) {
-		if (!add_text_block(blocks, start, cur.p))
+	if (parser.content.p - start > 0) {
+		if (!add_text_block(&parser, start, parser.content.p))
 			return 0;
 	}
-	
-	return 1;
-}
 
-void blocks_init(struct note_blocks *blocks) {
-	blocks->blocks = malloc(sizeof(struct note_block) * MAX_BLOCKS);
-	blocks->num_blocks = 0;
-}
+	// pad to 8-byte alignment
+	if (!cursor_align(&parser.buffer, 8))
+		return 0;
+	assert((parser.buffer.p - parser.buffer.start) % 8 == 0);
 
-void blocks_free(struct note_blocks *blocks) {
-	if (!blocks->blocks) {
-		return;
-	}
+	parser.blocks->blocks_size = parser.buffer.p - parser.buffer.start;
 	
-	for (int i = 0; i < blocks->num_blocks; ++i) {
-		if (blocks->blocks[i].type == BLOCK_MENTION_BECH32) {
-			free(blocks->blocks[i].block.mention_bech32.bech32.buffer);
-			blocks->blocks[i].block.mention_bech32.bech32.buffer = NULL;
-		}
-	}
-
-	free(blocks->blocks);
-	blocks->num_blocks = 0;
+	return 1;
 }
+
diff --git a/src/invoice.h b/src/invoice.h
index acbb221..32c8adf 100644
--- a/src/invoice.h
+++ b/src/invoice.h
@@ -2,6 +2,9 @@
 #ifndef NDB_INVOICE_H
 #define NDB_INVOICE_H
 
+#include <inttypes.h>
+#include "cursor.h"
+
 struct bolt11;
 
 struct ndb_invoice {
diff --git a/src/nostrdb.c b/src/nostrdb.c
index 65facf4..747dc63 100644
--- a/src/nostrdb.c
+++ b/src/nostrdb.c
@@ -57,7 +57,6 @@ typedef int (*ndb_word_parser_fn)(void *, const char *word, int word_len,
 // representation
 #pragma pack(push, 1)
 
-
 union ndb_packed_str {
 	struct {
 		char str[3];
@@ -99,6 +98,7 @@ struct ndb_note {
 
 #pragma pack(pop)
 
+
 struct ndb_migration {
 	ndb_migrate_fn fn;
 };
diff --git a/src/nostrdb.h b/src/nostrdb.h
index bf90e2b..b66191f 100644
--- a/src/nostrdb.h
+++ b/src/nostrdb.h
@@ -20,11 +20,13 @@
 
 struct ndb_json_parser;
 struct ndb;
+struct ndb_note_blocks;
 struct ndb_note;
 struct ndb_tag;
 struct ndb_tags;
 struct ndb_lmdb;
 union ndb_packed_str;
+struct bolt11;
 
 // sorry, swift needs help with forward declared pointers like this
 struct ndb_t {
@@ -383,4 +385,9 @@ const char *ndb_db_name(enum ndb_dbs db);
 const char *ndb_kind_name(enum ndb_common_kind ck);
 enum ndb_common_kind ndb_kind_to_common_kind(int kind);
 
+// CONTENT PARSER
+int ndb_parse_content(unsigned char *buf, int buf_size,
+		      const char *content, int content_len,
+		      struct ndb_note_blocks **blocks_p);
+
 #endif
diff --git a/test.c b/test.c
index 677f00b..d57b1aa 100644
--- a/test.c
+++ b/test.c
@@ -4,6 +4,7 @@
 #include "io.h"
 #include "bolt11/bolt11.h"
 #include "invoice.h"
+#include "block.h"
 #include "protected_queue.h"
 #include "memchr.h"
 #include "print_util.h"
@@ -772,6 +773,15 @@ static void test_strings_work_before_finalization() {
 	assert(!strcmp(ndb_note_content(note), "hello"));
 }
 
+static void test_parse_content() {
+	const char *content = "hello,#world,https://github.com/damus-io";
+	struct ndb_note_blocks *blocks;
+	unsigned char buf[4096];
+
+	assert(ndb_parse_content(buf, sizeof(buf), content, strlen(content), &blocks));
+	assert(blocks->num_blocks == 4);
+}
+
 static void test_tce_eose() {
 	unsigned char buf[1024];
 	const char json[] = "[\"EOSE\",\"s\"]";
@@ -1059,6 +1069,7 @@ static int test_varints() {
 }
 
 int main(int argc, const char *argv[]) {
+	test_parse_content();
 	test_varints();
 	test_encode_decode_invoice();
 	test_filters();