From c9ba1a02af92ae6488b87e859761e5ce639e1b33 Mon Sep 17 00:00:00 2001
From: Protobuf Team Bot
Date: Thu, 19 Dec 2024 22:22:29 -0800
Subject: [PATCH] Don't track the size of each allocated block any more

This saves us 8 bytes per block on 64-bit builds, and we no longer need
to traverse the linked list of blocks to check allocated space, which
means we also no longer need atomics in the linked list or even in its
head.

This is especially beneficial because the previous implementation
contained a race where we could dereference uninitialized memory: the
setting of the `next` pointers did not use release semantics, and
`SpaceAllocated` read them with relaxed order, so there is no guarantee
that `size` has actually been initialized - but worse, *there is also no
guarantee that `next` has been!*

Simplified:

```
AddBlock:
1 ptr = malloc();
2 ptr->size = 123;
3 ptr->next = ai->blocks;
4 ai->blocks = ptr (release order);
```

```
SpaceAllocated:
5 block = ai->blocks (relaxed order)
6 block->size (acquire, but probably by accident)
7 block = block->next (relaxed order)
```

So I think a second thread calling SpaceAllocated could see the order
1, 4, 5, 6, 7, 2, 3 and read uninitialized memory - there is no
data-dependency relationship or happens-before edge that this order
violates, so it would be valid for a compiler and hardware to produce.

In reality, operation 4 will produce an `stlr` on ARM (forcing an order
of 1, 2, 3 before 4), and `block->next` has a data dependency on
`ai->blocks` which would force an ordering in the hardware between 5->6
and 5->7 even for regular `ldr` instructions. The fix would be for
`SpaceAllocated` to read `ai->blocks` with acquire order, but with this
CL that's moot; a standalone sketch of the pattern and of the acquire
fix follows the diff below. Please check my work, as I'm less familiar
with the C/C++ memory model.

Also delete `_upb_Arena_Contains`: it is private, and its only user is
its own test.

PiperOrigin-RevId: 708180547
---
 upb/mem/arena.c          | 64 ++++++++++++++++------------------------
 upb/mem/arena_test.cc    | 38 ------------------------
 upb/mem/internal/arena.h |  7 +----
 3 files changed, 26 insertions(+), 83 deletions(-)

diff --git a/upb/mem/arena.c b/upb/mem/arena.c
index 79f42d8f9690d..4594ce7e329de 100644
--- a/upb/mem/arena.c
+++ b/upb/mem/arena.c
@@ -28,9 +28,7 @@ void upb_Arena_SetMaxBlockSize(size_t max) {
 }
 
 typedef struct upb_MemBlock {
-  // Atomic only for the benefit of SpaceAllocated().
-  UPB_ATOMIC(struct upb_MemBlock*) next;
-  uint32_t size;
+  struct upb_MemBlock* next;
   // Data follows.
 } upb_MemBlock;
 
@@ -62,9 +60,11 @@ typedef struct upb_ArenaInternal {
   // == self when no other list members.
   UPB_ATOMIC(struct upb_ArenaInternal*) tail;
 
-  // Linked list of blocks to free/cleanup. Atomic only for the benefit of
-  // upb_Arena_SpaceAllocated().
-  UPB_ATOMIC(upb_MemBlock*) blocks;
+  // Linked list of blocks to free/cleanup.
+  upb_MemBlock* blocks;
+
+  // Total space allocated in blocks, atomic only for SpaceAllocated().
+  UPB_ATOMIC(uint32_t) space_allocated;
 } upb_ArenaInternal;
 
 // All public + private state for an arena.
@@ -207,11 +207,7 @@ size_t upb_Arena_SpaceAllocated(upb_Arena* arena, size_t* fused_count) {
   size_t memsize = 0;
   size_t local_fused_count = 0;
   while (ai != NULL) {
-    upb_MemBlock* block = upb_Atomic_Load(&ai->blocks, memory_order_relaxed);
-    while (block != NULL) {
-      memsize += sizeof(upb_MemBlock) + block->size;
-      block = upb_Atomic_Load(&block->next, memory_order_relaxed);
-    }
+    memsize += upb_Atomic_Load(&ai->space_allocated, memory_order_relaxed);
     ai = upb_Atomic_Load(&ai->next, memory_order_relaxed);
     local_fused_count++;
   }
@@ -220,21 +216,6 @@ size_t upb_Arena_SpaceAllocated(upb_Arena* arena, size_t* fused_count) {
   return memsize;
 }
 
-bool UPB_PRIVATE(_upb_Arena_Contains)(const upb_Arena* a, void* ptr) {
-  upb_ArenaInternal* ai = upb_Arena_Internal(a);
-  UPB_ASSERT(ai);
-
-  upb_MemBlock* block = upb_Atomic_Load(&ai->blocks, memory_order_relaxed);
-  while (block) {
-    uintptr_t beg = (uintptr_t)block;
-    uintptr_t end = beg + block->size;
-    if ((uintptr_t)ptr >= beg && (uintptr_t)ptr < end) return true;
-    block = upb_Atomic_Load(&block->next, memory_order_relaxed);
-  }
-
-  return false;
-}
-
 uint32_t upb_Arena_DebugRefCount(upb_Arena* a) {
   upb_ArenaInternal* ai = upb_Arena_Internal(a);
   // These loads could probably be relaxed, but given that this is debug-only,
@@ -252,9 +233,8 @@ static void _upb_Arena_AddBlock(upb_Arena* a, void* ptr, size_t size) {
   upb_MemBlock* block = ptr;
 
   // Insert into linked list.
-  block->size = (uint32_t)size;
-  upb_Atomic_Init(&block->next, ai->blocks);
-  upb_Atomic_Store(&ai->blocks, block, memory_order_release);
+  block->next = ai->blocks;
+  ai->blocks = block;
 
   a->UPB_PRIVATE(ptr) = UPB_PTR_AT(block, kUpb_MemblockReserve, char);
   a->UPB_PRIVATE(end) = UPB_PTR_AT(block, size, char);
@@ -267,7 +247,7 @@ static bool _upb_Arena_AllocBlock(upb_Arena* a, size_t size) {
   upb_ArenaInternal* ai = upb_Arena_Internal(a);
   if (!ai->block_alloc) return false;
   size_t last_size = 128;
-  upb_MemBlock* last_block = upb_Atomic_Load(&ai->blocks, memory_order_relaxed);
+  upb_MemBlock* last_block = ai->blocks;
   if (last_block) {
     last_size = a->UPB_PRIVATE(end) - (char*)last_block;
   }
@@ -288,6 +268,13 @@
   if (!block) return false;
   _upb_Arena_AddBlock(a, block, block_size);
+  // Atomic add not required here, as threads won't race allocating blocks;
+  // also, atomic fetch-add is slower than load/add/store on ARM devices
+  // compiled targeting pre-v8.1.
+  uint32_t old_block_size =
+      upb_Atomic_Load(&ai->space_allocated, memory_order_relaxed);
+  upb_Atomic_Store(&ai->space_allocated, block_size + old_block_size,
+                   memory_order_relaxed);
   UPB_ASSERT(UPB_PRIVATE(_upb_ArenaHas)(a) >= size);
   return true;
 }
@@ -316,7 +303,8 @@ static upb_Arena* _upb_Arena_InitSlow(upb_alloc* alloc) {
   upb_Atomic_Init(&a->body.parent_or_count, _upb_Arena_TaggedFromRefcount(1));
   upb_Atomic_Init(&a->body.next, NULL);
   upb_Atomic_Init(&a->body.tail, &a->body);
-  upb_Atomic_Init(&a->body.blocks, NULL);
+  upb_Atomic_Init(&a->body.space_allocated, n);
+  a->body.blocks = NULL;
   a->body.upb_alloc_cleanup = NULL;
 
   _upb_Arena_AddBlock(&a->head, mem, n);
@@ -355,7 +343,8 @@ upb_Arena* upb_Arena_Init(void* mem, size_t n, upb_alloc* alloc) {
   upb_Atomic_Init(&a->body.parent_or_count, _upb_Arena_TaggedFromRefcount(1));
   upb_Atomic_Init(&a->body.next, NULL);
   upb_Atomic_Init(&a->body.tail, &a->body);
-  upb_Atomic_Init(&a->body.blocks, NULL);
+  upb_Atomic_Init(&a->body.space_allocated, 0);
+  a->body.blocks = NULL;
   a->body.upb_alloc_cleanup = NULL;
   a->body.block_alloc = _upb_Arena_MakeBlockAlloc(alloc, 1);
@@ -374,12 +363,11 @@ static void _upb_Arena_DoFree(upb_ArenaInternal* ai) {
   upb_ArenaInternal* next_arena =
       (upb_ArenaInternal*)upb_Atomic_Load(&ai->next, memory_order_acquire);
   upb_alloc* block_alloc = _upb_ArenaInternal_BlockAlloc(ai);
-  upb_MemBlock* block = upb_Atomic_Load(&ai->blocks, memory_order_acquire);
+  upb_MemBlock* block = ai->blocks;
   upb_AllocCleanupFunc* alloc_cleanup = *ai->upb_alloc_cleanup;
   while (block != NULL) {
     // Load first since we are deleting block.
-    upb_MemBlock* next_block =
-        upb_Atomic_Load(&block->next, memory_order_acquire);
+    upb_MemBlock* next_block = block->next;
     upb_free(block_alloc, block);
     block = next_block;
   }
@@ -601,8 +589,7 @@ void UPB_PRIVATE(_upb_Arena_SwapIn)(upb_Arena* des, const upb_Arena* src) {
 
   *des = *src;
   desi->block_alloc = srci->block_alloc;
-  upb_MemBlock* blocks = upb_Atomic_Load(&srci->blocks, memory_order_relaxed);
-  upb_Atomic_Init(&desi->blocks, blocks);
+  desi->blocks = srci->blocks;
 }
 
 void UPB_PRIVATE(_upb_Arena_SwapOut)(upb_Arena* des, const upb_Arena* src) {
@@ -610,6 +597,5 @@
   upb_ArenaInternal* srci = upb_Arena_Internal(src);
 
   *des = *src;
-  upb_MemBlock* blocks = upb_Atomic_Load(&srci->blocks, memory_order_relaxed);
-  upb_Atomic_Store(&desi->blocks, blocks, memory_order_relaxed);
+  desi->blocks = srci->blocks;
 }
diff --git a/upb/mem/arena_test.cc b/upb/mem/arena_test.cc
index 292dcf340ca98..9b373bd8b9dfe 100644
--- a/upb/mem/arena_test.cc
+++ b/upb/mem/arena_test.cc
@@ -172,44 +172,6 @@ TEST(ArenaTest, FuzzSingleThreaded) {
   }
 }
 
-TEST(ArenaTest, Contains) {
-  upb_Arena* arena1 = upb_Arena_New();
-  upb_Arena* arena2 = upb_Arena_New();
-  void* ptr1a = upb_Arena_Malloc(arena1, 8);
-  void* ptr2a = upb_Arena_Malloc(arena2, 8);
-
-  EXPECT_TRUE(UPB_PRIVATE(_upb_Arena_Contains)(arena1, ptr1a));
-  EXPECT_TRUE(UPB_PRIVATE(_upb_Arena_Contains)(arena2, ptr2a));
-  EXPECT_FALSE(UPB_PRIVATE(_upb_Arena_Contains)(arena1, ptr2a));
-  EXPECT_FALSE(UPB_PRIVATE(_upb_Arena_Contains)(arena2, ptr1a));
-
-  void* ptr1b = upb_Arena_Malloc(arena1, 1000000);
-  void* ptr2b = upb_Arena_Malloc(arena2, 1000000);
-
-  EXPECT_TRUE(UPB_PRIVATE(_upb_Arena_Contains)(arena1, ptr1a));
-  EXPECT_TRUE(UPB_PRIVATE(_upb_Arena_Contains)(arena1, ptr1b));
-  EXPECT_TRUE(UPB_PRIVATE(_upb_Arena_Contains)(arena2, ptr2a));
-  EXPECT_TRUE(UPB_PRIVATE(_upb_Arena_Contains)(arena2, ptr2b));
-
-  EXPECT_FALSE(UPB_PRIVATE(_upb_Arena_Contains)(arena1, ptr2a));
-  EXPECT_FALSE(UPB_PRIVATE(_upb_Arena_Contains)(arena1, ptr2b));
-  EXPECT_FALSE(UPB_PRIVATE(_upb_Arena_Contains)(arena2, ptr1a));
-  EXPECT_FALSE(UPB_PRIVATE(_upb_Arena_Contains)(arena2, ptr1b));
-
-  upb_Arena_Fuse(arena1, arena2);
-
-  EXPECT_TRUE(UPB_PRIVATE(_upb_Arena_Contains)(arena1, ptr1a));
-  EXPECT_TRUE(UPB_PRIVATE(_upb_Arena_Contains)(arena1, ptr1b));
-  EXPECT_TRUE(UPB_PRIVATE(_upb_Arena_Contains)(arena2, ptr2a));
-  EXPECT_TRUE(UPB_PRIVATE(_upb_Arena_Contains)(arena2, ptr2b));
-  EXPECT_FALSE(UPB_PRIVATE(_upb_Arena_Contains)(arena1, ptr2a));
-  EXPECT_FALSE(UPB_PRIVATE(_upb_Arena_Contains)(arena1, ptr2b));
-  EXPECT_FALSE(UPB_PRIVATE(_upb_Arena_Contains)(arena2, ptr1a));
-  EXPECT_FALSE(UPB_PRIVATE(_upb_Arena_Contains)(arena2, ptr1b));
-
-  upb_Arena_Free(arena1);
-  upb_Arena_Free(arena2);
-}
-
 TEST(ArenaTest, LargeAlloc) {
   // Tests an allocation larger than the max block size.
   upb_Arena* arena = upb_Arena_New();
diff --git a/upb/mem/internal/arena.h b/upb/mem/internal/arena.h
index 08aacde1eac81..f69c621c7556a 100644
--- a/upb/mem/internal/arena.h
+++ b/upb/mem/internal/arena.h
@@ -20,7 +20,7 @@
 //
 // We need this because the decoder inlines a upb_Arena for performance but
 // the full struct is not visible outside of arena.c. Yes, I know, it's awful.
-#define UPB_ARENA_SIZE_HACK 8
+#define UPB_ARENA_SIZE_HACK 9
 
 // LINT.IfChange(upb_Arena)
 
@@ -40,11 +40,6 @@ void UPB_PRIVATE(_upb_Arena_SwapIn)(struct upb_Arena* des,
 void UPB_PRIVATE(_upb_Arena_SwapOut)(struct upb_Arena* des,
                                      const struct upb_Arena* src);
 
-// Returns whether |ptr| was allocated directly by |a| (so care must be used
-// with fused arenas).
-UPB_API bool UPB_ONLYBITS(_upb_Arena_Contains)(const struct upb_Arena* a,
-                                               void* ptr);
-
 UPB_INLINE size_t UPB_PRIVATE(_upb_ArenaHas)(const struct upb_Arena* a) {
   return (size_t)(a->UPB_ONLYBITS(end) - a->UPB_ONLYBITS(ptr));
 }
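
For reference, the race described in the commit message and the acquire-order fix it mentions can be written out as a self-contained C11 sketch. This is an illustration only: it uses `<stdatomic.h>` directly instead of upb's `UPB_ATOMIC`/`upb_Atomic_*` wrappers, and the `Arena`/`Block` types and function names are hypothetical, not upb's real API.

```
#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct Block {
  struct Block* next;  // plain field: written only before the block is published
  uint32_t size;
} Block;

typedef struct {
  _Atomic(Block*) blocks;  // head of the block list
} Arena;

// Writer; the numbers match the operations in the commit message.
static void add_block(Arena* a, uint32_t size) {
  Block* b = malloc(sizeof(Block) + size);               // (1)
  b->size = size;                                        // (2)
  b->next = atomic_load_explicit(&a->blocks,
                                 memory_order_relaxed);  // (3)
  // (4) Release store: (1)-(3) happen-before any acquire load that reads
  // this value of the head.
  atomic_store_explicit(&a->blocks, b, memory_order_release);
}

// Buggy reader: the relaxed load (5) does not synchronize-with (4), so the
// reads of size (6) and next (7) may observe uninitialized memory.
static size_t space_allocated_racy(Arena* a) {
  size_t total = 0;
  for (Block* b = atomic_load_explicit(&a->blocks,
                                       memory_order_relaxed);  // (5)
       b != NULL; b = b->next) {                               // (7)
    total += sizeof(Block) + b->size;                          // (6)
  }
  return total;
}

// Fixed reader: the acquire load pairs with the release store (4), so the
// fields of every block published before that store are fully visible.
static size_t space_allocated_fixed(Arena* a) {
  size_t total = 0;
  for (Block* b = atomic_load_explicit(&a->blocks, memory_order_acquire);
       b != NULL; b = b->next) {
    total += sizeof(Block) + b->size;
  }
  return total;
}
```

With this CL the question is moot, since `upb_Arena_SpaceAllocated` no longer walks the block list at all; the sketch only records why the narrow fix would have been an acquire load of the list head.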
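
The new accounting scheme can also be shown in miniature. The following hedged sketch assumes, as the comment added in `_upb_Arena_AllocBlock` states, that only one thread ever allocates blocks for a given arena; the `ArenaCounter` type and function names are again illustrative rather than upb's real API.

```
#include <stdatomic.h>
#include <stdint.h>

typedef struct {
  _Atomic uint32_t space_allocated;  // total bytes allocated in blocks
} ArenaCounter;

// Writer side: blocks are allocated by a single thread per arena, so an
// atomic read-modify-write is unnecessary. A relaxed load/add/store avoids
// atomic_fetch_add, which lowers to an ldxr/stxr retry loop on ARM cores
// that predate v8.1's LSE atomics (the slowness the diff comment notes).
static void on_block_alloc(ArenaCounter* c, uint32_t block_size) {
  uint32_t old = atomic_load_explicit(&c->space_allocated,
                                      memory_order_relaxed);
  atomic_store_explicit(&c->space_allocated, old + block_size,
                        memory_order_relaxed);
}

// Reader side: any thread may read the counter. Relaxed order suffices
// because the value is a statistic that may be momentarily stale, and
// unlike the old list walk there is no pointer to dereference behind it.
static uint32_t read_space_allocated(ArenaCounter* c) {
  return atomic_load_explicit(&c->space_allocated, memory_order_relaxed);
}
```

This mirrors the shape of the change in `upb_Arena_SpaceAllocated`: each fused arena contributes one relaxed counter load instead of a list traversal.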