From ebf641671eeacc6372db65cd0d15a623767ce174 Mon Sep 17 00:00:00 2001
From: Arseny Kapoulkine
Date: Mon, 13 Jan 2025 10:45:32 -0800
Subject: [PATCH] vertexcodec: Optimize encoding selection of zero groups

When checking if a byte group can be encoded as zero, we need to check
16 bytes; to reduce branch mispredictions we can load the byte group
into two 64-bit registers and check the bitwise or. This results in
slightly suboptimal codegen for gcc, but is optimal for clang/MSVC.

This function can also be used to determine if a given vertex block can
use zero encoding as a control mode. For cases when the zero encoding is
selected, this scans the bytes faster and does not rely on
auto-vectorization which sometimes synthesizes rather poor code in this
case.

This change makes encoding ~5-10% faster depending on the data.
---
 src/vertexcodec.cpp | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/src/vertexcodec.cpp b/src/vertexcodec.cpp
index e41d2f295..739571103 100644
--- a/src/vertexcodec.cpp
+++ b/src/vertexcodec.cpp
@@ -138,12 +138,9 @@ const int kEncodeDefaultLevel = 2;
 
 static size_t getVertexBlockSize(size_t vertex_size)
 {
-	// make sure the entire block fits into the scratch buffer
-	size_t result = kVertexBlockSizeBytes / vertex_size;
-
-	// align to byte group size; we encode each byte as a byte group
-	// if vertex block is misaligned, it results in wasted bytes, so just truncate the block size
-	result &= ~(kByteGroupSize - 1);
+	// make sure the entire block fits into the scratch buffer and is aligned to byte group size
+	// note: the block size is implicitly part of the format, so we can't change it without breaking compatibility
+	size_t result = (kVertexBlockSizeBytes / vertex_size) & ~(kByteGroupSize - 1);
 
 	return (result < kVertexBlockMaxSize) ? result : kVertexBlockMaxSize;
 }
@@ -179,13 +176,14 @@ static Stats* bytestats = NULL;
 static Stats vertexstats[256];
 #endif
 
-static bool canEncodeZero(const unsigned char* buffer, size_t buffer_size)
+static bool encodeBytesGroupZero(const unsigned char* buffer)
 {
-	for (size_t i = 0; i < buffer_size; ++i)
-		if (buffer[i])
-			return false;
+	assert(kByteGroupSize == sizeof(unsigned long long) * 2);
 
-	return true;
+	unsigned long long v[2];
+	memcpy(v, buffer, sizeof(v));
+
+	return (v[0] | v[1]) == 0;
 }
 
 static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
@@ -193,7 +191,7 @@ static size_t encodeBytesGroupMeasure(const unsigned char* buffer, int bits)
 	assert(bits >= 0 && bits <= 8);
 
 	if (bits == 0)
-		return canEncodeZero(buffer, kByteGroupSize) ? 0 : size_t(-1);
+		return encodeBytesGroupZero(buffer) ? 0 : size_t(-1);
 
 	if (bits == 8)
 		return kByteGroupSize;
@@ -455,9 +453,18 @@ static int estimateChannel(const unsigned char* vertex_data, size_t vertex_count
 	return best_channel == 2 ? best_channel | (xor_rot << 4) : best_channel;
 }
 
+static bool estimateControlZero(const unsigned char* buffer, size_t vertex_count_aligned)
+{
+	for (size_t i = 0; i < vertex_count_aligned; i += kByteGroupSize)
+		if (!encodeBytesGroupZero(buffer + i))
+			return false;
+
+	return true;
+}
+
 static int estimateControl(const unsigned char* buffer, size_t vertex_count, size_t vertex_count_aligned, int level)
 {
-	if (canEncodeZero(buffer, vertex_count))
+	if (estimateControlZero(buffer, vertex_count_aligned))
 		return 2; // zero encoding
 
 	if (level == 0)