Skip to content

Commit

Permalink
Moved FastConvert logic to own method. Removed _length happening with…
Browse files Browse the repository at this point in the history
…in loops.
  • Loading branch information
CptMoore committed Jan 11, 2025
1 parent 10cb760 commit ae97c64
Showing 1 changed file with 77 additions and 68 deletions.
145 changes: 77 additions & 68 deletions ModTek/Features/Logging/FastBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -157,90 +157,99 @@ internal void Append(string value)
var dstPtr = _bufferPtr + _length;
var srcPtr = (byte*)chars + s_charLowBitsPosition;

// parallelism isn't what makes it particularly fast, it's the batching that is helpful (fewer ops overall)
// 8 is a sweet spot, since we can do the ASCII bit mask check with a ulong
if (FastConvert(dstPtr, srcPtr, ref processingCount))
{
const int IterSize = 8;
for (; processingCount >= IterSize; processingCount -= IterSize)
{
*(dstPtr + 0) = *(srcPtr + 0 * 2);
*(dstPtr + 1) = *(srcPtr + 1 * 2);
*(dstPtr + 2) = *(srcPtr + 2 * 2);
*(dstPtr + 3) = *(srcPtr + 3 * 2);
*(dstPtr + 4) = *(srcPtr + 4 * 2);
*(dstPtr + 5) = *(srcPtr + 5 * 2);
*(dstPtr + 6) = *(srcPtr + 6 * 2);
*(dstPtr + 7) = *(srcPtr + 7 * 2);

const ulong NonAsciiBitmask =
(1ul << (7 + 8 * 7)) +
(1ul << (7 + 8 * 6)) +
(1ul << (7 + 8 * 5)) +
(1ul << (7 + 8 * 4)) +
(1ul << (7 + 8 * 3)) +
(1ul << (7 + 8 * 2)) +
(1ul << (7 + 8 * 1)) +
(1ul << (7 + 8 * 0));
if ((*(ulong*)dstPtr & NonAsciiBitmask) != 0)
{
goto Utf8Fallback;
}
dstPtr += IterSize;
srcPtr += 2*IterSize;
_length += IterSize;
}
_length += value.Length;
}

else
{
// this is 10x slower or more (GetBytes has no fast ASCII path and no SIMD in this old .NET)
var measurement = MTStopwatch.GetTimestamp();
var charIndex = value.Length - processingCount;
_length += charIndex;
const int Utf8MaxBytesPerChar = 4;
EnsureCapacity(_length + processingCount * Utf8MaxBytesPerChar);
_length += Encoding.UTF8.GetBytes(value, charIndex, processingCount, _buffer, _length);
UTF8FallbackStopwatch.EndMeasurement(measurement);
}
}
}
internal static readonly MTStopwatch UTF8FallbackStopwatch = new();
private static readonly int s_charLowBitsPosition = GetLowerBytePosition();
// Probes the in-memory layout of a UTF-16 char to find the byte offset of its low-order byte.
// The probe char '1' (0x0031) has a zero high byte, so if the first byte in memory is zero
// the low byte must sit at offset 1; otherwise it sits at offset 0.
private static int GetLowerBytePosition()
{
var probe = stackalloc char[] { '1' };
var firstByte = *(byte*)probe;
if (firstByte == 0)
{
return 1;
}
return 0;
}
// if utf16 is only ASCII7 we can just copy the lower bits to 1 byte
// there is some parallelism achieved due to unrolling of the loop
// batching also has an effect due to fewer ops overall
// 8 is a sweet spot for unrolling and the ulong bit mask check
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static bool FastConvert(byte* dstPtr, byte* srcPtr, ref int processingCount)
{
{
const int IterSize = 8;
for (; processingCount >= IterSize; processingCount -= IterSize)
{
const int IterSize = 2;
for (; processingCount >= IterSize; processingCount -= IterSize)
*(dstPtr + 0) = *(srcPtr + 0 * 2);
*(dstPtr + 1) = *(srcPtr + 1 * 2);
*(dstPtr + 2) = *(srcPtr + 2 * 2);
*(dstPtr + 3) = *(srcPtr + 3 * 2);
*(dstPtr + 4) = *(srcPtr + 4 * 2);
*(dstPtr + 5) = *(srcPtr + 5 * 2);
*(dstPtr + 6) = *(srcPtr + 6 * 2);
*(dstPtr + 7) = *(srcPtr + 7 * 2);

const ulong NonAsciiBitmask =
(1ul << (7 + 8 * 7)) +
(1ul << (7 + 8 * 6)) +
(1ul << (7 + 8 * 5)) +
(1ul << (7 + 8 * 4)) +
(1ul << (7 + 8 * 3)) +
(1ul << (7 + 8 * 2)) +
(1ul << (7 + 8 * 1)) +
(1ul << (7 + 8 * 0));
if ((*(ulong*)dstPtr & NonAsciiBitmask) != 0)
{
*(dstPtr + 0) = *(srcPtr + 0 * 2);
*(dstPtr + 1) = *(srcPtr + 1 * 2);

const ushort NonAsciiBitmask =
(1 << (7 + 8 * 1)) +
(1 << (7 + 8 * 0));
if ((*(ushort*)dstPtr & NonAsciiBitmask) != 0)
{
goto Utf8Fallback;
}
dstPtr += IterSize;
srcPtr += 2*IterSize;
_length += IterSize;
return false;
}
dstPtr += IterSize;
srcPtr += 2*IterSize;
}
}

if (processingCount > 0)
{
const int IterSize = 2;
for (; processingCount >= IterSize; processingCount -= IterSize)
{
const int IterSize = 1;
*(dstPtr + 0) = *(srcPtr + 0 * 2);
*(dstPtr + 1) = *(srcPtr + 1 * 2);

const byte NonAsciiBitmask = 1 << 7;
if ((*dstPtr & NonAsciiBitmask) != 0)
const ushort NonAsciiBitmask =
(1 << (7 + 8 * 1)) +
(1 << (7 + 8 * 0));
if ((*(ushort*)dstPtr & NonAsciiBitmask) != 0)
{
goto Utf8Fallback;
return false;
}
_length += IterSize;
dstPtr += IterSize;
srcPtr += 2*IterSize;
}
}

return;
if (processingCount > 0)
{
*(dstPtr + 0) = *(srcPtr + 0 * 2);

Utf8Fallback: // this is 10x slower or more (GetBytes has no fast ASCII path and no SIMD in this old .NET)
var measurement = MTStopwatch.GetTimestamp();
var charIndex = value.Length - processingCount;
const int Utf8MaxBytesPerChar = 4;
EnsureCapacity(_length + processingCount * Utf8MaxBytesPerChar);
_length += Encoding.UTF8.GetBytes(value, charIndex, processingCount, _buffer, _length);
UTF8FallbackStopwatch.EndMeasurement(measurement);
const byte NonAsciiBitmask = 1 << 7;
if ((*dstPtr & NonAsciiBitmask) != 0)
{
return false;
}
}
}
internal static readonly MTStopwatch UTF8FallbackStopwatch = new();
private static readonly int s_charLowBitsPosition = GetLowerBytePosition();
private static int GetLowerBytePosition()
{
var chars = stackalloc char[] { '1' };
return *(byte*)chars == 0 ? 1 : 0;

return true;
}

internal void Append(DateTime value)
Expand Down

0 comments on commit ae97c64

Please sign in to comment.