From 05063b59acdeadc0bd28ae04bf2c2f95d427c84a Mon Sep 17 00:00:00 2001 From: HoJeong Go <seia@outlook.kr> Date: Tue, 10 Dec 2024 17:25:26 +0900 Subject: [PATCH 01/10] feat: use `TextEncoder` and `TextDecoder` for utf8 strings --- packages/adblocker/src/data-view.ts | 30 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/packages/adblocker/src/data-view.ts b/packages/adblocker/src/data-view.ts index 7d8ab640e2..36eea46899 100644 --- a/packages/adblocker/src/data-view.ts +++ b/packages/adblocker/src/data-view.ts @@ -20,6 +20,9 @@ export const EMPTY_UINT32_ARRAY = new Uint32Array(0); // Check if current architecture is little endian const LITTLE_ENDIAN: boolean = new Int8Array(new Int16Array([1]).buffer)[0] === 1; +// TextEncoder doesn't need to be recreated every time unlike TextDecoder +const TEXT_ENCODER = new TextEncoder(); + // Store compression in a lazy, global singleton let getCompressionSingleton: () => Compression = () => { const COMPRESSION = new Compression(); @@ -87,8 +90,7 @@ export function sizeOfASCII(str: string): number { * Return number of bytes needed to serialize `str` UTF8 string. */ export function sizeOfUTF8(str: string): number { - const encodedLength = encode(str).length; - return encodedLength + sizeOfLength(encodedLength); + return 4 + TEXT_ENCODER.encode(str).length; } /** @@ -389,23 +391,19 @@ export class StaticDataView { } public pushUTF8(raw: string): void { - const str = encode(raw); - this.pushLength(str.length); - - for (let i = 0; i < str.length; i += 1) { - this.buffer[this.pos++] = str.charCodeAt(i); - } + const pos = this.getPos(); + const { written } = TEXT_ENCODER.encodeInto(raw, this.buffer.subarray(pos + 4)); + this.setPos(pos); + this.pushUint32(written); + this.setPos(pos + 4 + written); } public getUTF8(): string { - const byteLength = this.getLength(); - this.pos += byteLength; - return decode( - String.fromCharCode.apply( - null, - // @ts-ignore - this.buffer.subarray(this.pos - byteLength, this.pos), - ), + const byteLength = this.getUint32(); + const pos = this.getPos(); + this.setPos(pos + byteLength); + return new TextDecoder('utf8', { ignoreBOM: true }).decode( + this.buffer.subarray(pos, pos + byteLength), ); } From 1de005ea80539d1422725c2988bfd6d49faddd0e Mon Sep 17 00:00:00 2001 From: HoJeong Go <seia@outlook.kr> Date: Tue, 10 Dec 2024 17:32:46 +0900 Subject: [PATCH 02/10] refactor: pos calculation in `pushUTF8` --- packages/adblocker/src/data-view.ts | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/packages/adblocker/src/data-view.ts b/packages/adblocker/src/data-view.ts index 36eea46899..7a92fd7230 100644 --- a/packages/adblocker/src/data-view.ts +++ b/packages/adblocker/src/data-view.ts @@ -391,11 +391,9 @@ export class StaticDataView { } public pushUTF8(raw: string): void { - const pos = this.getPos(); - const { written } = TEXT_ENCODER.encodeInto(raw, this.buffer.subarray(pos + 4)); - this.setPos(pos); + const { written } = TEXT_ENCODER.encodeInto(raw, this.buffer.subarray(this.pos + 4)); this.pushUint32(written); - this.setPos(pos + 4 + written); + this.setPos(this.pos + written); } public getUTF8(): string { From d6032eb11f5d52fa105f9aae2283becf75de994a Mon Sep 17 00:00:00 2001 From: HoJeong Go <seia@outlook.kr> Date: Wed, 11 Dec 2024 22:11:03 +0900 Subject: [PATCH 03/10] chore: save length of string in 16 bits unsigned integer - ~65535 ASCII only characters --- packages/adblocker/src/data-view.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/adblocker/src/data-view.ts b/packages/adblocker/src/data-view.ts index 7a92fd7230..aef16b7347 100644 --- a/packages/adblocker/src/data-view.ts +++ b/packages/adblocker/src/data-view.ts @@ -392,12 +392,12 @@ export class StaticDataView { public pushUTF8(raw: string): void { const { written } = TEXT_ENCODER.encodeInto(raw, this.buffer.subarray(this.pos + 4)); - this.pushUint32(written); + this.pushUint16(written); this.setPos(this.pos + written); } public getUTF8(): string { - const byteLength = this.getUint32(); + const byteLength = this.getUint16(); const pos = this.getPos(); this.setPos(pos + byteLength); return new TextDecoder('utf8', { ignoreBOM: true }).decode( From 65764e9ae39a53d5bc363bde8d718b47c8136535 Mon Sep 17 00:00:00 2001 From: HoJeong Go <seia@outlook.kr> Date: Thu, 12 Dec 2024 14:22:12 +0900 Subject: [PATCH 04/10] fix: use getLength and pushLength for utf8 --- packages/adblocker/src/data-view.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/adblocker/src/data-view.ts b/packages/adblocker/src/data-view.ts index aef16b7347..32136f86f3 100644 --- a/packages/adblocker/src/data-view.ts +++ b/packages/adblocker/src/data-view.ts @@ -392,12 +392,12 @@ export class StaticDataView { public pushUTF8(raw: string): void { const { written } = TEXT_ENCODER.encodeInto(raw, this.buffer.subarray(this.pos + 4)); - this.pushUint16(written); + this.pushLength(written); this.setPos(this.pos + written); } public getUTF8(): string { - const byteLength = this.getUint16(); + const byteLength = this.getLength(); const pos = this.getPos(); this.setPos(pos + byteLength); return new TextDecoder('utf8', { ignoreBOM: true }).decode( From 0339bdca5b8e58405f00c9b051950297e6e2e8a2 Mon Sep 17 00:00:00 2001 From: HoJeong Go <seia@outlook.kr> Date: Thu, 12 Dec 2024 17:53:12 +0900 Subject: [PATCH 05/10] fix: calculate length of utf8 encoded string --- packages/adblocker/src/data-view.ts | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/packages/adblocker/src/data-view.ts b/packages/adblocker/src/data-view.ts index 32136f86f3..81be8ae289 100644 --- a/packages/adblocker/src/data-view.ts +++ b/packages/adblocker/src/data-view.ts @@ -90,7 +90,12 @@ export function sizeOfASCII(str: string): number { * Return number of bytes needed to serialize `str` UTF8 string. */ export function sizeOfUTF8(str: string): number { - return 4 + TEXT_ENCODER.encode(str).length; + // Fast path for short strs considering the worst case (output ratio of 3) + if (str.length < 43 /* Math.ceil(127 / 3) */) { + return 1 + TEXT_ENCODER.encode(str).length; + } + const result = TEXT_ENCODER.encode(str); + return sizeOfLength(result.length) + result.length; } /** @@ -391,8 +396,21 @@ export class StaticDataView { } public pushUTF8(raw: string): void { - const { written } = TEXT_ENCODER.encodeInto(raw, this.buffer.subarray(this.pos + 4)); + const pos = this.getPos(); + // Assume the size of output length is 1 (which means output is less than 128) + // based on the possible minimal length to avoid memory reallocation. + // The minimal length is always 1 byte per character. + const start = pos + (raw.length > 127 ? 5 : 1); + const { written } = TEXT_ENCODER.encodeInto(raw, this.buffer.subarray(start)); + // If we failed to predict, that means the required bytes for length is 5. + if (pos + sizeOfLength(written) !== start) { + // Push 4 bytes back, `start + 4` or `pos + 5` + this.buffer.copyWithin(pos + 5, start, start + written); + } + // Restore pos to push length + this.setPos(pos); this.pushLength(written); + // Reflect written bytes to pos this.setPos(this.pos + written); } From 3270e6aad5d58fd984c8ccea1d9665093fd2c902 Mon Sep 17 00:00:00 2001 From: HoJeong Go <seia@outlook.kr> Date: Thu, 12 Dec 2024 18:08:23 +0900 Subject: [PATCH 06/10] chore: drop useless fast exit --- packages/adblocker/src/data-view.ts | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/packages/adblocker/src/data-view.ts b/packages/adblocker/src/data-view.ts index 81be8ae289..8dc6b76fef 100644 --- a/packages/adblocker/src/data-view.ts +++ b/packages/adblocker/src/data-view.ts @@ -90,12 +90,8 @@ export function sizeOfASCII(str: string): number { * Return number of bytes needed to serialize `str` UTF8 string. */ export function sizeOfUTF8(str: string): number { - // Fast path for short strs considering the worst case (output ratio of 3) - if (str.length < 43 /* Math.ceil(127 / 3) */) { - return 1 + TEXT_ENCODER.encode(str).length; - } - const result = TEXT_ENCODER.encode(str); - return sizeOfLength(result.length) + result.length; + const encoded = TEXT_ENCODER.encode(str); + return sizeOfLength(encoded.length) + encoded.length; } /** From 47dc63ab36bbca10ed6c609b8e314ca4872c05da Mon Sep 17 00:00:00 2001 From: HoJeong Go <seia@outlook.kr> Date: Thu, 12 Dec 2024 18:16:09 +0900 Subject: [PATCH 07/10] refactor: reuse `sizeOfLength` --- packages/adblocker/src/data-view.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/adblocker/src/data-view.ts b/packages/adblocker/src/data-view.ts index 8dc6b76fef..569437fb98 100644 --- a/packages/adblocker/src/data-view.ts +++ b/packages/adblocker/src/data-view.ts @@ -394,9 +394,9 @@ export class StaticDataView { public pushUTF8(raw: string): void { const pos = this.getPos(); // Assume the size of output length is 1 (which means output is less than 128) - // based on the possible minimal length to avoid memory reallocation. + // based on the possible minimal length to avoid memory relocation. // The minimal length is always 1 byte per character. - const start = pos + (raw.length > 127 ? 5 : 1); + const start = pos + sizeOfLength(raw.length); const { written } = TEXT_ENCODER.encodeInto(raw, this.buffer.subarray(start)); // If we failed to predict, that means the required bytes for length is 5. if (pos + sizeOfLength(written) !== start) { From 72e5b0d4f189307a96c20138a683a562bafd31e4 Mon Sep 17 00:00:00 2001 From: HoJeong Go <seia@outlook.kr> Date: Fri, 10 Jan 2025 22:59:34 +0900 Subject: [PATCH 08/10] Update packages/adblocker/src/data-view.ts Co-authored-by: Krzysztof Modras <1228153+chrmod@users.noreply.github.com> --- packages/adblocker/src/data-view.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/adblocker/src/data-view.ts b/packages/adblocker/src/data-view.ts index 569437fb98..9ff48f1d2e 100644 --- a/packages/adblocker/src/data-view.ts +++ b/packages/adblocker/src/data-view.ts @@ -90,8 +90,8 @@ export function sizeOfASCII(str: string): number { * Return number of bytes needed to serialize `str` UTF8 string. */ export function sizeOfUTF8(str: string): number { - const encoded = TEXT_ENCODER.encode(str); - return sizeOfLength(encoded.length) + encoded.length; + const encodedLength = TEXT_ENCODER.encode(str).length; + return encodedLength + sizeOfLength(encodedLength); } /** From dd8332ac667553d6469b33c36b59e4cc6a488f47 Mon Sep 17 00:00:00 2001 From: HoJeong Go <seia@outlook.kr> Date: Fri, 10 Jan 2025 23:04:25 +0900 Subject: [PATCH 09/10] Update packages/adblocker/src/data-view.ts Co-authored-by: Krzysztof Modras <1228153+chrmod@users.noreply.github.com> --- packages/adblocker/src/data-view.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/adblocker/src/data-view.ts b/packages/adblocker/src/data-view.ts index 9ff48f1d2e..dbce94ff2d 100644 --- a/packages/adblocker/src/data-view.ts +++ b/packages/adblocker/src/data-view.ts @@ -412,10 +412,10 @@ export class StaticDataView { public getUTF8(): string { const byteLength = this.getLength(); - const pos = this.getPos(); - this.setPos(pos + byteLength); + this.pos += byteLength; return new TextDecoder('utf8', { ignoreBOM: true }).decode( - this.buffer.subarray(pos, pos + byteLength), + // @ts-ignore + this.buffer.subarray(this.pos - byteLength, this.pos), ); } From c7b3cbc2bb32b83df5e668d4190565ca40f07d34 Mon Sep 17 00:00:00 2001 From: HoJeong Go <seia@outlook.kr> Date: Fri, 10 Jan 2025 23:07:16 +0900 Subject: [PATCH 10/10] chore: remove unused ts-ignore --- packages/adblocker/src/data-view.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/adblocker/src/data-view.ts b/packages/adblocker/src/data-view.ts index dbce94ff2d..ffc10e1838 100644 --- a/packages/adblocker/src/data-view.ts +++ b/packages/adblocker/src/data-view.ts @@ -414,7 +414,6 @@ export class StaticDataView { const byteLength = this.getLength(); this.pos += byteLength; return new TextDecoder('utf8', { ignoreBOM: true }).decode( - // @ts-ignore this.buffer.subarray(this.pos - byteLength, this.pos), ); }