Add count tokens function #17

syonfox · 2022-12-20T03:28:58Z

For my use case I don't actually need to encode the text into tokens; I just want to quickly count them to check that my request won't exceed the limit.

This should be faster since we don't initialize the memory for the output array.

const crypto = require('crypto');
/**
 * Generate a random hexadecimal string.
 *
 * Note that `length` is the number of random BYTES drawn, not the number of
 * characters returned: each byte is rendered as two hex digits, so the
 * resulting string has `2 * length` characters.
 *
 * @param {number} length - Number of random bytes to generate.
 * @returns {string} Lowercase hex string of `2 * length` characters.
 */
function generateRandomString(length) {
    const randomBytes = crypto.randomBytes(length);
    return randomBytes.toString('hex');
}

const {encode, decode, countTokens} = require('gpt-3-encoder')

// Demo and micro-benchmark for the gpt-3-encoder API: round-trips a sample
// sentence through encode()/decode(), then times encode() against
// countTokens() on a large random hex string.
let str = 'This is an example sentence to try encoding out on!';
let encoded = encode(str);
console.log('Encoded this string looks like: ', encoded);
// The original literal on this line was missing its closing quote, which made
// the whole script a syntax error.
console.log('We can look at each token and what it represents');

// Count the tokens by hand while printing each token next to its decoded text.
let manualTokenCount = 0;
for (const token of encoded) {
    manualTokenCount++;
    console.log({token, string: decode([token])});
}
console.log("there are n tokens: ", manualTokenCount);
const decoded = decode(encoded);
console.log('We can decode it back into:\n', decoded);

// Benchmark input: 10000 random bytes rendered as hex, i.e. a 20000-character string.
str = generateRandomString(10000);

// Untimed-for-comparison first pass; per the log message below it is meant to
// let the encoder cache what it can, so the timed runs that follow are comparable.
console.time('fencode');
encoded = encode(str);
console.log(`First encode to cache string n stuff in mem`);
console.timeEnd('fencode');


console.log(`Original string length: ${str.length}`);
// Benchmark the encode function
console.time('encode');
encoded = encode(str);
console.log(`Encoded string length: ${encoded.length}`);
console.timeEnd('encode');

// Benchmark the countTokens function
console.time('countTokens');
const tokenCount = countTokens(str);
console.log(`Number of tokens: ${tokenCount}`);
console.timeEnd('countTokens');


// Summary: the encoded length and the counted tokens are printed side by side
// so they can be compared for the same input.
console.log(`Original string length: ${str.length}`);
console.log(`Encoded string length: ${encoded.length}`);
console.log(`Number of tokens: ${tokenCount}`);

We can decode it back into:
 This is an example sentence to try encoding out on!
First encode to cache string n stuff in mem
fencode: 163.57ms
Original string length: 20000
Encoded string length: 11993
encode: 124.265ms
Number of tokens: 11993
countTokens: 29.2ms
Original string length: 20000
Encoded string length: 11993
Number of tokens: 11993

I don't actually want to encode into tokens for my use case, quickly count to check my request won't exceed the limit. This should be faster since we don't initialize the memory for the output array. ``` const crypto = require('crypto'); // Generate a random string of a given length function generateRandomString(length) { return crypto.randomBytes(length).toString('hex'); } const {encode, decode, countTokens} = require('gpt-3-encoder') let str = 'This is an example sentence to try encoding out on!' // let now = Date.now(); let encoded = encode(str) console.log('Encoded this string looks like: ', encoded) console.log('We can look at each token and what it represents) let tokencount = 0; for(let token of encoded){ tokencount ++; console.log({token, string: decode([token])}) } console.log("there are n tokens: ", tokencount); let decoded = decode(encoded) console.log('We can decode it back into:\n', decoded) let now = Date.now(); // todo: write an benchmark for the above method vs int countTokens(str) str = generateRandomString(10000); console.time('fencode'); encoded = encode(str); console.log(`First encode to cache string n stuff in mem`); console.timeEnd('fencode'); console.log(`Original string length: ${str.length}`); // Benchmark the encode function console.time('encode'); encoded = encode(str); console.log(`Encoded string length: ${encoded.length}`); console.timeEnd('encode'); // Benchmark the countTokens function console.time('countTokens'); let tokenCount = countTokens(str); console.log(`Number of tokens: ${tokenCount}`); console.timeEnd('countTokens'); console.log(`Original string length: ${str.length}`); console.log(`Encoded string length: ${encoded.length}`); console.log(`Number of tokens: ${tokenCount}`); ``` ``` We can decode it back into: This is an example sentence to try encoding out on! 
First encode to cache string n stuff in mem fencode: 163.57ms Original string length: 20000 Encoded string length: 11993 encode: 124.265ms Number of tokens: 11993 countTokens: 29.2ms Original string length: 20000 Encoded string length: 11993 Number of tokens: 11993 ```

Co-authored-by: Andrew Healey <[email protected]>

I don't actually want to encode into tokens for my use case, quickly count to check my request won't exceed the limit. This should be faster since we don't initialize the memory for the output array. ``` const crypto = require('crypto'); // Generate a random string of a given length function generateRandomString(length) { return crypto.randomBytes(length).toString('hex'); } const {encode, decode, countTokens} = require('gpt-3-encoder') let str = 'This is an example sentence to try encoding out on!' // let now = Date.now(); let encoded = encode(str) console.log('Encoded this string looks like: ', encoded) console.log('We can look at each token and what it represents) let tokencount = 0; for(let token of encoded){ tokencount ++; console.log({token, string: decode([token])}) } console.log("there are n tokens: ", tokencount); let decoded = decode(encoded) console.log('We can decode it back into:\n', decoded) let now = Date.now(); // todo: write an benchmark for the above method vs int countTokens(str) str = generateRandomString(10000); console.time('fencode'); encoded = encode(str); console.log(`First encode to cache string n stuff in mem`); console.timeEnd('fencode'); console.log(`Original string length: ${str.length}`); // Benchmark the encode function console.time('encode'); encoded = encode(str); console.log(`Encoded string length: ${encoded.length}`); console.timeEnd('encode'); // Benchmark the countTokens function console.time('countTokens'); let tokenCount = countTokens(str); console.log(`Number of tokens: ${tokenCount}`); console.timeEnd('countTokens'); console.log(`Original string length: ${str.length}`); console.log(`Encoded string length: ${encoded.length}`); console.log(`Number of tokens: ${tokenCount}`); ``` ``` We can decode it back into: This is an example sentence to try encoding out on! 
First encode to cache string n stuff in mem fencode: 163.57ms Original string length: 20000 Encoded string length: 11993 encode: 124.265ms Number of tokens: 11993 countTokens: 29.2ms Original string length: 20000 Encoded string length: 11993 Number of tokens: 11993 ``` Co-authored-by: Kier <[email protected]>

NickHeiner · 2022-12-23T20:43:21Z

I applied this in my fork: https://www.npmjs.com/package/@nick.heiner/gpt-3-encoder.

merge back the one other change and follow nick with moving to version 1.2 for feature add.

seang · 2023-01-18T15:07:24Z

package.json

-  "name": "gpt-3-encoder",
-  "version": "1.1.3",
+  "name": "@nick.heiner/gpt-3-encoder",
+  "version": "1.2.0",


Could we revert the diff on the package.json here?

#30

We can probably do it more incrementally, but if you are interested in biting the bullet, I made a few more improvements. They pass the tests in my fork, seem to work well, and I even got browserify to bundle it all. I did this: the npm package is reverted to version 1.2.0-rc0.

seang · 2023-01-18T15:08:38Z

README.md

@@ -1,3 +1,7 @@
+# This is a fork of https://github.com/latitudegames/GPT-3-Encoder. I made this fork so I could apply some PRs that had been sent to the upstream repo.


Another spot where we could revert this change to clean it up for merging.

Yep, we don't want to pull in the changes from my fork.

Shackless · 2023-03-15T09:53:09Z

Is this package abandoned? If so, I'll move to your fork, @syonfox.

syonfox and others added 2 commits December 19, 2022 22:27

Fix bug with BPE cache (#1)

14a7cfa

Co-authored-by: Andrew Healey <[email protected]>

NickHeiner mentioned this pull request Dec 23, 2022

Add count tokens function NickHeiner/GPT-3-Encoder#2

Merged

NickHeiner and others added 3 commits December 23, 2022 15:37

Add fork notice

df30a04

Bump version

b779e40

Merge pull request #1 from NickHeiner/master

bfefccc

merge back the one other change and follow nick with moving to version 1.2 for feature add.

seang reviewed Jan 18, 2023

View reviewed changes

syonfox added 2 commits January 18, 2023 14:51

Merge branch 'master' into patch-1

930720d

Removepatch notice

ebd603a

syonfox mentioned this pull request Jan 19, 2023

npm audit docs tests and bpe fix + countTokens properly added #18

Closed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add count tokens function #17

Add count tokens function #17

syonfox commented Dec 20, 2022 •

edited

Loading

NickHeiner commented Dec 23, 2022

seang Jan 18, 2023

syonfox Jan 18, 2023

seang Jan 18, 2023

NickHeiner Jan 18, 2023

Shackless commented Mar 15, 2023

		@@ -1,3 +1,7 @@
		# This is a fork of https://github.com/latitudegames/GPT-3-Encoder. I made this fork so I could apply some PRs that had been sent to the upstream repo.

Add count tokens function #17

Are you sure you want to change the base?

Add count tokens function #17

Conversation

syonfox commented Dec 20, 2022 • edited Loading

NickHeiner commented Dec 23, 2022

seang Jan 18, 2023

Choose a reason for hiding this comment

syonfox Jan 18, 2023

Choose a reason for hiding this comment

seang Jan 18, 2023

Choose a reason for hiding this comment

NickHeiner Jan 18, 2023

Choose a reason for hiding this comment

Shackless commented Mar 15, 2023

syonfox commented Dec 20, 2022 •

edited

Loading