Add gpt-tokenizer package and implement size and token limits #62

Merged · 12 commits · Nov 29, 2023
4 changes: 4 additions & 0 deletions README.md
@@ -78,6 +78,10 @@ type Config = {
  maxPagesToCrawl: number;
  /** File name for the finished data */
  outputFileName: string;
  /** Optional maximum file size in megabytes to include in the output file */
  maxFileSize?: number;
  /** Optional maximum number of tokens to include in the output file */
  maxTokens?: number;
};
```
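
When either limit is reached, `write` in `src/core.ts` (further down in this diff) closes the current batch and numbers the output files. A minimal sketch of the naming scheme, mirroring the `nextFileName` helper in the diff (printed values are illustrative):

```ts
// "output.json" becomes "output-1.json", "output-2.json", ... as batches are flushed.
const outputFileName = "output.json";
let fileCounter = 1;

const nextFileName = (): string =>
  `${outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;

console.log(nextFileName()); // "output-1.json"
fileCounter++;
console.log(nextFileName()); // "output-2.json"
```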

2 changes: 2 additions & 0 deletions config.ts
@@ -5,4 +5,6 @@ export const defaultConfig: Config = {
  match: "https://www.builder.io/c/docs/**",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  maxFileSize: 1000,
  maxTokens: 5000,
};
14 changes: 14 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions package.json
@@ -11,6 +11,7 @@
"crawlee": "^3.0.0",
"cross-env": "^7.0.3",
"glob": "^10.3.10",
"gpt-tokenizer": "^2.1.2",
"inquirer": "^9.2.12",
"playwright": "*",
"prettier": "^3.1.0",
10 changes: 10 additions & 0 deletions src/config.ts
@@ -51,6 +51,16 @@ export const configSchema = z.object({
    .optional(),
  /** Optional timeout for waiting for a selector to appear */
  waitForSelectorTimeout: z.number().int().nonnegative().optional(),

  /** Optional maximum file size in megabytes to include in the output file
   * @example 1
   */
  maxFileSize: z.number().int().positive().optional(),
  /** Optional maximum number of tokens to include in the output file
   * @example 5000
   */
  maxTokens: z.number().int().positive().optional(),
});

export type Config = z.infer<typeof configSchema>;
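
A quick sketch of how the two new schema fields behave under validation. The `base` object stands in for the schema's required fields, which this diff does not show in full, so treat it as an assumption:

```ts
import { configSchema } from "./src/config.js";

// Assumed minimal required fields; the full schema is not visible in this diff.
const base = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};

configSchema.parse(base); // ok: both new fields are optional
configSchema.parse({ ...base, maxFileSize: 1000, maxTokens: 5000 }); // ok
configSchema.parse({ ...base, maxTokens: 0 }); // throws ZodError (not positive)
configSchema.parse({ ...base, maxFileSize: 1.5 }); // throws ZodError (not an integer)
```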
69 changes: 59 additions & 10 deletions src/core.ts
@@ -4,6 +4,9 @@ import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { Config, configSchema } from "./config.js";
import { Page } from "playwright";
import { isWithinTokenLimit } from "gpt-tokenizer";

let pageCounter = 0;

@@ -112,18 +115,64 @@ export async function crawl(config: Config) {
  }
}

export async function write(config: Config) {
  configSchema.parse(config);

  const jsonFiles = await glob("storage/datasets/default/*.json", { absolute: true });

  console.log(`Found ${jsonFiles.length} files to combine...`);

  let currentResults: Record<string, any>[] = [];
  let currentSize: number = 0;
  let fileCounter: number = 1;
  const maxBytes: number = config.maxFileSize
    ? config.maxFileSize * 1024 * 1024
    : Infinity;

  const getStringByteSize = (str: string): number =>
    Buffer.byteLength(str, "utf-8");

  const nextFileName = (): string =>
    `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;

  const writeBatchToFile = async (): Promise<void> => {
    await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
    console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
    currentResults = [];
    currentSize = 0;
    fileCounter++;
  };

  let estimatedTokens: number = 0;

  const addContentOrSplit = async (
    data: Record<string, any>,
  ): Promise<void> => {
    const contentString: string = JSON.stringify(data);
    const tokenCount: number | false = isWithinTokenLimit(
      contentString,
      config.maxTokens || Infinity,
    );

    if (typeof tokenCount === "number") {
      if (estimatedTokens + tokenCount > config.maxTokens!) {
        // Only write the batch if it's not empty (something to write).
        if (currentResults.length > 0) {
          await writeBatchToFile();
        }
        // Since the addition of a single item exceeded the token limit, halve it.
        estimatedTokens = Math.floor(tokenCount / 2);
        currentResults.push(data);
      } else {
        currentResults.push(data);
        estimatedTokens += tokenCount;
      }
    }

    currentSize += getStringByteSize(contentString);
    if (currentSize > maxBytes) {
      await writeBatchToFile();
    }
  };

  // Iterate over each JSON file and process its contents.
  for (const file of jsonFiles) {
    const fileContent = await readFile(file, "utf-8");
    const data: Record<string, any> = JSON.parse(fileContent);
    await addContentOrSplit(data);
  }

  // Check if any remaining data needs to be written to a file.
  if (currentResults.length > 0) {
    await writeBatchToFile();
  }
}
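
For context on the `typeof tokenCount === "number"` check in `addContentOrSplit`: `gpt-tokenizer`'s `isWithinTokenLimit` returns `false` as soon as the text exceeds the limit, and the token count otherwise, which is also why the code falls back to `Infinity` when `config.maxTokens` is unset. A small sketch (printed counts are illustrative):

```ts
import { isWithinTokenLimit } from "gpt-tokenizer";

// Within the limit: returns the token count as a number.
console.log(isWithinTokenLimit("hello world", 100)); // e.g. 2

// Over the limit: returns false, so a typeof check distinguishes the cases.
const longText = "lorem ipsum ".repeat(10_000);
console.log(isWithinTokenLimit(longText, 10)); // false
```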