Add gpt-tokenizer package and implement size and token limits #62

Merged · 12 commits · Nov 29, 2023
4 changes: 4 additions & 0 deletions README.md
@@ -78,6 +78,10 @@ type Config = {
  maxPagesToCrawl: number;
  /** File name for the finished data */
  outputFileName: string;
  /** Optional maximum file size in megabytes to include in the output file */
  maxFileSize?: number;
  /** Optional maximum number of tokens to include in the output file */
  maxTokens?: number;
};
```
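
When either limit is reached, `write` in `src/core.ts` (further down in this diff) closes the current batch and numbers the output files. A minimal sketch of the naming scheme, mirroring the `nextFileName` helper in the diff (printed values are illustrative):

```ts
// "output.json" becomes "output-1.json", "output-2.json", ... as batches are flushed.
const outputFileName = "output.json";
let fileCounter = 1;

const nextFileName = (): string =>
  `${outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;

console.log(nextFileName()); // "output-1.json"
fileCounter++;
console.log(nextFileName()); // "output-2.json"
```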

2 changes: 2 additions & 0 deletions config.ts
@@ -5,4 +5,6 @@ export const defaultConfig: Config = {
  match: "https://www.builder.io/c/docs/**",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
  maxFileSize: 1000,
  maxTokens: 5000,
};
14 changes: 14 additions & 0 deletions package-lock.json

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions package.json
@@ -11,6 +11,7 @@
"crawlee": "^3.0.0",
"cross-env": "^7.0.3",
"glob": "^10.3.10",
"gpt-tokenizer": "^2.1.2",
"inquirer": "^9.2.12",
"playwright": "*",
"prettier": "^3.1.0",
10 changes: 10 additions & 0 deletions src/config.ts
@@ -51,6 +51,16 @@ export const configSchema = z.object({
    .optional(),
  /** Optional timeout for waiting for a selector to appear */
  waitForSelectorTimeout: z.number().int().nonnegative().optional(),

  /** Optional maximum file size in megabytes to include in the output file
   * @example 1
   */
  maxFileSize: z.number().int().positive().optional(),
  /** Optional maximum number of tokens to include in the output file
   * @example 5000
   */
  maxTokens: z.number().int().positive().optional(),
});

export type Config = z.infer<typeof configSchema>;
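
A quick sketch of how the two new schema fields behave under validation. The `base` object stands in for the schema's required fields, which this diff does not show in full, so treat it as an assumption:

```ts
import { configSchema } from "./src/config.js";

// Assumed minimal required fields; the full schema is not visible in this diff.
const base = {
  url: "https://www.builder.io/c/docs/developers",
  match: "https://www.builder.io/c/docs/**",
  maxPagesToCrawl: 50,
  outputFileName: "output.json",
};

configSchema.parse(base); // ok: both new fields are optional
configSchema.parse({ ...base, maxFileSize: 1000, maxTokens: 5000 }); // ok
configSchema.parse({ ...base, maxTokens: 0 }); // throws ZodError (not positive)
configSchema.parse({ ...base, maxFileSize: 1.5 }); // throws ZodError (not an integer)
```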
69 changes: 59 additions & 10 deletions src/core.ts
@@ -4,6 +4,9 @@ import { readFile, writeFile } from "fs/promises";
import { glob } from "glob";
import { Config, configSchema } from "./config.js";
import { Page } from "playwright";
import { isWithinTokenLimit } from "gpt-tokenizer";

let pageCounter = 0;

@@ -112,18 +115,64 @@ export async function crawl(config: Config) {
  }
}

export async function write(config: Config) {
  configSchema.parse(config);

  const jsonFiles = await glob("storage/datasets/default/*.json", { absolute: true });

  console.log(`Found ${jsonFiles.length} files to combine...`);

  let currentResults: Record<string, any>[] = [];
  let currentSize: number = 0;
  let fileCounter: number = 1;
  const maxBytes: number = config.maxFileSize
    ? config.maxFileSize * 1024 * 1024
    : Infinity;

  const getStringByteSize = (str: string): number =>
    Buffer.byteLength(str, "utf-8");

  const nextFileName = (): string =>
    `${config.outputFileName.replace(/\.json$/, "")}-${fileCounter}.json`;

  const writeBatchToFile = async (): Promise<void> => {
    await writeFile(nextFileName(), JSON.stringify(currentResults, null, 2));
    console.log(`Wrote ${currentResults.length} items to ${nextFileName()}`);
    currentResults = [];
    currentSize = 0;
    fileCounter++;
  };

  let estimatedTokens: number = 0;

  const addContentOrSplit = async (
    data: Record<string, any>,
  ): Promise<void> => {
    const contentString: string = JSON.stringify(data);
    const tokenCount: number | false = isWithinTokenLimit(
      contentString,
      config.maxTokens || Infinity,
    );

    if (typeof tokenCount === "number") {
      if (estimatedTokens + tokenCount > config.maxTokens!) {
        // Only write the batch if it's not empty (something to write).
        if (currentResults.length > 0) {
          await writeBatchToFile();
        }
        // Since the addition of a single item exceeded the token limit, halve it.
        estimatedTokens = Math.floor(tokenCount / 2);
        currentResults.push(data);
      } else {
        currentResults.push(data);
        estimatedTokens += tokenCount;
      }
    }

    currentSize += getStringByteSize(contentString);
    if (currentSize > maxBytes) {
      await writeBatchToFile();
    }
  };

  // Iterate over each JSON file and process its contents.
  for (const file of jsonFiles) {
    const fileContent = await readFile(file, "utf-8");
    const data: Record<string, any> = JSON.parse(fileContent);
    await addContentOrSplit(data);
  }

  // Check if any remaining data needs to be written to a file.
  if (currentResults.length > 0) {
    await writeBatchToFile();
  }
}
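
For context on the `typeof tokenCount === "number"` check in `addContentOrSplit`: `gpt-tokenizer`'s `isWithinTokenLimit` returns `false` as soon as the text exceeds the limit, and the token count otherwise, which is also why the code falls back to `Infinity` when `config.maxTokens` is unset. A small sketch (printed counts are illustrative):

```ts
import { isWithinTokenLimit } from "gpt-tokenizer";

// Within the limit: returns the token count as a number.
console.log(isWithinTokenLimit("hello world", 100)); // e.g. 2

// Over the limit: returns false, so a typeof check distinguishes the cases.
const longText = "lorem ipsum ".repeat(10_000);
console.log(isWithinTokenLimit(longText, 10)); // false
```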