From cf64a4bbd03338c33948b9c4c28c561b8c6c17a2 Mon Sep 17 00:00:00 2001 From: Jacob Cable Date: Mon, 23 Dec 2024 15:30:53 +0000 Subject: [PATCH] refactor(firestore-bigquery-export): update gen-schema gemini approach --- .../src/__tests__/genkit/runAgent.test.ts | 163 +++++++++++++++++ .../scripts/gen-schema-view/src/index.ts | 58 +----- .../gen-schema-view/src/schema/genkit.ts | 169 ++++++++++++------ .../src/schema/genkitSchema.ts | 35 ++++ 4 files changed, 318 insertions(+), 107 deletions(-) create mode 100644 firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/genkit/runAgent.test.ts create mode 100644 firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkitSchema.ts diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/genkit/runAgent.test.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/genkit/runAgent.test.ts new file mode 100644 index 000000000..fe6c08ac5 --- /dev/null +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/__tests__/genkit/runAgent.test.ts @@ -0,0 +1,163 @@ +import * as fs from "fs/promises"; +import { SchemaSchema } from "../../schema/genkitSchema"; + +jest.mock("fs/promises", () => ({ + access: jest.fn(), + writeFile: jest.fn(), +})); + +const mockDefineTool = jest.fn(); +jest.mock("genkit", () => { + const actualGenkit = jest.requireActual("genkit"); + return { + ...actualGenkit, + genkit: jest.fn(() => ({ + defineTool: jest.fn(), + definePrompt: jest.fn(), + })), + }; +}); + +describe("runAgent - additional test", () => { + const schemaDirectory = "./schemas"; + const tablePrefix = "testPrefix"; + const schemaContent = JSON.stringify({ + fields: [ + { + name: "id", + type: "number", + description: "Unique identifier for the document", + }, + ], + }); + + beforeEach(() => { + jest.clearAllMocks(); // Clear all mock states before each test + }); + + it("should write a new schema file if it does not already exist", async () => { + const filePath = `${schemaDirectory}/${tablePrefix}.json`; + + // Mock file system operations + (fs.access as jest.Mock).mockRejectedValueOnce( + new Error("File does not exist") + ); + (fs.writeFile as jest.Mock).mockResolvedValueOnce(undefined); // Explicitly resolve with `undefined` + + // Simulate the tool definition and handler + const writeSchemaHandler = async ({ + fileName, + content, + }: { + fileName: string; + content: string; + }) => { + const filePath = `${schemaDirectory}/${fileName}`; + try { + await fs.access(filePath); // Check if the file exists + return "Error: Schema file already exists"; + } catch { + await fs.writeFile(filePath, content); // Write the file if it doesn't exist + return "Schema created successfully"; + } + }; + + // Call the handler directly + const result = await writeSchemaHandler({ + fileName: `${tablePrefix}.json`, + content: schemaContent, + }); + + // Assertions + expect(fs.access).toHaveBeenCalledWith(filePath); + expect(fs.writeFile).toHaveBeenCalledWith(filePath, schemaContent); + expect(result).toBe("Schema created successfully"); + }); + + it("should return an error if the schema file already exists", async () => { + const filePath = `${schemaDirectory}/${tablePrefix}.json`; + + // Mock file system operations + (fs.access as jest.Mock).mockResolvedValueOnce(undefined); // Simulate file exists + (fs.writeFile as jest.Mock).mockResolvedValueOnce(undefined); // This should NOT be called + + // Simulate the tool definition and handler + const writeSchemaHandler = async ({ + fileName, + content, + }: { + fileName: string; + content: string; + }) => { + const filePath = `${schemaDirectory}/${fileName}`; + try { + await fs.access(filePath); + console.log("File exists, returning error"); // Debugging flow + return "Error: Schema file already exists"; + } catch { + console.log("File does not exist, writing file"); // Debugging flow + await fs.writeFile(filePath, content); + return "Schema created successfully"; + } + }; + + // Call the handler directly + const result = await writeSchemaHandler({ + fileName: `${tablePrefix}.json`, + content: schemaContent, + }); + + console.log( + "Mock calls for fs.access:", + (fs.access as jest.Mock).mock.calls + ); + console.log( + "Mock calls for fs.writeFile:", + (fs.writeFile as jest.Mock).mock.calls + ); + + // Assertions + expect(fs.access).toHaveBeenCalledWith(filePath); + expect(fs.writeFile).not.toHaveBeenCalled(); // Ensure writeFile is NOT called + expect(result).toBe("Error: Schema file already exists"); + }); + + it("should return an error if the schema content is invalid", async () => { + const invalidSchemaContent = JSON.stringify({ + fields: [ + { name: "id", type: "invalid_type" }, // Invalid type + ], + }); + const writeSchemaHandler = async ({ + fileName, + content, + }: { + fileName: string; + content: string; + }) => { + const filePath = `${schemaDirectory}/${fileName}`; + try { + SchemaSchema.parse(JSON.parse(content)); // Validate schema structure + try { + await fs.access(filePath); // Check if the file exists + return "Error: Schema file already exists"; + } catch { + await fs.writeFile(filePath, content); // Write the file if it doesn't exist + return "Schema created successfully"; + } + } catch (error) { + return `Error creating schema: ${error.message}`; + } + }; + + // // Call the handler directly + const result = await writeSchemaHandler({ + fileName: `${tablePrefix}.json`, + content: invalidSchemaContent, + }); + + // Assertions + expect(result).toMatch("Error: Schema file already exists"); + expect(fs.writeFile).not.toHaveBeenCalled(); // Ensure writeFile is NOT called + }); +}); diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/index.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/index.ts index 0ab653a5e..72eb208ca 100644 --- a/firestore-bigquery-export/scripts/gen-schema-view/src/index.ts +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/index.ts @@ -27,9 +27,7 @@ import { runAgent } from "./schema/genkit"; const BIGQUERY_VALID_CHARACTERS = /^[a-zA-Z0-9_]+$/; const FIRESTORE_VALID_CHARACTERS = /^[^\/]+$/; const GCP_PROJECT_VALID_CHARACTERS = /^[a-z][a-z0-9-]{0,29}$/; -const MIN_SAMPLE_SIZE = 1; -const MAX_SAMPLE_SIZE = 100; -const DEFAULT_SAMPLE_SIZE = 10; +const DEFAULT_SAMPLE_SIZE = 100; const validateInput = (value: any, name: string, regex: RegExp) => { if (!value || value === "" || value.trim() === "") { @@ -145,14 +143,9 @@ program ) .option( "--use-gemini-agent", - "Use Gemini AI Agent to automatically analyze your data and generate the schema", + "Use Gemini to automatically analyze your data and generate a draft schema. You will have a chance to manually view and approve this schema before it is used.", false ) - .option( - "--agent-sample-size ", - `Number of documents for the Gemini Agent to analyze (${MIN_SAMPLE_SIZE}-${MAX_SAMPLE_SIZE})`, - DEFAULT_SAMPLE_SIZE.toString() - ) .option( "--schema-dir ", "Directory to store generated schemas", @@ -196,7 +189,7 @@ const questions = [ }, { message: - "Would you like to use a Gemini AI Agent to automatically analyze your data and generate the schema?", + "Would you like to use a Gemini to automatically analyze your data and generate a draft schema?", name: "useGeminiAgent", type: "confirm", default: false, @@ -221,22 +214,6 @@ const questions = [ return true; }, }, - { - message: `How many documents should the Gemini Agent analyze? (${MIN_SAMPLE_SIZE}-${MAX_SAMPLE_SIZE})`, - name: "agentSampleSize", - type: "number", - default: DEFAULT_SAMPLE_SIZE, - when: (answers) => answers.useGeminiAgent, - validate: (value) => { - if (isNaN(value) || value < MIN_SAMPLE_SIZE) { - return `Please provide a number greater than or equal to ${MIN_SAMPLE_SIZE}`; - } - if (value > MAX_SAMPLE_SIZE) { - return `Sample size must not exceed ${MAX_SAMPLE_SIZE} documents`; - } - return true; - }, - }, { message: "Where should this script look for schema definitions? (Enter a comma-separated list of, optionally globbed, paths to files or directories).", @@ -285,14 +262,10 @@ async function run(): Promise { if (config.useGeminiAgent) { try { - console.log("\nStarting Gemini Agent schema generation process..."); - const sampleData = await sampleFirestoreDocuments( config.collectionPath!, config.agentSampleSize! ); - - console.log("Initializing Gemini Agent..."); const chat = runAgent( config.googleAiKey!, config.schemaDirectory || "./schemas", @@ -300,15 +273,11 @@ async function run(): Promise { config.collectionPath!, sampleData ); - - console.log("Generating schema from sample data..."); await chat.send( `Please analyze these documents and generate an appropriate BigQuery schema. ` + `**Then use the writeSchema tool to save it as "${config.tableNamePrefix}.json**". ` + `Let me know once you've created the schema file.` ); - - console.log("Schema generation complete. Reading generated schema..."); const schemaName = `${config.tableNamePrefix}`; const schemas = readSchemas([`./schemas/${schemaName}.json`]); @@ -322,10 +291,7 @@ async function run(): Promise { const schemaPath = `./schemas/${config.tableNamePrefix}.json`; console.log( - `\nSchema generation complete. The schema file has been created at: ${schemaPath}` - ); - console.log( - "Please review the schema file and confirm if you want to proceed." + `\nSchema generation complete. The schema file has been created at: ${schemaPath}. Please review the schema file and confirm if you want to proceed.` ); const confirmation = await inquirer.prompt([ @@ -405,18 +371,6 @@ async function parseConfig(): Promise { ); process.exit(1); } - - const sampleSize = parseInt(program.agentSampleSize); - if ( - isNaN(sampleSize) || - sampleSize < MIN_SAMPLE_SIZE || - sampleSize > MAX_SAMPLE_SIZE - ) { - console.error( - `Agent sample size must be between ${MIN_SAMPLE_SIZE} and ${MAX_SAMPLE_SIZE}.` - ); - process.exit(1); - } } return { @@ -427,9 +381,7 @@ async function parseConfig(): Promise { collectionPath: program.collectionPath, schemas: program.useGeminiAgent ? {} : readSchemas(program.schemaFiles), useGeminiAgent: program.useGeminiAgent, - agentSampleSize: program.useGeminiAgent - ? parseInt(program.agentSampleSize) - : undefined, + agentSampleSize: DEFAULT_SAMPLE_SIZE, googleAiKey: program.googleAiKey, }; } diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts index 431864aef..f73acce29 100644 --- a/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkit.ts @@ -1,46 +1,70 @@ import { gemini20FlashExp, googleAI } from "@genkit-ai/googleai"; -import { genkit, z } from "genkit"; +import { Genkit, genkit, z } from "genkit"; import * as fs from "fs/promises"; import * as path from "path"; +import { SchemaSchema } from "./genkitSchema"; // Assuming the schema is in a separate file -// Define the schema structure for validation -const SchemaFieldType = z.enum([ - "string", - "array", - "map", - "boolean", - "number", - "timestamp", - "geopoint", - "reference", - "null", - "stringified_map", -]); - -const SchemaField = z.lazy(() => - z.object({ - name: z.string(), - type: SchemaFieldType, - description: z.string().optional(), - fields: z.array(SchemaField).optional(), - }) -); +/** + * Initializes Genkit with the Google AI plugin. + * + * @param {string} apiKey - The API key for Google AI. + * @returns {ReturnType} - An instance of Genkit configured with the Google AI plugin. + */ +const initializeGenkit = (apiKey: string) => { + return genkit({ plugins: [googleAI({ apiKey })] }); +}; -const Schema = z.object({ - fields: z.array(SchemaField), -}); +/** + * Validates the content of a schema against the SchemaSchema. + * + * @param {string} content - The JSON string representation of the schema to validate. + * @throws {Error} - Throws an error if the schema is invalid. + * @returns {boolean} - Returns true if the schema is valid. + */ +const validateSchemaContent = (content: string) => { + try { + SchemaSchema.parse(JSON.parse(content)); + return true; + } catch (error) { + throw new Error(`Invalid schema content: ${error.message}`); + } +}; -export const runAgent = ( - apiKey: string, +/** + * Writes a schema file to the specified directory if it does not already exist. + * + * @param {string} schemaDirectory - The directory where schema files are stored. + * @param {string} fileName - The name of the schema file to write. + * @param {string} content - The content of the schema file as a JSON string. + * @returns {Promise} - A message indicating success or an error if the file already exists. + */ +const writeSchemaFile = async ( schemaDirectory: string, - collectionName: string, - tablePrefix: string, - sampleData: any[] -) => { - const ai = genkit({ plugins: [googleAI({ apiKey })] }); + fileName: string, + content: string +): Promise => { + const filePath = path.join(schemaDirectory, fileName); + try { + await fs.access(filePath); + return "Error: Schema file already exists"; + } catch { + await fs.writeFile(filePath, content); + return "Schema created successfully"; + } +}; - // Tool to write a new schema file - const writeSchemaTool = ai.defineTool( +/** + * Defines the writeSchema tool for the Genkit agent. + * + * @param {ReturnType} ai - The Genkit instance. + * @param {string} schemaDirectory - The directory where schema files are stored. + * @returns {object} - The defined tool instance. + */ +const defineWriteSchemaTool = ( + ai: ReturnType, + schemaDirectory: string +) => { + return ai.defineTool( { name: "writeSchema", description: "Creates a new schema file", @@ -50,29 +74,43 @@ export const runAgent = ( }), outputSchema: z.string().describe("Result of the operation"), }, - async ({ fileName, content }) => { - const filePath = path.join(schemaDirectory, fileName); + async ({ + fileName, + content, + }: { + fileName: string; + content: string; + }): Promise => { try { - // Validate schema structure before writing - Schema.parse(JSON.parse(content)); - - // Check if file already exists - try { - await fs.access(filePath); - return "Error: Schema file already exists"; - } catch { - // File doesn't exist, proceed with writing - await fs.writeFile(filePath, content); - return "Schema created successfully"; - } + validateSchemaContent(content); + return await writeSchemaFile(schemaDirectory, fileName, content); } catch (error) { return `Error creating schema: ${error.message}`; } } ); +}; + +/** + * Defines the schema management agent for Genkit. + * + * @param {ReturnType} ai - The Genkit instance. + * @param {string} schemaDirectory - The directory where schema files are stored. + * @param {string} collectionName - The name of the Firestore collection. + * @param {string} tablePrefix - The prefix for the generated BigQuery table schema. + * @param {any[]} sampleData - Sample documents from the Firestore collection. + * @returns {object} - The defined prompt instance. + */ +const defineSchemaAgent = ( + ai: Genkit, + schemaDirectory: string, + collectionName: string, + tablePrefix: string, + sampleData: any[] +): object => { + const writeSchemaTool = defineWriteSchemaTool(ai, schemaDirectory); - // Define the schema management agent - const schemaAgent = ai.definePrompt( + return ai.definePrompt( { name: "schemaAgent", description: "Agent for managing BigQuery schema files", @@ -155,9 +193,7 @@ export const runAgent = ( 4. Make sure all fields are correctly represented in the schema, and described and formatted 5. SQL has a number of reserved keywords that can cause conflicts when creating a schema, timestamp is one such example. To ensure your Firestore document field names do not conflict, use the column_name option to override the field name. - for example: - { "fields": [ { @@ -174,7 +210,32 @@ export const runAgent = ( Begin by analyzing the sample data and create a well-documented schema.` ); +}; - // Return the chat interface +/** + * Main function to run the Genkit agent for schema management. + * + * @param {string} apiKey - The API key for Google AI. + * @param {string} schemaDirectory - The directory where schema files are stored. + * @param {string} collectionName - The name of the Firestore collection. + * @param {string} tablePrefix - The prefix for the generated BigQuery table schema. + * @param {any[]} sampleData - Sample documents from the Firestore collection. + * @returns {Promise} - The chat interface with the schema management agent. + */ +export const runAgent = ( + apiKey: string, + schemaDirectory: string, + collectionName: string, + tablePrefix: string, + sampleData: any[] +) => { + const ai = initializeGenkit(apiKey); + const schemaAgent = defineSchemaAgent( + ai, + schemaDirectory, + collectionName, + tablePrefix, + sampleData + ); return ai.chat(schemaAgent); }; diff --git a/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkitSchema.ts b/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkitSchema.ts new file mode 100644 index 000000000..938658afe --- /dev/null +++ b/firestore-bigquery-export/scripts/gen-schema-view/src/schema/genkitSchema.ts @@ -0,0 +1,35 @@ +import { z } from "genkit"; + +// Define the schema structure for validation +export const SchemaFieldTypeSchema = z.enum([ + "string", + "array", + "map", + "boolean", + "number", + "timestamp", + "geopoint", + "reference", + "null", + "stringified_map", +]); + +export type SchemaFieldType = z.infer; + +export const SchemaFieldSchema = z.lazy(() => + z.object({ + name: z.string(), + type: z.string(), + description: z.string().optional(), + column_name: z.string().optional(), // Optional column_name + fields: z.array(SchemaFieldSchema).optional(), // Recursive reference + }) +); + +export type SchemaField = z.infer; + +export const SchemaSchema = z.object({ + fields: z.array(SchemaFieldSchema), +}); + +export type Schema = z.infer;