From 8bdc2573f60d316cef6b3372398a13229cf6d5de Mon Sep 17 00:00:00 2001 From: Navid Pour Date: Sat, 26 Oct 2024 19:19:52 -0700 Subject: [PATCH] clean up bananalyzer --- evals/bananalyzer-ts/.gitignore | 2 - evals/bananalyzer-ts/index.ts | 275 ------------------- evals/bananalyzer-ts/init.sh | 16 -- evals/bananalyzer-ts/playground.ts | 11 - evals/bananalyzer-ts/schemas.ts | 136 --------- evals/bananalyzer-ts/server/expressServer.ts | 160 ----------- evals/bananalyzer-ts/utils/mhtmlParser.ts | 203 -------------- evals/index.eval.ts | 53 +--- package.json | 2 +- 9 files changed, 7 insertions(+), 851 deletions(-) delete mode 100644 evals/bananalyzer-ts/.gitignore delete mode 100644 evals/bananalyzer-ts/index.ts delete mode 100755 evals/bananalyzer-ts/init.sh delete mode 100644 evals/bananalyzer-ts/playground.ts delete mode 100644 evals/bananalyzer-ts/schemas.ts delete mode 100644 evals/bananalyzer-ts/server/expressServer.ts delete mode 100644 evals/bananalyzer-ts/utils/mhtmlParser.ts diff --git a/evals/bananalyzer-ts/.gitignore b/evals/bananalyzer-ts/.gitignore deleted file mode 100644 index d1df1fdc..00000000 --- a/evals/bananalyzer-ts/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -static -server/public \ No newline at end of file diff --git a/evals/bananalyzer-ts/index.ts b/evals/bananalyzer-ts/index.ts deleted file mode 100644 index 12fbd3d7..00000000 --- a/evals/bananalyzer-ts/index.ts +++ /dev/null @@ -1,275 +0,0 @@ -import { z } from "zod"; -import { Stagehand } from "../../lib"; -import fs from "fs"; -import path from "path"; -import { Server } from "http"; -import { createExpressServer } from "./server/expressServer"; -import { - Example, - getSchemaByName, - getCustomSchema, - getGoals, - SchemaName, -} from "./schemas"; - -const basePath = __dirname.includes("bananalyzer-ts") - ? __dirname - : path.join(__dirname, "bananalyzer-ts"); - -// Validation helper functions -function validateJsonMatch(expected: any, result: any): boolean { - if (typeof expected !== typeof result) return false; - if (Array.isArray(expected)) { - if (!Array.isArray(result) || expected.length !== result.length) - return false; - return expected.every((item, index) => - validateJsonMatch(item, result[index]), - ); - } - if (typeof expected === "object" && expected !== null) { - return Object.keys(expected).every((key) => - validateJsonMatch(expected[key], result[key]), - ); - } - return expected === result; -} - -function validateEndUrlMatch(expected: string, actual: string): boolean { - return actual.endsWith(expected); -} - -export async function evaluateExample( - exampleId: string, - options: { - launchServer?: boolean; - serverPort?: number; - } = { - launchServer: true, - serverPort: 6778, - }, -): Promise { - await new Promise((resolve) => setTimeout(resolve, 2000)); - - const examples = JSON.parse( - fs.readFileSync(path.join(basePath, "static/examples.json"), "utf-8"), - ); - - const example = examples.find((example: Example) => example.id === exampleId); - if (!example) { - console.error(`Example with ID ${exampleId} not found.`); - return false; - } - - const stagehand = new Stagehand({ - env: "LOCAL", - verbose: 1, - headless: process.env.HEADLESS !== "false", - }); - await stagehand.init(); - - let server: Server | null = null; - let localUrl: string = example.url; // Default to the original URL - let resources: any[] = []; - let port = options.serverPort; - - try { - if (example.source === "mhtml") { - // Handle MHTML Source - const mhtmlFilePath = path.resolve( - path.join(basePath, `static/${example.id}/index.mhtml`), - ); - - if (options.launchServer) { - const app = createExpressServer(); - server = app.listen(port, () => { - console.log(`Express server listening on port ${port}`); - }); - // Wait briefly to ensure the server starts - await new Promise((resolve) => setTimeout(resolve, 1000)); - } - - const response = await fetch(`http://localhost:${port}/add-mhtml`, { - method: "POST", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ mhtmlFilePath }), - }); - - if (!response.ok) { - throw new Error(`Failed to add MHTML: ${response.statusText}`); - } - - const responseData = await response.json(); - resources = responseData.resources; - - // Set the local URL to the modified index.html - localUrl = `http://localhost:${port}/static/${example.id}/index.html`; - } - - await stagehand.page.goto(localUrl); - - let schemaDefinition: z.ZodRawShape; - - if ( - typeof example.schema_ === "string" && - SchemaName.options.includes(example.schema_) - ) { - // If schema_ is a predefined SchemaName - schemaDefinition = getSchemaByName(example.schema_ as SchemaName); - } else if (typeof example.schema_ === "object") { - // If schema_ is a custom JSON schema - schemaDefinition = getCustomSchema( - example.schema_ as Record, - ); - } else { - throw new Error("Invalid schema definition"); - } - - // Fetch the goal from goals.json based on the subcategory - const goals = getGoals(); - const goal = - goals[example.subcategory] || - example.goal || - "Scrape the content of this page."; - - let extractionResult; - - if (example.type === "listing_detail") { - // If the type is listing_detail, expect an array of the schema - extractionResult = await stagehand.extract({ - instruction: goal, - schema: z.object({ items: z.array(z.object(schemaDefinition)) }), - modelName: "gpt-4o-2024-08-06", - }); - } else { - // For other types, expect a single object of the schema - extractionResult = await stagehand.extract({ - instruction: goal, - schema: z.object(schemaDefinition), - modelName: "gpt-4o-2024-08-06", - }); - } - - if (example.type === "listing_detail") { - extractionResult = extractionResult.items; - } - - console.log("Extracted data:", extractionResult); - - for (const evalItem of example.evals) { - if (evalItem.type === "json_match") { - if (evalItem.expected) { - if (!validateJsonMatch(evalItem.expected, extractionResult)) { - console.log("❌ JSON match failed"); - return { - _success: false, - case: "json_mismatch_1", - expected: evalItem.expected, - actual: extractionResult, - }; - } - } else if (evalItem.options) { - const matchesAny = evalItem.options.some((option) => - validateJsonMatch(option, extractionResult), - ); - if (!matchesAny) { - console.log("❌ No JSON match found in options"); - return { - _success: false, - case: "json_mismatch_2", - expected: evalItem.expected, - actual: extractionResult, - }; - } - } - } else if ( - evalItem.type === "end_url_match" && - typeof evalItem.expected === "string" - ) { - if ( - !validateEndUrlMatch(evalItem.expected, await stagehand.page.url()) - ) { - console.log("❌ URL match failed"); - return { - _success: false, - case: "url_mismatch", - expected: evalItem.expected, - actual: await stagehand.page.url(), - }; - } - } - } - - console.log("✅ All evaluations passed"); - return { - _success: true, - expected: extractionResult, - actual: extractionResult, - }; - } catch (error) { - console.error("Error during evaluation:", error); - return { - _success: false, - error: error, - }; - } finally { - try { - const deleteResponse = await fetch( - `http://localhost:${port}/delete-resources`, - { - method: "DELETE", - headers: { - "Content-Type": "application/json", - }, - body: JSON.stringify({ exampleId }), - }, - ); - - if (!deleteResponse.ok) { - console.error( - `Failed to delete resources: ${deleteResponse.statusText}`, - ); - } else { - console.log("Resources deleted successfully."); - } - } catch (deleteError) { - console.error("Error deleting resources:", deleteError); - } - - try { - if (server) { - server.close(() => { - console.log("Express server closed."); - }); - } - } catch (closeError) { - console.error("Error closing server:", closeError); - } - - await stagehand.context.close(); - } -} - -export const chosenBananalyzerEvals = [ - { - id: "JNOSAEEZO4j2unWHPFBdO", - tags: ["detail"], - name: "bananalyzer_1", - }, - { - id: "KuDD2GuMDlbuKO4ozdbDA", - tags: ["listing-detail"], - name: "bananalyzer_2", - }, - { - id: "nAXVoJDSuul938vtPvfFB", - tags: ["listing-detail", "detail"], - name: "bananalyzer_3", - }, - { - id: "GQfYTjppPhTgYtsuFUbXF", - tags: ["listing-detail", "detail"], - name: "bananalyzer_4", - }, -]; diff --git a/evals/bananalyzer-ts/init.sh b/evals/bananalyzer-ts/init.sh deleted file mode 100755 index 031a51b3..00000000 --- a/evals/bananalyzer-ts/init.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -# Get the directory of the script -SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - -# Clone the repository -git clone https://github.com/navidkpr/bananalyzer -git -C bananalyzer checkout # In case the repo has a bad commit -# Manually update the hash once in a while - -# Copy the static folder to the script's directory -rm -rf "$SCRIPT_DIR/static" -cp -r bananalyzer/static "$SCRIPT_DIR" - -# Remove the cloned repository -rm -rf bananalyzer \ No newline at end of file diff --git a/evals/bananalyzer-ts/playground.ts b/evals/bananalyzer-ts/playground.ts deleted file mode 100644 index 553a51de..00000000 --- a/evals/bananalyzer-ts/playground.ts +++ /dev/null @@ -1,11 +0,0 @@ -import { evaluateExample } from "."; - -async function test() { - const singleExampleId = "JNOSAEEZO4j2unWHPFBdO"; - - await evaluateExample(singleExampleId) - .then((result) => console.log("Evaluation result:", result)) - .catch((error) => console.error("Evaluation error:", error)); -} - -test(); diff --git a/evals/bananalyzer-ts/schemas.ts b/evals/bananalyzer-ts/schemas.ts deleted file mode 100644 index 6d210461..00000000 --- a/evals/bananalyzer-ts/schemas.ts +++ /dev/null @@ -1,136 +0,0 @@ -import { z } from "zod"; -import fs from "fs"; -import path from "path"; - -const basePath = __dirname.includes("bananalyzer-ts") - ? __dirname - : path.join(__dirname, "bananalyzer-ts"); - -// Load examples -const examples = JSON.parse( - fs.readFileSync(path.join(basePath, "static/examples.json"), "utf-8"), -); - -export const ExampleType = z.enum(["listing", "detail", "listing_detail"]); -export const SchemaName = z.enum([ - "job_posting", - "manufacturing_commerce", - "contact", - "contract", - "forum", - "attorney", - "attorney_job_listing", - // Add any other schema names that exist in your JSON file -]); -export const PossibleTags = z.enum([ - "regression", - "single-output", - "accordion", - "pagination", - "colliding-tags", - "contract", - "badly-formatted", - "urls", - "enqueue", - "infinite-scroll", - "synthetic", - "images", -]); - -// Add a type definition for SchemaName -type SchemaName = - | "job_posting" - | "manufacturing_commerce" - | "contact" - | "contract" - | "forum" - | "attorney" - | "attorney_job_listing"; - -// Define Eval schema -export const EvalSchema = z.object({ - type: z.enum(["json_match", "end_url_match"]).default("json_match"), - expected: z.any().nullable(), - options: z.array(z.any()).nullable(), -}); - -// Update the Example schema to allow schema_ to be either SchemaName or a custom schema object -export const ExampleSchema = z.object({ - id: z.string(), - url: z.string(), - resource_path: z.string().nullable(), - source: z.enum(["html", "mhtml", "hosted", "har"]), - category: z.string(), - subcategory: z.string(), - type: ExampleType, - goal: z.string(), - schema_: z.union([SchemaName, z.record(z.any())]), - evals: z.array(EvalSchema), - tags: z.array(PossibleTags).default([]), -}); - -export type Example = z.infer; -export type Eval = z.infer; - -// Separate function to get predefined schema by name -export function getSchemaByName(schemaName: SchemaName): z.ZodRawShape { - const schemaPath = path.join(basePath, "static/schemas.json"); - const schemasJson = JSON.parse(fs.readFileSync(schemaPath, "utf-8")); - - if (!(schemaName in schemasJson)) { - throw new Error(`Schema ${schemaName} not found in schemas.json`); - } - - const schemaDefinition = schemasJson[schemaName]; - return Object.entries(schemaDefinition).reduce((acc, [key, value]) => { - acc[key] = zodTypeFromJsonSchema(value as any); - return acc; - }, {} as z.ZodRawShape); -} - -// Function to handle custom JSON schemas -export function getCustomSchema( - customSchema: Record, -): z.ZodRawShape { - return Object.entries(customSchema).reduce((acc, [key, value]) => { - acc[key] = zodTypeFromJsonSchema(value); - return acc; - }, {} as z.ZodRawShape); -} - -// Helper function to convert JSON schema types to Zod types -function zodTypeFromJsonSchema(jsonSchema: any): z.ZodTypeAny { - switch (jsonSchema.type) { - case "string": - return z.string(); - case "number": - return z.number(); - case "integer": - return z.number().int(); - case "boolean": - return z.boolean(); - case "array": - return z.array(zodTypeFromJsonSchema(jsonSchema.items)); - case "currency": - return z.string(); - case "object": - return z.object( - Object.entries(jsonSchema.properties).reduce((acc, [key, value]) => { - acc[key] = zodTypeFromJsonSchema(value as any); - return acc; - }, {} as z.ZodRawShape), - ); - case "email": - return z.string(); - case "url": - return z.string(); - default: - return z.any(); - } -} - -// Function to read and parse the goals.json file -export function getGoals(): Record { - const goalsPath = path.join(basePath, "static/goals.json"); - return JSON.parse(fs.readFileSync(goalsPath, "utf-8")); -} diff --git a/evals/bananalyzer-ts/server/expressServer.ts b/evals/bananalyzer-ts/server/expressServer.ts deleted file mode 100644 index 8fca1daf..00000000 --- a/evals/bananalyzer-ts/server/expressServer.ts +++ /dev/null @@ -1,160 +0,0 @@ -import express, { Express, Request, Response } from "express"; -import path from "path"; -import fs from "fs"; -import { parseMHTMLFile } from "../utils/mhtmlParser"; -import * as cheerio from "cheerio"; -import { URL } from "url"; - -const publicDir = path.join(__dirname, "public"); - -// Ensure the public directory exists -if (!fs.existsSync(publicDir)) { - fs.mkdirSync(publicDir, { recursive: true }); -} - -export function createExpressServer(): Express { - const app = express(); - - app.use(express.json()); - - app.use("/static", express.static(publicDir)); - - // Endpoint to add MHTML content - app.post("/add-mhtml", async (req: Request, res: Response) => { - const { mhtmlFilePath } = req.body; - - if (!mhtmlFilePath) { - return res.status(400).send("Missing mhtmlFilePath"); - } - - try { - const parsedMHTML = await parseMHTMLFile(mhtmlFilePath); - - const exampleId = path.basename(path.dirname(mhtmlFilePath)); - const exampleDir = path.join(publicDir, exampleId); - - if (!fs.existsSync(exampleDir)) { - fs.mkdirSync(exampleDir, { recursive: true }); - } - - parsedMHTML.resources.forEach((resource) => { - try { - // Use the correct property, e.g., 'contentLocation' - const resourceURL = new URL(resource.contentLocation); - const relativePath = resourceURL.pathname.startsWith("/") - ? resourceURL.pathname.slice(1) // Remove leading "/" - : resourceURL.pathname; - - // Define the full path within the example directory - const resourcePath = path.join(exampleDir, relativePath); - const resourceDir = path.dirname(resourcePath); - - // Ensure the resource directory exists - if (!fs.existsSync(resourceDir)) { - fs.mkdirSync(resourceDir, { recursive: true }); - } - - // Write the resource content to the specified path - fs.writeFileSync(resourcePath, resource.content); - } catch (resourceError) { - console.error( - `Failed to save resource ${resource.contentLocation}:`, - resourceError, - ); - } - }); - - // Modify HTML to point resource URLs to the local server - const $ = cheerio.load(parsedMHTML.html); - - // Update all ,