Skip to content

Commit

Permalink
remove startCrawl; comment playwright
Browse files Browse the repository at this point in the history
  • Loading branch information
qdequele committed Oct 19, 2024
1 parent 6195809 commit 60f2416
Show file tree
Hide file tree
Showing 8 changed files with 123 additions and 34 deletions.
7 changes: 7 additions & 0 deletions misc/config_tests/-default-simple-playwright.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"start_urls": ["https://www.meilisearch.com/docs"],
"meilisearch_url": "localhost:7700",
"meilisearch_api_key": "masterKey",
"meilisearch_index_uid": "default-simple-playwright",
"crawler_type": "playwright"
}
8 changes: 5 additions & 3 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@
"meilisearch": "0.44.1",
"minimatch": "10.0.1",
"node-html-markdown": "^1.3.0",
"puppeteer": "23.5.3",
"puppeteer-core": "23.5.3",
"puppeteer": "23.6.0",
"puppeteer-core": "23.6.0",
"uuid": "10.0.0",
"yargs": "17.7.2"
},
Expand All @@ -40,6 +40,7 @@
"@apify/log": "2.5.7",
"@apify/tsconfig": "0.1.0",
"@types/express": "5.0.0",
"@types/minimist": "^1.2.5",
"@types/node": "^22.5.5",
"@types/prettier": "3.0.0",
"@types/puppeteer": "7.0.4",
Expand All @@ -52,8 +53,9 @@
"eslint-config-prettier": "9.1.0",
"eslint-plugin-jest": "28.8.3",
"eslint-plugin-prettier": "5.2.1",
"minimist": "^1.2.8",
"nodemon": "3.1.7",
"playwright": "1.48.0",
"playwright": "^1.48.0",
"ts-node": "^10.9.2",
"typescript": "5.6.3"
},
Expand Down
5 changes: 4 additions & 1 deletion src/crawlers/base.ts
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ export abstract class BaseCrawler {
this.sender = sender;
this.config = config;
this.urls = config.start_urls;
this.crawlerType = config.crawler_type || "puppeteer";
this.crawlerType = config.crawler_type || "cheerio";

this.scraper =
this.config.strategy === "docssearch"
Expand Down Expand Up @@ -80,6 +80,9 @@ export abstract class BaseCrawler {
if (this.crawlerType === "puppeteer") {
const pageContent = await context.page.content(); // Get HTML content from Puppeteer page
$ = cheerio.load(pageContent); // Load HTML into Cheerio
// } else if (this.crawlerType === "playwright") {
// const pageContent = await context.page.content(); // Get HTML content from Playwright page
// $ = cheerio.load(pageContent); // Load HTML into Cheerio
} else {
$ = context.$; // Use Cheerio context if not Puppeteer
}
Expand Down
3 changes: 3 additions & 0 deletions src/crawlers/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { Log, RequestQueue } from "crawlee";
import { PuppeteerCrawler } from "./puppeteer";
// import { PlaywrightCrawler } from "./playwright";
import { CheerioCrawler } from "./cheerio";
import { Sender } from "../sender";
import { Config, CrawlerType } from "../types";
Expand All @@ -23,6 +24,8 @@ export class Crawler {
return new PuppeteerCrawler(sender, config, launchOptions, launcher);
case "cheerio":
return new CheerioCrawler(sender, config);
// case "playwright":
// return new PlaywrightCrawler(sender, config, launchOptions);
default:
throw new Error(`Unsupported crawler type: ${crawlerType}`);
}
Expand Down
77 changes: 77 additions & 0 deletions src/crawlers/playwright.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import {
createPlaywrightRouter,
PlaywrightCrawler as CrawleePlaywrightCrawler,
PlaywrightCrawlerOptions,
PlaywrightHook,
PlaywrightCrawlingContext,
Router,
RequestQueue,
} from "crawlee";
import { LaunchOptions } from "playwright";
// import { firefox } from "playwright";
import { BaseCrawler } from "./base";
import { Sender } from "../sender";
import { Config } from "../types";

/**
 * Crawler implementation that drives pages through Crawlee's Playwright
 * integration.
 *
 * It supplies the Playwright-specific pieces (router, crawler options and
 * crawler instance) and forwards every crawled page to `handlePage`, which is
 * presumably inherited from `BaseCrawler` — confirm against `./base`.
 */
export class PlaywrightCrawler extends BaseCrawler {
  /** Playwright launch options merged into the Crawlee launch context. */
  launchOptions: LaunchOptions = {};

  /**
   * @param sender - Forwarded unchanged to the `BaseCrawler` constructor.
   * @param config - Crawler configuration; this class reads
   *   `additional_request_headers` and `headless` from it.
   * @param launchOptions - Extra Playwright launch options. They are spread
   *   last in `getCrawlerOptions`, so they take precedence over the defaults
   *   set there (including `headless`).
   */
  constructor(
    sender: Sender,
    config: Config,
    launchOptions: LaunchOptions = {}
  ) {
    super(sender, config);
    this.launchOptions = launchOptions;
  }

  /** Creates a fresh Crawlee router typed for Playwright crawling contexts. */
  createRouter(): Router<PlaywrightCrawlingContext> {
    return createPlaywrightRouter();
  }

  /**
   * Builds the options object passed to Crawlee's Playwright crawler.
   *
   * @param requestQueue - Queue the crawler pulls requests from.
   * @param router - Router used as the crawler's request handler.
   * @returns Options wiring the queue, router, optional header-injection hook
   *   and the browser launch options.
   */
  getCrawlerOptions(
    requestQueue: RequestQueue,
    router: Router<PlaywrightCrawlingContext>
  ): PlaywrightCrawlerOptions {
    // Only install the interception hook when extra headers are configured,
    // so that plain crawls do not route every request through a handler.
    const preNavigationHooks: PlaywrightHook[] = this.config
      .additional_request_headers
      ? [
          async (crawlingContext) => {
            await crawlingContext.page.route("**/*", async (route) => {
              const request = route.request();
              await route.continue({
                headers: {
                  // Configured headers are spread last, so they override the
                  // request's own headers when keys collide.
                  ...request.headers(),
                  ...this.config.additional_request_headers,
                },
              });
            });
          },
        ]
      : [];

    return {
      requestQueue,
      // NOTE(review): cast kept from the original; presumably the Router type
      // does not line up with `requestHandler`'s declared type — confirm.
      requestHandler: router as any,
      preNavigationHooks: preNavigationHooks,
      launchContext: {
        // launcher: firefox,
        launchOptions: {
          // `??` instead of `||`: `false || true` is always `true`, which made
          // it impossible to turn headless mode off through the config.
          headless: this.config.headless ?? true,
          // args: ["--no-sandbox", "--disable-setuid-sandbox"],
          ...this.launchOptions,
        },
      },
    };
  }

  /** Instantiates the underlying Crawlee Playwright crawler. */
  createCrawlerInstance(
    options: PlaywrightCrawlerOptions
  ): CrawleePlaywrightCrawler {
    return new CrawleePlaywrightCrawler(options);
  }

  /** Default route handler: delegates the crawled page to `handlePage`. */
  override async defaultHandler(context: PlaywrightCrawlingContext) {
    await this.handlePage(context);
  }
}
22 changes: 0 additions & 22 deletions src/server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ class Server {
this.app.post("/crawl", this.__asyncCrawl.bind(this));
this.app.post("/crawl/async", this.__asyncCrawl.bind(this));
this.app.post("/crawl/sync", this.__syncCrawl.bind(this));
this.app.post("/crawl/start", this.__startCrawl.bind(this));
this.app.post("/webhook", this.__log_webhook.bind(this));

this.app.listen(port, () =>
Expand Down Expand Up @@ -70,27 +69,6 @@ class Server {
res.send("Crawling finished");
}

async __startCrawl(req: express.Request, res: express.Response) {
const config: Config = req.body;
log.info("Starting crawl process", { config });
res.send("Crawling started");

const sender = new Sender(config);
await sender.init();

const crawler = await Crawler.create(
config.crawler_type || "puppeteer",
sender,
config,
config.launch_options,
config.launcher
);

await Crawler.run(crawler);
await sender.finish();
log.info("Crawl process completed", { config });
}

__log_webhook(req: express.Request, res: express.Response) {
log.info("Webhook received", { body: req.body });
res.send("Webhook acknowledged");
Expand Down
2 changes: 0 additions & 2 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ export type SchemaSettings = {
only_type: string;
};

export type ScraperType = "default" | "docssearch" | "schema";

export type Scraper = {
get: (url: string, $: CheerioAPI) => Promise<void>;
};
Expand Down
33 changes: 27 additions & 6 deletions tests/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ import fs from "fs";
import path from "path";
import { exec } from "child_process";
import { MeiliSearch } from "meilisearch";
import minimist from "minimist";
import { minimatch } from "minimatch";

interface TestResult {
name: string;
Expand All @@ -17,13 +19,19 @@ interface TestConfig {
content: Record<string, any>;
}

function getTestConfigs(): TestConfig[] {
function getTestConfigs(pattern?: string): TestConfig[] {
const configDir = path.join(__dirname, "../../misc/config_tests");
const configFiles = fs
let configFiles = fs
.readdirSync(configDir)
.filter((file) => !file.startsWith("-") && file.endsWith(".json"));

return configFiles.map((file) => {
if (pattern) {
configFiles = configFiles.filter((file) =>
minimatch(file, pattern, { nocase: true })
);
}

const configs = configFiles.map((file) => {
const content = JSON.parse(
fs.readFileSync(path.join(configDir, file), "utf-8")
);
Expand All @@ -33,6 +41,8 @@ function getTestConfigs(): TestConfig[] {
content,
};
});

return configs;
}

function runCrawlerWithMetrics(
Expand Down Expand Up @@ -89,9 +99,16 @@ async function verifyMeilisearchContent(configContent: Record<string, any>) {
return stats;
}

async function runAllTests() {
async function runAllTests(pattern?: string) {
const startTime = performance.now();
const testConfigs = getTestConfigs();
const testConfigs = getTestConfigs(pattern);

if (testConfigs.length === 0) {
console.log(
`No test configurations found${pattern ? ` matching pattern: ${pattern}` : ""}`
);
return;
}

const results = {
timestamp: new Date().toISOString(),
Expand Down Expand Up @@ -174,4 +191,8 @@ async function runAllTests() {
}
}

runAllTests().catch(console.error);
// Main execution
const argv = minimist(process.argv.slice(2));
const pattern = argv.pattern || argv.p;

runAllTests(pattern).catch(console.error);

0 comments on commit 60f2416

Please sign in to comment.