diff --git a/src/crawler.ts b/src/crawler.ts
index e42da1b..8d10f80 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -5,6 +5,7 @@ import {
   PuppeteerCrawlingContext,
   PuppeteerCrawlerOptions,
   RequestQueue,
+  PuppeteerHook,
 } from 'crawlee'
 
 import { minimatch } from 'minimatch'
@@ -50,8 +51,8 @@ export class Crawler {
       this.config.strategy == 'docssearch'
         ? new DocsearchScraper(this.sender, this.config)
         : this.config.strategy == 'schema'
-        ? new SchemaScraper(this.sender, this.config)
-        : new DefaultScraper(this.sender, this.config)
+          ? new SchemaScraper(this.sender, this.config)
+          : new DefaultScraper(this.sender, this.config)
   }
 
   async run() {
@@ -62,12 +63,30 @@ export class Crawler {
 
     //Create the router
     const router = createPuppeteerRouter()
+
     // type DefaultHandler = Parameters<typeof router.addDefaultHandler>[0];
     router.addDefaultHandler(this.defaultHandler.bind(this))
 
+    // Merge the user-configured headers into every intercepted browser request.
+    // Both calls are awaited: the hook must not resolve before the interceptor
+    // is registered, and a failed continue() must not become an unhandled rejection.
+    const preNavigationHooks: PuppeteerHook[] = this.config.additional_request_headers ? [
+      async (crawlingContext) => {
+        await crawlingContext.addInterceptRequestHandler(async (request) => {
+          await request.continue({
+            headers: {
+              ...request.headers(),
+              ...this.config.additional_request_headers,
+            },
+          })
+        })
+      },
+    ] : []
+
     const puppeteerCrawlerOptions: PuppeteerCrawlerOptions = {
       requestQueue,
       requestHandler: router,
+      preNavigationHooks,
       launchContext: {
         launchOptions: {
           headless: this.config.headless || true,
diff --git a/src/types.ts b/src/types.ts
index 2da52ee..4bf5568 100644
--- a/src/types.ts
+++ b/src/types.ts
@@ -9,6 +9,8 @@ export type Config = {
   meilisearch_api_key: string
   start_urls: string[]
   urls_to_exclude?: string[]
+  // Extra HTTP headers (name -> value) merged into each intercepted request while crawling
+  additional_request_headers?: Record<string, string>
   queue?: string[]
   primary_key?: string
   batch_size?: number