From b7ce8d1518cb44d8275942ea02369b3333b04082 Mon Sep 17 00:00:00 2001
From: 18043821456 <1092565192@qq.com>
Date: Thu, 29 Feb 2024 11:56:58 +0000
Subject: [PATCH] feat: Customizing selector can sometimes cause web crawlers to fail

---
 src/core.ts | 38 ++++++++++++++++----------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/src/core.ts b/src/core.ts
index 420e5b9f..7b5f9c92 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -65,34 +65,28 @@ export async function crawl(config: Config) {
           );
 
           // Use custom handling for XPath selector
-          let html: string | null = null; // 初始化为null或空字符串""
-          if (config.selector) {
-            if (config.selector.startsWith("/")) {
-              await waitForXPath(
-                page,
-                config.selector,
-                config.waitForSelectorTimeout ?? 1000,
-              );
-              html = await getPageHtml(page, config.selector);
-            } else {
-              try {
-                // 尝试等待CSS选择器,捕获可能的异常
-                await page.waitForSelector(config.selector, {
-                  timeout: config.waitForSelectorTimeout ?? 1000,
-                });
-                html = await getPageHtml(page, config.selector);
-              } catch (error) {
-                // 如果CSS选择器等待失败,则输出日志并等待
-                console.log(
-                  `CSS Selector "${config.selector}" not found. Waiting for instead.`,
+          let effectiveSelector = config.selector;
+          if (effectiveSelector) {
+            try {
+              if (effectiveSelector.startsWith("/")) {
+                await waitForXPath(
+                  page,
+                  effectiveSelector,
+                  config.waitForSelectorTimeout ?? 1000,
                 );
-                await page.waitForSelector("body", {
+              } else {
+                await page.waitForSelector(effectiveSelector, {
                   timeout: config.waitForSelectorTimeout ?? 1000,
                 });
-                html = await getPageHtml(page, "body");
               }
+            } catch (error) {
+              console.log(
+                `Selector "${config.selector}" not found. Defaulting to .`,
+              );
+              effectiveSelector = undefined;
             }
           }
+          const html = await getPageHtml(page, effectiveSelector);
 
           // Save results as JSON to ./storage/datasets/default
           await pushData({ title, url: request.loadedUrl, html });