diff --git a/CHANGELOG.md b/CHANGELOG.md index 81764d5..bcc9c66 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +## [2.0.5](https://github.com/armand1m/papercut/compare/v2.0.4...v2.0.5) (2021-11-15) + + +### Bug Fixes + +* export utilities and add managed jsdom example ([132038b](https://github.com/armand1m/papercut/commit/132038bd46bf6386b168967925f0cadf8a906241)) + ## [2.0.4](https://github.com/armand1m/papercut/compare/v2.0.3...v2.0.4) (2021-11-15) diff --git a/README.md b/README.md index 39ce8ff..ad1cefb 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,10 @@ const main = async () => { baseUrl: "https://news.ycombinator.com/", target: ".athing", selectors: { - rank: ({ text }) => text('.rank'), + rank: (utils) => { + const value = utils.text('.rank').replace(/^\D+/g, ''); + return Number(value); + }, name: ({ text }) => text('.titlelink'), url: ({ href }) => href('.titlelink'), score: ({ element }) => { @@ -229,6 +232,76 @@ Then run it using `node` or `ts-node`: npx ts-node ./paginated-scraper.ts ``` +#### Managed JSDOM + +In case you want to use your own JSDOM and Pino instance and tweak/configure as much as you prefer, you can use the `scrape` function instead. + +In the example below, we use the exposed `createWindow` and `fetchPage` utilities for convenience. You can use JSDOM constructor directly and any other strategy to fetch your page HTML as desired. 
+ +```ts file=./examples/typescript/src/managed-jsdom/scraper.ts +import pino from 'pino' +import { scrape, fetchPage, createWindow } from '@armand1m/papercut'; + +const main = async () => { + const logger = pino({ + name: 'Hacker News', + enabled: false + }); + + const rawHTML = await fetchPage('https://news.ycombinator.com/') + const window = createWindow(rawHTML); + + const results = await scrape({ + strict: true, + logger, + document: window.document, + target: ".athing", + selectors: { + rank: (utils) => { + const value = utils.text('.rank').replace(/^\D+/g, ''); + return Number(value); + }, + name: ({ text }) => text('.titlelink'), + url: ({ href }) => href('.titlelink'), + score: ({ element }) => { + return element.nextElementSibling?.querySelector('.score') + ?.textContent; + }, + createdBy: ({ element }) => { + return element.nextElementSibling?.querySelector('.hnuser') + ?.textContent; + }, + createdAt: ({ element }) => { + return element.nextElementSibling + ?.querySelector('.age') + ?.getAttribute('title'); + }, + }, + options: { + log: false, + cache: true, + concurrency: { + page: 2, + node: 2, + selector: 2 + } + } + }); + + window.close(); + + console.log(JSON.stringify(results, null, 2)); +}; + +main(); +``` + +Then run it using `node` or `ts-node`: + +```sh +npx ts-node ./managed-jsdom.ts +``` + ## API Reference [Click here to open the API reference.](https://armand1m.github.io/papercut) diff --git a/docs/assets/highlight.css b/docs/assets/highlight.css index d04c09f..1ed678f 100644 --- a/docs/assets/highlight.css +++ b/docs/assets/highlight.css @@ -13,16 +13,16 @@ --dark-hl-5: #569CD6; --light-hl-6: #0070C1; --dark-hl-6: #4FC1FF; - --light-hl-7: #267F99; - --dark-hl-7: #4EC9B0; - --light-hl-8: #098658; - --dark-hl-8: #B5CEA8; - --light-hl-9: #811F3F; - --dark-hl-9: #D16969; - --light-hl-10: #000000; - --dark-hl-10: #D7BA7D; - --light-hl-11: #EE0000; - --dark-hl-11: #DCDCAA; + --light-hl-7: #811F3F; + --dark-hl-7: #D16969; + --light-hl-8: #EE0000; 
+ --dark-hl-8: #DCDCAA; + --light-hl-9: #000000; + --dark-hl-9: #D7BA7D; + --light-hl-10: #267F99; + --dark-hl-10: #4EC9B0; + --light-hl-11: #098658; + --dark-hl-11: #B5CEA8; --light-hl-12: #008000; --dark-hl-12: #6A9955; --light-code-background: #FFFFFF; diff --git a/docs/assets/search.js b/docs/assets/search.js index 2f329d9..1a4f5ca 100644 --- a/docs/assets/search.js +++ b/docs/assets/search.js @@ -1 +1 @@ -window.searchData = {"kinds":{"64":"Function","256":"Interface","1024":"Property","65536":"Type literal","4194304":"Type alias"},"rows":[{"id":0,"kind":64,"name":"createScraper","url":"modules.html#createScraper","classes":"tsd-kind-function"},{"id":1,"kind":4194304,"name":"Scraper","url":"modules.html#Scraper","classes":"tsd-kind-type-alias"},{"id":2,"kind":256,"name":"ScraperOptions","url":"interfaces/ScraperOptions.html","classes":"tsd-kind-interface"},{"id":3,"kind":1024,"name":"log","url":"interfaces/ScraperOptions.html#log","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":4,"kind":1024,"name":"cache","url":"interfaces/ScraperOptions.html#cache","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":5,"kind":1024,"name":"concurrency","url":"interfaces/ScraperOptions.html#concurrency","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":6,"kind":65536,"name":"__type","url":"interfaces/ScraperOptions.html#__type","classes":"tsd-kind-type-literal tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":7,"kind":1024,"name":"page","url":"interfaces/ScraperOptions.html#__type.page","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":8,"kind":1024,"name":"node","url":"interfaces/ScraperOptions.html#__type.node","classes":"tsd-kind-property 
tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":9,"kind":1024,"name":"selector","url":"interfaces/ScraperOptions.html#__type.selector","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":10,"kind":256,"name":"ScraperProps","url":"interfaces/ScraperProps.html","classes":"tsd-kind-interface"},{"id":11,"kind":1024,"name":"name","url":"interfaces/ScraperProps.html#name","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperProps"},{"id":12,"kind":1024,"name":"options","url":"interfaces/ScraperProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperProps"},{"id":13,"kind":64,"name":"createRunner","url":"modules.html#createRunner","classes":"tsd-kind-function"},{"id":14,"kind":4194304,"name":"SelectorMap","url":"modules.html#SelectorMap","classes":"tsd-kind-type-alias"},{"id":15,"kind":4194304,"name":"SelectorFunction","url":"modules.html#SelectorFunction","classes":"tsd-kind-type-alias"},{"id":16,"kind":65536,"name":"__type","url":"modules.html#SelectorFunction.__type","classes":"tsd-kind-type-literal tsd-parent-kind-type-alias","parent":"SelectorFunction"},{"id":17,"kind":256,"name":"CreateRunnerProps","url":"interfaces/CreateRunnerProps.html","classes":"tsd-kind-interface"},{"id":18,"kind":1024,"name":"logger","url":"interfaces/CreateRunnerProps.html#logger","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"CreateRunnerProps"},{"id":19,"kind":1024,"name":"options","url":"interfaces/CreateRunnerProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"CreateRunnerProps"},{"id":20,"kind":256,"name":"RunProps","url":"interfaces/RunProps.html","classes":"tsd-kind-interface tsd-has-type-parameter"},{"id":21,"kind":1024,"name":"strict","url":"interfaces/RunProps.html#strict","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"RunProps"},{"id":22,"kind":1024,"name":"baseUrl","url":"interfaces/RunProps.html#baseUrl","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":23,"kind":1024,"name":"target","url":"interfaces/RunProps.html#target","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":24,"kind":1024,"name":"selectors","url":"interfaces/RunProps.html#selectors","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":25,"kind":1024,"name":"pagination","url":"interfaces/RunProps.html#pagination","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":26,"kind":64,"name":"scrape","url":"modules.html#scrape","classes":"tsd-kind-function tsd-has-type-parameter"},{"id":27,"kind":256,"name":"ScrapeProps","url":"interfaces/ScrapeProps.html","classes":"tsd-kind-interface tsd-has-type-parameter"},{"id":28,"kind":1024,"name":"strict","url":"interfaces/ScrapeProps.html#strict","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":29,"kind":1024,"name":"target","url":"interfaces/ScrapeProps.html#target","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":30,"kind":1024,"name":"document","url":"interfaces/ScrapeProps.html#document","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":31,"kind":1024,"name":"selectors","url":"interfaces/ScrapeProps.html#selectors","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":32,"kind":1024,"name":"logger","url":"interfaces/ScrapeProps.html#logger","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":33,"kind":1024,"name":"options","url":"interfaces/ScrapeProps.html#options","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":34,"kind":4194304,"name":"ScrapeResultType","url":"modules.html#ScrapeResultType","classes":"tsd-kind-type-alias tsd-has-type-parameter"},{"id":35,"kind":64,"name":"createSelectorUtilities","url":"modules.html#createSelectorUtilities","classes":"tsd-kind-function"},{"id":36,"kind":4194304,"name":"SelectorUtilities","url":"modules.html#SelectorUtilities","classes":"tsd-kind-type-alias"},{"id":37,"kind":64,"name":"geosearch","url":"modules.html#geosearch","classes":"tsd-kind-function"},{"id":38,"kind":256,"name":"GeosearchResult","url":"interfaces/GeosearchResult.html","classes":"tsd-kind-interface"},{"id":39,"kind":1024,"name":"latitude","url":"interfaces/GeosearchResult.html#latitude","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"GeosearchResult"},{"id":40,"kind":1024,"name":"longitude","url":"interfaces/GeosearchResult.html#longitude","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"GeosearchResult"}],"index":{"version":"2.3.9","fields":["name","parent"],"fieldVectors":[["name/0",[0,33.322]],["parent/0",[]],["name/1",[1,33.322]],["parent/1",[]],["name/2",[2,20.329]],["parent/2",[]],["name/3",[3,33.322]],["parent/3",[2,1.611]],["name/4",[4,33.322]],["parent/4",[2,1.611]],["name/5",[5,33.322]],["parent/5",[2,1.611]],["name/6",[6,28.214]],["parent/6",[2,1.611]],["name/7",[7,33.322]],["parent/7",[8,1.969]],["name/8",[9,33.322]],["parent/8",[8,1.969]],["name/9",[10,33.322]],["parent/9",[8,1.969]],["name/10",[11,24.849]],["parent/10",[]],["name/11",[12,33.322]],["parent/11",[11,1.969]],["name/12",[13,24.849]],["parent/12",[11,1.969]],["name/13",[14,33.322]],["parent/13",[]],["name/14",[15,33.322]],["parent/14",[]],["name/15",[16,28.214]],["parent/15",[]],["name/16",[6,28.214]],["parent/16",[16,2.236]],["name/17",[17,24.849]],["parent/17",[]],["name/18",[18,28.214]],["parent/18",[17,1.969]],["name/19",[13,24.849]],["parent/19",[17,1.969]],["name/20",[19,18.659]],["parent/20",[]],
["name/21",[20,28.214]],["parent/21",[19,1.479]],["name/22",[21,33.322]],["parent/22",[19,1.479]],["name/23",[22,28.214]],["parent/23",[19,1.479]],["name/24",[23,28.214]],["parent/24",[19,1.479]],["name/25",[24,33.322]],["parent/25",[19,1.479]],["name/26",[25,33.322]],["parent/26",[]],["name/27",[26,17.228]],["parent/27",[]],["name/28",[20,28.214]],["parent/28",[26,1.365]],["name/29",[22,28.214]],["parent/29",[26,1.365]],["name/30",[27,33.322]],["parent/30",[26,1.365]],["name/31",[23,28.214]],["parent/31",[26,1.365]],["name/32",[18,28.214]],["parent/32",[26,1.365]],["name/33",[13,24.849]],["parent/33",[26,1.365]],["name/34",[28,33.322]],["parent/34",[]],["name/35",[29,33.322]],["parent/35",[]],["name/36",[30,33.322]],["parent/36",[]],["name/37",[31,33.322]],["parent/37",[]],["name/38",[32,24.849]],["parent/38",[]],["name/39",[33,33.322]],["parent/39",[32,1.969]],["name/40",[34,33.322]],["parent/40",[32,1.969]]],"invertedIndex":[["__type",{"_index":6,"name":{"6":{},"16":{}},"parent":{}}],["baseurl",{"_index":21,"name":{"22":{}},"parent":{}}],["cache",{"_index":4,"name":{"4":{}},"parent":{}}],["concurrency",{"_index":5,"name":{"5":{}},"parent":{}}],["createrunner",{"_index":14,"name":{"13":{}},"parent":{}}],["createrunnerprops",{"_index":17,"name":{"17":{}},"parent":{"18":{},"19":{}}}],["createscraper",{"_index":0,"name":{"0":{}},"parent":{}}],["createselectorutilities",{"_index":29,"name":{"35":{}},"parent":{}}],["document",{"_index":27,"name":{"30":{}},"parent":{}}],["geosearch",{"_index":31,"name":{"37":{}},"parent":{}}],["geosearchresult",{"_index":32,"name":{"38":{}},"parent":{"39":{},"40":{}}}],["latitude",{"_index":33,"name":{"39":{}},"parent":{}}],["log",{"_index":3,"name":{"3":{}},"parent":{}}],["logger",{"_index":18,"name":{"18":{},"32":{}},"parent":{}}],["longitude",{"_index":34,"name":{"40":{}},"parent":{}}],["name",{"_index":12,"name":{"11":{}},"parent":{}}],["node",{"_index":9,"name":{"8":{}},"parent":{}}],["options",{"_index":13,"name":{"12":{},"19":{},
"33":{}},"parent":{}}],["page",{"_index":7,"name":{"7":{}},"parent":{}}],["pagination",{"_index":24,"name":{"25":{}},"parent":{}}],["runprops",{"_index":19,"name":{"20":{}},"parent":{"21":{},"22":{},"23":{},"24":{},"25":{}}}],["scrape",{"_index":25,"name":{"26":{}},"parent":{}}],["scrapeprops",{"_index":26,"name":{"27":{}},"parent":{"28":{},"29":{},"30":{},"31":{},"32":{},"33":{}}}],["scraper",{"_index":1,"name":{"1":{}},"parent":{}}],["scraperesulttype",{"_index":28,"name":{"34":{}},"parent":{}}],["scraperoptions",{"_index":2,"name":{"2":{}},"parent":{"3":{},"4":{},"5":{},"6":{}}}],["scraperoptions.__type",{"_index":8,"name":{},"parent":{"7":{},"8":{},"9":{}}}],["scraperprops",{"_index":11,"name":{"10":{}},"parent":{"11":{},"12":{}}}],["selector",{"_index":10,"name":{"9":{}},"parent":{}}],["selectorfunction",{"_index":16,"name":{"15":{}},"parent":{"16":{}}}],["selectormap",{"_index":15,"name":{"14":{}},"parent":{}}],["selectors",{"_index":23,"name":{"24":{},"31":{}},"parent":{}}],["selectorutilities",{"_index":30,"name":{"36":{}},"parent":{}}],["strict",{"_index":20,"name":{"21":{},"28":{}},"parent":{}}],["target",{"_index":22,"name":{"23":{},"29":{}},"parent":{}}]],"pipeline":[]}} \ No newline at end of file +window.searchData = {"kinds":{"64":"Function","256":"Interface","1024":"Property","65536":"Type literal","4194304":"Type alias"},"rows":[{"id":0,"kind":64,"name":"createScraper","url":"modules.html#createScraper","classes":"tsd-kind-function"},{"id":1,"kind":4194304,"name":"Scraper","url":"modules.html#Scraper","classes":"tsd-kind-type-alias"},{"id":2,"kind":256,"name":"ScraperOptions","url":"interfaces/ScraperOptions.html","classes":"tsd-kind-interface"},{"id":3,"kind":1024,"name":"log","url":"interfaces/ScraperOptions.html#log","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":4,"kind":1024,"name":"cache","url":"interfaces/ScraperOptions.html#cache","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":5,"kind":1024,"name":"concurrency","url":"interfaces/ScraperOptions.html#concurrency","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":6,"kind":65536,"name":"__type","url":"interfaces/ScraperOptions.html#__type","classes":"tsd-kind-type-literal tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":7,"kind":1024,"name":"page","url":"interfaces/ScraperOptions.html#__type.page","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":8,"kind":1024,"name":"node","url":"interfaces/ScraperOptions.html#__type.node","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":9,"kind":1024,"name":"selector","url":"interfaces/ScraperOptions.html#__type.selector","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":10,"kind":256,"name":"ScraperProps","url":"interfaces/ScraperProps.html","classes":"tsd-kind-interface"},{"id":11,"kind":1024,"name":"name","url":"interfaces/ScraperProps.html#name","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperProps"},{"id":12,"kind":1024,"name":"options","url":"interfaces/ScraperProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperProps"},{"id":13,"kind":64,"name":"createRunner","url":"modules.html#createRunner","classes":"tsd-kind-function"},{"id":14,"kind":4194304,"name":"SelectorMap","url":"modules.html#SelectorMap","classes":"tsd-kind-type-alias"},{"id":15,"kind":4194304,"name":"SelectorFunction","url":"modules.html#SelectorFunction","classes":"tsd-kind-type-alias"},{"id":16,"kind":65536,"name":"__type","url":"modules.html#SelectorFunction.__type","classes":"tsd-kind-type-literal 
tsd-parent-kind-type-alias","parent":"SelectorFunction"},{"id":17,"kind":256,"name":"CreateRunnerProps","url":"interfaces/CreateRunnerProps.html","classes":"tsd-kind-interface"},{"id":18,"kind":1024,"name":"logger","url":"interfaces/CreateRunnerProps.html#logger","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"CreateRunnerProps"},{"id":19,"kind":1024,"name":"options","url":"interfaces/CreateRunnerProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"CreateRunnerProps"},{"id":20,"kind":256,"name":"RunProps","url":"interfaces/RunProps.html","classes":"tsd-kind-interface tsd-has-type-parameter"},{"id":21,"kind":1024,"name":"strict","url":"interfaces/RunProps.html#strict","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":22,"kind":1024,"name":"baseUrl","url":"interfaces/RunProps.html#baseUrl","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":23,"kind":1024,"name":"target","url":"interfaces/RunProps.html#target","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":24,"kind":1024,"name":"selectors","url":"interfaces/RunProps.html#selectors","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":25,"kind":1024,"name":"pagination","url":"interfaces/RunProps.html#pagination","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":26,"kind":64,"name":"scrape","url":"modules.html#scrape","classes":"tsd-kind-function tsd-has-type-parameter"},{"id":27,"kind":256,"name":"ScrapeProps","url":"interfaces/ScrapeProps.html","classes":"tsd-kind-interface tsd-has-type-parameter"},{"id":28,"kind":1024,"name":"strict","url":"interfaces/ScrapeProps.html#strict","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":29,"kind":1024,"name":"target","url":"interfaces/ScrapeProps.html#target","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":30,"kind":1024,"name":"document","url":"interfaces/ScrapeProps.html#document","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":31,"kind":1024,"name":"selectors","url":"interfaces/ScrapeProps.html#selectors","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":32,"kind":1024,"name":"logger","url":"interfaces/ScrapeProps.html#logger","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":33,"kind":1024,"name":"options","url":"interfaces/ScrapeProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":34,"kind":4194304,"name":"ScrapeResultType","url":"modules.html#ScrapeResultType","classes":"tsd-kind-type-alias tsd-has-type-parameter"},{"id":35,"kind":64,"name":"createSelectorUtilities","url":"modules.html#createSelectorUtilities","classes":"tsd-kind-function"},{"id":36,"kind":4194304,"name":"SelectorUtilities","url":"modules.html#SelectorUtilities","classes":"tsd-kind-type-alias"},{"id":37,"kind":64,"name":"geosearch","url":"modules.html#geosearch","classes":"tsd-kind-function"},{"id":38,"kind":256,"name":"GeosearchResult","url":"interfaces/GeosearchResult.html","classes":"tsd-kind-interface"},{"id":39,"kind":1024,"name":"latitude","url":"interfaces/GeosearchResult.html#latitude","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"GeosearchResult"},{"id":40,"kind":1024,"name":"longitude","url":"interfaces/GeosearchResult.html#longitude","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"GeosearchResult"},{"id":41,"kind":64,"name":"fetchPage","url":"modules.html#fetchPage","classes":"tsd-kind-function"},{"id":42,"kind":64,"name":"createWindow","url":"modules.html#createWindow","classes":"tsd-kind-function"}],"index":{"version":"2.3.9","fields":["name","parent"],"fieldVectors":[["name/0",[0,33.787]],["parent/0",[]],["name/1",[1,33.787]],["parent/1",[]],["name/2",[2,20.794]],["parent/2",[]],["name/3",[3,33.787]],["parent/3",[2,1.606]],["name/4",[4,33.787]],["parent/4",[2,1.606]],["name/5",[5,33.787]],["parent/5",[2,1.606]],["name/6",[6,28.679]],["parent/6",[2,1.606]],["name/7",[7,33.787]],["parent/7",[8,1.955]],["name/8",[9,33.787]],["parent/8",[8,1.955]],["name/9",[10,33.787]],["parent/9",[8,1.955]],["name/10",[11,25.314]],["parent/10",[]],["name/11",[12,33.787]],["parent/11",[11,1.955]],["name/12",[13,25.314]],["parent/12",[11,1.955]],["name/13",[14,33.787]],["parent/13",[]],["name/14",[15,33.787]],["parent/14",[]],["name/15",[16,28.679]],["parent/15",[]],["name/16",[6,28.679]],["parent/16",[16,2.215]],["name/17",[17,25.314]],["parent/17",[]],["name/18",[18,28.679]],["parent/18",[17,1.955]],["name/19",[13,25.314]],["parent/19",[17,1.955]],["name/20",[19,19.124]],["parent/20",[]],["name/21",[20,28.679]],["parent/21",[19,1.477]],["name/22",[21,33.787]],["parent/22",[19,1.477]],["name/23",[22,28.679]],["parent/23",[19,1.477]],["name/24",[23,28.679]],["parent/24",[19,1.477]],["name/25",[24,33.787]],["parent/25",[19,1.477]],["name/26",[25,33.787]],["parent/26",[]],["name/27",[26,17.693]],["parent/27",[]],["name/28",[20,28.679]],["parent/28",[26,1.367]],["name/29",[22,28.679]],["parent/29",[26,1.367]],["name/30",[27,33.787]],["parent/30",[26,1.367]],["name/31",[23,28.679]],["parent/31",[26,1.367]],["name/32",[18,28.679]],["parent/32",[26,1.367]],["name/33",[13,25.314]],["parent/33",[26,1.367]],["name/34",[28,33.787]],["parent/34",[]],["name/35",[29,33.787]],["parent/35",[]],["name/36",[30,33.787]],["parent/36",[]],["na
me/37",[31,33.787]],["parent/37",[]],["name/38",[32,25.314]],["parent/38",[]],["name/39",[33,33.787]],["parent/39",[32,1.955]],["name/40",[34,33.787]],["parent/40",[32,1.955]],["name/41",[35,33.787]],["parent/41",[]],["name/42",[36,33.787]],["parent/42",[]]],"invertedIndex":[["__type",{"_index":6,"name":{"6":{},"16":{}},"parent":{}}],["baseurl",{"_index":21,"name":{"22":{}},"parent":{}}],["cache",{"_index":4,"name":{"4":{}},"parent":{}}],["concurrency",{"_index":5,"name":{"5":{}},"parent":{}}],["createrunner",{"_index":14,"name":{"13":{}},"parent":{}}],["createrunnerprops",{"_index":17,"name":{"17":{}},"parent":{"18":{},"19":{}}}],["createscraper",{"_index":0,"name":{"0":{}},"parent":{}}],["createselectorutilities",{"_index":29,"name":{"35":{}},"parent":{}}],["createwindow",{"_index":36,"name":{"42":{}},"parent":{}}],["document",{"_index":27,"name":{"30":{}},"parent":{}}],["fetchpage",{"_index":35,"name":{"41":{}},"parent":{}}],["geosearch",{"_index":31,"name":{"37":{}},"parent":{}}],["geosearchresult",{"_index":32,"name":{"38":{}},"parent":{"39":{},"40":{}}}],["latitude",{"_index":33,"name":{"39":{}},"parent":{}}],["log",{"_index":3,"name":{"3":{}},"parent":{}}],["logger",{"_index":18,"name":{"18":{},"32":{}},"parent":{}}],["longitude",{"_index":34,"name":{"40":{}},"parent":{}}],["name",{"_index":12,"name":{"11":{}},"parent":{}}],["node",{"_index":9,"name":{"8":{}},"parent":{}}],["options",{"_index":13,"name":{"12":{},"19":{},"33":{}},"parent":{}}],["page",{"_index":7,"name":{"7":{}},"parent":{}}],["pagination",{"_index":24,"name":{"25":{}},"parent":{}}],["runprops",{"_index":19,"name":{"20":{}},"parent":{"21":{},"22":{},"23":{},"24":{},"25":{}}}],["scrape",{"_index":25,"name":{"26":{}},"parent":{}}],["scrapeprops",{"_index":26,"name":{"27":{}},"parent":{"28":{},"29":{},"30":{},"31":{},"32":{},"33":{}}}],["scraper",{"_index":1,"name":{"1":{}},"parent":{}}],["scraperesulttype",{"_index":28,"name":{"34":{}},"parent":{}}],["scraperoptions",{"_index":2,"name":{"2":{}},
"parent":{"3":{},"4":{},"5":{},"6":{}}}],["scraperoptions.__type",{"_index":8,"name":{},"parent":{"7":{},"8":{},"9":{}}}],["scraperprops",{"_index":11,"name":{"10":{}},"parent":{"11":{},"12":{}}}],["selector",{"_index":10,"name":{"9":{}},"parent":{}}],["selectorfunction",{"_index":16,"name":{"15":{}},"parent":{"16":{}}}],["selectormap",{"_index":15,"name":{"14":{}},"parent":{}}],["selectors",{"_index":23,"name":{"24":{},"31":{}},"parent":{}}],["selectorutilities",{"_index":30,"name":{"36":{}},"parent":{}}],["strict",{"_index":20,"name":{"21":{},"28":{}},"parent":{}}],["target",{"_index":22,"name":{"23":{},"29":{}},"parent":{}}]],"pipeline":[]}} \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index cc5485f..d8397a1 100644 --- a/docs/index.html +++ b/docs/index.html @@ -2,7 +2,7 @@

Papercut

-

NPM JavaScript Style Guide +

NPM codecov bundlephobia bundlephobia

@@ -78,7 +78,7 @@

Single page scraper

For this example, we are going to scrape the first page of Hacker News.

Set up a scraper instance and set the selectors using the utilities offered:

-
import { createScraper } from '@armand1m/papercut';

const main = async () => {
const scraper = createScraper({
name: `Hacker News`,
options: {
log: process.env.DEBUG === 'true',
cache: true,
}
});

const results = await scraper.run({
strict: true,
baseUrl: "https://news.ycombinator.com/",
target: ".athing",
selectors: {
rank: ({ text }) => text('.rank'),
name: ({ text }) => text('.titlelink'),
url: ({ href }) => href('.titlelink'),
score: ({ element }) => {
return element.nextElementSibling?.querySelector('.score')
?.textContent;
},
createdBy: ({ element }) => {
return element.nextElementSibling?.querySelector('.hnuser')
?.textContent;
},
createdAt: ({ element }) => {
return element.nextElementSibling
?.querySelector('.age')
?.getAttribute('title');
},
}
});

console.log(JSON.stringify(results, null, 2));
};

main(); +
import { createScraper } from '@armand1m/papercut';

const main = async () => {
const scraper = createScraper({
name: `Hacker News`,
options: {
log: process.env.DEBUG === 'true',
cache: true,
}
});

const results = await scraper.run({
strict: true,
baseUrl: "https://news.ycombinator.com/",
target: ".athing",
selectors: {
rank: (utils) => {
const value = utils.text('.rank').replace(/^\D+/g, '');
return Number(value);
},
name: ({ text }) => text('.titlelink'),
url: ({ href }) => href('.titlelink'),
score: ({ element }) => {
return element.nextElementSibling?.querySelector('.score')
?.textContent;
},
createdBy: ({ element }) => {
return element.nextElementSibling?.querySelector('.hnuser')
?.textContent;
},
createdAt: ({ element }) => {
return element.nextElementSibling
?.querySelector('.age')
?.getAttribute('title');
},
}
});

console.log(JSON.stringify(results, null, 2));
};

main();

Then run it using node or ts-node:

npx ts-node ./single-page-scraper.ts
@@ -89,12 +89,23 @@ 

Paginated scraper

For this example, because I live in Amsterdam, we are going to scrape the Amsterdam Coffeeshops website for all coffeeshops in Amsterdam.

Set up a scraper instance and set the selectors using the utilities offered:

-
import { createScraper } from '@armand1m/papercut';

const createLabeledUrl = (label: string, url: string) => ({ label, url });

const main = async () => {
const scraper = createScraper(
{
name: 'Amsterdam Coffeeshops',
options: {
cache: true,
},
},
);

const results = await scraper.run({
strict: true,
target: '.summary-box',
baseUrl: 'https://amsterdamcoffeeshops.com/search/item/coffeeshops',
pagination: {
enabled: true,
lastPageNumberSelector: '.navigation > .pagination > li:nth-child(8) > a',
createPaginatedUrl: (baseUrl, pageNumber) => {
return `${baseUrl}/p:${pageNumber}`;
},
},
selectors: {
name: ({ text }) => {
return text('.media-body > h3 > a');
},
description: ({ text }) => {
return text('.media-body > .summary-desc');
},
photo: ({ src }) => {
return { url: src('.media-left > a > img') };
},
phone: ({ text }) => {
return text('.media-right > .contact-info > mark > a');
},
address: ({ text }) => {
const address = text('.media-body > address > p');

if (!address) {
return undefined;
}

return address.replace(/\s+/g, ' ').replace(/^\s+|\s+$/g, '');
},
location: async (selectors, $this) => {
const address = $this.address(selectors, $this);
return selectors.geosearch(address);
},
social: ({ href }) => {
const websiteHref = href('.visit-website');
return websiteHref
? [createLabeledUrl('Official Website', websiteHref)]
: [];
},
menus: () => {
/** TODO: scrape menus */
return [];
},
badges: ({ all }) => {
const { asArray: badges } = all('.media-left > div > div > img');

return badges
.map((badge) => badge.getAttribute('title'))
.filter((badge) => badge !== undefined);
},
rating: ({ className }) => {
const rateNumber = className(
'.media-right > .summary-info > span > span'
);

if (!rateNumber) {
return 0;
}

return Number(rateNumber.replace('rate-', ''));
},
}
});

console.log(JSON.stringify(results, null, 2));
};

main(); +
import { createScraper } from '@armand1m/papercut';

const createLabeledUrl = (label: string, url: string) => ({ label, url });

const main = async () => {
const scraper = createScraper(
{
name: 'Amsterdam Coffeeshops',
options: {
cache: true,
},
},
);

const results = await scraper.run({
strict: true,
target: '.summary-box',
baseUrl: 'https://amsterdamcoffeeshops.com/search/item/coffeeshops',
pagination: {
enabled: true,
lastPageNumberSelector: '.navigation > .pagination > li:nth-child(8) > a',
createPaginatedUrl: (baseUrl, pageNumber) => {
return `${baseUrl}/p:${pageNumber}`;
},
},
selectors: {
name: ({ text }) => {
return text('.media-body > h3 > a');
},
description: ({ text }) => {
return text('.media-body > .summary-desc');
},
photo: ({ src }) => {
return { url: src('.media-left > a > img') };
},
phone: ({ text }) => {
return text('.media-right > .contact-info > mark > a');
},
address: ({ text }) => {
const address = text('.media-body > address > p');

if (!address) {
return undefined;
}

return address.replace(/\s+/g, ' ').replace(/^\s+|\s+$/g, '');
},
location: async (selectors, $this) => {
const address = $this.address(selectors, $this);
return selectors.geosearch(address);
},
social: ({ href }) => {
const websiteHref = href('.visit-website');
return websiteHref
? [createLabeledUrl('Official Website', websiteHref)]
: [];
},
menus: () => {
/** TODO: scrape menus */
return [];
},
badges: ({ all }) => {
const { asArray: badges } = all('.media-left > div > div > img');

return badges
.map((badge) => badge.getAttribute('title'))
.filter((badge) => badge !== undefined);
},
rating: ({ className }) => {
const rateNumber = className(
'.media-right > .summary-info > span > span'
);

if (!rateNumber) {
return 0;
}

return Number(rateNumber.replace('rate-', ''));
},
}
});

console.log(JSON.stringify(results, null, 2));
};

main();

Then run it using node or ts-node:

npx ts-node ./paginated-scraper.ts
 
+ +

Managed JSDOM

+
+

If you want to use your own JSDOM and Pino instances and configure them however you prefer, you can use the scrape function instead.

+

In the example below, we use the exposed createWindow and fetchPage utilities for convenience. You can also use the JSDOM constructor directly, or any other strategy to fetch your page HTML, as desired.

+
import pino from 'pino'
import { scrape, fetchPage, createWindow } from '@armand1m/papercut';

/**
 * Managed JSDOM example: scrapes `https://news.ycombinator.com/` with a
 * caller-owned pino logger and JSDOM window, then prints the results as
 * pretty-printed JSON.
 *
 * Because the window is created here rather than by papercut, this function
 * is also responsible for closing it, including when scraping fails.
 */
const main = async () => {
  // The logger is created with `enabled: false`.
  const logger = pino({
    name: 'Hacker News',
    enabled: false
  });

  const rawHTML = await fetchPage('https://news.ycombinator.com/');
  const window = createWindow(rawHTML);

  try {
    const results = await scrape({
      strict: true,
      logger,
      document: window.document,
      target: ".athing",
      selectors: {
        rank: (utils) => {
          // Strip any leading non-digit characters before converting to a number.
          const value = utils.text('.rank').replace(/^\D+/g, '');
          return Number(value);
        },
        name: ({ text }) => text('.titlelink'),
        url: ({ href }) => href('.titlelink'),
        // The selectors below read from the element that follows the target
        // node, not from the target node itself.
        score: ({ element }) => {
          return element.nextElementSibling?.querySelector('.score')
            ?.textContent;
        },
        createdBy: ({ element }) => {
          return element.nextElementSibling?.querySelector('.hnuser')
            ?.textContent;
        },
        createdAt: ({ element }) => {
          return element.nextElementSibling
            ?.querySelector('.age')
            ?.getAttribute('title');
        },
      },
      options: {
        log: false,
        cache: true,
        concurrency: {
          page: 2,
          node: 2,
          selector: 2
        }
      }
    });

    console.log(JSON.stringify(results, null, 2));
  } finally {
    // Always release the window: with `strict: true` the scrape call is
    // expected to reject when a selector fails, and the window would
    // otherwise never be closed.
    window.close();
  }
};

main(); +
+

Then run it using node or ts-node:

+
npx ts-node ./managed-jsdom.ts
+
+

API Reference

@@ -152,4 +163,4 @@

Contributors

Armando Magalhaes -

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file +

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file diff --git a/docs/interfaces/CreateRunnerProps.html b/docs/interfaces/CreateRunnerProps.html index 90ce9bf..5fe0399 100644 --- a/docs/interfaces/CreateRunnerProps.html +++ b/docs/interfaces/CreateRunnerProps.html @@ -1,6 +1,6 @@ -CreateRunnerProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface CreateRunnerProps

Hierarchy

  • CreateRunnerProps

Index

Properties

Properties

logger

logger: Logger
+CreateRunnerProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface CreateRunnerProps

Hierarchy

  • CreateRunnerProps

Index

Properties

Properties

logger

logger: Logger

A pino.Logger instance.

-

options

+

options

The scraper options. Use this to tweak log, cache and concurrency settings.

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file diff --git a/docs/interfaces/GeosearchResult.html b/docs/interfaces/GeosearchResult.html index 0a25aba..d6beb56 100644 --- a/docs/interfaces/GeosearchResult.html +++ b/docs/interfaces/GeosearchResult.html @@ -1 +1 @@ -GeosearchResult | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface GeosearchResult

Hierarchy

  • GeosearchResult

Index

Properties

latitude

latitude: number

longitude

longitude: number

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file +GeosearchResult | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface GeosearchResult

Hierarchy

  • GeosearchResult

Index

Properties

latitude

latitude: number

longitude

longitude: number

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file diff --git a/docs/interfaces/RunProps.html b/docs/interfaces/RunProps.html index 9b7ce32..0e639dd 100644 --- a/docs/interfaces/RunProps.html +++ b/docs/interfaces/RunProps.html @@ -1,7 +1,7 @@ -RunProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface RunProps<T, B>

Type parameters

Hierarchy

  • RunProps

Index

Properties

baseUrl

baseUrl: string
+RunProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface RunProps<T, B>

Type parameters

Hierarchy

  • RunProps

Index

Properties

baseUrl

baseUrl: string

The base url to start scraping off.

This page will be fetched, parsed and mounted in a virtual JSDOM instance.

-

Optional pagination

pagination?: PaginationOptions
+

Optional pagination

pagination?: PaginationOptions

Optional pagination feature.

If enabled and configured, this will make papercut fetch, parse, mount and scrape multiple pages based @@ -9,14 +9,14 @@

As long as you have a way to fetch the last page number from the page you're scraping, and use it as a query param in the page url, you should be fine.

-

selectors

selectors: T
+

selectors

selectors: T

The selectors to be used during the scraping process.

The result object will match the schema of the selectors.

-

strict

strict: B
+

strict

strict: B

If enabled, this will make Papercut scrape the page in strict mode. This means that in case a selector function fails, the entire scraping will be halted with an error.

When enabled, the result types will not expect undefined values.

-

target

target: string
+

target

target: string

The DOM selector for the target nodes to be scraped.

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file diff --git a/docs/interfaces/ScrapeProps.html b/docs/interfaces/ScrapeProps.html index c54067b..c232bdd 100644 --- a/docs/interfaces/ScrapeProps.html +++ b/docs/interfaces/ScrapeProps.html @@ -1 +1 @@ -ScrapeProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface ScrapeProps<T, B>

Type parameters

Hierarchy

  • ScrapeProps

Index

Properties

document

document: Document

logger

logger: Logger

options

selectors

selectors: T

strict

strict: B

target

target: string

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file +ScrapeProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface ScrapeProps<T, B>

Type parameters

Hierarchy

  • ScrapeProps

Index

Properties

document

document: Document

logger

logger: Logger

options

selectors

selectors: T

strict

strict: B

target

target: string

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file diff --git a/docs/interfaces/ScraperOptions.html b/docs/interfaces/ScraperOptions.html index 70b46a9..101cc1e 100644 --- a/docs/interfaces/ScraperOptions.html +++ b/docs/interfaces/ScraperOptions.html @@ -1,9 +1,9 @@ -ScraperOptions | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface ScraperOptions

Hierarchy

  • ScraperOptions

Index

Properties

cache

cache: boolean
+ScraperOptions | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface ScraperOptions

Hierarchy

  • ScraperOptions

Index

Properties

cache

cache: boolean

Enables HTML payload caching on the disk. Keep in mind that papercut will not clear the cache for you. When enabling this, it's your responsibility to deal with cache invalidation.

default

false

-

concurrency

concurrency: { node: number; page: number; selector: number }
+

concurrency

concurrency: { node: number; page: number; selector: number }

Concurrency settings.

Type declaration

  • node: number

    Amount of concurrent promises for node scraping.

    @@ -14,7 +14,7 @@
  • selector: number

    Amount of concurrent promises for selector scraping.

    default

    2

    -

log

log: boolean
+

log

log: boolean

Enables writing pino logs to the stdout.

default

process.env.DEBUG === "true"

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file diff --git a/docs/interfaces/ScraperProps.html b/docs/interfaces/ScraperProps.html index 5e34bb7..27f9853 100644 --- a/docs/interfaces/ScraperProps.html +++ b/docs/interfaces/ScraperProps.html @@ -1,7 +1,7 @@ -ScraperProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface ScraperProps

Hierarchy

  • ScraperProps

Index

Properties

Properties

name

name: string
+ScraperProps | @armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

Interface ScraperProps

Hierarchy

  • ScraperProps

Index

Properties

Properties

name

name: string

The scraper name. This will be used only for logging purposes.

-

Optional options

options?: Partial<ScraperOptions>
+

Optional options

options?: Partial<ScraperOptions>

The scraper options. Use this to tweak log, cache and concurrency settings.

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file diff --git a/docs/modules.html b/docs/modules.html index 6d9ebcf..43a03e1 100644 --- a/docs/modules.html +++ b/docs/modules.html @@ -1,12 +1,12 @@ -@armand1m/papercut
Options
All
  • Public
  • Public/Protected
  • All
Menu

@armand1m/papercut

Index

Type aliases

ScrapeResultType

ScrapeResultType<T, B>: B extends true ? { [ Prop in keyof T]: Awaited<ReturnType<T[Prop]>> } : { [ Prop in keyof T]?: Awaited<ReturnType<T[Prop]>> }

Type parameters

Scraper

Scraper: ReturnType<typeof createScraper>

SelectorFunction

SelectorFunction: (utils: SelectorUtilities, self: SelectorMap) => any

Type declaration

SelectorMap

SelectorMap: Record<string, SelectorFunction>

Map of selector functions.

This type is meant to be checked with an extended type, as users are going to implement a derived version of this for custom scrapers.

-

SelectorUtilities

SelectorUtilities: ReturnType<typeof createSelectorUtilities>

Functions

Const createRunner

SelectorUtilities

SelectorUtilities: ReturnType<typeof createSelectorUtilities>

Functions

Const createRunner

  • Creates a runner instance.

    This method is called by the createScraper function, but can also be externally used if needed to use an @@ -33,7 +33,7 @@

Parameters

  • props: RunProps<T, B>

    The scraping runner properties and selectors.

Returns Promise<ScrapeResultType<T, B>[]>

result Type-safe scraping results based on the given selectors and strict mode.

-

Const createScraper

Const createScraper

  • Creates a new scraper runner.

    This method is the papercut entrypoint. It will create a Scraper struct containing a runner that you can tweak @@ -63,7 +63,7 @@

Parameters

  • props: RunProps<T, B>

    The scraping runner properties and selectors.

Returns Promise<ScrapeResultType<T, B>[]>

result Type-safe scraping results based on the given selectors and strict mode.

-

Const createSelectorUtilities

  • createSelectorUtilities(element: Element): { all: (selector: string) => { asArray: Element[]; nodes: NodeListOf<Element> }; attr: (selector: string, attribute: string) => string; className: (selector: string) => string; createWindow: (htmlContent: string) => { close: () => void; document: Document; window: DOMWindow }; element: Element; fetchPage: (url: string) => Promise<string>; geosearch: (q: string, limit?: number) => Promise<GeosearchResult>; href: (selector: string) => string; mapNodeListToArray: (nodeList: NodeList) => Element[]; src: (selector: string) => string; text: (selector: string) => string }

Const createSelectorUtilities

  • createSelectorUtilities(element: Element): { all: (selector: string) => { asArray: Element[]; nodes: NodeListOf<Element> }; attr: (selector: string, attribute: string) => string; className: (selector: string) => string; createWindow: (htmlContent: string) => { close: () => void; document: Document; window: DOMWindow }; element: Element; fetchPage: (url: string) => Promise<string>; geosearch: (q: string, limit?: number) => Promise<GeosearchResult>; href: (selector: string) => string; mapNodeListToArray: (nodeList: NodeList) => Element[]; src: (selector: string) => string; text: (selector: string) => string }
  • This method creates the selector utilities provided to every selector function given to the scrape method.

    These utilities are meant to make the experience of @@ -74,7 +74,7 @@ fallback of an empty string, in case it fails to find the element or a specific property.

    At the same time, you also have direct access to the element from selector functions if needed for more complex tasks.

    -

    Parameters

    • element: Element

    Returns { all: (selector: string) => { asArray: Element[]; nodes: NodeListOf<Element> }; attr: (selector: string, attribute: string) => string; className: (selector: string) => string; createWindow: (htmlContent: string) => { close: () => void; document: Document; window: DOMWindow }; element: Element; fetchPage: (url: string) => Promise<string>; geosearch: (q: string, limit?: number) => Promise<GeosearchResult>; href: (selector: string) => string; mapNodeListToArray: (nodeList: NodeList) => Element[]; src: (selector: string) => string; text: (selector: string) => string }

    • all: (selector: string) => { asArray: Element[]; nodes: NodeListOf<Element> }
        • (selector: string): { asArray: Element[]; nodes: NodeListOf<Element> }
        • Parameters

          • selector: string

          Returns { asArray: Element[]; nodes: NodeListOf<Element> }

          • asArray: Element[]
          • nodes: NodeListOf<Element>
    • attr: (selector: string, attribute: string) => string
        • (selector: string, attribute: string): string
        • Parameters

          • selector: string
          • attribute: string

          Returns string

    • className: (selector: string) => string
        • (selector: string): string
        • Parameters

          • selector: string

          Returns string

    • createWindow: (htmlContent: string) => { close: () => void; document: Document; window: DOMWindow }
        • (htmlContent: string): { close: () => void; document: Document; window: DOMWindow }
        • Parameters

          • htmlContent: string

          Returns { close: () => void; document: Document; window: DOMWindow }

          • close: () => void
              • (): void
              • Returns void

          • document: Document
          • window: DOMWindow
    • element: Element
    • fetchPage: (url: string) => Promise<string>
        • (url: string): Promise<string>
        • Parameters

          • url: string

          Returns Promise<string>

    • geosearch: (q: string, limit?: number) => Promise<GeosearchResult>
    • href: (selector: string) => string
        • (selector: string): string
        • Parameters

          • selector: string

          Returns string

    • mapNodeListToArray: (nodeList: NodeList) => Element[]
        • (nodeList: NodeList): Element[]
        • Parameters

          • nodeList: NodeList

          Returns Element[]

    • src: (selector: string) => string
        • (selector: string): string
        • Parameters

          • selector: string

          Returns string

    • text: (selector: string) => string
        • (selector: string): string
        • Parameters

          • selector: string

          Returns string

Const geosearch

scrape

  • +

    Parameters

    • element: Element

    Returns { all: (selector: string) => { asArray: Element[]; nodes: NodeListOf<Element> }; attr: (selector: string, attribute: string) => string; className: (selector: string) => string; createWindow: (htmlContent: string) => { close: () => void; document: Document; window: DOMWindow }; element: Element; fetchPage: (url: string) => Promise<string>; geosearch: (q: string, limit?: number) => Promise<GeosearchResult>; href: (selector: string) => string; mapNodeListToArray: (nodeList: NodeList) => Element[]; src: (selector: string) => string; text: (selector: string) => string }

    • all: (selector: string) => { asArray: Element[]; nodes: NodeListOf<Element> }
        • (selector: string): { asArray: Element[]; nodes: NodeListOf<Element> }
        • Parameters

          • selector: string

          Returns { asArray: Element[]; nodes: NodeListOf<Element> }

          • asArray: Element[]
          • nodes: NodeListOf<Element>
    • attr: (selector: string, attribute: string) => string
        • (selector: string, attribute: string): string
        • Parameters

          • selector: string
          • attribute: string

          Returns string

    • className: (selector: string) => string
        • (selector: string): string
        • Parameters

          • selector: string

          Returns string

    • createWindow: (htmlContent: string) => { close: () => void; document: Document; window: DOMWindow }
        • (htmlContent: string): { close: () => void; document: Document; window: DOMWindow }
        • Parameters

          • htmlContent: string

          Returns { close: () => void; document: Document; window: DOMWindow }

          • close: () => void
              • (): void
              • Returns void

          • document: Document
          • window: DOMWindow
    • element: Element
    • fetchPage: (url: string) => Promise<string>
        • (url: string): Promise<string>
        • Parameters

          • url: string

          Returns Promise<string>

    • geosearch: (q: string, limit?: number) => Promise<GeosearchResult>
    • href: (selector: string) => string
        • (selector: string): string
        • Parameters

          • selector: string

          Returns string

    • mapNodeListToArray: (nodeList: NodeList) => Element[]
        • (nodeList: NodeList): Element[]
        • Parameters

          • nodeList: NodeList

          Returns Element[]

    • src: (selector: string) => string
        • (selector: string): string
        • Parameters

          • selector: string

          Returns string

    • text: (selector: string) => string
        • (selector: string): string
        • Parameters

          • selector: string

          Returns string

Const createWindow

  • createWindow(htmlContent: string): { close: () => void; document: Document; window: DOMWindow }
  • Parameters

    • htmlContent: string

    Returns { close: () => void; document: Document; window: DOMWindow }

    • close: () => void
        • (): void
        • Returns void

    • document: Document
    • window: DOMWindow

Const fetchPage

  • fetchPage(url: string): Promise<string>

Const geosearch

scrape

  • the scrape function

    this function will select all target nodes from the given document and spawn promise pools for @@ -91,4 +91,4 @@

    The strict mode boolean type. Used to tweak the scrape result type strictness.

Parameters

  • props: ScrapeProps<T, B>

    The scraping properties and selectors.

    -

Returns Promise<ScrapeResultType<T, B>[]>

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file +

Returns Promise<ScrapeResultType<T, B>[]>

Legend

  • Property

Settings

Theme

Generated using TypeDoc

\ No newline at end of file diff --git a/package.json b/package.json index 77549f6..19870f8 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "version": "2.0.4", + "version": "2.0.5", "license": "MIT", "main": "dist/index.js", "types": "dist/index.d.ts",