diff --git a/CHANGELOG.md b/CHANGELOG.md
index 81764d5..bcc9c66 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,10 @@
+## [2.0.5](https://github.com/armand1m/papercut/compare/v2.0.4...v2.0.5) (2021-11-15)
+
+
+### Bug Fixes
+
+* export utilities and add managed jsdom example ([132038b](https://github.com/armand1m/papercut/commit/132038bd46bf6386b168967925f0cadf8a906241))
+
## [2.0.4](https://github.com/armand1m/papercut/compare/v2.0.3...v2.0.4) (2021-11-15)
diff --git a/README.md b/README.md
index 39ce8ff..ad1cefb 100644
--- a/README.md
+++ b/README.md
@@ -97,7 +97,10 @@ const main = async () => {
baseUrl: "https://news.ycombinator.com/",
target: ".athing",
selectors: {
- rank: ({ text }) => text('.rank'),
+ rank: (utils) => {
+ const value = utils.text('.rank').replace(/^\D+/g, '');
+ return Number(value);
+ },
name: ({ text }) => text('.titlelink'),
url: ({ href }) => href('.titlelink'),
score: ({ element }) => {
@@ -229,6 +232,76 @@ Then run it using `node` or `ts-node`:
npx ts-node ./paginated-scraper.ts
```
+#### Managed JSDOM
+
+If you want to use your own JSDOM and Pino instances and configure them however you prefer, you can use the `scrape` function instead.
+
+In the example below, we use the exposed `createWindow` and `fetchPage` utilities for convenience. You can also use the JSDOM constructor directly, along with any other strategy for fetching your page HTML.
+
+```ts file=./examples/typescript/src/managed-jsdom/scraper.ts
+import pino from 'pino'
+import { scrape, fetchPage, createWindow } from '@armand1m/papercut';
+
+const main = async () => {
+ const logger = pino({
+ name: 'Hacker News',
+ enabled: false
+ });
+
+ const rawHTML = await fetchPage('https://news.ycombinator.com/')
+ const window = createWindow(rawHTML);
+
+ const results = await scrape({
+ strict: true,
+ logger,
+ document: window.document,
+ target: ".athing",
+ selectors: {
+ rank: (utils) => {
+ const value = utils.text('.rank').replace(/^\D+/g, '');
+ return Number(value);
+ },
+ name: ({ text }) => text('.titlelink'),
+ url: ({ href }) => href('.titlelink'),
+ score: ({ element }) => {
+ return element.nextElementSibling?.querySelector('.score')
+ ?.textContent;
+ },
+ createdBy: ({ element }) => {
+ return element.nextElementSibling?.querySelector('.hnuser')
+ ?.textContent;
+ },
+ createdAt: ({ element }) => {
+ return element.nextElementSibling
+ ?.querySelector('.age')
+ ?.getAttribute('title');
+ },
+ },
+ options: {
+ log: false,
+ cache: true,
+ concurrency: {
+ page: 2,
+ node: 2,
+ selector: 2
+ }
+ }
+ });
+
+ window.close();
+
+ console.log(JSON.stringify(results, null, 2));
+};
+
+main();
+```
+
+Then run it using `node` or `ts-node`:
+
+```sh
+npx ts-node ./managed-jsdom.ts
+```
+
## API Reference
[Click here to open the API reference.](https://armand1m.github.io/papercut)
diff --git a/docs/assets/highlight.css b/docs/assets/highlight.css
index d04c09f..1ed678f 100644
--- a/docs/assets/highlight.css
+++ b/docs/assets/highlight.css
@@ -13,16 +13,16 @@
--dark-hl-5: #569CD6;
--light-hl-6: #0070C1;
--dark-hl-6: #4FC1FF;
- --light-hl-7: #267F99;
- --dark-hl-7: #4EC9B0;
- --light-hl-8: #098658;
- --dark-hl-8: #B5CEA8;
- --light-hl-9: #811F3F;
- --dark-hl-9: #D16969;
- --light-hl-10: #000000;
- --dark-hl-10: #D7BA7D;
- --light-hl-11: #EE0000;
- --dark-hl-11: #DCDCAA;
+ --light-hl-7: #811F3F;
+ --dark-hl-7: #D16969;
+ --light-hl-8: #EE0000;
+ --dark-hl-8: #DCDCAA;
+ --light-hl-9: #000000;
+ --dark-hl-9: #D7BA7D;
+ --light-hl-10: #267F99;
+ --dark-hl-10: #4EC9B0;
+ --light-hl-11: #098658;
+ --dark-hl-11: #B5CEA8;
--light-hl-12: #008000;
--dark-hl-12: #6A9955;
--light-code-background: #FFFFFF;
diff --git a/docs/assets/search.js b/docs/assets/search.js
index 2f329d9..1a4f5ca 100644
--- a/docs/assets/search.js
+++ b/docs/assets/search.js
@@ -1 +1 @@
-window.searchData = {"kinds":{"64":"Function","256":"Interface","1024":"Property","65536":"Type literal","4194304":"Type alias"},"rows":[{"id":0,"kind":64,"name":"createScraper","url":"modules.html#createScraper","classes":"tsd-kind-function"},{"id":1,"kind":4194304,"name":"Scraper","url":"modules.html#Scraper","classes":"tsd-kind-type-alias"},{"id":2,"kind":256,"name":"ScraperOptions","url":"interfaces/ScraperOptions.html","classes":"tsd-kind-interface"},{"id":3,"kind":1024,"name":"log","url":"interfaces/ScraperOptions.html#log","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":4,"kind":1024,"name":"cache","url":"interfaces/ScraperOptions.html#cache","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":5,"kind":1024,"name":"concurrency","url":"interfaces/ScraperOptions.html#concurrency","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":6,"kind":65536,"name":"__type","url":"interfaces/ScraperOptions.html#__type","classes":"tsd-kind-type-literal tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":7,"kind":1024,"name":"page","url":"interfaces/ScraperOptions.html#__type.page","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":8,"kind":1024,"name":"node","url":"interfaces/ScraperOptions.html#__type.node","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":9,"kind":1024,"name":"selector","url":"interfaces/ScraperOptions.html#__type.selector","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":10,"kind":256,"name":"ScraperProps","url":"interfaces/ScraperProps.html","classes":"tsd-kind-interface"},{"id":11,"kind":1024,"name":"name","url":"interfaces/ScraperProps.html#name","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"ScraperProps"},{"id":12,"kind":1024,"name":"options","url":"interfaces/ScraperProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperProps"},{"id":13,"kind":64,"name":"createRunner","url":"modules.html#createRunner","classes":"tsd-kind-function"},{"id":14,"kind":4194304,"name":"SelectorMap","url":"modules.html#SelectorMap","classes":"tsd-kind-type-alias"},{"id":15,"kind":4194304,"name":"SelectorFunction","url":"modules.html#SelectorFunction","classes":"tsd-kind-type-alias"},{"id":16,"kind":65536,"name":"__type","url":"modules.html#SelectorFunction.__type","classes":"tsd-kind-type-literal tsd-parent-kind-type-alias","parent":"SelectorFunction"},{"id":17,"kind":256,"name":"CreateRunnerProps","url":"interfaces/CreateRunnerProps.html","classes":"tsd-kind-interface"},{"id":18,"kind":1024,"name":"logger","url":"interfaces/CreateRunnerProps.html#logger","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"CreateRunnerProps"},{"id":19,"kind":1024,"name":"options","url":"interfaces/CreateRunnerProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"CreateRunnerProps"},{"id":20,"kind":256,"name":"RunProps","url":"interfaces/RunProps.html","classes":"tsd-kind-interface tsd-has-type-parameter"},{"id":21,"kind":1024,"name":"strict","url":"interfaces/RunProps.html#strict","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":22,"kind":1024,"name":"baseUrl","url":"interfaces/RunProps.html#baseUrl","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":23,"kind":1024,"name":"target","url":"interfaces/RunProps.html#target","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":24,"kind":1024,"name":"selectors","url":"interfaces/RunProps.html#selectors","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"RunProps"},{"id":25,"kind":1024,"name":"pagination","url":"interfaces/RunProps.html#pagination","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":26,"kind":64,"name":"scrape","url":"modules.html#scrape","classes":"tsd-kind-function tsd-has-type-parameter"},{"id":27,"kind":256,"name":"ScrapeProps","url":"interfaces/ScrapeProps.html","classes":"tsd-kind-interface tsd-has-type-parameter"},{"id":28,"kind":1024,"name":"strict","url":"interfaces/ScrapeProps.html#strict","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":29,"kind":1024,"name":"target","url":"interfaces/ScrapeProps.html#target","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":30,"kind":1024,"name":"document","url":"interfaces/ScrapeProps.html#document","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":31,"kind":1024,"name":"selectors","url":"interfaces/ScrapeProps.html#selectors","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":32,"kind":1024,"name":"logger","url":"interfaces/ScrapeProps.html#logger","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":33,"kind":1024,"name":"options","url":"interfaces/ScrapeProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":34,"kind":4194304,"name":"ScrapeResultType","url":"modules.html#ScrapeResultType","classes":"tsd-kind-type-alias 
tsd-has-type-parameter"},{"id":35,"kind":64,"name":"createSelectorUtilities","url":"modules.html#createSelectorUtilities","classes":"tsd-kind-function"},{"id":36,"kind":4194304,"name":"SelectorUtilities","url":"modules.html#SelectorUtilities","classes":"tsd-kind-type-alias"},{"id":37,"kind":64,"name":"geosearch","url":"modules.html#geosearch","classes":"tsd-kind-function"},{"id":38,"kind":256,"name":"GeosearchResult","url":"interfaces/GeosearchResult.html","classes":"tsd-kind-interface"},{"id":39,"kind":1024,"name":"latitude","url":"interfaces/GeosearchResult.html#latitude","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"GeosearchResult"},{"id":40,"kind":1024,"name":"longitude","url":"interfaces/GeosearchResult.html#longitude","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"GeosearchResult"}],"index":{"version":"2.3.9","fields":["name","parent"],"fieldVectors":[["name/0",[0,33.322]],["parent/0",[]],["name/1",[1,33.322]],["parent/1",[]],["name/2",[2,20.329]],["parent/2",[]],["name/3",[3,33.322]],["parent/3",[2,1.611]],["name/4",[4,33.322]],["parent/4",[2,1.611]],["name/5",[5,33.322]],["parent/5",[2,1.611]],["name/6",[6,28.214]],["parent/6",[2,1.611]],["name/7",[7,33.322]],["parent/7",[8,1.969]],["name/8",[9,33.322]],["parent/8",[8,1.969]],["name/9",[10,33.322]],["parent/9",[8,1.969]],["name/10",[11,24.849]],["parent/10",[]],["name/11",[12,33.322]],["parent/11",[11,1.969]],["name/12",[13,24.849]],["parent/12",[11,1.969]],["name/13",[14,33.322]],["parent/13",[]],["name/14",[15,33.322]],["parent/14",[]],["name/15",[16,28.214]],["parent/15",[]],["name/16",[6,28.214]],["parent/16",[16,2.236]],["name/17",[17,24.849]],["parent/17",[]],["name/18",[18,28.214]],["parent/18",[17,1.969]],["name/19",[13,24.849]],["parent/19",[17,1.969]],["name/20",[19,18.659]],["parent/20",[]],["name/21",[20,28.214]],["parent/21",[19,1.479]],["name/22",[21,33.322]],["parent/22",[19,1.479]],["name/23",[22,28.214]],["parent/23",[19,1.479]],["name/24",[23,28.214]]
,["parent/24",[19,1.479]],["name/25",[24,33.322]],["parent/25",[19,1.479]],["name/26",[25,33.322]],["parent/26",[]],["name/27",[26,17.228]],["parent/27",[]],["name/28",[20,28.214]],["parent/28",[26,1.365]],["name/29",[22,28.214]],["parent/29",[26,1.365]],["name/30",[27,33.322]],["parent/30",[26,1.365]],["name/31",[23,28.214]],["parent/31",[26,1.365]],["name/32",[18,28.214]],["parent/32",[26,1.365]],["name/33",[13,24.849]],["parent/33",[26,1.365]],["name/34",[28,33.322]],["parent/34",[]],["name/35",[29,33.322]],["parent/35",[]],["name/36",[30,33.322]],["parent/36",[]],["name/37",[31,33.322]],["parent/37",[]],["name/38",[32,24.849]],["parent/38",[]],["name/39",[33,33.322]],["parent/39",[32,1.969]],["name/40",[34,33.322]],["parent/40",[32,1.969]]],"invertedIndex":[["__type",{"_index":6,"name":{"6":{},"16":{}},"parent":{}}],["baseurl",{"_index":21,"name":{"22":{}},"parent":{}}],["cache",{"_index":4,"name":{"4":{}},"parent":{}}],["concurrency",{"_index":5,"name":{"5":{}},"parent":{}}],["createrunner",{"_index":14,"name":{"13":{}},"parent":{}}],["createrunnerprops",{"_index":17,"name":{"17":{}},"parent":{"18":{},"19":{}}}],["createscraper",{"_index":0,"name":{"0":{}},"parent":{}}],["createselectorutilities",{"_index":29,"name":{"35":{}},"parent":{}}],["document",{"_index":27,"name":{"30":{}},"parent":{}}],["geosearch",{"_index":31,"name":{"37":{}},"parent":{}}],["geosearchresult",{"_index":32,"name":{"38":{}},"parent":{"39":{},"40":{}}}],["latitude",{"_index":33,"name":{"39":{}},"parent":{}}],["log",{"_index":3,"name":{"3":{}},"parent":{}}],["logger",{"_index":18,"name":{"18":{},"32":{}},"parent":{}}],["longitude",{"_index":34,"name":{"40":{}},"parent":{}}],["name",{"_index":12,"name":{"11":{}},"parent":{}}],["node",{"_index":9,"name":{"8":{}},"parent":{}}],["options",{"_index":13,"name":{"12":{},"19":{},"33":{}},"parent":{}}],["page",{"_index":7,"name":{"7":{}},"parent":{}}],["pagination",{"_index":24,"name":{"25":{}},"parent":{}}],["runprops",{"_index":19,"name":{"20":{
}},"parent":{"21":{},"22":{},"23":{},"24":{},"25":{}}}],["scrape",{"_index":25,"name":{"26":{}},"parent":{}}],["scrapeprops",{"_index":26,"name":{"27":{}},"parent":{"28":{},"29":{},"30":{},"31":{},"32":{},"33":{}}}],["scraper",{"_index":1,"name":{"1":{}},"parent":{}}],["scraperesulttype",{"_index":28,"name":{"34":{}},"parent":{}}],["scraperoptions",{"_index":2,"name":{"2":{}},"parent":{"3":{},"4":{},"5":{},"6":{}}}],["scraperoptions.__type",{"_index":8,"name":{},"parent":{"7":{},"8":{},"9":{}}}],["scraperprops",{"_index":11,"name":{"10":{}},"parent":{"11":{},"12":{}}}],["selector",{"_index":10,"name":{"9":{}},"parent":{}}],["selectorfunction",{"_index":16,"name":{"15":{}},"parent":{"16":{}}}],["selectormap",{"_index":15,"name":{"14":{}},"parent":{}}],["selectors",{"_index":23,"name":{"24":{},"31":{}},"parent":{}}],["selectorutilities",{"_index":30,"name":{"36":{}},"parent":{}}],["strict",{"_index":20,"name":{"21":{},"28":{}},"parent":{}}],["target",{"_index":22,"name":{"23":{},"29":{}},"parent":{}}]],"pipeline":[]}}
\ No newline at end of file
+window.searchData = {"kinds":{"64":"Function","256":"Interface","1024":"Property","65536":"Type literal","4194304":"Type alias"},"rows":[{"id":0,"kind":64,"name":"createScraper","url":"modules.html#createScraper","classes":"tsd-kind-function"},{"id":1,"kind":4194304,"name":"Scraper","url":"modules.html#Scraper","classes":"tsd-kind-type-alias"},{"id":2,"kind":256,"name":"ScraperOptions","url":"interfaces/ScraperOptions.html","classes":"tsd-kind-interface"},{"id":3,"kind":1024,"name":"log","url":"interfaces/ScraperOptions.html#log","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":4,"kind":1024,"name":"cache","url":"interfaces/ScraperOptions.html#cache","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":5,"kind":1024,"name":"concurrency","url":"interfaces/ScraperOptions.html#concurrency","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":6,"kind":65536,"name":"__type","url":"interfaces/ScraperOptions.html#__type","classes":"tsd-kind-type-literal tsd-parent-kind-interface","parent":"ScraperOptions"},{"id":7,"kind":1024,"name":"page","url":"interfaces/ScraperOptions.html#__type.page","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":8,"kind":1024,"name":"node","url":"interfaces/ScraperOptions.html#__type.node","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":9,"kind":1024,"name":"selector","url":"interfaces/ScraperOptions.html#__type.selector","classes":"tsd-kind-property tsd-parent-kind-type-literal","parent":"ScraperOptions.__type"},{"id":10,"kind":256,"name":"ScraperProps","url":"interfaces/ScraperProps.html","classes":"tsd-kind-interface"},{"id":11,"kind":1024,"name":"name","url":"interfaces/ScraperProps.html#name","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"ScraperProps"},{"id":12,"kind":1024,"name":"options","url":"interfaces/ScraperProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScraperProps"},{"id":13,"kind":64,"name":"createRunner","url":"modules.html#createRunner","classes":"tsd-kind-function"},{"id":14,"kind":4194304,"name":"SelectorMap","url":"modules.html#SelectorMap","classes":"tsd-kind-type-alias"},{"id":15,"kind":4194304,"name":"SelectorFunction","url":"modules.html#SelectorFunction","classes":"tsd-kind-type-alias"},{"id":16,"kind":65536,"name":"__type","url":"modules.html#SelectorFunction.__type","classes":"tsd-kind-type-literal tsd-parent-kind-type-alias","parent":"SelectorFunction"},{"id":17,"kind":256,"name":"CreateRunnerProps","url":"interfaces/CreateRunnerProps.html","classes":"tsd-kind-interface"},{"id":18,"kind":1024,"name":"logger","url":"interfaces/CreateRunnerProps.html#logger","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"CreateRunnerProps"},{"id":19,"kind":1024,"name":"options","url":"interfaces/CreateRunnerProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"CreateRunnerProps"},{"id":20,"kind":256,"name":"RunProps","url":"interfaces/RunProps.html","classes":"tsd-kind-interface tsd-has-type-parameter"},{"id":21,"kind":1024,"name":"strict","url":"interfaces/RunProps.html#strict","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":22,"kind":1024,"name":"baseUrl","url":"interfaces/RunProps.html#baseUrl","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":23,"kind":1024,"name":"target","url":"interfaces/RunProps.html#target","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":24,"kind":1024,"name":"selectors","url":"interfaces/RunProps.html#selectors","classes":"tsd-kind-property 
tsd-parent-kind-interface","parent":"RunProps"},{"id":25,"kind":1024,"name":"pagination","url":"interfaces/RunProps.html#pagination","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"RunProps"},{"id":26,"kind":64,"name":"scrape","url":"modules.html#scrape","classes":"tsd-kind-function tsd-has-type-parameter"},{"id":27,"kind":256,"name":"ScrapeProps","url":"interfaces/ScrapeProps.html","classes":"tsd-kind-interface tsd-has-type-parameter"},{"id":28,"kind":1024,"name":"strict","url":"interfaces/ScrapeProps.html#strict","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":29,"kind":1024,"name":"target","url":"interfaces/ScrapeProps.html#target","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":30,"kind":1024,"name":"document","url":"interfaces/ScrapeProps.html#document","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":31,"kind":1024,"name":"selectors","url":"interfaces/ScrapeProps.html#selectors","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":32,"kind":1024,"name":"logger","url":"interfaces/ScrapeProps.html#logger","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":33,"kind":1024,"name":"options","url":"interfaces/ScrapeProps.html#options","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"ScrapeProps"},{"id":34,"kind":4194304,"name":"ScrapeResultType","url":"modules.html#ScrapeResultType","classes":"tsd-kind-type-alias 
tsd-has-type-parameter"},{"id":35,"kind":64,"name":"createSelectorUtilities","url":"modules.html#createSelectorUtilities","classes":"tsd-kind-function"},{"id":36,"kind":4194304,"name":"SelectorUtilities","url":"modules.html#SelectorUtilities","classes":"tsd-kind-type-alias"},{"id":37,"kind":64,"name":"geosearch","url":"modules.html#geosearch","classes":"tsd-kind-function"},{"id":38,"kind":256,"name":"GeosearchResult","url":"interfaces/GeosearchResult.html","classes":"tsd-kind-interface"},{"id":39,"kind":1024,"name":"latitude","url":"interfaces/GeosearchResult.html#latitude","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"GeosearchResult"},{"id":40,"kind":1024,"name":"longitude","url":"interfaces/GeosearchResult.html#longitude","classes":"tsd-kind-property tsd-parent-kind-interface","parent":"GeosearchResult"},{"id":41,"kind":64,"name":"fetchPage","url":"modules.html#fetchPage","classes":"tsd-kind-function"},{"id":42,"kind":64,"name":"createWindow","url":"modules.html#createWindow","classes":"tsd-kind-function"}],"index":{"version":"2.3.9","fields":["name","parent"],"fieldVectors":[["name/0",[0,33.787]],["parent/0",[]],["name/1",[1,33.787]],["parent/1",[]],["name/2",[2,20.794]],["parent/2",[]],["name/3",[3,33.787]],["parent/3",[2,1.606]],["name/4",[4,33.787]],["parent/4",[2,1.606]],["name/5",[5,33.787]],["parent/5",[2,1.606]],["name/6",[6,28.679]],["parent/6",[2,1.606]],["name/7",[7,33.787]],["parent/7",[8,1.955]],["name/8",[9,33.787]],["parent/8",[8,1.955]],["name/9",[10,33.787]],["parent/9",[8,1.955]],["name/10",[11,25.314]],["parent/10",[]],["name/11",[12,33.787]],["parent/11",[11,1.955]],["name/12",[13,25.314]],["parent/12",[11,1.955]],["name/13",[14,33.787]],["parent/13",[]],["name/14",[15,33.787]],["parent/14",[]],["name/15",[16,28.679]],["parent/15",[]],["name/16",[6,28.679]],["parent/16",[16,2.215]],["name/17",[17,25.314]],["parent/17",[]],["name/18",[18,28.679]],["parent/18",[17,1.955]],["name/19",[13,25.314]],["parent/19",[17,1.955]],["nam
e/20",[19,19.124]],["parent/20",[]],["name/21",[20,28.679]],["parent/21",[19,1.477]],["name/22",[21,33.787]],["parent/22",[19,1.477]],["name/23",[22,28.679]],["parent/23",[19,1.477]],["name/24",[23,28.679]],["parent/24",[19,1.477]],["name/25",[24,33.787]],["parent/25",[19,1.477]],["name/26",[25,33.787]],["parent/26",[]],["name/27",[26,17.693]],["parent/27",[]],["name/28",[20,28.679]],["parent/28",[26,1.367]],["name/29",[22,28.679]],["parent/29",[26,1.367]],["name/30",[27,33.787]],["parent/30",[26,1.367]],["name/31",[23,28.679]],["parent/31",[26,1.367]],["name/32",[18,28.679]],["parent/32",[26,1.367]],["name/33",[13,25.314]],["parent/33",[26,1.367]],["name/34",[28,33.787]],["parent/34",[]],["name/35",[29,33.787]],["parent/35",[]],["name/36",[30,33.787]],["parent/36",[]],["name/37",[31,33.787]],["parent/37",[]],["name/38",[32,25.314]],["parent/38",[]],["name/39",[33,33.787]],["parent/39",[32,1.955]],["name/40",[34,33.787]],["parent/40",[32,1.955]],["name/41",[35,33.787]],["parent/41",[]],["name/42",[36,33.787]],["parent/42",[]]],"invertedIndex":[["__type",{"_index":6,"name":{"6":{},"16":{}},"parent":{}}],["baseurl",{"_index":21,"name":{"22":{}},"parent":{}}],["cache",{"_index":4,"name":{"4":{}},"parent":{}}],["concurrency",{"_index":5,"name":{"5":{}},"parent":{}}],["createrunner",{"_index":14,"name":{"13":{}},"parent":{}}],["createrunnerprops",{"_index":17,"name":{"17":{}},"parent":{"18":{},"19":{}}}],["createscraper",{"_index":0,"name":{"0":{}},"parent":{}}],["createselectorutilities",{"_index":29,"name":{"35":{}},"parent":{}}],["createwindow",{"_index":36,"name":{"42":{}},"parent":{}}],["document",{"_index":27,"name":{"30":{}},"parent":{}}],["fetchpage",{"_index":35,"name":{"41":{}},"parent":{}}],["geosearch",{"_index":31,"name":{"37":{}},"parent":{}}],["geosearchresult",{"_index":32,"name":{"38":{}},"parent":{"39":{},"40":{}}}],["latitude",{"_index":33,"name":{"39":{}},"parent":{}}],["log",{"_index":3,"name":{"3":{}},"parent":{}}],["logger",{"_index":18,"name":{"18
":{},"32":{}},"parent":{}}],["longitude",{"_index":34,"name":{"40":{}},"parent":{}}],["name",{"_index":12,"name":{"11":{}},"parent":{}}],["node",{"_index":9,"name":{"8":{}},"parent":{}}],["options",{"_index":13,"name":{"12":{},"19":{},"33":{}},"parent":{}}],["page",{"_index":7,"name":{"7":{}},"parent":{}}],["pagination",{"_index":24,"name":{"25":{}},"parent":{}}],["runprops",{"_index":19,"name":{"20":{}},"parent":{"21":{},"22":{},"23":{},"24":{},"25":{}}}],["scrape",{"_index":25,"name":{"26":{}},"parent":{}}],["scrapeprops",{"_index":26,"name":{"27":{}},"parent":{"28":{},"29":{},"30":{},"31":{},"32":{},"33":{}}}],["scraper",{"_index":1,"name":{"1":{}},"parent":{}}],["scraperesulttype",{"_index":28,"name":{"34":{}},"parent":{}}],["scraperoptions",{"_index":2,"name":{"2":{}},"parent":{"3":{},"4":{},"5":{},"6":{}}}],["scraperoptions.__type",{"_index":8,"name":{},"parent":{"7":{},"8":{},"9":{}}}],["scraperprops",{"_index":11,"name":{"10":{}},"parent":{"11":{},"12":{}}}],["selector",{"_index":10,"name":{"9":{}},"parent":{}}],["selectorfunction",{"_index":16,"name":{"15":{}},"parent":{"16":{}}}],["selectormap",{"_index":15,"name":{"14":{}},"parent":{}}],["selectors",{"_index":23,"name":{"24":{},"31":{}},"parent":{}}],["selectorutilities",{"_index":30,"name":{"36":{}},"parent":{}}],["strict",{"_index":20,"name":{"21":{},"28":{}},"parent":{}}],["target",{"_index":22,"name":{"23":{},"29":{}},"parent":{}}]],"pipeline":[]}}
\ No newline at end of file
diff --git a/docs/index.html b/docs/index.html
index cc5485f..d8397a1 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -2,7 +2,7 @@
Papercut
-
For this example, we gonna scrape Hacker News first page.
Setup a scraper instance and set the selectors using the utilities offered:
-import { createScraper } from '@armand1m/papercut';
const main = async () => {
const scraper = createScraper({
name: `Hacker News`,
options: {
log: process.env.DEBUG === 'true',
cache: true,
}
});
const results = await scraper.run({
strict: true,
baseUrl: "https://news.ycombinator.com/",
target: ".athing",
selectors: {
rank: ({ text }) => text('.rank'),
name: ({ text }) => text('.titlelink'),
url: ({ href }) => href('.titlelink'),
score: ({ element }) => {
return element.nextElementSibling?.querySelector('.score')
?.textContent;
},
createdBy: ({ element }) => {
return element.nextElementSibling?.querySelector('.hnuser')
?.textContent;
},
createdAt: ({ element }) => {
return element.nextElementSibling
?.querySelector('.age')
?.getAttribute('title');
},
}
});
console.log(JSON.stringify(results, null, 2));
};
main();
+import { createScraper } from '@armand1m/papercut';
const main = async () => {
const scraper = createScraper({
name: `Hacker News`,
options: {
log: process.env.DEBUG === 'true',
cache: true,
}
});
const results = await scraper.run({
strict: true,
baseUrl: "https://news.ycombinator.com/",
target: ".athing",
selectors: {
rank: (utils) => {
const value = utils.text('.rank').replace(/^\D+/g, '');
return Number(value);
},
name: ({ text }) => text('.titlelink'),
url: ({ href }) => href('.titlelink'),
score: ({ element }) => {
return element.nextElementSibling?.querySelector('.score')
?.textContent;
},
createdBy: ({ element }) => {
return element.nextElementSibling?.querySelector('.hnuser')
?.textContent;
},
createdAt: ({ element }) => {
return element.nextElementSibling
?.querySelector('.age')
?.getAttribute('title');
},
}
});
console.log(JSON.stringify(results, null, 2));
};
main();
Then run it using node
or ts-node
:
npx ts-node ./single-page-scraper.ts
@@ -89,12 +89,23 @@ Paginated scraper
For this example, because I live in Amsterdam, we gonna scrape the Amsterdam Coffeeshops website for all coffeeshops in Amsterdam.
Setup a scraper instance and set the selectors using the utilities offered:
-import { createScraper } from '@armand1m/papercut';
const createLabeledUrl = (label: string, url: string) => ({ label, url });
const main = async () => {
const scraper = createScraper(
{
name: 'Amsterdam Coffeeshops',
options: {
cache: true,
},
},
);
const results = await scraper.run({
strict: true,
target: '.summary-box',
baseUrl: 'https://amsterdamcoffeeshops.com/search/item/coffeeshops',
pagination: {
enabled: true,
lastPageNumberSelector: '.navigation > .pagination > li:nth-child(8) > a',
createPaginatedUrl: (baseUrl, pageNumber) => {
return `${baseUrl}/p:${pageNumber}`;
},
},
selectors: {
name: ({ text }) => {
return text('.media-body > h3 > a');
},
description: ({ text }) => {
return text('.media-body > .summary-desc');
},
photo: ({ src }) => {
return { url: src('.media-left > a > img') };
},
phone: ({ text }) => {
return text('.media-right > .contact-info > mark > a');
},
address: ({ text }) => {
const address = text('.media-body > address > p');
if (!address) {
return undefined;
}
return address.replace(/\s+/g, ' ').replace(/^\s+|\s+$/g, '');
},
location: async (selectors, $this) => {
const address = $this.address(selectors, $this);
return selectors.geosearch(address);
},
social: ({ href }) => {
const websiteHref = href('.visit-website');
return websiteHref
? [createLabeledUrl('Official Website', websiteHref)]
: [];
},
menus: () => {
/** TODO: scrape menus */
return [];
},
badges: ({ all }) => {
const { asArray: badges } = all('.media-left > div > div > img');
return badges
.map((badge) => badge.getAttribute('title'))
.filter((badge) => badge !== undefined);
},
rating: ({ className }) => {
const rateNumber = className(
'.media-right > .summary-info > span > span'
);
if (!rateNumber) {
return 0;
}
return Number(rateNumber.replace('rate-', ''));
},
}
});
console.log(JSON.stringify(results, null, 2));
};
main();
+import { createScraper } from '@armand1m/papercut';
const createLabeledUrl = (label: string, url: string) => ({ label, url });
const main = async () => {
const scraper = createScraper(
{
name: 'Amsterdam Coffeeshops',
options: {
cache: true,
},
},
);
const results = await scraper.run({
strict: true,
target: '.summary-box',
baseUrl: 'https://amsterdamcoffeeshops.com/search/item/coffeeshops',
pagination: {
enabled: true,
lastPageNumberSelector: '.navigation > .pagination > li:nth-child(8) > a',
createPaginatedUrl: (baseUrl, pageNumber) => {
return `${baseUrl}/p:${pageNumber}`;
},
},
selectors: {
name: ({ text }) => {
return text('.media-body > h3 > a');
},
description: ({ text }) => {
return text('.media-body > .summary-desc');
},
photo: ({ src }) => {
return { url: src('.media-left > a > img') };
},
phone: ({ text }) => {
return text('.media-right > .contact-info > mark > a');
},
address: ({ text }) => {
const address = text('.media-body > address > p');
if (!address) {
return undefined;
}
return address.replace(/\s+/g, ' ').replace(/^\s+|\s+$/g, '');
},
location: async (selectors, $this) => {
const address = $this.address(selectors, $this);
return selectors.geosearch(address);
},
social: ({ href }) => {
const websiteHref = href('.visit-website');
return websiteHref
? [createLabeledUrl('Official Website', websiteHref)]
: [];
},
menus: () => {
/** TODO: scrape menus */
return [];
},
badges: ({ all }) => {
const { asArray: badges } = all('.media-left > div > div > img');
return badges
.map((badge) => badge.getAttribute('title'))
.filter((badge) => badge !== undefined);
},
rating: ({ className }) => {
const rateNumber = className(
'.media-right > .summary-info > span > span'
);
if (!rateNumber) {
return 0;
}
return Number(rateNumber.replace('rate-', ''));
},
}
});
console.log(JSON.stringify(results, null, 2));
};
main();
Then run it using node
or ts-node
:
npx ts-node ./paginated-scraper.ts
+
+ Managed JSDOM
+
+In case you want to use your own JSDOM and Pino instance and tweak/configure as much as you prefer, you can use the scrape
function instead.
+In the example below, we use the exposed createWindow
and fetchPage
utilities for convenience. You can use JSDOM constructor directly and any other strategy to fetch your page HTML as desired.
+import pino from 'pino'
import { scrape, fetchPage, createWindow } from '@armand1m/papercut';
const main = async () => {
const logger = pino({
name: 'Hacker News',
enabled: false
});
const rawHTML = await fetchPage('https://news.ycombinator.com/')
const window = createWindow(rawHTML);
const results = await scrape({
strict: true,
logger,
document: window.document,
target: ".athing",
selectors: {
rank: (utils) => {
const value = utils.text('.rank').replace(/^\D+/g, '');
return Number(value);
},
name: ({ text }) => text('.titlelink'),
url: ({ href }) => href('.titlelink'),
score: ({ element }) => {
return element.nextElementSibling?.querySelector('.score')
?.textContent;
},
createdBy: ({ element }) => {
return element.nextElementSibling?.querySelector('.hnuser')
?.textContent;
},
createdAt: ({ element }) => {
return element.nextElementSibling
?.querySelector('.age')
?.getAttribute('title');
},
},
options: {
log: false,
cache: true,
concurrency: {
page: 2,
node: 2,
selector: 2
}
}
});
window.close();
console.log(JSON.stringify(results, null, 2));
};
main();
+
+Then run it using node
or ts-node
:
+npx ts-node ./managed-jsdom.ts
+
+
API Reference
@@ -152,4 +163,4 @@ Contributors
Armando Magalhaes
-Generated using TypeDoc