diff --git a/.babelrc b/.babelrc
new file mode 100644
index 00000000..8027b696
--- /dev/null
+++ b/.babelrc
@@ -0,0 +1,3 @@
+{
+ "plugins": ["meaningful-logs"]
+}
\ No newline at end of file
diff --git a/.eslintrc b/.eslintrc
new file mode 100644
index 00000000..91e66d4f
--- /dev/null
+++ b/.eslintrc
@@ -0,0 +1,16 @@
+{ "env": {
+ "node": true
+},
+ "globals": {
+ "d3": true,
+ "$": true,
+ "chrome": true,
+ "jQuery": true,
+ "describe": true,
+ "it": true,
+ "beforeEach": true,
+ "afterEach": true,
+ "after": true,
+ "before": true
+ },
+ "extends": ["standard"]}
diff --git a/.gitignore b/.gitignore
index 70c22ad0..6fa7a6d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
.idea
projectFilesBackup
extension.zip
-
+node_modules
+npm-debug.log
\ No newline at end of file
diff --git a/README.md b/README.md
index 5a64886a..f34f941f 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,13 @@
# Web Scraper
-Web Scraper is a chrome browser extension built for data extraction from web
+Web Scraper is a chrome browser extension and a library built for data extraction from web
pages. Using this extension you can create a plan (sitemap) how a web site
should be traversed and what should be extracted. Using these sitemaps the
Web Scraper will navigate the site accordingly and extract all data. Scraped
data later can be exported as CSV.
-Install the extension from [Chrome store] [chrome-store]
+To use it as an extension install it from [Chrome store] [chrome-store]
+
+To use it as a library, run `npm i web-scraper-headless`
### Features
@@ -26,6 +28,31 @@ Install the extension from [Chrome store] [chrome-store]
Submit bugs and suggest features on [bug tracker] [github-issues]
+#### Headless mode
+To use it as a library you need a sitemap, for example one exported from the extension.
+
+ const webscraper = require('web-scraper-headless')
+ const sitemap = {
+ id: 'test',
+ startUrl: 'http://test.lv/',
+ selectors: [
+ {
+ 'id': 'a',
+ 'selector': '#scraper-test-one-page a',
+ 'multiple': false,
+ type: 'SelectorText',
+ 'parentSelectors': [
+ '_root'
+ ]
+ }
+ ]
+ }
+ const options = {} // optional delay and pageLoadDelay
+ webscraper(sitemap, options)
+ .then(function (scraped) {
+ // This is your scraped info
+ })
+
#### Bugs
When submitting a bug please attach an exported sitemap if possible.
diff --git a/extension/assets/base64.js b/extension/assets/base64.js
index 89f58010..93d3e441 100644
--- a/extension/assets/base64.js
+++ b/extension/assets/base64.js
@@ -1,36 +1,37 @@
+var jquery = require('jquery-deferred')
/**
* @url http://jsperf.com/blob-base64-conversion
* @type {{blobToBase64: blobToBase64, base64ToBlob: base64ToBlob}}
*/
var Base64 = {
- blobToBase64: function(blob) {
+ /**
+ * Reads a blob with FileReader.readAsDataURL and resolves with the part of
+ * the data URL that follows the first comma (the base64 payload).
+ * @param {Blob} blob
+ * @returns promise resolved with the base64 string, rejected with reader.error
+ */
+ blobToBase64: function (blob) {
+ var deferredResponse = jquery.Deferred()
+ var reader = new FileReader()
+ reader.onload = function () {
+ var dataUrl = reader.result
+ var base64 = dataUrl.split(',')[1]
+ deferredResponse.resolve(base64)
+ }
+ // without an error handler a failed read would leave the promise pending forever
+ reader.onerror = function () {
+ deferredResponse.reject(reader.error)
+ }
+ reader.readAsDataURL(blob)
- var deferredResponse = $.Deferred();
- var reader = new FileReader();
- reader.onload = function() {
- var dataUrl = reader.result;
- var base64 = dataUrl.split(',')[1];
- deferredResponse.resolve(base64);
- };
- reader.readAsDataURL(blob);
+ return deferredResponse.promise()
+ },
- return deferredResponse.promise();
- },
+ /**
+ * Decodes a base64 string with atob and wraps the resulting bytes in a Blob.
+ * @param {String} base64
+ * @param {String} mimeType used as the `type` of the created Blob
+ * @returns promise that is already resolved with the Blob
+ */
+ base64ToBlob: function (base64, mimeType) {
+ var deferredResponse = jquery.Deferred()
+ var binary = atob(base64)
+ var len = binary.length
+ var buffer = new ArrayBuffer(len)
+ var view = new Uint8Array(buffer)
+ for (var i = 0; i < len; i++) {
+ view[i] = binary.charCodeAt(i)
+ }
+ var blob = new Blob([view], {type: mimeType})
+ deferredResponse.resolve(blob)
- base64ToBlob: function(base64, mimeType) {
+ return deferredResponse.promise()
+ }
+}
- var deferredResponse = $.Deferred();
- var binary = atob(base64);
- var len = binary.length;
- var buffer = new ArrayBuffer(len);
- var view = new Uint8Array(buffer);
- for (var i = 0; i < len; i++) {
- view[i] = binary.charCodeAt(i);
- }
- var blob = new Blob([view], {type: mimeType});
- deferredResponse.resolve(blob);
-
- return deferredResponse.promise();
- }
-};
+module.exports = Base64
diff --git a/extension/assets/css-selector b/extension/assets/css-selector
deleted file mode 160000
index d9c20445..00000000
--- a/extension/assets/css-selector
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit d9c20445ae0b8635ccd8c837cb1efc8d455e1a37
diff --git a/extension/assets/jquery.whencallsequentially.js b/extension/assets/jquery.whencallsequentially.js
index eee94369..38fd728d 100644
--- a/extension/assets/jquery.whencallsequentially.js
+++ b/extension/assets/jquery.whencallsequentially.js
@@ -1,48 +1,48 @@
+var jquery = require('jquery-deferred')
/**
* @author Martins Balodis
*
* An alternative version of $.when which can be used to execute asynchronous
* calls sequentially one after another.
*
- * @returns $.Deferred().promise()
+ * @returns jqueryDeferred().promise()
*/
-$.whenCallSequentially = function (functionCalls) {
-
- var deferredResonse = $.Deferred();
- var resultData = new Array();
+module.exports = function whenCallSequentially (functionCalls) {
+ // NOTE: functionCalls is consumed with shift(), so the caller's array is emptied
+ var deferredResonse = jquery.Deferred()
+ // values the calls resolved with, in call order
+ var resultData = []
// nothing to do
- if (functionCalls.length === 0) {
- return deferredResonse.resolve(resultData).promise();
- }
+ if (functionCalls.length === 0) {
+ return deferredResonse.resolve(resultData).promise()
+ }
- var currentDeferred = functionCalls.shift()();
+ var currentDeferred = functionCalls.shift()()
// execute synchronous calls synchronously
- while (currentDeferred.state() === 'resolved') {
- currentDeferred.done(function (data) {
- resultData.push(data);
- });
- if (functionCalls.length === 0) {
- return deferredResonse.resolve(resultData).promise();
- }
- currentDeferred = functionCalls.shift()();
- }
+ while (currentDeferred.state() === 'resolved') {
+ currentDeferred.done(function (data) {
+ resultData.push(data)
+ })
+ if (functionCalls.length === 0) {
+ return deferredResonse.resolve(resultData).promise()
+ }
+ currentDeferred = functionCalls.shift()()
+ }
// handle async calls
- var interval = setInterval(function () {
+ // poll the pending call every 10 ms until it settles
+ var interval = setInterval(function () {
// handle mixed sync calls
- while (currentDeferred.state() === 'resolved') {
- currentDeferred.done(function (data) {
- resultData.push(data);
- });
- if (functionCalls.length === 0) {
- clearInterval(interval);
- deferredResonse.resolve(resultData);
- break;
- }
- currentDeferred = functionCalls.shift()();
- }
- }, 10);
+ while (currentDeferred.state() === 'resolved') {
+ currentDeferred.done(function (data) {
+ resultData.push(data)
+ })
+ if (functionCalls.length === 0) {
+ clearInterval(interval)
+ deferredResonse.resolve(resultData)
+ break
+ }
+ currentDeferred = functionCalls.shift()()
+ }
+ // a rejected call can never become 'resolved': stop polling and propagate
+ // the failure instead of leaving the timer running and the promise pending
+ if (currentDeferred.state() === 'rejected') {
+ clearInterval(interval)
+ currentDeferred.fail(function (reason) {
+ deferredResonse.reject(reason)
+ })
+ }
+ }, 10)
- return deferredResonse.promise();
-};
+ return deferredResonse.promise()
+}
diff --git a/extension/background_page/background_script.js b/extension/background_page/background_script.js
index 480287e8..ea5166f9 100644
--- a/extension/background_page/background_script.js
+++ b/extension/background_page/background_script.js
@@ -1,120 +1,148 @@
-var config = new Config();
-var store;
+var Config = require('../scripts/Config')
+var Store = require('../scripts/Store')
+var Sitemap = require('../scripts/Sitemap')
+var Queue = require('../scripts/Queue')
+var Scraper = require('../scripts/Scraper')
+var ChromePopupBrowser = require('../scripts/ChromePopupBrowser')
+const WebJSDOMBrowser = require('../scripts/WebJSDOMBrowser')
+var getBackgroundScript = require('../scripts/getBackgroundScript')
+var $ = require('jquery')
+// Configuration and sitemap store shared by the message handlers in this file;
+// `store` stays undefined until the first configuration load has completed
+var config = new Config()
+var store
config.loadConfiguration(function () {
- console.log("initial configuration", config);
- store = new Store(config);
-});
+ console.log('initial configuration', config)
+ store = new Store(config, {$, window, document})
+})
+// reload the configuration and rebuild the store whenever chrome.storage changes
chrome.storage.onChanged.addListener(function () {
- config.loadConfiguration(function () {
- console.log("configuration changed", config);
- store = new Store(config);
- });
-});
+ config.loadConfiguration(function () {
+ console.log('configuration changed', config)
+ store = new Store(config, {$, window, document})
+ })
+})
-var sendToActiveTab = function(request, callback) {
- chrome.tabs.query({
- active: true,
- currentWindow: true
- }, function (tabs) {
- if (tabs.length < 1) {
- this.console.log("couldn't find active tab");
- }
- else {
- var tab = tabs[0];
- chrome.tabs.sendMessage(tab.id, request, callback);
- }
- });
-};
+/**
+ * Forwards a request to the active tab of the current window.
+ * Logs a message and sends nothing when no active tab is found.
+ * @param {Object} request message handed to chrome.tabs.sendMessage
+ * @param {Function} callback response callback handed to chrome.tabs.sendMessage
+ */
+var sendToActiveTab = function (request, callback) {
+  chrome.tabs.query({
+    active: true,
+    currentWindow: true
+  }, function (tabs) {
+    if (tabs.length < 1) {
+      // plain `console`: `this` is undefined inside this callback in strict mode
+      console.log("couldn't find active tab")
+      return
+    }
+    chrome.tabs.sendMessage(tabs[0].id, request, callback)
+  })
+}
+// Dispatches extension messages on the flag set in `request`; every handled
+// branch returns true so that sendResponse may be called after the listener returns
chrome.runtime.onMessage.addListener(
function (request, sender, sendResponse) {
+ console.log('chrome.runtime.onMessage', request)
- console.log("chrome.runtime.onMessage", request);
+ if (request.createSitemap) {
+ store.createSitemap(request.sitemap, sendResponse)
+ return true
+ } else if (request.saveSitemap) {
+ store.saveSitemap(request.sitemap, sendResponse)
+ return true
+ } else if (request.deleteSitemap) {
+ store.deleteSitemap(request.sitemap, sendResponse)
+ return true
+ } else if (request.getAllSitemaps) {
+ store.getAllSitemaps(sendResponse)
+ return true
+ } else if (request.sitemapExists) {
+ store.sitemapExists(request.sitemapId, sendResponse)
+ return true
+ } else if (request.getSitemapData) {
+ store.getSitemapData(new Sitemap(request.sitemap, {$, window, document}), sendResponse)
+ return true
+ } else if (request.scrapeSitemap) {
+ var sitemap = new Sitemap(request.sitemap, {$, window, document})
+ var queue = new Queue()
+ var browser = new ChromePopupBrowser({
+ pageLoadDelay: request.pageLoadDelay
+ })
- if (request.createSitemap) {
- store.createSitemap(request.sitemap, sendResponse);
- return true;
- }
- else if (request.saveSitemap) {
- store.saveSitemap(request.sitemap, sendResponse);
- return true;
- }
- else if (request.deleteSitemap) {
- store.deleteSitemap(request.sitemap, sendResponse);
- return true;
- }
- else if (request.getAllSitemaps) {
- store.getAllSitemaps(sendResponse);
- return true;
- }
- else if (request.sitemapExists) {
- store.sitemapExists(request.sitemapId, sendResponse);
- return true;
- }
- else if (request.getSitemapData) {
- store.getSitemapData(new Sitemap(request.sitemap), sendResponse);
- return true;
- }
- else if (request.scrapeSitemap) {
- var sitemap = new Sitemap(request.sitemap);
- var queue = new Queue();
- var browser = new ChromePopupBrowser({
- pageLoadDelay: request.pageLoadDelay
- });
+ var scraper = new Scraper({
+ queue: queue,
+ sitemap: sitemap,
+ browser: browser,
+ store: store,
+ requestInterval: request.requestInterval
+ }, {$, window, document})
- var scraper = new Scraper({
- queue: queue,
- sitemap: sitemap,
- browser: browser,
- store: store,
- requestInterval: request.requestInterval
- });
-
- try {
- scraper.run(function () {
- browser.close();
- var notification = chrome.notifications.create("scraping-finished", {
- type: 'basic',
- iconUrl: 'assets/images/icon128.png',
- title: 'Scraping finished!',
- message: 'Finished scraping ' + sitemap._id
- }, function(id) {
+ try {
+ scraper.run(function () {
+ browser.close()
+ var notification = chrome.notifications.create('scraping-finished', {
+ type: 'basic',
+ iconUrl: 'assets/images/icon128.png',
+ title: 'Scraping finished!',
+ message: 'Finished scraping ' + sitemap._id
+ }, function (id) {
// notification showed
- });
- sendResponse();
- });
- }
- catch (e) {
- console.log("Scraper execution cancelled".e);
- }
+ })
+ sendResponse()
+ })
+      } catch (e) {
+        // log the caught error itself: `'...'.e` is a property lookup on the string and is always undefined
+        console.log('Scraper execution cancelled', e)
+      }
+
+ return true
+ } else if (request.headlessScrapeSitemap) {
+ const sitemap = new Sitemap(request.sitemap, {$, window, document})
+ const queue = new Queue()
+ const browser = new WebJSDOMBrowser({
+ pageLoadDelay: request.pageLoadDelay
+ }, {$, window, document})
- return true;
- }
- else if(request.previewSelectorData) {
- chrome.tabs.query({
- active: true,
- currentWindow: true
- }, function (tabs) {
- if (tabs.length < 1) {
- this.console.log("couldn't find active tab");
- }
- else {
- var tab = tabs[0];
- chrome.tabs.sendMessage(tab.id, request, sendResponse);
- }
- });
- return true;
- }
- else if(request.backgroundScriptCall) {
+ const scraper = new Scraper({
+ queue: queue,
+ sitemap: sitemap,
+ browser: browser,
+ store: store,
+ requestInterval: request.requestInterval
+ }, {$, window, document})
- var backgroundScript = getBackgroundScript("BackgroundScript");
- var deferredResponse = backgroundScript[request.fn](request.request)
- deferredResponse.done(function(response){
- sendResponse(response);
- });
+      // run the scraper; when it finishes, close the browser, show a notification and answer the sender
+      try {
+        scraper.run(function () {
+          browser.close()
+          chrome.notifications.create('scraping-finished', {
+            type: 'basic',
+            iconUrl: 'assets/images/icon128.png',
+            title: 'Scraping finished!',
+            message: 'Finished scraping ' + sitemap._id
+          }, function (id) {
+            // notification showed
+          })
+          sendResponse()
+        })
+      } catch (e) {
+        // only errors thrown synchronously by scraper.run land here; log the error object itself
+        console.log('Scraper execution cancelled', e)
+      }
+ return true
+ } else if (request.previewSelectorData) {
+      // forward the preview request to the active tab through the shared helper
+      // instead of duplicating its chrome.tabs.query / sendMessage logic here
+      sendToActiveTab(request, sendResponse)
+ return true
+ } else if (request.backgroundScriptCall) {
+ var backgroundScript = getBackgroundScript('BackgroundScript')
+ var deferredResponse = backgroundScript[request.fn](request.request)
+ deferredResponse.done(function (response) {
+ sendResponse(response)
+ })
- return true;
- }
- }
-);
+ return true
+ }
+}
+)
diff --git a/extension/content_script/content_scraper.js b/extension/content_script/content_scraper.js
new file mode 100644
index 00000000..bd3c2a42
--- /dev/null
+++ b/extension/content_script/content_scraper.js
@@ -0,0 +1,44 @@
+var DataExtractor = require('./../scripts/DataExtractor')
+var getContentScript = require('./../scripts/getContentScript')
+const debug = require('debug')('web-scraper-headless:content_scraper')
+/**
+ * Message handler for the content script side of the scraper.
+ * Handles `extractData`, `previewSelectorData` and `contentScriptCall` requests;
+ * any other request is ignored and the function returns undefined.
+ * @param {Object} request incoming message
+ * @param {Object} sender message sender (unused)
+ * @param {Function} sendResponse called with the result once it is available
+ * @param {Object} options environment: `$`, `document` and `window`
+ * @returns {Boolean|undefined} true when the request was handled
+ */
+function extensionListener (request, sender, sendResponse, options) {
+  const {$, document, window} = options
+  debug('chrome.runtime.onMessage', request)
+
+  // both extraction branches log the extracted data and then send it back
+  const replyWithData = function (data) {
+    debug('dataextractor data', data)
+    sendResponse(data)
+  }
+
+  if (request.extractData) {
+    debug('received data extraction request', request)
+    new DataExtractor(request, {$, window, document})
+      .getData()
+      .done(replyWithData)
+    return true
+  }
+
+  if (request.previewSelectorData) {
+    debug('received data-preview extraction request', request)
+    new DataExtractor(request, {$, document, window})
+      .getSingleSelectorData(request.parentSelectorIds, request.selectorId)
+      .done(replyWithData)
+    return true
+  }
+
+  // Universal ContentScript communication handler
+  if (request.contentScriptCall) {
+    const contentScript = getContentScript('ContentScript')
+
+    debug('received ContentScript request', request)
+
+    const pendingResponse = contentScript[request.fn](request.request, {$, document, window})
+    pendingResponse.done(function (response) {
+      sendResponse(response)
+    })
+    return true
+  }
+}
+
+module.exports = extensionListener
diff --git a/extension/content_script/content_scraper_browser.js b/extension/content_script/content_scraper_browser.js
new file mode 100644
index 00000000..ecf9ec6a
--- /dev/null
+++ b/extension/content_script/content_scraper_browser.js
@@ -0,0 +1,7 @@
+const listener = require('./content_scraper')
+const $ = require('jquery')
+/**
+ * Browser entry point: runs the shared listener with the page's jQuery, window and document.
+ * @returns {Boolean|undefined} whatever the listener returns
+ */
+module.exports = function (request, sender, sendResponse) {
+  // the listener returns true for every request it handles, which tells chrome the
+  // response is sent asynchronously; for unhandled requests return undefined so
+  // the message channel is not kept open for a response that is never sent
+  return listener(request, sender, sendResponse, {$, window, document})
+}
diff --git a/extension/content_script/content_script.js b/extension/content_script/content_script.js
index 7bd00b4b..587ad9fe 100644
--- a/extension/content_script/content_script.js
+++ b/extension/content_script/content_script.js
@@ -1,41 +1 @@
-chrome.runtime.onMessage.addListener(
- function (request, sender, sendResponse) {
-
- console.log("chrome.runtime.onMessage", request);
-
- if (request.extractData) {
- console.log("received data extraction request", request);
- var extractor = new DataExtractor(request);
- var deferredData = extractor.getData();
- deferredData.done(function(data){
- console.log("dataextractor data", data);
- sendResponse(data);
- });
- return true;
- }
- else if(request.previewSelectorData) {
- console.log("received data-preview extraction request", request);
- var extractor = new DataExtractor(request);
- var deferredData = extractor.getSingleSelectorData(request.parentSelectorIds, request.selectorId);
- deferredData.done(function(data){
- console.log("dataextractor data", data);
- sendResponse(data);
- });
- return true;
- }
- // Universal ContentScript communication handler
- else if(request.contentScriptCall) {
-
- var contentScript = getContentScript("ContentScript");
-
- console.log("received ContentScript request", request);
-
- var deferredResponse = contentScript[request.fn](request.request);
- deferredResponse.done(function(response) {
- sendResponse(response);
- });
-
- return true;
- }
- }
-);
\ No newline at end of file
+// NOTE(review): `contentScraper` is not defined in this file; presumably a global exposed by a bundled build of content_scraper_browser.js (confirm)
+chrome.runtime.onMessage.addListener(contentScraper)
diff --git a/extension/devtools/devtools_init_page.js b/extension/devtools/devtools_init_page.js
index 933883fd..46b04794 100644
--- a/extension/devtools/devtools_init_page.js
+++ b/extension/devtools/devtools_init_page.js
@@ -1 +1,2 @@
-chrome.devtools.panels.create("Web Scraper", "../assets/images/icon48.png", "devtools/devtools_scraper_panel.html");
\ No newline at end of file
+// Log that the devtools page is loading, then create the scraper's devtools panel
+console.log('loading devtools')
+chrome.devtools.panels.create('Web Scraper Headless', '../assets/images/icon48.png', 'devtools/devtools_scraper_panel.html')
diff --git a/extension/devtools/devtools_scraper_panel.html b/extension/devtools/devtools_scraper_panel.html
index 6f7c4e30..322b2f23 100644
--- a/extension/devtools/devtools_scraper_panel.html
+++ b/extension/devtools/devtools_scraper_panel.html
@@ -7,33 +7,11 @@
-
+
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
-