New Readme content and refactoring
pabloromeo committed Mar 19, 2024
1 parent 853f72d commit eefda53
Showing 2 changed files with 94 additions and 49 deletions.
89 changes: 57 additions & 32 deletions README.md
@@ -1,53 +1,57 @@
# Scrapex

## Introduction

Scrapex is a versatile scraping component designed to efficiently extract content from URLs. Leveraging Playwright and Chrome, it ensures seamless support for Single Page Applications (SPAs) and content dependent on JavaScript execution. Initially developed for internal use by our AI Agents, Scrapex offers robust functionality for a wide range of scraping needs.

## Features

- _Support for Multiple Output Formats_: Scrapex can output data in HTML, Markdown, or PDF, catering to diverse requirements.
- _Container Image Deployment_: For ease of deployment and scalability, Scrapex is fully compatible with container environments such as Docker and Kubernetes.
- _Customizable Settings_: Through environment variables, as well as parameters in the extraction call, users can tailor the behavior of Scrapex to suit their specific scraping tasks.

## Configuration

Scrapex supports the following output formats:

1. _HTML_: Direct extraction of HTML content.
2. _Markdown_: Conversion of HTML to Markdown using `html-to-md`.
3. _PDF_: Generation of PDF documents utilizing Playwright's PDF functionality.

### Environment Variables

Configure Scrapex using the following environment variables:

| Variable             | Description                                      | Default                                                                                                              |
| -------------------- | ------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------- |
| `PORT`               | Port on which the Node.js server listens         | `3000`                                                                                                               |
| `DEFAULT_WAIT`       | Default milliseconds to wait on page load        | `0`                                                                                                                  |
| `DEFAULT_USER_AGENT` | Default user agent for requests                  | `"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"`  |
| `LOG_LEVEL`          | Logging level (`debug`, `info`, `warn`, `error`) | `debug`                                                                                                              |

## How to Run

The simplest way to run Scrapex is using Docker. Here's an example `docker-compose.yaml`:

```yaml
version: "3"
services:
  app:
    container_name: scrapex
    image: ghcr.io/cloudx-labs/scrapex:main # it's better to pin down to a specific release version such as v0.1
    environment:
      - TZ=America/Argentina/Buenos_Aires
      - PORT=3000
      - LOG_LEVEL=debug
    ports:
      - "3003:3000"
```
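
With this file in place, `docker compose up -d` starts the service, which is then reachable on host port `3003`.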
## Usage Example

To test Scrapex, you can send a request using `curl` as shown below:

```bash
curl --location 'http://localhost:3003/extract' \
--header 'Content-Type: application/json' \
--data '{
@@ -64,3 +68,24 @@
}
}'
```

### Payload Parameters

The following table describes the parameters included in the payload of the `curl` example:

| Parameter    | Description                                | Example                                                    |
| ------------ | ------------------------------------------ | ---------------------------------------------------------- |
| `url`        | URL of the page to scrape                  | `https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon` |
| `outputType` | Desired output format                      | `html` / `md` / `pdf`                                      |
| `wait`       | Milliseconds to wait before extraction     | `2000`                                                     |
| `userAgent`  | User agent to use for the request          | `Mozilla/5.0 (Windows NT 10.0; Win64; x64)...`             |
| `settings`   | Additional settings for output formatting  | `{ "pdf": { "options": { "format": "A4" } } }`             |
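
The same request can also be issued programmatically. Below is a minimal Node.js sketch, assuming Node 18+ (for the built-in `fetch`) and the `3003:3000` port mapping from the compose file above; the response fields in the comments follow the handler code in this commit:

```js
// Minimal sketch: call Scrapex's /extract endpoint from Node.js.
// Assumes Node 18+ (built-in fetch) and the 3003:3000 port mapping shown above.
const response = await fetch("http://localhost:3003/extract", {
  method: "POST",
  headers: { "Content-Type": "application/json" },
  body: JSON.stringify({
    url: "https://en.wikipedia.org/wiki/Six_Degrees_of_Kevin_Bacon",
    outputType: "md", // "html", "md", or "pdf"
    wait: 2000, // milliseconds to wait before extraction
  }),
});

// The handler responds with JSON; for PDF output, `content` is base64-encoded.
const { contentType, content } = await response.json();
console.log(contentType, content.length);
```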

### Settings per Extraction Type

#### PDF

All available values for `settings -> pdf -> options` can be found at: https://playwright.dev/docs/api/class-page#page-pdf
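
For example, a payload fragment requesting A4 landscape output with backgrounds printed could look like this (`format`, `landscape`, and `printBackground` are standard Playwright `page.pdf` options):

```json
{
  "settings": {
    "pdf": {
      "options": {
        "format": "A4",
        "landscape": true,
        "printBackground": true
      }
    }
  }
}
```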

#### Markdown (MD)

All available values for `settings -> md -> options` can be found at: https://github.com/stonehank/html-to-md/blob/master/README-EN.md
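
As a sketch, a payload fragment tuning the conversion might look like the one below (`ignoreTags` is taken from the `html-to-md` README; treat the exact option names as that library's, not Scrapex's). Note that function-valued options such as `tagListener` cannot travel over JSON and are stripped by the server (see `extract.js` below).

```json
{
  "settings": {
    "md": {
      "options": {
        "ignoreTags": ["script", "style"]
      }
    }
  }
}
```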
54 changes: 37 additions & 17 deletions scrapex/src/handlers/extract.js
@@ -24,16 +24,8 @@ export default async function handle(req, res) {
  log.info(`Extracting "${outputType}" from "${url}"`);
  log.debug(JSON.stringify(params));

  const browser = await getBrowser();
  const context = await getNewContext(browser, userAgent);

  try {
    if (!extractionHandlers.has(outputType)) {
@@ -45,7 +37,7 @@
      wait,
      params,
    };
    const extractionResult = await TimeUtils.profile("Extraction", () =>
      extractionHandlers.get(outputType)(parameters)
    );
    res.json(extractionResult);
@@ -55,10 +47,7 @@
      message: err.message,
    });
  } finally {
    await tearDown(browser, context);
  }
}

@@ -99,15 +88,21 @@ async function extractHtml({ context, url, wait }) {
  });
}

async function extractMarkdown({ context, url, wait, params }) {
  const result = await loadPage({
    context,
    url,
    wait,
  });

  const htmlContent = await result.page.content();

  const mdOptions = params.settings?.md?.options || {};
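  // Drop tagListener: it expects a function, which a JSON payload cannot carry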
  if (mdOptions.tagListener) delete mdOptions.tagListener;

  log.debug(`MD options: ${JSON.stringify(mdOptions)}`);

  const markdownContent = await TimeUtils.profile("Converting to MD", () => html2md(htmlContent, mdOptions));

  return await buildResponse(result, {
    contentType: "text/markdown",
@@ -135,3 +130,28 @@ async function extractPdf({ context, url, wait, params }) {
    content: buffer.toString("base64"),
  });
}

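// Launches headless Chromium, timing the startup via TimeUtils.profile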
async function getBrowser() {
  const browser = await TimeUtils.profile("Opening Browser", () =>
    chromium.launch({
      headless: true,
    })
  );
  return browser;
}

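// Creates a new browser context that sends the requested user agent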
async function getNewContext(browser, userAgent) {
  const context = await TimeUtils.profile("New Context", () =>
    browser.newContext({
      userAgent: userAgent,
    })
  );
  return context;
}

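// Closes the page context and the browser, timing the teardown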
async function tearDown(browser, context) {
  await TimeUtils.profile("Closing Context and Browser", async () => {
    await context.close();
    await browser.close();
  });
}
