Recovered PR20 #21

Open · wants to merge 4 commits into base: main
7 changes: 0 additions & 7 deletions bin/clean.sh

This file was deleted.

82 changes: 47 additions & 35 deletions epub2twpub/epub-reader.js
@@ -1,13 +1,13 @@
/*
Reads an EPUB file and makes the content available via properties
*/
/**
* Reads an EPUB file and makes the content available via properties
*/

const fs = require("fs"),
path = require("path"),
{promisify} = require("util"),
readFileAsync = promisify(fs.readFile),
writeFileAsync = promisify(fs.writeFile),
{DOMParser,XMLSerializer} = require("xmldom"),
{DOMParser,XMLSerializer} = require("@xmldom/xmldom"),
JSZip = require("jszip"),
{TextExtractor} = require("./text-extractor"),
{hash,resolvePath} = require("./utils");
@@ -40,9 +40,10 @@ class EpubReader {
this.errors.push(message);
}

/*
Load an EPUB from a file path
*/
/**
* Load an EPUB from a file path
* @param {*} epubFilepath Epub File path
*/
async load(epubFilepath) {
// Read the ZIP file
const epubFileData = await readFileAsync(epubFilepath);
@@ -107,16 +108,21 @@
await this.loadImages();
}

/*
Check for a metadata item
*/
/**
* Check for a metadata item
* @param {*} name
* @returns
*/
hasMetadataItem(name) {
return name in this.metadata;
}

/*
Get a metadata item
*/
/**
* Get a metadata item
* @param {*} name
* @param {*} defaultValue
* @returns
*/
getMetadataItem(name,defaultValue) {
if(name in this.metadata) {
return this.metadata[name];
@@ -125,16 +131,21 @@
}
}

/*
Get a manifest item
*/
/**
* Get a manifest item
* @param {*} id
* @param {*} defaultValue
* @returns
*/
getManifestItem(id,defaultValue) {
return this.manifest[id] || defaultValue;
}

/*
Get the media type of a manifest item
*/
/**
* Get the media type of a manifest item
* @param {*} href
* @returns
*/
getMediaTypeOfItem(href) {
var result;
for(const id of Object.keys(this.manifest)) {
@@ -146,10 +157,10 @@
return result;
}

/*
Load the table of contents
Returns a tree of {id:, text:, href:, children: {}}
*/
/**
* Load the table of contents
* @returns Returns a tree of {id:, text:, href:, children: {}}
*/
async loadToc() {
this.tocItem = this.manifest[this.nodeSpine.getAttribute("toc")].href;
// Get the TOC file
@@ -179,9 +190,9 @@
this.toc = visitNodes(navMap.childNodes);
}

/*
Load the text chunks and stylesheets
*/
/**
* Load the text chunks and stylesheets
*/
async loadTextChunks() {
// Setup the text extractor
const textExtractor = new TextExtractor({
@@ -194,7 +205,6 @@
},
logError: this.logError.bind(this)
});
await textExtractor.initialise();
// Extract each HTML file listed in the spine
for(const spineItem of this.spine) {
const manifestItem = this.manifest[spineItem];
@@ -221,9 +231,9 @@
}
}

/*
Load all the images
*/
/**
* Load all the images
*/
async loadImages() {
// Get the image manifest items
for(const id of Object.keys(this.manifest)) {
@@ -232,6 +242,7 @@
const file = this.zip.file(manifestItem.href),
encoding = BINARY_MEDIA_TYPES.includes(manifestItem["media-type"]) ? "base64" : "text";
if(file) {
// Use only picture file names.
this.images[manifestItem.href] = {
type: manifestItem["media-type"],
text: await file.async(encoding)
@@ -252,11 +263,12 @@ function findNodeAndGetAttribute(rootNode,selectors,attributeName) {
return null;
}

/*
Find an XML node identified by a list of child tag names
rootNode: reference to root node
selectors: array of child tag names
*/
/**
* Find an XML node identified by a list of child tag names
* @param {*} rootNode reference to root node
* @param {*} selectors array of child tag names
* @returns {*} node
*/
function findNode(rootNode,selectors) {
let node = rootNode;
for(selector of selectors) {
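A minimal sketch of the parsing API involved in the dependency swap above (from the deprecated xmldom package to the maintained @xmldom/xmldom fork): the fork exposes the same DOMParser and XMLSerializer, so the reader's findNode-style traversal is unchanged. The container.xml snippet below is invented for illustration.

```js
// Sketch only: same API surface as the old xmldom package, now from the scoped fork.
const {DOMParser, XMLSerializer} = require("@xmldom/xmldom");

const containerXml = `<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>`;

// Parse the XML, walk to a child node and read an attribute, much as
// findNodeAndGetAttribute() does in epub-reader.js
const doc = new DOMParser().parseFromString(containerXml, "text/xml");
const rootfile = doc.getElementsByTagName("rootfile")[0];
console.log(rootfile.getAttribute("full-path")); // "OEBPS/content.opf"
console.log(new XMLSerializer().serializeToString(rootfile));
```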
@@ -1,16 +1,17 @@
/*
This script is executed within the context of a web page loaded into Puppeteer to extract the text chunks and stylesheets from a page.

Returns a structure: {chunks: [], stylsheets: [text]}

Each chunk entry is: {nodes: [], anchorIds: [], href:} where nodes is a tree of objects representing DOM nodes and strings representing
text nodes, and anchorIds is an array of anchor IDs associated with each chunk

Each stylsheet entry is the text of the stylesheet

*/

exports.getPageText = function(win,doc) {
/**
* An instance of an HTML document containing the doc.location.href (URL_PREFIX + filename) of the document.
*
* To extract the text chunks and stylesheets from a page.
*
* Each chunk entry is: {nodes: [], anchorIds: [], href:} where nodes is a tree of objects representing DOM nodes and strings representing
* text nodes, and anchorIds is an array of anchor IDs associated with each chunk
*
* Each stylsheet entry is the text of the stylesheet
* @param {*} win The window instance that currently contains the DOM document
* @param {*} doc DOM model instance of the current document
* @returns Returns a structure: {chunks: [], stylsheets: [text]}
*/
exports.getPageStruct = function(win,doc) {
win = win || window;
doc = doc || document;

@@ -112,7 +113,7 @@ const nonBlankChunks = chunks.outputChunks.filter(chunk => {
return !(chunk.anchorIds.length === 0 && (chunk.nodes.length === 1) && (typeof (chunk.nodes[0]) === "string") && (!(/\S/.test(chunk.nodes[0]))));
})
// Get the expected test results if present
const domExpectedResults = document.getElementsByTagName("script")[0];
const domExpectedResults = doc.getElementsByTagName("script")[0];
var expectedResults;
if(domExpectedResults && domExpectedResults.id === "expectedResults") {
try {
@@ -127,7 +128,11 @@ return {
expectedResults: expectedResults
};

// Node iterator
/**
* Node iterator
* @param {*} e Document Element
* @param {*} options Options, the current main disableBlockProcessing?
*/
function visitNode(e,options) {
options = options || {};
var disableBlockProcessing = !!options.disableBlockProcessing;
@@ -156,6 +161,7 @@ function visitNode(e,options) {
parentListElement.private.count = count;
} else if(nodeInfo.tag === "img") {
if(e.hasAttribute("src")) {
// Only the file name is needed here.
nodeInfo.attributes.src = e.src.slice(URL_PREFIX.length);
}
if(e.hasAttribute("width")) {
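For reference, a purely illustrative example of the structure that getPageStruct returns, as documented in the JSDoc above; the values and the exact fields of the element-node objects are assumptions, not taken from this PR.

```js
// Illustrative only: shows the documented {chunks: [], stylsheets: [text]} shape.
// The element-node fields (tag, attributes) follow what visitNode() appears to build,
// but the concrete values here are invented.
const exampleResult = {
  chunks: [
    {
      // A chunk whose only node is a plain text string
      nodes: ["First paragraph of the chapter"],
      anchorIds: ["chap1"],
      href: "Text/chapter1.xhtml"
    },
    {
      // A chunk containing an element node; img src is stored without the URL prefix
      nodes: [{tag: "img", attributes: {src: "Images/cover.jpg"}}],
      anchorIds: [],
      href: "Text/chapter1.xhtml"
    }
  ],
  // Spelling of the property follows the source comments
  stylsheets: ["p { margin: 0 0 1em 0; }"]
};

console.log(exampleResult.chunks.length); // 2
```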
3 changes: 1 addition & 2 deletions epub2twpub/test.js
@@ -63,8 +63,7 @@ async function testPage(filepath) {
console.log("Text extractor error: " + msg)
}
});
await textExtractor.initialise();
// Get the text of the page
// Get the text of the page
const results = await textExtractor.getPageText("index.html");
// Flatten the nodes of the results
for(const chunk of results.chunks) {
84 changes: 36 additions & 48 deletions epub2twpub/text-extractor.js
@@ -1,58 +1,46 @@
/*
Class representing the Puppeteer-based wrapper for get-page-text.js
Class representing the jsdom wrapper for get-page-struct.js
*/

const playwright = require("playwright"),
{getPageText} = require("./injected/get-page-text");

const { JSDOM } = require("jsdom");
const { getPageStruct } = require("./get-page-struct");
const URL_PREFIX = "https://example.com/";

class TextExtractor {

/*
Options:
getFile: function(href) returns {type:, contents:}
logError: function(msg)
*/
constructor (options) {
this.getFile = options.getFile;
this.logError = options.logError;
}

async initialise() {
this.browser = await playwright.chromium.launch();
this.context = await this.browser.newContext({
javaScriptEnabled: false
});
this.page = await this.context.newPage();
await this.page.route("**/*",async (route) => {
const request = route.request();
if(request.method() === "GET" && request.url().startsWith(URL_PREFIX)) {
const fileHref = request.url().slice(URL_PREFIX.length);
const {type,contents} = await this.getFile(fileHref);
if(!type) {
this.logError(`Missing file \`${fileHref}\``);
route.fulfill({status: 404, contentType: "text/plain", body: "Not found!"});
} else {
route.fulfill({status: 200, contentType: type, body: contents});
}
} else {
route.abort();
}
});
}

async getPageText(href) {
// console.log("processing page",href)
const pageURL = URL_PREFIX + href;
await this.page.goto(pageURL,{waitUntil: "load"});
return await this.page.evaluate(getPageText);
}

async close() {
await this.page.close();
await this.browser.close();
}
/**
* Options:
* @param {*} options
* @field getFile: function(href) returns {type:, contents:}
* @field logError: function(msg)
*/
constructor(options) {
this.getFile = options.getFile;
this.logError = options.logError;
}


/**
* @description Reads the file contents via getFile, parses them with JSDOM to obtain a DOM, then passes the window and document to getPageStruct and returns the resulting structure.
* @param {string} href filename, e.g. Text/chapter82.xhtml; the first character must not be a '/'
* @returns Returns a structure: {chunks: [], stylsheets: [text]}
*/
async getPageText(href) {
const { type, contents } = await this.getFile(href);
if (!type) {
this.logError(`Missing file \`${href}\``);
return "";
} else {
var window = new JSDOM(contents, {
contentType: type,
url: URL_PREFIX + href,
runScripts: "dangerously"
}).window;
var document = window.document;
var result = getPageStruct(window, document);
}
return result;
}

}
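A rough usage sketch of the reworked, jsdom-based TextExtractor, assuming the require path and the fixture files; note that the Playwright initialise()/close() calls are no longer needed because there is no browser to launch.

```js
// Usage sketch only; the require path and fixture directory are assumptions.
const fs = require("fs").promises;
const {TextExtractor} = require("./epub2twpub/text-extractor");

async function main() {
  const extractor = new TextExtractor({
    // getFile(href) must resolve to {type:, contents:}; here we read from a local directory
    getFile: async (href) => ({
      type: "application/xhtml+xml",
      contents: await fs.readFile("./fixtures/" + href, "utf8")
    }),
    logError: (msg) => console.error("Text extractor error: " + msg)
  });
  // No initialise()/close() any more: jsdom parses the markup in-process
  const results = await extractor.getPageText("index.html");
  console.log(results.chunks.length + " chunks extracted");
}

main().catch(console.error);
```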
