Recovered PR20 #21

Open · wants to merge 4 commits into base: main
7 changes: 0 additions & 7 deletions bin/clean.sh

This file was deleted.

82 changes: 47 additions & 35 deletions epub2twpub/epub-reader.js
@@ -1,13 +1,13 @@
/*
Reads an EPUB file and makes the content available via properties
*/
/**
* Reads an EPUB file and makes the content available via properties
*/

const fs = require("fs"),
path = require("path"),
{promisify} = require("util"),
readFileAsync = promisify(fs.readFile),
writeFileAsync = promisify(fs.writeFile),
{DOMParser,XMLSerializer} = require("xmldom"),
{DOMParser,XMLSerializer} = require("@xmldom/xmldom"),
JSZip = require("jszip"),
{TextExtractor} = require("./text-extractor"),
{hash,resolvePath} = require("./utils");
@@ -40,9 +40,10 @@ class EpubReader {
this.errors.push(message);
}

/*
Load an EPUB from a file path
*/
/**
* Load an EPUB from a file path
* @param {*} epubFilepath Epub File path
*/
async load(epubFilepath) {
// Read the ZIP file
const epubFileData = await readFileAsync(epubFilepath);
@@ -107,16 +108,21 @@
await this.loadImages();
}

/*
Check for a metadata item
*/
/**
* Check for a metadata item
* @param {*} name
* @returns
*/
hasMetadataItem(name) {
return name in this.metadata;
}

/*
Get a metadata item
*/
/**
* Get a metadata item
* @param {*} name
* @param {*} defaultValue
* @returns
*/
getMetadataItem(name,defaultValue) {
if(name in this.metadata) {
return this.metadata[name];
@@ -125,16 +131,21 @@
}
}

/*
Get a manifest item
*/
/**
* Get a manifest item
* @param {*} id
* @param {*} defaultValue
* @returns
*/
getManifestItem(id,defaultValue) {
return this.manifest[id] || defaultValue;
}

/*
Get the media type of a manifest item
*/
/**
* Get the media type of a manifest item
* @param {*} href
* @returns
*/
getMediaTypeOfItem(href) {
var result;
for(const id of Object.keys(this.manifest)) {
@@ -146,10 +157,10 @@
return result;
}

/*
Load the table of contents
Returns a tree of {id:, text:, href:, children: {}}
*/
/**
* Load the table of contents
* @returns Returns a tree of {id:, text:, href:, children: {}}
*/
async loadToc() {
this.tocItem = this.manifest[this.nodeSpine.getAttribute("toc")].href;
// Get the TOC file
@@ -179,9 +190,9 @@
this.toc = visitNodes(navMap.childNodes);
}

/*
Load the text chunks and stylesheets
*/
/**
* Load the text chunks and stylesheets
*/
async loadTextChunks() {
// Setup the text extractor
const textExtractor = new TextExtractor({
@@ -194,7 +205,6 @@
},
logError: this.logError.bind(this)
});
await textExtractor.initialise();
// Extract each HTML file listed in the spine
for(const spineItem of this.spine) {
const manifestItem = this.manifest[spineItem];
@@ -221,9 +231,9 @@
}
}

/*
Load all the images
*/
/**
* Load all the images
*/
async loadImages() {
// Get the image manifest items
for(const id of Object.keys(this.manifest)) {
@@ -232,6 +242,7 @@
const file = this.zip.file(manifestItem.href),
encoding = BINARY_MEDIA_TYPES.includes(manifestItem["media-type"]) ? "base64" : "text";
if(file) {
// Use only picture file names.
this.images[manifestItem.href] = {
type: manifestItem["media-type"],
text: await file.async(encoding)
@@ -252,11 +263,12 @@ function findNodeAndGetAttribute(rootNode,selectors,attributeName) {
return null;
}

/*
Find an XML node identified by a list of child tag names
rootNode: reference to root node
selectors: array of child tag names
*/
/**
* Find an XML node identified by a list of child tag names
* @param {*} rootNode reference to root node
* @param {*} selectors array of child tag names
* @returns {*} node
*/
function findNode(rootNode,selectors) {
let node = rootNode;
for(selector of selectors) {
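A minimal sketch of the parsing API involved in the dependency swap above (from the deprecated xmldom package to the maintained @xmldom/xmldom fork): the fork exposes the same DOMParser and XMLSerializer, so the reader's findNode-style traversal is unchanged. The container.xml snippet below is invented for illustration.

```js
// Sketch only: same API surface as the old xmldom package, now from the scoped fork.
const {DOMParser, XMLSerializer} = require("@xmldom/xmldom");

const containerXml = `<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
  <rootfiles>
    <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
  </rootfiles>
</container>`;

// Parse the XML, walk to a child node and read an attribute, much as
// findNodeAndGetAttribute() does in epub-reader.js
const doc = new DOMParser().parseFromString(containerXml, "text/xml");
const rootfile = doc.getElementsByTagName("rootfile")[0];
console.log(rootfile.getAttribute("full-path")); // "OEBPS/content.opf"
console.log(new XMLSerializer().serializeToString(rootfile));
```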
@@ -1,16 +1,17 @@
/*
This script is executed within the context of a web page loaded into Puppeteer to extract the text chunks and stylesheets from a page.

Returns a structure: {chunks: [], stylsheets: [text]}

Each chunk entry is: {nodes: [], anchorIds: [], href:} where nodes is a tree of objects representing DOM nodes and strings representing
text nodes, and anchorIds is an array of anchor IDs associated with each chunk

Each stylsheet entry is the text of the stylesheet

*/

exports.getPageText = function(win,doc) {
/**
* An instance of an HTML document containing the doc.location.href (URL_PREFIX + filename) of the document.
*
* To extract the text chunks and stylesheets from a page.
*
* Each chunk entry is: {nodes: [], anchorIds: [], href:} where nodes is a tree of objects representing DOM nodes and strings representing
* text nodes, and anchorIds is an array of anchor IDs associated with each chunk
*
* Each stylsheet entry is the text of the stylesheet
* @param {*} win The window instance that currently contains the DOM document
* @param {*} doc DOM model instance of the current document
* @returns Returns a structure: {chunks: [], stylsheets: [text]}
*/
exports.getPageStruct = function(win,doc) {
win = win || window;
doc = doc || document;

@@ -112,7 +113,7 @@ const nonBlankChunks = chunks.outputChunks.filter(chunk => {
return !(chunk.anchorIds.length === 0 && (chunk.nodes.length === 1) && (typeof (chunk.nodes[0]) === "string") && (!(/\S/.test(chunk.nodes[0]))));
})
// Get the expected test results if present
const domExpectedResults = document.getElementsByTagName("script")[0];
const domExpectedResults = doc.getElementsByTagName("script")[0];
var expectedResults;
if(domExpectedResults && domExpectedResults.id === "expectedResults") {
try {
@@ -127,7 +128,11 @@ return {
expectedResults: expectedResults
};

// Node iterator
/**
* Node iterator
* @param {*} e Document Element
* @param {*} options Options, the current main disableBlockProcessing?
*/
function visitNode(e,options) {
options = options || {};
var disableBlockProcessing = !!options.disableBlockProcessing;
@@ -156,6 +161,7 @@ function visitNode(e,options) {
parentListElement.private.count = count;
} else if(nodeInfo.tag === "img") {
if(e.hasAttribute("src")) {
// Only the file name is needed here.
nodeInfo.attributes.src = e.src.slice(URL_PREFIX.length);
}
if(e.hasAttribute("width")) {
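For reference, a purely illustrative example of the structure that getPageStruct returns, as documented in the JSDoc above; the values and the exact fields of the element-node objects are assumptions, not taken from this PR.

```js
// Illustrative only: shows the documented {chunks: [], stylsheets: [text]} shape.
// The element-node fields (tag, attributes) follow what visitNode() appears to build,
// but the concrete values here are invented.
const exampleResult = {
  chunks: [
    {
      // A chunk whose only node is a plain text string
      nodes: ["First paragraph of the chapter"],
      anchorIds: ["chap1"],
      href: "Text/chapter1.xhtml"
    },
    {
      // A chunk containing an element node; img src is stored without the URL prefix
      nodes: [{tag: "img", attributes: {src: "Images/cover.jpg"}}],
      anchorIds: [],
      href: "Text/chapter1.xhtml"
    }
  ],
  // Spelling of the property follows the source comments
  stylsheets: ["p { margin: 0 0 1em 0; }"]
};

console.log(exampleResult.chunks.length); // 2
```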
3 changes: 1 addition & 2 deletions epub2twpub/test.js
@@ -63,8 +63,7 @@ async function testPage(filepath) {
console.log("Text extractor error: " + msg)
}
});
await textExtractor.initialise();
// Get the text of the page
// Get the text of the page
const results = await textExtractor.getPageText("index.html");
// Flatten the nodes of the results
for(const chunk of results.chunks) {
84 changes: 36 additions & 48 deletions epub2twpub/text-extractor.js
@@ -1,58 +1,46 @@
/*
Class representing the Puppeteer-based wrapper for get-page-text.js
Class representing the jsdom wrapper for get-page-struct.js
*/

const playwright = require("playwright"),
{getPageText} = require("./injected/get-page-text");

const { JSDOM } = require("jsdom");
const { getPageStruct } = require("./get-page-struct");
const URL_PREFIX = "https://example.com/";

class TextExtractor {

/*
Options:
getFile: function(href) returns {type:, contents:}
logError: function(msg)
*/
constructor (options) {
this.getFile = options.getFile;
this.logError = options.logError;
}

async initialise() {
this.browser = await playwright.chromium.launch();
this.context = await this.browser.newContext({
javaScriptEnabled: false
});
this.page = await this.context.newPage();
await this.page.route("**/*",async (route) => {
const request = route.request();
if(request.method() === "GET" && request.url().startsWith(URL_PREFIX)) {
const fileHref = request.url().slice(URL_PREFIX.length);
const {type,contents} = await this.getFile(fileHref);
if(!type) {
this.logError(`Missing file \`${fileHref}\``);
route.fulfill({status: 404, contentType: "text/plain", body: "Not found!"});
} else {
route.fulfill({status: 200, contentType: type, body: contents});
}
} else {
route.abort();
}
});
}

async getPageText(href) {
// console.log("processing page",href)
const pageURL = URL_PREFIX + href;
await this.page.goto(pageURL,{waitUntil: "load"});
return await this.page.evaluate(getPageText);
}

async close() {
await this.page.close();
await this.browser.close();
}
/**
* Options:
* @param {*} options
* @field getFile: function(href) returns {type:, contents:}
* @field logError: function(msg)
*/
constructor(options) {
this.getFile = options.getFile;
this.logError = options.logError;
}


/**
* @description Reads the file contents via getFile, parses them with JSDOM to obtain a DOM, then passes the window and document to getPageStruct and returns the resulting structure.
* @param {string} href filename, e.g. Text/chapter82.xhtml; the first character must not be a '/'
* @returns Returns a structure: {chunks: [], stylsheets: [text]}
*/
async getPageText(href) {
const { type, contents } = await this.getFile(href);
if (!type) {
this.logError(`Missing file \`${href}\``);
return "";
} else {
var window = new JSDOM(contents, {
contentType: type,
url: URL_PREFIX + href,
runScripts: "dangerously"
}).window;
var document = window.document;
var result = getPageStruct(window, document);
}
return result;
}

}
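A rough usage sketch of the reworked, jsdom-based TextExtractor, assuming the require path and the fixture files; note that the Playwright initialise()/close() calls are no longer needed because there is no browser to launch.

```js
// Usage sketch only; the require path and fixture directory are assumptions.
const fs = require("fs").promises;
const {TextExtractor} = require("./epub2twpub/text-extractor");

async function main() {
  const extractor = new TextExtractor({
    // getFile(href) must resolve to {type:, contents:}; here we read from a local directory
    getFile: async (href) => ({
      type: "application/xhtml+xml",
      contents: await fs.readFile("./fixtures/" + href, "utf8")
    }),
    logError: (msg) => console.error("Text extractor error: " + msg)
  });
  // No initialise()/close() any more: jsdom parses the markup in-process
  const results = await extractor.getPageText("index.html");
  console.log(results.chunks.length + " chunks extracted");
}

main().catch(console.error);
```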
