-
Notifications
You must be signed in to change notification settings - Fork 2
/
clean-html.ts
124 lines (109 loc) · 3.01 KB
/
clean-html.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import { Readability, JSDOMParser } from "readability-node"
import { DOMParser, XMLSerializer } from "xmldom-silent"
import UrlParser from "url-parse"
import sanitizeHtml from "sanitize-html"
import { allowedTags, nonTextTags } from "./clean-html-css"
import { fetchHtml } from "./fetch"
/**
 * Shape of the article object that `cleanHtml` resolves with, i.e. the value
 * returned by `Readability#parse()`.
 *
 * NOTE(review): the per-field meanings below are assumed from the field names
 * and from Readability's usual output — confirm against the library.
 */
export interface ReaderObject {
// presumably the length of the extracted article text — TODO confirm
length?: number
// presumably the extracted article markup — TODO confirm
content?: string
// presumably a short summary of the article — TODO confirm
excerpt?: string
// presumably the article title — TODO confirm
title?: string
// NOTE(review): typed as boolean | null here; a byline is usually author text,
// so verify this type against what Readability actually returns.
byline: boolean | null
// presumably the text direction of the content — TODO confirm
dir: string | undefined
// Same shape as the descriptor built by `createReadabilityUrl` in this file.
uri?: {
// full URL (the parsed URL's href)
spec: string
// host portion of the URL
host: string
// protocol with its last character removed
scheme: string
// `${protocol}//${host}`
prePath: string
// prePath followed by the pathname up to and including its last "/"
pathBase: string
}
}
/**
 * Optional extras for `cleanHtml`. Each list is appended to the matching
 * default list imported from "./clean-html-css" before the combined lists are
 * passed to `sanitizeHtml`.
 */
export interface Config {
// extra entries appended to the default `allowedTags` sanitizer option
allowedTags?: string[]
// extra entries appended to the default `nonTextTags` sanitizer option
nonTextTags?: string[]
}
/**
 * Parses an HTML string with `DOMParser` (mime type "text/html") and
 * serializes the resulting document back to a string with `XMLSerializer`.
 *
 * Any exception raised along the way is logged with `console.error` instead
 * of being propagated.
 *
 * @param html - Markup to convert.
 * @returns The serialized document, or `undefined` when parsing or
 *   serializing threw.
 */
function convertHtmlToXhtml(html: string) {
  // Only "error"-level parser reports abort the conversion; every other level
  // is ignored.
  // NOTE(review): keep exactly two declared parameters — the parser library
  // may inspect the callback's arity; confirm before changing the signature.
  const abortOnError = (level: string, msg: string) => {
    if (level !== "error") {
      return
    }
    throw new Error(`Unable to convert HTML to XHTML: ${msg}`)
  }

  try {
    const serializer = new XMLSerializer()
    const parser = new DOMParser({ errorHandler: abortOnError })
    const parsedDocument = parser.parseFromString(html, "text/html")
    return serializer.serializeToString(parsedDocument)
  } catch (e) {
    console.error(e)
    return undefined
  }
}
/**
 * Feeds trimmed XHTML to a fresh `JSDOMParser` and hands back the document it
 * built.
 *
 * A non-empty `errorState` on the parser is treated as a failure. Failures
 * (including any exception thrown while parsing) are logged with
 * `console.error` instead of being propagated.
 *
 * @param xhtml - XHTML source; surrounding whitespace is trimmed before parsing.
 * @returns The parser's `doc`, or `undefined` when parsing failed.
 */
function createJsDomDocument(xhtml: string) {
  try {
    const parser = new JSDOMParser()
    parser.parse(xhtml.trim())

    const failure = parser.errorState
    if (!failure) {
      return parser.doc
    }
    throw new Error(`Unable to parse XHTML into JsDom ${failure}`)
  } catch (e) {
    console.error(e)
    return undefined
  }
}
/**
 * Builds the URI descriptor that is handed to `Readability` from a source URL.
 *
 * @param sourceUrl - URL of the page the HTML came from.
 * @returns An object with:
 *   - `spec`: the parsed URL's href,
 *   - `host`: its host,
 *   - `scheme`: its protocol with the last character removed,
 *   - `prePath`: `${protocol}//${host}`,
 *   - `pathBase`: `prePath` plus the pathname up to and including its last "/".
 * @throws Error when the parsed URL has an empty host.
 */
function createReadabilityUrl(sourceUrl: string) {
  const parsed = new UrlParser(sourceUrl)
  const hasHost = Boolean(parsed) && parsed.host.length !== 0
  if (!hasHost) {
    throw new Error("Invalid or no source url provided")
  }

  const { href, host, protocol, pathname } = parsed
  const origin = `${protocol}//${host}`
  // Keep the "directory" part of the path: everything through the final "/".
  const directory = pathname.substring(0, pathname.lastIndexOf("/") + 1)

  return {
    spec: href,
    host,
    scheme: protocol.slice(0, -1),
    prePath: origin,
    pathBase: `${origin}${directory}`
  }
}
/**
 * Sanitizes an HTML document and extracts its readable content with
 * `Readability`.
 *
 * When `html` is empty and `sourceUrl` is set, the markup is first fetched
 * with `fetchHtml(sourceUrl)`; a fetch failure is logged and treated the same
 * as "no html provided".
 *
 * @param html - Markup to clean. May be empty when `sourceUrl` should be
 *   fetched instead.
 * @param sourceUrl - URL the markup came from; it is turned into the URI
 *   descriptor given to `Readability` and must parse to a URL with a host.
 * @param config - Extra tag names appended to the default `allowedTags` /
 *   `nonTextTags` lists passed to `sanitizeHtml`.
 * @returns The value produced by `Readability#parse()` for the sanitized
 *   document.
 * @throws Error with the message "Unable to clean HTML an issue occured"
 *   (surfaced as a rejected promise) when no markup is available, the source
 *   URL is invalid, or any conversion/parsing step fails. The underlying
 *   cause is logged with `console.error` before the generic error is thrown.
 */
async function cleanHtml(
  html: string,
  sourceUrl: string,
  config: Config = { allowedTags: [], nonTextTags: [] }
): Promise<ReaderObject> {
  let markup = html

  // Best effort: a failed fetch leaves `markup` empty and is reported below.
  if (!markup && sourceUrl) {
    try {
      markup = await fetchHtml(sourceUrl)
    } catch (e) {
      console.error(e)
    }
  }

  try {
    // Validate before sanitizing so the sanitizer never sees empty input.
    if (!markup) {
      throw new Error(
        "Invalid url or no html provided, please use a html string or url"
      )
    }

    // `||` rather than `??`: any falsy value (not only null/undefined) means
    // "no extra tags", matching the previous truthiness check.
    const extraAllowedTags = config?.allowedTags || []
    const extraNonTextTags = config?.nonTextTags || []
    const sanitized = sanitizeHtml(markup, {
      allowedTags: [...allowedTags, ...extraAllowedTags],
      nonTextTags: [...nonTextTags, ...extraNonTextTags]
    })

    // Sanitizing can strip everything; that is still "no html".
    if (!sanitized) {
      throw new Error(
        "Invalid url or no html provided, please use a html string or url"
      )
    }

    const readabilityUrl = createReadabilityUrl(sourceUrl)

    // Both helpers signal failure by returning undefined (they log their own
    // cause); stop here instead of feeding undefined to the next stage.
    const xhtml = convertHtmlToXhtml(sanitized)
    if (xhtml === undefined) {
      throw new Error("Unable to convert sanitized HTML to XHTML")
    }
    const jsDomDocument = createJsDomDocument(xhtml)
    if (jsDomDocument === undefined) {
      throw new Error("Unable to build a document from the XHTML")
    }

    return new Readability(readabilityUrl, jsDomDocument).parse()
  } catch (error) {
    // Keep the public error message stable, but do not lose the real cause.
    console.error(error)
    throw new Error("Unable to clean HTML an issue occured")
  }
}
export default cleanHtml