forked from j-mendez/clean-html-js
-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean-html.ts
129 lines (114 loc) · 3.2 KB
/
clean-html.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import { Readability, JSDOMParser } from "readability-node"
import { DOMParser, XMLSerializer } from "xmldom-silent"
import UrlParser from "url-parse"
import sanitizeHtml, {AllowedAttribute} from "sanitize-html"
import {allowedAttributes, allowedTags, nonTextTags} from "./clean-html-css"
import { fetchHtml } from "./fetch"
/**
 * Shape of the article object that `cleanHtml` resolves with (the value
 * returned by `Readability#parse`).
 *
 * NOTE(review): the meanings of the individual fields are inferred from their
 * names; confirm them against readability-node before relying on them.
 */
export interface ReaderObject {
  /** Presumably the size of the extracted article text; unit to be confirmed. */
  length?: number
  /** Presumably the extracted article markup. */
  content?: string
  /** Presumably a short summary of the article. */
  excerpt?: string
  /** Presumably the article title. */
  title?: string
  /**
   * NOTE(review): declared as `boolean | null`; verify that this matches what
   * Readability actually puts in `byline`.
   */
  byline: boolean | null
  /** Presumably the text direction of the document, when known. */
  dir: string | undefined
  /**
   * URL details of the source document. The field set is the same as the
   * object built by `createReadabilityUrl` in this file.
   */
  uri?: {
    /** Full source URL. */
    spec: string
    /** Host part of the source URL. */
    host: string
    /** Protocol of the source URL without its trailing character. */
    scheme: string
    /** Protocol and host joined by `//`. */
    prePath: string
    /** `prePath` followed by the path up to and including its last `/`. */
    pathBase: string
  }
}
/**
 * Optional overrides accepted by `cleanHtml`. Every field is combined with
 * the defaults imported from "./clean-html-css" before the markup is
 * sanitized.
 */
export interface Config {
  /** Extra tag names appended to the default allowed tags. */
  allowedTags?: string[]
  /** Extra tag names appended to the default non-text tags. */
  nonTextTags?: string[]
  /**
   * Attribute rules spread over the default allowed attributes; an entry here
   * replaces the default entry that has the same key.
   */
  allowedAttributes?: Record<string, AllowedAttribute[]>
}
/**
 * Parses an HTML string with `DOMParser` (as "text/html") and serializes the
 * resulting document back to a string with `XMLSerializer`.
 *
 * Parser reports at the "error" level are turned into exceptions; reports at
 * any other level are ignored. Every exception raised in here is logged with
 * `console.error` and swallowed.
 *
 * @param html - Markup to convert.
 * @returns The serialized document, or `undefined` when the conversion failed.
 */
function convertHtmlToXhtml(html: string) {
  try {
    const serializer = new XMLSerializer()
    const parser = new DOMParser({
      // Only "error"-level reports abort the conversion.
      errorHandler: (level, msg) => {
        if (level !== "error") {
          return
        }
        throw new Error(`Unable to convert HTML to XHTML: ${msg}`)
      }
    })
    const parsedDocument = parser.parseFromString(html, "text/html")
    return serializer.serializeToString(parsedDocument)
  } catch (e) {
    // Failure is reported on the console; the caller receives `undefined`.
    console.error(e)
  }
}
/**
 * Feeds a trimmed XHTML string to `JSDOMParser` and returns the document the
 * parser produced.
 *
 * If the parser reports an `errorState`, or anything throws along the way, the
 * error is logged with `console.error` and swallowed.
 *
 * @param xhtml - XHTML markup to parse.
 * @returns The parser's `doc`, or `undefined` when parsing failed.
 */
function createJsDomDocument(xhtml: string) {
  try {
    const parser = new JSDOMParser()
    parser.parse(xhtml.trim())
    const failure = parser.errorState
    if (!failure) {
      return parser.doc
    }
    throw new Error(`Unable to parse XHTML into JsDom ${failure}`)
  } catch (e) {
    // Failure is reported on the console; the caller receives `undefined`.
    console.error(e)
  }
}
/**
 * Builds the URL descriptor that is passed to `Readability` from a source URL.
 *
 * @param sourceUrl - URL of the document being cleaned.
 * @returns An object with `spec` (full href), `host`, `scheme` (protocol
 *   without its last character), `prePath` (protocol + `//` + host) and
 *   `pathBase` (`prePath` plus the path up to and including its last `/`).
 * @throws Error when the parsed URL has an empty host.
 */
function createReadabilityUrl(sourceUrl: string) {
  const parsed = new UrlParser(sourceUrl)
  const hasHost = Boolean(parsed) && parsed.host.length > 0
  if (!hasHost) {
    throw new Error("Invalid or no source url provided")
  }

  const { href, host, protocol, pathname } = parsed
  const prePath = `${protocol}//${host}`
  // Keep everything up to and including the final "/" of the path.
  const directory = pathname.substring(0, pathname.lastIndexOf("/") + 1)

  return {
    spec: href,
    host,
    scheme: protocol.slice(0, -1),
    prePath,
    pathBase: `${prePath}${directory}`
  }
}
/**
 * Sanitizes an HTML document and runs it through Readability to extract the
 * readable article.
 *
 * Steps: when `html` is empty and `sourceUrl` is set, the markup is fetched
 * from `sourceUrl`; the markup is sanitized with the default tag/attribute
 * lists plus any extras from `config`; the result is converted to XHTML,
 * parsed into a JSDOMParser document and handed to `Readability#parse`.
 *
 * @param html - Markup to clean. Pass an empty string to have it fetched from
 *   `sourceUrl`.
 * @param sourceUrl - URL the markup belongs to; it must parse to a non-empty
 *   host.
 * @param config - Extra tags / attributes merged with the defaults.
 * @returns The value returned by `Readability#parse`.
 * @throws Error whose message starts with
 *   "Unable to clean HTML an issue occured", followed by the underlying
 *   reason (missing HTML, invalid source URL, conversion or parsing failure).
 */
async function cleanHtml(
  html: string,
  sourceUrl: string,
  config: Config = { allowedTags: [], nonTextTags: [] }
): Promise<ReaderObject> {
  const missingHtmlMessage =
    "Invalid url or no html provided, please use a html string or url"

  let markup = html
  if (!markup && sourceUrl) {
    try {
      markup = await fetchHtml(sourceUrl)
    } catch (e) {
      // Best effort: a failed download is logged here and reported below as
      // missing HTML.
      console.error(e)
    }
  }

  try {
    // Fail before sanitizing so the sanitizer never sees an empty value.
    if (!markup) {
      throw new Error(missingHtmlMessage)
    }

    const sanitized = sanitizeHtml(markup, {
      allowedTags: [...allowedTags, ...(config?.allowedTags ?? [])],
      nonTextTags: [...nonTextTags, ...(config?.nonTextTags ?? [])],
      allowedAttributes: {
        ...allowedAttributes,
        ...(config?.allowedAttributes ?? {})
      }
    })
    // Sanitizing may strip everything; treat that like missing HTML.
    if (!sanitized) {
      throw new Error(missingHtmlMessage)
    }

    const readabilityUrl = createReadabilityUrl(sourceUrl)

    // Both helpers log their own failure and return undefined; stop here
    // instead of passing undefined further down the pipeline.
    const xhtml = convertHtmlToXhtml(sanitized)
    if (xhtml == null) {
      throw new Error("Unable to convert HTML to XHTML")
    }
    const jsDomDocument = createJsDomDocument(xhtml)
    if (jsDomDocument == null) {
      throw new Error("Unable to parse XHTML into JsDom")
    }

    return new Readability(readabilityUrl, jsDomDocument).parse()
  } catch (error) {
    // Keep the original message prefix and append the cause so callers can
    // tell what actually went wrong.
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Unable to clean HTML an issue occured: ${reason}`)
  }
}
export default cleanHtml