forked from github/docs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck-english-links.js
executable file
·215 lines (185 loc) · 7.55 KB
/
check-english-links.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
#!/usr/bin/env node
// [start-readme]
//
// This script runs once per day via a scheduled GitHub Action to check all links in
// English content, not including deprecated Enterprise Server content. It opens an issue
// if it finds broken links. To exclude a link path, add it to `lib/excluded-links.js`.
// Note that linkinator sometimes returns 429 and 503 errors for links that are not actually
// broken, so this script double-checks those using `got`.
//
// [end-readme]
import { fileURLToPath } from 'url'
import path from 'path'
import fs from 'fs'
import { LinkChecker } from 'linkinator'
import program from 'commander'
import { pull, uniq } from 'lodash-es'
import rimraf from 'rimraf'
import mkdirp from 'mkdirp'
import { deprecated } from '../lib/enterprise-server-releases.js'
import got from 'got'
import excludedLinks from '../lib/excluded-links.js'
import libLanguages from '../lib/languages.js'
// ES modules do not define `__dirname`, so derive it from this module's own URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url))
// Single linkinator checker instance; main() subscribes to its 'link' events and runs the scan with it.
const checker = new LinkChecker()
// Base URL of the site that gets crawled.
// NOTE(review): presumably a docs server that is already running locally on port 4000 — confirm.
const root = 'http://localhost:4000'
// English section under `root`; used as the default crawl starting point when no --path is given.
const englishRoot = `${root}/en`
// When using the peter-evans/create-issue-from-file Action to post an
// issue comment you might get an error like this:
//
// "body is too long (maximum is 65536 characters)"
//
// So we cap our output so that it does not exceed that length.
// This number doesn't have to be strictly less than the maximum possible,
// it just mustn't exceed the validation limit.
// Note, a little bit of room must be left for adding
// a note in the generated output about the excess.
// Can be overridden with the DISPLAY_MAX_LENGTH environment variable.
const DISPLAY_MAX_LENGTH = parseInt(process.env.DISPLAY_MAX_LENGTH || '30000', 10)

// Links with these codes may or may not really be broken.
// 'Invalid' is the placeholder main() assigns to broken links that have no status code.
const retryStatusCodes = [429, 503, 'Invalid']

// Value handed to linkinator as `concurrency`; can be overridden with the
// LINKINATOR_CONCURRENCY environment variable. The radix is passed explicitly
// so the value is always read as decimal, matching DISPLAY_MAX_LENGTH above.
const LINKINATOR_CONCURRENCY = parseInt(process.env.LINKINATOR_CONCURRENCY || '300', 10)
// Command-line interface. The builder calls are issued one at a time; each
// of them configures the same shared `program` object.
program.description('Check all links in the English docs.')
program.option(
  '-d, --dry-run',
  'Turn off recursion to get a fast minimal report (useful for previewing output).'
)
program.option(
  '-r, --do-not-retry',
  `Do not retry broken links with status codes ${retryStatusCodes.join(', ')}.`
)
program.option(
  '-p, --path <PATH>',
  `Provide an optional path to check. Best used with --dry-run. Default: ${englishRoot}`
)
program.parse(process.argv)

// Parsed command-line flags, read once and reused for the config below.
const cliOptions = program.opts()

// Non-English content is skipped: one pattern per language code other than `en`,
// each matching URLs that contain `<root>/<code>`.
const languagesToSkip = []
for (const code of Object.keys(libLanguages)) {
  if (code === 'en') continue
  languagesToSkip.push(new RegExp(`${root}/${code}`))
}

// Deprecated Enterprise content is skipped as well. A deprecated release
// number may follow either a slash (old format, e.g. `/enterprise/2.1/`) or an
// `@` sign (new format, `...@<release>/`), and must be followed by a slash or
// the end of the URL.
const enterpriseReleasesToSkip = new RegExp(`${root}.+?[/@](${deprecated.join('|')})(/|$)`)

// Everything linkinator should ignore. Strings are matched exactly and RegExp
// entries are tested against the href (see `linksToSkipFactory`).
const skipRules = [
  enterpriseReleasesToSkip,
  ...languagesToSkip,
  // Excluded links are defined in a separate file.
  ...excludedLinks,
  // Don't leak into the production site
  /https:\/\/docs\.github\.com/,
]

// Options handed to `checker.check()` in main().
const config = {
  path: cliOptions.path || englishRoot,
  concurrency: LINKINATOR_CONCURRENCY,
  // A dry run turns recursion off.
  recurse: !cliOptions.dryRun,
  silent: true,
  linksToSkip: linksToSkipFactory(skipRules),
}
/**
 * Return a function that can, as quickly as possible, check whether a certain
 * href should be skipped.
 *
 * We do this so we can use a `Set` lookup plus `Array.prototype.some()` for a
 * speedier check. The default implementation in Linkinator, when the
 * `linksToSkip` config is an array, turns every entry into a new regex again
 * for every single URL it checks.
 *
 * @param {Array<string|RegExp>} regexAndURLs - Strings are matched exactly;
 *   RegExp instances are tested against the href. Entries of any other type
 *   are ignored.
 * @returns {(href: string) => boolean} Predicate that is true when `href`
 *   should be skipped.
 */
function linksToSkipFactory(regexAndURLs) {
  const exactURLs = new Set(regexAndURLs.filter((entry) => typeof entry === 'string'))
  const regexes = regexAndURLs.filter((entry) => entry instanceof RegExp)
  return (href) => {
    if (exactURLs.has(href)) return true
    return regexes.some((regex) => {
      // A regex with the `g` or `y` flag remembers where its last match ended
      // and resumes from there, so the same href could be skipped on one call
      // and let through on the next. Always start matching from the beginning.
      regex.lastIndex = 0
      return regex.test(href)
    })
  }
}
// Entry point. `main` is a hoisted function declaration, so it can be called ahead of its
// definition below; it ends the process itself via process.exit() once the report is printed.
main()
/**
 * Run the link check from start to finish:
 *
 * 1. reset the log directory and append every checked link to the log file,
 * 2. collect the broken links from the scan,
 * 3. unless `--do-not-retry` was given, re-request the ones whose status is
 *    in `retryStatusCodes` and drop those that respond successfully,
 * 4. print the report and exit with code 0 (no broken links) or 1.
 */
async function main() {
  // Clear and recreate a directory for logs.
  const logFile = path.join(__dirname, '../.linkinator/full.log')
  const logDir = path.dirname(logFile)
  rimraf.sync(logDir)
  await mkdirp(logDir)

  // Append one JSON line to the logfile after each checked link.
  checker.on('link', (checkedLink) => {
    // We don't need to dump all of the HTTP and HTML details
    delete checkedLink.failureDetails
    fs.appendFileSync(logFile, `${JSON.stringify(checkedLink)}\n`)
  })

  // Start the scan; events will be logged as they occur.
  const { links: scannedLinks } = await checker.check(config)

  // Scan is complete! Keep only the broken links.
  const brokenLinks = []
  for (const link of scannedLinks) {
    if (link.state !== 'BROKEN') continue
    // Give links without a status code the `Invalid` placeholder so that
    // they show up with a useful label in the output.
    link.status = link.status || 'Invalid'
    brokenLinks.push(link)
  }

  // It's OK to console.warn because that goes to stderr.
  console.warn(`${brokenLinks.length} broken links in total (before retry)`)

  const shouldRetry = !program.opts().doNotRetry
  if (shouldRetry) {
    // Links to retry individually.
    const linksToRetry = brokenLinks.filter(({ status }) => retryStatusCodes.includes(status))

    // It's OK to console.warn because that goes to stderr.
    console.warn(`${linksToRetry.length} links to retry`)

    const retryLink = async (link) => {
      try {
        // got throws an HTTPError if response code is not 2xx or 3xx.
        await got(link.url)
        // The request succeeded, so the link is not broken after all.
        pull(brokenLinks, link)
      } catch (err) {
        // The retry failed too; the link simply stays in the broken list.
      }
    }
    await Promise.all(linksToRetry.map((link) => retryLink(link)))
  }

  // Exit successfully if no broken links!
  if (brokenLinks.length === 0) {
    console.log('All links are good!')
    process.exit(0)
  }

  // Format and display the results.
  const falsePositiveHint =
    '\nIf links are "false positives" (e.g. can only be opened by a browser) ' +
    'consider making a pull request that edits `lib/excluded-links.js`.'
  console.log(`${brokenLinks.length} broken links found on ${root}\n`)
  console.log(getDisplayBrokenLinks(brokenLinks, DISPLAY_MAX_LENGTH))
  console.log(falsePositiveHint)

  // Exit unsuccessfully if broken links are found.
  process.exit(1)
}
/**
 * Render the broken links as Markdown, one section per status code.
 *
 * Sections appear in the order in which each status code is first seen in
 * `brokenLinks`. Every link is printed as pretty-printed JSON (without its
 * `failureDetails`) inside a fenced block. Once adding another link would
 * push the output past `maxLength`, that link is counted instead of printed,
 * and a note with the count is appended after the section's closing fence.
 *
 * @param {object[]} brokenLinks - Link results. Only `status` and
 *   `failureDetails` are accessed by name; a falsy `status` is grouped under
 *   `Invalid`. The objects themselves are not modified.
 * @param {number} maxLength - Output length beyond which further links are
 *   skipped. Section headings, fences and the skip note are not checked
 *   against it, so the result can end up somewhat longer.
 * @returns {string} The report, or an empty string when there are no links.
 */
function getDisplayBrokenLinks(brokenLinks, maxLength) {
  // Group the links by the status they are displayed under, in one pass.
  // The same coercion is used for the heading and for the grouping, so links
  // without a status code really end up in the `Invalid` section.
  const linksByStatus = new Map()
  for (const link of brokenLinks) {
    const statusCode = link.status || 'Invalid'
    const group = linksByStatus.get(statusCode)
    if (group) {
      group.push(link)
    } else {
      linksByStatus.set(statusCode, [link])
    }
  }

  let output = ''
  for (const [statusCode, linksForStatus] of linksByStatus) {
    output += `## Status ${statusCode}: Found ${linksForStatus.length} broken links\n\n`
    output += '```\n'
    let exceededDisplayLimit = 0
    for (const link of linksForStatus) {
      // We don't need to dump all of the HTTP and HTML details. Drop them
      // from a shallow copy so the caller's objects are left untouched.
      const displayLink = { ...link }
      delete displayLink.failureDetails
      const line = JSON.stringify(displayLink, null, 2)
      if (output.length + line.length > maxLength) {
        exceededDisplayLimit++
        continue
      }
      output += `${line}\n`
    }
    output += '```\n'
    if (exceededDisplayLimit > 0) {
      output += [
        '\n(🎵! Because the comment is already big,',
        `we skipped ${exceededDisplayLimit} additional broken links.`,
        'It is unlikely that these are real broken links. More likely',
        'they are false positives due to a server-related issue that',
        'needs investigating.)\n',
      ].join('\n')
    }
  }
  return output
}