-
Notifications
You must be signed in to change notification settings - Fork 15
/
make4ht-indexing.lua
294 lines (266 loc) · 9.78 KB
/
make4ht-indexing.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
local M = {}
local log = logging.new "indexing"
-- Handle accented characters in files created with \usepackage[utf]{inputenc}
-- this code was originally part of https://github.com/michal-h21/iec2utf/
local enc = {}
local licrs = {}
local codepoint2utf = unicode.utf8.char
local used_encodings = {}
-- load inputenc encoding file
local function load_encfiles(f)
local file= io.open(f,"r")
local encodings = file:read("*all")
file:close()
for codepoint, licr in encodings:gmatch('DeclareUnicodeCharacter(%b{})(%b{})') do
local codepoint = codepoint2utf(tonumber(codepoint:sub(2,-2),16))
local licr= licr:sub(2,-2):gsub('@tabacckludge','')
licrs[licr] = codepoint
end
end
local function sanitize_licr(l)
return l:gsub(" (.)",function(s) if s:match("[%a]") then return " "..s else return s end end):sub(2,-2)
end
local load_enc = function(enc)
-- use default encodings if used doesn't provide one
enc = enc or {"T1","T2A","T2B","T2C","T3","T5", "LGR"}
for _,e in pairs(enc) do
local filename = e:lower() .. "enc.dfu"
-- don't process an enc file multiple times
if not used_encodings[filename] then
local dfufile = kpse.find_file(filename)
if dfufile then
load_encfiles(dfufile)
end
end
used_encodings[filename] = true
end
end
local cache = {}
local get_utf8 = function(input)
local output = input:gsub('\\IeC[%s]*(%b{})',function(iec)
-- remove \protect commands
local iec = iec:gsub("\\protect%s*", "")
local code = cache[iec] or licrs[sanitize_licr(iec)] or '\\IeC '..iec
-- print(iec, code)
cache[iec] = code
return code
end)
return output
end
-- parse the idx file produced by tex4ht
-- it replaces the document page numbers by index entry number
-- each index entry can then link to place in the HTML file where the
-- \index command had been used
local parse_idx = function(content)
-- index entry number
local current_entry = 0
-- map between index entry number and corresponding HTML file and destination
local map = {}
local buffer = {}
for line in content:gmatch("([^\n]+)") do
if line:match("^\\beforeentry") then
-- increment index entry number
current_entry = current_entry + 1
local file, dest, locator = line:match("\\beforeentry%s*{(.-)}{(.-)}{(.-)}")
-- if the third argument to \beforeentry is not empty,
-- use it as a index entry locator instead of the index counter
if locator and locator == "" then locator = nil end
map[current_entry] = {file = file, dest = dest, locator = locator}
elseif line:match("^\\indexentry") then
-- replace the page number with the current
-- index entry number
local result = line:gsub("%b{}$", "{"..current_entry .."}")
buffer[#buffer+1] = get_utf8(result)
else
buffer[#buffer+1] = line
end
end
-- return table with page to dest map and updated idx file
return {map = map, idx = table.concat(buffer, "\n")}
end
local previous
-- replace numbers in .ind file with links back to text
local function replace_index_pages(rest, entries)
-- keep track of the previous page number
local count = 0
local delete_coma = false
return rest:gsub("(%s*%-*%s*)(,?%s*)(%{?)(%d+)(%}?)", function(dash, coma, lbrace, page, rbrace)
local entry = entries[tonumber(page)]
count = count + 1
if entry then
page = entry.locator or page
if delete_coma then
-- if the coma was marked for deletion, remove it. this may happen after line breaks in the index
coma = ""
end
-- if the page number is the same as the previous one, don't create a link
-- this can happen when we use section numbers as locators. for example,
-- we could get 1.1 -- 1.1, 1.1, so we want to keep only the first one
if page == previous then
previous = page
-- if the first page number on a line is the same as the previous one, we need to delete the coma,
-- otherwise the coma will be left in the output
if count == 1 then
delete_coma = true
end
return ""
else
previous = page
-- don't forget to reset the delete_coma flag after page change
delete_coma = false
-- construct link to the index entry
return dash .. coma.. lbrace .. "\\Link[" .. entry.file .."]{".. entry.dest .."}{}" .. page .."\\EndLink{}" .. rbrace
end
else
return dash .. coma .. lbrace .. page .. rbrace
end
end)
end
-- replace page numbers in the ind file with hyperlinks
local fix_idx_pages = function(content, idxobj)
local buffer = {}
local entries = idxobj.map
for line in content:gmatch("([^\n]+)") do
local line, count = line:gsub("(%s*\\%a+.-%,)(.+)$", function(start,rest)
-- reset the previous page number
previous = nil
-- there is a problem when index term itself contains numbers, like Bible verses (1:2),
-- because they will be detected as page numbers too. I cannot find a good solution
-- that wouldn't break something else.
-- There can be also commands with numbers in braces. These numbers in braces will be ignored,
-- as they may be not page numbers
return start .. replace_index_pages(rest, entries) end)
-- longer index entries may be broken over several lines, in that case, we need to process only numbers
if count == 0 then
line = line:gsub("(%s*%d+.+)", function(rest)
return replace_index_pages(rest, entries)
end)
end
buffer[#buffer+1] = line
end
return table.concat(buffer, "\n")
end
-- prepare the .idx file produced by tex4ht
-- for use with Xindy or Makeindex
local prepare_idx = function(filename)
local f = io.open(filename, "r")
if not f then return nil, "Cannot open file :".. tostring(filename) end
local content = f:read("*all")
local idx = parse_idx(content)
local idxname = os.tmpname()
local f = io.open(idxname, "w")
f:write(idx.idx)
f:close()
-- return the object with mapping between dummy page numbers
-- and link destinations in the files, and the temporary .idx file
-- these can be used for the processing with the index processor
return idx, idxname
end
-- add links to a index file
local process_index = function(indname, idx)
local f = io.open(indname, "r")
if not f then return nil, "Cannot open .ind file: " .. tostring(indname) end
local content = f:read("*all")
f:close()
local newcontent = fix_idx_pages(content, idx)
local f = io.open(indname,"w")
f:write(newcontent)
f:close()
return true
end
local get_idxname = function(par)
return par.idxfile or par.input .. ".idx"
end
local prepare_tmp_idx = function(par)
par.idxfile = mkutils.file_in_builddir(get_idxname(par), par)
if not par.idxfile or not mkutils.file_exists(par.idxfile) then return nil, "Cannot load idx file " .. (par.idxfile or "''") end
-- construct the .ind name, based on the .idx name
par.indfile = par.indfile or par.idxfile:gsub("idx$", "ind")
load_enc()
-- save hyperlinks and clean the .idx file
local idxdata, newidxfile = prepare_idx(par.idxfile)
if not idxdata then
-- if the prepare_idx function returns nil, the second reuturned value contains error msg
return nil, newidxfile
end
return newidxfile, idxdata
end
local splitindex = function(par)
local files = {}
local idxfiles = {}
local buffer
local idxfile = get_idxname(par)
if not idxfile or not mkutils.file_exists(idxfile) then return nil, "Cannot load idx file " .. (idxfile or "''") end
for line in io.lines(idxfile) do
local file = line:match("indexentry%[(.-)%]")
if file then
-- generate idx name for the current output file
file = par.input .. "-" ..file .. ".idx"
local current = files[file] or {}
-- remove file name from the index entry
local indexentry = line:gsub("indexentry%[.-%]", "indexentry")
-- save the index entry and preseding line to the current buffer
table.insert(current, buffer)
table.insert(current, indexentry)
files[file] = current
end
--
buffer = line
end
-- save idx files
for filename, contents in pairs(files) do
log:info("Saving split index file: " .. filename)
idxfiles[#idxfiles+1] = filename
local f = io.open(filename, "w")
f:write(table.concat(contents, "\n"))
f:close()
end
return idxfiles
end
local function run_indexing_command (command, par)
-- detect command name from the command. It will be the first word
local cmd_name = command:match("^[%a]+") or "indexing"
local xindylog = logging.new(cmd_name)
-- support split index
local subindexes = splitindex(par) or {}
if #subindexes > 0 then
-- call the command again on all files produced by splitindex
for _, subindex in ipairs(subindexes) do
-- make copy of the parameters
local t = {}
for k,v in pairs(par) do t[k] = v end
t.idxfile = subindex
run_indexing_command(command, t)
end
return nil
end
local newidxfile, idxdata = prepare_tmp_idx(par)
if not newidxfile then
-- the idxdata will contain error message in the case of error
xindylog:warning(idxdata)
return false
end
par.newidxfile = newidxfile
xindylog:debug("Prepared temporary idx file: ", newidxfile)
-- prepare modules
local xindy_call = command % par
xindylog:info(xindy_call)
local status = mkutils.execute(xindy_call)
-- insert correct links to the index
local status, msg = process_index(par.indfile, idxdata)
if not status then xindylog:warning(msg) end
-- remove the temporary idx file
os.remove(newidxfile)
-- null the indfile, it is necessary in order to support
-- multiple indices
par.indfile = nil
end
M.get_utf8 = get_utf8
M.load_enc = load_enc
M.parse_idx = parse_idx
M.fix_idx_pages = fix_idx_pages
M.prepare_idx = prepare_idx
M.process_index = process_index
M.prepare_tmp_idx = prepare_tmp_idx
M.run_indexing_command = run_indexing_command
return M