-
Notifications
You must be signed in to change notification settings - Fork 3
/
w3c-xml2json.js
executable file
·76 lines (67 loc) · 2.35 KB
/
w3c-xml2json.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env node
/*
* Read in a huge mess of XML and output some nice-ish JSON, which we will
* later compact.
*/
var expat = require('node-expat'),
JSONStream = require('JSONStream'),
fs = require('fs');
// Command-line interface. Both options are optional: `-i/--input` names the
// XML file to read and `-o/--output` names the JSON file to write.
var argparse = require('optimist')
    // This script has no verbose mode, so the usage text only advertises the
    // two flags that are actually read from `args`.
    .usage('Usage: $0 [-o <outfile>] [-i <infile>]')
    .alias('o', 'output')
    .alias('i', 'input');
var args = argparse.argv;
// {{{ fixedFromCharCode(codePoint)
// https://developer.mozilla.org/en/JavaScript/Reference/Global_Objects/String/fromCharCode
/**
 * Build the string for a single Unicode code point.
 *
 * `String.fromCharCode` only understands 16-bit code units, so a code point
 * above 0xFFFF is split into a high/low surrogate pair first.
 *
 * @param {number} codePt - Code point to convert.
 * @returns {string} One code unit for values up to 0xFFFF, otherwise the
 *     two-unit surrogate pair.
 */
function fixedFromCharCode (codePt) {
    var needsSurrogates = codePt > 0xFFFF;
    if (!needsSurrogates) {
        return String.fromCharCode(codePt);
    }

    // Offset into the supplementary planes: the top bits select the high
    // surrogate, the low 10 bits select the low surrogate.
    var offset = codePt - 0x10000;
    var highSurrogate = 0xD800 + (offset >> 10);
    var lowSurrogate = 0xDC00 + (offset & 0x3FF);
    return String.fromCharCode(highSurrogate, lowSurrogate);
}
// }}}
// Parse XML
// Streaming pipeline: bytes read from `input` are fed to the expat parser, the
// SAX handlers below pick out the interesting characters, and each one is
// written to `jsonStream`, which serialises a single JSON object to `output`.
var parser = new expat.Parser("UTF-8"),
    output = args.output ? fs.createWriteStream(args.output) : process.stdout,
    input = args.input ? fs.createReadStream(args.input) : process.stdin,
    jsonStream = JSONStream.stringifyObject("{", ",\n", "}\n");
jsonStream.pipe(output);

// Parse!
// currentChar: record for the <character> being read, or null when we are
//              outside a <character> or the current one is not exported.
// currentText: text collected since the most recent start/end tag.
var currentChar = null,
    currentText = "";

parser.on('startElement', function (name, attrs) {
    // Text belongs to the innermost element, so start from a clean slate.
    currentText = "";

    if (name === 'character') {
        // Only characters whose `dec` is a single decimal number are exported.
        // Any other <character> clears the record so that its children cannot
        // be attached to (and re-emit) the previous character.
        currentChar = (attrs.dec && /^\d+$/.test(attrs.dec))
            ? { code: parseInt(attrs.dec, 10) } // Codepoint
            : null;
    } else if (name === 'entity' && currentChar &&
               attrs.set && /html/.test(attrs.set)) {
        // Entities without a `set` attribute are ignored instead of throwing.
        currentChar.altnames = currentChar.altnames || {};
        currentChar.altnames.html = attrs.id;
    }
});

// The parser may deliver one text node in several pieces, so accumulate them.
parser.on('text', function (text) { currentText += text; });

parser.on('endElement', function (name) {
    if (name === 'character') {
        // Skip chars that don't have any altnames
        if (currentChar && 'altnames' in currentChar) {
            jsonStream.write([fixedFromCharCode(currentChar.code), currentChar]);
        }
        currentChar = null;
    } else if (name === 'latex') {
        // Save the alternate name for LaTeX. Trim before testing so that
        // surrounding whitespace does not hide the leading backslash.
        var latexName = currentText.replace(/^\s+|\s+$/g, '');
        if (currentChar && latexName.charAt(0) === '\\') {
            currentChar.altnames = currentChar.altnames || {};
            currentChar.altnames.latex = latexName;
        }
    } else if (name === 'unicode') {
        // End of the root element: close the JSON object.
        jsonStream.end();
    }

    // Never let this element's text leak into a following sibling.
    currentText = "";
});

// Connect input to parser && start
input.on('data', function (data) {
    // parse() returns false on malformed XML; fail loudly rather than
    // silently emitting truncated, unterminated JSON.
    if (!parser.parse(data)) {
        throw new Error('XML parse error: ' + parser.getError());
    }
});
input.resume();