-
-
Notifications
You must be signed in to change notification settings - Fork 86
/
Copy pathRule.js
192 lines (176 loc) · 6.24 KB
/
Rule.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/**
* Rule: class that can be used to define and process data extraction rules, while parsing a PDF document.
* @author Adrien Joly, http://github.com/adrienjoly
* This content is released under the MIT License.
**/
import { log as LOG } from "./lib/LOG.js";
import { parseColumns } from "./lib/parseColumns.js";
import { parseTable } from "./lib/parseTable.js";
/**
 * Rule: associates a regular expression with a data extraction strategy.
 *
 * Every accumulator registered in Rule.accumulators at construction time is
 * exposed as a chainable method of the new Rule object. Calling one of these
 * methods does not build the accumulator: it only records which builder was
 * selected, and the arguments it was selected with.
 *
 * @param {RegExp} regexp - a regular expression which a PDF item's text must match in order to execute that rule.
 **/
export function Rule(regexp) {
  const rule = this;
  rule.regexp = regexp;
  // expose one selector method per registered accumulator
  for (const accumulatorName of Object.keys(Rule.accumulators)) {
    rule[accumulatorName] = function () {
      LOG("building rule:", regexp, "->", accumulatorName);
      rule.accumulatorBuilder = Rule.accumulators[accumulatorName];
      rule.accumulatorParams = arguments; // forwarded as is to the builder
      rule.methodName = accumulatorName;
      return rule;
    };
  }
}
// Shortcut for defining Rule objects in a more concise manner:
// Rule.on(regexp) is equivalent to `new Rule(regexp)`.
Rule.on = function (regexp) {
  const rule = new Rule(regexp);
  return rule;
};
// Same as `new Rule(regexp)`, with the skipCurrentItem flag of the new Rule set to true.
Rule.after = function (regexp) {
  return Object.assign(new Rule(regexp), { skipCurrentItem: true });
};
/**
 * then(): registers a function to be called after this Rule's accumulator has finished processing items.
 *
 * @param {Function} fct - called with the Rule object as `this`, and the
 *   Rule's output (this.output) as first parameter.
 * @returns {Rule} this Rule object, so that calls can be chained.
 **/
Rule.prototype.then = function (fct) {
  // arrow function: `this` still designates the Rule, however terminate() gets called
  this.terminate = () => {
    fct.call(this, this.output);
  };
  return this;
};
/**
 * test(): private function that checks a PDF item against the Rule's regexp,
 * and returns the corresponding accumulator.
 *
 * On a match, the item is stored in this.currentItem. The accumulator is built
 * lazily: the selected builder is only called on the first match, with the
 * Rule as `this` and the arguments recorded when the accumulator was selected.
 * The accumulator is then tagged with the Rule's methodName and terminate handler.
 *
 * @param {{text: string}} item - PDF item whose text is tested.
 * @returns {Function|undefined} the Rule's accumulator if the item matches
 *   (undefined if no accumulator was selected for this Rule), undefined otherwise.
 **/
Rule.prototype.test = function (item) {
  const { regexp } = this;
  // A regexp with the g or y flag resumes matching from its lastIndex, and keeps
  // it after a successful test(). Always start from the beginning of the text...
  const isStateful = regexp.global || regexp.sticky;
  if (isStateful) regexp.lastIndex = 0;
  const isMatching = regexp.test(item.text);
  // ...and leave the regexp clean for accumulator builders that run it again.
  if (isStateful) regexp.lastIndex = 0;
  if (!isMatching) return undefined;

  this.currentItem = item;
  // lazy init of accumulators: build and init the accumulator on first match
  if (!this.accumulatorImpl && this.accumulatorBuilder) {
    this.accumulatorImpl = this.accumulatorBuilder.apply(
      this,
      this.accumulatorParams
    );
    this.accumulatorImpl.methodName = this.methodName;
    this.accumulatorImpl.terminate = this.terminate;
  }
  return this.accumulatorImpl;
};
/**
 * whenDone(): intended to be run from an accumulator, in order to process its
 * output before the termination handler registered with then() is called.
 *
 * The current terminate handler (if any) is wrapped: fct runs first, then the
 * previous handler.
 *
 * @param {Function} fct - called with the Rule object as `this`, without parameters.
 **/
Rule.prototype.whenDone = function (fct) {
  const previousTerminate = this.terminate;
  this.terminate = () => {
    fct.call(this);
    // then() may never have been called on this Rule: nothing to chain in that case
    if (typeof previousTerminate === "function") previousTerminate();
  };
};
/**
 * makeItemProcessor(): builds the function to be called for each item by the PdfReader.
 *
 * Rules are tried in order on each item that has text. The first rule that
 * returns an accumulator terminates the previous accumulator, becomes the
 * current one, and is removed from `rules` (the caller's collection is
 * mutated): each rule can only be executed once. The matching item itself is
 * not passed to the accumulator.
 *
 * Items that match no rule are passed to the current accumulator. Rules are
 * not applied to the next item when the accumulator returned a truthy value,
 * nor to the item that follows a match of a rule whose skipCurrentItem flag is set.
 *
 * Calling the returned function with a falsy item (end of file) terminates the
 * current accumulator. Items without text are ignored.
 *
 * @param {Rule[]} rules - Rule objects that will be executed one-by-one, whenever a PDF item matches a rule.
 * @returns {Function} function (item) to be called for each item.
 **/
Rule.makeItemProcessor = function (rules) {
  let currentAccumulator = null;
  let applyRulesOnNextItem = true;

  // calls the terminate handler of the current accumulator, if it has one
  function terminateAccumulator() {
    const terminatePreviousAcc = (currentAccumulator || {}).terminate;
    if (terminatePreviousAcc) {
      LOG("terminating accumulator:", currentAccumulator.methodName);
      terminatePreviousAcc(currentAccumulator); // TODO: remove currentAccumulator parameter
    }
  }

  return function (item) {
    if (!item) {
      // last item of the file => flush buffers
      return terminateAccumulator();
    }
    if (!item.text) return;

    if (applyRulesOnNextItem) {
      // Object.keys() only lists own properties, and skips the holes left by
      // `delete`: unlike for...in, enumerable properties inherited from an
      // extended prototype are never mistaken for rules.
      for (const key of Object.keys(rules || {})) {
        const accumulator = rules[key].test(item);
        if (accumulator) {
          terminateAccumulator();
          LOG("current accumulator:", accumulator.methodName);
          if (rules[key].skipCurrentItem) applyRulesOnNextItem = false;
          currentAccumulator = accumulator;
          delete rules[key]; // each rule can only be executed once
          return;
        }
      }
    } else {
      applyRulesOnNextItem = true;
    }
    // if reaching this point, the current item matches none of the rules => accumulating data on current accumulator
    if (currentAccumulator) applyRulesOnNextItem = !currentAccumulator(item);
  };
};
/**
 * Rule.accumulators: registry (name => builder) of the accumulators that can be used for defining Rule objects.
 * An accumulator builder is a function that may (or may not) accept parameters, to be provided by the developer of a parser.
 * It returns another function (the accumulator) that will be run on following PDF items, in order to accumulate data.
 * The output of an accumulator is stored in this.output (field of its parent Rule object).
 **/
Rule.accumulators = {
  // builds an accumulator that ignores every item it receives
  stopAccumulating: function () {
    const ignoreItem = function () {};
    return ignoreItem;
  },
};
// Method for adding accumulators: registers methodBuilder in Rule.accumulators under
// the name methodName, replacing any builder already registered with that name.
// Rule objects constructed before the call do not expose the new method, because
// accumulator methods are proxied when a Rule is constructed.
Rule.addAccumulator = function (methodName, methodBuilder) {
  Object.assign(Rule.accumulators, { [methodName]: methodBuilder });
};
/**
 * This accumulator will store the group values extracted by the regexp of the Rule object,
 * on the current matching PDF item, into an array (this.output).
 * The array is empty if the regexp has no capturing group, or does not match.
 *
 * E.g. with regex: /hello ([a-z]+)/, the text "hello world" will yield ["world"].
 **/
Rule.addAccumulator("extractRegexpValues", function () {
  const { regexp } = this;
  // The current item has just been matched with regexp.test(): with the g or y
  // flag, lastIndex may now point after that match, and exec() would resume
  // from there instead of the beginning of the text.
  if (regexp.global || regexp.sticky) regexp.lastIndex = 0;
  const matches = regexp.exec(this.currentItem.text);
  this.output = matches ? matches.slice(1) : [];
  return function () {}; // following lines are not to be processed by this accumulator
});
/**
 * This accumulator will store the value (text) of the next PDF item in this.output.
 * Items received after that one are ignored.
 **/
Rule.addAccumulator("parseNextItemValue", function () {
  const rule = this;
  let alreadyStored = false;
  return function (item) {
    if (!alreadyStored) {
      alreadyStored = true;
      rule.output = item.text;
    }
  };
});
/**
 * This accumulator will store the text of all the PDF items it receives into an array (this.output).
 **/
Rule.addAccumulator("accumulateAfterHeading", function () {
  const texts = [];
  this.output = texts;
  return function accumulate(item) {
    const { text } = item;
    texts.push(text);
  };
});
/**
 * This accumulator will store the text of all following PDF items with equal x-coordinates
 * into an array (this.output). The reference x-coordinate is taken from the items received.
 **/
Rule.addAccumulator("accumulateFromSameX", function () {
  const texts = [];
  let referenceX = null;
  this.output = texts;
  return function accumulate(item) {
    // as long as no reference x-coordinate is set, take it from the current item
    if (referenceX === null) {
      referenceX = item.x;
    }
    // loose comparison, as x-coordinates have always been compared here
    if (referenceX == item.x) {
      texts.push(item.text);
    }
  };
});
/**
 * This accumulator will store a table by detecting its columns, given their names.
 * Its builder is imported from ./lib/parseColumns.js, and registered under the
 * method name "parseColumns".
 **/
Rule.addAccumulator("parseColumns", parseColumns);
/**
 * This accumulator will store a table by detecting its columns, given their count.
 * Its builder is imported from ./lib/parseTable.js, and registered under the
 * method name "parseTable".
 **/
Rule.addAccumulator("parseTable", parseTable);