-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun-ocr.ts
103 lines (77 loc) · 2.88 KB
/
run-ocr.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"use strict";
import util from 'util';
import { isDefined } from './common';
// Lift the default nesting limit of util.inspect so the request/response objects
// logged by this script are not truncated at deeper levels.
util.inspect.defaultOptions.depth = Infinity;
import { v1 as vision, protos } from '@google-cloud/vision';
// Local shorthand for the v1 annotator client class.
const ImageAnnotatorClient = vision.ImageAnnotatorClient;
/**
 * Waits for the next chunk of data to arrive on standard input.
 *
 * The caller in this file prompts the user to press <enter>; any input chunk
 * satisfies the wait. Once the chunk has been received, stdin is paused again
 * so that it stops consuming further input and no longer keeps the event loop
 * alive on its own.
 *
 * @returns a promise that resolves (with no value) after the first `data`
 *          event on `process.stdin`.
 */
// TODO: improve, see https://stackoverflow.com/questions/19687407/press-any-key-to-continue-in-nodejs
const waitForKeyPress = () =>
  new Promise<void>((resolve) => {
    process.stdin.once('data', () => {
      // Registering the 'data' listener switched stdin into flowing mode;
      // switch it back so the stream is left the way we found it.
      process.stdin.pause();
      resolve();
    });
  });
/**
 * Submits an asynchronous DOCUMENT_TEXT_DETECTION request for a PDF stored in
 * a Cloud Storage bucket and waits until the operation has finished.
 *
 * The request is printed first and is only sent after the user confirms by
 * providing input on stdin.
 *
 * @param bucketName   bucket where the file resides, e.g. 'testbook-ocr'
 * @param fileName     path to the PDF file within the bucket,
 *                     e.g. 'test/Modelovky_Biologie_1LF_2011.pdf'
 * @param outputPrefix directory within the same bucket where the results are
 *                     stored, e.g. 'results/Modelovky_Biologie_1LF_2011'
 */
const run = async (bucketName: string, fileName: string, outputPrefix: string) => {
  // note: authentication is done automatically using env variables,
  // see https://cloud.google.com/docs/authentication/production
  const annotator = new ImageAnnotatorClient();

  // Source and destination both live in the same bucket.
  const bucketRoot = `gs://${bucketName}`;

  const request: protos.google.cloud.vision.v1.IAsyncBatchAnnotateFilesRequest = {
    requests: [
      {
        inputConfig: {
          // supported mime_types are: 'application/pdf' and 'image/tiff'
          mimeType: 'application/pdf',
          gcsSource: { uri: `${bucketRoot}/${fileName}` },
        },
        features: [{ type: 'DOCUMENT_TEXT_DETECTION' }],
        outputConfig: {
          gcsDestination: { uri: `${bucketRoot}/${outputPrefix}/` },
        },
      },
    ],
  };

  // Give the user a chance to inspect the request before anything is sent.
  console.log('operation preview: client.asyncBatchAnnotateFiles =', request);
  console.log('press <enter> key to start the operation');
  await waitForKeyPress();

  console.log('enqueuing operation ...');
  const enqueued = await annotator.asyncBatchAnnotateFiles(request);
  const operation = enqueued[0];
  console.log(`operation enqueued, name = ${operation.name}, metadata =`, operation.metadata);

  console.log('waiting for the operation to finish ...');
  const settled = await operation.promise();
  const filesResponse = settled[0];
  console.log('operation finished, filesResponse =', filesResponse);

  // Only one file was submitted, so only the first response is of interest.
  const firstResponse = filesResponse.responses?.[0];
  const savedTo = firstResponse?.outputConfig?.gcsDestination?.uri;
  console.log(`json saved to: ${savedTo}`);
};
// Command-line entry point.
// The first two argv entries are the path to the Node.js interpreter and the
// path to this script; the three that follow are the script's own arguments.
const [, , bucketArg, fileArg, prefixArg] = process.argv;

if (!(isDefined(bucketArg) && isDefined(fileArg) && isDefined(prefixArg))) {
  console.error('usage: {bucketName} {fileName} {outputPrefix}');
  process.exit(1);
}

// Runs the OCR job and converts its outcome into the process exit code:
// 0 on success, 1 on any error.
const main = async () => {
  try {
    await run(bucketArg, fileArg, prefixArg);
    console.log('script finished');
    process.exit(0);
  } catch (err) {
    console.error('an error occurred while running script', err);
    process.exit(1);
  }
};

void main();