Skip to content

Commit

Permalink
Simplified HOCR parsing arguments; added new hocr parsing test
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Nov 17, 2024
1 parent 20dbea4 commit b227972
Show file tree
Hide file tree
Showing 5 changed files with 368 additions and 21 deletions.
22 changes: 5 additions & 17 deletions js/import/convertPageHocr.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,19 +16,12 @@ const debugMode = true;
* @param {Object} params
* @param {string} params.ocrStr
* @param {number} params.n
* @param {?dims} params.pageDims
* @param {number} params.rotateAngle - The angle that the input image is rotated prior to recognition.
* This is used to transform OCR coordinates back to the original coordinate space after recognizing a rotated intermediate image.
* @param {boolean} params.keepItalic - If true, italic tags (`<em>`) are honored. This is false by default,
* as vanilla Tesseract does not recognize italic text in a way that is reliable.
* This is fixed for Legacy recognition in the included custom build of Tesseract.
* @param {?dims} [params.pageDims]
* @param {boolean} params.scribeMode
*/
export async function convertPageHocr({
ocrStr, n, pageDims = null, rotateAngle = 0, keepItalic = false, scribeMode = false,
ocrStr, n, pageDims = null, scribeMode = false,
}) {
rotateAngle = rotateAngle || 0;

let currentLang = 'eng';

const angleRisePage = [];
Expand Down Expand Up @@ -68,12 +61,9 @@ export async function convertPageHocr({
const charRegex = /<span class=["']ocrx_cinfo["'] title='([^'"]+)["']>([^<]*)<\/span>/ig;

// Remove all bold/italic tags. These complicate the syntax and are unfortunately virtually always wrong anyway (coming from Tesseract).
// This does not impact re-uploads of .hocr files created with Scribe.
ocrStr = ocrStr.replaceAll(/<\/?strong>/ig, '');

// The custom built-in Tesseract build should reliably identify italics (for Legacy only)
if (!keepItalic) {
ocrStr = ocrStr.replaceAll(/<\/?em>/ig, '');
}
ocrStr = ocrStr.replaceAll(/<\/?em>/ig, '');

// Delete namespace to simplify xpath
ocrStr = ocrStr.replace(/<html[^>]*>/i, '<html>');
Expand Down Expand Up @@ -368,11 +358,9 @@ export async function convertPageHocr({

ocrStr = ocrStr.replaceAll(lineRegex, convertLine);

pageObj.angle = rotateAngle;

const warn = { char: charMode ? '' : 'char_warning' };

pass2(pageObj, rotateAngle);
pass2(pageObj, 0);
const langSet = pass3(pageObj);

const autoDetectTables = false;
Expand Down
4 changes: 2 additions & 2 deletions js/recognizeConvert.js
Original file line number Diff line number Diff line change
Expand Up @@ -309,9 +309,9 @@ export async function convertOCRPage(ocrRaw, n, mainData, format, engineName, sc
if (format === 'hocr') {
res = await gs.convertPageHocr({ ocrStr: ocrRaw, n, scribeMode });
} else if (format === 'abbyy') {
res = await gs.convertPageAbbyy({ ocrStr: ocrRaw, n, scribeMode });
res = await gs.convertPageAbbyy({ ocrStr: ocrRaw, n });
} else if (format === 'stext') {
res = await gs.convertPageStext({ ocrStr: ocrRaw, n, scribeMode });
res = await gs.convertPageStext({ ocrStr: ocrRaw, n });
} else {
throw new Error(`Invalid format: ${format}`);
}
Expand Down
62 changes: 62 additions & 0 deletions tests/assets/tesseract_italics_example_1a.hocr
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<title></title>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
<meta name='ocr-system' content='tesseract 5.1.0-471-gbc490' />
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
</head>
<body>
<div class='ocr_page' id='page_1' title='image "italics_example.png"; bbox 0 0 962 167; ppageno 0; scan_res 96 96'>
<div class='ocr_carea' id='block_1_1' title="bbox 13 0 934 39">
<p class='ocr_par' id='par_1_1' lang='eng' title="bbox 13 0 934 39">
<span class='ocr_line' id='line_1_1' title="bbox 13 0 934 39; baseline 0 -7; x_size 28; x_descenders 6; x_ascenders 5">
<span class='ocrx_word' id='word_1_1' title='bbox 13 10 129 38; x_wconf 86'><em>Earnings</em></span>
<span class='ocrx_word' id='word_1_2' title='bbox 140 10 162 32; x_wconf 91'><em>In</em></span>
<span class='ocrx_word' id='word_1_3' title='bbox 172 10 227 32; x_wconf 90'><em>Line</em></span>
<span class='ocrx_word' id='word_1_4' title='bbox 237 0 336 38; x_wconf 79'><em>bespite</em></span>
<span class='ocrx_word' id='word_1_5' title='bbox 345 10 420 32; x_wconf 88'><em>Press</em></span>
<span class='ocrx_word' id='word_1_6' title='bbox 430 9 549 38; x_wconf 82'><em>Outages;</em></span>
<span class='ocrx_word' id='word_1_7' title='bbox 559 0 657 39; x_wconf 77'><em>Staying</em></span>
<span class='ocrx_word' id='word_1_8' title='bbox 665 0 762 32; x_wconf 77'><em>Neutral</em></span>
<span class='ocrx_word' id='word_1_9' title='bbox 770 15 802 32; x_wconf 91'><em>on</em></span>
<span class='ocrx_word' id='word_1_10' title='bbox 815 10 934 32; x_wconf 84'><em>Valuation</em></span>
</span>
</p>
</div>
<div class='ocr_carea' id='block_1_2' title="bbox 11 70 795 149">
<p class='ocr_par' id='par_1_2' lang='eng' title="bbox 11 70 795 149">
<span class='ocr_line' id='line_1_2' title="bbox 11 70 518 90; baseline 0 0; x_size 25.460993; x_descenders 5.4609933; x_ascenders 5">
<span class='ocrx_word' id='word_1_11' title='bbox 11 75 90 90; x_wconf 71'><em>mnuc:</em></span>
<span class='ocrx_word' id='word_1_12' title='bbox 100 70 179 90; x_wconf 80'><em>Neutral</em></span>
<span class='ocrx_word' id='word_1_13' title='bbox 352 75 435 90; x_wconf 83'><strong>SECTOR:</strong></span>
<span class='ocrx_word' id='word_1_14' title='bbox 446 70 518 90; x_wconf 84'><em>Metals</em></span>
</span>
<span class='ocr_line' id='line_1_3' title="bbox 11 98 795 124; baseline 0 -6; x_size 24; x_descenders 5; x_ascenders 5">
<span class='ocrx_word' id='word_1_15' title='bbox 11 104 88 119; x_wconf 78'><strong>TARGET</strong></span>
<span class='ocrx_word' id='word_1_16' title='bbox 94 104 156 119; x_wconf 81'><strong>PRICE:</strong></span>
<span class='ocrx_word' id='word_1_17' title='bbox 167 98 257 121; x_wconf 86'>US$170</span>
<span class='ocrx_word' id='word_1_18' title='bbox 352 104 429 119; x_wconf 75'><em>ssc’ron</em></span>
<span class='ocrx_word' id='word_1_19' title='bbox 436 104 626 119; x_wconf 78'><strong>RECOMMENDA&#39;HON:</strong></span>
<span class='ocrx_word' id='word_1_20' title='bbox 636 99 795 124; x_wconf 62'><em>Market-weight</em></span>
</span>
<span class='ocr_line' id='line_1_4' title="bbox 11 127 759 149; baseline -0.001 -2; x_size 25.460993; x_descenders 5.4609933; x_ascenders 5">
<span class='ocrx_word' id='word_1_21' title='bbox 11 132 68 147; x_wconf 84'><strong>BASIS</strong></span>
<span class='ocrx_word' id='word_1_22' title='bbox 75 132 188 147; x_wconf 79'><strong>OFTARGET:</strong></span>
<span class='ocrx_word' id='word_1_23' title='bbox 207 127 245 147; x_wconf 87'><em>P/E</em></span>
<span class='ocrx_word' id='word_1_24' title='bbox 253 127 274 147; x_wconf 73'><em>of</em></span>
<span class='ocrx_word' id='word_1_25' title='bbox 285 128 322 147; x_wconf 87'><em>15x</em></span>
<span class='ocrx_word' id='word_1_26' title='bbox 331 132 357 147; x_wconf 87'><em>on</em></span>
<span class='ocrx_word' id='word_1_27' title='bbox 366 132 402 147; x_wconf 87'><em>our</em></span>
<span class='ocrx_word' id='word_1_28' title='bbox 412 127 477 146; x_wconf 90'><strong>FY&#39;14</strong></span>
<span class='ocrx_word' id='word_1_29' title='bbox 487 127 535 147; x_wconf 89'><em>EPS</em></span>
<span class='ocrx_word' id='word_1_30' title='bbox 545 127 639 147; x_wconf 79'><em>estimate</em></span>
<span class='ocrx_word' id='word_1_31' title='bbox 648 127 759 149; x_wconf 78'><strong>of$11.44,</strong></span>
</span>
</p>
</div>
</div>
</body>
</html>
Loading

0 comments on commit b227972

Please sign in to comment.