Simplified hOCR parsing arguments; added new hOCR parsing test

scribeocr · Nov 17, 2024 · b227972
1 parent 20dbea4
commit b227972
Show file tree

Hide file tree

Showing 5 changed files with 368 additions and 21 deletions.
diff --git a/js/import/convertPageHocr.js b/js/import/convertPageHocr.js
@@ -16,19 +16,12 @@ const debugMode = true;
  * @param {Object} params
  * @param {string} params.ocrStr
  * @param {number} params.n
- * @param {?dims} params.pageDims
- * @param {number} params.rotateAngle - The angle that the input image is rotated prior to recognition.
- *    This is used to transform OCR coordinates back to the original coordinate space after recognizing a rotated intermediate image.
- * @param {boolean} params.keepItalic - If true, italic tags (`<em>`) are honored.  This is false by default,
- *    as vanilla Tesseract does not recognize italic text in a way that is reliable.
- *    This is fixed for Legacy recognition in the included custom build of Tesseract.
+ * @param {?dims} [params.pageDims]
  * @param {boolean} params.scribeMode
  */
 export async function convertPageHocr({
-  ocrStr, n, pageDims = null, rotateAngle = 0, keepItalic = false, scribeMode = false,
+  ocrStr, n, pageDims = null, scribeMode = false,
 }) {
-  rotateAngle = rotateAngle || 0;
-
   let currentLang = 'eng';
 
   const angleRisePage = [];
@@ -68,12 +61,9 @@ export async function convertPageHocr({
   const charRegex = /<span class=["']ocrx_cinfo["'] title='([^'"]+)["']>([^<]*)<\/span>/ig;
 
   // Remove all bold/italics tags.  These complicate the syntax and are unfortunately virtually always wrong anyway (coming from Tesseract).
+  // This does not impact re-uploads of .hocr files created with Scribe.
   ocrStr = ocrStr.replaceAll(/<\/?strong>/ig, '');
-
-  // The custom built-in Tesseract build should reliably identify italics (for Legacy only)
-  if (!keepItalic) {
-    ocrStr = ocrStr.replaceAll(/<\/?em>/ig, '');
-  }
+  ocrStr = ocrStr.replaceAll(/<\/?em>/ig, '');
 
   // Delete namespace to simplify xpath
   ocrStr = ocrStr.replace(/<html[^>]*>/i, '<html>');
@@ -368,11 +358,9 @@ export async function convertPageHocr({
 
   ocrStr = ocrStr.replaceAll(lineRegex, convertLine);
 
-  pageObj.angle = rotateAngle;
-
   const warn = { char: charMode ? '' : 'char_warning' };
 
-  pass2(pageObj, rotateAngle);
+  pass2(pageObj, 0);
   const langSet = pass3(pageObj);
 
   const autoDetectTables = false;

diff --git a/js/recognizeConvert.js b/js/recognizeConvert.js
@@ -309,9 +309,9 @@ export async function convertOCRPage(ocrRaw, n, mainData, format, engineName, sc
   if (format === 'hocr') {
     res = await gs.convertPageHocr({ ocrStr: ocrRaw, n, scribeMode });
   } else if (format === 'abbyy') {
-    res = await gs.convertPageAbbyy({ ocrStr: ocrRaw, n, scribeMode });
+    res = await gs.convertPageAbbyy({ ocrStr: ocrRaw, n });
   } else if (format === 'stext') {
-    res = await gs.convertPageStext({ ocrStr: ocrRaw, n, scribeMode });
+    res = await gs.convertPageStext({ ocrStr: ocrRaw, n });
   } else {
     throw new Error(`Invalid format: ${format}`);
   }

diff --git a/tests/assets/tesseract_italics_example_1a.hocr b/tests/assets/tesseract_italics_example_1a.hocr
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+  <title></title>
+  <meta http-equiv="Content-Type" content="text/html;charset=utf-8"/>
+  <meta name='ocr-system' content='tesseract 5.1.0-471-gbc490' />
+  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
+ </head>
+ <body>
+  <div class='ocr_page' id='page_1' title='image "italics_example.png"; bbox 0 0 962 167; ppageno 0; scan_res 96 96'>
+   <div class='ocr_carea' id='block_1_1' title="bbox 13 0 934 39">
+    <p class='ocr_par' id='par_1_1' lang='eng' title="bbox 13 0 934 39">
+     <span class='ocr_line' id='line_1_1' title="bbox 13 0 934 39; baseline 0 -7; x_size 28; x_descenders 6; x_ascenders 5">
+      <span class='ocrx_word' id='word_1_1' title='bbox 13 10 129 38; x_wconf 86'><em>Earnings</em></span>
+      <span class='ocrx_word' id='word_1_2' title='bbox 140 10 162 32; x_wconf 91'><em>In</em></span>
+      <span class='ocrx_word' id='word_1_3' title='bbox 172 10 227 32; x_wconf 90'><em>Line</em></span>
+      <span class='ocrx_word' id='word_1_4' title='bbox 237 0 336 38; x_wconf 79'><em>bespite</em></span>
+      <span class='ocrx_word' id='word_1_5' title='bbox 345 10 420 32; x_wconf 88'><em>Press</em></span>
+      <span class='ocrx_word' id='word_1_6' title='bbox 430 9 549 38; x_wconf 82'><em>Outages;</em></span>
+      <span class='ocrx_word' id='word_1_7' title='bbox 559 0 657 39; x_wconf 77'><em>Staying</em></span>
+      <span class='ocrx_word' id='word_1_8' title='bbox 665 0 762 32; x_wconf 77'><em>Neutral</em></span>
+      <span class='ocrx_word' id='word_1_9' title='bbox 770 15 802 32; x_wconf 91'><em>on</em></span>
+      <span class='ocrx_word' id='word_1_10' title='bbox 815 10 934 32; x_wconf 84'><em>Valuation</em></span>
+     </span>
+    </p>
+   </div>
+   <div class='ocr_carea' id='block_1_2' title="bbox 11 70 795 149">
+    <p class='ocr_par' id='par_1_2' lang='eng' title="bbox 11 70 795 149">
+     <span class='ocr_line' id='line_1_2' title="bbox 11 70 518 90; baseline 0 0; x_size 25.460993; x_descenders 5.4609933; x_ascenders 5">
+      <span class='ocrx_word' id='word_1_11' title='bbox 11 75 90 90; x_wconf 71'><em>mnuc:</em></span>
+      <span class='ocrx_word' id='word_1_12' title='bbox 100 70 179 90; x_wconf 80'><em>Neutral</em></span>
+      <span class='ocrx_word' id='word_1_13' title='bbox 352 75 435 90; x_wconf 83'><strong>SECTOR:</strong></span>
+      <span class='ocrx_word' id='word_1_14' title='bbox 446 70 518 90; x_wconf 84'><em>Metals</em></span>
+     </span>
+     <span class='ocr_line' id='line_1_3' title="bbox 11 98 795 124; baseline 0 -6; x_size 24; x_descenders 5; x_ascenders 5">
+      <span class='ocrx_word' id='word_1_15' title='bbox 11 104 88 119; x_wconf 78'><strong>TARGET</strong></span>
+      <span class='ocrx_word' id='word_1_16' title='bbox 94 104 156 119; x_wconf 81'><strong>PRICE:</strong></span>
+      <span class='ocrx_word' id='word_1_17' title='bbox 167 98 257 121; x_wconf 86'>US$170</span>
+      <span class='ocrx_word' id='word_1_18' title='bbox 352 104 429 119; x_wconf 75'><em>ssc’ron</em></span>
+      <span class='ocrx_word' id='word_1_19' title='bbox 436 104 626 119; x_wconf 78'><strong>RECOMMENDA&#39;HON:</strong></span>
+      <span class='ocrx_word' id='word_1_20' title='bbox 636 99 795 124; x_wconf 62'><em>Market-weight</em></span>
+     </span>
+     <span class='ocr_line' id='line_1_4' title="bbox 11 127 759 149; baseline -0.001 -2; x_size 25.460993; x_descenders 5.4609933; x_ascenders 5">
+      <span class='ocrx_word' id='word_1_21' title='bbox 11 132 68 147; x_wconf 84'><strong>BASIS</strong></span>
+      <span class='ocrx_word' id='word_1_22' title='bbox 75 132 188 147; x_wconf 79'><strong>OFTARGET:</strong></span>
+      <span class='ocrx_word' id='word_1_23' title='bbox 207 127 245 147; x_wconf 87'><em>P/E</em></span>
+      <span class='ocrx_word' id='word_1_24' title='bbox 253 127 274 147; x_wconf 73'><em>of</em></span>
+      <span class='ocrx_word' id='word_1_25' title='bbox 285 128 322 147; x_wconf 87'><em>15x</em></span>
+      <span class='ocrx_word' id='word_1_26' title='bbox 331 132 357 147; x_wconf 87'><em>on</em></span>
+      <span class='ocrx_word' id='word_1_27' title='bbox 366 132 402 147; x_wconf 87'><em>our</em></span>
+      <span class='ocrx_word' id='word_1_28' title='bbox 412 127 477 146; x_wconf 90'><strong>FY&#39;14</strong></span>
+      <span class='ocrx_word' id='word_1_29' title='bbox 487 127 535 147; x_wconf 89'><em>EPS</em></span>
+      <span class='ocrx_word' id='word_1_30' title='bbox 545 127 639 147; x_wconf 79'><em>estimate</em></span>
+      <span class='ocrx_word' id='word_1_31' title='bbox 648 127 759 149; x_wconf 78'><strong>of$11.44,</strong></span>
+     </span>
+    </p>
+   </div>
+  </div>
+ </body>
+</html>