Skip to content

Commit

Permalink
Updated table detection code
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Oct 31, 2024
1 parent 569c6ab commit 87fa04f
Show file tree
Hide file tree
Showing 9 changed files with 245 additions and 115 deletions.
2 changes: 1 addition & 1 deletion js/extractTables.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { calcBoxOverlap } from './modifyOCR.js';
import ocr from './objects/ocrObjects.js';
import { calcBoxOverlap } from './utils/miscUtils.js';

/**
*
Expand Down
4 changes: 2 additions & 2 deletions js/import/convertPageHocr.js
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ export async function convertPageHocr({
pass2(pageObj, rotateAngle);
const langSet = pass3(pageObj);

const autoDetectTables = true;
const autoDetectTables = false;
const dataTablePage = new LayoutDataTablePage(n);
if (autoDetectTables) {
const tableBboxes = detectTablesInPage(pageObj);
Expand All @@ -384,7 +384,7 @@ export async function convertPageHocr({
dataTable.page = dataTablePage;
dataTablePage.tables.push(dataTable);
});
}
}

return {
pageObj, dataTables: dataTablePage, warn, langSet,
Expand Down
26 changes: 24 additions & 2 deletions js/import/convertPageStext.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import ocr from '../objects/ocrObjects.js';

import {
calcBboxUnion,
calcBoxOverlap,
calcLang,
mean50,
quantile,
Expand All @@ -11,6 +12,7 @@ import {

import { LayoutDataTablePage } from '../objects/layoutObjects.js';
import { detectTablesInPage, makeTableFromBbox } from '../utils/detectTables.js';
import { splitLineAgressively } from '../utils/ocrUtils.js';

/**
* @param {Object} params
Expand Down Expand Up @@ -411,6 +413,10 @@ export async function convertPageStext({ ocrStr, n }) {
// If there are no letters in the line, drop the entire line element
if (lettersKept === 0) return;

// Recalculate the bounding box.
// The bounding boxes reported by mupdf are often significantly larger than the actual text.
ocr.updateLineBbox(lineObj);

pageObj.lines.push(lineObj);
parLineArr.push(lineObj);
// eslint-disable-next-line consistent-return
Expand Down Expand Up @@ -450,16 +456,32 @@ export async function convertPageStext({ ocrStr, n }) {

pageObj.angle = angleOut;

const autoDetectTables = true;
const autoDetectTables = false;
const dataTablePage = new LayoutDataTablePage(n);
if (autoDetectTables) {
const tableBboxes = detectTablesInPage(pageObj);

for (let i = 0; i < pageObj.lines.length; i++) {
const line = pageObj.lines[i];
let inTable = false;
for (let j = 0; j < tableBboxes.length; j++) {
if (calcBoxOverlap(line.bbox, tableBboxes[j]) > 0.25) {
inTable = true;
break;
}
}
if (inTable) {
const newLines = splitLineAgressively(line);
pageObj.lines.splice(i, 1, ...newLines);
}
}

tableBboxes.forEach((bbox) => {
const dataTable = makeTableFromBbox(pageObj, bbox);
dataTable.page = dataTablePage;
dataTablePage.tables.push(dataTable);
});
}
}

return { pageObj, dataTables: dataTablePage, langSet };
}
24 changes: 1 addition & 23 deletions js/modifyOCR.js
Original file line number Diff line number Diff line change
@@ -1,27 +1,5 @@
import ocr from './objects/ocrObjects.js';
import { getRandomAlphanum } from './utils/miscUtils.js';

/**
 * Returns the proportion of boxA's area contained in boxB.
 *
 * The result is the area of the intersection of the two boxes divided by the
 * area of `boxA`, so the measure is asymmetric: swapping the arguments
 * generally changes the result.
 *
 * @param {bbox} boxA - Box whose covered fraction is measured.
 * @param {bbox} boxB - Box that `boxA` is tested against.
 * @returns {number} Intersection area divided by the area of `boxA`.
 *    Returns 0 when the boxes do not intersect or when `boxA` has no area.
 */
export function calcBoxOverlap(boxA, boxB) {
  const areaA = (boxA.bottom - boxA.top) * (boxA.right - boxA.left);

  // A box with no area cannot have any proportion of its area inside another box.
  // Returning early also avoids the 0 / 0 division below, which would produce NaN.
  if (areaA <= 0) return 0;

  // Edges of the intersection rectangle.
  const left = Math.max(boxA.left, boxB.left);
  const top = Math.max(boxA.top, boxB.top);
  const right = Math.min(boxA.right, boxB.right);
  const bottom = Math.min(boxA.bottom, boxB.bottom);

  const width = right - left;
  const height = bottom - top;

  // A negative extent on either axis means the boxes do not intersect.
  if (width < 0 || height < 0) return 0;

  return (width * height) / areaA;
}
import { calcBoxOverlap, getRandomAlphanum } from './utils/miscUtils.js';

/**
* Adds lines from a new page to an existing page.
Expand Down
Loading

0 comments on commit 87fa04f

Please sign in to comment.