Skip to content

Commit

Permalink
Fixed imports of PDF text with superscripts; added new test
Browse files Browse the repository at this point in the history
  • Loading branch information
Balearica committed Nov 9, 2024
1 parent 9361df4 commit add376e
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 2 deletions.
10 changes: 8 additions & 2 deletions js/import/convertPageStext.js
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ export async function convertPageStext({ ocrStr, n }) {
const bboxes = [];

const baselineSlopeArr = /** @type {Array<Number>} */ ([]);
let baselineFirstDone = false;
const baselineFirst = /** @type {Array<Number>} */ ([]);

let baselineCurrent = 0;
Expand Down Expand Up @@ -193,16 +194,21 @@ export async function convertPageStext({ ocrStr, n }) {
bboxesWordArr = [];
}

// If the first word was determined to be a superscript, reset `baselineFirst` to avoid skewing the slope calculation.
if (sizeDelta > 0) {
baselineFirst.length = 0;
// If the first word was determined to be a superscript, reset `baselineFirst` to avoid skewing the slope calculation.
if (!baselineFirstDone) baselineFirst.length = 0;
familyCurrent = fontNameStrI || familyCurrent;
sizeCurrent = sizeCurrentRaw || sizeCurrent;
fontSizeWord = sizeCurrent;
fontFamily = familyCurrent;
superArr[superArr.length - 1] = true;
}

// If `baselineFirst` was set using a non-superscript word, mark it as done.
if (superArr.length > 0 && !superArr[superArr.length - 1] && baselineFirst.length > 0) {
baselineFirstDone = true;
}

superCurrent = sizeDelta < 0;
} else {
sizeCurrent = sizeCurrentRaw || sizeCurrent;
Expand Down
Binary file added tests/assets/superscript_examples_rotated.pdf
Binary file not shown.
14 changes: 14 additions & 0 deletions tests/module/importPdfText.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -247,3 +247,17 @@ describe('Check that PDF imports split lines correctly.', function () {
await scribe.terminate();
});
}).timeout(120000);

// Regression suite: imports a PDF fixture and checks the second `baseline` component
// of one specific line in each of the first two `scribe.data.ocr.active` entries.
// A `function` expression (not an arrow function) is used for the suite callback
// because the body calls `this.timeout`.
describe('Check that line baselines are imported correctly.', function () {
this.timeout(10000);

it('Should correctly parse line baselines for pages with rotation', async () => {
// Import the fixture with both the native-text and OCR extraction options enabled.
await scribe.importFiles([`${ASSETS_PATH_KARMA}/superscript_examples_rotated.pdf`], { extractPDFTextNative: true, extractPDFTextOCR: true });
// `baseline[1]` is rounded to the nearest integer before the strict comparison.
// NOTE(review): presumably `active[i]` is page i and `baseline[1]` is the baseline offset — confirm.
assert.strictEqual(Math.round(scribe.data.ocr.active[0].lines[25].baseline[1]), -10);
assert.strictEqual(Math.round(scribe.data.ocr.active[1].lines[25].baseline[1]), -165);
}).timeout(10000);

// Terminate `scribe` once the tests in this suite have run.
after(async () => {
await scribe.terminate();
});
// NOTE(review): this chained `.timeout(120000)` is applied after the callback's
// `this.timeout(10000)` above — confirm which suite-level value is intended.
}).timeout(120000);

0 comments on commit add376e

Please sign in to comment.