Skip to content

Commit

Permalink
Improve pdf converting.
Browse files Browse the repository at this point in the history
  • Loading branch information
zensh committed Aug 8, 2023
1 parent 2044757 commit 7fbdef8
Show file tree
Hide file tree
Showing 6 changed files with 205 additions and 35 deletions.
1 change: 1 addition & 0 deletions dist/api.js
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ export async function convertingAPI(ctx) {
const buf = await getRawBody(ctx.req, { limit: '500kb' });
try {
const doc = await converter(buf);
// console.log(Buffer.from(doc).toString('hex'))
ctx.body = {
result: doc
};
Expand Down
104 changes: 90 additions & 14 deletions dist/converting.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,31 +39,67 @@ async function convertPdf(buf) {
for (let i = 1; i <= doc.numPages; i++) {
const page = await doc.getPage(i);
const content = await page.getTextContent();
let child = Object.create(null);
child.type = 'paragraph';
child.content = [];
const hl = new HeadingLevel();
for (let item of content.items) {
item = item;
if (item.str == null || item.str.length === 0) {
if (item.height > 0) {
hl.add(item.height);
}
}
hl.finalize();
let texts = [];
let height = 0;
for (let item of content.items) {
item = item;
if (item.str == null) {
continue;
}
let text = item.str;
if (item.dir === 'ttb') {
text = text.replace(/\n/g, ' ');
}
child.content.push({
type: 'text',
text
});
if (text !== '') {
texts.push(text);
}
if (item.height > height) {
height = item.height;
}
if (item.hasEOL) {
node.content.push(child);
child = Object.create(null);
child.type = 'paragraph';
child.content = [];
const level = hl.level(height);
if (level == 0) {
node.content.push({
type: 'paragraph',
content: [{
type: 'text',
text: texts.join('')
}]
});
}
else {
node.content.push({
type: "heading",
attrs: {
id: null,
level,
},
content: [{
type: 'text',
text: texts.join('')
}]
});
}
texts = [];
height = 0;
}
}
if (child.content.length > 0) {
node.content.push(child);
if (texts.length > 0) {
node.content.push({
type: 'paragraph',
content: [{
type: 'text',
text: texts.join('')
}]
});
}
page.cleanup();
}
Expand Down Expand Up @@ -91,3 +127,43 @@ function convertText(buf) {
const amender = new JSONDocumentAmender();
return Promise.resolve(Buffer.from(encode(amender.amendNode(node))));
}
/**
 * Derives heading levels from how often each text height occurs.
 *
 * Usage: call `add(height)` for every sampled height, call `finalize()` once,
 * then call `level(height)` to classify a height.
 *
 * The most frequently sampled height is used as the baseline (presumably the
 * body text); only heights strictly greater than it can become heading levels.
 */
export class HeadingLevel {
    /** Map from normalized height key (two decimals) to occurrence count. */
    sample;
    /** Heading thresholds in descending order; index 0 corresponds to level 1. */
    levels;
    constructor() {
        this.sample = new Map();
        this.levels = [];
    }
    /**
     * Records one occurrence of `height`.
     *
     * Non-finite values (NaN, ±Infinity) are ignored: a "NaN" key could
     * otherwise become the most frequent sample and, because no number
     * compares greater than NaN, suppress every heading level.
     */
    add(height) {
        if (!Number.isFinite(height)) {
            return;
        }
        // Subtracting 0.01 before rounding keeps the stored threshold at or
        // below the raw height, so the `>=` comparison in `level()` still
        // matches the same raw height after `toFixed` rounding.
        const key = (height - 0.01).toFixed(2);
        this.sample.set(key, (this.sample.get(key) ?? 0) + 1);
    }
    /**
     * Computes `this.levels` from the recorded samples.
     *
     * @returns a copy of the computed thresholds (descending); an empty array
     *          when nothing has been sampled.
     */
    finalize() {
        const keys = Array.from(this.sample.keys());
        if (keys.length === 0) {
            this.levels = [];
            return [];
        }
        // Most frequent height first; it serves as the baseline.
        keys.sort((a, b) => (this.sample.get(b) ?? 0) - (this.sample.get(a) ?? 0));
        const baseline = parseFloat(keys[0]);
        const larger = keys
            .slice(1)
            .map((key) => parseFloat(key))
            .filter((height) => height > baseline)
            .sort((a, b) => a - b);
        // Keep the six smallest candidates, then order them largest-first so
        // that the largest kept height maps to level 1.
        this.levels = larger.slice(0, 6).sort((a, b) => b - a);
        return [...this.levels];
    }
    /**
     * Classifies `height` against the finalized thresholds.
     *
     * @returns 1 to 6 for a heading (1 is the largest), or 0 when `height` is
     *          below every threshold (not a heading).
     */
    level(height) {
        for (let i = 0; i < this.levels.length; i++) {
            if (height >= this.levels[i]) {
                return i + 1; // 1 ~ 6
            }
        }
        return 0; // not heading
    }
}
10 changes: 7 additions & 3 deletions dist/tiptap.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import { Emoji, emojis } from '@tiptap-pro/extension-emoji';
import { UniqueID } from '@tiptap-pro/extension-unique-id';
import { Mathematics } from '@tiptap-pro/extension-mathematics';
// import { writeFileSync } from 'node:fs'
// Node types that carry a unique `id` attribute. Shared by the UniqueID
// extension's `types` option and by JSONDocumentAmender.amendNode.
const uidTypes = ['blockquote', 'codeBlock', 'detailsSummary', 'detailsContent', 'heading', 'listItem', 'paragraph', 'tableHeader', 'tableCell'];
const tiptapExtensions = [
Document,
Details.configure({
Expand Down Expand Up @@ -96,7 +97,7 @@ const tiptapExtensions = [
Underline,
UniqueID.configure({
attributeName: "id",
types: ['blockquote', 'codeBlock', 'detailsSummary', 'detailsContent', 'heading', 'listItem', 'paragraph', 'tableHeader', 'tableCell'],
types: uidTypes,
generateID: () => nanoid(6)
}),
Youtube.configure({
Expand All @@ -121,10 +122,13 @@ export class JSONDocumentAmender {
// https://prosemirror.net/docs/ref/#model.Document_Structure
amendNode(node) {
// attrs: Attrs
if (node.attrs != null) {
if (uidTypes.includes(node.type) && node.attrs == null) {
node.attrs = { id: this.amendId('') };
}
else if (node.attrs != null) {
// tiptap BUG: generateJSON reuses some attrs object, we need to clone a new one.
node.attrs = Object.assign({}, node.attrs);
if (Object.hasOwn(node.attrs, 'id')) {
if (uidTypes.includes(node.type)) {
node.attrs.id = this.amendId(node.attrs.id);
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/api.ts
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ export async function convertingAPI(ctx: Context): Promise<void> {

try {
const doc = await converter(buf)

// console.log(Buffer.from(doc).toString('hex'))
ctx.body = {
result: doc
}
Expand Down
114 changes: 100 additions & 14 deletions src/converting.ts
Original file line number Diff line number Diff line change
Expand Up @@ -47,34 +47,73 @@ async function convertPdf(buf: Buffer): Promise<Buffer> {
for (let i = 1; i <= doc.numPages; i++) {
const page = await doc.getPage(i)
const content = await page.getTextContent()
let child: Node = Object.create(null)
child.type = 'paragraph'
child.content = []
const hl = new HeadingLevel()

for (let item of content.items) {
item = item as TextItem
if (item.str == null || item.str.length === 0) {
if (item.height > 0) {
hl.add(item.height)
}
}
hl.finalize()

let texts = []
let height = 0
for (let item of content.items) {
item = item as TextItem
if (item.str == null) {
continue
}

let text = item.str
if (item.dir === 'ttb') {
text = text.replace(/\n/g, ' ')
}
child.content.push({
type: 'text',
text
})

if (text !== '') {
texts.push(text)
}

if (item.height > height) {
height = item.height
}

if (item.hasEOL) {
node.content.push(child)
child = Object.create(null)
child.type = 'paragraph'
child.content = []
const level = hl.level(height)
if (level == 0) {
node.content.push({
type: 'paragraph',
content: [{
type: 'text',
text: texts.join('')
}]
})
} else {
node.content.push({
type: "heading",
attrs: {
id: null,
level,
},
content: [{
type: 'text',
text: texts.join('')
}]
})
}
texts = []
height = 0
}
}

if (child.content.length > 0) {
node.content.push(child)
if (texts.length > 0) {
node.content.push({
type: 'paragraph',
content: [{
type: 'text',
text: texts.join('')
}]
})
}

page.cleanup()
Expand Down Expand Up @@ -105,4 +144,51 @@ function convertText(buf: Buffer): Promise<Buffer> {

const amender = new JSONDocumentAmender()
return Promise.resolve(Buffer.from(encode(amender.amendNode(node))))
}

/**
 * Derives heading levels from how often each text height occurs.
 *
 * Usage: call `add(height)` for every sampled height, call `finalize()` once,
 * then call `level(height)` to classify a height.
 *
 * The most frequently sampled height is used as the baseline (presumably the
 * body text); only heights strictly greater than it can become heading levels.
 */
export class HeadingLevel {
  /** Map from normalized height key (two decimals) to occurrence count. */
  sample: Map<string, number>
  /** Heading thresholds in descending order; index 0 corresponds to level 1. */
  levels: number[]

  constructor() {
    this.sample = new Map()
    this.levels = []
  }

  /**
   * Records one occurrence of `height`.
   *
   * Non-finite values (NaN, ±Infinity) are ignored: a "NaN" key could
   * otherwise become the most frequent sample and, because no number
   * compares greater than NaN, suppress every heading level.
   */
  add(height: number): void {
    if (!Number.isFinite(height)) {
      return
    }

    // Subtracting 0.01 before rounding keeps the stored threshold at or
    // below the raw height, so the `>=` comparison in `level()` still
    // matches the same raw height after `toFixed` rounding.
    const key = (height - 0.01).toFixed(2)
    this.sample.set(key, (this.sample.get(key) ?? 0) + 1)
  }

  /**
   * Computes `this.levels` from the recorded samples.
   *
   * @returns a copy of the computed thresholds (descending); an empty array
   *          when nothing has been sampled.
   */
  finalize(): number[] {
    const keys = Array.from(this.sample.keys())
    if (keys.length === 0) {
      this.levels = []
      return []
    }

    // Most frequent height first; it serves as the baseline.
    keys.sort((a, b) => (this.sample.get(b) ?? 0) - (this.sample.get(a) ?? 0))
    const baseline = parseFloat(keys[0])
    const larger = keys
      .slice(1)
      .map((key) => parseFloat(key))
      .filter((height) => height > baseline)
      .sort((a, b) => a - b)

    // Keep the six smallest candidates, then order them largest-first so
    // that the largest kept height maps to level 1.
    this.levels = larger.slice(0, 6).sort((a, b) => b - a)
    return [...this.levels]
  }

  /**
   * Classifies `height` against the finalized thresholds.
   *
   * @returns 1 to 6 for a heading (1 is the largest), or 0 when `height` is
   *          below every threshold (not a heading).
   */
  level(height: number): number {
    for (let i = 0; i < this.levels.length; i++) {
      if (height >= this.levels[i]) {
        return i + 1 // 1 ~ 6
      }
    }

    return 0 // not heading
  }
}
9 changes: 6 additions & 3 deletions src/tiptap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ import { Mathematics } from '@tiptap-pro/extension-mathematics'

// import { writeFileSync } from 'node:fs'

// Node types that carry a unique `id` attribute. Shared by the UniqueID
// extension's `types` option and by JSONDocumentAmender.amendNode.
const uidTypes = ['blockquote', 'codeBlock', 'detailsSummary', 'detailsContent', 'heading', 'listItem', 'paragraph', 'tableHeader', 'tableCell']
const tiptapExtensions = [
Document,
Details.configure({
Expand Down Expand Up @@ -99,7 +100,7 @@ const tiptapExtensions = [
Underline,
UniqueID.configure({
attributeName: "id",
types: ['blockquote', 'codeBlock', 'detailsSummary', 'detailsContent', 'heading', 'listItem', 'paragraph', 'tableHeader', 'tableCell'],
types: uidTypes,
generateID: () => nanoid(6)
}),
Youtube.configure({
Expand Down Expand Up @@ -140,10 +141,12 @@ export class JSONDocumentAmender {
// https://prosemirror.net/docs/ref/#model.Document_Structure
amendNode(node: Node): any {
// attrs: Attrs
if (node.attrs != null) {
if (uidTypes.includes(node.type) && node.attrs == null) {
node.attrs = { id: this.amendId('') }
} else if (node.attrs != null) {
// tiptap BUG: generateJSON reuses some attrs object, we need to clone a new one.
node.attrs = Object.assign({}, node.attrs)
if (Object.hasOwn(node.attrs, 'id')) {
if (uidTypes.includes(node.type)) {
node.attrs.id = this.amendId(node.attrs.id)
}
}
Expand Down

0 comments on commit 7fbdef8

Please sign in to comment.