Skip to content

Commit

Permalink
play with parse pdf’s
Browse files Browse the repository at this point in the history
Signed-off-by: ManAnRuck <[email protected]>
  • Loading branch information
ManAnRuck authored and Manuel Ruck committed Nov 1, 2023
1 parent 2d2e877 commit efc7310
Show file tree
Hide file tree
Showing 11 changed files with 2,573 additions and 0 deletions.
2 changes: 2 additions & 0 deletions services/cron-jobs/parse-pdf/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
node_modules
build
2 changes: 2 additions & 0 deletions services/cron-jobs/parse-pdf/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
node_modules
build
35 changes: 35 additions & 0 deletions services/cron-jobs/parse-pdf/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Two-stage build: the first stage (BUILD_IMAGE) installs all dependencies and
# compiles the project, the second stage only receives the compiled output and
# the pruned node_modules.
FROM node:12-alpine AS BUILD_IMAGE

# install curl and bash; both are needed to fetch and run the node-prune
# installer script below
RUN apk --no-cache update \
&& apk --no-cache add curl bash \
&& rm -fr /var/cache/apk/*

# install node-prune (https://github.com/tj/node-prune) into /usr/local/bin
RUN curl -sfL https://install.goreleaser.com/github.com/tj/node-prune.sh | bash -s -- -b /usr/local/bin

WORKDIR /app
# copy only the manifests first so the dependency layer can be cached
# independently of source changes
COPY package.json yarn.lock ./
RUN yarn --frozen-lockfile
COPY . .

# compile the sources (the "build" script of package.json)
RUN yarn build

# drop devDependencies from node_modules
RUN npm prune --production

# run node prune to strip further files from node_modules
RUN /usr/local/bin/node-prune

# runtime stage
FROM node:12-alpine

WORKDIR /app

# copy the build context (package.json is required for "yarn start")
# NOTE(review): presumably .dockerignore keeps node_modules and build out of
# this copy - confirm the build context is this service's directory
COPY . .

# copy compiled output and pruned dependencies from the build stage
COPY --from=BUILD_IMAGE /app/build ./build
COPY --from=BUILD_IMAGE /app/node_modules ./node_modules

ENV NODE_ENV=production

# runs the "start" script of package.json
ENTRYPOINT [ "yarn", "start" ]
15 changes: 15 additions & 0 deletions services/cron-jobs/parse-pdf/Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Development image: installs all dependencies (including devDependencies) and
# starts the service through the "dev" script of package.json.
FROM node:12-alpine

# install git
# NOTE(review): presumably required by a dependency that is fetched via git - confirm
RUN apk --no-cache update \
&& apk --no-cache add git \
&& rm -fr /var/cache/apk/*

WORKDIR /app
# copy only the manifests first so the dependency layer can be cached
# independently of source changes
COPY package.json yarn.lock ./
RUN yarn --frozen-lockfile

COPY . .

ENV NODE_ENV=development

# runs the "dev" script of package.json
ENTRYPOINT [ "yarn", "dev" ]
25 changes: 25 additions & 0 deletions services/cron-jobs/parse-pdf/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"name": "push-send-queued",
"version": "0.1.6",
"main": "build/index.js",
"license": "Apache-2.0",
"scripts": {
"dev": "ts-node-dev ./src/index",
"lint": "yarn lint:ts && yarn lint:exports",
"lint:ts": "tsc --noEmit",
"lint:exports": "ts-unused-exports ./tsconfig.json --excludePathsFromReport=generated --excludePathsFromReport=resolvers --excludePathsFromReport=/schemas",
"build": "tsc",
"start": "node ./build/index.js",
"apollo:codegen": "apollo client:codegen --target typescript --globalTypesFile=./src/__generated__/globalTypes.ts"
},
"dependencies": {
"@democracy-deutschland/bundestagio-common": "^0.1.7",
"brain.js": "^2.0.0-beta.1",
"pdf2json": "^1.2.0"
},
"devDependencies": {
"ts-node-dev": "^1.0.0-pre.49",
"ts-unused-exports": "^6.2.1",
"typescript": "^3.9.5"
}
}
24 changes: 24 additions & 0 deletions services/cron-jobs/parse-pdf/src/@types/pdf2json/index.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/**
 * Minimal ambient typings for the parts of `pdf2json` this service uses.
 *
 * The class and the namespace of the same name merge, so the module's
 * `export =` exposes the parser class as the (interop) default import and
 * `TextBlock` as a named type import:
 *
 *   import PDFParser, { TextBlock } from "pdf2json";
 */
declare class Pdf2Json {
  // Ambient declarations must not carry an implementation body.
  constructor();

  /** Starts loading the PDF at `path`; the outcome is reported through events. */
  loadPDF(path: string): void;

  /** Registers a listener that receives the parsed document. */
  on(event: "pdfParser_dataReady", cb: (data: any) => void): void;
  /**
   * Registers a listener for parse failures.
   * NOTE(review): payload shape taken from the pdf2json README - confirm
   * against the installed version.
   */
  on(
    event: "pdfParser_dataError",
    cb: (errData: { parserError: unknown }) => void
  ): void;
}

declare namespace Pdf2Json {
  /**
   * One positioned text element of a page.
   * NOTE(review): field meanings below follow the pdf2json README - confirm.
   */
  interface TextBlock {
    /** Horizontal position. */
    x: number;
    /** Vertical position. */
    y: number;
    /** Width of the text. */
    w: number;
    /** Presumably the width of a space character - confirm. */
    sw: number;
    /** Presumably a colour index - confirm. */
    clr: number;
    /** Alignment. */
    A: "left" | "center" | "right";
    /** Text runs; `T` is URI-encoded text (callers decode it). */
    R: [
      {
        T: string;
        S: number;
        TS: [number, number, 0 | 1, 0 | 1];
      }
    ];
  }
}

declare module "pdf2json" {
  // `export =` cannot be combined with other exports in the same module, so
  // the additional types live in the merged namespace above.
  export = Pdf2Json;
}
67 changes: 67 additions & 0 deletions services/cron-jobs/parse-pdf/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import mongoConnect from "./mongoose";
import fs from "fs";
import http from "http";
import PDFParser, { TextBlock } from "pdf2json";

declare function unescape(s: string): string;

import { ProcedureModel } from "@democracy-deutschland/bundestagio-common";

const httpOptions: http.RequestOptions = {
headers: { "User-Agent": "Mozilla/5.0" },
};

/**
 * Downloads `url` over HTTP into the file `dest`.
 *
 * @param url  Address of the resource to fetch (plain `http` only).
 * @param dest Path of the file to write.
 * @returns A promise resolving with `dest` once all data has been flushed.
 *          It rejects when the request fails, the server answers with a
 *          non-2xx status (redirects are not followed), or the file cannot
 *          be written. On failure the partial file is removed (best effort).
 */
const download = (url: string, dest: string): Promise<string> =>
  new Promise<string>((resolve, reject) => {
    const file = fs.createWriteStream(dest);

    // Single failure path: stop writing, drop the partial file, reject.
    const fail = (err: Error): void => {
      file.destroy();
      fs.unlink(dest, () => {}); // best effort, result intentionally ignored
      reject(err);
    };

    file.on("error", fail);
    file.on("finish", () => resolve(dest));

    http
      .get(url, httpOptions, (response) => {
        const { statusCode } = response;
        if (statusCode === undefined || statusCode < 200 || statusCode >= 300) {
          // Drain the body so the socket is released, then report the error
          // instead of storing an error page as if it were the document.
          response.resume();
          fail(
            new Error(`Download of ${url} failed with HTTP status ${statusCode}`)
          );
          return;
        }
        response.on("error", fail);
        response.pipe(file);
      })
      .on("error", fail);
  });

/**
 * Loads one procedure from the database and, if one exists, parses a locally
 * stored PDF and prints the text elements of its second page.
 *
 * The returned promise settles only after the parser delivered its data, so
 * "DONE PARSER" is logged after the output and failures inside the data
 * handler reject the promise instead of surfacing as uncaught exceptions.
 */
const start = async (): Promise<void> => {
  console.log("START PARSER");
  const procedure = await ProcedureModel.findOne({});
  if (procedure) {
    // Only needed by the commented-out download below.
    const document = procedure.importantDocuments[0];
    // const path = await download(
    //   document.url,
    //   `/tmp/${document.number.replace("/", "0")}.pdf`
    // );
    const path = "/tmp/1901596.pdf";
    console.log(path);
    const pdfParser = new PDFParser();

    // NOTE(review): parse failures are not observed here; pdf2json reports
    // them through a separate event that would need its own listener. Until
    // then this promise stays pending when the file cannot be parsed.
    await new Promise<void>((resolve, reject) => {
      // Attach the listener before starting the load so no event is missed.
      pdfParser.on("pdfParser_dataReady", (data) => {
        try {
          const page = data.formImage.Pages[1];
          if (!page) {
            throw new Error(`${path} has no second page`);
          }

          // data.formImage.Pages.forEach((page: any) => {
          const texts = page.Texts as any[];
          console.log(texts);
          // });
          resolve();
        } catch (err) {
          reject(err);
        }
      });
      pdfParser.loadPDF(path);
    });
  }
  console.log("DONE PARSER");
};

// Entry point: validates the configuration, connects to MongoDB and runs the
// parser. Any failure is logged and ends the process with exit code 1, so a
// broken run is visible to whatever scheduled the job.
(async () => {
  console.info("START");
  console.info("process.env", process.env.DB_URL);
  if (!process.env.DB_URL) {
    throw new Error("you have to set environment variable: DB_URL");
  }
  await mongoConnect();
  console.log("procedures", await ProcedureModel.countDocuments({}));
  await start();
  // process.exit(0);
})().catch((err: unknown) => {
  // Log the cause before exiting; previously errors from start() were dropped
  // silently and earlier failures ended up as unhandled rejections.
  console.error(err);
  process.exit(1);
});
102 changes: 102 additions & 0 deletions services/cron-jobs/parse-pdf/src/index_positions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import mongoConnect from "./mongoose";
import fs from "fs";
import http from "http";
import PDFParser, { TextBlock } from "pdf2json";

declare function unescape(s: string): string;

import { ProcedureModel } from "@democracy-deutschland/bundestagio-common";

const httpOptions: http.RequestOptions = {
headers: { "User-Agent": "Mozilla/5.0" },
};

/**
 * Downloads `url` over HTTP into the file `dest`.
 *
 * @param url  Address of the resource to fetch (plain `http` only).
 * @param dest Path of the file to write.
 * @returns A promise resolving with `dest` once all data has been flushed.
 *          It rejects when the request fails, the server answers with a
 *          non-2xx status (redirects are not followed), or the file cannot
 *          be written. On failure the partial file is removed (best effort).
 */
const download = (url: string, dest: string): Promise<string> =>
  new Promise<string>((resolve, reject) => {
    const file = fs.createWriteStream(dest);

    // Single failure path: stop writing, drop the partial file, reject.
    const fail = (err: Error): void => {
      file.destroy();
      fs.unlink(dest, () => {}); // best effort, result intentionally ignored
      reject(err);
    };

    file.on("error", fail);
    file.on("finish", () => resolve(dest));

    http
      .get(url, httpOptions, (response) => {
        const { statusCode } = response;
        if (statusCode === undefined || statusCode < 200 || statusCode >= 300) {
          // Drain the body so the socket is released, then report the error
          // instead of storing an error page as if it were the document.
          response.resume();
          fail(
            new Error(`Download of ${url} failed with HTTP status ${statusCode}`)
          );
          return;
        }
        response.on("error", fail);
        response.pipe(file);
      })
      .on("error", fail);
  });

/**
 * Joins consecutive text elements into strings.
 *
 * An element is appended to the current string when it starts no further
 * right than the end of its predecessor plus a tolerance of 0.1 and lies
 * less than 1 unit below it; otherwise the current string is closed and a
 * new one is started. A blank is inserted between joined elements when the
 * element's `y` is greater than 5 or when there is a positive horizontal gap.
 * NOTE(review): the thresholds look empirically chosen - confirm.
 *
 * @param texts Text elements in the order delivered by the parser.
 * @returns The merged strings; empty when `texts` is empty.
 */
const mergeTextBlocks = (texts: TextBlock[]): string[] => {
  const merged: string[] = [];
  let current: string | undefined;

  for (let index = 0; index < texts.length; index++) {
    const textBlock = texts[index];
    const text = decodeURIComponent(textBlock.R[0].T);

    if (current === undefined) {
      current = text;
      continue;
    }

    const prevText = texts[index - 1];
    if (
      textBlock.x <= prevText.x + prevText.w + 0.1 &&
      textBlock.y < prevText.y + 1
    ) {
      const space =
        textBlock.y > 5 || textBlock.x - (prevText.x + prevText.w) > 0
          ? " "
          : "";
      current += space + text;
    } else {
      merged.push(current);
      current = text;
    }
  }

  // Flush the pending string; without this the last group is lost whenever
  // the final element starts a new group (or is the only element).
  if (current !== undefined) {
    merged.push(current);
  }
  return merged;
};

/**
 * Loads one procedure from the database and, if one exists, parses a locally
 * stored PDF, writes its first page as JSON next to the PDF and prints the
 * merged text of that page.
 *
 * The returned promise settles only after the parser delivered its data, so
 * "DONE PARSER" is logged after the output and failures inside the data
 * handler reject the promise instead of surfacing as uncaught exceptions.
 */
const start = async (): Promise<void> => {
  console.log("START PARSER");
  const procedure = await ProcedureModel.findOne({});
  if (procedure) {
    // Only needed by the commented-out download below.
    const document = procedure.importantDocuments[0];
    // const path = await download(
    //   document.url,
    //   `/tmp/${document.number.replace("/", "0")}.pdf`
    // );
    const path = "/tmp/1901596.pdf";
    console.log(path);
    const pdfParser = new PDFParser();

    // NOTE(review): parse failures are not observed here; pdf2json reports
    // them through a separate event that would need its own listener. Until
    // then this promise stays pending when the file cannot be parsed.
    await new Promise<void>((resolve, reject) => {
      // Attach the listener before starting the load so no event is missed.
      pdfParser.on("pdfParser_dataReady", (data) => {
        try {
          const page = data.formImage.Pages[0];
          if (!page) {
            throw new Error(`${path} contains no pages`);
          }
          // writeFileSync needs a string/buffer, so serialise the page object.
          fs.writeFileSync(`${path}.json`, JSON.stringify(page));

          // data.formImage.Pages.forEach((page: any) => {
          const texts = page.Texts as TextBlock[];

          // Debug aid: print the first elements together with their
          // horizontal offset relative to the end of the predecessor.
          for (let index = 1; index < Math.min(texts.length, 10); index++) {
            const textBlock = texts[index];
            const prevText = texts[index - 1];
            console.log(
              decodeURIComponent(textBlock.R[0].T),
              textBlock,
              textBlock.x + textBlock.sw - (prevText.x + prevText.w)
            );
          }

          const textBlocks = mergeTextBlocks(texts);
          console.log(textBlocks);
          // });
          resolve();
        } catch (err) {
          reject(err);
        }
      });
      pdfParser.loadPDF(path);
    });
  }
  console.log("DONE PARSER");
};

// Entry point: validates the configuration, connects to MongoDB and runs the
// parser. Any failure is logged and ends the process with exit code 1, so a
// broken run is visible to whatever scheduled the job.
(async () => {
  console.info("START");
  console.info("process.env", process.env.DB_URL);
  if (!process.env.DB_URL) {
    throw new Error("you have to set environment variable: DB_URL");
  }
  await mongoConnect();
  console.log("procedures", await ProcedureModel.countDocuments({}));
  await start();
  // process.exit(0);
})().catch((err: unknown) => {
  // Log the cause before exiting; previously errors from start() were dropped
  // silently and earlier failures ended up as unhandled rejections.
  console.error(err);
  process.exit(1);
});
22 changes: 22 additions & 0 deletions services/cron-jobs/parse-pdf/src/mongoose.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import { mongoose } from "@democracy-deutschland/bundestagio-common";
/**
 * Connects mongoose to the database named by the `DB_URL` environment
 * variable.
 *
 * @returns A promise that resolves once the connection reports "connected".
 *          It rejects when `DB_URL` is not set or when the initial
 *          `mongoose.connect` call fails.
 */
export default (): Promise<void> =>
  new Promise<void>((resolve, reject) => {
    const dbUrl = process.env.DB_URL;
    if (!dbUrl) {
      reject(new Error("you have to set environment variable: DB_URL"));
      return;
    }

    mongoose.set("useFindAndModify", false);
    // Mongo Debug
    mongoose.set("debug", false);

    mongoose.connection.once("connected", () => {
      console.info("MongoDB is running");
      resolve();
    });
    mongoose.connection.on("error", (e: Error) => {
      // Unknown if this ends up in main - therefore we log here.
      // Throwing from a listener is not caught by this promise; it propagates
      // to whoever emitted the event (typically an uncaught exception).
      console.error(e.stack);
      throw e;
    });

    // Forward a failed initial connection to the caller instead of leaving
    // the promise returned by connect() unhandled.
    mongoose
      .connect(dbUrl, {
        useNewUrlParser: true,
        useUnifiedTopology: true,
      })
      .catch(reject);
  });
72 changes: 72 additions & 0 deletions services/cron-jobs/parse-pdf/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig.json to read more about this file */

/* Basic Options */
// "incremental": true, /* Enable incremental compilation */
"target": "es6" /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */,
"module": "commonjs" /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */,
// "lib": [], /* Specify library files to be included in the compilation. */
// "allowJs": true, /* Allow javascript files to be compiled. */
// "checkJs": true, /* Report errors in .js files. */
// "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */
// "declaration": true, /* Generates corresponding '.d.ts' file. */
// "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */
// "sourceMap": true, /* Generates corresponding '.map' file. */
// "outFile": "./", /* Concatenate and emit output to single file. */
"outDir": "./build" /* Redirect output structure to the directory. */,
"rootDir": "./src" /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */,
// "composite": true, /* Enable project compilation */
// "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */
// "removeComments": true, /* Do not emit comments to output. */
// "noEmit": true, /* Do not emit outputs. */
// "importHelpers": true, /* Import emit helpers from 'tslib'. */
// "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */
// "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */

/* Strict Type-Checking Options */
"strict": true /* Enable all strict type-checking options. */,
// "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* Enable strict null checks. */
// "strictFunctionTypes": true, /* Enable strict checking of function types. */
// "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */
// "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */
// "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */
// "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */

/* Additional Checks */
// "noUnusedLocals": true, /* Report errors on unused locals. */
// "noUnusedParameters": true, /* Report errors on unused parameters. */
// "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */
// "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */

/* Module Resolution Options */
// "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */
// "baseUrl": "./src/" /* Base directory to resolve non-absolute module names. */,
// "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */
// "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */
"typeRoots": [
"./src/@types",
"./node_modules/@types"
] /* List of folders to include type definitions from. */,
// "types": [], /* Type declaration files to be included in compilation. */
// "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */
"esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */,
// "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */

/* Source Map Options */
// "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */
// "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */

/* Experimental Options */
// "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */
// "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */

/* Advanced Options */
"skipLibCheck": true /* Skip type checking of declaration files. */,
"forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */
}
}
Loading

0 comments on commit efc7310

Please sign in to comment.