Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

play with parse pdf’s #527

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions services/cron-jobs/parse-pdf/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
node_modules
build
2 changes: 2 additions & 0 deletions services/cron-jobs/parse-pdf/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
node_modules
build
35 changes: 35 additions & 0 deletions services/cron-jobs/parse-pdf/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# ---- Build stage: install dependencies, compile the TypeScript sources, shrink node_modules ----
FROM node:12-alpine AS BUILD_IMAGE

# curl and bash are needed to download and run the node-prune install script below
RUN apk --no-cache update \
&& apk --no-cache add curl bash \
&& rm -fr /var/cache/apk/*

# install node-prune (https://github.com/tj/node-prune)
# NOTE(review): this pipes a remote script straight into bash without pinning or
# verification - confirm the URL is still served and trusted.
RUN curl -sfL https://install.goreleaser.com/github.com/tj/node-prune.sh | bash -s -- -b /usr/local/bin

WORKDIR /app
# Copy only the manifests first so the dependency install is a separate layer from the sources
COPY package.json yarn.lock ./
RUN yarn --frozen-lockfile
COPY . .

# "build" runs tsc (see package.json), which emits into ./build (see tsconfig outDir)
RUN yarn build

# Drop devDependencies from node_modules before they are copied into the runtime image
# NOTE(review): dependencies are installed with yarn but pruned with npm - confirm this is intended.
RUN npm prune --production

# run node prune
RUN /usr/local/bin/node-prune

# ---- Runtime stage: compiled output plus the pruned node_modules ----
FROM node:12-alpine

WORKDIR /app

# NOTE(review): this copies the whole build context (minus node_modules and build, which
# .dockerignore excludes), so the TypeScript sources end up in the runtime image as well.
# "yarn start" needs package.json; confirm whether anything else is required at runtime.
COPY . .

# copy from build image
COPY --from=BUILD_IMAGE /app/build ./build
COPY --from=BUILD_IMAGE /app/node_modules ./node_modules

ENV NODE_ENV=production

# "start" runs node ./build/index.js (see package.json)
ENTRYPOINT [ "yarn", "start" ]
15 changes: 15 additions & 0 deletions services/cron-jobs/parse-pdf/Dockerfile.dev
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Development image: installs all dependencies (including devDependencies) and runs the
# service straight from the TypeScript sources.
FROM node:12-alpine

# NOTE(review): git is presumably needed for dependencies fetched from git repositories - confirm.
RUN apk --no-cache update \
&& apk --no-cache add git \
&& rm -fr /var/cache/apk/*

WORKDIR /app
# Copy only the manifests first so the dependency install is a separate layer from the sources
COPY package.json yarn.lock ./
RUN yarn --frozen-lockfile

COPY . .

ENV NODE_ENV=development

# "dev" runs ts-node-dev ./src/index (see package.json)
ENTRYPOINT [ "yarn", "dev" ]
25 changes: 25 additions & 0 deletions services/cron-jobs/parse-pdf/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"name": "push-send-queued",
"version": "0.1.6",
"main": "build/index.js",
"license": "Apache-2.0",
"scripts": {
"dev": "ts-node-dev ./src/index",
"lint": "yarn lint:ts && yarn lint:exports",
"lint:ts": "tsc --noEmit",
"lint:exports": "ts-unused-exports ./tsconfig.json --excludePathsFromReport=generated --excludePathsFromReport=resolvers --excludePathsFromReport=/schemas",
"build": "tsc",
"start": "node ./build/index.js",
"apollo:codegen": "apollo client:codegen --target typescript --globalTypesFile=./src/__generated__/globalTypes.ts"
},
"dependencies": {
"@democracy-deutschland/bundestagio-common": "^0.1.7",
"brain.js": "^2.0.0-beta.1",
"pdf2json": "^1.2.0"
},
"devDependencies": {
"ts-node-dev": "^1.0.0-pre.49",
"ts-unused-exports": "^6.2.1",
"typescript": "^3.9.5"
}
}
24 changes: 24 additions & 0 deletions services/cron-jobs/parse-pdf/src/@types/pdf2json/index.d.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/**
 * Minimal ambient typing for the parser class exported by the `pdf2json` package.
 *
 * Only the members this service uses are declared; it is not a complete
 * description of the library.
 */
declare class Pdf2Json {
  // Ambient declarations must not carry an implementation body.
  constructor();

  /** Starts parsing the PDF file at `path`; the outcome is reported through events. */
  loadPDF(path: string): void;

  /** Registers a listener that receives the parsed document once parsing has finished. */
  on(event: "pdfParser_dataReady", cb: (data: any) => void): void;
  /** Registers a listener that receives the failure details when parsing fails. */
  on(event: "pdfParser_dataError", cb: (error: any) => void): void;
}

/**
 * Types that accompany the `Pdf2Json` class. They live in a namespace that merges
 * with the class so they can be imported as named types next to the default import:
 * a module that uses `export =` may not contain any other export.
 */
declare namespace Pdf2Json {
  /**
   * One positioned text item of a parsed page.
   * NOTE(review): the field meanings below follow the pdf2json README - confirm
   * against the library version in use.
   */
  interface TextBlock {
    /** Horizontal position of the item. */
    x: number;
    /** Vertical position of the item. */
    y: number;
    /** Width of the item. */
    w: number;
    /** Space width of the font. */
    sw: number;
    /** Colour index. */
    clr: number;
    /** Text alignment. */
    A: "left" | "center" | "right";
    /** Text runs; only the first run is read by this service. */
    R: [
      {
        /** URI-encoded text of the run. */
        T: string;
        /** Style index. */
        S: number;
        /** Presumably [font face id, font size, bold flag, italic flag] - confirm. */
        TS: [number, number, 0 | 1, 0 | 1];
      }
    ];
  }
}

declare module "pdf2json" {
  export = Pdf2Json;
}
67 changes: 67 additions & 0 deletions services/cron-jobs/parse-pdf/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import mongoConnect from "./mongoose";
import fs from "fs";
import http from "http";
import PDFParser, { TextBlock } from "pdf2json";

declare function unescape(s: string): string;

import { ProcedureModel } from "@democracy-deutschland/bundestagio-common";

/**
 * Options passed to every outgoing HTTP request made by this script.
 * Sends a browser-like User-Agent header.
 * NOTE(review): presumably the document server rejects the default client agent - confirm.
 */
const httpOptions: http.RequestOptions = {
  headers: {
    "User-Agent": "Mozilla/5.0",
  },
};

/**
 * Downloads `url` over plain HTTP and stores the response body at `dest`.
 *
 * NOTE(review): only the `http` module is used, so `https:` URLs are not supported here.
 *
 * @param url  Address of the resource to fetch.
 * @param dest File system path the body is written to.
 * @returns A promise resolving to `dest` once the file has been written completely.
 *          It rejects when the request fails, the server answers with a non-2xx
 *          status, or the file cannot be written; a partial file is removed then.
 */
const download = async (url: string, dest: string): Promise<string> => {
  return new Promise<string>((resolve, reject) => {
    const file = fs.createWriteStream(dest);
    let settled = false;

    // Single failure path: stop writing, remove the partial file, reject once.
    const fail = (err: Error) => {
      if (settled) {
        return;
      }
      settled = true;
      file.destroy();
      fs.unlink(dest, () => {}); // Best effort clean-up; the result is deliberately ignored.
      reject(err);
    };

    // Without this listener a write failure (e.g. missing directory) is an uncaught exception.
    file.on("error", fail);
    file.on("finish", () => {
      if (!settled) {
        settled = true;
        resolve(dest);
      }
    });

    http
      .get(url, httpOptions, (response) => {
        const { statusCode } = response;
        if (statusCode === undefined || statusCode < 200 || statusCode >= 300) {
          // Do not store an error page or a redirect body as if it were the document.
          response.resume();
          fail(
            new Error(`download of ${url} failed with HTTP status ${statusCode}`)
          );
          return;
        }
        response.on("error", fail);
        response.pipe(file);
      })
      .on("error", fail);
  });
};

/**
 * Looks up one procedure and, if one exists, parses a hard-coded local PDF and
 * logs the text items of its second page.
 *
 * The returned promise settles only after parsing has finished, so
 * "DONE PARSER" is logged after the page output and failures reach the caller.
 *
 * @throws (as a rejection) when the parser reports an error or the PDF has no second page.
 */
const start = async (): Promise<void> => {
  console.log("START PARSER");
  const procedure = await ProcedureModel.findOne({});
  if (procedure) {
    // Only referenced by the disabled download below.
    const document = procedure.importantDocuments[0];
    // const path = await download(
    //   document.url,
    //   `/tmp/${document.number.replace("/", "0")}.pdf`
    // );
    const path = "/tmp/1901596.pdf";
    console.log(path);
    const pdfParser = new PDFParser();

    await new Promise<void>((resolve, reject) => {
      pdfParser.on("pdfParser_dataReady", (data) => {
        // An exception thrown inside an event callback would bypass the caller,
        // so it is turned into a rejection instead.
        try {
          const page = data.formImage.Pages[1];
          if (!page) {
            throw new Error(`${path} has no second page`);
          }
          const texts = page.Texts as any[];
          console.log(texts);
          resolve();
        } catch (err) {
          reject(err);
        }
      });

      // The local typings may only declare the "dataReady" event, so the parser is
      // viewed through a wider event signature to subscribe to the error event.
      const parserEvents: {
        on(event: string, cb: (payload: { parserError?: unknown }) => void): void;
      } = pdfParser;
      parserEvents.on("pdfParser_dataError", (payload) => {
        const detail = payload && payload.parserError ? payload.parserError : payload;
        reject(new Error(`failed to parse ${path}: ${String(detail)}`));
      });

      // Listeners are attached before parsing is started.
      pdfParser.loadPDF(path);
    });
  }
  console.log("DONE PARSER");
};

/**
 * Script entry point: validates the environment, connects to MongoDB, logs the
 * number of procedures and runs the parser.
 * Any failure is logged and ends the process with exit code 1.
 */
(async () => {
  console.info("START");
  // Only report whether the variable is set: a connection string can embed credentials.
  console.info("process.env DB_URL set:", Boolean(process.env.DB_URL));
  if (!process.env.DB_URL) {
    throw new Error("you have to set environment variable: DB_URL");
  }
  await mongoConnect();
  console.log("procedures", await ProcedureModel.countDocuments({}));
  await start();
  // process.exit(0);
})().catch((err) => {
  // Without this handler a failure would surface as an unhandled rejection
  // and the reason for the failure would never be logged.
  console.error(err);
  process.exit(1);
});
102 changes: 102 additions & 0 deletions services/cron-jobs/parse-pdf/src/index_positions.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import mongoConnect from "./mongoose";
import fs from "fs";
import http from "http";
import PDFParser, { TextBlock } from "pdf2json";

declare function unescape(s: string): string;

import { ProcedureModel } from "@democracy-deutschland/bundestagio-common";

/**
 * Options passed to every outgoing HTTP request made by this script.
 * Sends a browser-like User-Agent header.
 * NOTE(review): presumably the document server rejects the default client agent - confirm.
 */
const httpOptions: http.RequestOptions = {
  headers: {
    "User-Agent": "Mozilla/5.0",
  },
};

/**
 * Downloads `url` over plain HTTP and stores the response body at `dest`.
 *
 * NOTE(review): only the `http` module is used, so `https:` URLs are not supported here.
 *
 * @param url  Address of the resource to fetch.
 * @param dest File system path the body is written to.
 * @returns A promise resolving to `dest` once the file has been written completely.
 *          It rejects when the request fails, the server answers with a non-2xx
 *          status, or the file cannot be written; a partial file is removed then.
 */
const download = async (url: string, dest: string): Promise<string> => {
  return new Promise<string>((resolve, reject) => {
    const file = fs.createWriteStream(dest);
    let settled = false;

    // Single failure path: stop writing, remove the partial file, reject once.
    const fail = (err: Error) => {
      if (settled) {
        return;
      }
      settled = true;
      file.destroy();
      fs.unlink(dest, () => {}); // Best effort clean-up; the result is deliberately ignored.
      reject(err);
    };

    // Without this listener a write failure (e.g. missing directory) is an uncaught exception.
    file.on("error", fail);
    file.on("finish", () => {
      if (!settled) {
        settled = true;
        resolve(dest);
      }
    });

    http
      .get(url, httpOptions, (response) => {
        const { statusCode } = response;
        if (statusCode === undefined || statusCode < 200 || statusCode >= 300) {
          // Do not store an error page or a redirect body as if it were the document.
          response.resume();
          fail(
            new Error(`download of ${url} failed with HTTP status ${statusCode}`)
          );
          return;
        }
        response.on("error", fail);
        response.pipe(file);
      })
      .on("error", fail);
  });
};

/**
 * Joins consecutive text items that sit directly next to each other into text blocks.
 *
 * An item continues the current block when it starts no further right than the
 * end of the previous item (plus a tolerance of 0.1) and less than one unit
 * below it. Otherwise the current block is closed and a new one is started.
 * The final block is always emitted, including when it consists of a single item.
 *
 * @param texts Text items of one page, in document order.
 * @returns The decoded text of each merged block.
 */
const mergeTextBlocks = (texts: TextBlock[]): string[] => {
  const merged: string[] = [];
  let current: string | undefined;

  texts.forEach((textBlock, index) => {
    const text = decodeURIComponent(textBlock.R[0].T);
    if (index === 0 || current === undefined) {
      current = text;
      return;
    }
    const prevText = texts[index - 1];
    const prevEnd = prevText.x + prevText.w;
    const continuesBlock =
      textBlock.x <= prevEnd + 0.1 && textBlock.y < prevText.y + 1;

    if (continuesBlock) {
      // A separating space is inserted below y = 5 or when there is a gap to the previous item.
      const space = textBlock.y > 5 || textBlock.x - prevEnd > 0 ? " " : "";
      current += space + text;
    } else {
      merged.push(current);
      current = text;
    }
  });

  if (current !== undefined) {
    merged.push(current);
  }
  return merged;
};

/**
 * Looks up one procedure and, if one exists, parses a hard-coded local PDF,
 * writes its first page as JSON next to the PDF and logs the merged text blocks.
 *
 * The returned promise settles only after parsing has finished, so
 * "DONE PARSER" is logged after the page output and failures reach the caller.
 *
 * @throws (as a rejection) when the parser reports an error, the PDF has no
 *         page, or the JSON file cannot be written.
 */
const start = async (): Promise<void> => {
  console.log("START PARSER");
  const procedure = await ProcedureModel.findOne({});
  if (procedure) {
    // Only referenced by the disabled download below.
    const document = procedure.importantDocuments[0];
    // const path = await download(
    //   document.url,
    //   `/tmp/${document.number.replace("/", "0")}.pdf`
    // );
    const path = "/tmp/1901596.pdf";
    console.log(path);
    const pdfParser = new PDFParser();

    await new Promise<void>((resolve, reject) => {
      pdfParser.on("pdfParser_dataReady", (data) => {
        // An exception thrown inside an event callback would bypass the caller,
        // so it is turned into a rejection instead.
        try {
          const page = data.formImage.Pages[0];
          if (!page) {
            throw new Error(`${path} has no pages`);
          }
          // writeFileSync accepts strings and buffers, not plain objects.
          fs.writeFileSync(`${path}.json`, JSON.stringify(page));

          const texts = page.Texts as any[];

          // Debug output for the first items: text, raw item and distance to the previous item.
          texts.slice(1, 10).forEach((textBlock: TextBlock, offset: number) => {
            const prevText = texts[offset];
            console.log(
              decodeURIComponent(textBlock.R[0].T),
              textBlock,
              textBlock.x + textBlock.sw - (prevText.x + prevText.w)
            );
          });

          console.log(mergeTextBlocks(texts));
          resolve();
        } catch (err) {
          reject(err);
        }
      });

      // The local typings may only declare the "dataReady" event, so the parser is
      // viewed through a wider event signature to subscribe to the error event.
      const parserEvents: {
        on(event: string, cb: (payload: { parserError?: unknown }) => void): void;
      } = pdfParser;
      parserEvents.on("pdfParser_dataError", (payload) => {
        const detail = payload && payload.parserError ? payload.parserError : payload;
        reject(new Error(`failed to parse ${path}: ${String(detail)}`));
      });

      // Listeners are attached before parsing is started.
      pdfParser.loadPDF(path);
    });
  }
  console.log("DONE PARSER");
};

/**
 * Script entry point: validates the environment, connects to MongoDB, logs the
 * number of procedures and runs the parser.
 * Any failure is logged and ends the process with exit code 1.
 */
(async () => {
  console.info("START");
  // Only report whether the variable is set: a connection string can embed credentials.
  console.info("process.env DB_URL set:", Boolean(process.env.DB_URL));
  if (!process.env.DB_URL) {
    throw new Error("you have to set environment variable: DB_URL");
  }
  await mongoConnect();
  console.log("procedures", await ProcedureModel.countDocuments({}));
  await start();
  // process.exit(0);
})().catch((err) => {
  // Without this handler a failure would surface as an unhandled rejection
  // and the reason for the failure would never be logged.
  console.error(err);
  process.exit(1);
});
22 changes: 22 additions & 0 deletions services/cron-jobs/parse-pdf/src/mongoose.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import { mongoose } from "@democracy-deutschland/bundestagio-common";
/**
 * Opens the mongoose connection to the database named by `process.env.DB_URL`.
 *
 * @returns A promise that resolves once the connection emits "connected". It
 *          rejects when `DB_URL` is not set or when the connection fails before
 *          it has been established.
 */
export default (): Promise<void> =>
  new Promise<void>((resolve, reject) => {
    const dbUrl = process.env.DB_URL;
    if (!dbUrl) {
      reject(new Error("you have to set environment variable: DB_URL"));
      return;
    }

    mongoose.set("useFindAndModify", false);
    // Mongo Debug
    mongoose.set("debug", false);

    let connected = false;

    // Listeners are registered before connecting so no event can be missed.
    mongoose.connection.once("connected", () => {
      connected = true;
      console.info("MongoDB is running");
      resolve();
    });
    mongoose.connection.on("error", (e: Error) => {
      console.error(e.stack);
      if (connected) {
        // Nobody awaits this promise any more; keep failing hard as before.
        throw e;
      }
      // While connecting, hand the error to the awaiting caller instead of
      // throwing from inside an event listener, where it cannot be caught.
      reject(e);
    });

    mongoose
      .connect(dbUrl, {
        useNewUrlParser: true,
        useUnifiedTopology: true,
      })
      // The event listener above may already have rejected; a second reject is a no-op.
      .catch((e: Error) => reject(e));
  });
72 changes: 72 additions & 0 deletions services/cron-jobs/parse-pdf/tsconfig.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
{
"compilerOptions": {
/* Visit https://aka.ms/tsconfig.json to read more about this file */

/* Basic Options */
// "incremental": true, /* Enable incremental compilation */
"target": "es6" /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */,
"module": "commonjs" /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */,
// "lib": [], /* Specify library files to be included in the compilation. */
// "allowJs": true, /* Allow javascript files to be compiled. */
// "checkJs": true, /* Report errors in .js files. */
// "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */
// "declaration": true, /* Generates corresponding '.d.ts' file. */
// "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */
// "sourceMap": true, /* Generates corresponding '.map' file. */
// "outFile": "./", /* Concatenate and emit output to single file. */
"outDir": "./build" /* Redirect output structure to the directory. */,
"rootDir": "./src" /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */,
// "composite": true, /* Enable project compilation */
// "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */
// "removeComments": true, /* Do not emit comments to output. */
// "noEmit": true, /* Do not emit outputs. */
// "importHelpers": true, /* Import emit helpers from 'tslib'. */
// "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */
// "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */

/* Strict Type-Checking Options */
"strict": true /* Enable all strict type-checking options. */,
// "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */
// "strictNullChecks": true, /* Enable strict null checks. */
// "strictFunctionTypes": true, /* Enable strict checking of function types. */
// "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */
// "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */
// "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */
// "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */

/* Additional Checks */
// "noUnusedLocals": true, /* Report errors on unused locals. */
// "noUnusedParameters": true, /* Report errors on unused parameters. */
// "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */
// "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */

/* Module Resolution Options */
// "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */
// "baseUrl": "./src/" /* Base directory to resolve non-absolute module names. */,
// "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */
// "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */
"typeRoots": [
"./src/@types",
"./node_modules/@types"
] /* List of folders to include type definitions from. */,
// "types": [], /* Type declaration files to be included in compilation. */
// "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */
"esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */,
// "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */

/* Source Map Options */
// "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */
// "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */
// "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */

/* Experimental Options */
// "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */
// "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */

/* Advanced Options */
"skipLibCheck": true /* Skip type checking of declaration files. */,
"forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */
}
}
Loading
Loading