-
Notifications
You must be signed in to change notification settings - Fork 9
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Signed-off-by: ManAnRuck <[email protected]>
- Loading branch information
Showing
11 changed files
with
2,573 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
node_modules | ||
build |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
node_modules | ||
build |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
# ---- build stage: install all dependencies, compile TypeScript, prune ----
FROM node:12-alpine AS BUILD_IMAGE

# curl and bash are needed by the node-prune install script below.
# `apk add --no-cache` fetches the package index on the fly and leaves no
# cache behind, so no separate `apk update` / cache cleanup is required.
RUN apk add --no-cache curl bash

# install node-prune (https://github.com/tj/node-prune)
RUN curl -sfL https://install.goreleaser.com/github.com/tj/node-prune.sh | bash -s -- -b /usr/local/bin

WORKDIR /app
# Copy the manifests first so the dependency layer stays cached until they change.
COPY package.json yarn.lock ./
RUN yarn --frozen-lockfile
COPY . .

RUN yarn build

# drop devDependencies from node_modules
RUN npm prune --production

# run node prune
RUN /usr/local/bin/node-prune

# ---- runtime stage: only what `yarn start` needs ----
FROM node:12-alpine

WORKDIR /app

# `yarn start` only needs the manifest; compiled code and pruned dependencies
# come from the build stage, so sources and tooling files stay out of the image.
COPY package.json ./

# copy from build image
COPY --from=BUILD_IMAGE /app/build ./build
COPY --from=BUILD_IMAGE /app/node_modules ./node_modules

ENV NODE_ENV=production

ENTRYPOINT [ "yarn", "start" ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
# Development image: installs all dependencies and runs the service via `yarn dev`.
FROM node:12-alpine

# `apk add --no-cache` fetches the package index on the fly and leaves no
# cache behind, so no separate `apk update` / cache cleanup is required.
RUN apk add --no-cache git

WORKDIR /app
# Copy the manifests first so the dependency layer stays cached until they change.
COPY package.json yarn.lock ./
RUN yarn --frozen-lockfile

COPY . .

ENV NODE_ENV=development

ENTRYPOINT [ "yarn", "dev" ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{ | ||
"name": "push-send-queued", | ||
"version": "0.1.6", | ||
"main": "build/index.js", | ||
"license": "Apache-2.0", | ||
"scripts": { | ||
"dev": "ts-node-dev ./src/index", | ||
"lint": "yarn lint:ts && yarn lint:exports", | ||
"lint:ts": "tsc --noEmit", | ||
"lint:exports": "ts-unused-exports ./tsconfig.json --excludePathsFromReport=generated --excludePathsFromReport=resolvers --excludePathsFromReport=/schemas", | ||
"build": "tsc", | ||
"start": "node ./build/index.js", | ||
"apollo:codegen": "apollo client:codegen --target typescript --globalTypesFile=./src/__generated__/globalTypes.ts" | ||
}, | ||
"dependencies": { | ||
"@democracy-deutschland/bundestagio-common": "^0.1.7", | ||
"brain.js": "^2.0.0-beta.1", | ||
"pdf2json": "^1.2.0" | ||
}, | ||
"devDependencies": { | ||
"ts-node-dev": "^1.0.0-pre.49", | ||
"ts-unused-exports": "^6.2.1", | ||
"typescript": "^3.9.5" | ||
} | ||
} |
24 changes: 24 additions & 0 deletions
24
services/cron-jobs/parse-pdf/src/@types/pdf2json/index.d.ts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
/**
 * Minimal hand-written typings for the untyped `pdf2json` package, covering
 * only the members this service uses.
 *
 * The class and its companion namespace are merged so that the module can be
 * exported with `export =` while still offering `TextBlock` as a named type
 * import (`import PDFParser, { TextBlock } from "pdf2json"`).
 */
declare class Pdf2Json {
  // Ambient declarations must not carry an implementation body.
  constructor();

  /** Starts loading the PDF at `path`; results arrive through the events below. */
  loadPDF(path: string): void;

  /** Emitted with the parsed document once parsing has finished. */
  on(event: "pdfParser_dataReady", cb: (data: any) => void): void;
  /** Emitted when parsing fails (event name as given in the pdf2json README). */
  on(event: "pdfParser_dataError", cb: (error: any) => void): void;
}

declare namespace Pdf2Json {
  /** One styled run of text inside a {@link TextBlock}. */
  interface TextRun {
    /** Text content; callers in this service decode it with `decodeURIComponent`. */
    T: string;
    S: number;
    TS: [number, number, 0 | 1, 0 | 1];
  }

  /**
   * A positioned piece of text on a page.
   * NOTE(review): field meanings (position, width, space width, colour,
   * alignment) follow the pdf2json README; confirm there before relying on units.
   */
  interface TextBlock {
    x: number;
    y: number;
    w: number;
    sw: number;
    clr: number;
    A: "left" | "center" | "right";
    /** Text runs of this block; presumably one or more entries (confirm against the README). */
    R: TextRun[];
  }
}

declare module "pdf2json" {
  export = Pdf2Json;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
import mongoConnect from "./mongoose"; | ||
import fs from "fs"; | ||
import http from "http"; | ||
import PDFParser, { TextBlock } from "pdf2json"; | ||
|
||
declare function unescape(s: string): string; | ||
|
||
import { ProcedureModel } from "@democracy-deutschland/bundestagio-common"; | ||
|
||
/** Request options shared by all downloads; sends a browser-like User-Agent header. */
const httpOptions: http.RequestOptions = {
  headers: { "User-Agent": "Mozilla/5.0" },
};

/**
 * Downloads `url` via `http.get` and writes the response body to `dest`.
 *
 * @param url  address of the file to fetch
 * @param dest local path the body is written to
 * @returns promise resolving to `dest` once the file has been fully written
 * @throws (rejects) on request errors, write errors, or any non-200 response;
 *         in all of these cases the partially written file is removed
 */
const download = (url: string, dest: string): Promise<string> =>
  new Promise<string>((resolve, reject) => {
    const file = fs.createWriteStream(dest);
    let failed = false;

    // Single failure path: close the stream, remove the partial file, reject once.
    const fail = (err: Error): void => {
      if (failed) {
        return;
      }
      failed = true;
      file.destroy();
      fs.unlink(dest, () => {}); // best effort; the result is deliberately ignored
      reject(err);
    };

    file.on("error", fail);
    file.on("finish", () => resolve(dest));

    http
      .get(url, httpOptions, (response) => {
        if (response.statusCode !== 200) {
          // Drain the body so the socket is released, then report the failure
          // instead of saving an error page (or redirect notice) as the file.
          response.resume();
          fail(
            new Error(
              `Download of ${url} failed with HTTP status ${response.statusCode}`
            )
          );
          return;
        }
        response.on("error", fail);
        response.pipe(file);
      })
      .on("error", fail);
  });
|
||
/**
 * Loads one procedure from the database and, if one exists, parses a PDF and
 * logs the text entries of its second page.
 *
 * The PDF path is still hard-coded; downloading the procedure's first
 * important document is kept below as commented-out code.
 *
 * @returns promise that settles only after the parser has delivered its
 *          result, so "DONE PARSER" is logged after parsing and the caller's
 *          error handling sees parse failures
 */
const start = async (): Promise<void> => {
  console.log("START PARSER");
  const procedure = await ProcedureModel.findOne({});
  if (procedure) {
    // const document = procedure.importantDocuments[0];
    // const path = await download(
    //   document.url,
    //   `/tmp/${document.number.replace("/", "0")}.pdf`
    // );
    const path = "/tmp/1901596.pdf";
    console.log(path);
    const pdfParser = new PDFParser();

    await new Promise<void>((resolve, reject) => {
      // The local typings only declare the "pdfParser_dataReady" event; widen
      // `on` so the error event (name per the pdf2json README) can be observed.
      const onAnyEvent = pdfParser.on.bind(pdfParser) as (
        event: string,
        cb: (payload: unknown) => void
      ) => void;
      onAnyEvent("pdfParser_dataError", (payload) => reject(payload));

      pdfParser.on("pdfParser_dataReady", (data) => {
        try {
          const page = data.formImage.Pages[1];
          if (!page) {
            throw new Error(`${path} has no second page`);
          }

          // data.formImage.Pages.forEach((page: any) => {
          const texts = page.Texts as any[];
          console.log(texts);
          // });
          resolve();
        } catch (err) {
          // Surface handler failures through the promise instead of an
          // uncaught exception inside the event callback.
          reject(err);
        }
      });

      // Register the listeners first, then kick off parsing.
      pdfParser.loadPDF(path);
    });
  }
  console.log("DONE PARSER");
};
|
||
/**
 * Entry point: validates the environment, connects to MongoDB, logs the
 * number of procedures and runs the parser.
 *
 * Any failure (missing DB_URL, connection error, parser error) is logged and
 * terminates the process with exit code 1 instead of ending up as an
 * unhandled promise rejection or being swallowed silently.
 */
(async () => {
  console.info("START");
  // NOTE(review): this prints the full connection string, which may contain
  // credentials — consider logging only whether the variable is set.
  console.info("process.env", process.env.DB_URL);
  if (!process.env.DB_URL) {
    throw new Error("you have to set environment variable: DB_URL");
  }
  await mongoConnect();
  console.log("procedures", await ProcedureModel.countDocuments({}));
  await start();
  // process.exit(0);
})().catch((err) => {
  console.error(err);
  process.exit(1);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
import mongoConnect from "./mongoose"; | ||
import fs from "fs"; | ||
import http from "http"; | ||
import PDFParser, { TextBlock } from "pdf2json"; | ||
|
||
declare function unescape(s: string): string; | ||
|
||
import { ProcedureModel } from "@democracy-deutschland/bundestagio-common"; | ||
|
||
/** Request options shared by all downloads; sends a browser-like User-Agent header. */
const httpOptions: http.RequestOptions = {
  headers: { "User-Agent": "Mozilla/5.0" },
};

/**
 * Downloads `url` via `http.get` and writes the response body to `dest`.
 *
 * @param url  address of the file to fetch
 * @param dest local path the body is written to
 * @returns promise resolving to `dest` once the file has been fully written
 * @throws (rejects) on request errors, write errors, or any non-200 response;
 *         in all of these cases the partially written file is removed
 */
const download = (url: string, dest: string): Promise<string> =>
  new Promise<string>((resolve, reject) => {
    const file = fs.createWriteStream(dest);
    let failed = false;

    // Single failure path: close the stream, remove the partial file, reject once.
    const fail = (err: Error): void => {
      if (failed) {
        return;
      }
      failed = true;
      file.destroy();
      fs.unlink(dest, () => {}); // best effort; the result is deliberately ignored
      reject(err);
    };

    file.on("error", fail);
    file.on("finish", () => resolve(dest));

    http
      .get(url, httpOptions, (response) => {
        if (response.statusCode !== 200) {
          // Drain the body so the socket is released, then report the failure
          // instead of saving an error page (or redirect notice) as the file.
          response.resume();
          fail(
            new Error(
              `Download of ${url} failed with HTTP status ${response.statusCode}`
            )
          );
          return;
        }
        response.on("error", fail);
        response.pipe(file);
      })
      .on("error", fail);
  });
|
||
/**
 * Loads one procedure from the database and, if one exists, parses a PDF,
 * dumps its first page as JSON next to the PDF and logs the page's text
 * fragments merged into contiguous blocks.
 *
 * The PDF path is still hard-coded; downloading the procedure's first
 * important document is kept below as commented-out code.
 *
 * @returns promise that settles only after the parser has delivered its
 *          result, so "DONE PARSER" is logged after parsing and the caller's
 *          error handling sees parse failures
 */
const start = async (): Promise<void> => {
  console.log("START PARSER");
  const procedure = await ProcedureModel.findOne({});
  if (procedure) {
    // const document = procedure.importantDocuments[0];
    // const path = await download(
    //   document.url,
    //   `/tmp/${document.number.replace("/", "0")}.pdf`
    // );
    const path = "/tmp/1901596.pdf";
    console.log(path);
    const pdfParser = new PDFParser();

    await new Promise<void>((resolve, reject) => {
      // The local typings only declare the "pdfParser_dataReady" event; widen
      // `on` so the error event (name per the pdf2json README) can be observed.
      const onAnyEvent = pdfParser.on.bind(pdfParser) as (
        event: string,
        cb: (payload: unknown) => void
      ) => void;
      onAnyEvent("pdfParser_dataError", (payload) => reject(payload));

      pdfParser.on("pdfParser_dataReady", (data) => {
        try {
          const page = data.formImage.Pages[0];
          if (!page) {
            throw new Error(`${path} contains no pages`);
          }
          // writeFileSync accepts only strings/buffers, so serialize the page object.
          fs.writeFileSync(`${path}.json`, JSON.stringify(page));

          // data.formImage.Pages.forEach((page: any) => {
          const texts: TextBlock[] = page.Texts;
          const textBlocks: string[] = [];
          // Text of the block currently being assembled.
          let tmpText = "";
          texts.forEach((textBlock, index) => {
            const text = decodeURIComponent(textBlock.R[0].T);

            if (index === 0) {
              tmpText = text;
              return;
            }
            const prevText = texts[index - 1];
            // Debug output for the first few fragments.
            if (index < 10) {
              console.log(
                text,
                textBlock,
                textBlock.x + textBlock.sw - (prevText.x + prevText.w)
              );
            }
            // A fragment continues the current block when it starts no further
            // right than the end of the previous fragment (plus a small
            // tolerance) and lies less than one unit below it.
            if (
              textBlock.x <= prevText.x + prevText.w + 0.1 &&
              textBlock.y < prevText.y + 1
            ) {
              const space =
                textBlock.y > 5 || textBlock.x - (prevText.x + prevText.w) > 0
                  ? " "
                  : "";
              tmpText += space + text;
            } else {
              textBlocks.push(tmpText);
              tmpText = text;
            }
          });
          // Flush the block still being assembled; without this the final
          // block is lost whenever the last fragment starts a new block or the
          // page holds a single fragment.
          if (texts.length > 0) {
            textBlocks.push(tmpText);
          }
          console.log(textBlocks);
          // });
          resolve();
        } catch (err) {
          // Surface handler failures through the promise instead of an
          // uncaught exception inside the event callback.
          reject(err);
        }
      });

      // Register the listeners first, then kick off parsing.
      pdfParser.loadPDF(path);
    });
  }
  console.log("DONE PARSER");
};
|
||
/**
 * Entry point: validates the environment, connects to MongoDB, logs the
 * number of procedures and runs the parser.
 *
 * Any failure (missing DB_URL, connection error, parser error) is logged and
 * terminates the process with exit code 1 instead of ending up as an
 * unhandled promise rejection or being swallowed silently.
 */
(async () => {
  console.info("START");
  // NOTE(review): this prints the full connection string, which may contain
  // credentials — consider logging only whether the variable is set.
  console.info("process.env", process.env.DB_URL);
  if (!process.env.DB_URL) {
    throw new Error("you have to set environment variable: DB_URL");
  }
  await mongoConnect();
  console.log("procedures", await ProcedureModel.countDocuments({}));
  await start();
  // process.exit(0);
})().catch((err) => {
  console.error(err);
  process.exit(1);
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
import { mongoose } from "@democracy-deutschland/bundestagio-common"; | ||
/**
 * Opens the mongoose connection to `process.env.DB_URL`.
 *
 * @returns promise that resolves once the connection emits "connected" and
 *          rejects when connecting fails, so callers awaiting it can handle
 *          the error
 */
export default (): Promise<void> =>
  new Promise<void>((resolve, reject) => {
    mongoose.set("useFindAndModify", false);
    // Mongo Debug
    mongoose.set("debug", false);

    let connected = false;

    mongoose.connection.once("connected", () => {
      connected = true;
      console.info("MongoDB is running");
      resolve();
    });
    mongoose.connection.on("error", (e: Error) => {
      // Unknown if this ends up in main - therefore we log here
      console.error(e.stack);
      if (connected) {
        // After a successful connect the promise is already settled; keep the
        // previous behaviour of raising the error from the listener.
        throw e;
      }
      reject(e);
    });

    // Register the listeners before connecting, and route a failed initial
    // connection into this promise instead of leaving `connect()` unhandled.
    mongoose
      .connect(process.env.DB_URL!, {
        useNewUrlParser: true,
        useUnifiedTopology: true,
      })
      .catch(reject);
  });
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
{ | ||
"compilerOptions": { | ||
/* Visit https://aka.ms/tsconfig.json to read more about this file */ | ||
|
||
/* Basic Options */ | ||
// "incremental": true, /* Enable incremental compilation */ | ||
"target": "es6" /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */, | ||
"module": "commonjs" /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */, | ||
// "lib": [], /* Specify library files to be included in the compilation. */ | ||
// "allowJs": true, /* Allow javascript files to be compiled. */ | ||
// "checkJs": true, /* Report errors in .js files. */ | ||
// "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', or 'react'. */ | ||
// "declaration": true, /* Generates corresponding '.d.ts' file. */ | ||
// "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ | ||
// "sourceMap": true, /* Generates corresponding '.map' file. */ | ||
// "outFile": "./", /* Concatenate and emit output to single file. */ | ||
"outDir": "./build" /* Redirect output structure to the directory. */, | ||
"rootDir": "./src" /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */, | ||
// "composite": true, /* Enable project compilation */ | ||
// "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ | ||
// "removeComments": true, /* Do not emit comments to output. */ | ||
// "noEmit": true, /* Do not emit outputs. */ | ||
// "importHelpers": true, /* Import emit helpers from 'tslib'. */ | ||
// "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ | ||
// "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ | ||
|
||
/* Strict Type-Checking Options */ | ||
"strict": true /* Enable all strict type-checking options. */, | ||
// "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ | ||
// "strictNullChecks": true, /* Enable strict null checks. */ | ||
// "strictFunctionTypes": true, /* Enable strict checking of function types. */ | ||
// "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ | ||
// "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ | ||
// "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ | ||
// "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ | ||
|
||
/* Additional Checks */ | ||
// "noUnusedLocals": true, /* Report errors on unused locals. */ | ||
// "noUnusedParameters": true, /* Report errors on unused parameters. */ | ||
// "noImplicitReturns": true, /* Report error when not all code paths in function return a value. */ | ||
// "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ | ||
|
||
/* Module Resolution Options */ | ||
// "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ | ||
// "baseUrl": "./src/" /* Base directory to resolve non-absolute module names. */, | ||
// "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ | ||
// "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ | ||
"typeRoots": [ | ||
"./src/@types", | ||
"./node_modules/@types" | ||
] /* List of folders to include type definitions from. */, | ||
// "types": [], /* Type declaration files to be included in compilation. */ | ||
// "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */ | ||
"esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */, | ||
// "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ | ||
// "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ | ||
|
||
/* Source Map Options */ | ||
// "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ | ||
// "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ | ||
// "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ | ||
// "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ | ||
|
||
/* Experimental Options */ | ||
// "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ | ||
// "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. */ | ||
|
||
/* Advanced Options */ | ||
"skipLibCheck": true /* Skip type checking of declaration files. */, | ||
"forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */ | ||
} | ||
} |
Oops, something went wrong.