From 91ae3b983849048701ce0565e3551e0f58f69433 Mon Sep 17 00:00:00 2001 From: Manuel Ruck Date: Sat, 17 Jun 2023 18:22:02 +0200 Subject: [PATCH] add period 20 to import-plenary-minutes Signed-off-by: Manuel Ruck --- .../import-plenary-minutes/package.json | 4 +- .../import-plenary-minutes/src/index.ts | 37 +++++++++++++------ .../import-plenary-minutes/src/types.ts | 1 + .../import-plenary-minutes/yarn.lock | 10 ++--- 4 files changed, 33 insertions(+), 19 deletions(-) diff --git a/services/cron-jobs/import-plenary-minutes/package.json b/services/cron-jobs/import-plenary-minutes/package.json index 3dbfc52e5..7ac77045b 100644 --- a/services/cron-jobs/import-plenary-minutes/package.json +++ b/services/cron-jobs/import-plenary-minutes/package.json @@ -1,6 +1,6 @@ { "name": "import-plenary-minutes", - "version": "0.1.0", + "version": "0.1.1-alpha.0", "main": "build/index.js", "license": "Apache-2.0", "scripts": { @@ -12,7 +12,7 @@ "start": "node ./build/index.js" }, "dependencies": { - "@democracy-deutschland/bundestagio-common": "0.1.16", + "@democracy-deutschland/bundestagio-common": "0.1.21", "axios": "^0.19.2", "cheerio": "^1.0.0-rc.3", "moment": "^2.27.0" diff --git a/services/cron-jobs/import-plenary-minutes/src/index.ts b/services/cron-jobs/import-plenary-minutes/src/index.ts index d71ff7d9f..7c7dff6ab 100644 --- a/services/cron-jobs/import-plenary-minutes/src/index.ts +++ b/services/cron-jobs/import-plenary-minutes/src/index.ts @@ -30,7 +30,7 @@ const getMeta = (meta: cheerio.Cheerio): MetaData => { }; }; -const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio): PlenaryMinutesItem[] => { +const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio, period: number): PlenaryMinutesItem[] => { const plenaryMinutesItems: PlenaryMinutesItem[] = []; plenaryMinutes.each((i, elem) => { // Parse Title @@ -47,6 +47,7 @@ const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio): PlenaryMinutesItem[ const plenaryMinutesItem: PlenaryMinutesItem = { date: m.toDate(), + period, meeting: parseInt(match.meeting), xml: `https://www.bundestag.de${xmlLink}`, }; @@ -56,14 +57,14 @@ const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio): PlenaryMinutesItem[ return plenaryMinutesItems; }; -const parsePage = async (url: string) => { +const parsePage = async (url: string, period: number) => { return await AxiosInstance.get(url).then((response) => { const html = response.data; const $ = cheerio.load(html); const meta: cheerio.Cheerio = $('.meta-slider'); const plenaryMinutesTable: cheerio.Cheerio = $('.bt-table-data > tbody > tr'); const metaData = getMeta(meta); - const plenaryMinutes = getPlenaryMinutes(plenaryMinutesTable); + const plenaryMinutes = getPlenaryMinutes(plenaryMinutesTable, period); return { meta: metaData, plenaryMinutes, @@ -71,17 +72,24 @@ const parsePage = async (url: string) => { }); }; -const getUrl = (offset: number) => - `https://www.bundestag.de/ajax/filterlist/de/services/opendata/543410-543410/h_49f0d94cb26682ff1e9428b6de471a5b?offset=${offset}`; +const getUrl = ({ offset, id }: { offset: number; id: string }) => + `https://www.bundestag.de/ajax/filterlist/de/services/opendata/${id}?offset=${offset}`; -const start = async () => { - let url: string | false = getUrl(0); +const periods = [ + { period: 19, id: '543410-543410' }, + { period: 20, id: '866354-866354' }, +]; + +const start = async (period: number) => { + const periodId = periods.find((p) => p.period === period)!.id; + + let url: string | false = getUrl({ offset: 0, id: periodId }); const data: PlenaryMinutesItem[] = []; do { - const { meta, plenaryMinutes } = await parsePage(url); + const { meta, plenaryMinutes } = await parsePage(url, period); data.push(...plenaryMinutes); if (meta.nextOffset < meta.hits) { - url = getUrl(meta.nextOffset); + url = getUrl({ offset: meta.nextOffset, id: periodId }); } else { url = false; } @@ -89,7 +97,7 @@ const start = async () => { await PlenaryMinuteModel.collection.bulkWrite( data.map((item) => ({ updateOne: { - filter: { meeting: item.meeting }, + filter: { meeting: item.meeting, period: item.period }, update: { $set: item, }, @@ -97,7 +105,7 @@ const start = async () => { }, })), ); - console.log('found: ', data.length); + console.log(`found for period ${period}: `, data.length); }; (async () => { @@ -108,7 +116,12 @@ const start = async () => { } await mongoConnect(); console.log('PlenaryMinutes', await PlenaryMinuteModel.countDocuments({})); - await start().catch(() => { + await start(19).catch((err) => { + console.error(err); + process.exit(1); + }); + await start(20).catch((err) => { + console.error(err); process.exit(1); }); process.exit(0); diff --git a/services/cron-jobs/import-plenary-minutes/src/types.ts b/services/cron-jobs/import-plenary-minutes/src/types.ts index f2125b75f..ab1c50c95 100644 --- a/services/cron-jobs/import-plenary-minutes/src/types.ts +++ b/services/cron-jobs/import-plenary-minutes/src/types.ts @@ -6,6 +6,7 @@ export interface MetaData { export interface PlenaryMinutesItem { meeting: number; + period: number; date: Date; xml: string; } diff --git a/services/cron-jobs/import-plenary-minutes/yarn.lock b/services/cron-jobs/import-plenary-minutes/yarn.lock index eacc64987..3af9b11ee 100644 --- a/services/cron-jobs/import-plenary-minutes/yarn.lock +++ b/services/cron-jobs/import-plenary-minutes/yarn.lock @@ -9,10 +9,10 @@ dependencies: typescript "^3.7.2" -"@democracy-deutschland/bundestagio-common@0.1.16": - version "0.1.16" - resolved "https://registry.yarnpkg.com/@democracy-deutschland/bundestagio-common/-/bundestagio-common-0.1.16.tgz#04a55005d498758d0c2f0d423d8a57af7725e5ca" - integrity sha512-nLVqzShop/rE0leGdqSWqeXllIcrL3Epeh18BfnNqTMBd1hXj/axsmmo/jjn7mnPTvrZFuWZireelnapa4ejFA== +"@democracy-deutschland/bundestagio-common@0.1.21": + version "0.1.21" + resolved "https://registry.yarnpkg.com/@democracy-deutschland/bundestagio-common/-/bundestagio-common-0.1.21.tgz#0f3abfff22bf428c04603c0a142cc90b9bc0a326" + integrity sha512-J7TEznNnVAhgHfMbEeCbh8DS20jScBZ2jxAqxxVC04bUnaNPy/WHevkiXihHnLnmFsvyrjo31O8eOmnhNiTVDg== dependencies: "@democracy-deutschland/bundestag.io-definitions" "^1.0.2" "@types/cron" "^1.7.2" @@ -781,7 +781,7 @@ mongoosastic@^4.6.0: elasticsearch "16.7.1" lodash.clonedeep "4.5.0" -mongoose-diff-history@mimani/mongoose-diff-history#master: +"mongoose-diff-history@github:mimani/mongoose-diff-history#master": version "2.1.0" resolved "https://codeload.github.com/mimani/mongoose-diff-history/tar.gz/1fb081a4308d3745ebb2646f2faeaa7ce867ca86" dependencies: