Skip to content

Commit

Permalink
add period 20 to import-plenary-minutes
Browse files Browse the repository at this point in the history
Signed-off-by: Manuel Ruck <[email protected]>
  • Loading branch information
Manuel Ruck committed Jun 17, 2023
1 parent 586fc60 commit f106805
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 19 deletions.
4 changes: 2 additions & 2 deletions services/cron-jobs/import-plenary-minutes/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "import-plenary-minutes",
"version": "0.1.0",
"version": "0.1.1-alpha.0",
"main": "build/index.js",
"license": "Apache-2.0",
"scripts": {
Expand All @@ -12,7 +12,7 @@
"start": "node ./build/index.js"
},
"dependencies": {
"@democracy-deutschland/bundestagio-common": "0.1.16",
"@democracy-deutschland/bundestagio-common": "0.1.22-alpha.0",
"axios": "^0.19.2",
"cheerio": "^1.0.0-rc.3",
"moment": "^2.27.0"
Expand Down
37 changes: 25 additions & 12 deletions services/cron-jobs/import-plenary-minutes/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ const getMeta = (meta: cheerio.Cheerio): MetaData => {
};
};

const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio): PlenaryMinutesItem[] => {
const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio, period: number): PlenaryMinutesItem[] => {
const plenaryMinutesItems: PlenaryMinutesItem[] = [];
plenaryMinutes.each((i, elem) => {
// Parse Title
Expand All @@ -47,6 +47,7 @@ const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio): PlenaryMinutesItem[

const plenaryMinutesItem: PlenaryMinutesItem = {
date: m.toDate(),
period,
meeting: parseInt(match.meeting),
xml: `https://www.bundestag.de${xmlLink}`,
};
Expand All @@ -56,48 +57,55 @@ const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio): PlenaryMinutesItem[
return plenaryMinutesItems;
};

const parsePage = async (url: string) => {
/**
 * Fetches one filter-list page and extracts its paging metadata and plenary minutes.
 *
 * @param url - Page URL to request.
 * @param period - Legislative period forwarded to `getPlenaryMinutes` for every parsed row.
 * @returns The parsed `meta` data and the `plenaryMinutes` items found on that page.
 */
const parsePage = async (url: string, period: number) => {
  // Plain await instead of `await ... .then(...)`: one async style per function.
  const response = await AxiosInstance.get(url);
  const $ = cheerio.load(response.data);

  const meta: cheerio.Cheerio = $('.meta-slider');
  const plenaryMinutesTable: cheerio.Cheerio = $('.bt-table-data > tbody > tr');

  return {
    meta: getMeta(meta),
    // Single call that always carries the period (the one-argument call is gone).
    plenaryMinutes: getPlenaryMinutes(plenaryMinutesTable, period),
  };
};

const getUrl = (offset: number) =>
`https://www.bundestag.de/ajax/filterlist/de/services/opendata/543410-543410/h_49f0d94cb26682ff1e9428b6de471a5b?offset=${offset}`;
/**
 * Builds the Bundestag ajax filter-list URL for one result page.
 *
 * @param offset - Value placed in the `offset` query parameter.
 * @param id - Path segment that selects the filter list.
 * @returns The absolute URL of the requested page.
 */
const getUrl = ({ offset, id }: { offset: number; id: string }) => {
  return `https://www.bundestag.de/ajax/filterlist/de/services/opendata/${id}?offset=${offset}`;
};

const start = async () => {
let url: string | false = getUrl(0);
// Legislative periods known to the importer, each paired with the id of its filter list.
const periods: Array<{ period: number; id: string }> = [
  { id: '543410-543410', period: 19 },
  { id: '866354-866354', period: 20 },
];

/**
 * Imports all plenary minutes of one legislative period.
 *
 * Walks the paginated filter list of the period, collects every parsed item and
 * upserts the items into the PlenaryMinute collection, matched on meeting and period.
 *
 * @param period - Legislative period to import; it needs an entry in `periods`.
 * @throws Error when `periods` has no entry for `period`.
 */
const start = async (period: number) => {
  // Guard instead of a non-null assertion so an unconfigured period fails with a clear message.
  const periodEntry = periods.find((p) => p.period === period);
  if (!periodEntry) {
    throw new Error(`no filter list id configured for period ${period}`);
  }
  const periodId = periodEntry.id;

  let url: string | false = getUrl({ offset: 0, id: periodId });
  const data: PlenaryMinutesItem[] = [];
  do {
    const { meta, plenaryMinutes } = await parsePage(url, period);
    data.push(...plenaryMinutes);
    // Keep paging while the reported next offset is still below the total hit count.
    if (meta.nextOffset < meta.hits) {
      url = getUrl({ offset: meta.nextOffset, id: periodId });
    } else {
      url = false;
    }
  } while (url);

  // Skip the write when nothing was scraped: the MongoDB driver rejects an empty operations list.
  if (data.length > 0) {
    await PlenaryMinuteModel.collection.bulkWrite(
      data.map((item) => ({
        updateOne: {
          // Meeting numbers can repeat across periods, so the period is part of the key.
          filter: { meeting: item.meeting, period: item.period },
          update: {
            $set: item,
          },
          upsert: true,
        },
      })),
    );
  }
  console.log(`found for period ${period}: `, data.length);
};

(async () => {
Expand All @@ -108,7 +116,12 @@ const start = async () => {
}
await mongoConnect();
console.log('PlenaryMinutes', await PlenaryMinuteModel.countDocuments({}));
await start().catch(() => {
await start(19).catch((err) => {
console.error(err);
process.exit(1);
});
await start(20).catch((err) => {
console.error(err);
process.exit(1);
});
process.exit(0);
Expand Down
1 change: 1 addition & 0 deletions services/cron-jobs/import-plenary-minutes/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export interface MetaData {

/** One plenary-minutes entry as produced by the importer. */
export interface PlenaryMinutesItem {
  /** Legislative period the entry was imported for. */
  period: number;
  /** Meeting number of the entry. */
  meeting: number;
  /** Date associated with the entry. */
  date: Date;
  /** URL of the XML document of the entry. */
  xml: string;
}
10 changes: 5 additions & 5 deletions services/cron-jobs/import-plenary-minutes/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
dependencies:
typescript "^3.7.2"

"@democracy-deutschland/bundestagio-common@0.1.16":
version "0.1.16"
resolved "https://registry.yarnpkg.com/@democracy-deutschland/bundestagio-common/-/bundestagio-common-0.1.16.tgz#04a55005d498758d0c2f0d423d8a57af7725e5ca"
integrity sha512-nLVqzShop/rE0leGdqSWqeXllIcrL3Epeh18BfnNqTMBd1hXj/axsmmo/jjn7mnPTvrZFuWZireelnapa4ejFA==
"@democracy-deutschland/bundestagio-common@0.1.22-alpha.0":
version "0.1.22-alpha.0"
resolved "https://registry.yarnpkg.com/@democracy-deutschland/bundestagio-common/-/bundestagio-common-0.1.22-alpha.0.tgz#552d4da523a945d2ddc13a89ef340e2e990cfcb4"
integrity sha512-zN8VgMjxpZ5ddjlGeZ3fChVpGA4COvlbtp2JmQI6qAAHYdcq5XjOrzpAg+nvPUtsHLzHvwcdDmPZodLNRC/i4g==
dependencies:
"@democracy-deutschland/bundestag.io-definitions" "^1.0.2"
"@types/cron" "^1.7.2"
Expand Down Expand Up @@ -781,7 +781,7 @@ mongoosastic@^4.6.0:
elasticsearch "16.7.1"
lodash.clonedeep "4.5.0"

mongoose-diff-history@mimani/mongoose-diff-history#master:
"mongoose-diff-history@github:mimani/mongoose-diff-history#master":
version "2.1.0"
resolved "https://codeload.github.com/mimani/mongoose-diff-history/tar.gz/1fb081a4308d3745ebb2646f2faeaa7ce867ca86"
dependencies:
Expand Down

0 comments on commit f106805

Please sign in to comment.