Skip to content

Commit

Permalink
add period 20 to import-plenary-minutes
Browse files Browse the repository at this point in the history
Signed-off-by: Manuel Ruck <[email protected]>
  • Loading branch information
Manuel Ruck committed Oct 22, 2023
1 parent e61093b commit 2141270
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 12 deletions.
33 changes: 21 additions & 12 deletions services/cron-jobs/import-plenary-minutes/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ const getMeta = (meta: cheerio.Cheerio): MetaData => {
};
};

const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio): PlenaryMinutesItem[] => {
const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio, period: number): PlenaryMinutesItem[] => {
const plenaryMinutesItems: PlenaryMinutesItem[] = [];
plenaryMinutes.each((i, elem) => {
// Parse Title
Expand All @@ -46,6 +46,7 @@ const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio): PlenaryMinutesItem[

const plenaryMinutesItem: PlenaryMinutesItem = {
date: m.toDate(),
period,
meeting: parseInt(match.meeting),
xml: `https://www.bundestag.de${xmlLink}`,
};
Expand All @@ -55,48 +56,55 @@ const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio): PlenaryMinutesItem[
return plenaryMinutesItems;
};

/**
 * Downloads one listing page and extracts its paging metadata and its
 * plenary-minutes rows.
 *
 * NOTE(review): this span is a rendered diff without +/- markers. Where two
 * near-identical lines appear back to back, the first looks like the
 * pre-change line and the second like its replacement (the one that threads
 * `period` through) — confirm against the repository.
 *
 * @param url - Address requested through `AxiosInstance.get`.
 * @param period - Passed on unchanged to `getPlenaryMinutes`.
 * @returns A promise of `{ meta, plenaryMinutes }` for the requested page.
 */
const parsePage = async (url: string) => {
const parsePage = async (url: string, period: number) => {
return await AxiosInstance.get(url).then((response) => {
// The response body is handed to cheerio as markup.
const html = response.data;
const $ = cheerio.load(html);
// Element(s) that getMeta reads the paging information from.
const meta: cheerio.Cheerio = $('.meta-slider');
// One <tr> per listed entry of the data table.
const plenaryMinutesTable: cheerio.Cheerio = $('.bt-table-data > tbody > tr');
const metaData = getMeta(meta);
const plenaryMinutes = getPlenaryMinutes(plenaryMinutesTable);
const plenaryMinutes = getPlenaryMinutes(plenaryMinutesTable, period);
return {
meta: metaData,
plenaryMinutes,
};
});
};

/**
 * Builds the listing URL for the given paging offset. The list identifier is
 * fixed inside the address.
 *
 * @param offset - Value placed in the `offset` query parameter.
 * @returns The absolute URL as a string.
 */
const getUrl = (offset: number) => {
  const endpoint =
    'https://www.bundestag.de/ajax/filterlist/de/services/opendata/543410-543410/h_49f0d94cb26682ff1e9428b6de471a5b';
  return `${endpoint}?offset=${offset}`;
};
/**
 * Builds the listing URL for one list identifier and paging offset.
 *
 * @param params.offset - Value placed in the `offset` query parameter.
 * @param params.id - List identifier inserted as the last path segment.
 * @returns The absolute URL as a string.
 */
const getUrl = ({ offset, id }: { offset: number; id: string }) => {
  const endpoint = `https://www.bundestag.de/ajax/filterlist/de/services/opendata/${id}`;
  return `${endpoint}?offset=${offset}`;
};

const start = async () => {
let url: string | false = getUrl(0);
/**
 * Lookup table pairing a period number with the list identifier that is
 * inserted into the listing URL for that period.
 */
const periods: { period: number; id: string }[] = [
  {
    period: 19,
    id: '543410-543410',
  },
  {
    period: 20,
    id: '866354-866354',
  },
];

/**
 * Imports every plenary-minutes entry listed for one period: walks the paged
 * listing until the reported hit count is reached, then upserts all collected
 * items in a single bulk write and logs how many were found.
 *
 * NOTE(review): this span is a rendered diff without +/- markers. Where two
 * near-identical lines appear back to back, the first looks like the
 * pre-change line and the second like its replacement — confirm against the
 * repository.
 *
 * @param period - Period number; must match an entry of `periods`.
 */
const start = async (period: number) => {
// NOTE(review): `find` yields undefined for a period missing from `periods`;
// the `!` only silences the compiler, so `.id` would then throw a TypeError
// at runtime. A guard with an explicit error message would be clearer.
const periodId = periods.find((p) => p.period === period)!.id;

// `url` doubles as the loop condition: it is set to `false` after the last page.
let url: string | false = getUrl({ offset: 0, id: periodId });
const data: PlenaryMinutesItem[] = [];
do {
const { meta, plenaryMinutes } = await parsePage(url);
const { meta, plenaryMinutes } = await parsePage(url, period);
data.push(...plenaryMinutes);
// Keep paging while the next offset is still below the total number of hits.
if (meta.nextOffset < meta.hits) {
url = getUrl(meta.nextOffset);
url = getUrl({ offset: meta.nextOffset, id: periodId });
} else {
url = false;
}
} while (url);
// NOTE(review): `data` can be empty if no rows were parsed; presumably the
// MongoDB driver rejects an empty operations list — confirm for the driver
// version in use and skip the call when there is nothing to write.
await PlenaryMinuteModel.collection.bulkWrite(
data.map((item) => ({
updateOne: {
// The filter identifies the document to upsert; the second variant also
// matches on `period`, so equal meeting numbers from different periods
// address different documents.
filter: { meeting: item.meeting },
filter: { meeting: item.meeting, period: item.period },
update: {
$set: item,
},
upsert: true,
},
})),
);
console.log('found: ', data.length);
console.log(`found for period ${period}: `, data.length);
};

(async () => {
Expand All @@ -107,6 +115,7 @@ const start = async () => {
}
await mongoConnect();
console.log('PlenaryMinutes', await PlenaryMinuteModel.countDocuments({}));
await start();
await start(19);
await start(20);
process.exit(0);
})();
1 change: 1 addition & 0 deletions services/cron-jobs/import-plenary-minutes/src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ export interface MetaData {

/**
 * One plenary-minutes entry as collected by the import job.
 */
export interface PlenaryMinutesItem {
  /** Period the entry was imported for (presumably the legislative period — confirm). */
  period: number;
  /** Meeting number, parsed as an integer from the entry's title. */
  meeting: number;
  /** Date of the entry. */
  date: Date;
  /** Absolute `https://www.bundestag.de…` link to the XML document. */
  xml: string;
}

0 comments on commit 2141270

Please sign in to comment.