Skip to content

Commit

Permalink
feat: use api instead of crawling bt website
Browse files Browse the repository at this point in the history
Signed-off-by: Manuel Ruck <[email protected]>
  • Loading branch information
Manuel Ruck committed Apr 7, 2024
1 parent df12f75 commit 325fde1
Show file tree
Hide file tree
Showing 3 changed files with 95 additions and 46 deletions.
2 changes: 2 additions & 0 deletions services/cron-jobs/import-plenary-minutes/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"start": "node ./build/index.js"
},
"dependencies": {
"@democracy-deutschland/bt-dip-sdk": "1.3.0",
"@democracy-deutschland/bundestagio-common": "workspace:*",
"axios": "^1.6.0",
"cheerio": "^1.0.0-rc.3",
Expand All @@ -20,6 +21,7 @@
"devDependencies": {
"@types/axios": "^0.14.0",
"@types/cheerio": "^0.22.21",
"@types/node": "^20.12.5",
"dotenv": "^16.0.0",
"ts-node-dev": "^1.0.0-pre.49",
"ts-unused-exports": "^8.0.0",
Expand Down
27 changes: 27 additions & 0 deletions services/cron-jobs/import-plenary-minutes/src/config.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
/**
 * Runtime configuration for this cron job, read once from `process.env`
 * when the module is loaded. Every setting except `DIP_API_KEY` and
 * `IMPORT_PROCEDURES_FILTER_TYPES` has a default, so the job can start
 * with a mostly empty environment.
 */

const DAY_MS = 24 * 60 * 60 * 1000;

/** Formats a date as `YYYY-MM-DD` (UTC). */
const toIsoDate = (date: Date): string => date.toISOString().slice(0, 10);

/**
 * Reads a positive integer from a raw environment value.
 *
 * @param raw - The environment value, or `undefined` when the variable is unset.
 * @param fallback - Value returned when `raw` is unset or not a positive integer.
 * @param label - Variable name used in the warning message.
 * @returns The parsed integer, or `fallback`.
 */
const parsePositiveInt = (raw: string | undefined, fallback: number, label = 'value'): number => {
  if (raw === undefined) {
    return fallback;
  }
  const parsed = Number(raw);
  // Number('') is 0 and Number('abc') is NaN; neither is a usable size or
  // round count, so anything that is not a positive integer is rejected.
  if (raw.trim() !== '' && Number.isInteger(parsed) && parsed > 0) {
    return parsed;
  }
  console.warn(`Ignoring invalid ${label}=${JSON.stringify(raw)}; using ${fallback}`);
  return fallback;
};

/**
 * Splits a comma-separated environment value into trimmed, non-empty entries.
 *
 * @param raw - The environment value, or `undefined` when the variable is unset.
 * @returns The entries, or `undefined` when there are none.
 */
const parseList = (raw: string | undefined): string[] | undefined => {
  if (!raw) {
    return undefined;
  }
  const entries = raw
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry !== '');
  return entries.length > 0 ? entries : undefined;
};

// One clock reading so both date defaults are derived from the same instant.
const now = new Date();

const {
  DB_URL = 'mongodb://localhost:27017/bundestagio',
  IMPORT_PROCEDURES_START_CURSOR = '*',
  // Defaults: today, and the date 28 days (four weeks) earlier.
  IMPORT_PROCEDURES_FILTER_BEFORE = toIsoDate(now),
  IMPORT_PROCEDURES_FILTER_AFTER = toIsoDate(new Date(now.getTime() - 28 * DAY_MS)),
} = process.env;

const IMPORT_PROCEDURES_CHUNK_SIZE = parsePositiveInt(
  process.env.IMPORT_PROCEDURES_CHUNK_SIZE,
  100,
  'IMPORT_PROCEDURES_CHUNK_SIZE',
);
const IMPORT_PROCEDURES_CHUNK_ROUNDS = parsePositiveInt(
  process.env.IMPORT_PROCEDURES_CHUNK_ROUNDS,
  5,
  'IMPORT_PROCEDURES_CHUNK_ROUNDS',
);
const IMPORT_PROCEDURES_FILTER_TYPES = parseList(process.env.IMPORT_PROCEDURES_FILTER_TYPES);

export const CONFIG = {
  // Empty string when the variable is unset.
  DIP_API_KEY: process.env.DIP_API_KEY || '',
  DB_URL,
  IMPORT_PROCEDURES_CHUNK_SIZE,
  IMPORT_PROCEDURES_CHUNK_ROUNDS,
  IMPORT_PROCEDURES_FILTER_BEFORE,
  IMPORT_PROCEDURES_FILTER_AFTER,
  IMPORT_PROCEDURES_FILTER_TYPES,
  IMPORT_PROCEDURES_START_CURSOR,
} as const;
112 changes: 66 additions & 46 deletions services/cron-jobs/import-plenary-minutes/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
import axios from 'axios';
import axios, { AxiosResponse } from 'axios';
import cheerio from 'cheerio';
import moment from 'moment';
import { PlenaryMinuteModel, mongoConnect } from '@democracy-deutschland/bundestagio-common';

import { MetaData, PlenaryMinutesItem } from './types';
import { Configuration, PlenarprotokolleApi, PlenarprotokollListResponse } from '@democracy-deutschland/bt-dip-sdk';
import { CONFIG } from './config';

// SDK configuration. The key comes from CONFIG.DIP_API_KEY (environment),
// so nothing needs to be edited here.
// NOTE(review): CONFIG.DIP_API_KEY is '' when the variable is unset, which
// makes the value below just "ApiKey " — consider failing fast in that case.
const config = new Configuration({
apiKey: `ApiKey ${CONFIG.DIP_API_KEY}`, // key prefixed with the literal "ApiKey "
});
// Client for the Plenarprotokolle endpoints. `undefined` leaves the second
// constructor argument at its default (presumably the base path — confirm
// against the SDK); the imported axios is passed as the third argument.
const api = new PlenarprotokolleApi(config, undefined, axios);

// Separate axios instance with default settings; parsePage issues its GET
// requests through it.
const AxiosInstance = axios.create();

Expand Down Expand Up @@ -35,40 +42,50 @@ const getPlenaryMinutes = (plenaryMinutes: cheerio.Cheerio, period: number): Ple
// Parse Title
const title = cheerio(elem).find('strong').text().trim();
const regex = /Plenarprotokoll der (?<meeting>\d{1,3}).*?dem (?<date>.*?)$/gi;
const match = regex.exec(title)!.groups as {
meeting: string;
date: string;
};
const m = moment(match.date, 'DD MMMM YYYY', 'de');
try {
const match = regex.exec(title)?.groups as {
meeting: string;
date: string;
};
const m = moment(match.date, 'DD MMMM YYYY', 'de');

// Parse link
const xmlLink = cheerio(elem).find('.bt-link-dokument').attr('href');
// Parse link
const xmlLink = cheerio(elem).find('.bt-link-dokument').attr('href');

const plenaryMinutesItem: PlenaryMinutesItem = {
date: m.toDate(),
period,
meeting: parseInt(match.meeting),
xml: `https://www.bundestag.de${xmlLink}`,
};
plenaryMinutesItems.push(plenaryMinutesItem);
const plenaryMinutesItem: PlenaryMinutesItem = {
date: m.toDate(),
period,
meeting: parseInt(match.meeting),
xml: `https://www.bundestag.de${xmlLink}`,
};
plenaryMinutesItems.push(plenaryMinutesItem);
} catch (error) {
console.log('error', error, title);
}
});

return plenaryMinutesItems;
};

/**
 * Downloads one listing page and extracts its paging metadata and rows.
 *
 * The response body is loaded into cheerio; the `.meta-slider` element is
 * passed to `getMeta` and the `.bt-table-data > tbody > tr` rows are passed
 * to `getPlenaryMinutes` together with `period`.
 *
 * NOTE(review): this listing contains two request chains back to back, an
 * earlier one and a revised one that adds logging and a `.catch`. Read
 * verbatim, the first `return` exits before the second chain runs.
 *
 * @param url - Page to fetch with `AxiosInstance.get`.
 * @param period - Passed through unchanged to `getPlenaryMinutes`.
 * @returns A promise of `{ meta, plenaryMinutes }` for the page.
 * @throws Rejects with the underlying request/parsing error.
 */
const parsePage = async (url: string, period: number) => {
return await AxiosInstance.get(url).then((response) => {
const html = response.data;
const $ = cheerio.load(html);
const meta: cheerio.Cheerio = $('.meta-slider');
const plenaryMinutesTable: cheerio.Cheerio = $('.bt-table-data > tbody > tr');
const metaData = getMeta(meta);
const plenaryMinutes = getPlenaryMinutes(plenaryMinutesTable, period);
return {
meta: metaData,
plenaryMinutes,
};
});
// Trace which page is being requested.
console.log('parsePage', url);
return await AxiosInstance.get(url)
.then((response) => {
const html = response.data;
const $ = cheerio.load(html);
// Element handed to getMeta for the paging information.
const meta: cheerio.Cheerio = $('.meta-slider');
// One table row per listed entry.
const plenaryMinutesTable: cheerio.Cheerio = $('.bt-table-data > tbody > tr');
const metaData = getMeta(meta);
const plenaryMinutes = getPlenaryMinutes(plenaryMinutesTable, period);
return {
meta: metaData,
plenaryMinutes,
};
})
.catch((error) => {
// Log for visibility, then rethrow so the caller still sees the failure.
console.error('error', error);
throw error;
});
};

const getUrl = ({ offset, id }: { offset: number; id: string }) =>
Expand All @@ -79,22 +96,27 @@ const periods = [
{ period: 20, id: '866354-866354' },
];

const start = async (period: number) => {
const periodId = periods.find((p) => p.period === period)!.id;

let url: string | false = getUrl({ offset: 0, id: periodId });
const data: PlenaryMinutesItem[] = [];
const start = async () => {
const cursor: string | undefined = undefined;
const plenarprotokollItems: PlenaryMinutesItem[] = [];
let hasNextPage = true;
do {
const { meta, plenaryMinutes } = await parsePage(url, period);
data.push(...plenaryMinutes);
if (meta.nextOffset < meta.hits) {
url = getUrl({ offset: meta.nextOffset, id: periodId });
} else {
url = false;
const { data } = await api.getPlenarprotokollList({ cursor });
for (const plenarprotokoll of data.documents) {
const regex = /Protokoll der (?<meeting>\d+)\. Sitzung/gi;
const match = regex.exec(plenarprotokoll.titel);
const meetingNumber = match?.groups?.meeting;
plenarprotokollItems.push({
date: new Date(plenarprotokoll.datum),
period: plenarprotokoll.wahlperiode,
meeting: meetingNumber,
xml: plenarprotokoll.fundstelle.xml_url,
});
}
} while (url);
hasNextPage = cursor !== data.cursor;
} while (hasNextPage);
await PlenaryMinuteModel.collection.bulkWrite(
data.map((item) => ({
plenarprotokollItems.map((item) => ({
updateOne: {
filter: { meeting: item.meeting, period: item.period },
update: {
Expand All @@ -104,18 +126,16 @@ const start = async (period: number) => {
},
})),
);
console.log(`found for period ${period}: `, data.length);
};

(async () => {
console.info('START');
console.info('process.env', process.env.DB_URL);
if (!process.env.DB_URL) {
console.info('process.env', CONFIG.DB_URL);
if (!CONFIG.DB_URL) {
throw new Error('you have to set environment variable: DB_URL');
}
await mongoConnect(process.env.DB_URL);
await mongoConnect(CONFIG.DB_URL);
console.log('PlenaryMinutes', await PlenaryMinuteModel.countDocuments({}));
await start(19);
await start(20);
await start();
process.exit(0);
})();

0 comments on commit 325fde1

Please sign in to comment.