This repository was archived by the owner on Jun 5, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.js
More file actions
70 lines (64 loc) · 2.36 KB
/
scraper.js
File metadata and controls
70 lines (64 loc) · 2.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
const fetch = require("isomorphic-fetch");
const _ = require("lodash");
const fs = require("fs");
/**
 * Downloads one page of the plenary-protocol filter list and parses it.
 *
 * The page is requested with `limit=10` and the given `offset`. A line is
 * logged as soon as a response arrives, before the status code is checked.
 *
 * @param {number} offset - Value for the `offset` query parameter.
 * @returns {Promise<JSDOM|undefined>} The parsed document. When the request
 *   fails or the status is 400 or above, the error is printed with
 *   `console.error` and the promise resolves to `undefined` instead of
 *   rejecting.
 */
const getForOffset = offset => {
  const url = `http://www.bundestag.de/ajax/filterlist/de/dokumente/protokolle/plenarprotokolle/-/442112/h_6810466be65964217012227c14bad20f?limit=10&noFilterSet=true&offset=${offset}`;

  // Step 1: announce the response, reject error statuses, read the body.
  const readBody = res => {
    console.log(`Finished downloading page with offset ${offset}`);
    if (res.status >= 400) {
      throw new Error("Bad response from server");
    }
    return res.text();
  };

  // Step 2: turn the markup into a queryable DOM.
  const toDom = markup => new JSDOM(markup);

  // Failures are only reported; the caller receives undefined.
  const report = problem => console.error(problem);

  return fetch(url)
    .then(readBody)
    .then(toDom)
    .catch(report);
};
/**
 * Extracts the agenda data from one parsed protocol list page.
 *
 * Every row matching `.bt-table-data tbody tr` becomes one entry:
 *  - `date`: the `data-th` attribute of the row's first cell, read as a
 *    `day.month.year` string and rearranged to `year-month-day`;
 *  - `session`: the raw text content of the row's second cell;
 *  - `tops`: one object per `.bt-top-liste > li` with its position (`index`),
 *    the trimmed text of its first `<strong>` (`topic`) and the text of every
 *    `.bt-redner-liste strong` inside it (`speakers`).
 *
 * @param {{window: {document: Document}}} dom - Parsed page (a JSDOM instance).
 * @returns {Array<{tops: Array<{index: number, topic: string, speakers: string[]}>, session: string, date: string}>}
 */
const getTops = dom => {
  // "dd.mm.yyyy" -> "yyyy-mm-dd"
  const toIsoDate = dotted => {
    const parts = dotted.split(".");
    return `${parts[2]}-${parts[1]}-${parts[0]}`;
  };

  const describeTop = (item, position) => ({
    index: position,
    topic: item.querySelector("strong").textContent.trim(),
    speakers: Array.from(
      item.querySelectorAll(".bt-redner-liste strong"),
      node => node.textContent
    )
  });

  const describeRow = row => {
    const date = toIsoDate(
      row.querySelector("td:first-child").getAttribute("data-th")
    );
    const items = row.querySelectorAll(".bt-top-liste > li");
    const session = row.querySelector("td:nth-child(2)").textContent;
    return { tops: Array.from(items, describeTop), session, date };
  };

  const rows = dom.window.document.querySelectorAll(".bt-table-data tbody tr");
  return Array.from(rows, describeRow);
};
/**
 * Downloads the protocol list pages for the offsets 0, 10, ..., 240 in
 * parallel, extracts the agenda data from every page that loaded and writes
 * the combined result to `filepath` as pretty-printed JSON.
 *
 * Pages that could not be downloaded are skipped (with a message on stderr)
 * rather than aborting the whole run. Any other failure, including a failed
 * write, is reported on stderr and marks the process as failed through
 * `process.exitCode`.
 *
 * @param {string} filepath - Path of the JSON file to write.
 * @returns {Promise<void>} Settles once the file has been written or the
 *   failure has been reported. It never rejects, so callers may ignore it.
 */
const downloadAndWrite = (filepath) => {
  const requests = [];
  for (let offset = 0; offset < 250; offset += 10) {
    requests.push(getForOffset(offset));
  }

  return Promise.all(requests)
    .then(pages => {
      // getForOffset logs a failed download and resolves to undefined; drop
      // those entries instead of letting getTops crash on them.
      const doms = pages.filter(Boolean);
      if (doms.length < pages.length) {
        console.error(
          `Skipping ${pages.length - doms.length} page(s) that failed to download`
        );
      }

      const res = _.flatMap(doms, dom => getTops(dom));
      const json = JSON.stringify(res, null, 2);

      // Adapt the callback API so a write error reaches the catch below.
      return new Promise((resolve, reject) => {
        fs.writeFile(filepath, json, "utf8", err => {
          if (err) {
            reject(err);
            return;
          }
          resolve();
        });
      });
    })
    .then(() => console.log("Done Writing! ✍️"))
    .catch(err => {
      console.error("Failed to scrape and write the protocols:", err);
      process.exitCode = 1;
    });
};
/**
 * Command-line entry point.
 *
 * Takes the output path from the first command-line argument and starts the
 * download. Without an argument it prints a usage hint on stderr and does
 * nothing else.
 */
const main = () => {
  const [, , filepath] = process.argv;
  if (filepath) {
    downloadAndWrite(filepath);
    return;
  }
  console.error(
    "⚠️ You need to supply the path that the file should be written to"
  );
};

main();