-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.js
156 lines (140 loc) · 4.46 KB
/
scrape.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
const cheerio = require("cheerio");
var axios = require("axios");
var GeoJSON = require("geojson");
var converter = require("json-2-csv");
var shpConverter = require("geojson2shp");
var fs = require("fs");
// Base address of the Slovak Wikipedia; article links found on the list page
// are resolved against it.
const rootUrl = "https://sk.wikipedia.org/";

// Overview article whose tables list every Slovak municipality and military
// district, each linking to its own article.
const tableUrl =
  "https://sk.wikipedia.org/wiki/Zoznam_slovensk%C3%BDch_obc%C3%AD_a_vojensk%C3%BDch_obvodov";
/**
 * Fetch a URL with axios and hand back the response body.
 *
 * Failures are best-effort: the error is printed with console.log and the
 * returned promise resolves to undefined instead of rejecting, so callers
 * must be prepared for a missing body.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} The `data` field of the axios response, or undefined
 *   when the request failed.
 */
const getBodyOfUrl = async (url) => {
  let body;
  try {
    body = (await axios.get(url)).data;
  } catch (failure) {
    console.log(failure);
  }
  return body;
};
/**
 * Replace every occurrence of each search string with one replacement.
 *
 * The search strings are applied one after another, so a later entry also
 * sees text produced by an earlier substitution.
 *
 * @param {string} where - Text to process.
 * @param {string[]} what - Literal substrings to look for.
 * @param {string} toWhat - Text inserted in place of every match.
 * @returns {string} The text after all substitutions.
 */
const replace = (where, what, toWhat) =>
  what.reduce((text, needle) => text.split(needle).join(toWhat), where);
/**
 * Extract a year from free-form text.
 *
 * - Text that is numeric as a whole (e.g. "1113") is returned as that number.
 * - Otherwise all digits in the text are collected and the last four are read
 *   as the year; it is accepted only when it is below 2000.
 * - Text with fewer than four digits, or with no digits at all, yields false.
 *
 * @param {string|number|null|undefined} text - Raw cell text.
 * @returns {number|false} The parsed year, or false when none was found.
 */
const parseYear = (text) => {
  const source = text == null ? "" : String(text);

  // Fast path: the whole value is a plain number.
  const whole = parseInt(source, 10);
  if (Number(source) === whole) {
    return whole;
  }

  // match() returns null (not an empty array) when there are no digits.
  const digits = source.match(/\d/g);
  if (digits === null || digits.length < 4) {
    return false;
  }

  const year = parseInt(digits.slice(-4).join(""), 10);
  return year < 2000 ? year : false;
};
/**
 * Collect the name and article link of every municipality on the list page.
 *
 * For each row of each table, the first cell supplies the name (its text) and
 * the link (the href of its first anchor). Rows lacking either are ignored.
 *
 * @param {Function} $ - cheerio instance loaded with the list page.
 * @returns {{name: string, link: string}[]} Municipalities in page order.
 */
const extractMunicipalityLinks = ($) => {
  const found = [];
  $("table").each((ti, table) => {
    $(table)
      .find("tr")
      .each((ri, row) => {
        const firstCell = $(row).find("td").first();
        const name = firstCell.text().trim();
        const link = firstCell.find("a").attr("href");
        if (name && link) {
          found.push({ name, link });
        }
      });
  });
  return found;
};

/**
 * Copy the value of one infobox row onto the municipality record.
 *
 * The row's header text selects the target field; rows with any other header
 * are ignored. Most values are read from the first content node of the row's
 * data cell.
 *
 * @param {Function} $ - cheerio instance loaded with the municipality page.
 * @param {object} row - The infobox `tr` element.
 * @param {object} municipality - Record to fill in (mutated).
 */
const applyInfoboxRow = ($, row, municipality) => {
  const rowEl = $(row);
  const rowTitle = rowEl.find("th").text().trim();
  const leadingText = () => rowEl.find("td").contents().first().text().trim();

  switch (rowTitle) {
    case "Obyvateľstvo":
      // Strip the whitespace used as a thousands separator before parsing.
      municipality.population = parseInt(
        leadingText().replace(/\s/g, ""),
        10
      );
      break;
    case "Kraj":
      municipality.region = leadingText();
      break;
    case "Okres":
      municipality.district = leadingText();
      break;
    case "Región":
      municipality.region_historical = leadingText();
      break;
    case "Rozloha":
      // Decimal comma -> decimal point.
      municipality.area = parseFloat(leadingText().replace(",", "."));
      break;
    case "Prvá pís. zmienka":
      municipality.first_mentioned = parseYear(leadingText());
      break;
    case "Nadmorská výška":
      municipality.elevation = parseInt(leadingText(), 10);
      break;
    case "Súradnice": {
      const [first, second] = replace(
        rowEl.find("span.geo-dec").text().trim(),
        [","],
        "."
      )
        .split(" ")
        .map((coord) => parseFloat(coord));
      // NOTE(review): coordinate_x is simply the first number in the geo-dec
      // span; confirm it matches the order GeoJSON.parse expects for Point.
      municipality.coordinate_x = first;
      municipality.coordinate_y = second;
      break;
    }
    default:
      break;
  }
};

/**
 * Scrape all municipalities and export them as GeoJSON, CSV and a zipped
 * shapefile under ./out.
 *
 * Steps: download the list page, collect the municipality links, download
 * every municipality article one after another and read its infobox, then
 * write the three output files.
 *
 * A municipality whose article cannot be downloaded is kept with only its
 * name and link, so a single failed request no longer aborts the run.
 *
 * @returns {Promise<void>} Resolves once the shapefile has been written.
 * @throws {Error} When the list page itself cannot be downloaded.
 */
const scrap = async () => {
  const tableBody = await getBodyOfUrl(tableUrl);
  if (!tableBody) {
    throw new Error(
      `Could not download the municipality list from ${tableUrl}`
    );
  }
  const municipalities = extractMunicipalityLinks(cheerio.load(tableBody));

  // Pages are fetched sequentially on purpose to stay gentle on the server.
  for (const [index, municipality] of municipalities.entries()) {
    console.log(
      "parsing",
      municipality.name,
      `${Math.floor((index / municipalities.length) * 100)}%`
    );

    // Resolve against the base so "/wiki/..." does not yield "//wiki/...".
    const pageUrl = new URL(municipality.link, rootUrl).href;
    const municipalityHtml = await getBodyOfUrl(pageUrl);
    if (!municipalityHtml) {
      console.log("skipping", municipality.name, "- page not available");
      continue;
    }

    const $page = cheerio.load(municipalityHtml);
    $page(".infobox tr").each((ri, row) => {
      applyInfoboxRow($page, row, municipality);
    });
  }

  // The output directory may not exist on a fresh checkout.
  fs.mkdirSync("./out", { recursive: true });

  // create and export .geojson
  const geojson = GeoJSON.parse(municipalities, {
    Point: ["coordinate_x", "coordinate_y"],
  });
  fs.writeFileSync(
    "./out/municipalities-slovakia.geojson",
    JSON.stringify(geojson)
  );

  // create and export .csv
  converter.json2csv(municipalities, (err, csv) => {
    if (err) {
      console.error("CSV export failed:", err);
      return;
    }
    fs.writeFileSync("./out/municipalities-slovakia.csv", csv);
  });

  // create zipped shapefile from the geojson
  await shpConverter.convert(
    "./out/municipalities-slovakia.geojson",
    "./out/municipalities-shp.zip",
    { layer: "municipalities-slovakia", targetCrs: 4326 }
  );
};
// Entry point: run the scraper and make a failure visible to the caller
// (logged error, non-zero exit code) instead of leaving the promise floating.
scrap().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});