scrapers/insee.js

217 lines
6.1 KiB
JavaScript
Raw Permalink Normal View History

2023-12-10 21:33:34 +01:00
// Zone-level prefixes accepted by request_data as its `granularity` argument;
// they end up in the request URL as `${granularity}-${code}`.
const precision = {
    "commune": "COM",
    "departement": "DEP",
    "arrondissement": "ARR",
};
// Pool of INSEE API tokens. request_data picks one per call (call counter modulo
// the pool size) and sends it as an `Authorization: Bearer <token>` header.
// The pool size also drives the delay computed in WAIT_TIME_MILLIS.
// NOTE(review): these look like live credentials committed to source control —
// consider loading them from the environment or an untracked file and rotating them.
const API_KEYS = [
"7f113b8e-b1ca-36c0-ae0b-4f038f941241", // original app
"aaaf836f-0cc9-3b6e-9f43-11c7c277ba38", // app 2
"5511ea5b-442d-37f3-bb09-11f6af8a3e79", // app 3
"3ce290d1-2bfa-38d9-b6a1-b4170765039d", // app 4
"e5f49c24-6a4f-328e-9113-bcd06ff033eb", // app 5
"5f252c10-64c8-3956-9467-2e1d0c0cc942", // app 6
"dfebb8ea-427d-3b62-a62f-5e4b220e094d", // app 7
"0781ac0b-fd37-3f10-b43a-7797eca237eb", // app 8
"5f47c3ea-4311-311b-95e1-c2ed6c813839", // app 9
"bde457a2-0de3-3c59-9190-df850c7facfd", // app 10
"ceec0915-199c-304f-ae4d-bce9edd694cc", // app 11
"80074dcf-3ded-3bd7-abb7-a7de94e4fb6a", // app 12
"5de51d31-2580-3c14-8e6d-5e46f5d9da98", // app 13
"22e87dc0-73e8-309c-9a41-e886808fd652", // app 14
"da8be43f-baff-399d-a94d-574e0d7fee62", // app 15
"92146137-957e-32c2-b309-8a8e656c4228", // app 16
"3d79910f-b0d6-30af-a26f-65a6ff7d90b3", // app 17
"af6eb70c-8a9c-351b-a7de-b39dcc92cb98", // app 18
"c07501c9-1f03-362a-9e87-aaa5f071a839", // app 19
"c8602bc1-e794-3d18-8d1a-c5ace7049fdb", // app 20
"7295921f-0bf4-31d5-824e-1021084adcbf", // app 21
"1fb4f0b7-e667-35ca-b905-1038e7321d95", // app 22
"98e0c55b-bd34-3503-9c62-41a49aeffd91", // app 23
"0fdd5b94-3501-3dd8-8947-0c14539f18ee", // app 24
"c274217f-9052-3bff-9efd-c4a5ade8e86e", // app 25
"3c8be149-a666-32ea-9194-73abbfcb290f", // app 26
"673d5d64-add0-39c3-894d-76eff7d33d79", // app 27
"eede4152-5fcf-3d21-aa60-5c5b5da74dc6", // app 28
"ac0d96fa-f4be-335b-9b5f-04db1360b84b", // app 29
"ad785a7b-8138-316d-842e-255058188620", // app 30
];
// Rate limit imposed by INSEE, in requests per minute (applies to one API key).
const MAX_RPM = 30;
// 60 seconds divided by RPM => minimum spacing, in seconds, between two
// requests made with the same API key.
const WAIT_TIME_FOR_SINGLE = 60 / MAX_RPM;
// Delay between two consecutive calls, in milliseconds. Because every call
// switches to the next API key, the per-key spacing is divided by the number
// of keys so as to maximize the overall number of API calls.
const WAIT_TIME_MILLIS = ((WAIT_TIME_FOR_SINGLE / API_KEYS.length) * 1000);
// Add a 10% safety margin on top of the computed delay.
const WAIT_TIME = WAIT_TIME_MILLIS + ( WAIT_TIME_MILLIS * 0.1 );
// Startup banner: number of keys and the delay that will be applied.
console.log('='.repeat(80));
console.log(`Got ${API_KEYS.length} API keys :`);
console.log(`Going to wait ${WAIT_TIME}ms between each call.`);
console.log('='.repeat(80));
console.log();
// Number of requests issued so far; request_data takes it modulo
// API_KEYS.length to choose the key for the next call, then increments it.
let CALL_COUNTER = 0;
/**
 * Query the INSEE "donnees-locales" endpoint for one zone and extract the value
 * of the cell whose measure code is `PIMPOT`.
 *
 * Each call uses the next API key of `API_KEYS` (round-robin through
 * `CALL_COUNTER`) and prints a coloured one-line status to the console.
 *
 * @param {string} code - Zone code, placed after the granularity in the URL.
 * @param {string} [granularity='COM'] - Zone level prefix (see `precision`).
 * @returns {Promise<{code: string, nom: string, impots: number} | null | undefined>}
 *   - the zone record (`impots` is the absolute value of the `PIMPOT` cell);
 *   - `null` when the API answers that no zone matches the request;
 *   - `undefined` for every other failure (network error, unparsable body,
 *     unnamed zone, no cells, no `PIMPOT` cell).
 *   Callers rely on the `null` / `undefined` distinction to count empty zones.
 */
async function request_data(code, granularity='COM'){
    //console.log(`Requesting ${granularity} : ${code}`)
    // Rotate through the key pool so that consecutive calls never share a key.
    const api_n = CALL_COUNTER % API_KEYS.length;
    const api_key = `Bearer ${API_KEYS[api_n]}`;
    CALL_COUNTER += 1;

    // Transport step: a failure here means there is no body to inspect at all,
    // so it is reported separately from a parsing problem.
    let text;
    try {
        const res = await fetch(`https://api.insee.fr/donnees-locales/V0.1/donnees/geo-INDICS_FILO_DISP_DET@GEO2023FILO2020_BV/${granularity}-${code}.ALL`, {
            headers: {
                'Accept': 'application/json',
                "Authorization": api_key,
            }
        });
        text = await res.text();
    }
    catch(e){
        console.error(`[WARNING] Request failed ! for ${code} at granularity "${granularity}" : ${e?.message ?? e}`);
        return undefined;
    }

    // Parsing step: the "no such zone" answer is plain text, not JSON, so it
    // surfaces here as a parse error and is mapped to `null`.
    let data;
    try {
        data = JSON.parse(text);
    }
    catch(e){
        if( text.trim() === 'Aucune zone ne correspond à la requête' ) {
            //console.error('No data.')
            console.log(`[\x1b[31m${granularity}-${code}\x1b[0m] ... \x1b[0m`);
            return null;
        }
        console.error(`[WARNING] Error parsing data ! for ${code} at granularity "${granularity}"`)
        return undefined;
    }

    const commune = {
        code: code,
    };
    // `data?.` guards against a body that is valid JSON but not an object (e.g. `null`).
    commune.nom = data?.Zone?.Millesime?.Nccenr;
    if( commune.nom === undefined ) {
        console.error(`[WARNING] Unnamed zone ! for ${code} at granularity "${granularity}"`)
        return undefined;
    }

    if( !data.Cellule ) {
        //console.error(`No Cellule data for ${code}`);
        //console.log(data);
        console.log(`[\x1b[31m${granularity}-${code}\x1b[0m] ${commune.nom} \x1b[0m`);
        return undefined;
    }

    // Defensive: accept a lone cell object as well as an array of cells, and
    // skip entries that are empty or carry no measure instead of throwing.
    const cellules = Array.isArray(data.Cellule) ? data.Cellule : [data.Cellule];
    const cellule_impots = cellules.find( cell => cell?.Mesure?.["@code"] === 'PIMPOT' );
    if( cellule_impots === undefined ) {
        console.error(`[WARNING] No PIMPOT cell ! for ${code} at granularity "${granularity}"`)
        return undefined;
    }
    commune.impots = Math.abs( Number(cellule_impots.Valeur) );

    console.log(`[\x1b[1;32m${granularity}-${code}\x1b[0m\x1b[1m] ${commune.nom} : \x1b[1;33m${commune.impots}\x1b[0m`);
    return commune;
}
/**
 * Suspend the awaiting caller for a given duration.
 *
 * @param {number} ms - Delay in milliseconds, handed to `setTimeout`.
 * @returns {Promise<void>} Settles (with no value) once the timer has fired.
 */
async function sleep(ms) {
    await new Promise((wake) => {
        setTimeout(wake, ms);
    });
}
// Entry point: for every department code 01..98 known to the API, fetch its
// arrondissements and save them to `./arrondissements-<dep>.json`.
// NOTE(review): the numeric loop cannot produce non-numeric or 3-digit
// department codes (e.g. 2A / 2B) — confirm whether those should be covered.
(async () => {
    /**
     * Request the zones `<code_departement><index>` for index 1..last_index and
     * collect the ones that returned data.
     *
     * Arrondissements use `last_index = 9` with no padding; communes can be
     * scraped the same way with `precision.commune`, `last_index = 998` and
     * `pad_width = 3`.
     *
     * @param {string} code_departement - Two-digit department code.
     * @param {string} granularity - Zone level prefix (see `precision`).
     * @param {number} last_index - Highest index to try (inclusive).
     * @param {number} [pad_width=0] - Zero-pad the index to this width.
     * @param {number} [max_empty_in_a_row=100] - Stop early after this many
     *   consecutive "no such zone" (`null`) answers.
     * @returns {Promise<object[]>} Records returned by request_data, in request order.
     */
    async function scrape_zones(code_departement, granularity, last_index, pad_width = 0, max_empty_in_a_row = 100) {
        const zones = [];
        let empty_in_a_row = 0;
        for( let index = 1; index <= last_index; index++ ) {
            const code_zone = code_departement + String(index).padStart(pad_width, '0');
            const zone = await request_data(code_zone, granularity);
            if( zone ) {
                empty_in_a_row = 0;
                zones.push(zone);
            }
            else if ( zone === null ) {
                // Only "no such zone" answers count as empty; errors
                // (`undefined`) neither increase nor reset the streak.
                empty_in_a_row += 1;
                if( empty_in_a_row >= max_empty_in_a_row ) {
                    console.log(`Got ${max_empty_in_a_row} empty data in a row : assuming no more data for dep.`)
                    break;
                }
            }
            await sleep(WAIT_TIME);
        }
        return zones;
    }

    // Collected for the (currently disabled) departements.json export below.
    const dep_data = [];
    for( let dep = 1; dep < 99; dep++ ) {
        const code_departement = String(dep).padStart(2, '0');
        const departement = await request_data(code_departement, precision.departement);
        if( departement ) {
            dep_data.push(departement);

            // A fresh array per department: each output file holds only the
            // zones of its own department, and nothing is written for
            // department codes the API does not know.
            const arr_data = await scrape_zones(code_departement, precision.arrondissement, 9);

            console.log(`\n==> Saving arrondissements for dep ${code_departement}...`)
            await Bun.write(`./arrondissements-${code_departement}.json`, JSON.stringify( arr_data, null, 4 ));
            console.log('==> Done.\n')
        }
        await sleep(WAIT_TIME);
    }
    //await Bun.write('./departements.json', JSON.stringify( dep_data, null, 4 ));
    //console.log(dep_data)
    console.log('Done !')
})().catch((error) => {
    // Surface any unexpected failure and make the process exit non-zero.
    console.error(error);
    process.exitCode = 1;
});