Skip to content

Commit b494602

Browse files
authored
Merge branch 'main' into feat/presentation/elu
2 parents 95e425a + bb5eb85 commit b494602

33 files changed

+3334
-2472
lines changed

.github/workflows/pre-commit-tests.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,4 +37,4 @@ jobs:
3737
poetry run pytest -v
3838
- name: Verify tests results
3939
if: ${{ failure() }}
40-
run: exit 1
40+
run: exit 1

README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,20 @@ Pour comprendre en détail comment ça fonctionne :
8080

8181
[Comprendre les Extracteurs de données](./docs/extract.md)
8282

83+
## Chargement
84+
85+
Après avoir extrait les données, il est nécessaire de les charger en base de données avant de lancer la transformation dbt.
86+
87+
Le script “load” se lance exactement comme le script “extract” ci-dessus, avec les mêmes arguments :
88+
89+
```bash
90+
# Charger tous les datasets source du domaine "geographical_references"
91+
poetry run bin/odis.py load --domain geographical_references
92+
93+
# Charger seulement les datasets "regions" et "departements" du domaine "geographical_references"
94+
poetry run bin/odis.py load --sources geographical_references.regions, geographical_references.departements
95+
```
96+
8397
## Sonder les sources disponibles
8498

8599
L’option “explain” permet de voir facilement comment les API, Domaines et Sources sont définis dans la configuration. Si l’option “explain” est passée, le script n’extrait aucune donnée mais montre seulement les infos sur les configurations demandées.

bin/odis.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ async def extract_data_sources(
152152
)
153153

154154
for exc in tasks_exceptions:
155-
logger.error(f"Error: {exc}")
155+
logger.exception(f"Error during extraction: {exc}", exc_info=exc)
156156

157157
# exit with a non-zero status code
158158
# to indicate that there was an error

common/utils/http/async_client.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,26 +19,30 @@ class AsyncHttpClient(HttpClient):
1919
"""
2020

2121
_session: aiohttp.ClientSession
22+
_timeout: aiohttp.ClientTimeout
2223

23-
def __init__(self, max_connections: int = 100):
24+
def __init__(self, max_connections: int = 100, timeout: int = 1200):
2425
"""
2526
Args:
2627
max_connections (int, optional): The max number of concurrent connections.
2728
Defaults to 100.
29+
timeout (int, optional): The timeout in seconds for each individual HTTP request.
30+
Defaults to 1200 (20 minutes).
2831
"""
2932

3033
conn = aiohttp.TCPConnector(limit=max_connections)
31-
self._session = aiohttp.ClientSession(connector=conn)
34+
self._timeout = aiohttp.ClientTimeout(total=timeout)
35+
self._session = aiohttp.ClientSession(connector=conn, timeout=self._timeout)
3236

3337
logger.debug(
34-
f"AsyncHttpClient initialized with max_connections={max_connections}"
38+
f"AsyncHttpClient initialized with max_connections={max_connections}, timeout={timeout}s"
3539
)
3640

3741
@retry(
3842
retry=retry_if_exception_type(aiohttp.ClientError),
39-
stop=(stop_after_delay(180) | stop_after_attempt(3)),
43+
stop=(stop_after_delay(2400) | stop_after_attempt(3)),
4044
before=before_log(logger, logging.DEBUG),
41-
reraise=True, # re-raise the last exception
45+
reraise=True,
4246
)
4347
async def get(
4448
self,

common/utils/source_extractors.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,11 @@ async def download_page(self, url: str) -> ExtractionResult:
9898
# if not given in API response, BUT if no next page could be derived, then true
9999
# else: false
100100
is_last_key = self.model.response_map.get("is_last")
101-
is_last = (
102-
jmespath.search(is_last_key, payload)
103-
if is_last_key
104-
else (next_url is None)
105-
)
101+
if is_last_key:
102+
is_last_from_response = jmespath.search(is_last_key, payload)
103+
is_last = is_last_from_response if is_last_from_response is not None else (next_url is None)
104+
else:
105+
is_last = (next_url is None)
106106

107107
# If all went well, success = true
108108
success = True

datasources.yaml

Lines changed: 48 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ domains:
288288
API: GEO.api.gouv.fr
289289
description: Référentiel géographique GEO - niveau commune
290290
type: JsonExtractor
291-
endpoint: /communes?fields=code,nom,population,departement,region,centre
291+
endpoint: /communes?fields=code,nom,population,departement,region,centre,epci
292292
format: json
293293

294294
arrondissements:
@@ -315,6 +315,15 @@ domains:
315315
name: excelfile_dfload
316316
type: notebook
317317

318+
geocodes_passage_annuel:
319+
API: Opendatasoft
320+
description: Table de passage annuelle 2025, équivalence codes géographiques 2025 et année précédente d'établissement de codes (2023)
321+
type: FileExtractor
322+
endpoint: /catalog/datasets/georef-france-matching-code/exports/csv?use_labels=true
323+
format: csv
324+
load_params:
325+
separator: ;
326+
318327
accueil:
319328

320329
cada_cph_huda:
@@ -338,14 +347,13 @@ domains:
338347
maxResult: 10000
339348
# TIME_PERIOD: 2021
340349
startPeriod: "2009-01-01"
341-
endPeriod: "2025-01-01"
350+
endPeriod: "2025-10-01"
342351
GEO: # géographies
343352
- COM # communes
344353
- DEP # départements
345354
- REG # régions
346355
RP_MEASURE: DWELLINGS # nombre de logements
347356
L_STAY: _T
348-
TOH: _T
349357
CARS: _T
350358
NOR: _T
351359
TSH: # statut d'occupation du logement
@@ -379,11 +387,11 @@ domains:
379387
format: json
380388
extract_params:
381389
maxResult: 10000
382-
TIME_PERIOD: [2010,2015,2021,2025]
390+
startPeriod: "2009-01-01"
391+
endPeriod: "2025-10-01"
383392
GEO: ["COM", "DEP", "REG"]
384393
RP_MEASURE: DWELLINGS # nombre de logements
385394
L_STAY: _T
386-
TOH: _T
387395
CARS: _T
388396
NOR: _T
389397
TSH: # statut d'occupation du logement
@@ -412,11 +420,11 @@ domains:
412420
format: json
413421
extract_params:
414422
maxResult: 10000
415-
TIME_PERIOD: [2010,2015,2021,2025]
423+
startPeriod: "2009-01-01"
424+
endPeriod: "2025-10-01"
416425
GEO: ["COM", "DEP", "REG"]
417426
RP_MEASURE: DWELLINGS
418427
L_STAY: _T
419-
TOH: _T
420428
CARS: _T
421429
NOR: _T
422430
TSH:
@@ -442,11 +450,11 @@ domains:
442450
format: json
443451
extract_params:
444452
maxResult: 10000
445-
TIME_PERIOD: [2010,2015,2021,2025]
453+
startPeriod: "2009-01-01"
454+
endPeriod: "2025-10-01"
446455
GEO: ["COM", "DEP", "REG"]
447456
RP_MEASURE: DWELLINGS
448457
L_STAY: _T
449-
TOH: _T
450458
CARS: _T
451459
NOR: _T
452460
TSH:
@@ -472,11 +480,11 @@ domains:
472480
format: json
473481
extract_params:
474482
maxResult: 10000
475-
TIME_PERIOD: [2010,2015,2021,2025]
483+
startPeriod: "2009-01-01"
484+
endPeriod: "2025-10-01"
476485
GEO: ["COM", "DEP", "REG"]
477486
RP_MEASURE: DWELLINGS_ROOMS # nombre de pièces
478487
L_STAY: _T
479-
TOH: _T
480488
CARS: _T
481489
NOR: _T
482490
TSH: # statut d'occupation du logement
@@ -557,7 +565,8 @@ domains:
557565
format: json
558566
extract_params:
559567
maxResult: 10000
560-
TIME_PERIOD: 2023
568+
startPeriod: "2009-01-01"
569+
endPeriod: "2025-10-01"
561570
GEO: ["COM","REG","DEP"]
562571
FACILITY_TYPE: ["A122", "A128", "A203", "A206", "A208","A304","A501","A503", "A504"]
563572
response_map:
@@ -569,7 +578,6 @@ domains:
569578

570579
population:
571580

572-
573581
by_age:
574582
API: INSEE.Melodi
575583
description: Répartition de la population par tranches d'âge
@@ -587,42 +595,21 @@ domains:
587595

588596
by_age_gender:
589597
API: INSEE.Melodi
590-
description: Répartition de la population par tranches d'âge et par genre (M/F, Cis/Trans)
598+
description: Répartition de la population par tranches d'âge et par genre
591599
type: MelodiExtractor
592-
endpoint: /data/DS_ESTIMATION_POPULATION # Dataset des estimations de population de l'INSEE
600+
endpoint: /data/DS_RP_POPULATION_PRINC # recensement données principales
593601
format: json
594602
extract_params:
595603
maxResult: 10000
596-
SEX:
597-
- M # Hommes (all)
598-
- M_cis # Hommes cis
599-
- F # Femmes (all)
600-
- F_cis # Femmes cis
601-
- TS # Personnes Trans
602-
- _Z # 'Non Applicable'
603-
EP_MEASURE: POP # Prendre uniquement la mesure de population
604+
TIME_PERIOD: 2022
605+
SEX: ["M", "F", "_T"]
606+
GEO: ["COM", "DEP", "REG"]
607+
RP_MEASURE: POP
604608
response_map:
605609
data: observations
606610
next: paging.next
607611
is_last: paging.isLast
608612

609-
# repartition_sexe_age:
610-
# API: INSEE.Melodi
611-
# description: repartition de la population par sexe et par tranche d'age
612-
# type: MelodiExtractor
613-
# endpoint: /data/DS_RP_POPULATION_PRINC
614-
# format: json
615-
# extract_params:
616-
# maxResult: 10000
617-
# TIME_PERIOD: 2021
618-
# GEO: ["COM","DEP","REG"]
619-
# SEX: ["F","M"]
620-
# AGE: ["Y_LT15","Y15T24","Y25T39","Y40T54","Y55T64","Y65T79","Y_GE80"]
621-
# response_map:
622-
# data: observations
623-
# next: paging.next
624-
# is_last: paging.isLast
625-
626613
population_superficie:
627614
API: INSEE.Melodi
628615
description: recuperation de la population et de la superficie pour calcul de densite
@@ -631,14 +618,31 @@ domains:
631618
format: json
632619
extract_params:
633620
maxResult: 10000
634-
TIME_PERIOD: 2021
621+
TIME_PERIOD: 2022
635622
GEO: ["COM","DEP","REG"]
636623
RP_MEASURE: ["POP","SUP"]
637624
OCS: "_T"
638625
response_map:
639626
data: observations
640627
next: paging.next
641628
is_last: paging.isLast
629+
630+
population_totale:
631+
API: INSEE.Melodi
632+
description: recuperation des populations communale et totale des communes
633+
type: MelodiExtractor
634+
endpoint: /data/DS_POPULATIONS_REFERENCE
635+
format: json
636+
extract_params:
637+
maxResult: 10000
638+
TIME_PERIOD: 2022
639+
GEO: ["COM"]
640+
POPREF_MEASURE: ["PMUN", "PTOT"]
641+
page: 1
642+
response_map:
643+
data: observations
644+
next: paging.next
645+
is_last: paging.isLast
642646

643647
categorie_socio_pro:
644648
API: INSEE.Melodi
@@ -648,12 +652,12 @@ domains:
648652
format: json
649653
extract_params:
650654
maxResult: 10000
651-
TIME_PERIOD: 2021
655+
TIME_PERIOD: 2022
652656
GEO: ["COM","DEP","REG"]
653657
SEX: "_T"
654658
RP_MEASURE: "POP"
655-
AGE: "Y_GE15"
656-
PCS: ["1","2","3","4","5","6","7","9"]
659+
AGE: "Y_GE15" # 15 years old or more
660+
PCS: ["1", "2", "3", "4", "5", "6", "7", "9", "_T"] # 8 (students) is still excluded; "_T" was added just in case
657661
response_map:
658662
data: observations
659663
next: paging.next

0 commit comments

Comments
 (0)