Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions common/utils/source_extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,12 @@ async def download(self) -> AsyncGenerator[ExtractionResult, None]:

is_last = False
url = self.url
is_first_page = True

while not is_last:

# yield the request result
result = await self.download_page(url)
result = await self.download_page(url, is_first_page)

is_last = result.is_last

Expand All @@ -60,19 +61,26 @@ async def download(self) -> AsyncGenerator[ExtractionResult, None]:
await asyncio.sleep(60 / self.api_config.throttle)

url = result.next_url
is_first_page = False

logger.debug(f"Next page: {result.next_url}")

yield result

async def download_page(self, url: str) -> ExtractionResult:
async def download_page(self, url: str, is_first_page: bool = False) -> ExtractionResult:
"""Downloads data corresponding to the given source model.
The parameters of the request (URL, headers etc) are set using the inherited set_query_parameters method.
"""

# if url has a query string, ignore the dict-defined parameters
url_querystr = urllib.parse.urlparse(url).query
passed_params = self.model.extract_params if url_querystr == "" else None

# For INSEE Melodi API: add page=1 for first request if not already present
# This ensures the API returns proper pagination metadata (next, isLast)
if is_first_page and passed_params is not None and "page" not in passed_params:
passed_params = {**passed_params, "page": 1}

# logger.info(f"querying '{url}'")

success = False
Expand Down
69 changes: 36 additions & 33 deletions datasources.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,27 @@ domains:
format: csv
load_params:
separator: ;

population_communes:
API: INSEE.Melodi
description: Population des communes
type: MelodiExtractor
endpoint: /data/DS_RP_POPULATION_PRINC
format: json
extract_params:
maxResult: 10000
TIME_PERIOD: 2022
# startPeriod: "2009-01-01"
# endPeriod: "2025-10-01"
GEO: "COM"
RP_MEASURE: POP
SEX: _T
AGE: _T
response_map:
data: observations
next: paging.next
is_last: paging.isLast


geographical_references:

Expand Down Expand Up @@ -326,9 +347,9 @@ domains:
format: json
extract_params:
maxResult: 10000
# TIME_PERIOD: 2021
startPeriod: "2009-01-01"
endPeriod: "2025-10-01"
TIME_PERIOD: 2022
# startPeriod: "2009-01-01"
# endPeriod: "2025-10-01"
GEO: # géographies
- COM # communes
- DEP # départements
Expand Down Expand Up @@ -368,8 +389,9 @@ domains:
format: json
extract_params:
maxResult: 10000
startPeriod: "2009-01-01"
endPeriod: "2025-10-01"
TIME_PERIOD: 2022
# startPeriod: "2009-01-01"
# endPeriod: "2025-10-01"
GEO: ["COM", "DEP", "REG"]
RP_MEASURE: DWELLINGS # nombre de logements
L_STAY: _T
Expand Down Expand Up @@ -401,8 +423,9 @@ domains:
format: json
extract_params:
maxResult: 10000
startPeriod: "2009-01-01"
endPeriod: "2025-10-01"
TIME_PERIOD: 2022
# startPeriod: "2009-01-01"
# endPeriod: "2025-10-01"
GEO: ["COM", "DEP", "REG"]
RP_MEASURE: DWELLINGS
L_STAY: _T
Expand Down Expand Up @@ -431,8 +454,9 @@ domains:
format: json
extract_params:
maxResult: 10000
startPeriod: "2009-01-01"
endPeriod: "2025-10-01"
TIME_PERIOD: 2022
# startPeriod: "2009-01-01"
# endPeriod: "2025-10-01"
GEO: ["COM", "DEP", "REG"]
RP_MEASURE: DWELLINGS
L_STAY: _T
Expand Down Expand Up @@ -461,8 +485,9 @@ domains:
format: json
extract_params:
maxResult: 10000
startPeriod: "2009-01-01"
endPeriod: "2025-10-01"
TIME_PERIOD: 2022
# startPeriod: "2009-01-01"
# endPeriod: "2025-10-01"
GEO: ["COM", "DEP", "REG"]
RP_MEASURE: DWELLINGS_ROOMS # nombre de pièces
L_STAY: _T
Expand Down Expand Up @@ -678,26 +703,4 @@ domains:
name: unzip_load_csv_files
type: notebook

taux_pauvrete_communes:
API: INSEE.statistiques
description: |
Taux de pauvretés au niveau des communes par seuil
Millésime 2021
type: FileExtractor
endpoint: /fichier/7756855/indic-struct-distrib-revenu-2021-COMMUNES_csv.zip
format: zip
preprocessor:
name: unzip_load_csv_files
type: notebook

taux_pauvrete_supra:
API: INSEE.statistiques
description: |
Taux de pauvretés au niveau des arrondissements, departements, région par seuil
Millésime 2021
type: FileExtractor
endpoint: /fichier/7756855/indic-struct-distrib-revenu-2021-SUPRA_csv.zip
format: zip
preprocessor:
name: unzip_load_csv_files
type: notebook
2 changes: 1 addition & 1 deletion dbt_odis/models/bronze/_odis_bronze__sources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ sources:
description: Source JSON loadée dans le champ data contenant
loaded_at_field: created_at

- name: presentation_page_population_communes
- name: presentation_population_communes
description: Source JSON loadée dans le champ data contenant
loaded_at_field: created_at

Expand Down
22 changes: 0 additions & 22 deletions dbt_odis/models/bronze/presentation_page_population_communes.sql

This file was deleted.

21 changes: 21 additions & 0 deletions dbt_odis/models/bronze/presentation_population_communes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{{ config(
    alias = 'vw_presentation_population_communes',
    tags = ['bronze', 'presentation']
) }}

-- Bronze model: flattens the JSON stored in the `data` column of the
-- `presentation_population_communes` source into one column per field.
-- Every extracted field is returned as text (->> operator); typing and
-- filtering are left to downstream models.

with raw_observations as (
    -- Cast the payload to jsonb once so each field extraction reuses it.
    select
        data::jsonb as payload,
        created_at
    from {{ source('bronze', 'presentation_population_communes') }}
),

population_communes as (
    select
        -- Observation value, nested under measures.OBS_VALUE_NIVEAU.value
        payload -> 'measures' -> 'OBS_VALUE_NIVEAU' ->> 'value' as value,
        -- Dimension attributes, kept under their original upper-case names
        payload -> 'dimensions' ->> 'AGE' as "AGE",
        payload -> 'dimensions' ->> 'GEO' as "GEO",
        payload -> 'dimensions' ->> 'SEX' as "SEX",
        payload -> 'dimensions' ->> 'RP_MEASURE' as "RP_MEASURE",
        payload -> 'dimensions' ->> 'TIME_PERIOD' as "TIME_PERIOD",
        created_at
    from raw_observations
)

select * from population_communes
11 changes: 11 additions & 0 deletions dbt_odis/models/gold/_odis_gold__models.yml
Original file line number Diff line number Diff line change
Expand Up @@ -570,3 +570,14 @@ models:
- name: densite
description: densité de population, nombre d'habitants au kilomètre carré.
data_type: numeric
- name: gold_presentation_population_communes
description: Table contenant le nombre d'habitants par communes. Les grandes villes sont considérées à l'échelle de la commune, pas de l'arrondissement.
columns:
- name: codgeo
description: Code INSEE de la commune (différent du code postal)
data_type: text
tests:
- assert_big_cities_exist
- name: population_totale
description: Population totale de la commune d'après les données INSEE
data_type: float
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{{ config(
    alias = 'vw_presentation_population_communes_gold',
    tags = ['gold', 'population', 'presentation']
) }}

-- Gold model: exposes the silver population model as-is, restricted to the
-- three columns consumers rely on (commune code, total population, year).

with staged as (
    select *
    from {{ ref("stg_presentation_population_communes") }}
)

select
    staged.codgeo,
    staged.population_totale,
    staged.year
from staged
24 changes: 24 additions & 0 deletions dbt_odis/models/silver/stg_presentation_population_communes.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
{{ config(
    materialized = 'view',
    alias = 'vw_presentation_population_communes_stg',
    tags = ['silver', 'population', 'presentation']
) }}

-- Silver model: keeps only commune-level total-population observations from
-- the bronze model and gives them typed, analysis-friendly column names.

with commune_totals as (
    -- Restrict to the POP measure, all sexes and all ages ('_T' on both
    -- dimensions), for GEO values that contain the 'COM-' marker.
    select
        "GEO",
        "value",
        "TIME_PERIOD"
    from {{ ref("presentation_population_communes") }}
    where "RP_MEASURE" = 'POP'
      and "SEX" = '_T'
      and "AGE" = '_T'
      and "GEO" like '%COM-%'
)

select
    -- The commune code is the third '-'-separated segment of GEO.
    split_part("GEO", '-', 3) as codgeo,
    "value"::float as population_totale,
    "TIME_PERIOD"::integer as year
from commune_totals
15 changes: 14 additions & 1 deletion dbt_odis/seeds/_odis_bronze__seeds.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,17 @@ seeds:
Code_FAP86: text
Intitulé_FAP86: text
Code_FAP22: text
Intitulé_FAP22: text
Intitulé_FAP22: text
- name: insee_top_50_communes_par_habitants
description: "50 communes françaises principales par nombre d'habitants."
config:
quote_columns: true
delimiter: ';'
column_types:
AGE: text
GEO: text
GEO_OBJECT: text
RP_MEASURE: text
SEX: text
TIME_PERIOD: float
OBS_VALUE: float
51 changes: 51 additions & 0 deletions dbt_odis/seeds/insee_top_50_communes_par_habitants.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
AGE;GEO;GEO_OBJECT;RP_MEASURE;SEX;TIME_PERIOD;OBS_VALUE
_T;75056;COM;POP;_T;2022;2113705.0
_T;13055;COM;POP;_T;2022;877215.0
_T;69123;COM;POP;_T;2022;520774.0
_T;31555;COM;POP;_T;2022;511684.0
_T;06088;COM;POP;_T;2022;353701.0
_T;44109;COM;POP;_T;2022;325070.0
_T;34172;COM;POP;_T;2022;307101.0
_T;67482;COM;POP;_T;2022;291709.0
_T;33063;COM;POP;_T;2022;265328.0
_T;59350;COM;POP;_T;2022;238695.0
_T;35238;COM;POP;_T;2022;227830.0
_T;83137;COM;POP;_T;2022;180834.0
_T;51454;COM;POP;_T;2022;178478.0
_T;42218;COM;POP;_T;2022;172569.0
_T;76351;COM;POP;_T;2022;166462.0
_T;69266;COM;POP;_T;2022;162207.0
_T;21231;COM;POP;_T;2022;159941.0
_T;49007;COM;POP;_T;2022;157555.0
_T;38185;COM;POP;_T;2022;156389.0
_T;97411;COM;POP;_T;2022;156149.0
_T;30189;COM;POP;_T;2022;150444.0
_T;93066;COM;POP;_T;2022;148907.0
_T;13001;COM;POP;_T;2022;147933.0
_T;63113;COM;POP;_T;2022;147751.0
_T;72181;COM;POP;_T;2022;145182.0
_T;29019;COM;POP;_T;2022;140993.0
_T;37261;COM;POP;_T;2022;138668.0
_T;80021;COM;POP;_T;2022;134780.0
_T;74010;COM;POP;_T;2022;131272.0
_T;87085;COM;POP;_T;2022;129754.0
_T;57463;COM;POP;_T;2022;121695.0
_T;66136;COM;POP;_T;2022;120996.0
_T;92012;COM;POP;_T;2022;120205.0
_T;25056;COM;POP;_T;2022;120057.0
_T;45234;COM;POP;_T;2022;116344.0
_T;76540;COM;POP;_T;2022;116331.0
_T;93048;COM;POP;_T;2022;110758.0
_T;14118;COM;POP;_T;2022;108398.0
_T;95018;COM;POP;_T;2022;107135.0
_T;97415;COM;POP;_T;2022;106220.0
_T;68224;COM;POP;_T;2022;104924.0
_T;54395;COM;POP;_T;2022;104387.0
_T;59512;COM;POP;_T;2022;99507.0
_T;59599;COM;POP;_T;2022;99160.0
_T;92050;COM;POP;_T;2022;98119.0
_T;94081;COM;POP;_T;2022;95282.0
_T;94028;COM;POP;_T;2022;92859.0
_T;84007;COM;POP;_T;2022;91760.0
_T;92004;COM;POP;_T;2022;91457.0
_T;92025;COM;POP;_T;2022;90692.0
23 changes: 23 additions & 0 deletions dbt_odis/tests/generic/test_assert_big_cities_exist.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
{% test assert_big_cities_exist(model, column_name) %}

-- Generic test: returns one row per expected commune code that is absent
-- from `column_name` of the tested model. The test fails if the commune
-- code of Paris, Marseille or Lyon is missing.

with expected_communes (codgeo) as (
    values
        ('75056'),
        ('13055'),
        ('69123')
)

select expected.codgeo
from expected_communes as expected
where not exists (
    -- Any single matching row in the model is enough to satisfy the check.
    select 1
    from {{ model }} as candidate
    where candidate.{{ column_name }} = expected.codgeo
)

{% endtest %}
19 changes: 19 additions & 0 deletions dbt_odis/tests/test_population_main_big_cities.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

-- Singular test: compares the gold population model against the INSEE
-- reference seed of the 50 largest communes for year 2022.
--
-- A row is returned (and the test fails) when a reference commune
--   * has no 2022 row in the gold model, or
--   * has a null or non-positive population in the gold model, or
--   * differs from the reference population by more than 10 % either way.
--
-- Output columns: codgeo, insee_population, model_population, ratio
-- (ratio is null when the model population is missing or zero).

with insee_data as (
    select * from {{ ref('insee_top_50_communes_par_habitants') }}
),

model_2022 as (
    -- Filter on the year BEFORE the outer join so that reference communes
    -- without a match survive the join and can be reported as missing.
    select
        codgeo,
        population_totale
    from {{ ref('gold_presentation_population_communes') }}
    where year = 2022
)

select
    id."GEO" as codgeo,
    id."OBS_VALUE" as insee_population,
    model.population_totale as model_population,
    -- nullif protects the division: SQL does not guarantee that a
    -- "population > 0" predicate is evaluated before the division.
    id."OBS_VALUE" / nullif(model.population_totale, 0) as ratio
from insee_data id
left join model_2022 model
    on id."GEO" = model.codgeo
where model.population_totale is null      -- commune missing from the model
   or model.population_totale <= 0         -- unusable population value
   or id."OBS_VALUE" / nullif(model.population_totale, 0) < 0.9
   or id."OBS_VALUE" / nullif(model.population_totale, 0) > 1.1