
Commit 4f58c42

Merge branch 'main' into nb_menages
2 parents 10c8286 + bb5eb85

86 files changed (+15208 / −2675 lines)


.github/workflows/pre-commit-tests.yaml

Lines changed: 3 additions & 1 deletion

```diff
@@ -11,6 +11,8 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
+        with:
+          python-version: '3.13'
 
       - name: Install poetry
         run: |
@@ -35,4 +37,4 @@ jobs:
         poetry run pytest -v
       - name: Verify tests results
         if: ${{ failure() }}
-        run: exit 1
+        run: exit 1
```

README.md

Lines changed: 14 additions & 0 deletions

````diff
@@ -80,6 +80,20 @@ To understand in detail how it works:
 
 [Understanding the data Extractors](./docs/extract.md)
 
+## Loading
+
+Once the data has been extracted, it must be loaded into the database before running the dbt transformation.
+
+The "load" script is invoked exactly like the extract script above, with the same arguments:
+
+```bash
+# Load every source dataset of the "geographical_references" domain
+poetry run bin/odis.py load --domain geographical_references
+
+# Load only the "regions" and "departements" datasets of the "geographical_references" domain
+poetry run bin/odis.py load --sources geographical_references.regions,geographical_references.departements
+```
+
 ## Probing the available sources
 
 The "explain" option makes it easy to see how the APIs, Domains and Sources are defined in the configuration. When "explain" is passed, the script extracts no data and only prints information about the requested configurations.
````

bin/odis.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -152,7 +152,7 @@ async def extract_data_sources(
     )
 
     for exc in tasks_exceptions:
-        logger.error(f"Error: {exc}")
+        logger.exception(f"Error during extraction: {exc}", exc_info=exc)
 
     # exit with a non-zero status code
     # to indicate that there was an error
```
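
This is the usual pattern for surfacing exceptions collected with `asyncio.gather(..., return_exceptions=True)`: passing `exc_info` attaches the traceback to the log record. A minimal standalone sketch of the same idea; the `fetch` coroutine and its failure are made up for illustration:

```python
import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def fetch(name: str) -> str:
    # Hypothetical stand-in for one extraction task.
    if name == "bad":
        raise ValueError(f"cannot extract {name}")
    return name


async def main() -> None:
    # With return_exceptions=True, failures come back as values
    # instead of propagating out of gather().
    results = await asyncio.gather(fetch("ok"), fetch("bad"), return_exceptions=True)
    tasks_exceptions = [r for r in results if isinstance(r, BaseException)]

    for exc in tasks_exceptions:
        # exc_info=exc attaches the exception's traceback to the record,
        # so the full stack trace is logged, not just the message.
        logger.exception(f"Error during extraction: {exc}", exc_info=exc)


asyncio.run(main())
```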

common/data_source_model.py

Lines changed: 9 additions & 0 deletions

```diff
@@ -74,6 +74,7 @@ class DataLoadParameters(BaseModel):
     )
 
 
+
 class DataProcessingParameters(BaseModel):
 
     name: str = Field(
@@ -90,6 +91,7 @@ class DataProcessingParameters(BaseModel):
         Type of the processor.
        The following values are accepted :
         - "notebook" (default)
+        - "python" (python class)
         """,
     )
 
@@ -100,6 +102,13 @@ class DataProcessingParameters(BaseModel):
         """,
     )
 
+    sheets: Optional[list[str]] = Field(
+        default=None,
+        description="""
+        List of sheet names that need to be extracted.
+        """,
+    )
+
     @computed_field
     @property
     def base_path(self) -> Path:
```
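
A minimal sketch of how the new `sheets` field might be configured; the model below is a reduced stand-in keeping only the fields shown in this diff, not the full `DataProcessingParameters`, and the field values are illustrative:

```python
from typing import Optional

from pydantic import BaseModel, Field


class DataProcessingParameters(BaseModel):
    # Reduced stand-in for the real model (only the fields relevant here).
    name: str
    sheets: Optional[list[str]] = Field(
        default=None,
        description="List of sheet names that need to be extracted.",
    )


# Only the listed sheets are read from the workbook; sheets=None keeps
# the previous behaviour of reading everything.
params = DataProcessingParameters(name="menages", sheets=["regions", "departements"])
print(params.sheets)  # ['regions', 'departements']
```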

common/utils/exceptions.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -22,3 +22,11 @@ class InvalidCSV(Exception):
     def __init__(self, message: str):
         super().__init__(message)
         self.message = message
+
+
+class InvalidExcel(Exception):
+    """Exception raised when the Excel file is invalid or not found"""
+
+    def __init__(self, message: str):
+        super().__init__(message)
+        self.message = message
```
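
A usage sketch for the new exception, assuming callers wrap workbook opening; the `read_workbook` helper is hypothetical, and real code should narrow the caught exception types:

```python
import pandas as pd

from common.utils.exceptions import InvalidExcel


def read_workbook(path: str) -> pd.ExcelFile:
    # Hypothetical helper: surface any read failure as InvalidExcel
    # so callers only have to handle one exception type.
    try:
        return pd.ExcelFile(path, engine="openpyxl")
    except Exception as exc:  # e.g. FileNotFoundError, zipfile.BadZipFile
        raise InvalidExcel(f"Cannot read Excel file '{path}': {exc}") from exc
```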

common/utils/file_handler.py

Lines changed: 43 additions & 25 deletions

```diff
@@ -94,14 +94,46 @@ class XlsxReader(FileReader):
     def __init__(self, import_path: str):
         self.import_path = import_path
 
-    def try_load(self, model: DomainModel) -> pd.DataFrame:
-        return pd.read_excel(
+    def try_load(self, model: DomainModel) -> dict[str, pd.DataFrame]:
+        wb = pd.ExcelFile(
             self.import_path,
-            header=model.load_params.header,
-            skipfooter=model.load_params.skipfooter,
-            sep=model.load_params.separator,
-            engine="python",  # Required for skipfooter parameter
-        )
+            engine="openpyxl",
+        )
+
+        preprocess_params = model.preprocessor
+        sheets_list = preprocess_params.sheets
+        sheet_names = wb.sheet_names
+
+        results = {}
+
+        if sheets_list:
+            for sheet_name in [x for x in sheets_list if x in sheet_names]:
+                results[sheet_name] = pd.read_excel(
+                    wb,
+                    sheet_name=sheet_name,
+                    header=model.load_params.header,
+                    skipfooter=model.load_params.skipfooter,
+                    engine="openpyxl",
+                )
+        else:
+            pd_load = pd.read_excel(
+                wb,
+                header=model.load_params.header,
+                skipfooter=model.load_params.skipfooter,
+                engine="openpyxl",
+            )
+
+            if isinstance(pd_load, dict):
+                results = pd_load
+            elif isinstance(pd_load, pd.DataFrame):
+                results["0"] = pd_load
+            else:
+                logger.info(f"Type not recognized: {type(pd_load)}")
+
+        return results
 
 
 class MetadataReader(FileReader):
@@ -365,25 +397,11 @@ def xlsx_load(
         storage_info: StorageInfo,
         model: DomainModel,
     ) -> pd.DataFrame:
-        """Parses an Excel file and returns the data as a pandas dataframe
-
-        TODO:
-        - benchmark usage of pandas vs csv module
-
-        Args:
-            storage_info (StorageInfo) : the info where the file is stored
-            model (DomainModel): the model that generated the data
-
-        Returns:
-            DataFrame: the data from the CSV file as a pandas DataFrame
-
-        Raises:
-            InvalidCSV: if the file is not found or the CSV is invalid
         """
-        raise NotImplementedError(
-            "XLSX file loading is not implemented yet. Please use CSV or JSON files instead."
-        )
-        # _filepath = Path(storage_info.location) / Path(storage_info.file_name)
+        Parses an Excel file and returns the data as a pandas dataframe.
+        """
+        filepath = Path(storage_info.location) / Path(storage_info.file_name)
+        return XlsxReader(filepath).load(model=model)
 
 
     def load_metadata(
         self, model: DomainModel, operation: OperationType
```
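
The change above opens the workbook once with `pd.ExcelFile` and reads selected sheets into a dict of DataFrames instead of a single `pd.read_excel` call. A standalone sketch of the same pandas pattern, with made-up file and sheet names:

```python
import pandas as pd

# Open the workbook once; openpyxl is the engine pandas uses for .xlsx files.
wb = pd.ExcelFile("example.xlsx", engine="openpyxl")

wanted = ["regions", "departements"]

# Read only the requested sheets that actually exist in the workbook,
# keyed by sheet name, mirroring the dict returned by try_load() above.
results: dict[str, pd.DataFrame] = {
    name: pd.read_excel(wb, sheet_name=name)
    for name in wanted
    if name in wb.sheet_names
}

# With sheet_name=None, pandas itself returns a dict of all sheets.
all_sheets = pd.read_excel(wb, sheet_name=None)
```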

common/utils/http/async_client.py

Lines changed: 9 additions & 5 deletions

```diff
@@ -19,26 +19,30 @@ class AsyncHttpClient(HttpClient):
     """
 
     _session: aiohttp.ClientSession
+    _timeout: aiohttp.ClientTimeout
 
-    def __init__(self, max_connections: int = 100):
+    def __init__(self, max_connections: int = 100, timeout: int = 1200):
         """
         Args:
             max_connections (int, optional): The max number of concurrent connections.
                 Defaults to 100.
+            timeout (int, optional): The timeout in seconds for each individual HTTP request.
+                Defaults to 1200 (20 minutes).
         """
 
         conn = aiohttp.TCPConnector(limit=max_connections)
-        self._session = aiohttp.ClientSession(connector=conn)
+        self._timeout = aiohttp.ClientTimeout(total=timeout)
+        self._session = aiohttp.ClientSession(connector=conn, timeout=self._timeout)
 
         logger.debug(
-            f"AsyncHttpClient initialized with max_connections={max_connections}"
+            f"AsyncHttpClient initialized with max_connections={max_connections}, timeout={timeout}s"
         )
 
     @retry(
         retry=retry_if_exception_type(aiohttp.ClientError),
-        stop=(stop_after_delay(180) | stop_after_attempt(3)),
+        stop=(stop_after_delay(2400) | stop_after_attempt(3)),
         before=before_log(logger, logging.DEBUG),
-        reraise=True,  # re-raise the last exception
+        reraise=True,
     )
     async def get(
         self,
```
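
For reference, `aiohttp.ClientTimeout(total=...)` bounds each whole request (connection setup, send, and read), and tenacity's `stop_after_delay | stop_after_attempt` gives up at whichever limit is hit first. A standalone sketch of the same combination, outside the project's `HttpClient` interface and with a placeholder URL:

```python
import asyncio

import aiohttp
from tenacity import retry, retry_if_exception_type, stop_after_attempt, stop_after_delay


@retry(
    retry=retry_if_exception_type(aiohttp.ClientError),
    # Give up after 2400s in total *or* 3 attempts, whichever comes first.
    stop=(stop_after_delay(2400) | stop_after_attempt(3)),
    reraise=True,  # re-raise the last exception instead of tenacity's RetryError
)
async def fetch(url: str) -> str:
    # total= caps the entire request, not just the initial connection.
    timeout = aiohttp.ClientTimeout(total=1200)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            return await resp.text()


if __name__ == "__main__":
    print(asyncio.run(fetch("https://example.com")))
```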

common/utils/interfaces/data_handler.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -83,6 +83,8 @@ def json_load(
 
     def csv_load(self, page_log: PageLog, *args, **kwargs) -> DataFrame: ...
 
+    def xlsx_load(self, page_log: PageLog, *args, **kwargs) -> DataFrame: ...
+
     def load_metadata(
         self, model: DomainModel, operation: OperationType, *args, **kwargs
     ) -> MetadataInfo: ...
```

common/utils/interfaces/loader.py

Lines changed: 18 additions & 26 deletions

```diff
@@ -34,46 +34,38 @@ class Column(BaseModel):
     def sanitize_name(cls, value: str, info: ValidationInfo) -> str:
         """
         Sanitize column names by:
-        1. Replacing spaces with underscores
-        2. Removing accents
-        3. Ensuring the name is SQL-friendly
-        4. remove surrounding quotes
-        5. Ensuring the name starts with a letter
-        6. Converting to lowercase
-        7. Removing any non-alphanumeric characters except underscores
+        - Removing accents
+        - Replacing spaces with underscores
+        - Removing special characters
+        - Forcing lowercase
+        - Ensuring it starts with a letter
+        - Truncating to PostgreSQL's 63-character limit
         """
-        # Normalize unicode characters and remove accents
+        # Normalize and clean
         normalized = (
             unicodedata.normalize("NFKD", value)
             .encode("ascii", "ignore")
             .decode("utf-8")
         )
-        # Replace spaces with underscores
         sanitized = normalized.replace(" ", "_")
-        # Remove any non-alphanumeric characters except underscores
         sanitized = re.sub(r"[^\w]", "", sanitized)
-        # Remove surrounding quotes
-        sanitized = sanitized.strip('"').strip("'")
-        # Remove any leading/trailing whitespace
-        sanitized = sanitized.strip()
-        # Remove any leading/trailing underscores
-        sanitized = sanitized.strip("_")
-
-        # Ensure the name is not empty
+        sanitized = sanitized.strip('"').strip("'").strip("_").strip()
+
         if not sanitized:
-            raise ValueError(f"Invalid column name: {value}")
-        # Ensure the column name starts with a letter
+            raise ValueError(f"Invalid column name: '{value}'")
+
         if not sanitized[0].isalpha():
             sanitized = f"col_{sanitized}"
 
-        return sanitized.lower()
+        sanitized = sanitized.lower()
 
-    def __repr__(self):
-        """pretty print column name
+        # Truncate to 63 characters for PostgreSQL
+        if len(sanitized) > 63:
+            logger.warning(f"Column name '{sanitized}' truncated to 63 characters")
+            sanitized = sanitized[:63]
+
+        return sanitized
 
-        ex: <column_name: TEXT>
-        """
-        return f"<{self.name}: {self.data_type}>"
 
 
 class AbstractDataLoader(ABC):
```
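
To make the rules above concrete, a simplified stand-in for the validator (without the pydantic `cls`/`info` machinery) and a couple of sample inputs:

```python
import re
import unicodedata


def sanitize_column_name(value: str) -> str:
    # Decompose accented characters, then drop the non-ASCII combining marks.
    normalized = (
        unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("utf-8")
    )
    sanitized = normalized.replace(" ", "_")
    sanitized = re.sub(r"[^\w]", "", sanitized)  # keep only [A-Za-z0-9_]
    sanitized = sanitized.strip('"').strip("'").strip("_").strip()

    if not sanitized:
        raise ValueError(f"Invalid column name: '{value}'")
    if not sanitized[0].isalpha():
        sanitized = f"col_{sanitized}"

    return sanitized.lower()[:63]  # PostgreSQL truncates identifiers at 63 bytes


print(sanitize_column_name("Numéro de département"))  # numero_de_departement
print(sanitize_column_name("2024 (estimé)"))          # col_2024_estime
```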
