
Commit 4f58c42

Merge branch 'main' into nb_menages
2 parents 10c8286 + bb5eb85

86 files changed (+15208 / −2675 lines)


.github/workflows/pre-commit-tests.yaml

Lines changed: 3 additions & 1 deletion

```diff
@@ -11,6 +11,8 @@ jobs:
     steps:
       - uses: actions/checkout@v2
       - uses: actions/setup-python@v2
+        with:
+          python-version: '3.13'
 
       - name: Install poetry
         run: |
@@ -35,4 +37,4 @@ jobs:
         poetry run pytest -v
       - name: Verify tests results
         if: ${{ failure() }}
-        run: exit 1
+        run: exit 1
```

README.md

Lines changed: 14 additions & 0 deletions

````diff
@@ -80,6 +80,20 @@ To understand in detail how it works:
 
 [Understanding the data Extractors](./docs/extract.md)
 
+## Loading
+
+Once the data has been extracted, it must be loaded into the database before running the dbt transformation.
+
+The "load" script is invoked exactly like the extract script above, with the same arguments:
+
+```bash
+# Load every source dataset of the "geographical_references" domain
+poetry run bin/odis.py load --domain geographical_references
+
+# Load only the "regions" and "departements" datasets of the "geographical_references" domain
+poetry run bin/odis.py load --sources geographical_references.regions,geographical_references.departements
+```
+
 ## Probing the available sources
 
 The "explain" option makes it easy to see how the APIs, Domains and Sources are defined in the configuration. When "explain" is passed, the script extracts no data and only prints information about the requested configurations.
````

bin/odis.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -152,7 +152,7 @@ async def extract_data_sources(
     )
 
     for exc in tasks_exceptions:
-        logger.error(f"Error: {exc}")
+        logger.exception(f"Error during extraction: {exc}", exc_info=exc)
 
     # exit with a non-zero status code
     # to indicate that there was an error
```
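
This is the usual pattern for surfacing exceptions collected with `asyncio.gather(..., return_exceptions=True)`: passing `exc_info` attaches the traceback to the log record. A minimal standalone sketch of the same idea; the `fetch` coroutine and its failure are made up for illustration:

```python
import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


async def fetch(name: str) -> str:
    # Hypothetical stand-in for one extraction task.
    if name == "bad":
        raise ValueError(f"cannot extract {name}")
    return name


async def main() -> None:
    # With return_exceptions=True, failures come back as values
    # instead of propagating out of gather().
    results = await asyncio.gather(fetch("ok"), fetch("bad"), return_exceptions=True)
    tasks_exceptions = [r for r in results if isinstance(r, BaseException)]

    for exc in tasks_exceptions:
        # exc_info=exc attaches the exception's traceback to the record,
        # so the full stack trace is logged, not just the message.
        logger.exception(f"Error during extraction: {exc}", exc_info=exc)


asyncio.run(main())
```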

common/data_source_model.py

Lines changed: 9 additions & 0 deletions

```diff
@@ -74,6 +74,7 @@ class DataLoadParameters(BaseModel):
     )
 
 
+
 class DataProcessingParameters(BaseModel):
 
     name: str = Field(
@@ -90,6 +91,7 @@ class DataProcessingParameters(BaseModel):
         Type of the processor.
        The following values are accepted :
         - "notebook" (default)
+        - "python" (python class)
         """,
     )
 
@@ -100,6 +102,13 @@ class DataProcessingParameters(BaseModel):
         """,
     )
 
+    sheets: Optional[list[str]] = Field(
+        default=None,
+        description="""
+        List of sheet names that need to be extracted.
+        """,
+    )
+
     @computed_field
     @property
     def base_path(self) -> Path:
```
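
A minimal sketch of how the new `sheets` field might be configured; the model below is a reduced stand-in keeping only the fields shown in this diff, not the full `DataProcessingParameters`, and the field values are illustrative:

```python
from typing import Optional

from pydantic import BaseModel, Field


class DataProcessingParameters(BaseModel):
    # Reduced stand-in for the real model (only the fields relevant here).
    name: str
    sheets: Optional[list[str]] = Field(
        default=None,
        description="List of sheet names that need to be extracted.",
    )


# Only the listed sheets are read from the workbook; sheets=None keeps
# the previous behaviour of reading everything.
params = DataProcessingParameters(name="menages", sheets=["regions", "departements"])
print(params.sheets)  # ['regions', 'departements']
```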

common/utils/exceptions.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -22,3 +22,11 @@ class InvalidCSV(Exception):
     def __init__(self, message: str):
         super().__init__(message)
         self.message = message
+
+
+class InvalidExcel(Exception):
+    """Exception raised when the Excel file is invalid or not found"""
+
+    def __init__(self, message: str):
+        super().__init__(message)
+        self.message = message
```
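
A usage sketch for the new exception, assuming callers wrap workbook opening; the `read_workbook` helper is hypothetical, and real code should narrow the caught exception types:

```python
import pandas as pd

from common.utils.exceptions import InvalidExcel


def read_workbook(path: str) -> pd.ExcelFile:
    # Hypothetical helper: surface any read failure as InvalidExcel
    # so callers only have to handle one exception type.
    try:
        return pd.ExcelFile(path, engine="openpyxl")
    except Exception as exc:  # e.g. FileNotFoundError, zipfile.BadZipFile
        raise InvalidExcel(f"Cannot read Excel file '{path}': {exc}") from exc
```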

common/utils/file_handler.py

Lines changed: 43 additions & 25 deletions

```diff
@@ -94,14 +94,46 @@ class XlsxReader(FileReader):
     def __init__(self, import_path: str):
         self.import_path = import_path
 
-    def try_load(self, model: DomainModel) -> pd.DataFrame:
-        return pd.read_excel(
+    def try_load(self, model: DomainModel) -> dict[str, pd.DataFrame]:
+        wb = pd.ExcelFile(
             self.import_path,
-            header=model.load_params.header,
-            skipfooter=model.load_params.skipfooter,
-            sep=model.load_params.separator,
-            engine="python",  # Required for skipfooter parameter
-        )
+            engine="openpyxl",
+        )
+
+        preprocess_params = model.preprocessor
+        sheets_list = preprocess_params.sheets
+        sheet_names = wb.sheet_names
+
+        results = {}
+
+        if sheets_list:
+            for sheet_name in [x for x in sheets_list if x in sheet_names]:
+                results[sheet_name] = pd.read_excel(
+                    wb,
+                    sheet_name=sheet_name,
+                    header=model.load_params.header,
+                    skipfooter=model.load_params.skipfooter,
+                    engine="openpyxl",
+                )
+        else:
+            pd_load = pd.read_excel(
+                wb,
+                header=model.load_params.header,
+                skipfooter=model.load_params.skipfooter,
+                engine="openpyxl",
+            )
+
+            if isinstance(pd_load, dict):
+                results = pd_load
+            elif isinstance(pd_load, pd.DataFrame):
+                results["0"] = pd_load
+            else:
+                logger.info(f"Type not recognized: {type(pd_load)}")
+
+        return results
 
 
 class MetadataReader(FileReader):
@@ -365,25 +397,11 @@ def xlsx_load(
         storage_info: StorageInfo,
         model: DomainModel,
     ) -> pd.DataFrame:
-        """Parses an Excel file and returns the data as a pandas dataframe
-
-        TODO:
-        - benchmark usage of pandas vs csv module
-
-        Args:
-            storage_info (StorageInfo) : the info where the file is stored
-            model (DomainModel): the model that generated the data
-
-        Returns:
-            DataFrame: the data from the CSV file as a pandas DataFrame
-
-        Raises:
-            InvalidCSV: if the file is not found or the CSV is invalid
         """
-        raise NotImplementedError(
-            "XLSX file loading is not implemented yet. Please use CSV or JSON files instead."
-        )
-        # _filepath = Path(storage_info.location) / Path(storage_info.file_name)
+        Parses an Excel file and returns the data as a pandas dataframe.
+        """
+        filepath = Path(storage_info.location) / Path(storage_info.file_name)
+        return XlsxReader(filepath).load(model=model)
 
 
     def load_metadata(
         self, model: DomainModel, operation: OperationType
```
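
The change above opens the workbook once with `pd.ExcelFile` and reads selected sheets into a dict of DataFrames instead of a single `pd.read_excel` call. A standalone sketch of the same pandas pattern, with made-up file and sheet names:

```python
import pandas as pd

# Open the workbook once; openpyxl is the engine pandas uses for .xlsx files.
wb = pd.ExcelFile("example.xlsx", engine="openpyxl")

wanted = ["regions", "departements"]

# Read only the requested sheets that actually exist in the workbook,
# keyed by sheet name, mirroring the dict returned by try_load() above.
results: dict[str, pd.DataFrame] = {
    name: pd.read_excel(wb, sheet_name=name)
    for name in wanted
    if name in wb.sheet_names
}

# With sheet_name=None, pandas itself returns a dict of all sheets.
all_sheets = pd.read_excel(wb, sheet_name=None)
```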

common/utils/http/async_client.py

Lines changed: 9 additions & 5 deletions

```diff
@@ -19,26 +19,30 @@ class AsyncHttpClient(HttpClient):
     """
 
     _session: aiohttp.ClientSession
+    _timeout: aiohttp.ClientTimeout
 
-    def __init__(self, max_connections: int = 100):
+    def __init__(self, max_connections: int = 100, timeout: int = 1200):
         """
         Args:
             max_connections (int, optional): The max number of concurrent connections.
                 Defaults to 100.
+            timeout (int, optional): The timeout in seconds for each individual HTTP request.
+                Defaults to 1200 (20 minutes).
         """
 
         conn = aiohttp.TCPConnector(limit=max_connections)
-        self._session = aiohttp.ClientSession(connector=conn)
+        self._timeout = aiohttp.ClientTimeout(total=timeout)
+        self._session = aiohttp.ClientSession(connector=conn, timeout=self._timeout)
 
         logger.debug(
-            f"AsyncHttpClient initialized with max_connections={max_connections}"
+            f"AsyncHttpClient initialized with max_connections={max_connections}, timeout={timeout}s"
         )
 
     @retry(
         retry=retry_if_exception_type(aiohttp.ClientError),
-        stop=(stop_after_delay(180) | stop_after_attempt(3)),
+        stop=(stop_after_delay(2400) | stop_after_attempt(3)),
         before=before_log(logger, logging.DEBUG),
-        reraise=True,  # re-raise the last exception
+        reraise=True,
     )
     async def get(
         self,
```
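
For reference, `aiohttp.ClientTimeout(total=...)` bounds each whole request (connection setup, send, and read), and tenacity's `stop_after_delay | stop_after_attempt` gives up at whichever limit is hit first. A standalone sketch of the same combination, outside the project's `HttpClient` interface and with a placeholder URL:

```python
import asyncio

import aiohttp
from tenacity import retry, retry_if_exception_type, stop_after_attempt, stop_after_delay


@retry(
    retry=retry_if_exception_type(aiohttp.ClientError),
    # Give up after 2400s in total *or* 3 attempts, whichever comes first.
    stop=(stop_after_delay(2400) | stop_after_attempt(3)),
    reraise=True,  # re-raise the last exception instead of tenacity's RetryError
)
async def fetch(url: str) -> str:
    # total= caps the entire request, not just the initial connection.
    timeout = aiohttp.ClientTimeout(total=1200)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            return await resp.text()


if __name__ == "__main__":
    print(asyncio.run(fetch("https://example.com")))
```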

common/utils/interfaces/data_handler.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -83,6 +83,8 @@ def json_load(
 
     def csv_load(self, page_log: PageLog, *args, **kwargs) -> DataFrame: ...
 
+    def xlsx_load(self, page_log: PageLog, *args, **kwargs) -> DataFrame: ...
+
     def load_metadata(
         self, model: DomainModel, operation: OperationType, *args, **kwargs
     ) -> MetadataInfo: ...
```

common/utils/interfaces/loader.py

Lines changed: 18 additions & 26 deletions

```diff
@@ -34,46 +34,38 @@ class Column(BaseModel):
     def sanitize_name(cls, value: str, info: ValidationInfo) -> str:
         """
         Sanitize column names by:
-        1. Replacing spaces with underscores
-        2. Removing accents
-        3. Ensuring the name is SQL-friendly
-        4. remove surrounding quotes
-        5. Ensuring the name starts with a letter
-        6. Converting to lowercase
-        7. Removing any non-alphanumeric characters except underscores
+        - Removing accents
+        - Replacing spaces with underscores
+        - Removing special characters
+        - Forcing lowercase
+        - Ensuring it starts with a letter
+        - Truncating to PostgreSQL's 63-character limit
         """
-        # Normalize unicode characters and remove accents
+        # Normalize and clean
         normalized = (
             unicodedata.normalize("NFKD", value)
             .encode("ascii", "ignore")
             .decode("utf-8")
         )
-        # Replace spaces with underscores
         sanitized = normalized.replace(" ", "_")
-        # Remove any non-alphanumeric characters except underscores
         sanitized = re.sub(r"[^\w]", "", sanitized)
-        # Remove surrounding quotes
-        sanitized = sanitized.strip('"').strip("'")
-        # Remove any leading/trailing whitespace
-        sanitized = sanitized.strip()
-        # Remove any leading/trailing underscores
-        sanitized = sanitized.strip("_")
-
-        # Ensure the name is not empty
+        sanitized = sanitized.strip('"').strip("'").strip("_").strip()
+
         if not sanitized:
-            raise ValueError(f"Invalid column name: {value}")
-        # Ensure the column name starts with a letter
+            raise ValueError(f"Invalid column name: '{value}'")
+
         if not sanitized[0].isalpha():
             sanitized = f"col_{sanitized}"
 
-        return sanitized.lower()
+        sanitized = sanitized.lower()
 
-    def __repr__(self):
-        """pretty print column name
+        # Truncate to 63 characters for PostgreSQL
+        if len(sanitized) > 63:
+            logger.warning(f"Column name '{sanitized}' truncated to 63 characters")
+            sanitized = sanitized[:63]
+
+        return sanitized
 
-        ex: <column_name: TEXT>
-        """
-        return f"<{self.name}: {self.data_type}>"
 
 
 class AbstractDataLoader(ABC):
```
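
To make the rules above concrete, a simplified stand-in for the validator (without the pydantic `cls`/`info` machinery) and a couple of sample inputs:

```python
import re
import unicodedata


def sanitize_column_name(value: str) -> str:
    # Decompose accented characters, then drop the non-ASCII combining marks.
    normalized = (
        unicodedata.normalize("NFKD", value).encode("ascii", "ignore").decode("utf-8")
    )
    sanitized = normalized.replace(" ", "_")
    sanitized = re.sub(r"[^\w]", "", sanitized)  # keep only [A-Za-z0-9_]
    sanitized = sanitized.strip('"').strip("'").strip("_").strip()

    if not sanitized:
        raise ValueError(f"Invalid column name: '{value}'")
    if not sanitized[0].isalpha():
        sanitized = f"col_{sanitized}"

    return sanitized.lower()[:63]  # PostgreSQL truncates identifiers at 63 bytes


print(sanitize_column_name("Numéro de département"))  # numero_de_departement
print(sanitize_column_name("2024 (estimé)"))          # col_2024_estime
```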
