Merge pull request #1722 from asminkarki012/fix/csv-parser-include-headers

dartpain · web-flow · commit 3227b0e69cc7 · 2025-04-04T14:53:45.000+03:00
fix[csv_parser]:missing header
diff --git a/application/parser/file/tabular_parser.py b/application/parser/file/tabular_parser.py
@@ -73,7 +73,13 @@ class PandasCSVParser(BaseParser):
             for more information.
             Set to empty dict by default, this means pandas will try to figure
             out the separators, table head, etc. on its own.
-
+            
+        header_period (int): How headers repeat in concatenated output (default: 20):
+            - 0: Headers only at the beginning
+            - 1: Headers after every row except the last (none at the beginning)
+            - N > 1: Headers at the beginning and again every N rows
+            
+        header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
     """
 
     def __init__(
@@ -83,6 +89,8 @@ def __init__(
             col_joiner: str = ", ",
             row_joiner: str = "\n",
             pandas_config: dict = {},
+            header_period: int = 20,
+            header_prefix: str = "HEADERS: ",
             **kwargs: Any
     ) -> None:
         """Init params."""
@@ -91,6 +99,8 @@ def __init__(
         self._col_joiner = col_joiner
         self._row_joiner = row_joiner
         self._pandas_config = pandas_config
+        self._header_period = header_period
+        self._header_prefix = header_prefix
 
     def _init_parser(self) -> Dict:
         """Init parser."""
@@ -104,15 +114,26 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]
             raise ValueError("pandas module is required to read CSV files.")
 
         df = pd.read_csv(file, **self._pandas_config)
+        headers = df.columns.tolist()
+        header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
+
+        if not self._concat_rows:
+            return df.apply(
+                lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
+            ).tolist()
+        
+        text_list = []
+        if self._header_period != 1:
+            text_list.append(header_row)
+        
+        for i, row in df.iterrows():
+            if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
+                text_list.append(header_row)
+            text_list.append(self._col_joiner.join(row.astype(str).tolist()))
+            if self._header_period == 1 and i < len(df) - 1:
+                text_list.append(header_row)
 
-        text_list = df.apply(
-            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
-        ).tolist()
-
-        if self._concat_rows:
-            return (self._row_joiner).join(text_list)
-        else:
-            return text_list
+        return self._row_joiner.join(text_list)
 
 
 class ExcelParser(BaseParser):
@@ -138,7 +159,13 @@ class ExcelParser(BaseParser):
             for more information.
             Set to empty dict by default, this means pandas will try to figure
             out the table structure on its own.
-
+            
+        header_period (int): How headers repeat in concatenated output (default: 20):
+            - 0: Headers only at the beginning
+            - 1: Headers after every row except the last (none at the beginning)
+            - N > 1: Headers at the beginning and again every N rows
+            
+        header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
     """
 
     def __init__(
@@ -148,6 +175,8 @@ def __init__(
             col_joiner: str = ", ",
             row_joiner: str = "\n",
             pandas_config: dict = {},
+            header_period: int = 20,
+            header_prefix: str = "HEADERS: ",
             **kwargs: Any
     ) -> None:
         """Init params."""
@@ -156,6 +185,8 @@ def __init__(
         self._col_joiner = col_joiner
         self._row_joiner = row_joiner
         self._pandas_config = pandas_config
+        self._header_period = header_period
+        self._header_prefix = header_prefix
 
     def _init_parser(self) -> Dict:
         """Init parser."""
@@ -169,12 +200,22 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]
             raise ValueError("pandas module is required to read Excel files.")
 
         df = pd.read_excel(file, **self._pandas_config)
-
-        text_list = df.apply(
-            lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
-        ).tolist()
-
-        if self._concat_rows:
-            return (self._row_joiner).join(text_list)
-        else:
-            return text_list
+        headers = df.columns.tolist()
+        header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
+        
+        if not self._concat_rows:
+            return df.apply(
+                lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
+            ).tolist()
+        
+        text_list = []
+        if self._header_period != 1:
+            text_list.append(header_row)
+
+        for i, row in df.iterrows():
+            if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
+                text_list.append(header_row)
+            text_list.append(self._col_joiner.join(row.astype(str).tolist()))
+            if self._header_period == 1 and i < len(df) - 1:
+                text_list.append(header_row)
+        return self._row_joiner.join(text_list)