Skip to content

Commit 3227b0e

Browse files
authored
Merge pull request #1722 from asminkarki012/fix/csv-parser-include-headers
fix[csv_parser]:missing header
2 parents 94c7bba + 57a6fb3 commit 3227b0e

File tree

1 file changed

+60
-19
lines changed

1 file changed

+60
-19
lines changed

application/parser/file/tabular_parser.py

Lines changed: 60 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,13 @@ class PandasCSVParser(BaseParser):
7373
for more information.
7474
Set to empty dict by default, this means pandas will try to figure
7575
out the separators, table head, etc. on its own.
76-
76+
77+
header_period (int): Controls how headers are included in output. Default is 20:
78+
- 0: Headers only at the beginning
79+
- 1: Header row after every data row except the last (none before the first row)
80+
- N > 1: Headers every N rows
81+
82+
header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
7783
"""
7884

7985
def __init__(
@@ -83,6 +89,8 @@ def __init__(
8389
col_joiner: str = ", ",
8490
row_joiner: str = "\n",
8591
pandas_config: dict = {},
92+
header_period: int = 20,
93+
header_prefix: str = "HEADERS: ",
8694
**kwargs: Any
8795
) -> None:
8896
"""Init params."""
@@ -91,6 +99,8 @@ def __init__(
9199
self._col_joiner = col_joiner
92100
self._row_joiner = row_joiner
93101
self._pandas_config = pandas_config
102+
self._header_period = header_period
103+
self._header_prefix = header_prefix
94104

95105
def _init_parser(self) -> Dict:
96106
"""Init parser."""
@@ -104,15 +114,26 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]
104114
raise ValueError("pandas module is required to read CSV files.")
105115

106116
df = pd.read_csv(file, **self._pandas_config)
117+
headers = df.columns.tolist()
118+
header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
119+
120+
if not self._concat_rows:
121+
return df.apply(
122+
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
123+
).tolist()
124+
125+
text_list = []
126+
if self._header_period != 1:
127+
text_list.append(header_row)
128+
129+
for i, row in df.iterrows():
130+
if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
131+
text_list.append(header_row)
132+
text_list.append(self._col_joiner.join(row.astype(str).tolist()))
133+
if self._header_period == 1 and i < len(df) - 1:
134+
text_list.append(header_row)
107135

108-
text_list = df.apply(
109-
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
110-
).tolist()
111-
112-
if self._concat_rows:
113-
return (self._row_joiner).join(text_list)
114-
else:
115-
return text_list
136+
return self._row_joiner.join(text_list)
116137

117138

118139
class ExcelParser(BaseParser):
@@ -138,7 +159,13 @@ class ExcelParser(BaseParser):
138159
for more information.
139160
Set to empty dict by default, this means pandas will try to figure
140161
out the table structure on its own.
141-
162+
163+
header_period (int): Controls how headers are included in output. Default is 20:
164+
- 0: Headers only at the beginning
165+
- 1: Header row after every data row except the last (none before the first row)
166+
- N > 1: Headers every N rows
167+
168+
header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
142169
"""
143170

144171
def __init__(
@@ -148,6 +175,8 @@ def __init__(
148175
col_joiner: str = ", ",
149176
row_joiner: str = "\n",
150177
pandas_config: dict = {},
178+
header_period: int = 20,
179+
header_prefix: str = "HEADERS: ",
151180
**kwargs: Any
152181
) -> None:
153182
"""Init params."""
@@ -156,6 +185,8 @@ def __init__(
156185
self._col_joiner = col_joiner
157186
self._row_joiner = row_joiner
158187
self._pandas_config = pandas_config
188+
self._header_period = header_period
189+
self._header_prefix = header_prefix
159190

160191
def _init_parser(self) -> Dict:
161192
"""Init parser."""
@@ -169,12 +200,22 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]
169200
raise ValueError("pandas module is required to read Excel files.")
170201

171202
df = pd.read_excel(file, **self._pandas_config)
172-
173-
text_list = df.apply(
174-
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
175-
).tolist()
176-
177-
if self._concat_rows:
178-
return (self._row_joiner).join(text_list)
179-
else:
180-
return text_list
203+
headers = df.columns.tolist()
204+
header_row = f"{self._header_prefix}{self._col_joiner.join(headers)}"
205+
206+
if not self._concat_rows:
207+
return df.apply(
208+
lambda row: (self._col_joiner).join(row.astype(str).tolist()), axis=1
209+
).tolist()
210+
211+
text_list = []
212+
if self._header_period != 1:
213+
text_list.append(header_row)
214+
215+
for i, row in df.iterrows():
216+
if (self._header_period > 1 and i > 0 and i % self._header_period == 0):
217+
text_list.append(header_row)
218+
text_list.append(self._col_joiner.join(row.astype(str).tolist()))
219+
if self._header_period == 1 and i < len(df) - 1:
220+
text_list.append(header_row)
221+
return self._row_joiner.join(text_list)

0 commit comments

Comments
 (0)