@@ -73,7 +73,13 @@ class PandasCSVParser(BaseParser):
7373 for more information.
7474 Set to empty dict by default, this means pandas will try to figure
7575 out the separators, table head, etc. on its own.
76-
76+
77+ header_period (int): Controls how headers are included in output (default: 20):
78+ - 0: Headers only at the beginning
79+ - 1: Headers in every row
80+ - N > 1: Headers every N rows
81+
82+ header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
7783 """
7884
7985 def __init__ (
@@ -83,6 +89,8 @@ def __init__(
8389 col_joiner : str = ", " ,
8490 row_joiner : str = "\n " ,
8591 pandas_config : dict = {},
92+ header_period : int = 20 ,
93+ header_prefix : str = "HEADERS: " ,
8694 ** kwargs : Any
8795 ) -> None :
8896 """Init params."""
@@ -91,6 +99,8 @@ def __init__(
9199 self ._col_joiner = col_joiner
92100 self ._row_joiner = row_joiner
93101 self ._pandas_config = pandas_config
102+ self ._header_period = header_period
103+ self ._header_prefix = header_prefix
94104
95105 def _init_parser (self ) -> Dict :
96106 """Init parser."""
@@ -104,15 +114,26 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]
104114 raise ValueError ("pandas module is required to read CSV files." )
105115
106116 df = pd .read_csv (file , ** self ._pandas_config )
117+ headers = df .columns .tolist ()
118+ header_row = f"{ self ._header_prefix } { self ._col_joiner .join (headers )} "
119+
120+ if not self ._concat_rows :
121+ return df .apply (
122+ lambda row : (self ._col_joiner ).join (row .astype (str ).tolist ()), axis = 1
123+ ).tolist ()
124+
125+ text_list = []
126+ if self ._header_period != 1 :
127+ text_list .append (header_row )
128+
129+ for i , row in df .iterrows ():
130+ if (self ._header_period > 1 and i > 0 and i % self ._header_period == 0 ):
131+ text_list .append (header_row )
132+ text_list .append (self ._col_joiner .join (row .astype (str ).tolist ()))
133+ if self ._header_period == 1 and i < len (df ) - 1 :
134+ text_list .append (header_row )
107135
108- text_list = df .apply (
109- lambda row : (self ._col_joiner ).join (row .astype (str ).tolist ()), axis = 1
110- ).tolist ()
111-
112- if self ._concat_rows :
113- return (self ._row_joiner ).join (text_list )
114- else :
115- return text_list
136+ return self ._row_joiner .join (text_list )
116137
117138
118139class ExcelParser (BaseParser ):
@@ -138,7 +159,13 @@ class ExcelParser(BaseParser):
138159 for more information.
139160 Set to empty dict by default, this means pandas will try to figure
140161 out the table structure on its own.
141-
162+
163+ header_period (int): Controls how headers are included in output (default: 20):
164+ - 0: Headers only at the beginning
165+ - 1: Headers in every row
166+ - N > 1: Headers every N rows
167+
168+ header_prefix (str): Prefix for header rows. Default is "HEADERS: ".
142169 """
143170
144171 def __init__ (
@@ -148,6 +175,8 @@ def __init__(
148175 col_joiner : str = ", " ,
149176 row_joiner : str = "\n " ,
150177 pandas_config : dict = {},
178+ header_period : int = 20 ,
179+ header_prefix : str = "HEADERS: " ,
151180 ** kwargs : Any
152181 ) -> None :
153182 """Init params."""
@@ -156,6 +185,8 @@ def __init__(
156185 self ._col_joiner = col_joiner
157186 self ._row_joiner = row_joiner
158187 self ._pandas_config = pandas_config
188+ self ._header_period = header_period
189+ self ._header_prefix = header_prefix
159190
160191 def _init_parser (self ) -> Dict :
161192 """Init parser."""
@@ -169,12 +200,22 @@ def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]
169200 raise ValueError ("pandas module is required to read Excel files." )
170201
171202 df = pd .read_excel (file , ** self ._pandas_config )
172-
173- text_list = df .apply (
174- lambda row : (self ._col_joiner ).join (row .astype (str ).tolist ()), axis = 1
175- ).tolist ()
176-
177- if self ._concat_rows :
178- return (self ._row_joiner ).join (text_list )
179- else :
180- return text_list
203+ headers = df .columns .tolist ()
204+ header_row = f"{ self ._header_prefix } { self ._col_joiner .join (headers )} "
205+
206+ if not self ._concat_rows :
207+ return df .apply (
208+ lambda row : (self ._col_joiner ).join (row .astype (str ).tolist ()), axis = 1
209+ ).tolist ()
210+
211+ text_list = []
212+ if self ._header_period != 1 :
213+ text_list .append (header_row )
214+
215+ for i , row in df .iterrows ():
216+ if (self ._header_period > 1 and i > 0 and i % self ._header_period == 0 ):
217+ text_list .append (header_row )
218+ text_list .append (self ._col_joiner .join (row .astype (str ).tolist ()))
219+ if self ._header_period == 1 and i < len (df ) - 1 :
220+ text_list .append (header_row )
221+ return self ._row_joiner .join (text_list )
0 commit comments