@@ -93,14 +93,19 @@ def _str_items(items_list):
9393
9494
9595def read_tfs (
96- tfs_file_path : Union [pathlib .Path , str ],
97- index : str = None ,
98- non_unique_behavior : str = "warn" ,
96+ tfs_file_path : Union [pathlib .Path , str ], index : str = None , non_unique_behavior : str = "warn"
9997) -> TfsDataFrame :
10098 """
10199 Parses the TFS table present in **tfs_file_path** and returns a customized version of a Pandas
102100 DataFrame (a TfsDataFrame).
103101
102+ Methodology: This function parses the first lines of the file until it gets to the `types` line.
103+ While parsing, the appropriate information is gathered (headers content, column names & types,
104+ number of lines parsed). After reaching the types line, the rest of the file is handed
105+ to ``pandas.read_csv`` with the right options to make use of its C engine's speed. After this,
106+ conversion to ``TfsDataFrame`` is made, proper types are applied to columns, the index is set and
107+ the frame is validated before being returned.
108+
104109 Args:
105110 tfs_file_path (Union[pathlib.Path, str]): PosixPath object to the TFS file to read. Can be
106111 a string, in which case it will be cast to a PosixPath object.
@@ -109,55 +114,69 @@ def read_tfs(
109114 non_unique_behavior (str): behavior to adopt if non-unique indices or columns are found in the
110115 dataframe. Accepts **warn** and **raise** as values, case-insensitively, which dictates
111116 to respectively issue a warning or raise an error if non-unique elements are found.
117+
112118 Returns:
113- A TfsDataFrame object.
119+ A TfsDataFrame object with the loaded data from the file.
114120 """
115121 tfs_file_path = pathlib .Path (tfs_file_path )
116122 headers = OrderedDict ()
117- rows_list = []
123+ non_data_lines : int = 0
118124 column_names = column_types = None
119125
120126 LOGGER .debug (f"Reading path: { tfs_file_path .absolute ()} " )
121127 with tfs_file_path .open ("r" ) as tfs_data :
122128 for line in tfs_data :
129+ non_data_lines += 1
123130 line_components = shlex .split (line )
124131 if not line_components :
125132 continue
126133 if line_components [0 ] == HEADER :
127134 name , value = _parse_header (line_components [1 :])
128135 headers [name ] = value
129136 elif line_components [0 ] == NAMES :
130- LOGGER .debug ("Setting column names." )
137+ LOGGER .debug ("Parsing column names." )
131138 column_names = np .array (line_components [1 :])
132139 elif line_components [0 ] == TYPES :
133- LOGGER .debug ("Setting column types." )
140+ LOGGER .debug ("Parsing column types." )
134141 column_types = _compute_types (line_components [1 :])
135142 elif line_components [0 ] == COMMENTS :
136143 continue
137- else :
138- if column_names is None :
139- LOGGER .error (f"No column names in file { tfs_file_path .absolute ()} , aborting" )
140- raise TfsFormatError ("Column names have not been set." )
141- if column_types is None :
142- LOGGER .error (f"No column types in file { tfs_file_path .absolute ()} , aborting" )
143- raise TfsFormatError ("Column types have not been set." )
144- line_components = [part .strip ('"' ) for part in line_components ]
145- rows_list .append (line_components )
146- data_frame = _create_data_frame (column_names , column_types , rows_list , headers )
147-
148- if index : # Use given column as index
149- data_frame = data_frame .set_index (index )
150- else : # Try to find Index automatically
151- index_column = [colname for colname in data_frame .columns if colname .startswith (INDEX_ID )]
152- if index_column :
153- data_frame = data_frame .set_index (index_column )
154- index_name = index_column [0 ].replace (INDEX_ID , "" )
155- if index_name == "" :
156- index_name = None # to remove it completely (Pandas makes a difference)
157- data_frame = data_frame .rename_axis (index_name )
158-
159- _validate (data_frame , f"from file { tfs_file_path .absolute ()} " , non_unique_behavior )
160- return data_frame
144+ else : # After all previous cases, only data lines should remain. If not, the file is malformed.
145+ break # Break to not go over all lines, saves a lot of time on big files
146+
147+ if column_names is None :
148+ LOGGER .error (f"No column names in file { tfs_file_path .absolute ()} , aborting" )
149+ raise TfsFormatError ("Column names have not been set." )
150+ if column_types is None :
151+ LOGGER .error (f"No column types in file { tfs_file_path .absolute ()} , aborting" )
152+ raise TfsFormatError ("Column types have not been set." )
153+
154+ LOGGER .debug ("Parsing data part of the file" )
155+ # DO NOT use comment=COMMENTS in here, if you do and the symbol is in an element for some
156+ # reason then the entire parsing will crash
157+ data_frame = pd .read_csv (
158+ tfs_file_path ,
159+ engine = "c" , # faster, and we do not need the features of the python engine
160+ skiprows = non_data_lines - 1 , # because we incremented for the first data line in loop above
161+ delim_whitespace = True , # understands ' ' is our delimiter
162+ skipinitialspace = True , # understands a single space and a run of several spaces are both valid delimiters
163+ quotechar = '"' , # elements surrounded by " are one entry -> correct parsing of strings with spaces
164+ names = column_names , # column names we have determined, avoids using first read row for columns
165+ )
166+
167+ LOGGER .debug ("Converting to TfsDataFrame" )
168+ tfs_data_frame = TfsDataFrame (data_frame , headers = headers )
169+ _assign_column_types (tfs_data_frame , column_names , column_types ) # ensure proper types
170+
171+ if index :
172+ LOGGER .debug (f"Setting '{ index } ' column as index" )
173+ tfs_data_frame = tfs_data_frame .set_index (index )
174+ else :
175+ LOGGER .debug ("Attempting to find index identifier in columns" )
176+ tfs_data_frame = _find_and_set_index (tfs_data_frame )
177+
178+ _validate (tfs_data_frame , f"from file { tfs_file_path .absolute ()} " , non_unique_behavior )
179+ return tfs_data_frame
161180
162181
163182def write_tfs (
@@ -334,6 +353,27 @@ def quote_strings(s):
334353 return data_frame
335354
336355
356+ def _find_and_set_index (data_frame : TfsDataFrame ) -> TfsDataFrame :
357+ """
358+ Looks for a column with a name starting with the index identifier, and sets it as index if found.
359+ The index identifier will be stripped from the column name first.
360+
361+ Args:
362+ data_frame (TfsDataFrame): the TfsDataFrame to look for an index in.
363+
364+ Returns:
365+ The TfsDataFrame after operation, whether an index was found or not.
366+ """
367+ index_column = [colname for colname in data_frame .columns if colname .startswith (INDEX_ID )]
368+ if index_column :
369+ data_frame = data_frame .set_index (index_column )
370+ index_name = index_column [0 ].replace (INDEX_ID , "" )
371+ if index_name == "" :
372+ index_name = None # to remove it completely (Pandas makes a difference)
373+ data_frame = data_frame .rename_axis (index = index_name )
374+ return data_frame
375+
376+
337377def _create_data_frame (column_names , column_types , rows_list , headers ) -> TfsDataFrame :
338378 data = np .array (rows_list ) if rows_list else None # case of empty dataframe
339379 tfs_data_frame = TfsDataFrame (data = data , columns = column_names , headers = headers )
0 commit comments