def profile(fun):
    """
    Return ``fun`` unchanged.

    Usable as a decorator that has no effect on the decorated callable.
    NOTE(review): presumably a fallback for when no real ``profile``
    decorator is available -- confirm against the surrounding code.
    """
    return fun
1717
def format_bytes(size):
    """
    Render a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit is reached, walking through B, KB, MB, GB and finally TB. Values
    of 1024 TB or more stay expressed in TB.

    Negative values (e.g. a shrinking memory delta) are scaled by their
    magnitude and keep their sign, so ``-2048`` becomes ``"-2.00 KB"``
    rather than ``"-2048.00 B"``.

    Args:
        size: Number of bytes (int or float).

    Returns:
        A string such as ``"1.50 KB"``.
    """
    # Scale on the magnitude: a negative number is always < 1024, so
    # comparing the signed value would stop at "B" for every negative input.
    sign = "-" if size < 0 else ""
    magnitude = abs(size)
    for unit in ("B", "KB", "MB", "GB"):
        if magnitude < 1024.0:
            return f"{sign}{magnitude:.2f} {unit}"
        magnitude /= 1024.0
    return f"{sign}{magnitude:.2f} TB"
24+
1825# selection of scores with low cross-correlation for metabolomics scoring
1926def use_metabolomics_scores ():
2027 return [
@@ -112,8 +119,12 @@ def create_index_if_not_exists(con, index_name, table_name, column_name):
112119
113120
114121def is_parquet_file (file_path ):
122+ '''
123+ Check if the file is a valid Parquet file.
124+ '''
115125 import pyarrow .parquet as pq
116126 from pyarrow .lib import ArrowInvalid , ArrowIOError
127+
117128 # First check extension
118129 if not os .path .splitext (file_path )[1 ].lower () in ('.parquet' , '.pq' ):
119130 return False
@@ -124,6 +135,29 @@ def is_parquet_file(file_path):
124135 return True
125136 except (ArrowInvalid , ArrowIOError , OSError ):
126137 return False
138+
def is_valid_split_parquet_dir(path):
    """
    Report whether ``path`` is a directory holding the split parquet pair.

    Returns True only when ``path`` is a directory in which both
    ``precursors_features.parquet`` and ``transition_features.parquet``
    exist as regular files and each one passes ``is_parquet_file``.
    Returns False in every other case.
    """
    if not os.path.isdir(path):
        return False

    candidates = (
        os.path.join(path, name)
        for name in ("precursors_features.parquet", "transition_features.parquet")
    )
    # all() stops at the first missing or invalid file, checking in the
    # listed order; the parquet check only runs on paths that are files.
    return all(
        os.path.isfile(candidate) and is_parquet_file(candidate)
        for candidate in candidates
    )
160+
127161
128162def get_parquet_column_names (file_path ):
129163 """
0 commit comments