@@ -68,21 +68,41 @@ def __init__(self, path: str | Path):
6868 self ._path = Path (path )
6969
7070 @property
71- def files (self ) -> list [str ]:
71+ def path (self ) -> Path :
72+ return self ._path
73+
74+ @property
75+ def files (self ) -> list [Path ]:
76+ if not self .path .exists ():
77+ return []
78+ elif not self .path .is_dir ():
79+ raise NotADirectoryError (
80+ f"Path exists but is not a directory: { self ._path } "
81+ )
82+
7283 files = []
7384 for entry in os .listdir (self ._path ):
7485 full_path = os .path .join (self ._path , entry )
7586 if os .path .isfile (full_path ):
76- files .append (full_path )
87+ files .append (Path ( full_path ) )
7788 return files
7889
7990 @property
8091 def num_files (self ) -> int :
8192 return len (self .files )
8293
8394 @property
84- def dataset_files (self ) -> list [str ]:
85- return glob .glob (f"{ self ._path } /*.parquet" )
95+ def dataset_files (self ) -> list [Path ]:
96+ if not self .path .exists ():
97+ return []
98+ elif not self .path .is_dir ():
99+ raise NotADirectoryError (
100+ f"Path exists but is not a directory: { self ._path } "
101+ )
102+
103+ return [
104+ Path (filepath ) for filepath in glob .glob (f"{ self ._path } /*.parquet" )
105+ ]
86106
87107 @property
88108 def num_dataset_files (self ) -> int :
@@ -92,16 +112,19 @@ def num_dataset_files(self) -> int:
92112 def _generate_config_path (path : str | Path ) -> Path :
93113 return Path (path ) / ".cfg"
94114
95- @staticmethod
96- def _get_dataset_from_path (path : str | Path ) -> ds .Dataset :
97- return ds .dataset (path , format = "parquet" )
98-
99115
100116class CacheReader (CacheFiles ):
101- def __init__ (self , path : str | Path ):
117+ def __init__ (
118+ self ,
119+ path : str | Path ,
120+ batch_size : int ,
121+ rows_per_file : int ,
122+ compression : str ,
123+ ):
102124 self ._path = Path (path )
103- self ._cfg = None
104- self ._dataset = None
125+ self ._batch_size = batch_size
126+ self ._rows_per_file = rows_per_file
127+ self ._compression = compression
105128
106129 # validate path
107130 if not self ._path .exists ():
@@ -111,45 +134,48 @@ def __init__(self, path: str | Path):
111134 f"Path exists but is not a directory: { self ._path } "
112135 )
113136
137+ @classmethod
138+ def load (cls , path : str | Path ):
139+ def _retrieve (config : dict , key : str ):
140+ if value := config .get (key , None ):
141+ return value
142+ raise KeyError (
143+ f"'{ key } ' is not defined within { cls ._generate_config_path (path )} "
144+ )
145+
146+ cfg_path = cls ._generate_config_path (path )
147+ with open (cfg_path , "r" ) as f :
148+ cfg = json .load (f )
149+ batch_size = _retrieve (cfg , "batch_size" )
150+ rows_per_file = _retrieve (cfg , "rows_per_file" )
151+ compression = _retrieve (cfg , "compression" )
152+
153+ return cls (
154+ path = path ,
155+ batch_size = batch_size ,
156+ rows_per_file = rows_per_file ,
157+ compression = compression ,
158+ )
159+
114160 @property
115161 def dataset (self ) -> ds .Dataset :
116- if not self ._dataset :
117- self ._dataset = ds .dataset (
118- self ._path ,
119- format = "parquet" ,
120- )
121- return self ._dataset
162+ return ds .dataset (self ._path , format = "parquet" )
122163
123164 @property
124165 def schema (self ) -> pa .Schema :
125166 return self .dataset .schema
126167
127- @property
128- def config (self ) -> dict :
129- if self ._cfg is None :
130- cfg_path = self ._generate_config_path (self ._path )
131- with open (cfg_path , "r" ) as f :
132- self ._cfg = json .load (f )
133- return self ._cfg
134-
135- def _read_config (self , key : str ):
136- if value := self .config .get (key , None ):
137- return value
138- raise KeyError (
139- f"'{ key } ' is not defined within { self ._generate_config_path (self ._path )} "
140- )
141-
142168 @property
143169 def batch_size (self ) -> int :
144- return int ( self ._read_config ( "batch_size" ))
170+ return self ._batch_size
145171
146172 @property
147173 def rows_per_file (self ) -> int :
148- return int ( self ._read_config ( "rows_per_file" ))
174+ return self ._rows_per_file
149175
150176 @property
151177 def compression (self ) -> str :
152- return str ( self ._read_config ( "compression" ))
178+ return self ._compression
153179
154180
155181class CacheWriter (CacheFiles ):
@@ -209,7 +235,7 @@ def create(
209235 @classmethod
210236 def load (cls , path : str | Path ):
211237 cfg_path = cls ._generate_config_path (path )
212- dataset = cls . _get_dataset_from_path (path )
238+ dataset = ds . dataset (path , format = "parquet" )
213239 with open (cfg_path , "r" ) as f :
214240 cfg = json .load (f )
215241 return cls (
@@ -218,6 +244,23 @@ def load(cls, path: str | Path):
218244 ** cfg ,
219245 )
220246
247+ @classmethod
248+ def delete (cls , path : str | Path ):
249+ path = Path (path )
250+ if not path .exists ():
251+ return
252+ cache = cls .load (path )
253+ # delete config file
254+ cfg_path = cls ._generate_config_path (path )
255+ if cfg_path .exists () and cfg_path .is_file ():
256+ cfg_path .unlink ()
257+ # delete parquet files
258+ for file in cache .dataset_files :
259+ if file .exists () and file .is_file () and file .suffix == ".parquet" :
260+ file .unlink ()
261+ # delete empty cache directory
262+ path .rmdir ()
263+
221264 def write_rows (
222265 self ,
223266 rows : list [dict [str , Any ]],
@@ -286,10 +329,6 @@ def flush(self):
286329 self ._count = 0
287330 self ._close_writer ()
288331
289- def delete (self ):
290- for file in self .files :
291- Path (file ).unlink ()
292-
293332 def _next_filename (self ) -> Path :
294333 files = self .dataset_files
295334 if not files :
0 commit comments