@@ -86,9 +86,7 @@ async def wait_for(coros, name=""):
     # wrap the coro in a task to work with python 3.10 and 3.11+ where asyncio.wait semantics
     # changed to not accept any awaitable
     start = time.time()
-    done, _ = await asyncio.wait(
-        [asyncio.create_task(_ensure_coro(c)) for c in coros]
-    )
+    done, _ = await asyncio.wait([asyncio.create_task(_ensure_coro(c)) for c in coros])
     end = time.time()
     log.info(f"waiting for {name} took {end - start}s")
     for d in done:
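Context for this hunk: the comment refers to the fact that passing bare coroutines to `asyncio.wait` was deprecated in Python 3.8 and removed in 3.11, so each coroutine must be wrapped in a task first. A minimal standalone sketch of the same pattern (illustrative names, without the module's `_ensure_coro` helper):

```python
import asyncio

async def work(i: int) -> int:
    await asyncio.sleep(0.01)
    return i

async def main():
    # asyncio.wait() no longer accepts bare coroutines on 3.11+,
    # so wrap each one in a Task first (also valid on 3.10).
    done, _pending = await asyncio.wait([asyncio.create_task(work(i)) for i in range(3)])
    print(sorted(t.result() for t in done))

asyncio.run(main())
```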
@@ -166,9 +164,7 @@ async def acquire(self, need=1):
         need_to_make = need - have

         if need_to_make > can_make:
-            raise Exception(
-                f"Cannot allocate workers above {self.max_workers}"
-            )
+            raise Exception(f"Cannot allocate workers above {self.max_workers}")

         if need_to_make > 0:
             log.debug(f"creating {need_to_make} additional processors")
@@ -197,9 +193,9 @@ def _new_processor(self):
         self.processors_ready.clear()
         processor_key = new_friendly_name()
         log.debug(f"starting processor: {processor_key}")
-        processor = DFRayProcessor.options(
-            name=f"Processor : {processor_key}"
-        ).remote(processor_key)
+        processor = DFRayProcessor.options(name=f"Processor : {processor_key}").remote(
+            processor_key
+        )
         self.pool[processor_key] = processor
         self.processors_started.add(processor.start_up.remote())
         self.available.add(processor_key)
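For reviewers less familiar with Ray, the reflowed call above is Ray's standard named-actor creation pattern: `.options(name=...)` assigns a name the actor can later be retrieved by, and `.remote(...)` passes constructor arguments. A minimal sketch with an illustrative actor class (not the actual `DFRayProcessor`):

```python
import ray

@ray.remote
class Processor:
    def __init__(self, key: str):
        self.key = key

    def start_up(self) -> str:
        return f"{self.key} ready"

ray.init()
# Naming the actor lets it be fetched later with ray.get_actor("Processor : demo").
proc = Processor.options(name="Processor : demo").remote("demo")
print(ray.get(proc.start_up.remote()))  # -> "demo ready"
```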
@@ -248,9 +244,7 @@ async def _wait_for_serve(self):

     async def all_done(self):
         log.info("calling processor all done")
-        refs = [
-            processor.all_done.remote() for processor in self.pool.values()
-        ]
+        refs = [processor.all_done.remote() for processor in self.pool.values()]
         await wait_for(refs, "processors to be all done")
         log.info("all processors shutdown")

@@ -293,9 +287,7 @@ async def update_plan(
         )

     async def serve(self):
-        log.info(
-            f"[{self.processor_key}] serving on {self.processor_service.addr()}"
-        )
+        log.info(f"[{self.processor_key}] serving on {self.processor_service.addr()}")
         await self.processor_service.serve()
         log.info(f"[{self.processor_key}] done serving")

@@ -332,9 +324,7 @@ def __init__(
         worker_pool_min: int,
         worker_pool_max: int,
     ) -> None:
-        log.info(
-            f"Creating DFRayContextSupervisor worker_pool_min: {worker_pool_min}"
-        )
+        log.info(f"Creating DFRayContextSupervisor worker_pool_min: {worker_pool_min}")
         self.pool = DFRayProcessorPool(worker_pool_min, worker_pool_max)
         self.stages: dict[str, InternalStageData] = {}
         log.info("Created DFRayContextSupervisor")
@@ -347,9 +337,7 @@ async def wait_for_ready(self):

     async def get_stage_addrs(self, stage_id: int):
         addrs = [
-            sd.remote_addr
-            for sd in self.stages.values()
-            if sd.stage_id == stage_id
+            sd.remote_addr for sd in self.stages.values() if sd.stage_id == stage_id
         ]
         return addrs

@@ -399,10 +387,7 @@ async def new_query(
             refs.append(
                 isd.remote_processor.update_plan.remote(
                     isd.stage_id,
-                    {
-                        stage_id: val["child_addrs"]
-                        for (stage_id, val) in kid.items()
-                    },
+                    {stage_id: val["child_addrs"] for (stage_id, val) in kid.items()},
                     isd.partition_group,
                     isd.plan_bytes,
                 )
@@ -434,9 +419,7 @@ async def sort_out_addresses(self):
             ]

             # sanity check
-            assert all(
-                [op == output_partitions[0] for op in output_partitions]
-            )
+            assert all([op == output_partitions[0] for op in output_partitions])
             output_partitions = output_partitions[0]

             for child_stage_isd in child_stage_datas:
@@ -520,9 +503,7 @@ def collect(self) -> list[pa.RecordBatch]:
             )
             log.debug(f"last stage addrs {last_stage_addrs}")

-            reader = self.df.read_final_stage(
-                last_stage_id, last_stage_addrs[0]
-            )
+            reader = self.df.read_final_stage(last_stage_id, last_stage_addrs[0])
             log.debug("got reader")
             self._batches = list(reader)
         return self._batches
@@ -589,11 +570,55 @@ def __init__(
         )

     def register_parquet(self, name: str, path: str):
+        """
+        Register a Parquet file with the given name and path.
+        The path can be a local filesystem path, an absolute filesystem path, or a URL.
+
+        If the path is an object store URL, the appropriate object store will be registered.
+        Configuration of the object store will be gathered from the environment.
+
+        For example, for s3:// URLs, credentials will be looked up by the AWS SDK,
+        which will check environment variables, credential files, etc.
+
+        Parameters:
+            name (str): The name to register the Parquet file under.
+            path (str): The file path to the Parquet file.
+        """
         self.ctx.register_parquet(name, path)

-    def register_listing_table(
-        self, name: str, path: str, file_extention="parquet"
-    ):
+    def register_csv(self, name: str, path: str):
+        """
+        Register a CSV file with the given name and path.
+        The path can be a local filesystem path, an absolute filesystem path, or a URL.
+
+        If the path is an object store URL, the appropriate object store will be registered.
+        Configuration of the object store will be gathered from the environment.
+
+        For example, for s3:// URLs, credentials will be looked up by the AWS SDK,
+        which will check environment variables, credential files, etc.
+
+        Parameters:
+            name (str): The name to register the CSV file under.
+            path (str): The file path to the CSV file.
+        """
+        self.ctx.register_csv(name, path)
+
+    def register_listing_table(self, name: str, path: str, file_extention="parquet"):
+        """
+        Register a directory of Parquet files with the given name.
+        The path can be a local filesystem path, an absolute filesystem path, or a URL.
+
+        If the path is an object store URL, the appropriate object store will be registered.
+        Configuration of the object store will be gathered from the environment.
+
+        For example, for s3:// URLs, credentials will be looked up by the AWS SDK,
+        which will check environment variables, credential files, etc.
+
+        Parameters:
+            name (str): The name to register the table under.
+            path (str): The path to the directory of Parquet files.
+        """
+
         self.ctx.register_listing_table(name, path, file_extention)

     def sql(self, query: str) -> DFRayDataFrame:
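Taken together, the new docstrings describe the table-registration surface of `DFRayContext`. A hypothetical usage sketch (the import path, bucket, table names, and the no-argument constructor are illustrative assumptions, not confirmed by this diff):

```python
from datafusion_ray import DFRayContext  # assumed import path

ctx = DFRayContext()  # constructor arguments omitted; see the class definition

# Local Parquet file.
ctx.register_parquet("trips", "/data/trips.parquet")

# Object store URL: the s3 object store is registered automatically and
# credentials are resolved from the environment by the AWS SDK.
ctx.register_csv("zones", "s3://my-bucket/zones.csv")

# A directory of Parquet files registered as one listing table.
ctx.register_listing_table("events", "s3://my-bucket/events/")

batches = ctx.sql("SELECT count(*) FROM trips").collect()
```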