@@ -126,11 +126,14 @@ def __init__(
         self.actor = actor
         self.request_counter: RequestCounter = request_counter

-    def is_available(self) -> bool:
+    def get_evaluator_status(self) -> str:
         try:
-            return self.evaluator is None
-        except Exception:
-            return False
+            if self.evaluator is None:
+                return "Evaluator not initialized."
+            assert isinstance(self.evaluator, Evaluator)
+            return (self.evaluator.class_name, self.evaluator.model.class_name)
+        except Exception as e:
+            return String.format_exception_msg(e)

     def get_ip_address(self) -> Optional[str]:
         try:
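The renamed method above swaps a boolean health check for a descriptive status report, so the driver can distinguish "evaluator not yet initialized" from "initialization raised". A minimal, self-contained sketch of that pattern (the class name here is illustrative only, and `String.format_exception_msg` is approximated with the standard `traceback` module):

```python
import traceback
from typing import Optional


class StatusReportingWorker:
    """Illustrative stand-in for the Ray actor wrapper."""

    def __init__(self, evaluator: Optional[object] = None):
        self.evaluator = evaluator  ## Set later by the real actor's initialization.

    def get_evaluator_status(self) -> str:
        try:
            if self.evaluator is None:
                return "Evaluator not initialized."
            ## The real method reports the evaluator and model class names here.
            return type(self.evaluator).__name__
        except Exception as e:
            ## Surface the failure text instead of collapsing it to False.
            return "".join(traceback.format_exception_only(type(e), e)).strip()


print(StatusReportingWorker().get_evaluator_status())          ## Evaluator not initialized.
print(StatusReportingWorker(object()).get_evaluator_status())  ## object
```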
@@ -363,6 +366,10 @@ def ray_evaluator_params(cls, params: Dict) -> Dict:
         return params

     def initialize(self, reinit_ray: bool = False, **kwargs):
+        if self.model_num_gpus <= 1 or self.AlgorithmClass.class_name == "VLLMGenerativeLM":
+            self.nested_evaluator_name: str = get_default(self.nested_evaluator_name, "local")
+        else:
+            self.nested_evaluator_name: str = get_default(self.nested_evaluator_name, "accelerate")
         ## Connect to the Ray cluster
         if not ray.is_initialized() or reinit_ray is True:
             ray.init(
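This hunk moves the nested-evaluator default into `initialize()`: models that fit on a single GPU, or that run through `VLLMGenerativeLM` (which presumably manages its own parallelism), default to a local nested evaluator, while larger models default to an accelerate-based one. A standalone sketch of the selection rule, under the assumption that `get_default` is a simple coalesce (`coalesce` and `HFGenerativeLM` below are made-up names):

```python
from typing import Optional


def coalesce(value: Optional[str], default: str) -> str:
    ## Approximates get_default(): keep an explicit user choice, else use the fallback.
    return value if value is not None else default


def pick_nested_evaluator(explicit_name: Optional[str], model_num_gpus: int, algorithm_class_name: str) -> str:
    ## Mirrors the branch added to initialize().
    if model_num_gpus <= 1 or algorithm_class_name == "VLLMGenerativeLM":
        return coalesce(explicit_name, "local")
    return coalesce(explicit_name, "accelerate")


assert pick_nested_evaluator(None, 1, "HFGenerativeLM") == "local"        ## single GPU -> local
assert pick_nested_evaluator(None, 4, "HFGenerativeLM") == "accelerate"   ## multi-GPU -> accelerate
assert pick_nested_evaluator(None, 4, "VLLMGenerativeLM") == "local"      ## vLLM path -> local
assert pick_nested_evaluator("ray", 4, "HFGenerativeLM") == "ray"         ## explicit choice wins
```

Resolving the default once at initialization also lets `_create_nested_evaluator_params` (below) read the final value instead of re-deriving it.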
@@ -385,7 +392,7 @@ def _load_model(
         **kwargs,
     ) -> List[RayActorComposite]:
         num_actors: int = get_default(num_actors, self.num_actors)
-        progress_bar: Optional[Dict] = self._run_evaluation_progress_bar(progress_bar)
+        progress_bar: Union[Dict, bool] = self._run_evaluation_progress_bar(progress_bar)
         nested_evaluator_params: Dict = self._create_nested_evaluator_params(**kwargs)

         def actor_factory(*, request_counter: Any, actor_i: int, actor_id: str, **kwargs):
@@ -475,10 +482,7 @@ def num_actors(self) -> int:
         return num_actors

     def _create_nested_evaluator_params(self, **kwargs) -> Dict:
-        nested_evaluator_name: str = get_default(
-            self.nested_evaluator_name,
-            "accelerate" if self.model_num_gpus > 1 else "local",
-        )
+        nested_evaluator_name: str = self.nested_evaluator_name
         if self.model_dir is not None and not self.model_dir.is_remote_storage():
             raise ValueError(
                 f"When passing `model_dir` to {self.class_name}.of(...), the model directory "
@@ -563,44 +567,44 @@ def _run_evaluation(
         evaluated_predictions: Optional[Predictions] = None
         evaluated_metrics: Optional[List[Metric]] = None

-        try:
-            timer: Timer = Timer(silent=True)
-            timer.start()
-            ## Verbosity >= 1: progress bars
-            progress_bar: Optional[Dict] = self._run_evaluation_progress_bar(progress_bar)
-            ## Verbosity >= 2: basic logging
-            main_logger: Callable = partial(
-                self.ray_logger,
-                ## Unless we request silence (verbosity=0), print important information.
-                should_log=self.verbosity >= 2,
-                tracker=tracker,
-            )
-            ## Verbosity >= 3: detailed logging
-            debug_logger: Callable = partial(
-                self.ray_logger,
-                ## Unless we request silence (verbosity=0), print important information.
-                should_log=self.verbosity >= 3,
-                tracker=tracker,
+        timer: Timer = Timer(silent=True)
+        timer.start()
+        ## Verbosity >= 1: progress bars
+        progress_bar: Union[Dict, bool] = self._run_evaluation_progress_bar(progress_bar)
+        ## Verbosity >= 2: basic logging
+        main_logger: Callable = partial(
+            self.ray_logger,
+            ## Unless we request silence (verbosity=0), print important information.
+            should_log=self.verbosity >= 2,
+            tracker=tracker,
+        )
+        ## Verbosity >= 3: detailed logging
+        debug_logger: Callable = partial(
+            self.ray_logger,
+            ## Unless we request silence (verbosity=0), print important information.
+            should_log=self.verbosity >= 3,
+            tracker=tracker,
+        )
+        main_logger(self._evaluate_start_msg(tracker=tracker, **kwargs))
+        if batch_size is None:
+            raise ValueError(
+                f"Could not find batch_size in model hyperparams; "
+                f"please pass it explicitly like so: {self.class_name}.evaluate(batch_size=...)"
             )
-            main_logger(self._evaluate_start_msg(tracker=tracker, **kwargs))
-            if batch_size is None:
+        if predictions_destination is not None:
+            if predictions_destination.storage is not Storage.S3:
                 raise ValueError(
-                    f"Could not find batch_size in model hyperparams; "
-                    f"please pass it explicitly like so: {self.class_name}.evaluate(batch_size=...)"
+                    f"Results can only be saved to {Storage.S3}; "
+                    f"found storage {predictions_destination.storage} having path: {predictions_destination.path}"
                 )
-            if predictions_destination is not None:
-                if predictions_destination.storage is not Storage.S3:
-                    raise ValueError(
-                        f"Results can only be saved to {Storage.S3}; "
-                        f"found storage {predictions_destination.storage} having path: {predictions_destination.path}"
-                    )
-                if not predictions_destination.is_path_valid_dir():
-                    raise ValueError(
-                        f"Expected predictions destination to be a valid directory; "
-                        f'found: "{predictions_destination.path}"...did you forget a "/" at the end?'
-                    )
-                assert predictions_destination.format is not None  ## Checked in .evaluate().
+            if not predictions_destination.is_path_valid_dir():
+                raise ValueError(
+                    f"Expected predictions destination to be a valid directory; "
+                    f'found: "{predictions_destination.path}"...did you forget a "/" at the end?'
+                )
+            assert predictions_destination.format is not None  ## Checked in .evaluate().

+        try:
             actors_were_created_in_this_call: bool = self.init_model(progress_bar=progress_bar, **kwargs)
             num_actors_created: int = len(self.model)
             if actors_were_created_in_this_call:
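The restructuring above pulls argument validation (batch size, predictions destination) out of the `try` block, so bad inputs fail before any Ray actors are created and the cleanup path never runs for them. A simplified, self-contained sketch of that ordering (function and variable names here are illustrative, not the real API):

```python
from typing import Optional


def run_evaluation(batch_size: Optional[int], predictions_dir: Optional[str]) -> str:
    """Validate caller inputs *before* the try/finally that allocates expensive
    resources (Ray actors in the real code)."""
    ## 1. Fail fast on arguments; nothing has been allocated yet.
    if batch_size is None:
        raise ValueError("Could not find batch_size; pass it explicitly, e.g. evaluate(batch_size=...)")
    if predictions_dir is not None and not predictions_dir.endswith("/"):
        raise ValueError(f'Expected a directory; found: "{predictions_dir}"...did you forget a "/" at the end?')

    ## 2. Only now acquire resources, guarded by try/finally.
    resources = {"actors": ["actor-0", "actor-1"]}   ## placeholder for self.init_model(...)
    try:
        return f"evaluated with batch_size={batch_size} on {len(resources['actors'])} actors"
    finally:
        resources.clear()                            ## placeholder for self.cleanup_model()


print(run_evaluation(16, "s3://bucket/results/"))
```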
@@ -869,15 +873,16 @@ def _run_evaluation(
                 )
             )
             return evaluated_predictions, evaluated_metrics
+        except Exception as e:
+            raise e
         except KeyboardInterrupt as e:
             raise e
         finally:
             if "row_counter" in locals():
                 accumulate(ray.kill(row_counter))
                 del row_counter
-            if (
-                self.cache_timeout is None
-            ):  ## If we don't have a timeout, delete actors after every execution.
+            ## If we don't have a timeout, delete actors after every execution.
+            if self.cache_timeout is None:
                 self.cleanup_model()
         return evaluated_predictions, evaluated_metrics

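The simplified `finally` clause keeps the original behaviour: tear actors down after every run unless a cache timeout is configured, in which case they stay warm for the next call. A small illustrative sketch of that rule (plain Python stand-ins, no Ray; timeout-based expiry itself is handled elsewhere in the real class):

```python
from typing import List, Optional


class ActorPool:
    """Hypothetical stand-in showing the cache_timeout cleanup rule."""

    def __init__(self, cache_timeout: Optional[float]):
        self.cache_timeout = cache_timeout
        self.actors: List[str] = []

    def run(self) -> None:
        try:
            if not self.actors:
                self.actors = ["actor-0", "actor-1"]   ## placeholder for init_model()
            ## ... evaluation work happens here ...
        finally:
            ## If we don't have a timeout, delete actors after every execution.
            if self.cache_timeout is None:
                self.actors.clear()                    ## placeholder for cleanup_model()


pool = ActorPool(cache_timeout=None)
pool.run()
assert pool.actors == []          ## torn down immediately

cached = ActorPool(cache_timeout=60.0)
cached.run()
assert cached.actors != []        ## kept warm for the next evaluation
```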
@@ -894,10 +899,10 @@ def _get_actor_usages(self) -> List[Tuple[int, float, str]]:
             )
         return actor_usages

-    def _run_evaluation_progress_bar(self, progress_bar: Optional[Dict], **kwargs) -> Optional[Dict]:
+    def _run_evaluation_progress_bar(self, progress_bar: Optional[Dict], **kwargs) -> Union[Dict, bool]:
         if self.verbosity >= 2:
             return progress_bar
-        return None
+        return False

     def _evaluate_start_msg(self, *, tracker: Tracker, **kwargs) -> str:
         if tracker.tracker_name == "noop":
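The helper now returns `False` rather than `None` when verbosity is below 2, presumably so downstream code gets an explicit "disable progress bars" sentinel instead of a value that could also mean "use defaults". A hedged, standalone approximation of that contract:

```python
from typing import Dict, Optional, Union


def resolve_progress_bar(progress_bar: Optional[Dict], verbosity: int) -> Union[Dict, bool, None]:
    if verbosity >= 2:
        return progress_bar   ## pass the caller's config (or None for defaults) through
    return False              ## explicit "disable progress bars" sentinel


assert resolve_progress_bar({"miniters": 100}, verbosity=2) == {"miniters": 100}
assert resolve_progress_bar({"miniters": 100}, verbosity=1) is False
```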