diff --git a/eval/eval.py b/eval/eval.py
index ef561e04..f3e2eda7 100644
--- a/eval/eval.py
+++ b/eval/eval.py
@@ -20,7 +20,7 @@
 from lm_eval.loggers import EvaluationTracker, WandbLogger
 from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
 from lm_eval.tasks import TaskManager as PretrainTaskManager
-from lm_eval.utils import handle_non_serializable, sanitize_model_name, simple_parse_args_string
+from lm_eval.utils import setup_logging, handle_non_serializable, sanitize_model_name, simple_parse_args_string
 
 from eval.chat_benchmarks.curator_lm import CuratorAPIModel  # register curator model
 from eval.chat_benchmarks.precomputed_hf_lm import PrecomputedHFLM  # register precomputed_hf model
@@ -29,6 +29,7 @@
 from eval.eval_tracker import DCEvaluationTracker
 from eval.task import TaskManager as InstructTaskManager
 
+eval_logger = logging.getLogger(__name__)
 
 def setup_custom_parser():
     """
@@ -128,8 +129,8 @@
         Dictionary mapping task names to their evaluation results.
         Each result dictionary contains metrics specific to that task.
     """
-    eval_logger = utils.eval_logger
-    eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+    if verbosity is not None:
+        setup_logging(verbosity=verbosity)
 
     # Split tasks between benchmark and pretrain
     benchmark_tasks = [t for t in task_list if t in task_manager.tasks]
@@ -309,16 +310,16 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
         try:
             model_name = evaluation_tracker.get_model_attribute_from_db(args.model_id, "weights_location")
             args.model_args = update_model_args_with_name(args.model_args or "", model_name)
-            utils.eval_logger.info(f"Retrieved model name from database: {model_name}")
+            eval_logger.info(f"Retrieved model name from database: {model_name}")
         except Exception as e:
-            utils.eval_logger.error(f"Failed to retrieve model name from database: {str(e)}")
+            eval_logger.error(f"Failed to retrieve model name from database: {str(e)}")
             sys.exit(1)
         if not args.overwrite_database:
             task_list = [
                 task for task in task_list if not evaluation_tracker.check_if_already_done(task, args.model_id)
             ]
             if len(task_list) == 0:
-                utils.eval_logger.info("All tasks passed in were found in the database.")
+                eval_logger.info("All tasks passed in were found in the database.")
                 exit()
     elif args.model_name:
         model_name = args.model_name
@@ -334,7 +335,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
     )
     pretrain_task_manager = PretrainTaskManager(args.verbosity, include_path=args.include_path)
 
-    utils.eval_logger.info(f"Selected Tasks: {[task for task in task_list]}")
+    eval_logger.info(f"Selected Tasks: {[task for task in task_list]}")
 
     # Only check for OpenAI API keys if at least one task requires an annotator model
     # TODO: Should we just skip the evaluation that requires the annotator model if the annotator model is not set or fail completely?
@@ -357,7 +358,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
     try:
         lm = initialize_model(args.model, args.model_args, batch_size=args.batch_size)
     except Exception as e:
-        utils.eval_logger.error(f"Failed to initialize model: {str(e)}")
+        eval_logger.error(f"Failed to initialize model: {str(e)}")
         sys.exit(1)
 
     # Log experiment configuration
@@ -370,9 +371,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
         fewshot_as_multiturn=args.fewshot_as_multiturn,
     )
 
-    # Initialize logging and environment
-    eval_logger = utils.eval_logger
-    eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
+    # Initialize environment
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
     # Setup wandb logging if requested
@@ -562,7 +561,7 @@ def handle_evaluation_output(
             if args.log_samples:
                 wandb_logger.log_eval_samples(samples)
         except Exception as e:
-            utils.eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
+            eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
 
     evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None)
     if args.use_database and not args.debug:
@@ -580,7 +579,7 @@ def handle_evaluation_output(
         for task_name, config in results["configs"].items():
            evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name])
 
-    utils.eval_logger.info(
+    eval_logger.info(
        f"Eval arugments: {args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), "
        f"limit: {args.limit}, num_fewshot: {args.num_fewshot}, annotator_model: {args.annotator_model}, "
        f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
diff --git a/eval/eval_tracker.py b/eval/eval_tracker.py
index a1a879ca..1e883802 100644
--- a/eval/eval_tracker.py
+++ b/eval/eval_tracker.py
@@ -13,11 +13,12 @@
 import torch
 from huggingface_hub import model_info
 from lm_eval.loggers.evaluation_tracker import GeneralConfigTracker
-from lm_eval.utils import eval_logger, handle_non_serializable, hash_string, simple_parse_args_string
+from lm_eval.utils import setup_logging, handle_non_serializable, hash_string, simple_parse_args_string
 
 from database.models import Dataset, EvalResult, EvalSetting, Model
 from database.utils import create_db_engine, create_tables, get_model_from_db, get_or_add_model_by_name, sessionmaker
 
+eval_logger = logging.getLogger(__name__)
 
 def flatten_dict(d: Dict[str, Any], parent_key: str = "", sep: str = "/") -> Dict[str, Any]:
     """
diff --git a/pyproject.toml b/pyproject.toml
index e39e4043..e499cba8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -154,7 +154,7 @@ dependencies = [
     "swebench>=3.0.4",
 
     # LM Eval
-    "lm-eval[vllm] @ git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix",
+    "lm-eval[vllm] @ git+https://github.com/EleutherAI/lm-evaluation-harness@v0.4.8",
 ]
 
 [project.urls]