25 changes: 12 additions & 13 deletions eval/eval.py
@@ -20,7 +20,7 @@
from lm_eval.loggers import EvaluationTracker, WandbLogger
from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash
from lm_eval.tasks import TaskManager as PretrainTaskManager
-from lm_eval.utils import handle_non_serializable, sanitize_model_name, simple_parse_args_string
+from lm_eval.utils import setup_logging, handle_non_serializable, sanitize_model_name, simple_parse_args_string

from eval.chat_benchmarks.curator_lm import CuratorAPIModel # register curator model
from eval.chat_benchmarks.precomputed_hf_lm import PrecomputedHFLM # register precomputed_hf model
@@ -29,6 +29,7 @@
from eval.eval_tracker import DCEvaluationTracker
from eval.task import TaskManager as InstructTaskManager

+eval_logger = logging.getLogger(__name__)

def setup_custom_parser():
"""
@@ -128,8 +129,8 @@ def evaluate(
Dictionary mapping task names to their evaluation results.
Each result dictionary contains metrics specific to that task.
"""
-eval_logger = utils.eval_logger
-eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+if verbosity is not None:
+    setup_logging(verbosity=verbosity)

# Split tasks between benchmark and pretrain
benchmark_tasks = [t for t in task_list if t in task_manager.tasks]
@@ -309,16 +310,16 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
try:
model_name = evaluation_tracker.get_model_attribute_from_db(args.model_id, "weights_location")
args.model_args = update_model_args_with_name(args.model_args or "", model_name)
-utils.eval_logger.info(f"Retrieved model name from database: {model_name}")
+eval_logger.info(f"Retrieved model name from database: {model_name}")
except Exception as e:
-utils.eval_logger.error(f"Failed to retrieve model name from database: {str(e)}")
+eval_logger.error(f"Failed to retrieve model name from database: {str(e)}")
sys.exit(1)
if not args.overwrite_database:
task_list = [
task for task in task_list if not evaluation_tracker.check_if_already_done(task, args.model_id)
]
if len(task_list) == 0:
-utils.eval_logger.info("All tasks passed in were found in the database.")
+eval_logger.info("All tasks passed in were found in the database.")
exit()
elif args.model_name:
model_name = args.model_name
@@ -334,7 +335,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
)
pretrain_task_manager = PretrainTaskManager(args.verbosity, include_path=args.include_path)

-utils.eval_logger.info(f"Selected Tasks: {[task for task in task_list]}")
+eval_logger.info(f"Selected Tasks: {[task for task in task_list]}")

# Only check for OpenAI API keys if at least one task requires an annotator model
# TODO: Should we just skip the evaluation that requires the annotator model if the annotator model is not set or fail completely?
@@ -357,7 +358,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
try:
lm = initialize_model(args.model, args.model_args, batch_size=args.batch_size)
except Exception as e:
-utils.eval_logger.error(f"Failed to initialize model: {str(e)}")
+eval_logger.error(f"Failed to initialize model: {str(e)}")
sys.exit(1)

# Log experiment configuration
@@ -370,9 +371,7 @@ def cli_evaluate(args: Optional[argparse.Namespace] = None) -> None:
fewshot_as_multiturn=args.fewshot_as_multiturn,
)

-# Initialize logging and environment
-eval_logger = utils.eval_logger
-eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
+# Initialize environment
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Setup wandb logging if requested
@@ -562,7 +561,7 @@ def handle_evaluation_output(
if args.log_samples:
wandb_logger.log_eval_samples(samples)
except Exception as e:
-utils.eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
+eval_logger.info(f"Logging to Weights and Biases failed due to {e}")

evaluation_tracker.save_results_aggregated(results=results, samples=samples if args.log_samples else None)
if args.use_database and not args.debug:
@@ -580,7 +579,7 @@ def handle_evaluation_output(
for task_name, config in results["configs"].items():
evaluation_tracker.save_results_samples(task_name=task_name, samples=samples[task_name])

-utils.eval_logger.info(
+eval_logger.info(
f"Eval arguments: {args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), "
f"limit: {args.limit}, num_fewshot: {args.num_fewshot}, annotator_model: {args.annotator_model}, "
f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
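Taken together, the eval.py changes drop the shared utils.eval_logger object and the manual setLevel calls in favor of a module-level logger plus lm-eval's setup_logging helper. A minimal sketch of the resulting pattern, using only names that appear in the diff above (the evaluate body is abridged and hypothetical):

import logging

from lm_eval.utils import setup_logging

eval_logger = logging.getLogger(__name__)  # module-level logger, as added at the top of eval/eval.py


def evaluate(verbosity=None):
    # Logging is only (re)configured when the caller passes a verbosity,
    # mirroring the `if verbosity is not None` guard in the diff.
    if verbosity is not None:
        setup_logging(verbosity=verbosity)
    eval_logger.info("Selected Tasks: ...")  # illustrative log call

Since logging.getLogger returns the same named logger on every call, the module-level eval_logger picks up whatever configuration setup_logging applies, so the old per-call setLevel is no longer needed (assuming setup_logging sets the effective level, as its use here implies).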
3 changes: 2 additions & 1 deletion eval/eval_tracker.py
@@ -13,11 +13,12 @@
import torch
from huggingface_hub import model_info
from lm_eval.loggers.evaluation_tracker import GeneralConfigTracker
-from lm_eval.utils import eval_logger, handle_non_serializable, hash_string, simple_parse_args_string
+from lm_eval.utils import setup_logging, handle_non_serializable, hash_string, simple_parse_args_string

from database.models import Dataset, EvalResult, EvalSetting, Model
from database.utils import create_db_engine, create_tables, get_model_from_db, get_or_add_model_by_name, sessionmaker

+eval_logger = logging.getLogger(__name__)
Review comment: Do we need to also import logging in this file?


def flatten_dict(d: Dict[str, Any], parent_key: str = "", sep: str = "/") -> Dict[str, Any]:
"""
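On the reviewer's question above: logging.getLogger(__name__) does require a plain import logging at module scope, and that import is not visible in the hunk shown. A minimal sketch of what the top of eval/eval_tracker.py would need after this change (the import logging line is an assumption):

import logging  # assumed; without it, logging.getLogger(__name__) raises NameError

from lm_eval.utils import setup_logging, handle_non_serializable, hash_string, simple_parse_args_string

eval_logger = logging.getLogger(__name__)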
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -154,7 +154,7 @@ dependencies = [
"swebench>=3.0.4",

# LM Eval
-"lm-eval[vllm] @ git+https://github.com/EtashGuha/lm-evaluation-harness@etashg/tokenize_fix",
+"lm-eval[vllm] @ git+https://github.com/EleutherAI/lm-evaluation-harness@v0.4.8",
]

[project.urls]
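The dependency line now pins the upstream EleutherAI harness at the v0.4.8 tag instead of a fork branch. A quick, hypothetical sanity check that the pin resolved after installation (assumes the harness distribution is installed under the name lm_eval):

from importlib.metadata import version

# Prints the installed lm-eval version; expected to report 0.4.8 with this pin.
print(version("lm_eval"))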