This repository was archived by the owner on Sep 15, 2025. It is now read-only.
6 changes: 5 additions & 1 deletion Dockerfile
@@ -2,10 +2,14 @@ FROM ubuntu:22.04

RUN apt-get update && apt-get install -y python3 python3-pip

-COPY . /app
+COPY ./requirements.txt /app/requirements.txt

WORKDIR /app

RUN pip3 install -r requirements.txt

COPY . /app

RUN test -f /app/generations.json && rm /app/generations.json || true

RUN pip3 install .
14 changes: 7 additions & 7 deletions README.md
@@ -17,7 +17,7 @@

## Features

-This is a framework for the evaluation of code generation models. This work is inspired from [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) for evaluating language models in general. We welcome contributions to fix issues, enhance features and add new benchmarks. You can find contribution guides in [`docs/guide.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/guide.md) and [`CONTRIBUTING.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/CONTRIBUTING.md) and more documentation in [`docs/README.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md). 
+This is a framework for the evaluation of code generation models. This work is inspired from [EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) for evaluating language models in general. We welcome contributions to fix issues, enhance features and add new benchmarks. You can find contribution guides in [`docs/guide.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/guide.md) and [`CONTRIBUTING.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/CONTRIBUTING.md) and more documentation in [`docs/README.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md).

Below are the features and tasks of this framework:

@@ -57,7 +57,7 @@ pip install -e ".[ds1000]" # installs all additional dependencies except PyTorch
# torch==1.12.1 required. Download version with relevant GPU support etc., e.g.,
pip install torch==1.12.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116

-# to suppress any tensorflow optimization warnings, 
+# to suppress any tensorflow optimization warnings,
# precede call to "accelerate launch" with "TF_CPP_MIN_LOG_LEVEL=3"

# on some systems, tensorflow will attempt to allocate all GPU memory
@@ -82,7 +82,7 @@ The evaluation part (solutions execution) for [MultiPL-E](https://github.com/nup
## Usage
You can use this evaluation harness to generate text solutions to code benchmarks with your model, to evaluate (and execute) the solutions or to do both. While it is better to use GPUs for the generation, the evaluation only requires CPUs. So it might be beneficial to separate these two steps. By default both generation and evaluation are performed.

-For more details on how to evaluate on the tasks, please refer to the documentation in [`docs/README.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md). 
+For more details on how to evaluate on the tasks, please refer to the documentation in [`docs/README.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md).

### Generation and evaluation
Below is an example to generate and evaluate on a task.
@@ -101,8 +101,8 @@ accelerate launch main.py \
--allow_code_execution \
--save_generations
```
-* `limit` represents the number of problems to solve, if it's not provided all problems in the benchmark are selected. 
-* `allow_code_execution` is for executing the generated code: it is off by default, read the displayed warning before calling it to enable execution. 
+* `limit` represents the number of problems to solve, if it's not provided all problems in the benchmark are selected.
+* `allow_code_execution` is for executing the generated code: it is off by default, read the displayed warning before calling it to enable execution.
* Some models with custom code on the HF hub like [SantaCoder](https://huggingface.co/bigcode/santacoder) require calling `--trust_remote_code`, for private models add `--use_auth_token`.
* `save_generations` saves the post-processed generations in a json file at `save_generations_path` (by default `generations.json`). You can also save references by calling `--save_references`
* `max_length_generation` is the maximum token length of generation including the input token length. The default is 512, but for some tasks like GSM8K and GSM-Hard, the complete prompt with 8 shot examples (as used in [PAL](https://github.com/reasoning-machines/pal)) take up `~1500` tokens, hence the value should be greater than that and the recommended value of `max_length_generation` is `2048` for these tasks.
@@ -113,7 +113,7 @@ Some tasks don't require code execution such as

### Generation only

-If you want to generate solutions without executing and evaluating the code, call `--generation_only`, in addition to the instructions above. This will save the solutions in a json file provided in `save_generation_path` in the working directory. 
+If you want to generate solutions without executing and evaluating the code, call `--generation_only`, in addition to the instructions above. This will save the solutions in a json file provided in `save_generation_path` in the working directory.

This can be useful if you don't want to execute code in the machine you're using for generations for security or efficiency reasons. For instance, you can do the generations on multiple GPUs, but switch to a multiple workers CPU machine or docker container for the execution.

@@ -193,7 +193,7 @@ To implement a new task in this evaluation harness, see the guide in [`docs/guid
We provide documentation for the existing benchmarks and how to run the evaluation in [`docs/README.md`](https://github.com/bigcode-project/bigcode-evaluation-harness/blob/main/docs/README.md).

## Remarks
-* Currenltly, we use data parallel evaluation across multiple GPUs using `accelerate`, this assumes that you can fit the model in one GPU. 
+* Currently, we use data parallel evaluation across multiple GPUs using `accelerate`; this assumes that you can fit the model in one GPU.

## Acknowledgements
We thank EleutherAI for their work on the [lm-evaluation harness](https://github.com/EleutherAI/lm-evaluation-harness) from which this repository is inspired.
15 changes: 11 additions & 4 deletions bigcode_eval/evaluator.py
@@ -13,7 +13,7 @@
################################################################################
!!!WARNING!!!
################################################################################
-The "code_eval"/"apps_metric" you are about to use, execute untrusted 
+The "code_eval"/"apps_metric" you are about to use, execute untrusted
model-generated code in Python.
Although it is highly unlikely that model-generated code will do something
overtly malicious in response to this test suite, model-generated code may act
@@ -22,7 +22,7 @@
does not perform destructive actions on their host or network. For more
information on how OpenAI sandboxes its code, see the paper "Evaluating Large
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374).
-Once you have read this disclaimer and taken appropriate precautions, set the argument 
+Once you have read this disclaimer and taken appropriate precautions, set the argument
"allow_code_execution" to True.
################################################################################\
"""
@@ -47,7 +47,7 @@ def generate_text(self, task_name, intermediate_generations=None):
# if args.limit is used, make sure args.limit_start + args.limit <= len(dataset)
n_tasks = min(self.args.limit, len(dataset) - self.args.limit_start) if self.args.limit else len(dataset)
# when args.limit is None
-# adjust n_tasks by args.limit_start to prevent out of bounds issues 
+# adjust n_tasks by args.limit_start to prevent out of bounds issues
if not self.args.limit:
n_tasks -= self.args.limit_start
references = [task.get_reference(dataset[i]) for i in range(self.args.limit_start, self.args.limit_start+n_tasks)]
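The `limit`/`limit_start` bookkeeping in this hunk can be sketched in isolation. `task_window` below is a hypothetical helper, not part of the harness; it just replays the arithmetic shown above:

```python
def task_window(dataset_len, limit=None, limit_start=0):
    # `limit` caps how many problems are attempted, `limit_start` offsets
    # into the dataset, and the window is clamped so indexing never runs
    # past the end of the dataset
    if limit:
        n_tasks = min(limit, dataset_len - limit_start)
    else:
        n_tasks = dataset_len - limit_start
    return range(limit_start, limit_start + n_tasks)

# 10 problems, start at index 8, ask for 5: only indices 8 and 9 remain
print(list(task_window(10, limit=5, limit_start=8)))  # [8, 9]
```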
@@ -97,7 +97,7 @@ def evaluate(self, task_name, intermediate_generations=None):
if self.accelerator.is_main_process:
if not self.args.load_generations_path:
save_generations_path = f"{os.path.splitext(self.args.save_generations_path)[0]}_{task_name}.json"
-self.save_json_files(generations, references, save_generations_path, f"references_{task_name}.json")
+self.save_json_files(generations, references, save_generations_path, self.args.save_references_path)

# make sure tokenizer plays nice with multiprocessing
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -118,6 +118,13 @@ def save_json_files(
with open(save_generations_path, "w") as fp:
json.dump(generations, fp)
print(f"generations were saved at {save_generations_path}")

if hasattr(generations, "to_table"):
full_gen = f"{os.path.splitext(save_generations_path)[0]}_full.json"
with open(full_gen, "w") as fp:
json.dump(generations.to_table(), fp)
print(f"Full generations were saved at {full_gen}")

if self.args.save_references:
with open(save_references_path, "w") as fp:
json.dump(references, fp)
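The duck-typed save path added in this hunk (always write the plain list, then write a second `*_full.json` only when the object knows `to_table`) can be sketched standalone. `save_with_extras` is a hypothetical helper, not the harness method:

```python
import json
import os

def save_with_extras(generations, path):
    # the plain list payload is always written
    with open(path, "w") as fp:
        json.dump(list(generations), fp)
    # via duck typing, a richer *_full.json is written alongside it only
    # when the object can render itself as a table
    if hasattr(generations, "to_table"):
        full = f"{os.path.splitext(path)[0]}_full.json"
        with open(full, "w") as fp:
            json.dump(generations.to_table(), fp)
```

A plain `list` silently skips the second file, which is why the existing call sites in the diff don't need to change.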
32 changes: 27 additions & 5 deletions bigcode_eval/generation.py
@@ -37,7 +37,26 @@ def __init__(self, input_length, multiplier):
def __call__(self, input_ids, scores, **kwargs):
"""Returns true if generated sequence is too long."""
return input_ids.shape[1] > int(self.input_length * self.multiplier)



class Generations(list):
"""A list which can carry extra data in a dictionary and can turn it all in a table.

TODO: This is a terrible hack, do something better.
"""
@property
def sneaky_extra_data(self):
if not hasattr(self, "_extra_data"):
self._extra_data = {}
return self._extra_data

def to_table(self):
headers = [*self.sneaky_extra_data.keys(), "generations"]
return [
{k:v[0] if isinstance(v, list) and len(v)==1 else v for k, v in zip(headers, row)}
for row in zip(*self.sneaky_extra_data.values(), self)
]
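A standalone usage sketch of the `Generations` container above — the class body is copied from the diff, the driver code at the bottom is illustrative:

```python
class Generations(list):
    """List subclass carrying extra per-row data that can be emitted as a table."""
    @property
    def sneaky_extra_data(self):
        if not hasattr(self, "_extra_data"):
            self._extra_data = {}
        return self._extra_data

    def to_table(self):
        headers = [*self.sneaky_extra_data.keys(), "generations"]
        return [
            {k: v[0] if isinstance(v, list) and len(v) == 1 else v
             for k, v in zip(headers, row)}
            for row in zip(*self.sneaky_extra_data.values(), self)
        ]

gens = Generations()
gens.sneaky_extra_data["prompts"] = ["p0", "p1"]
gens.append(["gen0"])            # single-element lists are unwrapped in the table
gens.append(["gen1a", "gen1b"])  # multi-element lists are kept as lists
print(gens.to_table())
```

Because the extra data lives on an instance attribute rather than in the list payload, existing code that treats the object as a plain list keeps working unchanged.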


def parallel_generations(
task,
@@ -74,17 +93,20 @@ def parallel_generations(
prompts = [task.get_prompt(doc) for doc in dataset]
awaitables = [client.completions.create(
model=args.model,
-prompt=prompt, 
+prompt=prompt,
n=args.batch_size,
max_tokens=args.max_length_generation,
temperature=args.temperature,
top_p=args.top_p
) for prompt in prompts]
responses = asyncio.run(tqdm.gather(*awaitables))
-generations = []
+generations = Generations()
generations.sneaky_extra_data["prompts"] = prompts
generations.sneaky_extra_data["raw_response"] = []
for i, (prompt, response) in enumerate(zip(prompts, responses)):
texts = [prompt + choice.text for choice in response.choices]
generations.append([task.postprocess_generation(text, i) for text in texts])
generations.sneaky_extra_data["raw_response"].append([choice.text for choice in response.choices])
return generations
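The fan-out in this hunk (one awaitable per prompt, gathered on a single event loop) can be sketched without the OpenAI client; `fake_complete` below is a stand-in for `client.completions.create`, not a real API:

```python
import asyncio

async def fake_complete(prompt, n):
    # stand-in for `client.completions.create`: returns n choice texts
    await asyncio.sleep(0)  # yields control, as a real network call would
    return [f"choice{i}" for i in range(n)]

def parallel_sketch(prompts, n=2):
    # one awaitable per prompt, awaited concurrently on a single event loop;
    # the diff uses tqdm.gather, a progress-bar drop-in for asyncio.gather
    async def run():
        return await asyncio.gather(*(fake_complete(p, n) for p in prompts))
    responses = asyncio.run(run())
    # mirror the harness: each generation is the prompt plus a completion
    return [[p + c for c in choices] for p, choices in zip(prompts, responses)]

print(parallel_sketch(["a: ", "b: "]))
```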

set_seed(args.seed, device_specific=True)
@@ -101,7 +123,7 @@ def parallel_generations(
# The input_length / start_length set to 0 for now will be adjusted later
# Check if the task has a custom check_fn method for the stopping criteria
if task.stop_words and tokenizer.eos_token:
-task.stop_words.append(tokenizer.eos_token) 
+task.stop_words.append(tokenizer.eos_token)
if hasattr(task, "check_fn"):
stopping_criteria.append(
EndOfFunctionCriteria(0, task.stop_words, tokenizer, task.check_fn)
@@ -114,7 +136,7 @@ def parallel_generations(
stopping_criteria.append(
TooLongFunctionCriteria(0, task.max_length_multiplier)
)

if stopping_criteria:
gen_kwargs["stopping_criteria"] = StoppingCriteriaList(stopping_criteria)
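Every entry collected into `StoppingCriteriaList` follows the same protocol: a callable that inspects the growing sequence and returns True to halt. A dependency-free sketch of the `TooLongFunctionCriteria` idea, where a plain token list stands in for `input_ids` and its length for `input_ids.shape[1]`:

```python
class TooLongSketch:
    """Stop once the sequence grows past multiplier * input_length tokens."""
    def __init__(self, input_length, multiplier):
        self.input_length = input_length
        self.multiplier = multiplier

    def __call__(self, token_ids):
        # same comparison as the criteria above: strict greater-than,
        # with the budget truncated to an integer
        return len(token_ids) > int(self.input_length * self.multiplier)

crit = TooLongSketch(input_length=10, multiplier=1.5)
print(crit(list(range(15))))  # False: 15 > 15 fails, still within budget
print(crit(list(range(16))))  # True: 16 > 15, time to stop
```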

3 changes: 2 additions & 1 deletion bigcode_eval/tasks/__init__.py
@@ -1,4 +1,5 @@
import inspect
from .. import base
from pprint import pprint

from . import (apps, codexglue_code_to_text, codexglue_text_to_text, conala,
@@ -32,7 +33,7 @@
ALL_TASKS = sorted(list(TASK_REGISTRY))


-def get_task(task_name, args=None):
+def get_task(task_name, args=None)-> base.Task:
try:
kwargs = {}
if "prompt" in inspect.signature(TASK_REGISTRY[task_name]).parameters:
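The registry dispatch that `get_task` performs can be sketched standalone. The `Demo` task and dict-style `args` below are hypothetical; the real registry maps benchmark names to task classes:

```python
import inspect

class Demo:
    """Hypothetical task: accepts an optional prompt style."""
    def __init__(self, prompt="plain"):
        self.prompt = prompt

TASK_REGISTRY = {"demo": Demo}

def get_task(task_name, args=None):
    try:
        kwargs = {}
        # forward `prompt` only when the task's constructor accepts it,
        # mirroring the inspect.signature check in the diff
        if args and "prompt" in inspect.signature(TASK_REGISTRY[task_name]).parameters:
            kwargs["prompt"] = args["prompt"]
        return TASK_REGISTRY[task_name](**kwargs)
    except KeyError:
        raise KeyError(f"Missing task {task_name!r}")
```

Inspecting the constructor signature lets the registry hold a mix of task classes, only some of which are prompt-aware, behind one uniform entry point.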