# charxiv.py
import os
import json
import warnings
from typing import Dict, List, Tuple, Any, Union

import pandas as pd

from vlmeval.dataset.image_base import ImageBaseDataset
from vlmeval.smp import misc, file
from vlmeval.smp.file import get_intermediate_file_path
from vlmeval import utils
from vlmeval.dataset.utils import build_judge


def auxeval(judge_model: Any, line: pd.Series, **kwargs: Any) -> Dict[str, Any]:
    """
    Evaluate a single prediction with the judge model.

    Args:
        judge_model: The model used for evaluation
        line: A pandas Series containing the data to evaluate
        **kwargs: Additional arguments for the judge model

    Returns:
        Dict containing evaluation results with extract_answer and score
    """
    failure_result = {"extract_answer": "Failed to parse response", "score": 0.0}
    prompt = line["grading_query"].replace("{PREDICTION}", line["prediction"])
    retry = kwargs.get("retry", 10)
    max_tokens = kwargs.get("max_tokens", 256)
    temperature = kwargs.get("temperature", 0)
    seed = kwargs.get("seed", 42)
    top_p = kwargs.get("top_p", 1)
    for _ in range(retry):
        try:
            response = judge_model.generate(
                prompt,
                temperature=temperature,
                max_tokens=max_tokens,
                seed=seed,
                top_p=top_p,
            )
            content = json.loads(response)
            if not isinstance(content, dict):
                return failure_result
            if "score" not in content or "extract_answer" not in content:
                return failure_result
            return content
        except Exception:
            continue
    return failure_result
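
# Illustrative shape of a judge response that auxeval accepts (assumed from the
# keys checked above; the exact wording and score scale come from the
# grading_query template in the dataset, not from this file):
#   {"extract_answer": "42", "score": 1.0}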


def qid2category(mode: str) -> Tuple[Dict[int, str], str]:
    """
    Map question IDs to their categories based on the evaluation mode.

    Args:
        mode: Either "descriptive" or "reasoning"

    Returns:
        Tuple containing a mapping dictionary and the index column name

    Raises:
        ValueError: If the mode is not recognized
    """
    if mode == "descriptive":
        index_col = "qid"
        return {
            1: "Information Extraction",
            2: "Information Extraction",
            3: "Information Extraction",
            4: "Information Extraction",
            5: "Information Extraction",
            6: "Information Extraction",
            7: "Information Extraction",
            8: "Enumeration",
            9: "Enumeration",
            10: "Counting",
            11: "Pattern Recognition",
            12: "Counting",
            13: "Enumeration",
            14: "Enumeration",
            15: "Enumeration",
            16: "Pattern Recognition",
            17: "Compositionality",
            18: "Pattern Recognition",
            19: "Counting",
        }, index_col
    elif mode == "reasoning":
        index_col = "inst_category"
        return {
            1: "Text-in-Chart",
            2: "Text-in-General",
            3: "Number-in-Chart",
            4: "Number-in-General",
        }, index_col
    else:
        raise ValueError(f"Invalid mode: {mode}")
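
# For example, qid2category("reasoning") returns
#   ({1: "Text-in-Chart", 2: "Text-in-General",
#     3: "Number-in-Chart", 4: "Number-in-General"}, "inst_category"),
# so reasoning results are grouped by the "inst_category" column, while
# descriptive results are grouped by "qid".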


class CharXiv(ImageBaseDataset):
    TYPE = "VQA"
    DATASET_URL = {
        "CharXiv_descriptive_val": "http://opencompass.openxlab.space/utils/VLMEval/CharXiv_descriptive_val.tsv",
        "CharXiv_reasoning_val": "http://opencompass.openxlab.space/utils/VLMEval/CharXiv_reasoning_val.tsv",
    }
    DATASET_MD5 = {
        "CharXiv_descriptive_val": "e165037032f169a59dd09ea5d7ad3073",
        "CharXiv_reasoning_val": "98eeff269b40726982627b19338ccd45",
    }

    def build_prompt(self, line: Union[int, pd.Series]) -> List[Dict[str, str]]:
        """
        Build a prompt for the model from a data line.

        Args:
            line: Either an index into the dataset or a pandas Series

        Returns:
            List of message dictionaries containing the image and question
        """
        if isinstance(line, int):
            line = self.data.iloc[line]
        if self.meta_only:
            tgt_path = misc.toliststr(line["image"])
        else:
            tgt_path = self.dump_image(line)
        messages = [{"type": "image", "value": tgt_path[0]}]
        messages.append({"type": "text", "value": line["question"]})
        return messages
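
    # The resulting prompt is an interleaved message list, e.g.
    #   [{"type": "image", "value": "/path/to/figure.png"},
    #    {"type": "text", "value": "What is the title of the chart?"}]
    # (the values shown here are illustrative, not taken from the dataset).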

    def get_scores(self, result_file: str) -> pd.DataFrame:
        """
        Calculate scores by category from evaluation results.

        Args:
            result_file: Path to the file containing evaluation results

        Returns:
            DataFrame with scores for each category and overall score

        Raises:
            ValueError: If the dataset name is invalid
        """
        data = file.load(result_file)
        if "descriptive" in self.dataset_name:
            mode = "descriptive"
        elif "reasoning" in self.dataset_name:
            mode = "reasoning"
        else:
            raise ValueError(f"Invalid dataset name: {self.dataset_name}")
        category_map, index_col = qid2category(mode)

        # Group scores by category
        scores_by_category = {}
        for _, row in data.iterrows():
            category = category_map[row[index_col]]
            if category not in scores_by_category:
                scores_by_category[category] = []
            scores_by_category[category].append(row["score"])

        # Calculate the average score for each category
        result = {}
        for category, scores in scores_by_category.items():
            result[category] = [sum(scores) / len(scores)]

        # Calculate the overall score across all questions
        result["Overall"] = [
            sum(sum(scores) for scores in scores_by_category.values()) / len(data)
        ]
        return pd.DataFrame(result)
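
    # Note: "Overall" is a per-question (micro) average over all rows, not the
    # mean of the per-category averages, so categories with more questions
    # weigh more heavily in it.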

    def evaluate(self, eval_file: str, **judge_kwargs: Any) -> pd.DataFrame:
        """
        Evaluate model predictions on the CharXiv dataset.

        Args:
            eval_file: Path to the file containing model predictions
            **judge_kwargs: Additional arguments for the judge model

        Returns:
            DataFrame with evaluation scores by category
        """
        # Set up the judge model
        if "LOCAL_LLM" in os.environ:
            judge_model = os.path.basename(os.environ.get("LOCAL_LLM"))
        else:
            judge_model = judge_kwargs.get("model", "gpt-4o-mini")
        if judge_model != "gpt-4o-mini":
            warnings.warn(
                f"The judge_model '{judge_model}' is not gpt-4o-mini. "
                "Evaluation results may not be accurate."
            )
        # Drop any "model" entry so it is not passed to build_judge twice
        judge_kwargs.pop("model", None)
        judge_model = build_judge(model=judge_model, **judge_kwargs)
        judge_model_name = judge_model.model

        # Define file paths
        result_file = get_intermediate_file_path(eval_file, f"_{judge_model_name}")
        temp_result_file = get_intermediate_file_path(eval_file, f"_{judge_model_name}", "pkl")
        score_file = get_intermediate_file_path(result_file, "_acc", "csv")

        # Return existing results if available
        if os.path.exists(result_file):
            score = self.get_scores(result_file)
            file.dump(score, score_file)
            return score

        data = file.load(eval_file)
        if "score" not in data.columns:
            data["score"] = 0
        if "extract_answer" not in data.columns:
            data["extract_answer"] = ""

        # Load intermediate results if available
        processed_results = {}
        if os.path.exists(temp_result_file):
            processed_results = file.load(temp_result_file)

        # Identify indices that have not been processed yet
        indices = [i for i in range(len(data)) if i not in processed_results]
        tups = [(judge_model, data.iloc[i]) for i in range(len(data)) if i not in processed_results]

        # Process the remaining examples
        nproc = judge_kwargs.pop("nproc", 4)
        if len(indices):
            utils.track_progress_rich(
                auxeval,
                tups,
                nproc=nproc,
                chunksize=nproc,
                keys=indices,
                save=temp_result_file,
                **judge_kwargs,
            )
            processed_results = file.load(temp_result_file)

        # Update data with evaluation results
        data["score"] = data.apply(lambda x: processed_results[x.name]["score"], axis=1)
        data["extract_answer"] = data.apply(
            lambda x: processed_results[x.name]["extract_answer"], axis=1
        )

        # Save results and return scores
        file.dump(data, result_file)
        score = self.get_scores(result_file)
        file.dump(score, score_file)
        return score
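

# A minimal usage sketch (not part of the module): assuming ImageBaseDataset
# accepts the dataset name as its first argument and that predictions have
# already been written to a file containing "prediction" and "grading_query"
# columns, evaluation could be driven roughly as follows.
#
#   dataset = CharXiv("CharXiv_reasoning_val")
#   scores = dataset.evaluate("CharXiv_reasoning_val_predictions.xlsx",
#                             model="gpt-4o-mini", nproc=4)
#   print(scores)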