MemBase/memory_evaluation.py at main · zjunlp/MemBase · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import argparse
from membase import (
    DATASET_MAPPING,
    EvaluationRunner,
    EvaluationRunnerConfig,
)
from membase.utils import import_function_from_path


if __name__ == "__main__":
    # Script entry point: parse the command line, resolve the optional custom
    # callables given as import paths, then build the runner config and run it.

    # Dataset names offered by --dataset-type; the first one is the default.
    dataset_names = list(DATASET_MAPPING.keys())

    # Sentence shared by the help text of both import-path options.
    import_path_hint = (
        "It accepts 'module.submodule.function' or 'path/to/file.py:function'."
    )

    # (flag, add_argument keyword options). Options are registered in this
    # exact order, which is also the order they are listed in --help.
    option_specs = [
        ("--search-results-path", {
            "type": str,
            "required": True,
            "help": "Path to the search results.",
        }),
        ("--qa-model", {
            "type": str,
            "default": "gpt-4.1-mini",
            "help": "Model name or path for question answering.",
        }),
        ("--judge-model", {
            "type": str,
            "default": "gpt-4.1-mini",
            "help": "Model name or path for judgment.",
        }),
        ("--qa-batch-size", {
            "type": int,
            "default": 4,
            "help": "Batch size for question-answering.",
        }),
        ("--judge-batch-size", {
            "type": int,
            "default": 4,
            "help": "Batch size for judgment.",
        }),
        ("--api-config-path", {
            "type": str,
            "default": None,
            "help": "Path to the API config file.",
        }),
        ("--context-builder", {
            "type": str,
            "default": None,
            "help": (
                "Import path for a custom context builder function that converts a list of "
                "memory entries into a context string. "
                + import_path_hint
            ),
        }),
        ("--prompt-template", {
            "type": str,
            "default": None,
            "help": (
                "Import path for a custom prompt template factory that returns a "
                "template with $question and $context placeholders. "
                + import_path_hint
            ),
        }),
        ("--add-question-timestamp", {
            "action": "store_true",
            "help": "Append the question timestamp to the prompt.",
        }),
        ("--dataset-type", {
            "type": str,
            "choices": dataset_names,
            "default": dataset_names[0],
            "help": "The type of the dataset used to evaluate the memory layer.",
        }),
        ("--metrics", {
            "type": str,
            "nargs": "+",
            "default": None,
            "help": "Metric names to compute.",
        }),
        ("--traced-data-save-dir", {
            "type": str,
            "default": "traced_data",
            "help": "Directory where execution graph artefacts are saved.",
        }),
        ("--tracing", {
            "action": "store_true",
            "help": "Enable execution graph tracing.",
        }),
    ]

    cli = argparse.ArgumentParser(
        description="A script to evaluate the answers of the search results."
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)
    opts = cli.parse_args()

    # Both custom callables are optional: an omitted flag stays None and the
    # import helper is only invoked for paths that were actually supplied.
    context_builder = None
    if opts.context_builder is not None:
        context_builder = import_function_from_path(opts.context_builder)

    prompt_template = None
    if opts.prompt_template is not None:
        prompt_template = import_function_from_path(opts.prompt_template)

    config = EvaluationRunnerConfig(
        search_results_path=opts.search_results_path,
        dataset_type=opts.dataset_type,
        qa_model=opts.qa_model,
        judge_model=opts.judge_model,
        qa_batch_size=opts.qa_batch_size,
        judge_batch_size=opts.judge_batch_size,
        api_config_path=opts.api_config_path,
        context_builder=context_builder,
        prompt_template=prompt_template,
        add_question_timestamp=opts.add_question_timestamp,
        metrics=opts.metrics,
        traced_data_save_dir=opts.traced_data_save_dir,
        tracing=opts.tracing,
    )
    runner = EvaluationRunner(config)
    runner.run()