# Source: GLM-V/glmv_reward/configs/full_config.yaml (zai-org/GLM-V, branch main)

# Log directory for the reward judge (path is used exactly as written here).
reward_log_dir: 'logs/reward_judge'

# Data-source name -> name of the entry under `reward_configs` that scores it.
# NOTE(review): `default` is presumably the fallback for data sources that are
# not listed below -- confirm against the config loader.
datasource_reward_config_mapping:
  default: 'general_verifier_config'
  general: 'general_verifier_config'
  math: 'math_verifier_config'
  chemistry: 'chemistry_verifier_config'
  physics: 'physics_verifier_config'
  chart: 'chart_verifier_config'
  mmsi: 'mmsi_verifier_config'
  multi_image: 'multi_image_general_verifier_config'
  ocr: 'ocr_verifier_config'
  ocr_ignore_case: 'ocr_ignore_case_verifier_config'
  vqa: 'vqa_verifier_config'
  counting: 'counting_verifier_config'
  language_mix: 'language_mix_verifier_config'
  # The unusual spelling below is the actual key used under `reward_configs`;
  # rename both places together or not at all.
  geoguess: 'geoquesteoquest_verifier_config'

  # Agent benchmarks
  AndroidWorld: 'androidworld_verifier_config'
  WebVoyager: 'webvoyager_verifier_config'
  OSWorld: 'osworld_verifier_config'

reward_configs:
    # Reward config for photo-location guessing, referenced from
    # `datasource_reward_config_mapping.geoguess`.
    # An LLM judge scores the model's conclusion in [0, 1]: up to 0.5 for the
    # address granularity (country / province / city) and up to 0.5 for the
    # place name, as spelled out in the prompt template below.
    geoquesteoquest_verifier_config:
        verifier_type: "geoquest"
        strict_boxed_extraction: true
        # Placeholder credential: replace at deploy time, never commit a real key.
        llm_api_key:
          - "your_api_keys"
        llm_judge_url:
          - "https://open.bigmodel.cn/api/paas/v4/chat/completions"
        llm_model:
          - glm-4-flash
        llm_max_tokens: 4096
        llm_temperature: 0.8
        llm_top_p: 0.6
        # NOTE(review): meaning of `model_type` / `reasoning` is defined by the
        # consuming verifier, not visible in this file -- confirm before changing.
        model_type: "100b"
        reasoning: true
        # Judge prompt. Placeholders: {place_name}, {address}, {predict}.
        # NOTE(review): the template also contains literal JSON braces, so it is
        # presumably filled by plain string replacement rather than str.format --
        # confirm against the verifier implementation.
        # NOTE(review): the prompt asks for an analysis of at most 30 characters
        # while the few-shot analyses are much longer; left as-is.
        llm_judge_prompt_template: |
            请判断模型对图片的分析结论是否正确，并根据规则给出0~1之间的评分。
            模型的目标是根据用户给出的图片推理出图片拍摄的地点。请仔细分析模型给出的结论是否与地点名称和地址相符。注意，有时地点名称会使用不同的语言给出，请你进行适当翻译以对齐语言。如果模型给出的回答与答案完全相符，获得满分1.0分，如果模型的回答与答案完全不符合，获得0.0分。
            注意：模型分析结论中的地点名称必须准确，模糊、不准确的答案是错误的，回答的粒度过大，或者模糊不清的，视为错误答案。
            注意：在analysis字段用不超过30个字简略输出你的分析过程。
            判分规则：
            你需要从两方面进行评分：
            1）地点地址描述：
                - 地点地址描述部分满分为0.5分
                - 正确描述到国家一级，可以获得0.1分
                - 正确描述到省级/州级/当地的一级行政区，可以获得0.25分
                - 正确描述到城市一级，可以获得0.5分
            2）地点名称：
                - 地点名称部分满分为0.5分
                - 仅能描述出照片氛围、大概的地点分类（如仅提到是一处名胜古迹、自然景观、水域、公园、景点、地标、建筑等），或者猜测地点数量大于2，不得分（0.0分）
                - 正确描述地点的详细分类，没有进行具体地点猜测或者猜测错误，酌情给出0.1~0.24之间的分数
                - 正确描述地点的详细分类，并进行1-2个错误但合理的具体地点猜测，得0.25分（比如正确猜出是古代皇家园林，猜测是颐和园或者北海公园，实际是圆明园）
                - 正确的地点名称（可选的正确描述地点的详细分类），得0.5分（正确猜出地点名称）
            请观察以下给出的正确/错误的例子：
            1. 示例（正确的地点名称+地址）：
            正确答案：
                地点名称：National Japanese American Memorial
                地点地址：Washington, DC 20001, USA
            模型分析结论：这里可能是美国华盛顿特区的日裔美国人拘留营纪念碑所在地
            {
                "analysis": "地点名称为 National Japanese American Memorial（美国国家日裔美国人纪念碑），地址为华盛顿特区（Washington, DC 20001, USA）。地点地址描述部分：模型正确分析出了地点所在的城市（华盛顿特区），得0.5分；地点名称部分：模型分析结论虽然用'可能是'这样的推测性语气，但明确指出了该地是美国华盛顿特区的日裔美国人拘留营纪念碑，与实际地点名称完全对应，因此得0.5分。总分：0.5+0.5=1.0分",
                "score": 1.0
            }
            2. 示例（正确的地址城市+准确的地点描述+没有进行具体地点猜测）：
            正确答案：
                地点名称：The Georgian House (National Trust for Scotland)
                地点地址：7 Charlotte Square, Edinburgh EH2 4DR, UK
            模型分析结论：这里可能是英国爱丁堡新城的乔治亚风格联排街区。
            {
                "analysis": "地点名称为 The Georgian House，经翻译为乔治亚风格房屋博物馆。地点地址描述部分：模型正确分析出了地点位于爱丁堡，得0.5分；地点名称部分：模型分析结论提到这里可能是英国爱丁堡新城的乔治亚风格联排街区，地点描述整体准确，但是具体地点描述不够准确，因此需要在0.1~0.24分之间酌情判分，基于模型给出了正确具体地点的描述，只是不够准确，因此给出0.2分的得分。总分：0.5+0.2=0.7分",
                "score": 0.7
            }
            3. 示例（正确的地址城市+准确的地点描述+1~2个错误但合理的猜测）：
            正确答案：
                地点名称：Summer Palace
                地点地址：19 Xin Jian Gong Men Lu, Hai Dian Qu, China, 100091
            模型分析结论：这个地方可能是北京的一个著名历史景点，如北海公园或故宫。
            {
                "analysis": "地点名称'Summer Palace'翻译为中文即为'颐和园'，地址位于中国北京市海淀区新建宫门路19号。地点地址描述部分：模型正确分析出了地点位于北京，得0.5分；地点名称部分：模型正确分析出了地点的详细分类（著名历史景点）并进行了合理猜测，模型猜测该地为'北海公园或故宫'，虽然两个都是错误猜测，但是比较合理，因此这一部分可以获得0.25分。总分：0.5+0.25=0.75分",
                "score": 0.75
            }
            4. 示例（错误的城市名和地点名称）
            正确答案：
                地点名称：Haidian Park
                地点地址：2 Xin Jian Gong Men Lu, Hai Dian Qu, China, 100086
            模型分析结论：这是位于中国河北省承德市的承德避暑山庄
            {
                "analysis": "地点名称'Haidian Park'翻译为中文为‘海淀公园’，位于中国北京市。地点地址描述部分：模型分析的地址为'中国河北省承德市'，国家预测正确，但省份和城市预测错误，因此获得0.1分；地点名称部分：模型没有对地点的进行详细分类，并且具体地点猜测为'承德避暑山庄'，猜测错误。因此该部分不得分(0.0分）。总分：0.1+0.0=0.1分",
                "score": 0.1
            }
            5. 示例（模糊的地点描述）
            正确答案：
                地点名称：海淀清真寺
                地点地址：Zhongguancun, Haidian District, China, 100086
            模型分析结论：可能是一个位于北京的具有历史和文化价值的旅游景点或古代建筑
            {
                "analysis": "地点地址描述部分：模型分析判断图片中的建筑位于北京，属于正确描述到城市一级，可以获得0.5分；地点名称部分：模型描述了建筑风格，但并未明确指出具体地点‘海淀清真寺’或者其精确地址，而只是以模糊的‘古代建筑’、‘旅游景点’来描述。根据要求，不得分（0.0分）。总分：0.5+0.0=0.5分",
                "score": 0.5
            }
            6. 示例（猜测地点数量大于两个）
            正确答案：
                地点名称：Yuanmingyuan Park
                地点地址：275X+67P圆明园 Haidian District, China, 100084
            模型分析结论：可能是北京的景点，如故宫、颐和园、北海公园、圆明园或者其他公园
            {
                "analysis": "地点名称'Yuanmingyuan Park'翻译为中文为'圆明园公园'，位于中国北京市。地点地址描述部分：模型正确分析出了地点位于北京市，属于正确描述到城市一级，可以获得0.5分；地点名称部分：模型分析结论中未进行地点分类描述，而且进行了多个地点的猜测，超过了两个猜测地点的数量上限，因此不得分（0.0分）。总分：0.5+0.0=0.5分",
                "score": 0.5
            }
            7. 示例（完全错误的地点地址但合理的地点详细分类）
            正确答案：
                地点名称：Eglise Notre Dame
                地点地址：2 Rue de l'Église, 92100 Boulogne-Billancourt, France
            模型分析结论：意大利某座历史悠久的基督教教堂
            {
                "analysis": "地点名称'Eglise Notre Dame'翻译为中文为'圣母院'，位于法国巴黎。地点地址描述部分：模型错误分析地点为意大利，在国家层级错误，因此不得分（0.0分）；地点名称部分：模型分析结论中有合理的地点详细分类描述（教堂），因此得分（0.2分）。总分：0.0+0.2=0.2分",
                "score": 0.2
            }
            8. 示例（部分正确的地点地址和错误的地点详细分类）
            正确答案：
                地点名称：上党门
                地点地址：中国山西省长治市潞州区天晚集北路54P4+VRX 邮政编码: 046099
            模型分析结论：中国山西省晋中市平遥县的平遥古城
            {
                "analysis": "地点地址描述部分：模型分析地点为中国山西省晋中市，在省级行政区层面正确，因此得分0.25分；地点名称部分：模型分析结论中为平遥古城，与正确地点名称上党门的地点详细分类不同，因此不得分（0.0分）。总分：0.25+0.0=0.25分",
                "score": 0.25
            }
            9. 示例（错误的地点地址和模糊的地点详细分类）
            正确答案：
                地点名称：神戸の壁・鎮魂と復興のベンチ
                地点地址：Japan, 〒651-0073 Hyogo, Kobe, Chuo Ward, Wakinohamakaigandōri, 1-chōme−5−２ 東館
            模型分析结论：中国某城市的纪念性地标
            {
                "analysis": "地点地址描述部分：模型分析地点为中国某城市，在国家层面错误，因此不得分（0.0分）；地点名称部分：模型分析结论中仅仅提及为某处地标，过于模糊，不得分（0.0分）。总分：0.0+0.0=0.0分",
                "score": 0.0
            }
            10. 示例（缺失地点地址和过于模糊的地点详细分类）
            正确答案：
                地点名称：东钱湖
                地点地址：中国宁波市鄞州区东钱湖
            模型分析结论：某个湖泊景区
            {
                "analysis": "地点地址描述部分：模型没有给出地点分析，因此不得分（0.0分）；地点名称部分：模型分析结论中仅仅提及为某个湖泊景区，对于地点名称的分析过于模糊，不得分（0.0分）。总分：0.0+0.0=0.0分",
                "score": 0.0
            }
            请你参考以上示例，对模型分析结论进行判断。输出时，请用以下JSON格式输出你的判断：
            {
                "analysis": "判断分析",
                "score": 评价分数
            }
            正确答案：
                地点名称: {place_name}
                地点地址: {address}
            模型分析结论: {predict}

    # Reward config referenced from `datasource_reward_config_mapping.mmsi`.
    # The judge prompt below asks for a strict binary 1.0 / 0.0 verdict on
    # mathematical equivalence between the response and the ground truth.
    # NOTE(review): `llm_api_key`, `llm_judge_url` and `llm_model` are plain
    # scalars here, while the neighbouring configs use single-item lists --
    # presumably the loader accepts both shapes; confirm before normalising.
    mmsi_verifier_config:
        verifier_type: "mmsi"
        sympy_tolerance: 0.9 # Note: this is 0.9, not 1.0
        strict_boxed_extraction: true # Answers are expected inside \boxed{} (comment inherited from the math config)
        # Optional: fall back to the LLM judge below for cases the rule-based
        # check cannot settle.
        enable_llm_judge_fallback: true
        # Placeholder credential: replace at deploy time, never commit a real key.
        llm_api_key: "your_api_keys"
        llm_judge_url: "https://open.bigmodel.cn/api/paas/v4/chat/completions"
        llm_model: glm-4-flash
        # Judge prompt. Placeholders: {question}, {predict}, {label}.
        # NOTE(review): the template also contains literal braces (set examples,
        # LaTeX), so it is presumably filled by plain string replacement rather
        # than str.format -- confirm against the verifier implementation.
        llm_judge_prompt_template: |
            ### Task
            You are a perfect math verifier. Your task is to evaluate if the generated `Response` is mathematically equivalent to the `Ground Truth` answer, strictly considering the context provided by the `Question`. Based *only* on the `Question` and the mathematical equivalence between the `Response` and `Ground Truth`, respond **only** with `1.0` or `0.0`. **Do not provide any explanations, reasoning, or thinking process.** Your output must be solely the numerical score.

            **Evaluation Criteria:**

            * Assign **`1.0`** if the `Response` is **mathematically equivalent** to the `Ground Truth` in the context of the `Question`.
            * Assign **`0.0`** if the `Response` is **incorrect** or **partially incorrect** in any way that changes its mathematical value or meaning compared to the `Ground Truth`, considering the `Question`.

            ---

            **Score `1.0`: Mathematically Equivalent**

            This means the `Response` represents the exact same mathematical value, expression, set, or concept as the `Ground Truth`, even if the notation or form differs. Equivalence includes:

            * **Numerical Formats:** `1.5` vs `3/2`; `0.5` vs `1/2`; `100` vs `1e2`; `4` vs `4.0`.
            * **Symbolic Representation:** `x*y` vs `xy`; `\frac{1}{2}` vs `1/2`; `pi` vs `π`; `sqrt(x)` vs `√x`; `\angle{ABC}` vs `∠ABC`; `\alpha` vs `α`; `\in` vs `∈`; `\times` vs `×`; `\cdot` vs `·`; `零` vs `0`; `²` vs `^2`; `³` vs `^3`; `\leqslant` vs `≤`; `\geqslant` vs `≥`; `\neq` vs `≠`.
            * **Algebraic Equivalence:** `(x+1)^2` vs `x^2+2x+1`; `2x+3` vs `3+2x`; `y = 2x + 3` vs `2x - y + 3 = 0`.
            * **Simplification:** Unsimplified but correct answers are equivalent to the simplified `Ground Truth` (e.g., `4/8` vs `1/2`; `x + x` vs `2x`).
            * **Units:** Presence or absence of units is acceptable if the numerical value is correct and the `Question` doesn't strictly require units (e.g., `15` vs `15 meters`).
            * **Notation:** Different but standard notations for the same concept (e.g., `72 degrees` vs `72°`; `>` vs `greater than`; interval notation `(5, \infty)` vs inequality `x>5`).
            * **Sets/Lists (Unordered):** If the `Ground Truth` represents an unordered collection (like a set of solutions), order differences are acceptable. Different separators (comma, semicolon, space) are fine. Both `Response` and `Ground Truth` must contain the *exact same elements* with the *exact same multiplicity*.
                * Example: `Response: {2, 1, 3}` vs `Ground Truth: {1, 2, 3}` → **1.0**
                * Example: `Response: 1; 3; 2` vs `Ground Truth: 1, 2, 3` → **1.0** (if context/`Question` implies order doesn't matter)

            ---

            **Score `0.0`: Incorrect or Partially Incorrect**

            This includes any deviation that makes the `Response` mathematically different from the `Ground Truth`, considering the context of the `Question`. Pay close attention to:

            * **Approximation Errors:** ANY approximation of an exact `Ground Truth` is incorrect, unless the `Ground Truth` itself is explicitly an approximation or specifies acceptable rounding per the `Question`.
                * ❌ `Response: 3.14` → `Ground Truth: π` → **0.0**
                * ❌ `Response: 1.414` → `Ground Truth: sqrt(2)` → **0.0**
                * ❌ `Response: 0.33` → `Ground Truth: 1/3` → **0.0**
                * ❌ `Response: 0.67` → `Ground Truth: 2/3` → **0.0**
            * **Precision/Rounding Errors:** Incorrect rounding or providing a rounded answer when an exact form is required by the `Question` or `Ground Truth`.
                * ❌ `Response: 1.7` → `Ground Truth: 1.73` (if specific precision required) → **0.0**
                * ❌ `Response: rounded to 4` → `Ground Truth: 4.2` → **0.0**
            * **Calculation Errors:** Basic arithmetic mistakes, sign errors, order of operation errors.
                * ❌ `Response: 5+3*2 = 16` → `Ground Truth: 11` → **0.0**
                * ❌ `Response: -5` → `Ground Truth: 5` → **0.0**
            * **Conceptual & Algebraic Errors:** Incorrect formula application, wrong variable used, errors in algebraic manipulation (factoring, expanding, solving).
                * ❌ `Response: x^2+y^2` → `Ground Truth: (x+y)^2` (i.e., `x^2+2xy+y^2`) → **0.0**
                * ❌ `Response: Area = length + width` → `Ground Truth: Area = length * width` → **0.0**
            * **Incompleteness / Partial Answers:** Missing solutions, missing components of a multi-part answer requested by the `Question`, providing only one condition when multiple are required.
                * ❌ `Response: x=2` → `Ground Truth: x=2 or x=-2` → **0.0**
                * ❌ `Response: The answer is 5` → `Ground Truth: 5 and 7` (if `Question` asked for two numbers) → **0.0**
                * ❌ `Response: x > 3` → `Ground Truth: 3 < x < 10` → **0.0**
            * **Extraneous Solutions:** Including incorrect solutions alongside correct ones.
                * ❌ `Response: x=1, x=-3` → `Ground Truth: x=1` (where x=-3 is an extraneous root) → **0.0**
            * **Incorrect Order (When Order Matters):** For coordinates, vectors, sequences, matrices, etc., where order is significant based on the `Question` or mathematical convention.
                * ❌ `Response: (3, 2)` → `Ground Truth: (2, 3)` → **0.0**
                * ❌ `Response: [1, 3, 2]` → `Ground Truth: [1, 2, 3]` (if `Question` implies order matters) → **0.0**
            * **Incorrect Set/List Elements:** Wrong numbers, missing elements, extra elements, or incorrect multiplicity.
                * ❌ `Response: {1, 3}` → `Ground Truth: {1, 2, 3}` → **0.0**
                * ❌ `Response: {1, 1, 2}` → `Ground Truth: {1, 2}` → **0.0**
                * ❌ `Response: {1, 2, 4}` → `Ground Truth: {1, 2, 3}` → **0.0**
            * **Fundamental Format Mismatches:** Providing an answer in a fundamentally different mathematical structure than required by the `Question`.
                * ❌ `Response: 5` (scalar) → `Ground Truth: (5, 0)` (vector/coordinate, if `Question` asked for a point) → **0.0**
                * ❌ `Response: positive` → `Ground Truth: x > 0` (if `Question` required a mathematical expression) → **0.0**
            * **Non-Mathematical or Avoidant Responses:** Responses that avoid answering the question by stating lack of information, ambiguity, or insufficiency—when the `Ground Truth` clearly indicates a mathematically valid answer.
                * ❌ `Response: Insufficient information` → `Ground Truth: Line KO is annotated as x` → **0.0**
                * ❌ `Response: Insufficient information` → `Ground Truth: Angle GIY is annotated as 77` → **0.0**
            * **Incorrect or Incomplete List of Elements (Entities, Labels, Points, etc.):** If the `Response` provides an incorrect set of named elements (e.g., points, angles, labels) compared to the `Ground Truth`, whether due to missing elements, wrong elements, or wrong grouping.
                * ❌ `Response: Q, B, A, L` → `Ground Truth: The point lying on circle F are B, L, Q` → **0.0**
                * ❌ `Response: R, E, M, X, L, W` → `Ground Truth: The point lying on circle A are E, M, R, W, X` → **0.0**
            * **Incorrect Object Referencing or Mismatched Entities:** If the response refers to the wrong geometric objects or mismatches the identity of mathematical entities, even if similar terms are used.
                * ❌ `Response: angle BKH` → `Ground Truth: Angle BNH is equal to angle NHB` → **0.0**

            ---

            ### Input:

            Question: {question}
            Response: {predict}
            Ground Truth: {label}

            ### Output:
            Respond **strictly and only** with `1.0` or `0.0`. Do **not** provide any explanations or justification.
        # Sampling settings for the judge call.
        llm_max_tokens: 2048
        llm_temperature: 0.1
        llm_top_p: 1.0

    math_verifier_config:
        verifier_type: "math"
        sympy_tolerance: 1.0e-6
        strict_boxed_extraction: true
        enable_llm_judge_fallback: true
        llm_api_key:
          - "your_api_keys"
        llm_judge_url:
          - "https://open.bigmodel.cn/api/paas/v4/chat/completions"
        llm_model:
          - glm-4-flash
        llm_max_tokens: 4096
        llm_temperature: 0.01
        llm_top_p: 0.01
        llm_judge_prompt_template: |
            You are an expert mathematical evaluator. Your task is to compare a generated 'Response' with a 'Ground Truth' answer for a given 'Question' and provide a score of 1.0 for a perfect match and 0.0 otherwise.

            **Important Context: You are a text-only model.**
            You will NOT see any images, even if the 'Question' mentions them (e.g., "Based on the image..."). Your entire evaluation must be based solely on the text provided in the 'Question', 'Response', and 'Ground Truth'. Do not attempt to infer or hallucinate image content.

            **Primary Scoring Principle**

            The 'Response' must be **definitive, unambiguous, and mathematically equivalent** to the 'Ground Truth'. It should be presented in a direct and standard mathematical format.

            **Detailed Scoring Rules**

            **Score 1.0 if the Response meets ALL of the following criteria:**
            1.  **Correctness:** The response is mathematically equivalent to the ground truth.
            2.  **Completeness:**
                - If the question has a single answer, the response must provide it.
                - If the question has multiple solutions that form a complete set (e.g., roots of an equation, all possible values), the response must provide ALL required solutions.
            3.  **Definitiveness:** The response must be a confident and final answer. It should not express uncertainty, list unrequired alternatives, or use hedging language.

            **Score 0.0 if the Response exhibits ANY of the following flaws:**
            - It is mathematically incorrect.
            - It is incomplete (e.g., provides only one of two required solutions).
            - It is an explanation of the method rather than the answer itself.
            - It is ambiguous or expresses uncertainty (see "Hacking Patterns" below).
            - It uses non-standard or obfuscated formats (e.g., Roman numerals, HTML entities).

            **Guidance on Common Scenarios & Hacking Patterns**

            This section helps you differentiate valid responses from "hacking" attempts.

            **1. Handling Multiple Answers: Complete Solution vs. Uncertain Hedging**

            - **VALID (Score 1.0):** The response lists all required parts of a complete solution set. This is common for solving equations.
            - **Example:**
                - Question: `Solve x² = 4.`
                - Ground Truth: `x = ±2`
                - Response: `x = 2 or x = -2` -> **Score 1.0** (This is a complete, definitive solution set).

            - **HACKING (Score 0.0):** The response lists alternatives because it is uncertain, or adds vague, catch-all phrases.
            - **Examples:**
                - Response: `10 (or 8)` vs. Ground Truth: `10` -> **Score 0.0** (Indicates uncertainty between two distinct possibilities).
                - Response: `平行四边形（或矩形、菱形等合理答案）` vs. Ground Truth: `平行四边形` -> **Score 0.0** (The phrase "等合理答案" is a clear hedging pattern).
                - Response: `正确（或错误）` vs. Ground Truth: `正确` -> **Score 0.0** (Provides mutually exclusive options for a single-answer question).

            **2. Handling Formatting & Representation**

            - **VALID (Score 1.0):** The response uses standard, mathematically equivalent formats.
            - **Examples:**
                - `3/2` vs. `1.5`
                - `x(x+1)` vs. `x^2+x`
                - `a ≤ 4` vs. `(-∞, 4]`

            - **HACKING (Score 0.0):** The response uses non-standard or obfuscated formats not typically used in the context of the question.
            - **Examples:**
                - `XV` (Roman) vs. `15` (Arabic)
                - `&#49;&#53;` (HTML entities) vs. `15` (note this you must give 0.0)

            **Method-Only Responses (Score 0.0):**
            - If the response only provides a description of the method, a formula, a hint, or a general explanation, **but does not directly output a final, definitive answer**, score 0.0.

            **Detailed Scoring Rules:**
            - **1.0**: The response is mathematically equivalent to the ground truth.
            - **0.0**: The response is incorrect, incomplete, ambiguous, explanatory in nature, uses encoding, or otherwise fails to convey the same mathematical meaning.
            **Method-Only Responses (Score 0.0):**
            - If the response only provides a description of the method, an unsimplified formula, a hint, or a general explanation, **but does not directly output a final, definitive answer**, score 0.0.
            **General Instructions on Equivalence (Score 1.0 if equivalent):**
            - **Algebraic Equivalence:** Trivial simplifications, term reordering, and mathematically equivalent forms (e.g., `2x+3` and `3+2x`, `x(x+1)` and `x^2+x`) are acceptable.
            - **Numerical Equivalence:** Different formats representing the same number are acceptable (e.g., `3/2` and `1.5`; `50%` and `0.5`; `72°` and `72`).
            - **Multiple Choice:** Either the correct option letter (e.g., 'A') or the content of the correct option is acceptable.
            - **Lists and Sequences:** If the ground truth requires a list of items, the response must contain all required items. Order may not matter unless specified.
            - **Intervals:** Different standard notations for the same interval are acceptable (e.g., `(-∞, 4]` and `a ≤ 4`).
            - **Geometric Naming:** The order of endpoints or vertices does not affect which line, segment, triangle, or polygon is represented.
                - Lines/segments: "line LI" = "line IL", "segment CD" = "segment DC"
                - Triangles: "triangle ABC" = "triangle CBA" = "triangle BCA"
                - Polygons with more vertices: "polygon ABCD" = "polygon DCBA" = "polygon BCDA"
            - **Unit Guidelines**
                - If the generated response does not include any units, its numerical value must exactly match the ground-truth value.
                (Example: ground-truth is "12 m", generated response must be "12"; "13" would be incorrect.)
                - If the generated response includes units, then:
                    - The value must either be numerically equivalent to the ground truth (including matching units),
                    (e.g., "12 m" vs "12 m"),
                    - Or it must be convertible through correct unit conversion to match the ground-truth value.
                    (e.g., "1200 cm" is acceptable for "12 m")
                - Incorrect or mismatched units (e.g., wrong type or wrong scale without proper conversion) will result in a score of 0.0.
                - If the ground-truth answer does not contain any units, then the units should be determined based on the context of the question.

            **Examples:**

            **Basic Math Equivalence Examples:**
            Response: 3/2
            Ground Truth: 1.5
            1.0

            Response: $2x+3$
            Ground Truth: $3+2x$
            1.0

            Response: $x^2+2x+1$
            Ground Truth: $1+2x+x^2$
            1.0

            Response: 72 degrees
            Ground Truth: 72
            1.0

            Response: [number]
            Ground Truth: 2
            0.0

            Response: 1.0, 2.0, 3.0
            Ground Truth: 1.0; 2.0; 3.0
            1.0

            **Multiple Choice Examples:**
            Question: What is the value of x in the equation 2x + 3 = 7?
            Choices:
            (A) 2
            (B) 3
            (C) 4
            (D) 5
            Response: A
            Ground Truth: 2
            1.0

            Question: What is the value of x in the equation 2x + 3 = 7?
            Choices:
            (A) 2
            (B) 3
            (C) 4
            (D) 5
            Response: 2
            Ground Truth: A
            1.0

            Question: What is the value of x in the equation 2x + 3 = 7?
            Choices:
            (A) 2
            (B) 3
            (C) 4
            (D) 5
            Response: 3
            Ground Truth: A
            0.0

            **Algebraic Expression Equivalence:**
            Response: 1/2x
            Ground Truth: x/2
            1.0

            Response: 1/2x
            Ground Truth: 1/(2x)
            0.0

            Response: \\frac{1}{1+\\frac{1}{x}}
            Ground Truth: \\frac{x}{x+1}
            1.0

            Response: \\dfrac{45 - 20\\sqrt{3}}{4} \\text{m}
            Ground Truth: 2.6 m
            1.0

            Response: x^2 + x
            Ground Truth: x(x+1)
            1.0

            Response: x + x
            Ground Truth: 2x
            1.0

            Response: x + 1
            Ground Truth: x^2 + 1
            0.0

            Response: x^2 + x + 1
            Ground Truth: (x+1)^2
            0.0

            **Units and Measurements:**
            Response: 12 m
            Ground Truth: 12
            1.0

            Response: 12
            Ground Truth: 12 m
            1.0

            Response: 12cm
            Ground Truth: 0.12 m
            1.0

            Response: 12
            Ground Truth: 12 kg
            1.0

            Response: 0.5
            Ground Truth: 50%
            1.0

            Response: 1 m/s
            Ground Truth: 3.6 km/h
            1.0

            Question: 1 kg 等于多少 g？
            Response: 1 kg
            Ground Truth: 1000 g
            0.0

            **Lists and Sequences:**
            Response: 1, 2
            Ground Truth: 1; 2; 3
            0.0

            Response: D, R, E, C, B, F
            Ground Truth: The point lying on circle A are B, C, F, E, R
            0.0

            Response: D, R, E, C, B, F
            Ground Truth: The point lying on circle A are B, D, C, E, R
            0.0

            Response: D, R, E, C, B, F
            Ground Truth: The point lying on circle A are B, C, E, R, D, F
            1.0

            **Method-only Examples (Always 0.0):**
            Response: \(\text{Use the Law of Sines to find the missing side length}\)
            Ground Truth: 6
            0.0

            Response: \boxed{\text{Apply the quadratic formula to solve for x in the equation}}\)
            Ground Truth: 2
            0.0

            Response: \(\text{Calculate the area using the formula A = πr²}\)
            Ground Truth: 25π
            0.0

            Response: [Use the Pythagorean theorem to find the hypotenuse]
            Ground Truth: 5
            0.0

            Response: \(\text{Find the derivative using the power rule}\)
            Ground Truth: 2x
            0.0

            Response: \(\text{Use the unit circle to find the cosine value}\)
            Ground Truth: 1/2
            0.0

            Question: 如图1，在矩形ABCD中，AB＜BC，点E为对角线AC上的一个动点，连接BE，DE，过E作EF⊥BC于F．设AE=x，图1中某条线段的长为y，若表示y与x的函数关系的图象大致如图2所示，则这条线段是图1中的______．（写出所有可能的答案）
            Response: DE
            Ground Truth: BE \\text{ (or } DE \\text{)}
            0.0

            **Chinese Language Equivalence:**
            Response: 三张桌子和四把椅子
            Ground Truth: 四把椅子和三张桌子
            1.0

            Response: 两个苹果和三个香蕉
            Ground Truth: 三个香蕉和两个苹果
            1.0

            Response: 五本书和两支笔
            Ground Truth: 两支笔和五本书
            1.0

            Response: 张三比李四高，李四比王五高
            Ground Truth: 李四比王五高，张三比李四高
            1.0

            Response: 小明比小红跑得快，小红比小华跑得快
            Ground Truth: 小红比小华跑得快，小明比小红跑得快
            1.0

            **Interval Notation Examples:**
            Response: (-∞, 4]
            Ground Truth: a ≤ 4
            1.0

            Response: (-2, 4]
            Ground Truth: a ≤ 4
            0.0

            Response: (-∞, -2] ∪ [2, ∞)
            Ground Truth: x ≤ -2 or x ≥ 2
            1.0

            Response: (-2, 8)
            Ground Truth: -2 < x < 8
            1.0

            Response: [-2, 8]
            Ground Truth: (-2, 8)
            0.0

            **LaTeX Fraction Examples:**
            Response: \dfrac{1}{2}
            Ground Truth: 0.5
            1.0

            Response: \dfrac{1}{2}
            Ground Truth: 0.25
            0.0

            **LaTeX Equation Examples:**
            Question: Solve sin(x) = 1
            Response: x = \frac{\pi}{2}
            Ground Truth: x = \frac{\pi}{2} + 2\pi n
            0.0

            Question: Solve sin(x) = 1, x in [0, 2\pi]
            Response: x = \frac{\pi}{2} + 2\pi n
            Ground Truth: x = \frac{\pi}{2}
            0.0

            Question: Solve sin(x) = 1
            Response: x = \frac{\pi}{2} + 2\pi n
            Ground Truth: x = \frac{\pi}{2} + 2\pi n
            1.0

            Question: Solve sin(x) = 1
            Response: x = \frac{\pi}{2} + \pi n
            Ground Truth: x = \frac{\pi}{2} + 2\pi n
            0.0

            Question: Solve sin(x) = 1
            Response: x = \frac{\pi}{2} + 4\pi n
            Ground Truth: x = \frac{\pi}{2} + 2\pi n
            0.0

            **Symbol and Function Evaluation Examples:**
            Response: √9
            Ground Truth: 3
            1.0

            Response: √4
            Ground Truth: 2
            1.0

            Response: cos(π)
            Ground Truth: -1
            1.0

            Question: What is 5 × 3?
            Response: &#49;&#53;
            Ground Truth: 15
            0.0

            Question: What is 2 + 2?
            Response: ４
            Ground Truth: 4
            1.0

            **LaTeX and Standard Form Equivalence (Score 1.0):**
            Response: \dfrac{1}{2}
            Ground Truth: 1/2
            1.0

            **Interval and Inequality Notation Equivalence (Score 1.0):**
            Response: a \leq 4
            Ground Truth: (-\infty, 4]
            1.0

            **Distinguishing Answer from Explanation (Score 0.0):**
            Response: \text{No, the y-intercept is not equal to the slope because the slope is undefined and there is no single y-intercept for a vertical line.}
            Ground Truth: No
            0.0

            **Final Instructions:**
            Respond with only 1.0 or 0.0. Do not include a rationale.
            Question: {question}
            Response: {predict}
            Ground Truth: {label}

    # Settings for the verifier whose verifier_type is "chemistry".
    chemistry_verifier_config:
        verifier_type: "chemistry"
        # Written with a decimal point and a signed exponent: this form is
        # resolved as a float by YAML 1.1 loaders such as PyYAML, whereas a
        # bare `1e-5` would be loaded as a string there.
        sympy_tolerance: 1.0e-5
        # NOTE(review): presumably restricts answer extraction to \boxed{...}
        # content - confirm against the verifier implementation.
        strict_boxed_extraction: true
        # NOTE(review): presumably enables the LLM judge configured by the
        # llm_* keys below as a fallback - confirm when it is triggered.
        enable_llm_judge_fallback: true
        # Placeholder value: supply real key(s) at deploy time and keep them
        # out of version control.
        llm_api_key:
          - "your_api_keys"
        # NOTE(review): llm_api_key, llm_judge_url and llm_model are each
        # one-element lists here; whether entries are paired by index is not
        # visible in this file - confirm.
        llm_judge_url:
          - "https://open.bigmodel.cn/api/paas/v4/chat/completions"
        llm_model:
          - glm-4-flash
        llm_judge_prompt_template: |
            You are an answer equivalence validator for scientific questions (chemistry, physics, mathematics). Your task is to decide if the Model Response and the Ground Truth are fully equivalent in meaning, calculation, concept, or substance, **strictly according to the context of the given Question and standard scientific conventions**.

            **ONLY output 1.0 or 0.0. DO NOT output any explanation, reasoning, or extra text under any circumstance.**

            ---

            ## Scoring Criteria

            ### Score 1.0 (Equivalent):

            Output 1.0 if, under the Question’s context and standard scientific interpretation, the Response and Ground Truth represent the same meaning or value even if they differ in:
            - Chemical formula notation: Answers are considered correct if they use different but equivalent chemical formula notations, such as subscripted numbers, plain numbers, or LaTeX format, as long as they clearly represent the same substance (e.g., "H₂SO₄" = "H2SO4" and MgF_2 = MgF₂)
            - For chemical equations, as long as the reactants and products are the same and the equation is balanced, differences in the order of terms or the left/right placement are acceptable and should be considered correct (e.g., "Zn + 2HCl → ZnCl₂ + H₂" and "2HCl + Zn → ZnCl₂ + H₂" are equivalent).
            - For chemical formulas, if the ground truth specifies a physical state symbol (such as (l), (g), (s), (aq)), a response without a state symbol is still considered correct. However, if the response includes a state symbol that does not match the ground truth (e.g., "CO₂(g)" vs. "CO₂(s)"), it should be considered incorrect. Otherwise, the inclusion or omission of state symbols is acceptable if the compound itself is the same.
            - **Unit strict requirement:** If the question explicitly requires a specific unit, the model's response must use that unit; if it uses a different unit (even if it is physically equivalent), it should be scored as 0.0.
            - **Units or format:** Correct standard conversions (e.g., 1 cal ≡ 4.184 J; 0.5 ≡ 50%; 100% ≡ 1; “m/s” ≡ “米每秒”).
            - **Expression style:** Mathematical or chemical notation differences as long as the expressions are mathematically/chemically equivalent (e.g., e^(x+y) ≡ e^x * e^y).
            - **Equivalent synonyms:** Standard/widely accepted names, common names, or symbols (e.g., “乙醇” ≡ “酒精”; “ascorbic acid” ≡ “vitamin C”).
            - **Formatting:** Trivial formatting, spacing, or order (when the order does not affect meaning, e.g., unordered sets, items).
            - **Chemical equations:** Minor notation differences (e.g. arrow types, state symbol omission if not critical).
            - **Fraction, decimal, percentage forms:** Numerically equivalent (e.g., 0.5; 50%; 1/2).
            - **General and specific correct answers:** If the response provides the general correct reason, it should be accepted as correct, even if it also includes specific substances or additional details.
            - **Avogadro's number (N_A):** Answers using N_A notation (6.02×10^{23}) are considered correct if numerically equivalent to the ground truth.
            - **Order questions with explicit instruction:** If the question specifies the order to be filled, the answer can be written directly in sequence (e.g., ABC) without using comparison symbols, as long as the order is correct.
            - **Element conservation in chemical equations:** Chemical equations must conserve the number of atoms for each element on both sides. If the equation does not maintain this balance, it should not be considered correct.
            - **Element balancing:** Element counts in chemical equations must be balanced. Both integer and fractional coefficients are allowed. Expressions like "H₂ + 0.5 O₂ → H₂O" and "2H₂ + O₂ = 2H₂O" (as well as "2Na + Cl₂ → 2NaCl" and "Na + 0.5Cl₂ = NaCl") are considered equivalent.
            - **Reaction conditions:** For chemical reactions, as long as key catalysts are mentioned, other reaction conditions can be more flexible (e.g., "2KClO3在加热和MnO2催化下分解", "2KClO3 → 2KCl + 3O2（MnO2作催化剂）" are equivalent).
            - **Arrow type:** Single-directional arrows (→), reversible arrows (⇌), and equal signs (=) are all treated as equivalent. Any of these arrows in the model response or ground truth should be considered interchangeable, and differences between them should not affect correctness.
            - **Science expression equivalence:**
                - **Scientific concept equivalence:** Use different but scientifically equivalent terms to express the same concept (e.g., "chemical stability" ≡ "cannot burn or support combustion").
                - **Detailed explanation:** Provide a more detailed correct explanation, including the core points of the standard answer and reasonable supplements.
                - **Flexible expression:** Use different language styles to express the same scientific fact (e.g., "physical change" ≡ "molecules themselves do not change, only their state changes").
            - **Chemical expression diversity:**
                - **Chemical reaction description:** Use different ways to describe the same chemical phenomenon (e.g., "bubble formation, solution color change" ≡ "chemical reaction occurs").
                - **Experimental phenomenon expression:** Describe the same experimental phenomenon from different angles, as long as it is scientifically accurate.
                - **Explanation level:** Explain from the molecular/atomic level ≡ explain from the macroscopic level, as long as it is scientifically correct.
            - **Practical supplement:** Provide useful information in actual operations, beyond the standard answer but scientifically correct.
            - **Condition constraints:** Clearly state the core experimental conditions or restrictions.
            - **Allow equivalent answer principle:** If the ground truth explicitly states that certain answers are all reasonable options (e.g., "or CuCl₂, Cu(NO₃)₂, CuO, etc."), then as long as the model answers any of these allowed options, it should be considered correct. Example: Ground Truth: CuSO₄ (or CuCl₂, Cu(NO₃)₂, CuO, etc.) Response: CuSO₄
            - **Open-ended questions:** For open-ended questions, such as those asking for properties or uses, as long as the model's response is consistent with the ground truth in terms of content and meaning, it should be considered correct, regardless of the quantity of information provided. Example: "原料丰富且成本低" is equivalent to "资源丰富、成本低廉、环保、高效".
            - **Set answer equivalence:** The response is correct if it covers the exact set of correct categories, regardless of order, format (options or full names), or language.
            - **Fill-in-the-blank questions:** For fill-in-the-blank questions, as long as all key points required by the ground truth are included anywhere in the response, the answer is considered correct, regardless of extra context or wording.
            - **Value range or mathematical expression:** For fill-in-the-blank questions that require a value range or mathematical expression, the response is considered correct as long as it is mathematically equivalent to the ground truth answer, regardless of whether it is expressed as an inequality, a phrase, or a formula (e.g., if the ground truth is "小于t₁", then "0℃ < t < t₁℃" is also correct).
            - **Numerical answers with units:** For fill-in-the-blank questions involving numerical answers with units, the response is considered correct as long as it is physically equivalent to the ground truth after appropriate unit conversion, even if the units or formats differ (e.g., "25 °C" and "298 K" are equivalent; "1 nm" and "10⁻⁹ m" are also equivalent).
            - **Ion or chemical species:** For answers involving ions or chemical species, as long as the response clearly and correctly describes the same ion using either its chemical symbol or a clear verbal description, it should be considered correct (e.g., "F⁻" and "带1个单位负电荷的氟离子" are considered equivalent).
            ### Score 0.0 (Not Equivalent):

            Output 0.0 in any of the following cases:

            - **Substance, value, formulae, or key information is incorrect, incomplete, or missing.**
            - **Mathematically/chemically different expressions or results.**
            - **Wrong or missing units or wrong conversion.**
            - **Extra or contradictory information.**
            - **Wrong order if the order is essential (e.g., ordered procedures).**
            - **Unbalanced chemical equations:** Chemical equations that are not properly balanced should not be considered correct.
            - **Additional incorrect information:** If the response contains the correct answer but also includes additional incorrect, irrelevant, or contradictory information alongside the correct content, it should be scored as 0.0. Note: Parenthetical explanations or alternative correct answers (e.g., "CuSO4 (or other reasonable copper compounds)") are acceptable, but listing multiple specific substances when only one is required (e.g., "CuSO4、NaOH" when ground truth is "CuSO4") is not equivalent.
            - **Unit or format:** For fill-in-the-blank questions that require a specific unit or format, the response must use the exact unit or expression specified in the ground truth; mathematically equivalent answers that use a different unit or notation are considered incorrect (e.g., if the ground truth is "15分钟", a response like "15~20 min" should be marked as incorrect).
            - **State symbol mismatch:** If the response includes a state symbol that does not match the ground truth (e.g., "CO₂(g)" vs. "CO₂(s)"), it should be considered incorrect. Otherwise, the inclusion or omission of state symbols is acceptable if the compound itself is the same.
            ---

            ### Examples:
            **Chemistry Formula Examples:**

            Response: 2KClO3在加热和MnO2催化下分解
            Ground Truth: 2KClO3 → 2KCl + 3O2（MnO2作催化剂）
            1.0

            Response: H2SO4(aq)
            Ground Truth: H2SO4(l)
            0.0

            Response: H₂O(l)
            Ground Truth: H₂O
            1.0

            Response: CO₂
            Ground Truth: CO₂(g)
            1.0

            Response: NaCl
            Ground Truth: NaCl(s)
            1.0

            Response: H₂SO₄
            Ground Truth: H₂SO₄(aq)
            1.0

            Response: 2\text{H}^+ + \text{CO}_3^{2-} = \text{CO}_2 \uparrow + \text{H}_2\text{O}
            Ground Truth: 2H^{+}+CO₃^{2-}═CO₂↑+H₂O
            1.0

            Response: 2NH_3 + CO2 \xlongequal CO(NH_2)_2 + H_2O
            Ground Truth: 2NH₃ + CO₂ → CO(NH₂)₂ + H₂O
            1.0

            Response: H2O ⇌ H+ + OH-
            Ground Truth: H2O → H+ + OH-
            1.0

            Response: H2O → H+ + OH-
            Ground Truth: H2O ⇌ H+ + OH-
            1.0

            Response: \mathrm{S} + \mathrm{O_2} → \mathrm{SO_2}
            Ground Truth: S + O₂ → SO₂
            1.0

            Response: 2H2 + O2 → 2H2O
            Ground Truth: 2H2 + O2 = 2H2O
            1.0

            Response: H2 + 0.5 O2 → H2O
            Ground Truth: 2H2 + O2 = 2H2O
            1.0

            Response: Zn + 2HCl → ZnCl2 + H2
            Ground Truth: 2HCl + Zn → ZnCl2 + H2
            1.0

            Question:发生反应的化学方程式_____。
            Response: \ce{CuSO4 + 2NaOH = Cu(OH)2↓ + Na2SO4} \quad \text{和} \quad \ce{Cu(OH)2 + H2SO4 = CuSO4 + 2H2O}
            Ground Truth: 2NaOH + H₂SO₄ → Na₂SO₄ + 2H₂O, 2NaOH + CuSO₄ → Cu(OH)₂↓ + Na₂SO₄
            0.0

            Question: 反应式为_________。
            Response: 2Na + Cl2 → 2NaCl
            Ground Truth: Na + Cl2 → NaCl
            0.0

            Response: \ce{Zn^{2+}}、\ce{Cu^{2+}}
            Ground Truth: Zn²⁺、Cu²⁺
            1.0

            Response: \ce{H_2}
            Ground Truth: H₂
            1.0

            Question: 化学式是_____．
            Response: MgF_2
            Ground Truth: MgF₂
            1.0

            Response: H₂SO₄
            Ground Truth: H2SO₄
            1.0

            Response: MgF_2
            Ground Truth: MgF₂
            1.0

            Question: 反应式为_________。
            Response: \ce{Ag^+ + e^- = Ag}
            Ground Truth: \(\rm{Ag^{+} + e^{-} \longrightarrow Ag}\)
            1.0

            Question: 反应式为_________。
            Response: 2\mathrm{Al} + 2\mathrm{NaOH} + 2\mathrm{H_2O} \xlongequal{} 2\mathrm{NaAlO_2} + 3\mathrm{H_2}↑
            Ground Truth: 2Al + 2NaOH + 2H₂O = 2NaAlO₂ + 3H₂↑
            1.0

            Question: 反应式为_________。
            Response: 2NH_3 + CO_2 \xlongequal CO(NH_2)_2 + H_2O
            Ground Truth: 2NH₃ + CO₂ → CO(NH₂)₂ + H₂O
            1.0

            Question: 反应式为_________。
            Response: CH4 + 2O2 → CO2 + 2H2O + heat
            Ground Truth: CH4 + 2O2 → CO2 + 2H2O + 光
            0.0

            **Chemistry Term Examples:**

            Response: 1 nM
            Ground Truth: 1e-9 mol/L
            1.0

            Response: 3N_A
            Ground Truth: 1.806×10²⁴
            1.0

            Response: 双置换反应
            Ground Truth: 复分解反应
            1.0

            Response: 复分解反应
            Ground Truth: 双置换反应
            1.0

            Response: 中和反应
            Ground Truth: 酸碱中和反应
            1.0

            Response: 酒精
            Ground Truth: 乙醇
            1.0

            Response: 石灰水
            Ground Truth: 氢氧化钙溶液
            1.0

            Response: 苯酚
            Ground Truth: 石炭酸
            1.0

            Response: 氢氧化钠
            Ground Truth: 强碱溶液
            0.0

            Response: 单置换反应
            Ground Truth: 置换反应
            1.0

            Response: 单质
            Ground Truth: 元素
            0.0

            Response: 元素
            Ground Truth: 单质
            0.0

            Response: 单置换反应
            Ground Truth: 复分解反应
            0.0

            Response: 分子
            Ground Truth: 极性分子
            0.0

            Response: 盐
            Ground Truth: 碱
            0.0

            Response: 25 °C
            Ground Truth: 298 K
            1.0

            Response: 1 nm
            Ground Truth: 10⁻⁹ m
            1.0


            **Chemistry Description Examples:**
            Response: 不能燃烧也不支持燃烧
            Ground Truth: 化学性质稳定
            1.0

            Response: 调节酸液滴入锌粒的速度，可以影响氢气的生成速率
            Ground Truth: 通过控制酸液加入速度，能调节锌和酸反应的快慢
            1.0

            Response: 烧杯底部有晶体析出；塑料块下沉
            Ground Truth: 试管内有硝酸钾晶体析出，塑料板浸入溶液中的体积增大
            1.0

            Response: 原料丰富且成本低
            Ground Truth: 资源丰富、成本低廉、环保、高效
            1.0

            Question: Zn2+、Cu2+能与NH3、H2O、Cl—等形成配位数为4的配合物。[Zn(H2O)4]SO4中不存在的化学键类型有_______ (填序号）。\na.配位键 b.金属键 c.共价键 d.氢键 e.离子键
            Response: {b,d}
            Ground Truth: {d,b}
            1.0

            Response: CuSO4
            Ground Truth: CuSO4（或CuCl2、Cu(NO3)2、CuO等合理答案均可）
            1.0

            Response: CuSO4、NaOH
            Ground Truth: CuSO4
            0.0

            Question: 数值是多少_____?(必须用atm为单位)
            Response: 101325 Pa
            Ground Truth: 1 atm
            0.0

            Question: 白磷相关的安全操作是______。
            Response: 白磷着火点低，水下切割可隔绝氧气并降温防止自燃
            Ground Truth: 防止白磷与空气接触
            1.0

            Response: "制取蒸馏水时分子种类不变，水电解时分子分裂为原子"
            Ground Truth: "制取蒸馏水是物理变化，水电解是化学变化"
            1.0

            Question: 此方法的优点是 _____。
            Response: 原料丰富且成本低
            Ground Truth: 资源丰富、成本低廉、环保、高效
            1.0

            Question: 防止铁锈蚀的一种方法是______．
            Response: 涂油
            Ground Truth: 涂油、刷漆、电镀等
            1.0

            Question:喝下汽水后会打嗝，原因是______。
            Response: 汽水中二氧化碳的溶解度随温度升高而减小，喝下后体温使汽水温度升高，二氧化碳气体逸出
            Ground Truth: 二氧化碳气体从胃中逸出
            1.0

            Question: 举一例节约用水的具体做法______。
            Response: 淘米水用来浇花
            Ground Truth: 一水多用
            1.0

            Response: <
            Ground Truth: ＜
            1.0

            Question: A和C溶液中溶质的质量分数的大小关系是_____．（用“＜”、“＞”或“=”表示）
            Response: ＞
            Ground Truth: A > C
            1.0

            Question: 观察到______现象
            Response: 导管内壁出现白色固体
            Ground Truth: 导管内有凝结的固体
            1.0

            Question: 水分子由_____构成．
            Response: 氢原子、氯原子和氧原子
            Ground Truth: 一个氢原子、一个氯原子和一个氧原子
            1.0

            Question: 含有的物质是_____。
            Response: NaCl
            Ground Truth: 氯化钠
            1.0

            Question: 由高到低的排列顺序为______________
            Response: BAC
            Ground Truth: B > A > C
            1.0

            Question:三物质溶解度由小到大的顺序为_____。
            Response: A < B < C
            Ground Truth: A、B、C
            1.0

            Question: 图中表示的是______；
            Response: 硒元素的相对原子质量
            Ground Truth: Se的相对原子质量
            1.0

            Question: 装置中棉花放置的位置_____。
            Response: 反应容器的导气管口
            Ground Truth: 导气管的入口处
            1.0