# Source: statgpt-backend/configurations/clients/sample/tools.yaml
# (epam/statgpt-backend @ 95ead448f73f074dba175bf1085090e490f85c2d)
# Fixed tool-call IDs. They are anchored here and aliased further down from
# `details.fake_call.tool_call_id` of the `available_datasets` and
# `available_terms` tool entries.
_available_datasets_call_id: &available_datasets_call_id "call_EBJJeaOMKeCzm8h378ubURQN"
_available_terms_tool_call_id: &availableTermsCallId "call_EBJJeaOMKeCzm8h378ubU003"
# Model settings for the `data_query` tool, one entry per `*ModelConfig` key;
# aliased below as `llmModels: *data-query-gpt-5-models`.
#
# All entries currently share the same five settings, so they are spelled out
# once (on the first entry) and pulled into the others with a `<<` merge.
# To tune a single entry, add the overriding key next to its merge line:
# explicitly written keys take precedence over merged ones.
_data_query_gpt_5_models: &data-query-gpt-5-models
  datasetsSelectionModelConfig: &gpt_5_stage_model_config
    deployment: "gpt-5.2-2025-12-11"
    reasoningEffort: "none"
    verbosity: "low"
    temperature: 1
    seed: null
  dimensionsSelectionModelConfig:
    <<: *gpt_5_stage_model_config
  indicatorsSelectionModelConfig:
    <<: *gpt_5_stage_model_config
  incompleteQueriesModelConfig:
    <<: *gpt_5_stage_model_config
  groupExpanderModelConfig:
    <<: *gpt_5_stage_model_config
  namedEntitiesModelConfig:
    <<: *gpt_5_stage_model_config
  queryNormalizationModelConfig:
    <<: *gpt_5_stage_model_config
  timePeriodModelConfig:
    <<: *gpt_5_stage_model_config
# Prompt pair used for relevance scoring; aliased below from
# `hybridSearchConfig.prompts.relevancyPrompts` of the `data_query` tool.
# NOTE(review): the doubled braces `{{ }}` look like escapes for a
# str.format-style template, with `{statement}` and `{items}` as the
# placeholders — confirm against the code that renders these prompts.
# Both values are literal scalars (`|-`): do not put comments inside them,
# anything written there becomes part of the prompt text.
_relevancy_prompts: &relevancy_prompts
  # Sets the role and asks for a bare JSON object of relevance scores.
  systemMessage: |-
    You are an expert in statistical indicators.
    As an output provide relevance score for each input case as JSON object {{"relevance": [{{"number": "score"}}]}}.
    Only JSON, no markdown

  # Scoring instructions (scores 0-3), followed by the statement and the items.
  userMessage: |-
    Instruction steps:
    - input consists of separate groups of items. Analyze all groups
    - each numbered item describes statistical indicator. Analyze all items
    - use the number from round brackets for the output reference and all other info as a source for relevance context.
      All items that have number in a round brackets should be present in output together with score
    - use all parent levels of an item in the relevance context
    - for each input indicator provide relevancy score as 0, 1, 2, 3 to the statement,
      where: 0 - irrelevant, 1 - somewhat relevant, 2 - highly relevant, 3 - extremely relevant
    - if the input indicator is at least somewhat relevant to the statement then the relevancy score can't be score 0
    - if the input statement has extra clarification description then it is mandatory to have relevant part in the candidate item to get score 3.
      This extra clarification description could be essential to distinguish relevant and extremely relevant items.
      Ignore punctuation like comma or squares in extra clarification

    Statement:
    {statement}

    Input:
    {items}
# Settings merged into each tool entry in this file via `<<: *shared_settings`.
_shared_settings: &shared_settings
  channels:
    - statgpt-sample

tools:

  # Available_Datasets: takes the shared settings via the merge key and reuses
  # the fixed tool-call ID anchored at the top of the file.
  - type: available_datasets
    <<: *shared_settings
    name: "Available_Datasets"
    # Folded scalar (`>-`): the lines of each paragraph are joined with single
    # spaces; a blank line starts a new paragraph.
    description: >-
      Provides a list of all available datasets onboarded to the `Query_Data`
      tool with metadata and some details about them. Details include the name
      and description of the dataset, the provider (agency), and the last update
      date.

      This tool does not accept any arguments.

      For questions about the availability of indicators you should refer to the
      `Query_Data` tool.
    details:
      fake_call:
        tool_call_id: *available_datasets_call_id
      version: full
      include_indicator_count: true

  # Dataset_Structure: takes the shared settings via the merge key.
  - type: dataset_structure
    <<: *shared_settings
    name: "Dataset_Structure"
    # Folded scalar (`>-`): lines at the base indent are joined with single
    # spaces and a blank line starts a new paragraph. The example bullets are
    # indented two extra spaces on purpose: more-indented lines keep their line
    # breaks, so the list stays one bullet per line. At the base indent the
    # three bullets would be folded into a single run-on line.
    description: >-
      Provides the structure of a specific dataset, including its dimensions, attributes, their types and sample
      values. This information helps to understand how data is organized within the dataset and what kind of
      information can be queried.

      Use that tool to get the structure of a dataset when it makes sense, e.g. when user asks about a specific
      dataset or when you need to understand what kind of data is available in a dataset to answer or when user
      specifically asks about the structure of a dataset.

      Example questions that this tool can help to answer:
        * What kind of data is available in the <X> dataset?
        * How can I query the <Y> dataset?
        * What are the dimensions and attributes of the <Z> dataset?

      For questions about data and the availability of indicators you should refer to the `Query_Data` tool.
    details:
      # Both names carry a `{dataset_id}` placeholder.
      # NOTE(review): presumably filled in by the consumer — confirm.
      stagesConfig:
        toolCallName: "Looking into structure of dataset '{dataset_id}'"
        toolResultName: "Structure of dataset '{dataset_id}'"

  # Query_Data: takes the shared settings via the merge key; its prompts and
  # model settings are aliases of the anchors defined at the top of the file.
  - type: data_query
    <<: *shared_settings
    name: "Query_Data"
    # Literal scalar (`|-`): line breaks and bullet indentation are kept exactly
    # as written, so comments must stay outside the text.
    description: |-
      Executing sdmx query on available datasets. Some datasets include forecasts for next years.
      Constructed query is used to fetch indicators from one of the datasets.
      Instructions:
      * Don't try to expand country groups or regions, it's done by tool itself
      * Summarize but DON'T REPHRASE time filter: "from now to 2030" must remain "from now to 2030"
      * Tool works best for single indicator query (e.g. GDP, inflation)
      * Tool supports star-queries for countries, e.g. "Give GDP for all countries"
      * Tool may ask clarifications if query is unclear. If query is modified accordingly, tool will provide
        requested data.
      * Good query example: "Please give me wage information of USA"
      * Bad query example: "What are the recent economy indicators for Baltic countries?"
        Reason: ambiguous query, specific indicators should be mentioned, e.g. GDP, unemployment rate

      Keep in mind: tool works best when detailed and concise query is provided
    details:
      allowAutoUpdate: true
      indexerVersion: "hybrid"
      indicatorSelectionVersion: "hybrid"
      hybridSearchConfig:
        namedEntitiesToRemove: ["Country/Reference area", "Counterpart area/country"]
        prompts:
          # Alias of the `_relevancy_prompts` mapping (systemMessage + userMessage).
          relevancyPrompts: *relevancy_prompts
      # Alias of the `_data_query_gpt_5_models` mapping.
      llmModels: *data-query-gpt-5-models
      # One entry per attachment kind. `enabledStr` values are quoted strings,
      # not YAML booleans; entries set to "False" carry no `name`.
      attachments:
        customTable:
          enabledStr: "True"
          name: "Data: {dataset_source_id}"
        plotlyGrid:
          # NOTE(review): `$env:{NAME|default}` presumably reads an environment
          # variable with `False` as the fallback — confirm with the config loader.
          enabledStr: "$env:{DIAL_SHOW_PLOTLY_GRID|False}"
          name: "Plotly Grid: {dataset_source_id}"
        csvFile:
          enabledStr: "False"
        plotlyGraphs:
          enabledStr: "False"
        jsonQuery:
          enabledStr: "True"
          name: "Query (JSON): {dataset_source_id}"
        pythonCode:
          enabledStr: "True"
          name: "Python Code: {dataset_source_id}"
        mergedPythonCode:
          enabledStr: "True"
          name: "Python Code"
      # `debugOnly` is true at this level and set back to false for each key
      # listed under `rules`.
      stagesConfig:
        debugOnly: true
        toolCallName: "Searching for data: {query}"
        toolResultName: "Data search result: {query}"
        rules:
          - key: constructing_data_query
            debugOnly: false
          - key: extracting_named_entities
            debugOnly: false
          - key: executing_data_query
            debugOnly: false
          - key: normalizing_query
            debugOnly: false
          - key: selecting_indicators
            debugOnly: false
      # Message templates. The two `>-` values are single-line strings; the two
      # `|-` values keep their line breaks.
      messages:
        noDataForCountry: >-
          No data was found for {country_details}. Try to change the query.
        noData: >-
          No data was found for the provided query. Try to change the query.
        dataQueryExecutedAgentOnly: |-
          If the executed query is only remotely related to the user query, you must mention that fact to the user,
          to not mislead them. It is recommended to search in other sources using tools available.
          Result of the executed query is shown to the user in the table attachment.
        multipleDatasetsAgentOnly: |-
          If the executed query is only remotely related to the user query, it is recommended to mention that fact to
          the user, instead of suggesting user to choose one of the datasets. Other tools might be used to search for
          the data.


  # Available_Terms: takes the shared settings via the merge key and reuses the
  # fixed tool-call ID anchored at the top of the file.
  - type: available_terms
    <<: *shared_settings
    name: "Available_Terms"
    # Literal scalar (`|-`): line breaks and bullet indentation are kept exactly
    # as written. The text refers to the "Term_Definitions" tool by name, so
    # keep it in step with that entry's `name`.
    description: |-
      Use this tool to:
      * Retrieve a comprehensive list of all terms currently available in the glossary.
      * Confirm whether a specific term exists in the glossary.

      Detailed Guidance:
      * The list of available glossary terms provided by this tool is complete; there are no additional terms beyond
        what is returned.
      * Whenever referring to or explaining to user any glossary terms you must obtain the definitions of any listed
        terms using the "Term_Definitions" tool.
    details:
      fake_call:
        # Alias of the `_available_terms_tool_call_id` scalar.
        tool_call_id: *availableTermsCallId

  # Term_Definitions: takes the shared settings via the merge key.
  - type: term_definitions
    <<: *shared_settings
    name: "Term_Definitions"
    # Literal scalar (`|-`): line breaks are kept exactly as written.
    # NOTE(review): "up to 10" in the text presumably mirrors `details.limit`
    # below — keep the two in sync when changing either.
    description: |-
      Use this tool to:
      * Retrieve definitions for up to 10 requested terms that appear in the glossary.
      * Consult the "Available_Terms" tool if you are unsure which terms are in the glossary.

      Detailed Guidance:
      * Confirm availability of terms using the "Available_Terms" tool first.
    details:
      # Both names carry a `{terms}` placeholder.
      stagesConfig:
        toolCallName: "Searching in Glossary of Terms: {terms}"
        toolResultName: "Glossary search result: {terms}"
      limit: 10