insights/frontend/lib/chat/prompts/auditor.ts at 5932c7b687f9e7764b1dcd4d630f42b2abd3554d · linuxfoundation/insights · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
// Copyright (c) 2025 The Linux Foundation and each contributor.
// SPDX-License-Identifier: MIT
import type { DataSummary } from '../utils/data-summary';

/**
 * Builds the prompt for the Auditor agent, which decides whether the
 * retrieved data can answer the user's question.
 *
 * The prompt embeds the original and reformulated questions, a summary of the
 * data (row count, column names, top rows as JSON, per-column statistics) and,
 * on retries, the feedback produced by the previous audit.
 *
 * @param originalQuestion - The question exactly as the user asked it.
 * @param reformulatedQuestion - The router's enhanced interpretation of the question.
 * @param dataSummary - Summary of the retrieved data; `columnStats`, `topRows`,
 *   `rowCount` and `columns` are read.
 * @param attemptNumber - Zero-based attempt index; any value above 0 adds the
 *   retry section (displayed as attempt `attemptNumber + 1`) and the retry note.
 * @param previousFeedback - Feedback from the previous audit, shown in the retry
 *   section. A placeholder is used when it is missing or blank.
 * @returns The full prompt text.
 */
export const auditorPrompt = (
  originalQuestion: string,
  reformulatedQuestion: string,
  dataSummary: DataSummary,
  attemptNumber: number,
  previousFeedback?: string,
): string => {
  // One bullet per column, followed by indented sub-bullets that depend on the column type.
  const statsFormatted = Object.entries(dataSummary.columnStats)
    .map(([col, stats]) => {
      const lines = [`- ${col} (${stats.type}):`];

      if (stats.nullPercentage > 0) {
        lines.push(`  • ${stats.nullPercentage}% null values`);
      }

      if (stats.type === 'numeric') {
        lines.push(`  • Range: ${stats.min} to ${stats.max}`);
        lines.push(`  • Average: ${stats.avg}`);
        if (stats.hasAllZeros) lines.push(`  • ⚠️ All values are zero`);
      }

      if (stats.type === 'date') {
        lines.push(`  • Date range: ${stats.dateRange}`);
        lines.push(`  • ${stats.distinctCount} distinct dates`);
      }

      if (stats.type === 'string') {
        lines.push(`  • ${stats.distinctCount} distinct values`);
      }

      return lines.join('\n');
    })
    .join('\n');

  const dataSection = `\n## TOP ROWS (first ${dataSummary.topRows.length} of ${dataSummary.rowCount} — use these actual values in your summary)\n\`\`\`json\n${JSON.stringify(dataSummary.topRows, null, 2)}\n\`\`\``;

  const isRetry = attemptNumber > 0;

  // `||` (not `??`) on purpose: a blank feedback string is as uninformative as a
  // missing one, and interpolating `undefined` would print the word "undefined".
  const feedbackText = previousFeedback?.trim() || '(none provided)';

  const retrySection = isRetry
    ? `
## RETRY ATTEMPT #${attemptNumber + 1}
Previous feedback: ${feedbackText}
⚠️ The router already tried once. Check if the issue was addressed.
`
    : '';

  // The whole bullet (including its "- " marker) is conditional so that a first
  // attempt does not end the notes list with an empty bullet.
  const retryNote = isRetry
    ? '\n- **This is a RETRY:** Be slightly more lenient unless clearly broken'
    : '';

  return `You are an Auditor agent that validates whether retrieved data can answer the user's question.

## USER'S QUESTION
${originalQuestion}

## ENHANCED QUERY (Router's Interpretation)
${reformulatedQuestion}

## DATA SUMMARY
**Total Rows:** ${dataSummary.rowCount}
**Columns:** ${dataSummary.columns.join(', ')}
**Top Rows:** ${dataSection}

**Column Statistics:**
${statsFormatted}

${retrySection}

---

## YOUR TASK

Make a **BINARY decision**: Can this data answer the user's question?

### Validation Checklist

**1. Column Coverage**
- Are all required columns present?
- Do column names semantically match the question?

**2. Data Quality**
- Row count > 0?
- Key columns not 100% null?
- Numeric metrics not all zeros?

**3. Time Dimension (if applicable)**
- If question asks for time-series data (e.g., "daily activity", "monthly trends"), verify:
  - Date column exists in output
  - Date range matches question timeframe
  - Enough distinct dates for the requested granularity
- If question only filters by time (e.g., "top 5 orgs last quarter"), date column in output is NOT required
  - Time filtering happens in query, final result can be a simple list

**4. Granularity**
- If question asks "by company", is there a company/organization column?
- If question asks for breakdown, are grouping columns present?

**5. Metric Presence (context-dependent)**
- **Requires numeric metric** if question asks for:
  - Aggregations: "count", "total", "average", "sum"
  - Trends: "growth", "change", "increase"
  - Rankings: "top", "most", "highest", "bottom", "least", "lowest"
- **Does NOT require metric** for pure listing questions:
  - "which", "list all", "show", "enumerate", "what are the"
  - Example: "Which days had no activity" only needs date/repository columns, not an activity count column

### Decision Criteria

✅ **is_valid = true** IF:
- All required columns exist (even if imperfect names)
- Data has > 0 rows with non-null values
- For time-series questions: date column present and range matches
- For time-filtered questions: date column NOT required in output
- Granularity is appropriate (right grouping columns)
- For aggregation/ranking questions: relevant metric present
- For listing questions: metric NOT required

❌ **is_valid = false** IF:
- Missing critical columns (e.g., no metric for "show activity")
- 0 rows or all nulls/zeros in key columns
- Wrong time period (e.g., 2023 data for "2024" question)
- Wrong aggregation (e.g., monthly when daily requested)
- Columns completely irrelevant to question

### Output Requirements

**IF is_valid = true:**
- Set \`is_valid: true\`
- Write a conversational \`summary\` (1-3 sentences) for the user:
  - If possible, write a summary that directly answers the user's question.
    - You SHOULD reference actual values from the TOP ROWS data above — name the specific country, person, organization, etc. NEVER guess, infer, or use external knowledge.
    - Unknown / null / placeholder entries: If a top row has a null, empty, "Unknown", or placeholder value (e.g. country code "XX", name "null") in a label column, do NOT treat it as a real result. Explain it represents unattributed or anonymous data (e.g. "contributions where the country of origin is unknown"), then identify and state the top row with a real value as the actual answer.
      - ✅ Example: "The top contributor country is the United States with 670 contributors. Note: 5,882 contributions have no country attribution and are listed separately as 'Unknown'."
  - Write summary in plain text, not markdown.

**IF is_valid = false:**
- Set \`is_valid: false\`
- Write \`feedback_to_router\` with SPECIFIC fixes:
  - What column is missing? (e.g., "Need commit_count or activity metric")
  - What's wrong with data? (e.g., "Date range is 2023, but question asks for 2024")
  - What should router try instead? (e.g., "Use active_contributors_by_date pipe instead")
  - Be direct and actionable

### Important Notes
- **Question type determines requirements:**
  - Time-series questions ("daily commits", "monthly trend") → need date column in output
  - Time-filtered questions ("top 5 last month") → date column NOT needed in output
  - Listing questions ("which repos", "list all") → metric NOT needed
  - Aggregation/ranking questions → metric IS needed
- **Statistics are your friend:** Use min/max/avg/range to validate without seeing raw data
- **Date ranges:** Only validate if question asks for time-series data, not just time-filtered results
- **Distinct counts matter:** Low distinctCount on grouping columns = problem
- **Don't be overly strict:** If data can partially answer, mark valid${retryNote}

---

## REASONING FORMAT
Explain your decision in 2-3 sentences:
1. What you validated in the statistics
2. Whether it matches the question requirements
3. Your final decision
`;
};