-
Notifications
You must be signed in to change notification settings - Fork 41
Expand file tree
/
Copy pathdata-summary.ts
More file actions
139 lines (124 loc) · 3.85 KB
/
data-summary.ts
File metadata and controls
139 lines (124 loc) · 3.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
// Copyright (c) 2025 The Linux Foundation and each contributor.
// SPDX-License-Identifier: MIT
/**
 * Result of `generateDataSummary`: the overall shape of a dataset, statistics
 * for each column, and a small sample of its raw rows.
 */
export interface DataSummary {
/** Number of rows in the input array (0 for empty or missing input). */
rowCount: number;
/** Column names, taken from the keys of the first row only. */
columns: string[];
/** Statistics for each entry of `columns`, keyed by column name. */
columnStats: Record<string, ColumnStats>;
/** The first rows of the input (at most three), passed through as-is. */
topRows: Record<string, unknown>[];
}
/**
 * Statistics for a single column. `type`, `nullCount` and `nullPercentage` are
 * always present; which of the optional fields are filled in depends on `type`.
 */
export interface ColumnStats {
/**
 * Column kind, inferred from the first non-null value of the column.
 * A column with no non-null values at all is reported as 'string'.
 */
type: 'numeric' | 'string' | 'date' | 'boolean';
/** Number of rows whose value is null, undefined, or the empty string. */
nullCount: number;
/** `nullCount` as a percentage of all rows, rounded to the nearest integer. */
nullPercentage: number;
// Numeric stats (set only when type is 'numeric')
/** Smallest value in the column. */
min?: number;
/** Largest value in the column. */
max?: number;
/** Sum of the column's values. */
sum?: number;
/** Arithmetic mean of the column's values, rounded to two decimals. */
avg?: number;
/** True when every value in the column is exactly 0. */
hasAllZeros?: boolean;
// String/categorical stats
/** Number of distinct non-null values; set for 'string', 'date' and 'boolean' columns. */
distinctCount?: number;
/** Length of the shortest value after conversion to string; 'string' columns only. */
minLength?: number;
/** Length of the longest value after conversion to string; 'string' columns only. */
maxLength?: number;
// Date stats (set only when type is 'date')
/** Earliest date in the column, as the date part (before the 'T') of its ISO string. */
earliestDate?: string;
/** Latest date in the column, in the same format as `earliestDate`. */
latestDate?: string;
/** Human-readable range in the form "<earliestDate> to <latestDate>". */
dateRange?: string;
}
/**
 * Generate a statistical summary of a dataset.
 *
 * The summary is meant to be compact enough to send to an LLM in place of the
 * full data (the original author's estimate is ~1500-2000 tokens for a typical
 * dataset): per-column statistics plus the first three raw rows.
 *
 * How columns are analysed:
 * - Column names come from the keys of the first row only.
 * - null, undefined and '' all count as "null" values.
 * - A column's type is inferred from its first non-null value:
 *   number -> 'numeric', boolean -> 'boolean', a string accepted by
 *   `Date.parse` -> 'date', anything else -> 'string'.
 * - A column with no non-null values is reported as 'string' with only the
 *   null statistics filled in.
 * - In a 'numeric' column, values that are not of type number are left out of
 *   min/max/sum/avg; in a 'date' column, values that do not parse as a date
 *   are left out of the earliest/latest computation.
 *
 * @param data - Array of data rows
 * @returns Statistical summary optimized for auditor validation; an empty
 *   summary (rowCount 0) when `data` is empty or missing
 */
export function generateDataSummary<T extends Record<string, unknown>>(data: T[]): DataSummary {
  if (!data || data.length === 0) {
    return {
      rowCount: 0,
      columns: [],
      columnStats: {},
      topRows: [],
    };
  }

  // Fold-based min/max. `Math.min(...values)` passes every element as a call
  // argument and can exceed the engine's argument limit on large datasets.
  // Like Math.min/Math.max, these propagate NaN and return +/-Infinity for [].
  const minOf = (values: number[]): number => values.reduce((lo, v) => Math.min(lo, v), Infinity);
  const maxOf = (values: number[]): number => values.reduce((hi, v) => Math.max(hi, v), -Infinity);
  // Date part (everything before the 'T') of the ISO-8601 representation.
  const toIsoDay = (timestamp: number): string => new Date(timestamp).toISOString().split('T')[0];

  const columns = Object.keys(data[0] || {});
  const columnStats: Record<string, ColumnStats> = {};

  for (const col of columns) {
    const nonNullValues = data
      .map((row) => row[col])
      .filter((v) => v !== null && v !== undefined && v !== '');
    const nullCount = data.length - nonNullValues.length;
    const nullPercentage = Math.round((nullCount / data.length) * 100);

    if (nonNullValues.length === 0) {
      // All nulls - mark as string type with full null percentage
      columnStats[col] = {
        type: 'string',
        nullCount,
        nullPercentage,
      };
      continue;
    }

    // The first non-null value decides how the whole column is treated.
    const firstValue = nonNullValues[0];
    let stats: ColumnStats;

    if (typeof firstValue === 'number') {
      // Numeric columns. Narrow with a real type guard: in a mixed column a
      // stray string would otherwise turn `sum` into a concatenated string.
      const numericValues = nonNullValues.filter((v): v is number => typeof v === 'number');
      const sum = numericValues.reduce((a, b) => a + b, 0);
      stats = {
        type: 'numeric',
        nullCount,
        nullPercentage,
        min: minOf(numericValues),
        max: maxOf(numericValues),
        sum,
        avg: Math.round((sum / numericValues.length) * 100) / 100, // Round to 2 decimals
        hasAllZeros: numericValues.every((v) => v === 0),
      };
    } else if (typeof firstValue === 'boolean') {
      // Boolean columns
      stats = {
        type: 'boolean',
        nullCount,
        nullPercentage,
        distinctCount: new Set(nonNullValues).size,
      };
    } else if (typeof firstValue === 'string' && !Number.isNaN(Date.parse(firstValue))) {
      // Date columns (detect date strings).
      // NOTE(review): Date.parse also accepts implementation-defined, non-ISO
      // formats, so some non-date strings may be classified as dates.
      const timestamps = nonNullValues
        .map((v) => new Date(v as string).getTime())
        // Drop values that do not parse: a single NaN would poison min/max and
        // make toISOString() throw "Invalid time value". The first value is
        // known to parse, so at least one timestamp always remains.
        .filter((t) => !Number.isNaN(t));
      const earliestDate = toIsoDay(minOf(timestamps));
      const latestDate = toIsoDay(maxOf(timestamps));
      stats = {
        type: 'date',
        nullCount,
        nullPercentage,
        distinctCount: new Set(nonNullValues).size,
        earliestDate,
        latestDate,
        dateRange: `${earliestDate} to ${latestDate}`,
      };
    } else {
      // String columns (also the fallback for any other value type)
      const stringValues = nonNullValues.map((v) => String(v));
      const lengths = stringValues.map((s) => s.length);
      stats = {
        type: 'string',
        nullCount,
        nullPercentage,
        distinctCount: new Set(stringValues).size,
        minLength: minOf(lengths),
        maxLength: maxOf(lengths),
      };
    }

    columnStats[col] = stats;
  }

  return {
    rowCount: data.length,
    columns,
    columnStats,
    topRows: data.slice(0, 3),
  };
}