fix: deprecate dataset queries for schema.gov.it. feat: add logging for data size

mfortini · mfortini · commit 73ccb2f0f8e1 · 2026-03-13T11:54:37.000+01:00
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -53,6 +53,8 @@ Single-file implementation (`src/index.ts`) using:
 - `explore_dataset` - Get dataset details and distributions
 - `preview_distribution` - Download and preview first rows of CSV/JSON data
 
+Note: keep these tools available, but do not treat them as the default entry point for `schema.gov.it`. In this catalog, many DCAT-AP_IT datasets are semantic assets such as ontologies, controlled vocabularies, and their distributions. For `schema.gov.it`, prefer ontology, vocabulary, class/property, and SPARQL tools first; dataset tools are more useful for external catalogs or specific DCAT-AP_IT inspection tasks.
+
 **Intelligent Tools:**
 - `search_concepts` - Fuzzy keyword search (use when URI is unknown)
 - `inspect_concept` - Deep profiling (definition, hierarchy, usage, relations)
diff --git a/README.md b/README.md
@@ -38,6 +38,8 @@ Il server espone **34 strumenti** organizzati in 11 categorie:
 *   `explore_dataset`: Mostra dettagli e distribuzioni di un dataset.
 *   `preview_distribution`: Scarica e mostra le prime righe di una distribuzione CSV/JSON.
 
+Nota: questi tool restano utili, ma su `schema.gov.it` sono spesso secondari. Il catalogo contiene soprattutto asset semantici pubblicati come dataset DCAT-AP_IT, ad esempio ontologie, vocabolari controllati e relative distribuzioni. Per esplorare `schema.gov.it` conviene di norma partire da ontologie, vocabolari, classi, proprietà e query SPARQL; i tool dataset sono più indicati per cataloghi esterni o per casi DCAT-AP_IT specifici.
+
 ### 6. Intelligence (Avanzato)
 *   `search_concepts`: **Ricerca fuzzy**. Trova concetti (es. "Scuola") senza conoscere l'URI esatto.
 *   `inspect_concept`: **Deep Dive**. Ottiene in un colpo solo definizione, gerarchia, usage stats e vicini di un concetto.
@@ -239,7 +241,7 @@ Una volta configurato, puoi chiedere all'agente cose come:
 *   **Compressione Token**: Le liste lunghe (> 5 item) vengono restituite in formato tabellare compatto per risparmiare token.
 *   **Input Sanitizzati**: Tutti i parametri utente sono sanitizzati per prevenire SPARQL injection.
 *   **Ontologia Locale**: I tool del gruppo 9 (`inspect_local_ontology`, `query_local_ontology`, `compare_local_with_remote`) usano [oxigraph](https://github.com/oxigraph/oxigraph) (WASM) per caricare file RDF/OWL locali in memoria ed eseguire SPARQL. I file vengono cachati dopo il primo caricamento; le query successive sullo stesso file non rileggono il disco. Formati supportati: `.ttl`, `.owl`, `.rdf`, `.nt`, `.jsonld`.
-*   **Logging**: Tutte le chiamate vengono loggate in `logs/usage_log.jsonl` per analisi e miglioramento continuo.
+*   **Logging**: Tutte le chiamate vengono loggate in `logs/usage_log.jsonl` per analisi e miglioramento continuo. Ogni entry include argomenti e riepilogo; quando i dati corrispondenti sono disponibili include anche `source_data_metrics` (dati ricevuti dalla sorgente) e `ai_data_metrics` (payload finale passato al modello). Si tratta di metriche quantitative: numero di caratteri della serializzazione JSON (`chars`) e, quando rilevabile, righe (`rows`), variabili/colonne SPARQL (`vars`), chiavi (`keys`) o numero di elementi (`items`).
 *   **Trasporto**: Il server supporta sia `stdio` (default, per uso locale) che HTTP/SSE (via `MCP_TRANSPORT=sse`, per uso remoto/Docker).
 
 ## Licenza
diff --git a/src/executor.ts b/src/executor.ts
@@ -15,13 +15,19 @@ const LOG_FILE = join(LOG_DIR, "usage_log.jsonl");
 export async function logUsage(
   toolName: string,
   args: Record<string, unknown>,
-  resultSummary: string
+  resultSummary: string,
+  options?: {
+    sourceData?: unknown;
+    aiData?: unknown;
+  }
 ): Promise<void> {
   const entry = {
     timestamp: new Date().toISOString(),
     tool: toolName,
     args,
     summary: resultSummary,
+    source_data_metrics: buildDataMetrics(options?.sourceData),
+    ai_data_metrics: buildDataMetrics(options?.aiData),
   };
   try {
     await appendFile(LOG_FILE, JSON.stringify(entry) + "\n");
@@ -51,6 +57,46 @@ export function truncateResult(text: string): { text: string; truncated: boolean
   return { text: truncated, truncated: true };
 }
 
+function buildDataMetrics(value: unknown): Record<string, unknown> | undefined {
+  if (value === undefined) {
+    return undefined;
+  }
+
+  try {
+    const json = JSON.stringify(value);
+    const metrics: Record<string, unknown> = {
+      chars: json.length,
+    };
+
+    if (Array.isArray(value)) {
+      metrics.kind = "array";
+      metrics.items = value.length;
+    } else if (value && typeof value === "object") {
+      metrics.kind = "object";
+      metrics.keys = Object.keys(value as Record<string, unknown>).length;
+
+      const sparqlLike = value as {
+        head?: { vars?: unknown[] };
+        results?: { bindings?: unknown[] };
+      };
+      if (Array.isArray(sparqlLike.head?.vars)) {
+        metrics.vars = sparqlLike.head.vars.length;
+      }
+      if (Array.isArray(sparqlLike.results?.bindings)) {
+        metrics.rows = sparqlLike.results.bindings.length;
+      }
+    } else {
+      metrics.kind = typeof value;
+    }
+
+    return metrics;
+  } catch (error: unknown) {
+    return {
+      _serialization_error: getErrorMessage(error),
+    };
+  }
+}
+
 /**
  * Central helper for executing tools with consistent error handling, logging, and truncation.
  * @param toolName - Name of the tool for logging
@@ -68,11 +114,13 @@ export async function executeTool<T>(
     console.error(`[Tool] ${toolName} completed: ${result.success ? 'SUCCESS' : 'FAILURE'}`);
 
     if (!result.success) {
-      await logUsage(toolName, args, `Error: ${result.error}`);
       let errorText = `Error: ${result.error}`;
       if (result.suggestion) {
         errorText += `\nSuggestion: ${result.suggestion}`;
       }
+      await logUsage(toolName, args, `Error: ${result.error}`, {
+        aiData: { error: result.error, suggestion: result.suggestion },
+      });
       return {
         content: [{ type: "text", text: errorText }],
         isError: true,
@@ -83,7 +131,22 @@ export async function executeTool<T>(
     const { text, truncated } = truncateResult(jsonText);
 
     const rowInfo = result.rowCount !== undefined ? `, ${result.rowCount} rows` : "";
-    await logUsage(toolName, args, `Success${rowInfo}${truncated ? " (truncated)" : ""}`);
+    const aiData = truncated
+      ? {
+          _truncated: true,
+          _message: `Result exceeded ${CHARACTER_LIMIT} characters and was truncated`,
+          chars_before_truncation: jsonText.length,
+          chars_sent_to_ai: text.length,
+        }
+      : {
+          chars_sent_to_ai: text.length,
+          payload: result.data,
+        };
+
+    await logUsage(toolName, args, `Success${rowInfo}${truncated ? " (truncated)" : ""}`, {
+      sourceData: result.sourceData,
+      aiData,
+    });
 
     if (truncated) {
       return {
@@ -104,7 +167,9 @@ export async function executeTool<T>(
   } catch (error: unknown) {
     const message = getErrorMessage(error);
     console.error(`[Tool] ${toolName} error:`, message);
-    await logUsage(toolName, args, `Error: ${message}`);
+    await logUsage(toolName, args, `Error: ${message}`, {
+      aiData: { error: message },
+    });
     return {
       content: [{ type: "text", text: `Error: ${message}` }],
       isError: true,
@@ -125,6 +190,6 @@ export async function executeSparqlTool(
     const result = await executeSparql(query);
     const rowCount = result.results?.bindings?.length ?? 0;
     const compressed = compressSparqlResult(result);
-    return { success: true, data: compressed, rowCount };
+    return { success: true, data: compressed, rowCount, sourceData: result };
   });
 }
diff --git a/src/tools/group-e.ts b/src/tools/group-e.ts
@@ -15,6 +15,8 @@ server.registerTool(
     title: "List Datasets",
     description: `List available Datasets (dcatapit:Dataset) in the catalog.
 
+Use this when you explicitly need DCAT-AP_IT dataset records. On schema.gov.it, these are often semantic assets such as ontologies, controlled vocabularies, and related distributions rather than classic tabular datasets, so ontology/vocabulary/SPARQL tools are usually a better starting point.
+
 **Args:**
 - limit: Maximum datasets per page (default: 20)
 - offset: Number of datasets to skip (default: 0)
@@ -75,6 +77,10 @@ server.registerTool(
           },
         },
         rowCount: count,
+        sourceData: {
+          dataResult,
+          countResult,
+        },
       };
     });
   }
@@ -86,6 +92,8 @@ server.registerTool(
     title: "Explore Dataset",
     description: `Get details of a specific Dataset including metadata and distributions.
 
+Use this for targeted DCAT-AP_IT inspection. On schema.gov.it, many datasets describe semantic assets, so this tool is usually secondary to ontology, vocabulary, class/property, and SPARQL exploration.
+
 **Args:**
 - datasetUri: URI of the dataset to explore
 
@@ -138,6 +146,10 @@ server.registerTool(
           metadata: compressSparqlResult(details),
           distributions: compressSparqlResult(distributions),
         },
+        sourceData: {
+          metadata: details,
+          distributions,
+        },
         rowCount: (details.results?.bindings?.length ?? 0) +
           (distributions.results?.bindings?.length ?? 0),
       };
@@ -151,6 +163,8 @@ server.registerTool(
     title: "Preview Distribution",
     description: `Download and preview the first rows of a distribution file.
 
+Most useful after you already identified a concrete distribution URL. On schema.gov.it, distributions often belong to semantic assets rather than classic tabular datasets.
+
 **Args:**
 - url: Download URL of the distribution (CSV or JSON)
 
@@ -207,6 +221,11 @@ server.registerTool(
         return {
           success: true,
           data: `Preview of ${url}:\n\n${preview}`,
+          sourceData: {
+            url,
+            contentType,
+            bodyPreview: text.slice(0, 4000),
+          },
         };
       } finally {
         clearTimeout(timeoutId);
diff --git a/src/tools/group-j.ts b/src/tools/group-j.ts
@@ -129,7 +129,7 @@ server.registerTool(
       const result = await executeSparql(query, safeEndpoint, injectPrefixes ?? false, 15000);
       const rowCount = result.results?.bindings?.length ?? 0;
       const compressed = compressSparqlResult(result);
-      return { success: true, data: compressed, rowCount };
+      return { success: true, data: compressed, rowCount, sourceData: result };
     });
   }
 );
@@ -219,6 +219,7 @@ server.registerTool(
       return {
         success: true,
         data: { concept: uri, alignments },
+        sourceData: result,
         rowCount: alignments.length,
       };
     });
@@ -265,7 +266,7 @@ server.registerTool(
       const result = await executeSparql(query, safeEndpoint, false, 15000);
       const rowCount = result.results?.bindings?.length ?? 0;
       const compressed = compressSparqlResult(result);
-      return { success: true, data: compressed, rowCount };
+      return { success: true, data: compressed, rowCount, sourceData: result };
     });
   }
 );
diff --git a/src/types.ts b/src/types.ts
@@ -38,6 +38,7 @@ export interface ToolSuccess<T = unknown> {
   success: true;
   data: T;
   rowCount?: number;
+  sourceData?: unknown;
 }
 
 /** Error tool result */

Original file line number	Diff line number	Diff line change
`@@ -38,6 +38,7 @@ export interface ToolSuccess<T = unknown> {`
`38`	`38`	`success: true;`
`39`	`39`	`data: T;`
`40`	`40`	`rowCount?: number;`
	`41`	`+ sourceData?: unknown;`
`41`	`42`	`}`
`42`	`43`
`43`	`44`	`/** Error tool result */`