Skip to content
Open
Original file line number Diff line number Diff line change
Expand Up @@ -45,4 +45,10 @@ public class ElasticsearchEndpoint
public int? BootstrapTimeout { get; set; }
public bool NoSemantic { get; set; }
public bool ForceReindex { get; set; }

/// <summary>
/// Enables AI enrichment of documents using LLM-generated metadata.
/// When enabled, documents are enriched with summaries, a search query, questions, and use cases.
/// </summary>
/// <remarks>
/// No initializer is given, so the flag is <c>false</c> unless it is set explicitly.
/// </remarks>
public bool EnableAiEnrichment { get; set; }
}
45 changes: 45 additions & 0 deletions src/Elastic.Documentation/Search/DocumentationDocument.cs
Original file line number Diff line number Diff line change
Expand Up @@ -83,4 +83,49 @@ public record DocumentationDocument
[JsonPropertyName("hidden")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
public bool Hidden { get; set; }

// AI Enrichment fields - populated by DocumentEnrichmentService

/// <summary>
/// Content-addressable hash of title + body for AI enrichment cache lookup.
/// Used by the enrich processor to join AI-generated fields at index time.
/// </summary>
/// <remarks>
/// Serialized as <c>content_hash</c>; the property is left out of the JSON while it is <c>null</c>.
/// </remarks>
[JsonPropertyName("content_hash")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
public string? ContentHash { get; set; }

/// <summary>
/// 3-5 sentences dense with technical entities, API names, and core functionality for vector matching.
/// </summary>
/// <remarks>
/// Serialized as <c>ai_rag_optimized_summary</c>; the property is left out of the JSON while it is <c>null</c>.
/// </remarks>
[JsonPropertyName("ai_rag_optimized_summary")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
public string? AiRagOptimizedSummary { get; set; }

/// <summary>
/// A short summary of 5-10 words, intended for a UI tooltip.
/// </summary>
/// <remarks>
/// Serialized as <c>ai_short_summary</c>; the property is left out of the JSON while it is <c>null</c>.
/// </remarks>
[JsonPropertyName("ai_short_summary")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
public string? AiShortSummary { get; set; }

/// <summary>
/// A 3-8 word keyword string representing a high-intent user search for this doc.
/// </summary>
/// <remarks>
/// Serialized as <c>ai_search_query</c>; the property is left out of the JSON while it is <c>null</c>.
/// </remarks>
[JsonPropertyName("ai_search_query")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
public string? AiSearchQuery { get; set; }

/// <summary>
/// Array of 3-5 specific questions answered by this document.
/// </summary>
/// <remarks>
/// Serialized as <c>ai_questions</c>; the property is left out of the JSON while it is <c>null</c>.
/// </remarks>
[JsonPropertyName("ai_questions")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
public string[]? AiQuestions { get; set; }

/// <summary>
/// Array of 2-4 specific use cases this doc helps with.
/// </summary>
/// <remarks>
/// Serialized as <c>ai_use_cases</c>; the property is left out of the JSON while it is <c>null</c>.
/// </remarks>
[JsonPropertyName("ai_use_cases")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
public string[]? AiUseCases { get; set; }
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,15 @@ public abstract partial class ElasticsearchIngestChannel<TChannelOptions, TChann
where TChannelOptions : CatalogIndexChannelOptionsBase<DocumentationDocument>
where TChannel : CatalogIndexChannel<DocumentationDocument, TChannelOptions>
{
protected static string CreateMappingSetting(string synonymSetName, string[] synonyms)
protected static string CreateMappingSetting(string synonymSetName, string[] synonyms, string? defaultPipeline = null)
{
var indexTimeSynonyms = $"[{string.Join(",", synonyms.Select(r => $"\"{r}\""))}]";
var pipelineSetting = defaultPipeline is not null ? $"\"default_pipeline\": \"{defaultPipeline}\"," : "";
// language=json
return
$$$"""
{
{{{pipelineSetting}}}
"analysis": {
"normalizer": {
"keyword_normalizer": {
Expand Down Expand Up @@ -156,6 +158,7 @@ protected static string CreateMapping(string? inferenceId) =>
}
},
"hash" : { "type" : "keyword" },
"content_hash" : { "type" : "keyword" },
"search_title": {
"type": "text",
"analyzer": "synonyms_fixed_analyzer",
Expand Down Expand Up @@ -201,6 +204,32 @@ protected static string CreateMapping(string? inferenceId) =>
"fields" : {
{{(!string.IsNullOrWhiteSpace(inferenceId) ? $"\"semantic_text\": {{{InferenceMapping(inferenceId)}}}" : "")}}
}
},
"ai_rag_optimized_summary": {
"type": "text",
"analyzer": "synonyms_fixed_analyzer",
"search_analyzer": "synonyms_analyzer",
"fields": {
{{(!string.IsNullOrWhiteSpace(inferenceId) ? $"\"semantic_text\": {{{InferenceMapping(inferenceId)}}}" : "")}}
}
},
"ai_short_summary": {
"type": "text"
},
"ai_search_query": {
"type": "keyword"
},
"ai_questions": {
"type": "text",
"fields": {
{{(!string.IsNullOrWhiteSpace(inferenceId) ? $"\"semantic_text\": {{{InferenceMapping(inferenceId)}}}" : "")}}
}
},
"ai_use_cases": {
"type": "text",
"fields": {
{{(!string.IsNullOrWhiteSpace(inferenceId) ? $"\"semantic_text\": {{{InferenceMapping(inferenceId)}}}" : "")}}
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ public class ElasticsearchLexicalIngestChannel(
ElasticsearchEndpoint endpoint,
string indexNamespace,
DistributedTransport transport,
string[] indexTimeSynonyms
string[] indexTimeSynonyms,
string? defaultPipeline = null
)
: ElasticsearchIngestChannel<CatalogIndexChannelOptions<DocumentationDocument>, CatalogIndexChannel<DocumentationDocument>>
(logFactory, collector, endpoint, transport, o => new(o), t => new(t)
Expand All @@ -34,7 +35,7 @@ string[] indexTimeSynonyms
{ "batch_index_date", d.BatchIndexDate.ToString("o") }
}),
GetMapping = () => CreateMapping(null),
GetMappingSettings = () => CreateMappingSetting($"docs-{indexNamespace}", indexTimeSynonyms),
GetMappingSettings = () => CreateMappingSetting($"docs-{indexNamespace}", indexTimeSynonyms, defaultPipeline),
IndexFormat =
$"{endpoint.IndexNamePrefix.Replace("semantic", "lexical").ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}-{{0:yyyy.MM.dd.HHmmss}}",
ActiveSearchAlias = $"{endpoint.IndexNamePrefix.Replace("semantic", "lexical").ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}"
Expand All @@ -46,14 +47,15 @@ public class ElasticsearchSemanticIngestChannel(
ElasticsearchEndpoint endpoint,
string indexNamespace,
DistributedTransport transport,
string[] indexTimeSynonyms
string[] indexTimeSynonyms,
string? defaultPipeline = null
)
: ElasticsearchIngestChannel<SemanticIndexChannelOptions<DocumentationDocument>, SemanticIndexChannel<DocumentationDocument>>
(logFactory, collector, endpoint, transport, o => new(o), t => new(t)
{
BulkOperationIdLookup = d => d.Url,
GetMapping = (inferenceId, _) => CreateMapping(inferenceId),
GetMappingSettings = (_, _) => CreateMappingSetting($"docs-{indexNamespace}", indexTimeSynonyms),
GetMappingSettings = (_, _) => CreateMappingSetting($"docs-{indexNamespace}", indexTimeSynonyms, defaultPipeline),
IndexFormat = $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}-{{0:yyyy.MM.dd.HHmmss}}",
ActiveSearchAlias = $"{endpoint.IndexNamePrefix}-{indexNamespace.ToLowerInvariant()}",
IndexNumThreads = endpoint.IndexNumThreads,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
using Elastic.Documentation.Navigation;
using Elastic.Documentation.Search;
using Elastic.Ingest.Elasticsearch.Indices;
using Elastic.Markdown.Exporters.Elasticsearch.Enrichment;
using Elastic.Markdown.Helpers;
using Markdig.Syntax;
using Microsoft.Extensions.Logging;
Expand Down Expand Up @@ -131,6 +132,13 @@ public async ValueTask<bool> ExportAsync(MarkdownExportFileContext fileContext,
};

CommonEnrichments(doc, currentNavigation);

// AI Enrichment - hybrid approach:
// - Cache hits: enrich processor applies fields at index time
// - Cache misses: apply fields inline before indexing
doc.ContentHash = ContentHashGenerator.Generate(doc.Title, doc.StrippedBody ?? string.Empty);
await TryEnrichDocumentAsync(doc, ctx);

AssignDocumentMetadata(doc);

if (_indexStrategy == IngestStrategy.Multiplex)
Expand Down Expand Up @@ -166,6 +174,11 @@ public async ValueTask<bool> FinishExportAsync(IDirectoryInfo outputFolder, Canc
doc.Abstract = @abstract;
doc.Headings = headings;
CommonEnrichments(doc, null);

// AI Enrichment - hybrid approach
doc.ContentHash = ContentHashGenerator.Generate(doc.Title, doc.StrippedBody ?? string.Empty);
await TryEnrichDocumentAsync(doc, ctx);

AssignDocumentMetadata(doc);

// Write to channels following the multiplex or reindex strategy
Expand All @@ -191,4 +204,51 @@ public async ValueTask<bool> FinishExportAsync(IDirectoryInfo outputFolder, Canc
return true;
}

/// <summary>
/// Hybrid AI enrichment: cache hits rely on enrich processor, cache misses apply fields inline.
/// </summary>
/// <remarks>
/// Returns without doing anything when the enrichment cache or the LLM client is absent, or when
/// the document has no content hash. Failures other than cancellation are logged as warnings and
/// swallowed, leaving the document without AI fields; <see cref="OperationCanceledException"/>
/// is not caught here and reaches the caller.
/// </remarks>
/// <param name="doc">Document to enrich; its <c>Ai*</c> properties are assigned in place on a successful cache miss.</param>
/// <param name="ctx">Passed through to the LLM client's <c>EnrichAsync</c> and the cache's <c>StoreAsync</c>.</param>
private async ValueTask TryEnrichDocumentAsync(DocumentationDocument doc, Cancel ctx)
{
// Enrichment is optional: bail out when a collaborator is missing or there is no hash to key on.
if (_enrichmentCache is null || _llmClient is null || string.IsNullOrWhiteSpace(doc.ContentHash))
return;

// Check if enrichment exists in cache
if (_enrichmentCache.Exists(doc.ContentHash))
{
// Cache hit - enrich processor will apply fields at index time
// Hits are only tallied; they never touch the new-enrichment counter below.
_ = Interlocked.Increment(ref _cacheHitCount);
return;
}

// Check if we've hit the limit for new enrichments
// A slot is reserved first with an atomic increment, then handed back if that pushed the count over the cap.
var current = Interlocked.Increment(ref _newEnrichmentCount);
if (current > _enrichmentOptions.MaxNewEnrichmentsPerRun)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move this check above the Exists() check (especially if it does IO to do an exist check).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The order is intentional. Exists() is an in-memory dictionary lookup (as of now). Cache hits don't call the LLM, so they shouldn't count against the limit. If we checked the limit first, we'd block documents that already have cached enrichments. The limit caps LLM calls, not total enrichments.

{
// Over the limit: release the reserved slot and leave the document unenriched.
_ = Interlocked.Decrement(ref _newEnrichmentCount);
return;
}

// Cache miss - generate enrichment inline and apply directly
try
{
var enrichment = await _llmClient.EnrichAsync(doc.Title, doc.StrippedBody ?? string.Empty, ctx);
// A null result or one without data ends here; the slot reserved above is not released on this path.
// NOTE(review): presumably intentional because the LLM call was already made - confirm.
if (enrichment is not { HasData: true })
return;

// Store in cache for future runs
// NOTE(review): if StoreAsync throws, the assignments below are skipped and the catch block
// releases the slot even though the LLM call already happened - confirm this is intended.
await _enrichmentCache.StoreAsync(doc.ContentHash, enrichment, _enrichmentOptions.PromptVersion, ctx);

// Apply fields directly (enrich processor won't have this entry yet)
doc.AiRagOptimizedSummary = enrichment.RagOptimizedSummary;
doc.AiShortSummary = enrichment.ShortSummary;
doc.AiSearchQuery = enrichment.SearchQuery;
doc.AiQuestions = enrichment.Questions;
doc.AiUseCases = enrichment.UseCases;
}
// The filter lets cancellation propagate; every other failure is logged and its slot released.
catch (Exception ex) when (ex is not OperationCanceledException)
{
_logger.LogWarning(ex, "Failed to enrich document {Url}", doc.Url);
_ = Interlocked.Decrement(ref _newEnrichmentCount);
}
}
}
Loading
Loading