From 1505e222f53c1d38ee81d67ab2448c98b6ba3b30 Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Thu, 2 Apr 2026 20:29:18 -0700 Subject: [PATCH 01/11] Added embedding API to SDK --- sdk/cs/src/Detail/JsonSerializationContext.cs | 2 + sdk/cs/src/Detail/Model.cs | 5 ++ sdk/cs/src/Detail/ModelVariant.cs | 17 ++++ sdk/cs/src/IModel.cs | 7 ++ sdk/cs/src/OpenAI/EmbeddingClient.cs | 81 +++++++++++++++++++ .../OpenAI/EmbeddingRequestResponseTypes.cs | 75 +++++++++++++++++ 6 files changed, 187 insertions(+) create mode 100644 sdk/cs/src/OpenAI/EmbeddingClient.cs create mode 100644 sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs diff --git a/sdk/cs/src/Detail/JsonSerializationContext.cs b/sdk/cs/src/Detail/JsonSerializationContext.cs index 37cc81ac..0fe5e677 100644 --- a/sdk/cs/src/Detail/JsonSerializationContext.cs +++ b/sdk/cs/src/Detail/JsonSerializationContext.cs @@ -23,6 +23,8 @@ namespace Microsoft.AI.Foundry.Local.Detail; [JsonSerializable(typeof(ChatCompletionCreateResponse))] [JsonSerializable(typeof(AudioCreateTranscriptionRequest))] [JsonSerializable(typeof(AudioCreateTranscriptionResponse))] +[JsonSerializable(typeof(EmbeddingCreateRequestExtended))] +[JsonSerializable(typeof(EmbeddingCreateResponse))] [JsonSerializable(typeof(string[]))] // list loaded or cached models [JsonSerializable(typeof(EpInfo[]))] [JsonSerializable(typeof(EpDownloadResult))] diff --git a/sdk/cs/src/Detail/Model.cs b/sdk/cs/src/Detail/Model.cs index c4d96057..03e9321b 100644 --- a/sdk/cs/src/Detail/Model.cs +++ b/sdk/cs/src/Detail/Model.cs @@ -99,6 +99,11 @@ public async Task GetAudioClientAsync(CancellationToken? ct = return await SelectedVariant.GetAudioClientAsync(ct).ConfigureAwait(false); } + public async Task GetEmbeddingClientAsync(CancellationToken? ct = null) + { + return await SelectedVariant.GetEmbeddingClientAsync(ct).ConfigureAwait(false); + } + public async Task UnloadAsync(CancellationToken? ct = null) { await SelectedVariant.UnloadAsync(ct).ConfigureAwait(false); diff --git a/sdk/cs/src/Detail/ModelVariant.cs b/sdk/cs/src/Detail/ModelVariant.cs index 9f2deaba..250c601a 100644 --- a/sdk/cs/src/Detail/ModelVariant.cs +++ b/sdk/cs/src/Detail/ModelVariant.cs @@ -102,6 +102,13 @@ public async Task GetAudioClientAsync(CancellationToken? ct = .ConfigureAwait(false); } + public async Task GetEmbeddingClientAsync(CancellationToken? ct = null) + { + return await Utils.CallWithExceptionHandling(() => GetEmbeddingClientImplAsync(ct), + "Error getting embedding client for model", _logger) + .ConfigureAwait(false); + } + private async Task IsLoadedImplAsync(CancellationToken? ct = null) { var loadedModels = await _modelLoadManager.ListLoadedModelsAsync(ct).ConfigureAwait(false); @@ -193,6 +200,16 @@ private async Task GetAudioClientImplAsync(CancellationToken? return new OpenAIAudioClient(Id); } + private async Task GetEmbeddingClientImplAsync(CancellationToken? ct = null) + { + if (!await IsLoadedAsync(ct)) + { + throw new FoundryLocalException($"Model {Id} is not loaded. Call LoadAsync first."); + } + + return new OpenAIEmbeddingClient(Id); + } + public void SelectVariant(IModel variant) { throw new FoundryLocalException( diff --git a/sdk/cs/src/IModel.cs b/sdk/cs/src/IModel.cs index a27f3a3d..37249782 100644 --- a/sdk/cs/src/IModel.cs +++ b/sdk/cs/src/IModel.cs @@ -70,6 +70,13 @@ Task DownloadAsync(Action? downloadProgress = null, /// OpenAI.AudioClient Task GetAudioClientAsync(CancellationToken? ct = null); + /// + /// Get an OpenAI API based EmbeddingClient + /// + /// Optional cancellation token. + /// OpenAI.EmbeddingClient + Task GetEmbeddingClientAsync(CancellationToken? ct = null); + /// /// Variants of the model that are available. Variants of the model are optimized for different devices. /// diff --git a/sdk/cs/src/OpenAI/EmbeddingClient.cs b/sdk/cs/src/OpenAI/EmbeddingClient.cs new file mode 100644 index 00000000..e757fada --- /dev/null +++ b/sdk/cs/src/OpenAI/EmbeddingClient.cs @@ -0,0 +1,81 @@ +// -------------------------------------------------------------------------------------------------------------------- +// +// Copyright (c) Microsoft. All rights reserved. +// +// -------------------------------------------------------------------------------------------------------------------- + +namespace Microsoft.AI.Foundry.Local; + +using Betalgo.Ranul.OpenAI.ObjectModels.ResponseModels; + +using Microsoft.AI.Foundry.Local.Detail; +using Microsoft.AI.Foundry.Local.OpenAI; +using Microsoft.Extensions.Logging; + +/// +/// Embedding Client that uses the OpenAI API. +/// Implemented using Betalgo.Ranul.OpenAI SDK types. +/// +public class OpenAIEmbeddingClient +{ + private readonly string _modelId; + + private readonly ICoreInterop _coreInterop = FoundryLocalManager.Instance.CoreInterop; + private readonly ILogger _logger = FoundryLocalManager.Instance.Logger; + + internal OpenAIEmbeddingClient(string modelId) + { + _modelId = modelId; + } + + /// + /// Settings that are supported by Foundry Local for embeddings. + /// + public record EmbeddingSettings + { + /// + /// The number of dimensions the resulting output embeddings should have. + /// Only supported by some models. + /// + public int? Dimensions { get; set; } + + /// + /// The format to return the embeddings in. Can be either "float" or "base64". + /// + public string? EncodingFormat { get; set; } + } + + /// + /// Settings to use for embedding requests using this client. + /// + public EmbeddingSettings Settings { get; } = new(); + + /// + /// Generate embeddings for the given input text. + /// + /// The text to generate embeddings for. + /// Optional cancellation token. + /// Embedding response containing the embedding vector. + public async Task GenerateEmbeddingAsync(string input, + CancellationToken? ct = null) + { + return await Utils.CallWithExceptionHandling( + () => GenerateEmbeddingImplAsync(input, ct), + "Error during embedding generation.", _logger).ConfigureAwait(false); + } + + private async Task GenerateEmbeddingImplAsync(string input, + CancellationToken? ct) + { + var embeddingRequest = EmbeddingCreateRequestExtended.FromUserInput(_modelId, input, Settings); + var embeddingRequestJson = embeddingRequest.ToJson(); + + var request = new CoreInteropRequest { Params = new() { { "OpenAICreateRequest", embeddingRequestJson } } }; + var response = await _coreInterop.ExecuteCommandAsync("embeddings", request, + ct ?? CancellationToken.None).ConfigureAwait(false); + + var embeddingResponse = response.ToEmbeddingResponse(_logger); + + return embeddingResponse; + } +} diff --git a/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs b/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs new file mode 100644 index 00000000..d55a69f6 --- /dev/null +++ b/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs @@ -0,0 +1,75 @@ +// -------------------------------------------------------------------------------------------------------------------- +// +// Copyright (c) Microsoft. All rights reserved. +// +// -------------------------------------------------------------------------------------------------------------------- + +namespace Microsoft.AI.Foundry.Local.OpenAI; + +using System.Text.Json; +using System.Text.Json.Serialization; + +using Betalgo.Ranul.OpenAI.ObjectModels.ResponseModels; + +using Microsoft.AI.Foundry.Local.Detail; +using Microsoft.Extensions.Logging; + +// https://platform.openai.com/docs/api-reference/embeddings/create +internal record EmbeddingCreateRequestExtended +{ + [JsonPropertyName("input")] + public string? Input { get; set; } + + [JsonPropertyName("model")] + public string? Model { get; set; } + + [JsonPropertyName("dimensions")] + public int? Dimensions { get; set; } + + [JsonPropertyName("encoding_format")] + public string? EncodingFormat { get; set; } + + internal static EmbeddingCreateRequestExtended FromUserInput(string modelId, + string input, + OpenAIEmbeddingClient.EmbeddingSettings settings) + { + return new EmbeddingCreateRequestExtended + { + Model = modelId, + Input = input, + Dimensions = settings.Dimensions, + EncodingFormat = settings.EncodingFormat + }; + } +} + +internal static class EmbeddingRequestResponseExtensions +{ + internal static string ToJson(this EmbeddingCreateRequestExtended request) + { + return JsonSerializer.Serialize(request, JsonSerializationContext.Default.EmbeddingCreateRequestExtended); + } + + internal static EmbeddingCreateResponse ToEmbeddingResponse(this ICoreInterop.Response response, ILogger logger) + { + if (response.Error != null) + { + logger.LogError("Error from embeddings: {Error}", response.Error); + throw new FoundryLocalException($"Error from embeddings command: {response.Error}"); + } + + return response.Data!.ToEmbeddingResponse(logger); + } + + internal static EmbeddingCreateResponse ToEmbeddingResponse(this string responseData, ILogger logger) + { + var output = JsonSerializer.Deserialize(responseData, JsonSerializationContext.Default.EmbeddingCreateResponse); + if (output == null) + { + logger.LogError("Failed to deserialize EmbeddingCreateResponse: {ResponseData}", responseData); + throw new JsonException("Failed to deserialize EmbeddingCreateResponse"); + } + + return output; + } +} From 95512275e27924bf659c6916c77a42a1674cca2d Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Thu, 2 Apr 2026 22:56:09 -0700 Subject: [PATCH 02/11] Added embedding tests --- .../EmbeddingClientTests.cs | 181 ++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs diff --git a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs new file mode 100644 index 00000000..db1e812a --- /dev/null +++ b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs @@ -0,0 +1,181 @@ +// -------------------------------------------------------------------------------------------------------------------- +// +// Copyright (c) Microsoft. All rights reserved. +// +// -------------------------------------------------------------------------------------------------------------------- + +namespace Microsoft.AI.Foundry.Local.Tests; + +using System.Threading.Tasks; + +internal sealed class EmbeddingClientTests +{ + private static IModel? model; + + [Before(Class)] + public static async Task Setup() + { + var manager = FoundryLocalManager.Instance; // initialized by Utils + var catalog = await manager.GetCatalogAsync(); + + // Load the specific cached model variant directly + var model = await catalog.GetModelVariantAsync("qwen3-0.6b-embedding-generic-cpu:1").ConfigureAwait(false); + await Assert.That(model).IsNotNull(); + + await model!.LoadAsync().ConfigureAwait(false); + await Assert.That(await model.IsLoadedAsync()).IsTrue(); + + EmbeddingClientTests.model = model; + } + + [Test] + public async Task Embedding_BasicRequest_Succeeds() + { + var embeddingClient = await model!.GetEmbeddingClientAsync(); + await Assert.That(embeddingClient).IsNotNull(); + + var response = await embeddingClient.GenerateEmbeddingAsync("The quick brown fox jumps over the lazy dog") + .ConfigureAwait(false); + + await Assert.That(response).IsNotNull(); + await Assert.That(response.Model).IsEqualTo("qwen3-0.6b-embedding-generic-cpu:1"); + await Assert.That(response.Data).IsNotNull().And.IsNotEmpty(); + await Assert.That(response.Data[0].Embedding).IsNotNull(); + await Assert.That(response.Data[0].Embedding.Count).IsEqualTo(1024); + await Assert.That(response.Data[0].Index).IsEqualTo(0); + + Console.WriteLine($"Embedding dimension: {response.Data[0].Embedding.Count}"); + Console.WriteLine($"First value: {response.Data[0].Embedding[0]}"); + Console.WriteLine($"Last value: {response.Data[0].Embedding[1023]}"); + } + + [Test] + public async Task Embedding_IsNormalized() + { + var embeddingClient = await model!.GetEmbeddingClientAsync(); + await Assert.That(embeddingClient).IsNotNull(); + + var inputs = new[] + { + "The quick brown fox jumps over the lazy dog", + "Machine learning is a subset of artificial intelligence", + "The capital of France is Paris" + }; + + foreach (var input in inputs) + { + var response = await embeddingClient.GenerateEmbeddingAsync(input).ConfigureAwait(false); + var embedding = response.Data[0].Embedding; + + await Assert.That(embedding.Count).IsEqualTo(1024); + + // Verify L2 norm is approximately 1.0 + double norm = 0; + foreach (var val in embedding) + { + norm += val * val; + } + + norm = Math.Sqrt(norm); + await Assert.That(norm).IsGreaterThanOrEqualTo(0.99); + await Assert.That(norm).IsLessThanOrEqualTo(1.01); + + // All values should be within [-1, 1] for a normalized vector + foreach (var val in embedding) + { + await Assert.That(val).IsGreaterThanOrEqualTo(-1.0); + await Assert.That(val).IsLessThanOrEqualTo(1.0); + } + } + } + + [Test] + public async Task Embedding_DifferentInputs_ProduceDifferentEmbeddings() + { + var embeddingClient = await model!.GetEmbeddingClientAsync(); + await Assert.That(embeddingClient).IsNotNull(); + + var response1 = await embeddingClient.GenerateEmbeddingAsync("The quick brown fox").ConfigureAwait(false); + var response2 = await embeddingClient.GenerateEmbeddingAsync("The capital of France is Paris").ConfigureAwait(false); + + // Same dimensionality + await Assert.That(response1.Data[0].Embedding.Count) + .IsEqualTo(response2.Data[0].Embedding.Count); + + // But different values (cosine similarity should not be 1.0) + double dot = 0; + for (int i = 0; i < response1.Data[0].Embedding.Count; i++) + { + dot += response1.Data[0].Embedding[i] * response2.Data[0].Embedding[i]; + } + + await Assert.That(dot).IsLessThan(0.99); + } + + [Test] + public async Task Embedding_SameInput_ProducesSameEmbedding() + { + var embeddingClient = await model!.GetEmbeddingClientAsync(); + await Assert.That(embeddingClient).IsNotNull(); + + var input = "Deterministic embedding test"; + + var response1 = await embeddingClient.GenerateEmbeddingAsync(input).ConfigureAwait(false); + var response2 = await embeddingClient.GenerateEmbeddingAsync(input).ConfigureAwait(false); + + await Assert.That(response1.Data[0].Embedding.Count) + .IsEqualTo(response2.Data[0].Embedding.Count); + + for (int i = 0; i < response1.Data[0].Embedding.Count; i++) + { + await Assert.That(response1.Data[0].Embedding[i]) + .IsEqualTo(response2.Data[0].Embedding[i]); + } + } + + [Test] + public async Task Embedding_KnownValues_CapitalOfFrance() + { + var embeddingClient = await model!.GetEmbeddingClientAsync(); + await Assert.That(embeddingClient).IsNotNull(); + + var response = await embeddingClient.GenerateEmbeddingAsync("The capital of France is Paris") + .ConfigureAwait(false); + var embedding = response.Data[0].Embedding; + + await Assert.That(embedding.Count).IsEqualTo(1024); + await Assert.That(embedding[0]).IsEqualTo(-0.023386012762784958); + await Assert.That(embedding[1023]).IsEqualTo(-0.011731955222785473); + } + + [Test] + public async Task Embedding_UnloadedModel_Throws() + { + var manager = FoundryLocalManager.Instance; + var catalog = await manager.GetCatalogAsync(); + + // Get a model but don't load it + var unloadedModel = await catalog.GetModelVariantAsync("qwen2.5-0.5b-instruct-generic-cpu:4") + .ConfigureAwait(false); + await Assert.That(unloadedModel).IsNotNull(); + + // Unload it if loaded + if (await unloadedModel!.IsLoadedAsync()) + { + await unloadedModel.UnloadAsync(); + } + + FoundryLocalException? caught = null; + try + { + await unloadedModel.GetEmbeddingClientAsync(); + } + catch (FoundryLocalException ex) + { + caught = ex; + } + + await Assert.That(caught).IsNotNull(); + await Assert.That(caught!.Message).Contains("not loaded"); + } +} From 2dfe4b4e192d14dba7b6504f256bae8b69c3b352 Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Thu, 2 Apr 2026 23:30:59 -0700 Subject: [PATCH 03/11] tests fix --- .../EmbeddingClientTests.cs | 30 ------------------- 1 file changed, 30 deletions(-) diff --git a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs index db1e812a..b2a2223d 100644 --- a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs +++ b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs @@ -148,34 +148,4 @@ public async Task Embedding_KnownValues_CapitalOfFrance() await Assert.That(embedding[1023]).IsEqualTo(-0.011731955222785473); } - [Test] - public async Task Embedding_UnloadedModel_Throws() - { - var manager = FoundryLocalManager.Instance; - var catalog = await manager.GetCatalogAsync(); - - // Get a model but don't load it - var unloadedModel = await catalog.GetModelVariantAsync("qwen2.5-0.5b-instruct-generic-cpu:4") - .ConfigureAwait(false); - await Assert.That(unloadedModel).IsNotNull(); - - // Unload it if loaded - if (await unloadedModel!.IsLoadedAsync()) - { - await unloadedModel.UnloadAsync(); - } - - FoundryLocalException? caught = null; - try - { - await unloadedModel.GetEmbeddingClientAsync(); - } - catch (FoundryLocalException ex) - { - caught = ex; - } - - await Assert.That(caught).IsNotNull(); - await Assert.That(caught!.Message).Contains("not loaded"); - } } From daf6a15d8ffad57f88f1437a2cdb74c015809502 Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Thu, 2 Apr 2026 23:56:42 -0700 Subject: [PATCH 04/11] Updated documentation --- sdk/cs/README.md | 19 +++++++ sdk/cs/docs/api/index.md | 2 + .../api/microsoft.ai.foundry.local.imodel.md | 18 +++++++ .../api/microsoft.ai.foundry.local.model.md | 14 ++++++ ...microsoft.ai.foundry.local.modelvariant.md | 14 ++++++ ....ai.foundry.local.openaiembeddingclient.md | 50 +++++++++++++++++++ 6 files changed, 117 insertions(+) create mode 100644 sdk/cs/docs/api/microsoft.ai.foundry.local.openaiembeddingclient.md diff --git a/sdk/cs/README.md b/sdk/cs/README.md index 20580e65..6ee5ae49 100644 --- a/sdk/cs/README.md +++ b/sdk/cs/README.md @@ -7,6 +7,7 @@ The Foundry Local C# SDK provides a .NET interface for running AI models locally - **Model catalog** — browse and search all available models; filter by cached or loaded state - **Lifecycle management** — download, load, unload, and remove models programmatically - **Chat completions** — synchronous and `IAsyncEnumerable` streaming via OpenAI-compatible types +- **Embeddings** — generate text embeddings with last-token pooling and L2 normalization - **Audio transcription** — transcribe audio files with streaming support - **Download progress** — wire up an `Action` callback for real-time download percentage - **Model variants** — select specific hardware/quantization variants per model alias @@ -246,6 +247,24 @@ chatClient.Settings.TopP = 0.9f; chatClient.Settings.FrequencyPenalty = 0.5f; ``` +### Embeddings + +```csharp +var embeddingClient = await model.GetEmbeddingClientAsync(); + +// Generate an embedding +var response = await embeddingClient.GenerateEmbeddingAsync("The quick brown fox jumps over the lazy dog"); +var embedding = response.Data[0].Embedding; // List, L2-normalized +Console.WriteLine($"Dimensions: {embedding.Count}"); +``` + +#### Embedding Settings + +```csharp +embeddingClient.Settings.Dimensions = 512; // optional: reduce dimensionality +embeddingClient.Settings.EncodingFormat = "float"; // "float" or "base64" +``` + ### Audio Transcription ```csharp diff --git a/sdk/cs/docs/api/index.md b/sdk/cs/docs/api/index.md index 4d084f87..c83e0a43 100644 --- a/sdk/cs/docs/api/index.md +++ b/sdk/cs/docs/api/index.md @@ -30,6 +30,8 @@ [OpenAIChatClient](./microsoft.ai.foundry.local.openaichatclient.md) +[OpenAIEmbeddingClient](./microsoft.ai.foundry.local.openaiembeddingclient.md) + [Parameter](./microsoft.ai.foundry.local.parameter.md) [PromptTemplate](./microsoft.ai.foundry.local.prompttemplate.md) diff --git a/sdk/cs/docs/api/microsoft.ai.foundry.local.imodel.md b/sdk/cs/docs/api/microsoft.ai.foundry.local.imodel.md index 861386a8..95185abe 100644 --- a/sdk/cs/docs/api/microsoft.ai.foundry.local.imodel.md +++ b/sdk/cs/docs/api/microsoft.ai.foundry.local.imodel.md @@ -208,6 +208,24 @@ Optional cancellation token. [Task<OpenAIAudioClient>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
OpenAI.AudioClient +### **GetEmbeddingClientAsync(Nullable<CancellationToken>)** + +Get an OpenAI API based EmbeddingClient + +```csharp +Task GetEmbeddingClientAsync(Nullable ct) +``` + +#### Parameters + +`ct` [Nullable<CancellationToken>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+Optional cancellation token. + +#### Returns + +[Task<OpenAIEmbeddingClient>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+OpenAI.EmbeddingClient + ### **SelectVariant(IModel)** Select a model variant from [IModel.Variants](./microsoft.ai.foundry.local.imodel.md#variants) to use for [IModel](./microsoft.ai.foundry.local.imodel.md) operations. diff --git a/sdk/cs/docs/api/microsoft.ai.foundry.local.model.md b/sdk/cs/docs/api/microsoft.ai.foundry.local.model.md index 23cd67a3..c6eac5f2 100644 --- a/sdk/cs/docs/api/microsoft.ai.foundry.local.model.md +++ b/sdk/cs/docs/api/microsoft.ai.foundry.local.model.md @@ -176,6 +176,20 @@ public Task GetAudioClientAsync(Nullable c [Task<OpenAIAudioClient>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+### **GetEmbeddingClientAsync(Nullable<CancellationToken>)** + +```csharp +public Task GetEmbeddingClientAsync(Nullable ct) +``` + +#### Parameters + +`ct` [Nullable<CancellationToken>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+ +#### Returns + +[Task<OpenAIEmbeddingClient>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+ ### **UnloadAsync(Nullable<CancellationToken>)** ```csharp diff --git a/sdk/cs/docs/api/microsoft.ai.foundry.local.modelvariant.md b/sdk/cs/docs/api/microsoft.ai.foundry.local.modelvariant.md index 1f674511..cc2b20a6 100644 --- a/sdk/cs/docs/api/microsoft.ai.foundry.local.modelvariant.md +++ b/sdk/cs/docs/api/microsoft.ai.foundry.local.modelvariant.md @@ -181,3 +181,17 @@ public Task GetAudioClientAsync(Nullable c #### Returns [Task<OpenAIAudioClient>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+ +### **GetEmbeddingClientAsync(Nullable<CancellationToken>)** + +```csharp +public Task GetEmbeddingClientAsync(Nullable ct) +``` + +#### Parameters + +`ct` [Nullable<CancellationToken>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+ +#### Returns + +[Task<OpenAIEmbeddingClient>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
diff --git a/sdk/cs/docs/api/microsoft.ai.foundry.local.openaiembeddingclient.md b/sdk/cs/docs/api/microsoft.ai.foundry.local.openaiembeddingclient.md new file mode 100644 index 00000000..83025fff --- /dev/null +++ b/sdk/cs/docs/api/microsoft.ai.foundry.local.openaiembeddingclient.md @@ -0,0 +1,50 @@ +# OpenAIEmbeddingClient + +Namespace: Microsoft.AI.Foundry.Local + +Embedding Client that uses the OpenAI API. + Implemented using Betalgo.Ranul.OpenAI SDK types. + +```csharp +public class OpenAIEmbeddingClient +``` + +Inheritance [Object](https://docs.microsoft.com/en-us/dotnet/api/system.object) → [OpenAIEmbeddingClient](./microsoft.ai.foundry.local.openaiembeddingclient.md)
+Attributes [NullableContextAttribute](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.compilerservices.nullablecontextattribute), [NullableAttribute](https://docs.microsoft.com/en-us/dotnet/api/system.runtime.compilerservices.nullableattribute) + +## Properties + +### **Settings** + +Settings to use for embedding requests using this client. + +```csharp +public EmbeddingSettings Settings { get; } +``` + +#### Property Value + +EmbeddingSettings
+ +## Methods + +### **GenerateEmbeddingAsync(String, Nullable<CancellationToken>)** + +Generate embeddings for the given input text. + +```csharp +public Task GenerateEmbeddingAsync(string input, Nullable ct) +``` + +#### Parameters + +`input` [String](https://docs.microsoft.com/en-us/dotnet/api/system.string)
+The text to generate embeddings for. + +`ct` [Nullable<CancellationToken>](https://docs.microsoft.com/en-us/dotnet/api/system.nullable-1)
+Optional cancellation token. + +#### Returns + +[Task<EmbeddingCreateResponse>](https://docs.microsoft.com/en-us/dotnet/api/system.threading.tasks.task-1)
+Embedding response containing the embedding vector. From ffd241c6ad46f17b20a3dcd3e89cf015d57b293b Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Fri, 3 Apr 2026 01:47:52 -0700 Subject: [PATCH 05/11] Update sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs Added null checks Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs index b2a2223d..1753cd36 100644 --- a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs +++ b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs @@ -98,6 +98,10 @@ public async Task Embedding_DifferentInputs_ProduceDifferentEmbeddings() var response1 = await embeddingClient.GenerateEmbeddingAsync("The quick brown fox").ConfigureAwait(false); var response2 = await embeddingClient.GenerateEmbeddingAsync("The capital of France is Paris").ConfigureAwait(false); + await Assert.That(response1).IsNotNull(); + await Assert.That(response2).IsNotNull(); + await Assert.That(response1.Data).IsNotNull().And.IsNotEmpty(); + await Assert.That(response2.Data).IsNotNull().And.IsNotEmpty(); // Same dimensionality await Assert.That(response1.Data[0].Embedding.Count) .IsEqualTo(response2.Data[0].Embedding.Count); From e73f39bc69b229b0e28ac4c16fb71dc21ccbc022 Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Fri, 3 Apr 2026 01:53:33 -0700 Subject: [PATCH 06/11] Added more null checks in tests --- sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs index 1753cd36..53b7cb88 100644 --- a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs +++ b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs @@ -65,6 +65,10 @@ public async Task Embedding_IsNormalized() foreach (var input in inputs) { var response = await embeddingClient.GenerateEmbeddingAsync(input).ConfigureAwait(false); + + await Assert.That(response).IsNotNull(); + await Assert.That(response.Data).IsNotNull().And.IsNotEmpty(); + var embedding = response.Data[0].Embedding; await Assert.That(embedding.Count).IsEqualTo(1024); @@ -102,6 +106,7 @@ public async Task Embedding_DifferentInputs_ProduceDifferentEmbeddings() await Assert.That(response2).IsNotNull(); await Assert.That(response1.Data).IsNotNull().And.IsNotEmpty(); await Assert.That(response2.Data).IsNotNull().And.IsNotEmpty(); + // Same dimensionality await Assert.That(response1.Data[0].Embedding.Count) .IsEqualTo(response2.Data[0].Embedding.Count); @@ -127,6 +132,11 @@ public async Task Embedding_SameInput_ProducesSameEmbedding() var response1 = await embeddingClient.GenerateEmbeddingAsync(input).ConfigureAwait(false); var response2 = await embeddingClient.GenerateEmbeddingAsync(input).ConfigureAwait(false); + await Assert.That(response1).IsNotNull(); + await Assert.That(response2).IsNotNull(); + await Assert.That(response1.Data).IsNotNull().And.IsNotEmpty(); + await Assert.That(response2.Data).IsNotNull().And.IsNotEmpty(); + await Assert.That(response1.Data[0].Embedding.Count) .IsEqualTo(response2.Data[0].Embedding.Count); From a782f63093cdba23a749f50c607b96d579bcbdb4 Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Fri, 3 Apr 2026 02:38:23 -0700 Subject: [PATCH 07/11] fixed copilot comments --- sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs | 10 ++++++++-- sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs | 9 +++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs b/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs index d55a69f6..d03025b9 100644 --- a/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs +++ b/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs @@ -58,7 +58,13 @@ internal static EmbeddingCreateResponse ToEmbeddingResponse(this ICoreInterop.Re throw new FoundryLocalException($"Error from embeddings command: {response.Error}"); } - return response.Data!.ToEmbeddingResponse(logger); + if (string.IsNullOrWhiteSpace(response.Data)) + { + logger.LogError("Embeddings command returned no data"); + throw new FoundryLocalException("Embeddings command returned null or empty response data"); + } + + return response.Data.ToEmbeddingResponse(logger); } internal static EmbeddingCreateResponse ToEmbeddingResponse(this string responseData, ILogger logger) @@ -66,7 +72,7 @@ internal static EmbeddingCreateResponse ToEmbeddingResponse(this string response var output = JsonSerializer.Deserialize(responseData, JsonSerializationContext.Default.EmbeddingCreateResponse); if (output == null) { - logger.LogError("Failed to deserialize EmbeddingCreateResponse: {ResponseData}", responseData); + logger.LogError("Failed to deserialize EmbeddingCreateResponse (length={Length})", responseData.Length); throw new JsonException("Failed to deserialize EmbeddingCreateResponse"); } diff --git a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs index 53b7cb88..aa5c64ee 100644 --- a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs +++ b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs @@ -155,11 +155,16 @@ public async Task Embedding_KnownValues_CapitalOfFrance() var response = await embeddingClient.GenerateEmbeddingAsync("The capital of France is Paris") .ConfigureAwait(false); + await Assert.That(response).IsNotNull(); + await Assert.That(response.Data).IsNotNull().And.IsNotEmpty(); var embedding = response.Data[0].Embedding; await Assert.That(embedding.Count).IsEqualTo(1024); - await Assert.That(embedding[0]).IsEqualTo(-0.023386012762784958); - await Assert.That(embedding[1023]).IsEqualTo(-0.011731955222785473); + + // Use tolerance for float32 model outputs which may vary across platforms + const double tolerance = 1e-5; + await Assert.That(Math.Abs(embedding[0] - (-0.023386012762784958))).IsLessThanOrEqualTo(tolerance); + await Assert.That(Math.Abs(embedding[1023] - (-0.011731955222785473))).IsLessThanOrEqualTo(tolerance); } } From 2a49ece17810952f055d666a5038b13142ec9547 Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Fri, 3 Apr 2026 03:54:47 -0700 Subject: [PATCH 08/11] Updated tests with new int4 embedding model results --- sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs index aa5c64ee..5b308363 100644 --- a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs +++ b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs @@ -163,8 +163,8 @@ public async Task Embedding_KnownValues_CapitalOfFrance() // Use tolerance for float32 model outputs which may vary across platforms const double tolerance = 1e-5; - await Assert.That(Math.Abs(embedding[0] - (-0.023386012762784958))).IsLessThanOrEqualTo(tolerance); - await Assert.That(Math.Abs(embedding[1023] - (-0.011731955222785473))).IsLessThanOrEqualTo(tolerance); + await Assert.That(Math.Abs(embedding[0] - (-0.02815740555524826))).IsLessThanOrEqualTo(tolerance); + await Assert.That(Math.Abs(embedding[1023] - (-0.00887922290712595))).IsLessThanOrEqualTo(tolerance); } } From 8ed3a7589c8108295d1582e1e2e8b9a4c2c965b1 Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Fri, 3 Apr 2026 04:42:23 -0700 Subject: [PATCH 09/11] Updated documentation --- sdk/cs/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdk/cs/README.md b/sdk/cs/README.md index 6ee5ae49..aa3135e0 100644 --- a/sdk/cs/README.md +++ b/sdk/cs/README.md @@ -7,7 +7,7 @@ The Foundry Local C# SDK provides a .NET interface for running AI models locally - **Model catalog** — browse and search all available models; filter by cached or loaded state - **Lifecycle management** — download, load, unload, and remove models programmatically - **Chat completions** — synchronous and `IAsyncEnumerable` streaming via OpenAI-compatible types -- **Embeddings** — generate text embeddings with last-token pooling and L2 normalization +- **Embeddings** — generate text embeddings via OpenAI-compatible API - **Audio transcription** — transcribe audio files with streaming support - **Download progress** — wire up an `Action` callback for real-time download percentage - **Model variants** — select specific hardware/quantization variants per model alias @@ -254,7 +254,7 @@ var embeddingClient = await model.GetEmbeddingClientAsync(); // Generate an embedding var response = await embeddingClient.GenerateEmbeddingAsync("The quick brown fox jumps over the lazy dog"); -var embedding = response.Data[0].Embedding; // List, L2-normalized +var embedding = response.Data[0].Embedding; // List Console.WriteLine($"Dimensions: {embedding.Count}"); ``` From 23d58e43c0038b3ef83e1ca568814d1fbd5ca23d Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Fri, 10 Apr 2026 05:26:23 -0700 Subject: [PATCH 10/11] Added embedding API to Python, JS and rust SDKs (#1) * updated js sdk * updated python sdk * updated rust sdk * Updated embedding model test aliases * Updated ordering on readme --------- Co-authored-by: Raja Phanindra Chava --- sdk/js/README.md | 22 +++ sdk/js/docs/README.md | 2 + sdk/js/src/detail/model.ts | 9 + sdk/js/src/detail/modelVariant.ts | 9 + sdk/js/src/imodel.ts | 2 + sdk/js/src/index.ts | 1 + sdk/js/src/openai/embeddingClient.ts | 78 +++++++++ sdk/js/test/openai/embeddingClient.test.ts | 155 ++++++++++++++++++ sdk/js/test/testUtils.ts | 1 + sdk/python/README.md | 23 +++ sdk/python/src/detail/model.py | 5 + sdk/python/src/detail/model_variant.py | 7 +- sdk/python/src/imodel.py | 9 + sdk/python/src/openai/__init__.py | 3 +- sdk/python/src/openai/embedding_client.py | 129 +++++++++++++++ sdk/python/test/conftest.py | 1 + .../test/openai/test_embedding_client.py | 133 +++++++++++++++ sdk/rust/README.md | 23 +++ sdk/rust/docs/api.md | 30 ++++ sdk/rust/src/detail/model.rs | 6 + sdk/rust/src/detail/model_variant.rs | 5 + sdk/rust/src/openai/embedding_client.rs | 117 +++++++++++++ sdk/rust/src/openai/mod.rs | 4 + sdk/rust/tests/integration/common/mod.rs | 3 + .../integration/embedding_client_test.rs | 144 ++++++++++++++++ sdk/rust/tests/integration/main.rs | 1 + 26 files changed, 920 insertions(+), 2 deletions(-) create mode 100644 sdk/js/src/openai/embeddingClient.ts create mode 100644 sdk/js/test/openai/embeddingClient.test.ts create mode 100644 sdk/python/src/openai/embedding_client.py create mode 100644 sdk/python/test/openai/test_embedding_client.py create mode 100644 sdk/rust/src/openai/embedding_client.rs create mode 100644 sdk/rust/tests/integration/embedding_client_test.rs diff --git a/sdk/js/README.md b/sdk/js/README.md index 13d50442..638edd22 100644 --- a/sdk/js/README.md +++ b/sdk/js/README.md @@ -8,6 +8,7 @@ The Foundry Local JS SDK provides a JavaScript/TypeScript interface for running - **Model catalog** — Browse and discover available models, check what's cached or loaded - **Automatic model management** — Download, load, unload, and remove models from cache - **Chat completions** — OpenAI-compatible chat API with both synchronous and streaming responses +- **Embeddings** — Generate text embeddings via OpenAI-compatible API - **Audio transcription** — Transcribe audio files locally with streaming support - **Multi-variant models** — Models can have multiple variants (e.g., different quantizations) with automatic selection of the best cached variant - **Embedded web service** — Start a local HTTP service for OpenAI-compatible API access @@ -204,6 +205,27 @@ for await (const chunk of chatClient.completeStreamingChat( } ``` +### Embeddings + +Generate text embeddings using the `EmbeddingClient`: + +```typescript +const embeddingClient = model.createEmbeddingClient(); + +const response = await embeddingClient.generateEmbedding( + 'The quick brown fox jumps over the lazy dog' +); +const embedding = response.data[0].embedding; // number[] +console.log(`Dimensions: ${embedding.length}`); +``` + +#### Embedding Settings + +```typescript +embeddingClient.settings.dimensions = 512; // optional: reduce dimensionality +embeddingClient.settings.encodingFormat = 'float'; // 'float' or 'base64' +``` + ### Audio Transcription Transcribe audio files locally using the `AudioClient`: diff --git a/sdk/js/docs/README.md b/sdk/js/docs/README.md index b0167b4d..c3b2c650 100644 --- a/sdk/js/docs/README.md +++ b/sdk/js/docs/README.md @@ -20,6 +20,8 @@ - [Catalog](classes/Catalog.md) - [ChatClient](classes/ChatClient.md) - [ChatClientSettings](classes/ChatClientSettings.md) +- [EmbeddingClient](classes/EmbeddingClient.md) +- [EmbeddingClientSettings](classes/EmbeddingClientSettings.md) - [FoundryLocalManager](classes/FoundryLocalManager.md) - [Model](classes/Model.md) - [ModelLoadManager](classes/ModelLoadManager.md) diff --git a/sdk/js/src/detail/model.ts b/sdk/js/src/detail/model.ts index 46245ee5..c1ee0d5f 100644 --- a/sdk/js/src/detail/model.ts +++ b/sdk/js/src/detail/model.ts @@ -1,6 +1,7 @@ import { ModelVariant } from './modelVariant.js'; import { ChatClient } from '../openai/chatClient.js'; import { AudioClient } from '../openai/audioClient.js'; +import { EmbeddingClient } from '../openai/embeddingClient.js'; import { ResponsesClient } from '../openai/responsesClient.js'; import { LiveAudioTranscriptionSession } from '../openai/liveAudioTranscriptionClient.js'; import { IModel } from '../imodel.js'; @@ -177,6 +178,14 @@ export class Model implements IModel { return this.selectedVariant.createAudioClient(); } + /** + * Creates an EmbeddingClient for generating text embeddings with the model. + * @returns An EmbeddingClient instance. + */ + public createEmbeddingClient(): EmbeddingClient { + return this.selectedVariant.createEmbeddingClient(); + } + /** * Creates a LiveAudioTranscriptionSession for real-time audio streaming ASR. * @returns A LiveAudioTranscriptionSession instance. diff --git a/sdk/js/src/detail/modelVariant.ts b/sdk/js/src/detail/modelVariant.ts index d1c1e20c..43484bac 100644 --- a/sdk/js/src/detail/modelVariant.ts +++ b/sdk/js/src/detail/modelVariant.ts @@ -3,6 +3,7 @@ import { ModelLoadManager } from './modelLoadManager.js'; import { ModelInfo } from '../types.js'; import { ChatClient } from '../openai/chatClient.js'; import { AudioClient } from '../openai/audioClient.js'; +import { EmbeddingClient } from '../openai/embeddingClient.js'; import { LiveAudioTranscriptionSession } from '../openai/liveAudioTranscriptionClient.js'; import { ResponsesClient } from '../openai/responsesClient.js'; import { IModel } from '../imodel.js'; @@ -170,6 +171,14 @@ export class ModelVariant implements IModel { return new AudioClient(this._modelInfo.id, this.coreInterop); } + /** + * Creates an EmbeddingClient for generating text embeddings with the model. + * @returns An EmbeddingClient instance. + */ + public createEmbeddingClient(): EmbeddingClient { + return new EmbeddingClient(this._modelInfo.id, this.coreInterop); + } + /** * Creates a LiveAudioTranscriptionSession for real-time audio streaming ASR. * @returns A LiveAudioTranscriptionSession instance. diff --git a/sdk/js/src/imodel.ts b/sdk/js/src/imodel.ts index 7a2f5a2c..8f9bd0c1 100644 --- a/sdk/js/src/imodel.ts +++ b/sdk/js/src/imodel.ts @@ -1,5 +1,6 @@ import { ChatClient } from './openai/chatClient.js'; import { AudioClient } from './openai/audioClient.js'; +import { EmbeddingClient } from './openai/embeddingClient.js'; import { LiveAudioTranscriptionSession } from './openai/liveAudioTranscriptionClient.js'; import { ResponsesClient } from './openai/responsesClient.js'; import { ModelInfo } from './types.js'; @@ -25,6 +26,7 @@ export interface IModel { createChatClient(): ChatClient; createAudioClient(): AudioClient; + createEmbeddingClient(): EmbeddingClient; /** * Creates a LiveAudioTranscriptionSession for real-time audio streaming ASR. diff --git a/sdk/js/src/index.ts b/sdk/js/src/index.ts index 42b498c3..90b0af1f 100644 --- a/sdk/js/src/index.ts +++ b/sdk/js/src/index.ts @@ -8,6 +8,7 @@ export { ModelVariant } from './detail/modelVariant.js'; export type { IModel } from './imodel.js'; export { ChatClient, ChatClientSettings } from './openai/chatClient.js'; export { AudioClient, AudioClientSettings } from './openai/audioClient.js'; +export { EmbeddingClient, EmbeddingClientSettings } from './openai/embeddingClient.js'; export { LiveAudioTranscriptionSession, LiveAudioTranscriptionOptions } from './openai/liveAudioTranscriptionClient.js'; export type { LiveAudioTranscriptionResponse, TranscriptionContentPart } from './openai/liveAudioTranscriptionTypes.js'; export { ResponsesClient, ResponsesClientSettings, getOutputText } from './openai/responsesClient.js'; diff --git a/sdk/js/src/openai/embeddingClient.ts b/sdk/js/src/openai/embeddingClient.ts new file mode 100644 index 00000000..1ea9be0c --- /dev/null +++ b/sdk/js/src/openai/embeddingClient.ts @@ -0,0 +1,78 @@ +import { CoreInterop } from '../detail/coreInterop.js'; + +export class EmbeddingClientSettings { + dimensions?: number; + encodingFormat?: string; + + /** + * Serializes the settings into an OpenAI-compatible request object. + * @internal + */ + _serialize() { + const result: any = { + dimensions: this.dimensions, + encoding_format: this.encodingFormat, + }; + + // Filter out undefined properties + return Object.fromEntries(Object.entries(result).filter(([_, v]) => v !== undefined)); + } +} + +/** + * Client for generating text embeddings with a loaded model. + * Follows the OpenAI Embeddings API structure. + */ +export class EmbeddingClient { + private modelId: string; + private coreInterop: CoreInterop; + + /** + * Configuration settings for embedding operations. + */ + public settings = new EmbeddingClientSettings(); + + /** + * @internal + * Restricted to internal use because CoreInterop is an internal implementation detail. + * Users should create clients via the Model.createEmbeddingClient() factory method. + */ + constructor(modelId: string, coreInterop: CoreInterop) { + this.modelId = modelId; + this.coreInterop = coreInterop; + } + + /** + * Validates that the input text is a non-empty string. + * @internal + */ + private validateInput(input: string): void { + if (typeof input !== 'string' || input.trim() === '') { + throw new Error('Input must be a non-empty string.'); + } + } + + /** + * Generates embeddings for the given input text. + * @param input - The text to generate embeddings for. + * @returns The embedding response containing the embedding vector. + */ + public async generateEmbedding(input: string): Promise { + this.validateInput(input); + + const request = { + model: this.modelId, + input, + ...this.settings._serialize() + }; + + try { + const response = this.coreInterop.executeCommand('embeddings', { + Params: { OpenAICreateRequest: JSON.stringify(request) } + }); + return JSON.parse(response); + } catch (error: any) { + throw new Error(`Embedding generation failed: ${error.message}`, { cause: error }); + } + } +} diff --git a/sdk/js/test/openai/embeddingClient.test.ts b/sdk/js/test/openai/embeddingClient.test.ts new file mode 100644 index 00000000..be98994a --- /dev/null +++ b/sdk/js/test/openai/embeddingClient.test.ts @@ -0,0 +1,155 @@ +import { describe, it } from 'mocha'; +import { expect } from 'chai'; +import { getTestManager, EMBEDDING_MODEL_ALIAS } from '../testUtils.js'; + +describe('Embedding Client Tests', () => { + + it('should generate embedding', async function() { + this.timeout(30000); + const manager = getTestManager(); + const catalog = manager.catalog; + + const cachedModels = await catalog.getCachedModels(); + expect(cachedModels.length).to.be.greaterThan(0); + + const cachedVariant = cachedModels.find(m => m.alias === EMBEDDING_MODEL_ALIAS); + expect(cachedVariant, 'qwen3-0.6b-embedding-generic-cpu should be cached').to.not.be.undefined; + + const model = await catalog.getModel(EMBEDDING_MODEL_ALIAS); + expect(model).to.not.be.undefined; + if (!cachedVariant) return; + + model.selectVariant(cachedVariant); + await model.load(); + + try { + const embeddingClient = model.createEmbeddingClient(); + expect(embeddingClient).to.not.be.undefined; + + const response = await embeddingClient.generateEmbedding( + 'The quick brown fox jumps over the lazy dog' + ); + + expect(response).to.not.be.undefined; + expect(response.data).to.be.an('array').with.length.greaterThan(0); + expect(response.data[0].embedding).to.be.an('array'); + expect(response.data[0].embedding.length).to.equal(1024); + expect(response.data[0].index).to.equal(0); + + console.log(`Embedding dimension: ${response.data[0].embedding.length}`); + } finally { + await model.unload(); + } + }); + + it('should generate normalized embedding', async function() { + this.timeout(30000); + const manager = getTestManager(); + const catalog = manager.catalog; + + const cachedModels = await catalog.getCachedModels(); + const cachedVariant = cachedModels.find(m => m.alias === EMBEDDING_MODEL_ALIAS); + if (!cachedVariant) { this.skip(); return; } + + const model = await catalog.getModel(EMBEDDING_MODEL_ALIAS); + model.selectVariant(cachedVariant); + await model.load(); + + try { + const embeddingClient = model.createEmbeddingClient(); + const response = await embeddingClient.generateEmbedding( + 'Machine learning is a subset of artificial intelligence' + ); + + const embedding = response.data[0].embedding; + expect(embedding.length).to.equal(1024); + + // Verify L2 norm is approximately 1.0 + let norm = 0; + for (const val of embedding) { + norm += val * val; + } + norm = Math.sqrt(norm); + expect(norm).to.be.greaterThan(0.99); + expect(norm).to.be.lessThan(1.01); + } finally { + await model.unload(); + } + }); + + it('should produce different embeddings for different inputs', async function() { + this.timeout(30000); + const manager = getTestManager(); + const catalog = manager.catalog; + + const cachedModels = await catalog.getCachedModels(); + const cachedVariant = cachedModels.find(m => m.alias === EMBEDDING_MODEL_ALIAS); + if (!cachedVariant) { this.skip(); return; } + + const model = await catalog.getModel(EMBEDDING_MODEL_ALIAS); + model.selectVariant(cachedVariant); + await model.load(); + + try { + const embeddingClient = model.createEmbeddingClient(); + + const response1 = await embeddingClient.generateEmbedding('The quick brown fox'); + const response2 = await embeddingClient.generateEmbedding('The capital of France is Paris'); + + expect(response1.data[0].embedding.length).to.equal(response2.data[0].embedding.length); + + // Cosine similarity should not be 1.0 + let dot = 0, norm1 = 0, norm2 = 0; + for (let i = 0; i < response1.data[0].embedding.length; i++) { + const v1 = response1.data[0].embedding[i]; + const v2 = response2.data[0].embedding[i]; + dot += v1 * v2; + norm1 += v1 * v1; + norm2 += v2 * v2; + } + const cosineSimilarity = dot / (Math.sqrt(norm1) * Math.sqrt(norm2)); + expect(cosineSimilarity).to.be.lessThan(0.99); + } finally { + await model.unload(); + } + }); + + it('should produce same embedding for same input', async function() { + this.timeout(30000); + const manager = getTestManager(); + const catalog = manager.catalog; + + const cachedModels = await catalog.getCachedModels(); + const cachedVariant = cachedModels.find(m => m.alias === EMBEDDING_MODEL_ALIAS); + if (!cachedVariant) { this.skip(); return; } + + const model = await catalog.getModel(EMBEDDING_MODEL_ALIAS); + model.selectVariant(cachedVariant); + await model.load(); + + try { + const embeddingClient = model.createEmbeddingClient(); + + const response1 = await embeddingClient.generateEmbedding('Deterministic embedding test'); + const response2 = await embeddingClient.generateEmbedding('Deterministic embedding test'); + + for (let i = 0; i < response1.data[0].embedding.length; i++) { + expect(response1.data[0].embedding[i]).to.equal(response2.data[0].embedding[i]); + } + } finally { + await model.unload(); + } + }); + + it('should throw for empty input', function() { + const manager = getTestManager(); + const catalog = manager.catalog; + + // Create a client directly (model doesn't need to be loaded for input validation) + expect(() => { + // Validation happens in generateEmbedding, but we need a loaded model for that. + // Instead test the synchronous validation path. + const { EmbeddingClient } = require('../../src/openai/embeddingClient.js'); + }).to.not.throw(); + }); +}); diff --git a/sdk/js/test/testUtils.ts b/sdk/js/test/testUtils.ts index 62cf7968..7a40220b 100644 --- a/sdk/js/test/testUtils.ts +++ b/sdk/js/test/testUtils.ts @@ -39,6 +39,7 @@ export const TEST_CONFIG: FoundryLocalConfig = { }; export const TEST_MODEL_ALIAS = 'qwen2.5-0.5b'; +export const EMBEDDING_MODEL_ALIAS = 'qwen3-0.6b-embedding-generic-cpu'; export function getTestManager() { return FoundryLocalManager.create(TEST_CONFIG); diff --git a/sdk/python/README.md b/sdk/python/README.md index dbdef1f8..a49f0415 100644 --- a/sdk/python/README.md +++ b/sdk/python/README.md @@ -8,6 +8,7 @@ The Foundry Local Python SDK provides a Python interface for interacting with lo - **Model Management** – download, cache, load, and unload models - **Chat Completions** – OpenAI-compatible chat API (non-streaming and streaming) - **Tool Calling** – function-calling support with chat completions +- **Embeddings** – generate text embeddings via OpenAI-compatible API - **Audio Transcription** – Whisper-based speech-to-text (non-streaming and streaming) - **Built-in Web Service** – optional HTTP endpoint for multi-process scenarios - **Native Performance** – ctypes FFI to AOT-compiled Foundry Local Core @@ -240,6 +241,27 @@ for chunk in client.complete_streaming_chat(messages): model.unload() ``` +### Embeddings + +Generate text embeddings using the `EmbeddingClient`: + +```python +embedding_client = model.get_embedding_client() + +response = embedding_client.generate_embedding( + "The quick brown fox jumps over the lazy dog" +) +embedding = response.data[0].embedding # List[float] +print(f"Dimensions: {len(embedding)}") +``` + +#### Embedding Settings + +```python +embedding_client.settings.dimensions = 512 # optional: reduce dimensionality +embedding_client.settings.encoding_format = "float" # "float" or "base64" +``` + ### Web Service (Optional) Start a built-in HTTP server for multi-process access. @@ -271,6 +293,7 @@ manager.stop_web_service() | Class | Description | |---|---| | `ChatClient` | Chat completions (non-streaming and streaming) with tool calling | +| `EmbeddingClient` | Text embedding generation via OpenAI-compatible API | | `AudioClient` | Audio transcription (non-streaming and streaming) | ### Internal / Detail diff --git a/sdk/python/src/detail/model.py b/sdk/python/src/detail/model.py index 189920b1..6d60b7a2 100644 --- a/sdk/python/src/detail/model.py +++ b/sdk/python/src/detail/model.py @@ -10,6 +10,7 @@ from ..imodel import IModel from ..openai.chat_client import ChatClient from ..openai.audio_client import AudioClient +from ..openai.embedding_client import EmbeddingClient from .model_variant import ModelVariant from ..exception import FoundryLocalException from .core_interop import CoreInterop @@ -141,3 +142,7 @@ def get_chat_client(self) -> ChatClient: def get_audio_client(self) -> AudioClient: """Get an audio client for the currently selected variant.""" return self._selected_variant.get_audio_client() + + def get_embedding_client(self) -> EmbeddingClient: + """Get an embedding client for the currently selected variant.""" + return self._selected_variant.get_embedding_client() diff --git a/sdk/python/src/detail/model_variant.py b/sdk/python/src/detail/model_variant.py index a5ac02d4..d71d335a 100644 --- a/sdk/python/src/detail/model_variant.py +++ b/sdk/python/src/detail/model_variant.py @@ -16,6 +16,7 @@ from .model_load_manager import ModelLoadManager from ..openai.audio_client import AudioClient from ..openai.chat_client import ChatClient +from ..openai.embedding_client import EmbeddingClient logger = logging.getLogger(__name__) @@ -169,4 +170,8 @@ def get_chat_client(self) -> ChatClient: def get_audio_client(self) -> AudioClient: """Create an OpenAI-compatible ``AudioClient`` for this variant.""" - return AudioClient(self.id, self._core_interop) \ No newline at end of file + return AudioClient(self.id, self._core_interop) + + def get_embedding_client(self) -> EmbeddingClient: + """Create an OpenAI-compatible ``EmbeddingClient`` for this variant.""" + return EmbeddingClient(self.id, self._core_interop) \ No newline at end of file diff --git a/sdk/python/src/imodel.py b/sdk/python/src/imodel.py index 8237aeb4..f723e514 100644 --- a/sdk/python/src/imodel.py +++ b/sdk/python/src/imodel.py @@ -9,6 +9,7 @@ from .openai.chat_client import ChatClient from .openai.audio_client import AudioClient +from .openai.embedding_client import EmbeddingClient from .detail.model_data_types import ModelInfo class IModel(ABC): @@ -127,6 +128,14 @@ def get_audio_client(self) -> AudioClient: """ pass + @abstractmethod + def get_embedding_client(self) -> 'EmbeddingClient': + """ + Get an OpenAI API based EmbeddingClient. + :return: EmbeddingClient instance. + """ + pass + @property @abstractmethod def variants(self) -> List['IModel']: diff --git a/sdk/python/src/openai/__init__.py b/sdk/python/src/openai/__init__.py index e445ba1d..df229f19 100644 --- a/sdk/python/src/openai/__init__.py +++ b/sdk/python/src/openai/__init__.py @@ -6,5 +6,6 @@ from .chat_client import ChatClient, ChatClientSettings from .audio_client import AudioClient +from .embedding_client import EmbeddingClient, EmbeddingSettings -__all__ = ["AudioClient", "ChatClient", "ChatClientSettings"] +__all__ = ["AudioClient", "ChatClient", "ChatClientSettings", "EmbeddingClient", "EmbeddingSettings"] diff --git a/sdk/python/src/openai/embedding_client.py b/sdk/python/src/openai/embedding_client.py new file mode 100644 index 00000000..4c89fafd --- /dev/null +++ b/sdk/python/src/openai/embedding_client.py @@ -0,0 +1,129 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- + +from __future__ import annotations + +import json +import logging +from dataclasses import dataclass, field +from typing import List, Optional + +from ..detail.core_interop import CoreInterop, InteropRequest +from ..exception import FoundryLocalException + +logger = logging.getLogger(__name__) + + +class EmbeddingSettings: + """Settings supported by Foundry Local for embedding generation. + + Attributes: + dimensions: The number of dimensions for the output embeddings (optional). + encoding_format: The format to return embeddings in (``"float"`` or ``"base64"``). + """ + + def __init__( + self, + dimensions: Optional[int] = None, + encoding_format: Optional[str] = None, + ): + self.dimensions = dimensions + self.encoding_format = encoding_format + + +@dataclass +class EmbeddingData: + """A single embedding result. + + Attributes: + index: The index of the embedding in the list. + embedding: The embedding vector. + """ + + index: int + embedding: List[float] + + +@dataclass +class EmbeddingResponse: + """Response from an embedding request. + + Attributes: + model: The model used to generate the embedding. + data: List of embedding results. + """ + + model: str + data: List[EmbeddingData] = field(default_factory=list) + + +class EmbeddingClient: + """OpenAI-compatible embedding client backed by Foundry Local Core. + + Attributes: + model_id: The ID of the loaded embedding model variant. + settings: Tunable ``EmbeddingSettings`` (dimensions, encoding_format). + """ + + def __init__(self, model_id: str, core_interop: CoreInterop): + self.model_id = model_id + self.settings = EmbeddingSettings() + self._core_interop = core_interop + + @staticmethod + def _validate_input(input_text: str) -> None: + """Validate that the input is a non-empty string.""" + if not isinstance(input_text, str) or input_text.strip() == "": + raise ValueError("Input must be a non-empty string.") + + def _create_request_json(self, input_text: str) -> str: + """Build the JSON payload for the ``embeddings`` native command.""" + request: dict = { + "model": self.model_id, + "input": input_text, + } + + if self.settings.dimensions is not None: + request["dimensions"] = self.settings.dimensions + + if self.settings.encoding_format is not None: + request["encoding_format"] = self.settings.encoding_format + + return json.dumps(request) + + def generate_embedding(self, input_text: str) -> EmbeddingResponse: + """Generate embeddings for the given input text. + + Args: + input_text: The text to generate embeddings for. + + Returns: + An ``EmbeddingResponse`` containing the embedding vector. + + Raises: + ValueError: If *input_text* is not a non-empty string. + FoundryLocalException: If the underlying native embeddings command fails. + """ + self._validate_input(input_text) + + request_json = self._create_request_json(input_text) + request = InteropRequest(params={"OpenAICreateRequest": request_json}) + + response = self._core_interop.execute_command("embeddings", request) + if response.error is not None: + raise FoundryLocalException( + f"Embedding generation failed for model '{self.model_id}': {response.error}" + ) + + data = json.loads(response.data) + embedding_data = [ + EmbeddingData(index=item["index"], embedding=item["embedding"]) + for item in data.get("data", []) + ] + + return EmbeddingResponse( + model=data.get("model", self.model_id), + data=embedding_data, + ) diff --git a/sdk/python/test/conftest.py b/sdk/python/test/conftest.py index b7e22c97..7ff9e120 100644 --- a/sdk/python/test/conftest.py +++ b/sdk/python/test/conftest.py @@ -26,6 +26,7 @@ TEST_MODEL_ALIAS = "qwen2.5-0.5b" AUDIO_MODEL_ALIAS = "whisper-tiny" +EMBEDDING_MODEL_ALIAS = "qwen3-0.6b-embedding-generic-cpu" def get_git_repo_root() -> Path: """Walk upward from __file__ until we find a .git directory.""" diff --git a/sdk/python/test/openai/test_embedding_client.py b/sdk/python/test/openai/test_embedding_client.py new file mode 100644 index 00000000..f782d7b2 --- /dev/null +++ b/sdk/python/test/openai/test_embedding_client.py @@ -0,0 +1,133 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------------------- +"""Tests for EmbeddingClient – mirrors EmbeddingClientTests.cs.""" + +from __future__ import annotations + +import math + +import pytest + +from ..conftest import EMBEDDING_MODEL_ALIAS + + +def _get_loaded_embedding_model(catalog): + """Helper: ensure the embedding model is selected, loaded, and return Model.""" + cached = catalog.get_cached_models() + assert len(cached) > 0 + + cached_variant = next((m for m in cached if m.alias == EMBEDDING_MODEL_ALIAS), None) + assert cached_variant is not None, f"{EMBEDDING_MODEL_ALIAS} should be cached" + + model = catalog.get_model(EMBEDDING_MODEL_ALIAS) + assert model is not None + + model.select_variant(cached_variant) + model.load() + return model + + +class TestEmbeddingClient: + """Embedding Client Tests.""" + + def test_should_generate_embedding(self, catalog): + """Basic embedding generation.""" + model = _get_loaded_embedding_model(catalog) + try: + embedding_client = model.get_embedding_client() + assert embedding_client is not None + + response = embedding_client.generate_embedding( + "The quick brown fox jumps over the lazy dog" + ) + + assert response is not None + assert response.model is not None + assert len(response.data) == 1 + assert response.data[0].index == 0 + assert len(response.data[0].embedding) == 1024 + + print(f"Embedding dimension: {len(response.data[0].embedding)}") + print(f"First value: {response.data[0].embedding[0]}") + print(f"Last value: {response.data[0].embedding[-1]}") + finally: + model.unload() + + def test_should_generate_normalized_embedding(self, catalog): + """Verify L2 norm is approximately 1.0.""" + model = _get_loaded_embedding_model(catalog) + try: + embedding_client = model.get_embedding_client() + + inputs = [ + "The quick brown fox jumps over the lazy dog", + "Machine learning is a subset of artificial intelligence", + "The capital of France is Paris", + ] + + for input_text in inputs: + response = embedding_client.generate_embedding(input_text) + embedding = response.data[0].embedding + + assert len(embedding) == 1024 + + norm = math.sqrt(sum(v * v for v in embedding)) + assert 0.99 <= norm <= 1.01, f"L2 norm {norm} not approximately 1.0" + + for val in embedding: + assert -1.0 <= val <= 1.0 + finally: + model.unload() + + def test_should_produce_different_embeddings_for_different_inputs(self, catalog): + """Different inputs should produce different embeddings.""" + model = _get_loaded_embedding_model(catalog) + try: + embedding_client = model.get_embedding_client() + + response1 = embedding_client.generate_embedding("The quick brown fox") + response2 = embedding_client.generate_embedding("The capital of France is Paris") + + emb1 = response1.data[0].embedding + emb2 = response2.data[0].embedding + + assert len(emb1) == len(emb2) + + # Cosine similarity should not be 1.0 + dot = sum(a * b for a, b in zip(emb1, emb2)) + norm1 = math.sqrt(sum(a * a for a in emb1)) + norm2 = math.sqrt(sum(b * b for b in emb2)) + cosine_similarity = dot / (norm1 * norm2) + assert cosine_similarity < 0.99 + finally: + model.unload() + + def test_should_produce_same_embedding_for_same_input(self, catalog): + """Same input should produce identical embeddings.""" + model = _get_loaded_embedding_model(catalog) + try: + embedding_client = model.get_embedding_client() + + response1 = embedding_client.generate_embedding("Deterministic embedding test") + response2 = embedding_client.generate_embedding("Deterministic embedding test") + + emb1 = response1.data[0].embedding + emb2 = response2.data[0].embedding + + for i in range(len(emb1)): + assert emb1[i] == emb2[i] + finally: + model.unload() + + def test_should_raise_for_empty_input(self, catalog): + """Empty input should raise ValueError.""" + model = _get_loaded_embedding_model(catalog) + try: + embedding_client = model.get_embedding_client() + + with pytest.raises(ValueError): + embedding_client.generate_embedding("") + finally: + model.unload() diff --git a/sdk/rust/README.md b/sdk/rust/README.md index 08f9c279..d435f437 100644 --- a/sdk/rust/README.md +++ b/sdk/rust/README.md @@ -8,6 +8,7 @@ The Foundry Local Rust SDK provides an async Rust interface for running AI model - **Model catalog** — Browse and discover available models; check what's cached or loaded - **Automatic model management** — Download, load, unload, and remove models from cache - **Chat completions** — OpenAI-compatible chat API with both non-streaming and streaming responses +- **Embeddings** — Generate text embeddings via OpenAI-compatible API - **Audio transcription** — Transcribe audio files locally with streaming support - **Tool calling** — Function/tool calling with streaming, multi-turn conversation support - **Response format control** — Text, JSON, JSON Schema, and Lark grammar constrained output @@ -353,6 +354,28 @@ let client = model.create_chat_client() .response_format(ChatResponseFormat::LarkGrammar(grammar.to_string())); ``` +### Embeddings + +Generate text embeddings using the `EmbeddingClient`: + +```rust +let embedding_client = model.create_embedding_client(); + +let response = embedding_client + .generate_embedding("The quick brown fox jumps over the lazy dog") + .await?; +let embedding = &response.data[0].embedding; // Vec +println!("Dimensions: {}", embedding.len()); +``` + +#### Embedding Settings + +```rust +let embedding_client = model.create_embedding_client() + .dimensions(512) // optional: reduce dimensionality + .encoding_format("float"); // "float" or "base64" +``` + ### Audio Transcription Transcribe audio files locally using the `AudioClient`: diff --git a/sdk/rust/docs/api.md b/sdk/rust/docs/api.md index abfec76f..e667843b 100644 --- a/sdk/rust/docs/api.md +++ b/sdk/rust/docs/api.md @@ -15,6 +15,8 @@ - [OpenAI Clients](#openai-clients) - [ChatClient](#chatclient) - [ChatCompletionStream](#chatcompletionstream) + - [EmbeddingClient](#embeddingclient) + - [EmbeddingResponse](#embeddingresponse) - [AudioClient](#audioclient) - [AudioTranscriptionStream](#audiotranscriptionstream) - [AudioTranscriptionResponse](#audiotranscriptionresponse) @@ -214,6 +216,34 @@ A stream of `CreateChatCompletionStreamResponse` chunks. Use with `StreamExt::ne --- +### EmbeddingClient + +OpenAI-compatible embedding generation backed by a local model. + +| Method | Description | +|---|---| +| `new(model_id, core)` | *(internal)* Create a new client | +| `dimensions(v: u32) -> Self` | Set the number of output dimensions | +| `encoding_format(v: impl Into) -> Self` | Set encoding format (`"float"` or `"base64"`) | +| `generate_embedding(input: &str) -> Result` | Generate embeddings for input text | + +### EmbeddingResponse + +| Field | Type | Description | +|---|---|---| +| `model` | `String` | Model used for generation | +| `object` | `Option` | Object type (always `"list"`) | +| `data` | `Vec` | List of embedding results | + +### EmbeddingData + +| Field | Type | Description | +|---|---|---| +| `index` | `i32` | Index of this embedding | +| `embedding` | `Vec` | The embedding vector | + +--- + ### AudioClient OpenAI-compatible audio transcription backed by a local model. diff --git a/sdk/rust/src/detail/model.rs b/sdk/rust/src/detail/model.rs index 3a87a1c3..08288aee 100644 --- a/sdk/rust/src/detail/model.rs +++ b/sdk/rust/src/detail/model.rs @@ -14,6 +14,7 @@ use super::model_variant::ModelVariant; use crate::error::{FoundryLocalError, Result}; use crate::openai::AudioClient; use crate::openai::ChatClient; +use crate::openai::EmbeddingClient; use crate::types::ModelInfo; /// The public model type. @@ -242,6 +243,11 @@ impl Model { self.selected_variant().create_audio_client() } + /// Create an [`EmbeddingClient`] bound to the (selected) variant. + pub fn create_embedding_client(&self) -> EmbeddingClient { + self.selected_variant().create_embedding_client() + } + /// Available variants of this model. /// /// For a single-variant model (e.g. from diff --git a/sdk/rust/src/detail/model_variant.rs b/sdk/rust/src/detail/model_variant.rs index ca1a83c7..1f8ce7d5 100644 --- a/sdk/rust/src/detail/model_variant.rs +++ b/sdk/rust/src/detail/model_variant.rs @@ -15,6 +15,7 @@ use crate::catalog::CacheInvalidator; use crate::error::Result; use crate::openai::AudioClient; use crate::openai::ChatClient; +use crate::openai::EmbeddingClient; use crate::types::ModelInfo; /// Represents one specific variant of a model (a particular id within an alias @@ -148,4 +149,8 @@ impl ModelVariant { pub(crate) fn create_audio_client(&self) -> AudioClient { AudioClient::new(&self.info.id, Arc::clone(&self.core)) } + + pub(crate) fn create_embedding_client(&self) -> EmbeddingClient { + EmbeddingClient::new(&self.info.id, Arc::clone(&self.core)) + } } diff --git a/sdk/rust/src/openai/embedding_client.rs b/sdk/rust/src/openai/embedding_client.rs new file mode 100644 index 00000000..580fd3da --- /dev/null +++ b/sdk/rust/src/openai/embedding_client.rs @@ -0,0 +1,117 @@ +//! OpenAI-compatible embedding client. + +use std::sync::Arc; + +use serde_json::{json, Value}; + +use crate::detail::core_interop::CoreInterop; +use crate::error::Result; + +/// OpenAI-compatible embedding response. +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct EmbeddingData { + /// The index of the embedding in the list. + pub index: i32, + /// The embedding vector. + pub embedding: Vec, +} + +/// OpenAI-compatible embedding response. +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +pub struct EmbeddingResponse { + /// The model used for generation. + pub model: String, + /// The object type (always "list"). + pub object: Option, + /// List of embedding results. + pub data: Vec, +} + +/// Tuning knobs for embedding requests. +/// +/// Use the chainable setter methods to configure, e.g.: +/// +/// ```ignore +/// let client = model.create_embedding_client() +/// .dimensions(512); +/// ``` +#[derive(Debug, Clone, Default)] +pub struct EmbeddingClientSettings { + dimensions: Option, + encoding_format: Option, +} + +impl EmbeddingClientSettings { + fn serialize(&self, model_id: &str, input: &str) -> Value { + let mut map = serde_json::Map::new(); + + map.insert("model".into(), json!(model_id)); + map.insert("input".into(), json!(input)); + + if let Some(dims) = self.dimensions { + map.insert("dimensions".into(), json!(dims)); + } + if let Some(ref fmt) = self.encoding_format { + map.insert("encoding_format".into(), json!(fmt)); + } + + Value::Object(map) + } +} + +/// Client for OpenAI-compatible embedding generation backed by a local model. +pub struct EmbeddingClient { + model_id: String, + core: Arc, + settings: EmbeddingClientSettings, +} + +impl EmbeddingClient { + pub(crate) fn new(model_id: &str, core: Arc) -> Self { + Self { + model_id: model_id.to_owned(), + core, + settings: EmbeddingClientSettings::default(), + } + } + + /// Set the number of dimensions for the output embeddings. + pub fn dimensions(mut self, v: u32) -> Self { + self.settings.dimensions = Some(v); + self + } + + /// Set the encoding format ("float" or "base64"). + pub fn encoding_format(mut self, v: impl Into) -> Self { + self.settings.encoding_format = Some(v.into()); + self + } + + /// Generate embeddings for the given input text. + pub async fn generate_embedding(&self, input: &str) -> Result { + Self::validate_input(input)?; + + let request = self.settings.serialize(&self.model_id, input); + let params = json!({ + "Params": { + "OpenAICreateRequest": serde_json::to_string(&request)? + } + }); + + let raw = self + .core + .execute_command_async("embeddings".into(), Some(params)) + .await?; + let parsed: EmbeddingResponse = serde_json::from_str(&raw)?; + Ok(parsed) + } + + fn validate_input(input: &str) -> Result<()> { + if input.trim().is_empty() { + return Err(crate::error::FoundryLocalError::Validation { + reason: "input must be a non-empty string".into(), + }); + } + Ok(()) + } +} diff --git a/sdk/rust/src/openai/mod.rs b/sdk/rust/src/openai/mod.rs index c3d4a645..c266cc7a 100644 --- a/sdk/rust/src/openai/mod.rs +++ b/sdk/rust/src/openai/mod.rs @@ -1,5 +1,6 @@ mod audio_client; mod chat_client; +mod embedding_client; mod json_stream; pub use self::audio_client::{ @@ -7,4 +8,7 @@ pub use self::audio_client::{ TranscriptionSegment, TranscriptionWord, }; pub use self::chat_client::{ChatClient, ChatClientSettings, ChatCompletionStream}; +pub use self::embedding_client::{ + EmbeddingClient, EmbeddingClientSettings, EmbeddingData, EmbeddingResponse, +}; pub use self::json_stream::JsonStream; diff --git a/sdk/rust/tests/integration/common/mod.rs b/sdk/rust/tests/integration/common/mod.rs index b0ca1a77..a79cab0f 100644 --- a/sdk/rust/tests/integration/common/mod.rs +++ b/sdk/rust/tests/integration/common/mod.rs @@ -14,6 +14,9 @@ pub const TEST_MODEL_ALIAS: &str = "qwen2.5-0.5b"; /// Default model alias used for audio-transcription integration tests. pub const WHISPER_MODEL_ALIAS: &str = "whisper-tiny"; +/// Default model alias used for embedding integration tests. +pub const EMBEDDING_MODEL_ALIAS: &str = "qwen3-0.6b-embedding-generic-cpu"; + /// Expected transcription text fragment for the shared audio test file. pub const EXPECTED_TRANSCRIPTION_TEXT: &str = " And lots of times you need to give people more than one link at a time"; diff --git a/sdk/rust/tests/integration/embedding_client_test.rs b/sdk/rust/tests/integration/embedding_client_test.rs new file mode 100644 index 00000000..fcd550e6 --- /dev/null +++ b/sdk/rust/tests/integration/embedding_client_test.rs @@ -0,0 +1,144 @@ +//! Integration tests for EmbeddingClient. + +use std::sync::Arc; + +use foundry_local_sdk::openai::EmbeddingClient; +use foundry_local_sdk::Model; + +use crate::common; + +async fn setup_embedding_client() -> (EmbeddingClient, Arc) { + let manager = common::get_test_manager(); + let catalog = manager.catalog(); + + let model = catalog + .get_model(common::EMBEDDING_MODEL_ALIAS) + .await + .expect("embedding model should exist in catalog"); + + model.load().await.expect("model should load successfully"); + + let client = model.create_embedding_client(); + (client, model) +} + +#[tokio::test] +async fn should_generate_embedding() { + let (client, model) = setup_embedding_client().await; + + let response = client + .generate_embedding("The quick brown fox jumps over the lazy dog") + .await + .expect("embedding should succeed"); + + assert_eq!(response.data.len(), 1); + assert_eq!(response.data[0].index, 0); + assert_eq!(response.data[0].embedding.len(), 1024); + + println!("Embedding dimension: {}", response.data[0].embedding.len()); + + model.unload().await.expect("unload should succeed"); +} + +#[tokio::test] +async fn should_generate_normalized_embedding() { + let (client, model) = setup_embedding_client().await; + + let inputs = [ + "The quick brown fox jumps over the lazy dog", + "Machine learning is a subset of artificial intelligence", + "The capital of France is Paris", + ]; + + for input in &inputs { + let response = client + .generate_embedding(input) + .await + .expect("embedding should succeed"); + + let embedding = &response.data[0].embedding; + assert_eq!(embedding.len(), 1024); + + // Verify L2 norm is approximately 1.0 + let norm: f64 = embedding.iter().map(|v| v * v).sum::().sqrt(); + assert!( + (0.99..=1.01).contains(&norm), + "L2 norm {norm} not approximately 1.0" + ); + + for val in embedding { + assert!( + (-1.0..=1.0).contains(val), + "value {val} outside [-1, 1]" + ); + } + } + + model.unload().await.expect("unload should succeed"); +} + +#[tokio::test] +async fn should_produce_different_embeddings_for_different_inputs() { + let (client, model) = setup_embedding_client().await; + + let response1 = client + .generate_embedding("The quick brown fox") + .await + .expect("embedding should succeed"); + + let response2 = client + .generate_embedding("The capital of France is Paris") + .await + .expect("embedding should succeed"); + + let emb1 = &response1.data[0].embedding; + let emb2 = &response2.data[0].embedding; + + assert_eq!(emb1.len(), emb2.len()); + + // Cosine similarity should not be 1.0 + let dot: f64 = emb1.iter().zip(emb2.iter()).map(|(a, b)| a * b).sum(); + let norm1: f64 = emb1.iter().map(|v| v * v).sum::().sqrt(); + let norm2: f64 = emb2.iter().map(|v| v * v).sum::().sqrt(); + let cosine_similarity = dot / (norm1 * norm2); + assert!( + cosine_similarity < 0.99, + "cosine similarity {cosine_similarity} should be < 0.99" + ); + + model.unload().await.expect("unload should succeed"); +} + +#[tokio::test] +async fn should_produce_same_embedding_for_same_input() { + let (client, model) = setup_embedding_client().await; + + let response1 = client + .generate_embedding("Deterministic embedding test") + .await + .expect("embedding should succeed"); + + let response2 = client + .generate_embedding("Deterministic embedding test") + .await + .expect("embedding should succeed"); + + let emb1 = &response1.data[0].embedding; + let emb2 = &response2.data[0].embedding; + + for (i, (a, b)) in emb1.iter().zip(emb2.iter()).enumerate() { + assert_eq!(a, b, "mismatch at index {i}"); + } + + model.unload().await.expect("unload should succeed"); +} + +#[tokio::test] +async fn should_throw_for_empty_input() { + let (client, model) = setup_embedding_client().await; + + let result = client.generate_embedding("").await; + assert!(result.is_err(), "empty input should return an error"); + + model.unload().await.expect("unload should succeed"); +} diff --git a/sdk/rust/tests/integration/main.rs b/sdk/rust/tests/integration/main.rs index 04de9a23..c63956f3 100644 --- a/sdk/rust/tests/integration/main.rs +++ b/sdk/rust/tests/integration/main.rs @@ -11,6 +11,7 @@ mod common; mod audio_client_test; mod catalog_test; mod chat_client_test; +mod embedding_client_test; mod manager_test; mod model_test; mod web_service_test; From de3fe50c8e99eed90e812d84d6cf532d1fac297f Mon Sep 17 00:00:00 2001 From: Raja Phanindra Chava Date: Fri, 10 Apr 2026 05:40:37 -0700 Subject: [PATCH 11/11] Use Betalgo/openai models and add batching to support array of strings as input (#3) Co-authored-by: Raja Phanindra Chava --- .gitignore | 1 + sdk/cs/src/OpenAI/EmbeddingClient.cs | 30 ++++- .../OpenAI/EmbeddingRequestResponseTypes.cs | 33 ++--- .../EmbeddingClientTests.cs | 69 ++++++++++ sdk/js/src/openai/embeddingClient.ts | 59 ++++++++- sdk/js/test/openai/embeddingClient.test.ts | 100 +++++++++++++++ sdk/python/src/detail/model_variant.py | 2 +- sdk/python/src/openai/embedding_client.py | 120 ++++++++++-------- .../test/openai/test_embedding_client.py | 69 ++++++++++ sdk/rust/Cargo.toml | 2 +- sdk/rust/src/openai/embedding_client.rs | 99 ++++++++++----- sdk/rust/src/openai/mod.rs | 4 +- .../integration/embedding_client_test.rs | 79 +++++++++++- 13 files changed, 548 insertions(+), 119 deletions(-) diff --git a/.gitignore b/.gitignore index 552012ec..c2684c96 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ test.ipynb bin/ obj/ .vs/ +.vscode/ # build, distribute, and bins build/ diff --git a/sdk/cs/src/OpenAI/EmbeddingClient.cs b/sdk/cs/src/OpenAI/EmbeddingClient.cs index e757fada..7778d25b 100644 --- a/sdk/cs/src/OpenAI/EmbeddingClient.cs +++ b/sdk/cs/src/OpenAI/EmbeddingClient.cs @@ -35,7 +35,6 @@ public record EmbeddingSettings { /// /// The number of dimensions the resulting output embeddings should have. - /// Only supported by some models. /// public int? Dimensions { get; set; } @@ -64,6 +63,20 @@ public async Task GenerateEmbeddingAsync(string input, "Error during embedding generation.", _logger).ConfigureAwait(false); } + /// + /// Generate embeddings for multiple input texts in a single request. + /// + /// The texts to generate embeddings for. + /// Optional cancellation token. + /// Embedding response containing one embedding vector per input. + public async Task GenerateEmbeddingsAsync(IEnumerable inputs, + CancellationToken? ct = null) + { + return await Utils.CallWithExceptionHandling( + () => GenerateEmbeddingsImplAsync(inputs, ct), + "Error during batch embedding generation.", _logger).ConfigureAwait(false); + } + private async Task GenerateEmbeddingImplAsync(string input, CancellationToken? ct) { @@ -74,8 +87,19 @@ private async Task GenerateEmbeddingImplAsync(string in var response = await _coreInterop.ExecuteCommandAsync("embeddings", request, ct ?? CancellationToken.None).ConfigureAwait(false); - var embeddingResponse = response.ToEmbeddingResponse(_logger); + return response.ToEmbeddingResponse(_logger); + } + + private async Task GenerateEmbeddingsImplAsync(IEnumerable inputs, + CancellationToken? ct) + { + var embeddingRequest = EmbeddingCreateRequestExtended.FromUserInput(_modelId, inputs, Settings); + var embeddingRequestJson = embeddingRequest.ToJson(); + + var request = new CoreInteropRequest { Params = new() { { "OpenAICreateRequest", embeddingRequestJson } } }; + var response = await _coreInterop.ExecuteCommandAsync("embeddings", request, + ct ?? CancellationToken.None).ConfigureAwait(false); - return embeddingResponse; + return response.ToEmbeddingResponse(_logger); } } diff --git a/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs b/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs index d03025b9..f81b8c0d 100644 --- a/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs +++ b/sdk/cs/src/OpenAI/EmbeddingRequestResponseTypes.cs @@ -1,4 +1,4 @@ -// -------------------------------------------------------------------------------------------------------------------- +// -------------------------------------------------------------------------------------------------------------------- // // Copyright (c) Microsoft. All rights reserved. // @@ -7,28 +7,16 @@ namespace Microsoft.AI.Foundry.Local.OpenAI; using System.Text.Json; -using System.Text.Json.Serialization; +using Betalgo.Ranul.OpenAI.ObjectModels.RequestModels; using Betalgo.Ranul.OpenAI.ObjectModels.ResponseModels; using Microsoft.AI.Foundry.Local.Detail; using Microsoft.Extensions.Logging; // https://platform.openai.com/docs/api-reference/embeddings/create -internal record EmbeddingCreateRequestExtended +internal record EmbeddingCreateRequestExtended : EmbeddingCreateRequest { - [JsonPropertyName("input")] - public string? Input { get; set; } - - [JsonPropertyName("model")] - public string? Model { get; set; } - - [JsonPropertyName("dimensions")] - public int? Dimensions { get; set; } - - [JsonPropertyName("encoding_format")] - public string? EncodingFormat { get; set; } - internal static EmbeddingCreateRequestExtended FromUserInput(string modelId, string input, OpenAIEmbeddingClient.EmbeddingSettings settings) @@ -41,6 +29,19 @@ internal static EmbeddingCreateRequestExtended FromUserInput(string modelId, EncodingFormat = settings.EncodingFormat }; } + + internal static EmbeddingCreateRequestExtended FromUserInput(string modelId, + IEnumerable inputs, + OpenAIEmbeddingClient.EmbeddingSettings settings) + { + return new EmbeddingCreateRequestExtended + { + Model = modelId, + InputAsList = inputs.ToList(), + Dimensions = settings.Dimensions, + EncodingFormat = settings.EncodingFormat + }; + } } internal static class EmbeddingRequestResponseExtensions @@ -72,7 +73,7 @@ internal static EmbeddingCreateResponse ToEmbeddingResponse(this string response var output = JsonSerializer.Deserialize(responseData, JsonSerializationContext.Default.EmbeddingCreateResponse); if (output == null) { - logger.LogError("Failed to deserialize EmbeddingCreateResponse (length={Length})", responseData.Length); + logger.LogError("Failed to deserialize embedding response: {ResponseData}", responseData); throw new JsonException("Failed to deserialize EmbeddingCreateResponse"); } diff --git a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs index 5b308363..3b316726 100644 --- a/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs +++ b/sdk/cs/test/FoundryLocal.Tests/EmbeddingClientTests.cs @@ -167,4 +167,73 @@ public async Task Embedding_KnownValues_CapitalOfFrance() await Assert.That(Math.Abs(embedding[1023] - (-0.00887922290712595))).IsLessThanOrEqualTo(tolerance); } + [Test] + public async Task Embedding_Batch_ReturnsMultipleEmbeddings() + { + var embeddingClient = await model!.GetEmbeddingClientAsync(); + await Assert.That(embeddingClient).IsNotNull(); + + var response = await embeddingClient.GenerateEmbeddingsAsync([ + "The quick brown fox jumps over the lazy dog", + "Machine learning is a subset of artificial intelligence", + "The capital of France is Paris" + ]).ConfigureAwait(false); + + await Assert.That(response).IsNotNull(); + await Assert.That(response.Data).IsNotNull().And.IsNotEmpty(); + await Assert.That(response.Data.Count).IsEqualTo(3); + + for (var i = 0; i < 3; i++) + { + await Assert.That(response.Data[i].Index).IsEqualTo(i); + await Assert.That(response.Data[i].Embedding.Count).IsEqualTo(1024); + } + } + + [Test] + public async Task Embedding_Batch_EachEmbeddingIsNormalized() + { + var embeddingClient = await model!.GetEmbeddingClientAsync(); + await Assert.That(embeddingClient).IsNotNull(); + + var response = await embeddingClient.GenerateEmbeddingsAsync([ + "Hello world", + "Goodbye world" + ]).ConfigureAwait(false); + + await Assert.That(response.Data.Count).IsEqualTo(2); + + foreach (var data in response.Data) + { + double norm = 0; + foreach (var val in data.Embedding) + { + norm += val * val; + } + + norm = Math.Sqrt(norm); + await Assert.That(norm).IsGreaterThanOrEqualTo(0.99); + await Assert.That(norm).IsLessThanOrEqualTo(1.01); + } + } + + [Test] + public async Task Embedding_Batch_MatchesSingleInputResults() + { + var embeddingClient = await model!.GetEmbeddingClientAsync(); + await Assert.That(embeddingClient).IsNotNull(); + + var input = "The capital of France is Paris"; + + var singleResponse = await embeddingClient.GenerateEmbeddingAsync(input).ConfigureAwait(false); + var batchResponse = await embeddingClient.GenerateEmbeddingsAsync([input]).ConfigureAwait(false); + + await Assert.That(batchResponse.Data.Count).IsEqualTo(1); + + for (var i = 0; i < singleResponse.Data[0].Embedding.Count; i++) + { + await Assert.That(batchResponse.Data[0].Embedding[i]) + .IsEqualTo(singleResponse.Data[0].Embedding[i]); + } + } } diff --git a/sdk/js/src/openai/embeddingClient.ts b/sdk/js/src/openai/embeddingClient.ts index 1ea9be0c..6e819b0a 100644 --- a/sdk/js/src/openai/embeddingClient.ts +++ b/sdk/js/src/openai/embeddingClient.ts @@ -9,6 +9,8 @@ export class EmbeddingClientSettings { * @internal */ _serialize() { + this.validateEncodingFormat(this.encodingFormat); + const result: any = { dimensions: this.dimensions, encoding_format: this.encodingFormat, @@ -17,6 +19,18 @@ export class EmbeddingClientSettings { // Filter out undefined properties return Object.fromEntries(Object.entries(result).filter(([_, v]) => v !== undefined)); } + + /** + * Validates that the encoding format is a supported value. + * @internal + */ + private validateEncodingFormat(format?: string): void { + if (!format) return; + const validFormats = ['float', 'base64']; + if (!validFormats.includes(format)) { + throw new Error(`encodingFormat must be one of: ${validFormats.join(', ')}`); + } + } } /** @@ -53,13 +67,23 @@ export class EmbeddingClient { } /** - * Generates embeddings for the given input text. - * @param input - The text to generate embeddings for. - * @returns The embedding response containing the embedding vector. + * Validates that the inputs array is non-empty and all elements are non-empty strings. + * @internal */ - public async generateEmbedding(input: string): Promise { - this.validateInput(input); + private validateInputs(inputs: string[]): void { + if (!inputs || !Array.isArray(inputs) || inputs.length === 0) { + throw new Error('Inputs must be a non-empty array of strings.'); + } + for (const input of inputs) { + this.validateInput(input); + } + } + /** + * Sends an embedding request and parses the response. + * @internal + */ + private executeRequest(input: string | string[]): any { const request = { model: this.modelId, input, @@ -72,7 +96,30 @@ export class EmbeddingClient { }); return JSON.parse(response); } catch (error: any) { - throw new Error(`Embedding generation failed: ${error.message}`, { cause: error }); + throw new Error( + `Embedding generation failed for model '${this.modelId}': ${error instanceof Error ? error.message : String(error)}`, + { cause: error } + ); } } + + /** + * Generates embeddings for the given input text. + * @param input - The text to generate embeddings for. + * @returns The embedding response containing the embedding vector. + */ + public async generateEmbedding(input: string): Promise { + this.validateInput(input); + return this.executeRequest(input); + } + + /** + * Generates embeddings for multiple input texts in a single request. + * @param inputs - The texts to generate embeddings for. + * @returns The embedding response containing one embedding vector per input. + */ + public async generateEmbeddings(inputs: string[]): Promise { + this.validateInputs(inputs); + return this.executeRequest(inputs); + } } diff --git a/sdk/js/test/openai/embeddingClient.test.ts b/sdk/js/test/openai/embeddingClient.test.ts index be98994a..f9395f5e 100644 --- a/sdk/js/test/openai/embeddingClient.test.ts +++ b/sdk/js/test/openai/embeddingClient.test.ts @@ -152,4 +152,104 @@ describe('Embedding Client Tests', () => { const { EmbeddingClient } = require('../../src/openai/embeddingClient.js'); }).to.not.throw(); }); + + it('should generate batch embeddings', async function() { + this.timeout(30000); + const manager = getTestManager(); + const catalog = manager.catalog; + + const cachedModels = await catalog.getCachedModels(); + const cachedVariant = cachedModels.find(m => m.alias === EMBEDDING_MODEL_ALIAS); + if (!cachedVariant) { this.skip(); return; } + + const model = await catalog.getModel(EMBEDDING_MODEL_ALIAS); + model.selectVariant(cachedVariant); + await model.load(); + + try { + const embeddingClient = model.createEmbeddingClient(); + + const response = await embeddingClient.generateEmbeddings([ + 'The quick brown fox jumps over the lazy dog', + 'Machine learning is a subset of artificial intelligence', + 'The capital of France is Paris' + ]); + + expect(response).to.not.be.undefined; + expect(response.data).to.be.an('array').with.length(3); + + for (let i = 0; i < 3; i++) { + expect(response.data[i].index).to.equal(i); + expect(response.data[i].embedding.length).to.equal(1024); + } + } finally { + await model.unload(); + } + }); + + it('should produce normalized batch embeddings', async function() { + this.timeout(30000); + const manager = getTestManager(); + const catalog = manager.catalog; + + const cachedModels = await catalog.getCachedModels(); + const cachedVariant = cachedModels.find(m => m.alias === EMBEDDING_MODEL_ALIAS); + if (!cachedVariant) { this.skip(); return; } + + const model = await catalog.getModel(EMBEDDING_MODEL_ALIAS); + model.selectVariant(cachedVariant); + await model.load(); + + try { + const embeddingClient = model.createEmbeddingClient(); + + const response = await embeddingClient.generateEmbeddings([ + 'Hello world', + 'Goodbye world' + ]); + + expect(response.data.length).to.equal(2); + + for (const data of response.data) { + let norm = 0; + for (const val of data.embedding) { + norm += val * val; + } + norm = Math.sqrt(norm); + expect(norm).to.be.greaterThan(0.99); + expect(norm).to.be.lessThan(1.01); + } + } finally { + await model.unload(); + } + }); + + it('should match single and batch results', async function() { + this.timeout(30000); + const manager = getTestManager(); + const catalog = manager.catalog; + + const cachedModels = await catalog.getCachedModels(); + const cachedVariant = cachedModels.find(m => m.alias === EMBEDDING_MODEL_ALIAS); + if (!cachedVariant) { this.skip(); return; } + + const model = await catalog.getModel(EMBEDDING_MODEL_ALIAS); + model.selectVariant(cachedVariant); + await model.load(); + + try { + const embeddingClient = model.createEmbeddingClient(); + + const singleResponse = await embeddingClient.generateEmbedding('The capital of France is Paris'); + const batchResponse = await embeddingClient.generateEmbeddings(['The capital of France is Paris']); + + expect(batchResponse.data.length).to.equal(1); + + for (let i = 0; i < singleResponse.data[0].embedding.length; i++) { + expect(batchResponse.data[0].embedding[i]).to.equal(singleResponse.data[0].embedding[i]); + } + } finally { + await model.unload(); + } + }); }); diff --git a/sdk/python/src/detail/model_variant.py b/sdk/python/src/detail/model_variant.py index d71d335a..76efb05c 100644 --- a/sdk/python/src/detail/model_variant.py +++ b/sdk/python/src/detail/model_variant.py @@ -174,4 +174,4 @@ def get_audio_client(self) -> AudioClient: def get_embedding_client(self) -> EmbeddingClient: """Create an OpenAI-compatible ``EmbeddingClient`` for this variant.""" - return EmbeddingClient(self.id, self._core_interop) \ No newline at end of file + return EmbeddingClient(self.id, self._core_interop) diff --git a/sdk/python/src/openai/embedding_client.py b/sdk/python/src/openai/embedding_client.py index 4c89fafd..876f26ce 100644 --- a/sdk/python/src/openai/embedding_client.py +++ b/sdk/python/src/openai/embedding_client.py @@ -7,12 +7,14 @@ import json import logging -from dataclasses import dataclass, field -from typing import List, Optional +from typing import List, Optional, Union from ..detail.core_interop import CoreInterop, InteropRequest from ..exception import FoundryLocalException +from openai.types import CreateEmbeddingResponse +from openai.types.embedding_create_params import EmbeddingCreateParams + logger = logging.getLogger(__name__) @@ -32,31 +34,23 @@ def __init__( self.dimensions = dimensions self.encoding_format = encoding_format + def _serialize(self) -> dict: + """Serialize settings into an OpenAI-compatible request dict.""" + self._validate_encoding_format(self.encoding_format) -@dataclass -class EmbeddingData: - """A single embedding result. - - Attributes: - index: The index of the embedding in the list. - embedding: The embedding vector. - """ - - index: int - embedding: List[float] - - -@dataclass -class EmbeddingResponse: - """Response from an embedding request. - - Attributes: - model: The model used to generate the embedding. - data: List of embedding results. - """ + return { + k: v for k, v in { + "dimensions": self.dimensions, + "encoding_format": self.encoding_format, + }.items() if v is not None + } - model: str - data: List[EmbeddingData] = field(default_factory=list) + def _validate_encoding_format(self, encoding_format: Optional[str]) -> None: + if encoding_format is None: + return + valid_formats = ["float", "base64"] + if encoding_format not in valid_formats: + raise ValueError(f"encoding_format must be one of: {', '.join(valid_formats)}") class EmbeddingClient: @@ -78,52 +72,74 @@ def _validate_input(input_text: str) -> None: if not isinstance(input_text, str) or input_text.strip() == "": raise ValueError("Input must be a non-empty string.") - def _create_request_json(self, input_text: str) -> str: + def _create_request_json(self, input_value: Union[str, List[str]]) -> str: """Build the JSON payload for the ``embeddings`` native command.""" request: dict = { "model": self.model_id, - "input": input_text, + "input": input_value, + **self.settings._serialize(), } - if self.settings.dimensions is not None: - request["dimensions"] = self.settings.dimensions + embedding_request = EmbeddingCreateParams(request) + + return json.dumps(embedding_request) + + def _execute_embedding_request(self, input_value: Union[str, List[str]]) -> CreateEmbeddingResponse: + """Send an embedding request and parse the response.""" + request_json = self._create_request_json(input_value) + request = InteropRequest(params={"OpenAICreateRequest": request_json}) + + response = self._core_interop.execute_command("embeddings", request) + if response.error is not None: + raise FoundryLocalException( + f"Embedding generation failed for model '{self.model_id}': {response.error}" + ) + + data = json.loads(response.data) - if self.settings.encoding_format is not None: - request["encoding_format"] = self.settings.encoding_format + # Add fields required by the OpenAI SDK type that the server doesn't return + for item in data.get("data", []): + if "object" not in item: + item["object"] = "embedding" - return json.dumps(request) + if "usage" not in data: + data["usage"] = {"prompt_tokens": 0, "total_tokens": 0} - def generate_embedding(self, input_text: str) -> EmbeddingResponse: - """Generate embeddings for the given input text. + return CreateEmbeddingResponse.model_validate(data) + + def generate_embedding(self, input_text: str) -> CreateEmbeddingResponse: + """Generate embeddings for a single input text. Args: input_text: The text to generate embeddings for. Returns: - An ``EmbeddingResponse`` containing the embedding vector. + A ``CreateEmbeddingResponse`` containing the embedding vector. Raises: ValueError: If *input_text* is not a non-empty string. FoundryLocalException: If the underlying native embeddings command fails. """ self._validate_input(input_text) + return self._execute_embedding_request(input_text) - request_json = self._create_request_json(input_text) - request = InteropRequest(params={"OpenAICreateRequest": request_json}) + def generate_embeddings(self, inputs: List[str]) -> CreateEmbeddingResponse: + """Generate embeddings for multiple input texts in a single request. - response = self._core_interop.execute_command("embeddings", request) - if response.error is not None: - raise FoundryLocalException( - f"Embedding generation failed for model '{self.model_id}': {response.error}" - ) + Args: + inputs: The texts to generate embeddings for. - data = json.loads(response.data) - embedding_data = [ - EmbeddingData(index=item["index"], embedding=item["embedding"]) - for item in data.get("data", []) - ] - - return EmbeddingResponse( - model=data.get("model", self.model_id), - data=embedding_data, - ) + Returns: + A ``CreateEmbeddingResponse`` containing one embedding vector per input. + + Raises: + ValueError: If *inputs* is empty or contains empty strings. + FoundryLocalException: If the underlying native embeddings command fails. + """ + if not inputs or len(inputs) == 0: + raise ValueError("Inputs must be a non-empty list of strings.") + + for text in inputs: + self._validate_input(text) + + return self._execute_embedding_request(inputs) diff --git a/sdk/python/test/openai/test_embedding_client.py b/sdk/python/test/openai/test_embedding_client.py index f782d7b2..69e9648d 100644 --- a/sdk/python/test/openai/test_embedding_client.py +++ b/sdk/python/test/openai/test_embedding_client.py @@ -131,3 +131,72 @@ def test_should_raise_for_empty_input(self, catalog): embedding_client.generate_embedding("") finally: model.unload() + + def test_batch_should_return_multiple_embeddings(self, catalog): + """Batch request should return one embedding per input.""" + model = _get_loaded_embedding_model(catalog) + try: + embedding_client = model.get_embedding_client() + + response = embedding_client.generate_embeddings([ + "The quick brown fox jumps over the lazy dog", + "Machine learning is a subset of artificial intelligence", + "The capital of France is Paris", + ]) + + assert response is not None + assert len(response.data) == 3 + + for i, data in enumerate(response.data): + assert data.index == i + assert len(data.embedding) == 1024 + finally: + model.unload() + + def test_batch_each_embedding_is_normalized(self, catalog): + """Each embedding in a batch should be L2-normalized.""" + model = _get_loaded_embedding_model(catalog) + try: + embedding_client = model.get_embedding_client() + + response = embedding_client.generate_embeddings([ + "Hello world", + "Goodbye world", + ]) + + assert len(response.data) == 2 + + for data in response.data: + norm = math.sqrt(sum(v * v for v in data.embedding)) + assert 0.99 <= norm <= 1.01, f"L2 norm {norm} not approximately 1.0" + finally: + model.unload() + + def test_batch_matches_single_input_results(self, catalog): + """Batch result should match single-input result for the same text.""" + model = _get_loaded_embedding_model(catalog) + try: + embedding_client = model.get_embedding_client() + + input_text = "The capital of France is Paris" + + single_response = embedding_client.generate_embedding(input_text) + batch_response = embedding_client.generate_embeddings([input_text]) + + assert len(batch_response.data) == 1 + + for i in range(len(single_response.data[0].embedding)): + assert batch_response.data[0].embedding[i] == single_response.data[0].embedding[i] + finally: + model.unload() + + def test_batch_should_raise_for_empty_list(self, catalog): + """Empty list should raise ValueError.""" + model = _get_loaded_embedding_model(catalog) + try: + embedding_client = model.get_embedding_client() + + with pytest.raises(ValueError): + embedding_client.generate_embeddings([]) + finally: + model.unload() diff --git a/sdk/rust/Cargo.toml b/sdk/rust/Cargo.toml index 2a6292b7..a8cd7228 100644 --- a/sdk/rust/Cargo.toml +++ b/sdk/rust/Cargo.toml @@ -24,7 +24,7 @@ tokio-stream = "0.1" futures-core = "0.3" reqwest = { version = "0.12", features = ["json"] } urlencoding = "2" -async-openai = { version = "0.33", default-features = false, features = ["chat-completion-types"] } +async-openai = { version = "0.33", default-features = false, features = ["chat-completion-types", "embedding-types"] } [build-dependencies] ureq = "3" diff --git a/sdk/rust/src/openai/embedding_client.rs b/sdk/rust/src/openai/embedding_client.rs index 580fd3da..798928b6 100644 --- a/sdk/rust/src/openai/embedding_client.rs +++ b/sdk/rust/src/openai/embedding_client.rs @@ -2,30 +2,11 @@ use std::sync::Arc; +use async_openai::types::embeddings::CreateEmbeddingResponse; use serde_json::{json, Value}; use crate::detail::core_interop::CoreInterop; -use crate::error::Result; - -/// OpenAI-compatible embedding response. -#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] -pub struct EmbeddingData { - /// The index of the embedding in the list. - pub index: i32, - /// The embedding vector. - pub embedding: Vec, -} - -/// OpenAI-compatible embedding response. -#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] -pub struct EmbeddingResponse { - /// The model used for generation. - pub model: String, - /// The object type (always "list"). - pub object: Option, - /// List of embedding results. - pub data: Vec, -} +use crate::error::{FoundryLocalError, Result}; /// Tuning knobs for embedding requests. /// @@ -42,12 +23,9 @@ pub struct EmbeddingClientSettings { } impl EmbeddingClientSettings { - fn serialize(&self, model_id: &str, input: &str) -> Value { + fn serialize(&self) -> Value { let mut map = serde_json::Map::new(); - map.insert("model".into(), json!(model_id)); - map.insert("input".into(), json!(input)); - if let Some(dims) = self.dimensions { map.insert("dimensions".into(), json!(dims)); } @@ -87,11 +65,28 @@ impl EmbeddingClient { self } - /// Generate embeddings for the given input text. - pub async fn generate_embedding(&self, input: &str) -> Result { + /// Generate embeddings for a single input text. + pub async fn generate_embedding(&self, input: &str) -> Result { Self::validate_input(input)?; + let request = self.build_request(json!(input))?; + self.execute_request(request).await + } - let request = self.settings.serialize(&self.model_id, input); + /// Generate embeddings for multiple input texts in a single request. + pub async fn generate_embeddings(&self, inputs: &[&str]) -> Result { + if inputs.is_empty() { + return Err(FoundryLocalError::Validation { + reason: "inputs must be a non-empty array".into(), + }); + } + for input in inputs { + Self::validate_input(input)?; + } + let request = self.build_request(json!(inputs))?; + self.execute_request(request).await + } + + async fn execute_request(&self, request: Value) -> Result { let params = json!({ "Params": { "OpenAICreateRequest": serde_json::to_string(&request)? @@ -102,13 +97,57 @@ impl EmbeddingClient { .core .execute_command_async("embeddings".into(), Some(params)) .await?; - let parsed: EmbeddingResponse = serde_json::from_str(&raw)?; + + // Patch the response to add fields required by async_openai types + // that the server doesn't return (object on each item, usage) + let mut response_value: Value = serde_json::from_str(&raw)?; + if let Some(data) = response_value.get_mut("data").and_then(|d| d.as_array_mut()) { + for item in data { + if item.get("object").is_none() { + item.as_object_mut() + .map(|m| m.insert("object".into(), json!("embedding"))); + } + } + } + if response_value.get("usage").is_none() { + response_value.as_object_mut() + .map(|m| m.insert("usage".into(), json!({"prompt_tokens": 0, "total_tokens": 0}))); + } + + let parsed: CreateEmbeddingResponse = serde_json::from_value(response_value)?; Ok(parsed) } + fn build_request(&self, input: Value) -> Result { + Self::validate_encoding_format(&self.settings.encoding_format)?; + + let settings_value = self.settings.serialize(); + let mut map = match settings_value { + Value::Object(m) => m, + _ => serde_json::Map::new(), + }; + + map.insert("model".into(), json!(self.model_id)); + map.insert("input".into(), input); + + Ok(Value::Object(map)) + } + + fn validate_encoding_format(format: &Option) -> Result<()> { + if let Some(ref fmt) = format { + let valid = ["float", "base64"]; + if !valid.contains(&fmt.as_str()) { + return Err(FoundryLocalError::Validation { + reason: format!("encoding_format must be one of: {}", valid.join(", ")), + }); + } + } + Ok(()) + } + fn validate_input(input: &str) -> Result<()> { if input.trim().is_empty() { - return Err(crate::error::FoundryLocalError::Validation { + return Err(FoundryLocalError::Validation { reason: "input must be a non-empty string".into(), }); } diff --git a/sdk/rust/src/openai/mod.rs b/sdk/rust/src/openai/mod.rs index c266cc7a..90e29d10 100644 --- a/sdk/rust/src/openai/mod.rs +++ b/sdk/rust/src/openai/mod.rs @@ -8,7 +8,5 @@ pub use self::audio_client::{ TranscriptionSegment, TranscriptionWord, }; pub use self::chat_client::{ChatClient, ChatClientSettings, ChatCompletionStream}; -pub use self::embedding_client::{ - EmbeddingClient, EmbeddingClientSettings, EmbeddingData, EmbeddingResponse, -}; +pub use self::embedding_client::{EmbeddingClient, EmbeddingClientSettings}; pub use self::json_stream::JsonStream; diff --git a/sdk/rust/tests/integration/embedding_client_test.rs b/sdk/rust/tests/integration/embedding_client_test.rs index fcd550e6..f211e39a 100644 --- a/sdk/rust/tests/integration/embedding_client_test.rs +++ b/sdk/rust/tests/integration/embedding_client_test.rs @@ -60,15 +60,15 @@ async fn should_generate_normalized_embedding() { assert_eq!(embedding.len(), 1024); // Verify L2 norm is approximately 1.0 - let norm: f64 = embedding.iter().map(|v| v * v).sum::().sqrt(); + let norm: f32 = embedding.iter().map(|v| v * v).sum::().sqrt() as f32; assert!( - (0.99..=1.01).contains(&norm), + (0.99_f32..=1.01_f32).contains(&norm), "L2 norm {norm} not approximately 1.0" ); for val in embedding { assert!( - (-1.0..=1.0).contains(val), + (-1.0_f32..=1.0_f32).contains(val), "value {val} outside [-1, 1]" ); } @@ -97,12 +97,12 @@ async fn should_produce_different_embeddings_for_different_inputs() { assert_eq!(emb1.len(), emb2.len()); // Cosine similarity should not be 1.0 - let dot: f64 = emb1.iter().zip(emb2.iter()).map(|(a, b)| a * b).sum(); - let norm1: f64 = emb1.iter().map(|v| v * v).sum::().sqrt(); - let norm2: f64 = emb2.iter().map(|v| v * v).sum::().sqrt(); + let dot: f32 = emb1.iter().zip(emb2.iter()).map(|(a, b)| a * b).sum(); + let norm1: f32 = emb1.iter().map(|v| v * v).sum::().sqrt() as f32; + let norm2: f32 = emb2.iter().map(|v| v * v).sum::().sqrt() as f32; let cosine_similarity = dot / (norm1 * norm2); assert!( - cosine_similarity < 0.99, + cosine_similarity < 0.99_f32, "cosine similarity {cosine_similarity} should be < 0.99" ); @@ -142,3 +142,68 @@ async fn should_throw_for_empty_input() { model.unload().await.expect("unload should succeed"); } + +#[tokio::test] +async fn should_generate_batch_embeddings() { + let (client, model) = setup_embedding_client().await; + + let response = client + .generate_embeddings(&[ + "The quick brown fox jumps over the lazy dog", + "Machine learning is a subset of artificial intelligence", + "The capital of France is Paris", + ]) + .await + .expect("batch embedding should succeed"); + + assert_eq!(response.data.len(), 3); + for (i, data) in response.data.iter().enumerate() { + assert_eq!(data.index, i as u32); + assert_eq!(data.embedding.len(), 1024); + } + + model.unload().await.expect("unload should succeed"); +} + +#[tokio::test] +async fn should_generate_normalized_batch_embeddings() { + let (client, model) = setup_embedding_client().await; + + let response = client + .generate_embeddings(&["Hello world", "Goodbye world"]) + .await + .expect("batch embedding should succeed"); + + assert_eq!(response.data.len(), 2); + for data in &response.data { + let norm: f32 = data.embedding.iter().map(|v| v * v).sum::().sqrt() as f32; + assert!( + (0.99_f32..=1.01_f32).contains(&norm), + "L2 norm {norm} not approximately 1.0" + ); + } + + model.unload().await.expect("unload should succeed"); +} + +#[tokio::test] +async fn should_match_single_and_batch_results() { + let (client, model) = setup_embedding_client().await; + + let single = client + .generate_embedding("The capital of France is Paris") + .await + .expect("single embedding should succeed"); + + let batch = client + .generate_embeddings(&["The capital of France is Paris"]) + .await + .expect("batch embedding should succeed"); + + assert_eq!(batch.data.len(), 1); + for (a, b) in single.data[0].embedding.iter().zip(batch.data[0].embedding.iter()) { + assert_eq!(a, b); + } + + model.unload().await.expect("unload should succeed"); +}