diff --git a/packages/lms-client/src/llm/.test-snapshots/LLM.heavy.test.ts.snap b/packages/lms-client/src/llm/.test-snapshots/LLM.heavy.test.ts.snap index 3119f543..5cd42950 100644 --- a/packages/lms-client/src/llm/.test-snapshots/LLM.heavy.test.ts.snap +++ b/packages/lms-client/src/llm/.test-snapshots/LLM.heavy.test.ts.snap @@ -1,6 +1,6 @@ // Jest Snapshot v1, https://goo.gl/fbAQLP -exports[`LLM Can tokenize correctly 1`] = ` +exports[`LLM with default model fixture Can tokenize correctly 1`] = ` [ 1143, 64866, @@ -11,7 +11,7 @@ exports[`LLM Can tokenize correctly 1`] = ` ] `; -exports[`LLM Can tokenize multiple strings correctly 1`] = ` +exports[`LLM with default model fixture Can tokenize multiple strings correctly 1`] = ` [ [ 34, @@ -34,7 +34,7 @@ exports[`LLM Can tokenize multiple strings correctly 1`] = ` ] `; -exports[`LLM can apply prompt template to a regular chat 1`] = ` +exports[`LLM with default model fixture can apply prompt template to a regular chat 1`] = ` "<|im_start|>system This is the system prompt.<|im_end|> <|im_start|>user @@ -47,24 +47,29 @@ User message 2<|im_end|> " `; -exports[`LLM can get model info 1`] = ` +exports[`LLM with default model fixture can get model info 1`] = ` { "architecture": "qwen2", "contextLength": 4096, + "deviceIdentifier": null, "displayName": "Qwen2.5 0.5B Instruct", "format": "gguf", "identifier": Any, + "indexedModelIdentifier": "lmstudio-community/Qwen2.5-0.5B-Instruct-GGUF/Qwen2.5-0.5B-Instruct-Q4_K_M.gguf", "instanceReference": Any, + "lastUsedTime": 1773374807159, "maxContextLength": 32768, "modelKey": Any, "paramsString": "0.5B", "path": "lmstudio-community/Qwen2.5-0.5B-Instruct-GGUF/Qwen2.5-0.5B-Instruct-Q4_K_M.gguf", + "publisher": "lmstudio-community", "quantization": { "bits": 4, "name": "Q4_K_M", }, "sizeBytes": 397807936, "trainedForToolUse": true, + "ttlMs": 3600000, "type": "llm", "vision": false, } diff --git a/packages/lms-client/src/llm/LLM.heavy.test.ts b/packages/lms-client/src/llm/LLM.heavy.test.ts index a6e4a11c..d669bf70 100644 --- a/packages/lms-client/src/llm/LLM.heavy.test.ts +++ b/packages/lms-client/src/llm/LLM.heavy.test.ts @@ -3,64 +3,142 @@ import { ensureHeavyTestsEnvironment, llmTestingQwen05B } from "../shared.heavy. describe("LLM", () => { let client: LMStudioClient; - let model: LLM; const chat = Chat.from([ { role: "system", content: "This is the system prompt." }, { role: "user", content: "User message 1" }, { role: "assistant", content: "Assistant message 1" }, { role: "user", content: "User message 2" }, ]); + const defaultLoadConfig = { + llamaKCacheQuantizationType: "f32" as const, + llamaVCacheQuantizationType: "f32" as const, + }; beforeAll(async () => { client = new LMStudioClient(); await ensureHeavyTestsEnvironment(client); }); - beforeEach(async () => { - model = await client.llm.model(llmTestingQwen05B, { - verbose: false, - config: { - llamaKCacheQuantizationType: "f32", - llamaVCacheQuantizationType: "f32", - }, + describe("with default model fixture", () => { + let model: LLM; + + beforeEach(async () => { + model = await client.llm.model(llmTestingQwen05B, { + verbose: false, + config: defaultLoadConfig, + }); + }, 60_000); + it("can apply prompt template to a regular chat", async () => { + const formatted = await model.applyPromptTemplate(chat); + expect(formatted).toMatchSnapshot(); }); - }, 60_000); - it("can apply prompt template to a regular chat", async () => { - const formatted = await model.applyPromptTemplate(chat); - expect(formatted).toMatchSnapshot(); - }); - it("can get model context length", async () => { - const contextLength = await model.getContextLength(); - expect(contextLength).toMatchInlineSnapshot(`4096`); - }); - it("can get model info", async () => { - const modelInfo = await model.getModelInfo(); - expect(modelInfo).toMatchSnapshot({ - identifier: expect.any(String), - instanceReference: expect.any(String), - modelKey: expect.any(String), + it("can get model context length", async () => { + const contextLength = await model.getContextLength(); + expect(contextLength).toMatchInlineSnapshot(`4096`); + }); + it("can get model info", async () => { + const modelInfo = await model.getModelInfo(); + expect(modelInfo).toMatchSnapshot({ + identifier: expect.any(String), + instanceReference: expect.any(String), + modelKey: expect.any(String), + }); + }); + it("Can tokenize correctly", async () => { + const tokens = await model.tokenize("Chaos is a ladder."); + expect(tokens).toMatchSnapshot(); + }); + it("Can tokenize multiple strings correctly", async () => { + const tokens = await model.tokenize([ + "Cersei understands the consequences of her absence", + "and she is absent anyway", + ]); + expect(tokens).toMatchSnapshot(); + }); + it("Can count tokens correctly", async () => { + const count = await model.countTokens("Chaos is a ladder."); + expect(count).toMatchInlineSnapshot(`6`); + }); + it("Has correct properties", async () => { + expect(model.displayName).toMatchInlineSnapshot(`"Qwen2.5 0.5B Instruct"`); + expect(model.format).toMatchInlineSnapshot(`"gguf"`); + expect(model.identifier).toEqual(llmTestingQwen05B); + expect(model.path).toEqual(llmTestingQwen05B); + expect(model.sizeBytes).toMatchInlineSnapshot(`397807936`); + expect(model.trainedForToolUse).toMatchInlineSnapshot(`true`); + expect(model.vision).toMatchInlineSnapshot(`false`); }); }); - it("Can tokenize correctly", async () => { - const tokens = await model.tokenize("Chaos is a ladder."); - expect(tokens).toMatchSnapshot(); - }); - it("Can tokenize multiple strings correctly", async () => { - const tokens = await model.tokenize([ - "Cersei understands the consequences of her absence", - "and she is absent anyway", - ]); - expect(tokens).toMatchSnapshot(); - }); - it("Can count tokens correctly", async () => { - const count = await model.countTokens("Chaos is a ladder."); - expect(count).toMatchInlineSnapshot(`6`); - }); - it("Has correct properties", async () => { - expect(model.displayName).toMatchInlineSnapshot(`"Qwen2.5 0.5B Instruct"`); - expect(model.format).toMatchInlineSnapshot(`"gguf"`); - expect(model.identifier).toEqual(llmTestingQwen05B); - expect(model.path).toEqual(llmTestingQwen05B); - expect(model.sizeBytes).toMatchInlineSnapshot(`397807936`); - expect(model.trainedForToolUse).toMatchInlineSnapshot(`true`); - expect(model.vision).toMatchInlineSnapshot(`false`); + + describe("load config round-trips", () => { + it("preserves fit=true through getLoadConfig() to load() round-trip", async () => { + let firstModel: LLM | undefined; + let roundTripModel: LLM | undefined; + try { + firstModel = await client.llm.model(llmTestingQwen05B, { + verbose: false, + config: defaultLoadConfig, + }); + + const firstLoadConfig = await firstModel.getLoadConfig(); + + expect(firstLoadConfig.fit).toBe(true); + expect(firstLoadConfig.gpu?.splitStrategy).toBe("evenly"); + + roundTripModel = await client.llm.load(llmTestingQwen05B, { + identifier: `fit-roundtrip-${Date.now()}-${Math.round(Math.random() * 1_000_000)}`, + verbose: false, + config: firstLoadConfig, + }); + + const secondLoadConfig = await roundTripModel.getLoadConfig(); + + expect(secondLoadConfig.fit).toBe(true); + expect(secondLoadConfig.gpu?.splitStrategy).toBe("evenly"); + } finally { + if (roundTripModel !== undefined) { + await roundTripModel.unload(); + } + if (firstModel !== undefined) { + await firstModel.unload(); + } + } + }, 60_000); + it("preserves fit=false through getLoadConfig() to load() round-trip", async () => { + let firstManualModel: LLM | undefined; + let secondManualModel: LLM | undefined; + try { + firstManualModel = await client.llm.load(llmTestingQwen05B, { + identifier: `fit-disabled-${Date.now()}-${Math.round(Math.random() * 1_000_000)}`, + verbose: false, + config: { + fit: false, + gpu: { ratio: "off" }, + ...defaultLoadConfig, + }, + }); + + const firstLoadConfig = await firstManualModel.getLoadConfig(); + + expect(firstLoadConfig.fit).toBe(false); + expect(firstLoadConfig.gpu?.ratio).toBe("off"); + + secondManualModel = await client.llm.load(llmTestingQwen05B, { + identifier: `fit-disabled-roundtrip-${Date.now()}-${Math.round(Math.random() * 1_000_000)}`, + verbose: false, + config: firstLoadConfig, + }); + + const secondLoadConfig = await secondManualModel.getLoadConfig(); + + expect(secondLoadConfig.fit).toBe(false); + expect(secondLoadConfig.gpu?.ratio).toBe("off"); + } finally { + if (secondManualModel !== undefined) { + await secondManualModel.unload(); + } + if (firstManualModel !== undefined) { + await firstManualModel.unload(); + } + } + }, 60_000); }); }); diff --git a/packages/lms-kv-config/src/conversion/llmLoadModelConfig.test.ts b/packages/lms-kv-config/src/conversion/llmLoadModelConfig.test.ts new file mode 100644 index 00000000..9f4799fb --- /dev/null +++ b/packages/lms-kv-config/src/conversion/llmLoadModelConfig.test.ts @@ -0,0 +1,100 @@ +import { type LLMLoadModelConfig } from "@lmstudio/lms-shared-types"; +import { kvConfigField, makeKVConfigFromFields } from "../KVConfig.js"; +import { llmLlamaMoeLoadConfigSchematics } from "../schema.js"; +import { + kvConfigToLLMLoadModelConfig, + llmLoadModelConfigToKVConfig, +} from "./llmLoadModelConfig.js"; + +/** + * Helper: convert an LLMLoadModelConfig to KVConfig, then parse back the `llama.fit` + * field. Returns `undefined` when the field is absent from the produced KVConfig. + */ +function fitFieldAfterConversion(config: LLMLoadModelConfig): boolean | undefined { + const kvConfig = llmLoadModelConfigToKVConfig(config); + const parsed = llmLlamaMoeLoadConfigSchematics.parsePartial(kvConfig); + return parsed.get("llama.fit"); +} + +describe("llmLoadModelConfigToKVConfig — fit inference", () => { + it("preserves explicit fit: true", () => { + expect(fitFieldAfterConversion({ fit: true })).toBe(true); + }); + + it("preserves explicit fit: false", () => { + expect(fitFieldAfterConversion({ fit: false })).toBe(false); + }); + + it("infers fit=false when ratio is set without fit", () => { + expect(fitFieldAfterConversion({ gpu: { ratio: 0.5 } })).toBe(false); + }); + + it("infers fit=false when numCpuExpertLayersRatio is set without fit", () => { + expect(fitFieldAfterConversion({ gpu: { numCpuExpertLayersRatio: 0.5 } })).toBe(false); + }); + + it("infers fit=false when mainGpu is set without fit (including mainGpu: 0)", () => { + expect(fitFieldAfterConversion({ gpu: { mainGpu: 0 } })).toBe(false); + expect(fitFieldAfterConversion({ gpu: { mainGpu: 1 } })).toBe(false); + }); + + it("infers fit=false when splitStrategy is set without fit", () => { + expect(fitFieldAfterConversion({ gpu: { splitStrategy: "evenly" } })).toBe(false); + }); + + it("does NOT infer fit=false when only disabledGpus is set", () => { + expect(fitFieldAfterConversion({ gpu: { disabledGpus: [1] } })).toBeUndefined(); + }); + + it("does NOT infer fit when no GPU config is provided", () => { + expect(fitFieldAfterConversion({})).toBeUndefined(); + }); + + it("explicit fit: true wins even when ratio is also set", () => { + expect(fitFieldAfterConversion({ fit: true, gpu: { ratio: 0.5 } })).toBe(true); + }); + + it("explicit fit: false is preserved even with no other GPU params", () => { + expect(fitFieldAfterConversion({ fit: false })).toBe(false); + }); +}); + +describe("kvConfigToLLMLoadModelConfig — fit field read-back", () => { + // KVConfig field keys use the full global path (llm.load.* prefix) because + // the schematics preserve the original fullKey even after scoping. + it("reads fit=true from KVConfig", () => { + const kvConfig = makeKVConfigFromFields([kvConfigField("llm.load.llama.fit", true)]); + const result = kvConfigToLLMLoadModelConfig(kvConfig); + expect(result.fit).toBe(true); + }); + + it("reads fit=false from KVConfig", () => { + const kvConfig = makeKVConfigFromFields([kvConfigField("llm.load.llama.fit", false)]); + const result = kvConfigToLLMLoadModelConfig(kvConfig); + expect(result.fit).toBe(false); + }); + + it("fit is undefined when absent from KVConfig", () => { + const kvConfig = makeKVConfigFromFields([]); + const result = kvConfigToLLMLoadModelConfig(kvConfig); + expect(result.fit).toBeUndefined(); + }); +}); + +describe("round-trip", () => { + it("preserves fit and ratio through config → KVConfig → config", () => { + const original: LLMLoadModelConfig = { fit: true, gpu: { ratio: 0.5 } }; + const kvConfig = llmLoadModelConfigToKVConfig(original); + const result = kvConfigToLLMLoadModelConfig(kvConfig); + expect(result.fit).toBe(true); + expect(result.gpu?.ratio).toBe(0.5); + }); + + it("preserves inferred fit=false through round-trip when ratio is set", () => { + const original: LLMLoadModelConfig = { gpu: { ratio: 0.75 } }; + const kvConfig = llmLoadModelConfigToKVConfig(original); + const result = kvConfigToLLMLoadModelConfig(kvConfig); + expect(result.fit).toBe(false); + expect(result.gpu?.ratio).toBe(0.75); + }); +}); diff --git a/packages/lms-kv-config/src/conversion/llmLoadModelConfig.ts b/packages/lms-kv-config/src/conversion/llmLoadModelConfig.ts index a1c0eab3..4136c749 100644 --- a/packages/lms-kv-config/src/conversion/llmLoadModelConfig.ts +++ b/packages/lms-kv-config/src/conversion/llmLoadModelConfig.ts @@ -22,6 +22,20 @@ interface KvConfigToLLMLoadModelConfigOpts { modelFormat?: ModelCompatibilityType; } +function resolveLlamaFit(config: Pick): boolean | undefined { + // If the caller explicitly set any GPU param that fit mode would ignore (ratio, MoE expert + // offload, mainGpu, splitStrategy) but didn't set root-level fit, infer fit=false so the + // modelDefault layer's fit=true doesn't silently override their intent. disabledGpus is excluded + // because fit still respects it. + const hasGpuParamIgnoredByFit = + config.gpu?.ratio !== undefined + || config.gpu?.numCpuExpertLayersRatio !== undefined + || config.gpu?.mainGpu !== undefined + || config.gpu?.splitStrategy !== undefined; + + return config.fit ?? (hasGpuParamIgnoredByFit ? false : undefined); +} + function kvConfigToLLMLlamaLoadModelConfig( config: KVConfig, { useDefaultsForMissingKeys }: Omit = {}, @@ -51,6 +65,11 @@ function kvConfigToLLMLlamaLoadModelConfig( result.gpuStrictVramCap = gpuStrictVramCap; } + const llamaFit = parsed.get("llama.fit"); + if (llamaFit !== undefined) { + result.fit = llamaFit; + } + const llamaAccelerationOffloadRatio = parsed.get("llama.acceleration.offloadRatio"); if (llamaAccelerationOffloadRatio !== undefined) { gpuFields = { @@ -213,6 +232,7 @@ export function llmLoadModelConfigToKVConfig(config: LLMLoadModelConfig): KVConf const top = llmLoadSchematics.buildPartialConfig({ "gpuSplitConfig": convertGPUSettingToGPUSplitConfig(config.gpu), "gpuStrictVramCap": config.gpuStrictVramCap, + "llama.fit": resolveLlamaFit(config), "llama.acceleration.offloadRatio": config.gpu?.ratio, "numCpuExpertLayersRatio": config.gpu?.numCpuExpertLayersRatio, "numParallelSessions": config.maxParallelPredictions, diff --git a/packages/lms-kv-config/src/schema.ts b/packages/lms-kv-config/src/schema.ts index dc2c28d3..2fa0d828 100644 --- a/packages/lms-kv-config/src/schema.ts +++ b/packages/lms-kv-config/src/schema.ts @@ -267,6 +267,7 @@ export const globalConfigSchematics = new KVConfigSchematicsBuilder(kvValueTypes .field("useUnifiedKvCache", "boolean", { isExperimental: true }, true) .scope("llama", builder => builder + .field("fit", "boolean", { machineDependent: true }, false) .scope("acceleration", builder => builder.field( "offloadRatio", diff --git a/packages/lms-shared-types/src/GPUSplitStrategy.ts b/packages/lms-shared-types/src/GPUSplitStrategy.ts index fed9f909..e11f5a31 100644 --- a/packages/lms-shared-types/src/GPUSplitStrategy.ts +++ b/packages/lms-shared-types/src/GPUSplitStrategy.ts @@ -62,7 +62,7 @@ export function convertGPUSettingToGPUSplitConfig( ? "priorityOrder" : gpuSetting?.splitStrategy ?? "evenly", disabledGpus: gpuSetting?.disabledGpus ?? [], - priority: gpuSetting?.mainGpu ? [gpuSetting.mainGpu] : [], + priority: gpuSetting?.mainGpu !== undefined ? [gpuSetting.mainGpu] : [], customRatio: [], }; } diff --git a/packages/lms-shared-types/src/llm/LLMLoadModelConfig.ts b/packages/lms-shared-types/src/llm/LLMLoadModelConfig.ts index 1661dbf3..ccde7a88 100644 --- a/packages/lms-shared-types/src/llm/LLMLoadModelConfig.ts +++ b/packages/lms-shared-types/src/llm/LLMLoadModelConfig.ts @@ -38,6 +38,8 @@ export type GPUSetting = { * A number between 0 to 1 representing the ratio of the work should be distributed to the GPU, * where 0 means no work is distributed and 1 means all work is distributed. Can also specify the * string "off" to mean 0 and the string "max" to mean 1. + * + * When the root-level `fit` setting is true, this field is ignored. */ ratio?: LLMLlamaAccelerationOffloadRatio; /** @@ -45,14 +47,20 @@ export type GPUSetting = { * forced into CPU memory, where 1 means all expert layers will be in CPU memory regardless of * GPU offload configuration and 0 means the expert offload will be determined by GPU offload. * Can also specify the string "off" to mean 0 and the string "max" to mean 1. + * + * When the root-level `fit` setting is true, this field is ignored. */ numCpuExpertLayersRatio?: LLMLlamaAccelerationOffloadRatio; /** * The index of the GPU to use as the main GPU. + * + * When the root-level `fit` setting is true, this field is ignored. */ mainGpu?: number; /** * How to split computation across multiple GPUs. + * + * When the root-level `fit` setting is true, this field is ignored. */ splitStrategy?: LLMSplitStrategy; /** @@ -141,6 +149,15 @@ export const llmMlxKvCacheQuantizationSchema = z.object({ /** @public */ export interface LLMLoadModelConfig { + /** + * When true, enables a fit algorithm that determines optimal layer placement across available + * GPUs automatically. + * + * When `fit` is true, manual GPU placement settings under `gpu` are ignored, except + * `disabledGpus`. + */ + fit?: boolean; + /** * How to distribute the work to your GPUs. See {@link GPUSetting} for more information. * @@ -333,6 +350,7 @@ export interface LLMLoadModelConfig { mlxKvCacheQuantization?: LLMMlxKvCacheQuantization | false; } export const llmLoadModelConfigSchema = z.object({ + fit: z.boolean().optional(), gpu: gpuSettingSchema.optional(), maxParallelPredictions: z.number().int().min(1).optional(), useUnifiedKvCache: z.boolean().optional(),