Skip to content

Commit 52fda71

Browse files
authored
Merge branch 'main' into feat/sdk-qwen35-gemma4-reasoning-budget
2 parents b977f43 + 8b442aa commit 52fda71

18 files changed

Lines changed: 551 additions & 20 deletions

File tree

packages/sdk/client/api/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ export { getLoadedModelInfo } from "./get-loaded-model-info";
2929
export { ocr } from "./ocr";
3030
export { invokePlugin, invokePluginStream } from "./invoke-plugin";
3131
export { diffusion, type DiffusionProgressTick } from "./diffusion";
32+
export { upscale } from "./upscale";
3233
export {
3334
modelRegistryList,
3435
modelRegistrySearch,

packages/sdk/client/api/upscale.ts

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
import {
2+
upscaleStreamResponseSchema,
3+
type UpscaleClientParams,
4+
type UpscaleStats,
5+
type UpscaleStreamRequest,
6+
} from "@/schemas/sdcpp-config";
7+
import { stream as streamRpc } from "@/client/rpc/rpc-client";
8+
import { decodeBase64, encodeBase64 } from "@/utils/encoding";
9+
import { StreamEndedError } from "@/utils/errors-client";
10+
11+
/**
 * Handle returned by `upscale()`.
 *
 * Both promises are settled by the same background stream consumer: they
 * resolve together when a `done` chunk is seen and reject together on error.
 */
interface UpscaleResult {
  /** Decoded image buffers gathered from the stream's `data` chunks. */
  outputs: Promise<Uint8Array[]>;

  /** Stats attached to the terminal `done` chunk; `undefined` if it had none. */
  stats: Promise<UpscaleStats | undefined>;
}
15+
16+
/** A promise bundled with the callbacks that settle it from the outside. */
interface DeferredResult<T> {
  promise: Promise<T>;
  resolve: (value: T) => void;
  reject: (error: unknown) => void;
}

/**
 * Creates a promise whose `resolve` / `reject` callbacks are handed back to
 * the caller.
 *
 * A no-op `catch` is attached so that a rejection nobody is awaiting does not
 * surface as an unhandled rejection; consumers that do `await` the promise
 * still observe the error.
 */
function createDeferredResult<T>(): DeferredResult<T> {
  let resolve: (value: T) => void = () => {};
  let reject: (error: unknown) => void = () => {};

  // The executor runs synchronously, so both callbacks are assigned before
  // this function returns.
  const promise = new Promise<T>((onResolve, onReject) => {
    resolve = onResolve;
    reject = onReject;
  });
  promise.catch(() => {});

  return { promise, resolve, reject };
}

/**
 * Runs standalone ESRGAN upscaling on an arbitrary PNG/JPEG image.
 *
 * The model must have been loaded with `modelType: "diffusion"` and
 * `modelConfig.mode: "upscale"` — calling `upscale()` against a model
 * loaded in default (`mode: "diffusion"`) mode throws
 * `ModelOperationNotSupportedError` upfront.
 *
 * The call returns immediately and the RPC stream is consumed in the
 * background. Any error raised while consuming the stream (including a
 * response that fails schema validation) rejects BOTH `outputs` and `stats`;
 * it is not thrown synchronously from `upscale()` itself.
 *
 * `outputs` always resolves to length 1: `repeats` runs N passes
 * internally and emits a single final image at `source * scale^repeats`
 * dimensions. The `Uint8Array[]` shape reserves headroom for future
 * multi-output variants. The client does not enforce that length itself —
 * it collects every `data` chunk received up to and including the `done`
 * chunk, in arrival order.
 *
 * @param params - `{ modelId, image, repeats? }`. `image` is raw PNG/JPEG
 *   bytes; the client base64-encodes them on the wire.
 * @returns `{ outputs, stats }` — `outputs` resolves to a single-element
 *   array containing the final upscaled PNG; `stats` resolves to
 *   addon-side stats (load/upscale ms, final width/height, repeats
 *   actually executed, etc.), or `undefined` when the `done` chunk carried
 *   none.
 * @throws {ModelOperationNotSupportedError} If the model was not loaded
 *   with `mode: "upscale"`.
 * @throws {StreamEndedError} If the RPC stream closes without emitting a
 *   terminal `done` chunk (delivered as a rejection of both promises).
 *
 * @example
 * ```ts
 * const modelId = await loadModel(REALESRGAN_X4PLUS_ANIME_6B, {
 *   modelType: "diffusion",
 *   modelConfig: { mode: "upscale", upscaler: { tile_size: 128 } },
 * });
 * const pngBytes = fs.readFileSync("input.png");
 * const { outputs, stats } = upscale({ modelId, image: pngBytes, repeats: 2 });
 * const [upscaledPng] = await outputs;
 * fs.writeFileSync("upscaled.png", upscaledPng);
 * console.log(await stats);
 * ```
 */
export function upscale(params: UpscaleClientParams): UpscaleResult {
  const request: UpscaleStreamRequest = {
    modelId: params.modelId,
    image: encodeBase64(params.image),
    // Leave `repeats` off the wire entirely when the caller did not set it.
    ...(params.repeats !== undefined && { repeats: params.repeats }),
    type: "upscaleStream",
  };

  const stats = createDeferredResult<UpscaleStats | undefined>();
  const outputs = createDeferredResult<Uint8Array[]>();
  const collectedBuffers: Uint8Array[] = [];

  async function processResponses(): Promise<void> {
    let sawDone = false;
    try {
      for await (const response of streamRpc(request)) {
        // Ignore anything on the stream that is not an upscale response.
        if (
          response &&
          typeof response === "object" &&
          "type" in response &&
          response.type === "upscaleStream"
        ) {
          const parsed = upscaleStreamResponseSchema.parse(response);

          if (parsed.data) {
            collectedBuffers.push(decodeBase64(parsed.data));
          }

          if (parsed.done) {
            sawDone = true;
            stats.resolve(parsed.stats);
            // Resolve with a snapshot: the loop keeps draining the stream, so
            // handing out the live array would let chunks that arrive after
            // `done` mutate a value the caller already received.
            outputs.resolve([...collectedBuffers]);
          }
        }
      }

      if (!sawDone) {
        const error = new StreamEndedError();
        stats.reject(error);
        outputs.reject(error);
      }
    } catch (error) {
      // Settling an already-resolved promise is a no-op, so a failure after
      // `done` does not disturb results that were already delivered.
      stats.reject(error);
      outputs.reject(error);
    }
  }

  // Fire-and-forget: every outcome is reported through the two promises.
  void processResponses();

  return {
    outputs: outputs.promise,
    stats: stats.promise,
  };
}

packages/sdk/index.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ export {
3535
invokePluginStream,
3636
diffusion,
3737
type DiffusionProgressTick,
38+
upscale,
3839
modelRegistryList,
3940
modelRegistrySearch,
4041
modelRegistryGetModel,
@@ -106,6 +107,9 @@ export {
106107
type DiffusionClientParams,
107108
type DiffusionStreamResponse,
108109
type DiffusionStats,
110+
type UpscaleClientParams,
111+
type UpscaleStreamResponse,
112+
type UpscaleStats,
109113
definePlugin,
110114
defineHandler,
111115
defineDuplexHandler,

packages/sdk/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,7 @@
173173
},
174174
"dependencies": {
175175
"@qvac/decoder-audio": "^0.3.7",
176-
"@qvac/diffusion-cpp": "^0.6.0",
176+
"@qvac/diffusion-cpp": "^0.7.0",
177177
"@qvac/embed-llamacpp": "^0.15.0",
178178
"@qvac/error": "^0.1.1",
179179
"@qvac/langdetect-text": "^0.1.2",

packages/sdk/schemas/common.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ import { ocrStreamRequestSchema, ocrStreamResponseSchema } from "./ocr";
6464
import {
6565
diffusionStreamRequestSchema,
6666
diffusionStreamResponseSchema,
67+
upscaleStreamRequestSchema,
68+
upscaleStreamResponseSchema,
6769
} from "./sdcpp-config";
6870
import {
6971
finetuneRequestSchema,
@@ -110,6 +112,7 @@ export const requestSchema = z.union([
110112
getLoadedModelInfoRequestSchema,
111113
ocrStreamRequestSchema,
112114
diffusionStreamRequestSchema,
115+
upscaleStreamRequestSchema,
113116
finetuneRequestSchema,
114117
pluginInvokeRequestSchema,
115118
pluginInvokeStreamRequestSchema,
@@ -146,6 +149,7 @@ export const responseSchema = z.discriminatedUnion("type", [
146149
getLoadedModelInfoResponseSchema,
147150
ocrStreamResponseSchema,
148151
diffusionStreamResponseSchema,
152+
upscaleStreamResponseSchema,
149153
finetuneResponseSchema,
150154
finetuneProgressResponseSchema,
151155
pluginInvokeResponseSchema,

packages/sdk/schemas/plugin.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -340,7 +340,7 @@ export const PLUGIN_OCR = "@qvac/sdk/onnx-ocr/plugin" as const;
340340

341341
/**
342342
* Image generation plugin (stable-diffusion.cpp).
343-
* Provides: text-to-image generation.
343+
* Provides: text-to-image generation and standalone ESRGAN image upscaling.
344344
*/
345345
export const PLUGIN_DIFFUSION =
346346
"@qvac/sdk/sdcpp-generation/plugin" as const;

packages/sdk/schemas/sdcpp-config.ts

Lines changed: 118 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,15 @@ const ABSOLUTE_PATH_PATTERN = /^(\/|[A-Za-z]:[\\/]|\\\\)/;
88

99
export const sdcppConfigSchema = z
1010
.object({
11+
mode: z.enum(["diffusion", "upscale"]).default("diffusion")
12+
.describe(
13+
"Operation mode for the diffusion plugin. " +
14+
"`'diffusion'` (default) builds a full SD / SDXL / SD3 / FLUX pipeline from " +
15+
"the primary model plus optional auxiliary text encoders, VAE, and ESRGAN " +
16+
"upscaler, and exposes diffusion({ ... }). " +
17+
"`'upscale'` builds a standalone ESRGAN upscaler from the primary model " +
18+
"file alone (auxiliary model sources are ignored) and exposes upscale({ ... }).",
19+
),
1120
threads: z.number().optional(),
1221
device: z.enum(["gpu", "cpu"]).optional(),
1322
prediction: z
@@ -52,17 +61,18 @@ export const sdcppConfigSchema = z
5261
upscaler: z.object({
5362
type: z.literal("esrgan").optional()
5463
.describe("Type of upscaler to use for post-generation upscaling when requested in diffusion({ upscale })."),
55-
model_src: modelSrcInputSchema
64+
model_src: modelSrcInputSchema.optional()
5665
.describe(
57-
"ESRGAN upscaler model (e.g. RealESRGAN_x4plus_anime_6B.pth). When " +
58-
"provided, generation requests can opt into post-generation upscale " +
59-
"via diffusion({ upscale: true }) or diffusion({ upscale: { repeats } }).",
66+
"ESRGAN upscaler model (e.g. RealESRGAN_x4plus_anime_6B.pth). " +
67+
"Required in diffusion mode when this `upscaler` block is set — " +
68+
"configures the post-generation upscaler invoked via diffusion({ upscale }). " +
69+
"In `mode: 'upscale'` the primary modelSrc itself is the ESRGAN model, " +
70+
"so this field is ignored.",
6071
),
6172
tile_size: z.number().int().positive().optional()
6273
.describe(
6374
"ESRGAN upscaler tile size in pixels. Smaller tiles use less VRAM " +
64-
"at the cost of more passes. Only used when upscaler.model_src is " +
65-
"configured and diffusion({ upscale }) is requested.",
75+
"at the cost of more passes.",
6676
),
6777
direct: z.boolean().optional()
6878
.describe(
@@ -83,10 +93,19 @@ export const sdcppConfigSchema = z
8393
"Number of CPU threads dedicated to the ESRGAN upscaler. -1 = auto.",
8494
),
8595
}).strict().optional()
86-
.describe("Configuration for an optional upscaler that can be applied after diffusion generation when requested in diffusion({ upscale })."),
96+
.describe(
97+
"ESRGAN upscaler configuration. In diffusion mode this enables the " +
98+
"post-generation upscale path invoked via diffusion({ upscale }) and " +
99+
"requires `model_src`. In `mode: 'upscale'` only the tuning fields " +
100+
"(tile_size, direct, offload_params_to_cpu, threads) are honored — " +
101+
"the primary modelSrc IS the ESRGAN model in that mode and " +
102+
"`model_src` here is ignored. Mode-dependent constraints (e.g. " +
103+
"`model_src` required in diffusion mode) are enforced by the " +
104+
"sdcpp-generation plugin at load time, not at the schema layer.",
105+
),
87106
});
88107

89-
export type SdcppConfig = z.infer<typeof sdcppConfigSchema>;
108+
export type SdcppConfig = z.input<typeof sdcppConfigSchema>;
90109

91110
export const diffusionStatsSchema = z.object({
92111
modelLoadMs: z
@@ -350,3 +369,94 @@ export type DiffusionClientParams = DiffusionClientParamsBase &
350369
| { init_image?: Uint8Array; init_images?: never }
351370
| { init_image?: never; init_images?: Uint8Array[] }
352371
);
372+
373+
// ============================================
374+
// Standalone ESRGAN upscale (mode: "upscale")
375+
// ============================================
376+
377+
/** Builds one optional numeric stats field carrying the given description. */
const optionalUpscaleStat = (description: string) =>
  z.number().optional().describe(description);

/**
 * Stats reported for a standalone upscale job. Every field is an optional
 * number; the meaning of each one is given by its description string.
 */
export const upscaleStatsSchema = z.object({
  modelLoadMs: optionalUpscaleStat(
    "Wall-clock time in milliseconds spent loading the upscaler model.",
  ),
  upscaleMs: optionalUpscaleStat(
    "Wall-clock time in milliseconds for the most recent upscale job.",
  ),
  totalUpscaleMs: optionalUpscaleStat(
    "Cumulative upscale time in milliseconds across all jobs.",
  ),
  totalWallMs: optionalUpscaleStat(
    "Total wall-clock time in milliseconds including model load and upscaling.",
  ),
  totalUpscales: optionalUpscaleStat("Cumulative number of upscale calls."),
  totalImages: optionalUpscaleStat("Cumulative number of images produced."),
  totalPixels: optionalUpscaleStat(
    "Cumulative number of pixels produced across all images.",
  ),
  width: optionalUpscaleStat("Width of the most recent emitted PNG."),
  height: optionalUpscaleStat("Height of the most recent emitted PNG."),
  repeats: optionalUpscaleStat(
    "Number of ESRGAN passes used by the most recent upscale job.",
  ),
});

/** Parsed shape of {@link upscaleStatsSchema}. */
export type UpscaleStats = z.infer<typeof upscaleStatsSchema>;
417+
418+
/**
 * Wire-level parameters of a standalone upscale request, without the RPC
 * `type` discriminator.
 */
export const upscaleRequestSchema = z.object({
  // Target model; see the description for the load-time requirements.
  modelId: z.string().describe(
    "Identifier of the loaded upscaler model. " +
      "The model must have been loaded with `modelType: 'diffusion'` " +
      "and `modelConfig.mode: 'upscale'`.",
  ),

  // Non-empty string that must match BASE64_PATTERN.
  image: z
    .string()
    .min(1)
    .regex(BASE64_PATTERN)
    .describe("Base64-encoded PNG/JPEG bytes of the source image."),

  // Optional positive integer pass count.
  repeats: z
    .number()
    .int()
    .positive()
    .optional()
    .describe(
      "Number of ESRGAN passes to run sequentially. " +
        "Each pass multiplies dimensions by the model's native scale factor; " +
        "only the final image is emitted (`outputs.length === 1`). " +
        "Defaults to 1.",
    ),
});

/** Input-side type accepted by {@link upscaleRequestSchema}. */
export type UpscaleRequest = z.input<typeof upscaleRequestSchema>;
443+
444+
/**
 * Streaming RPC request for an upscale job: the plain upscale request fields
 * plus the `"upscaleStream"` literal `type` tag.
 */
export const upscaleStreamRequestSchema = upscaleRequestSchema.extend({
  type: z.literal("upscaleStream"),
});

/** Input-side type accepted by {@link upscaleStreamRequestSchema}. */
export type UpscaleStreamRequest = z.input<
  typeof upscaleStreamRequestSchema
>;
449+
450+
/**
 * One chunk of an upscale response stream. Only `type` is required; every
 * other field may be absent on a given chunk.
 */
export const upscaleStreamResponseSchema = z.object({
  // Tag identifying the chunk as part of an upscale stream.
  type: z.literal("upscaleStream"),

  // Optional string payload.
  // NOTE(review): presumably base64-encoded image bytes — confirm with the producer.
  data: z.string().optional(),

  // NOTE(review): presumably the index of the output this chunk belongs to — confirm.
  outputIndex: z.number().optional(),

  // Optional terminal-chunk flag.
  done: z.boolean().optional(),

  // Optional job stats.
  stats: upscaleStatsSchema.optional(),
});

/** Parsed shape of {@link upscaleStreamResponseSchema}. */
export type UpscaleStreamResponse = z.infer<
  typeof upscaleStreamResponseSchema
>;
459+
460+
/**
 * Caller-facing parameters for an upscale call: identical to
 * `UpscaleRequest`, except `image` is supplied as raw bytes rather than as a
 * base64 string.
 */
export type UpscaleClientParams = Omit<UpscaleRequest, "image"> & {
  /** Source image bytes. */
  image: Uint8Array;
};

0 commit comments

Comments
 (0)