9 changes: 7 additions & 2 deletions extensions/llamacpp-extension/settings.json
@@ -149,9 +149,14 @@
"key": "flash_attn",
"title": "Flash Attention",
"description": "Enable Flash Attention for optimized performance.",
"controllerType": "checkbox",
"controllerType": "dropdown",
"controllerProps": {
"value": false
"value": "auto",
"options": [
{ "value": "auto", "name": "Auto" },
{ "value": "on", "name": "ON" },
{ "value": "off", "name": "OFF" }
]
}
},
{
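With the change above, `flash_attn` becomes a tri-state string (`auto`/`on`/`off`) instead of a boolean, mirroring llama.cpp's `--flash-attn auto|on|off` flag where `auto` means "on when supported". A minimal sketch of how such a value could be forwarded to the server; the helper name and config shape are illustrative, not the extension's actual code:

```ts
// Hypothetical helper mapping the tri-state setting to llama-server arguments.
type FlashAttnSetting = 'auto' | 'on' | 'off'

function flashAttnArgs(setting?: FlashAttnSetting): string[] {
  // Fall back to 'auto' (ON when supported) if an older saved config has no value.
  const value: FlashAttnSetting = setting ?? 'auto'
  return ['--flash-attn', value]
}

// Example: flashAttnArgs('off') -> ['--flash-attn', 'off']
```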
52 changes: 12 additions & 40 deletions extensions/llamacpp-extension/src/backend.ts
@@ -99,53 +99,28 @@ export async function listSupportedBackends(): Promise<
let supportedBackends = []

// NOTE: menloresearch's tags for llama.cpp builds are a bit different
// TODO: fetch versions from the server?
// TODO: select CUDA version based on driver version
if (sysType == 'windows-x86_64') {
// NOTE: if a machine supports AVX2, should we include noavx and avx?
supportedBackends.push('win-noavx-x64')
if (features.avx) supportedBackends.push('win-avx-x64')
if (features.avx2) supportedBackends.push('win-avx2-x64')
if (features.avx512) supportedBackends.push('win-avx512-x64')
supportedBackends.push('win-common_cpus-x64')
if (features.cuda11) {
if (features.avx512) supportedBackends.push('win-avx512-cuda-cu11.7-x64')
else if (features.avx2) supportedBackends.push('win-avx2-cuda-cu11.7-x64')
else if (features.avx) supportedBackends.push('win-avx-cuda-cu11.7-x64')
else supportedBackends.push('win-noavx-cuda-cu11.7-x64')
supportedBackends.push('win-cuda-11-common_cpus-x64')
}
if (features.cuda12) {
if (features.avx512) supportedBackends.push('win-avx512-cuda-cu12.0-x64')
else if (features.avx2) supportedBackends.push('win-avx2-cuda-cu12.0-x64')
else if (features.avx) supportedBackends.push('win-avx-cuda-cu12.0-x64')
else supportedBackends.push('win-noavx-cuda-cu12.0-x64')
supportedBackends.push('win-cuda-12-common_cpus-x64')
}
if (features.vulkan) supportedBackends.push('win-vulkan-x64')
if (features.vulkan) supportedBackends.push('win-vulkan-common_cpus-x64')
}
// not available yet, placeholder for future
else if (sysType === 'windows-aarch64' || sysType === 'windows-arm64') {
supportedBackends.push('win-arm64')
} else if (sysType === 'linux-x86_64' || sysType === 'linux-x86') {
supportedBackends.push('linux-noavx-x64')
if (features.avx) supportedBackends.push('linux-avx-x64')
if (features.avx2) supportedBackends.push('linux-avx2-x64')
if (features.avx512) supportedBackends.push('linux-avx512-x64')
supportedBackends.push('linux-common_cpus-x64')
if (features.cuda11) {
if (features.avx512)
supportedBackends.push('linux-avx512-cuda-cu11.7-x64')
else if (features.avx2)
supportedBackends.push('linux-avx2-cuda-cu11.7-x64')
else if (features.avx) supportedBackends.push('linux-avx-cuda-cu11.7-x64')
else supportedBackends.push('linux-noavx-cuda-cu11.7-x64')
supportedBackends.push('linux-cuda-11-common_cpus-x64')
}
if (features.cuda12) {
if (features.avx512)
supportedBackends.push('linux-avx512-cuda-cu12.0-x64')
else if (features.avx2)
supportedBackends.push('linux-avx2-cuda-cu12.0-x64')
else if (features.avx) supportedBackends.push('linux-avx-cuda-cu12.0-x64')
else supportedBackends.push('linux-noavx-cuda-cu12.0-x64')
supportedBackends.push('linux-cuda-12-common_cpus-x64')
}
if (features.vulkan) supportedBackends.push('linux-vulkan-x64')
if (features.vulkan) supportedBackends.push('linux-vulkan-common_cpus-x64')
}
// not available yet, placeholder for future
else if (sysType === 'linux-aarch64' || sysType === 'linux-arm64') {
@@ -230,10 +205,7 @@ export async function downloadBackend(
version: string,
source: 'github' | 'cdn' = 'github'
): Promise<void> {
const janDataFolderPath = await getJanDataFolderPath()
const llamacppPath = await joinPath([janDataFolderPath, 'llamacpp'])
const backendDir = await getBackendDir(backend, version)
const libDir = await joinPath([llamacppPath, 'lib'])

const downloadManager = window.core.extensionManager.getByName(
'@janhq/download-extension'
@@ -265,7 +237,7 @@
source === 'github'
? `https://github.com/menloresearch/llama.cpp/releases/download/${version}/cudart-llama-bin-${platformName}-cu11.7-x64.tar.gz`
: `https://catalog.jan.ai/llama.cpp/releases/${version}/cudart-llama-bin-${platformName}-cu11.7-x64.tar.gz`,
save_path: await joinPath([libDir, 'cuda11.tar.gz']),
save_path: await joinPath([backendDir, 'build', 'bin', 'cuda11.tar.gz']),
proxy: proxyConfig,
})
} else if (backend.includes('cu12.0') && !(await _isCudaInstalled('12.0'))) {
@@ -274,7 +246,7 @@
source === 'github'
? `https://github.com/menloresearch/llama.cpp/releases/download/${version}/cudart-llama-bin-${platformName}-cu12.0-x64.tar.gz`
: `https://catalog.jan.ai/llama.cpp/releases/${version}/cudart-llama-bin-${platformName}-cu12.0-x64.tar.gz`,
save_path: await joinPath([libDir, 'cuda12.tar.gz']),
save_path: await joinPath([backendDir, 'build', 'bin', 'cuda12.tar.gz']),
proxy: proxyConfig,
})
}
@@ -344,8 +316,8 @@ async function _getSupportedFeatures() {
}

// https://docs.nvidia.com/deploy/cuda-compatibility/#cuda-11-and-later-defaults-to-minor-version-compatibility
let minCuda11DriverVersion
let minCuda12DriverVersion
let minCuda11DriverVersion: string
let minCuda12DriverVersion: string
if (sysInfo.os_type === 'linux') {
minCuda11DriverVersion = '450.80.02'
minCuda12DriverVersion = '525.60.13'
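The driver minimums in the hunk above (Linux values shown) are what gate which CUDA builds get offered; selecting the CUDA generation from the driver version is still a TODO in this file. A hedged sketch of what such a check could look like; the comparison helper and selection logic are assumptions, not the extension's code:

```ts
// Sketch only: decide which CUDA generations a driver can support, using the
// Linux minimum driver versions referenced in the hunk above.
function compareVersions(a: string, b: string): number {
  const pa = a.split('.').map(Number)
  const pb = b.split('.').map(Number)
  for (let i = 0; i < Math.max(pa.length, pb.length); i++) {
    const diff = (pa[i] ?? 0) - (pb[i] ?? 0)
    if (diff !== 0) return diff
  }
  return 0
}

function supportedCudaMajors(driverVersion: string): number[] {
  const majors: number[] = []
  if (compareVersions(driverVersion, '450.80.02') >= 0) majors.push(11) // CUDA 11 minimum (Linux)
  if (compareVersions(driverVersion, '525.60.13') >= 0) majors.push(12) // CUDA 12 minimum (Linux)
  return majors
}

// Example: supportedCudaMajors('535.104.05') -> [11, 12]
```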
47 changes: 18 additions & 29 deletions extensions/llamacpp-extension/src/index.ts
@@ -38,10 +38,12 @@ import { invoke } from '@tauri-apps/api/core'
import { getProxyConfig } from './util'
import { basename } from '@tauri-apps/api/path'
import {
loadLlamaModel,
readGgufMetadata,
getModelSize,
isModelSupported,
planModelLoadInternal,
unloadLlamaModel,
} from '@janhq/tauri-plugin-llamacpp-api'
import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api'

@@ -69,7 +71,7 @@ type LlamacppConfig = {
device: string
split_mode: string
main_gpu: number
flash_attn: boolean
flash_attn: string
cont_batching: boolean
no_mmap: boolean
mlock: boolean
@@ -549,9 +551,9 @@ export default class llamacpp_extension extends AIEngine {

// Helper to map backend string to a priority category
const getBackendCategory = (backendString: string): string | undefined => {
if (backendString.includes('cu12.0')) return 'cuda-cu12.0'
if (backendString.includes('cu11.7')) return 'cuda-cu11.7'
if (backendString.includes('vulkan')) return 'vulkan'
if (backendString.includes('cuda-12-common_cpus')) return 'cuda-cu12.0'
if (backendString.includes('cuda-11-common_cpus')) return 'cuda-cu11.7'
if (backendString.includes('vulkan-common_cpus')) return 'vulkan'
if (backendString.includes('avx512')) return 'avx512'
if (backendString.includes('avx2')) return 'avx2'
if (
@@ -1646,14 +1648,11 @@ export default class llamacpp_extension extends AIEngine {
args.push('--split-mode', cfg.split_mode)
if (cfg.main_gpu !== undefined && cfg.main_gpu != 0)
args.push('--main-gpu', String(cfg.main_gpu))
// Note: Older llama.cpp versions are no longer supported
if (cfg.flash_attn !== undefined && cfg.flash_attn !== '') args.push('--flash-attn', String(cfg.flash_attn)) // default: auto = ON when supported

// Boolean flags
if (cfg.ctx_shift) args.push('--context-shift')
if (Number(version.replace(/^b/, '')) >= 6325) {
if (!cfg.flash_attn) args.push('--flash-attn', 'off') //default: auto = ON when supported
} else {
if (cfg.flash_attn) args.push('--flash-attn')
}
if (cfg.cont_batching) args.push('--cont-batching')
args.push('--no-mmap')
if (cfg.mlock) args.push('--mlock')
@@ -1688,20 +1687,9 @@

logger.info('Calling Tauri command llama_load with args:', args)
const backendPath = await getBackendExePath(backend, version)
const libraryPath = await joinPath([await this.getProviderPath(), 'lib'])

try {
// TODO: add LIBRARY_PATH
const sInfo = await invoke<SessionInfo>(
'plugin:llamacpp|load_llama_model',
{
backendPath,
libraryPath,
args,
envs,
isEmbedding,
}
)
const sInfo = await loadLlamaModel(backendPath, args, envs, isEmbedding)
return sInfo
} catch (error) {
logger.error('Error in load command:\n', error)
@@ -1717,12 +1705,7 @@
const pid = sInfo.pid
try {
// Pass the PID as the session_id
const result = await invoke<UnloadResult>(
'plugin:llamacpp|unload_llama_model',
{
pid: pid,
}
)
const result = await unloadLlamaModel(pid)

// If successful, remove from active sessions
if (result.success) {
@@ -2042,7 +2025,10 @@
if (sysInfo?.os_type === 'linux' && Array.isArray(sysInfo.gpus)) {
const usage = await getSystemUsage()
if (usage && Array.isArray(usage.gpus)) {
const uuidToUsage: Record<string, { total_memory: number; used_memory: number }> = {}
const uuidToUsage: Record<
string,
{ total_memory: number; used_memory: number }
> = {}
for (const u of usage.gpus as any[]) {
if (u && typeof u.uuid === 'string') {
uuidToUsage[u.uuid] = u
@@ -2082,7 +2068,10 @@
typeof u.used_memory === 'number'
) {
const total = Math.max(0, Math.floor(u.total_memory))
const free = Math.max(0, Math.floor(u.total_memory - u.used_memory))
const free = Math.max(
0,
Math.floor(u.total_memory - u.used_memory)
)
return { ...dev, mem: total, free }
}
}
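The last hunk above enriches Linux GPU entries with usage data keyed by UUID and derives free memory as `total - used`, clamped at zero. A small sketch of that calculation, assuming usage entries carry `uuid`, `total_memory`, and `used_memory` fields as the diff suggests (units are whatever the hardware plugin reports):

```ts
// Shape assumed from the diff above.
interface GpuUsage {
  uuid: string
  total_memory: number
  used_memory: number
}

// Index usage entries by GPU UUID and derive free memory, clamped at zero so a
// reporting glitch can never produce a negative value.
function freeMemoryByUuid(gpus: GpuUsage[]): Record<string, number> {
  const out: Record<string, number> = {}
  for (const u of gpus) {
    out[u.uuid] = Math.max(0, Math.floor(u.total_memory - u.used_memory))
  }
  return out
}
```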
29 changes: 18 additions & 11 deletions src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts
@@ -2,11 +2,18 @@ import { invoke } from '@tauri-apps/api/core'

// Types
export interface SessionInfo {
pid: number
port: number
model_id: string
model_path: string
api_key: string
pid: number;
port: number;
model_id: string;
model_path: string;
is_embedding: boolean;
api_key: string;
mmproj_path?: string;
}

export interface UnloadResult {
success: boolean;
error?: string;
}

export interface DeviceInfo {
@@ -29,19 +36,19 @@ export async function cleanupLlamaProcesses(): Promise<void> {
// LlamaCpp server commands
export async function loadLlamaModel(
backendPath: string,
libraryPath?: string,
args: string[] = [],
isEmbedding: boolean = false
args: string[],
envs: Record<string, string>,
isEmbedding: boolean
): Promise<SessionInfo> {
return await invoke('plugin:llamacpp|load_llama_model', {
backendPath,
libraryPath,
args,
isEmbedding,
envs,
isEmbedding
})
}

export async function unloadLlamaModel(pid: number): Promise<void> {
export async function unloadLlamaModel(pid: number): Promise<UnloadResult> {
return await invoke('plugin:llamacpp|unload_llama_model', { pid })
}

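Based on the updated wrapper signatures above, a hedged usage sketch; the paths, arguments, and environment values below are placeholders, not values from the PR:

```ts
import {
  loadLlamaModel,
  unloadLlamaModel,
} from '@janhq/tauri-plugin-llamacpp-api'

async function runOnce(): Promise<void> {
  // Placeholder binary path and server args; the extension computes these at runtime.
  const session = await loadLlamaModel(
    '/path/to/llama-server',
    ['-m', '/path/to/model.gguf', '--port', '3000', '-a', 'my-model'],
    { CUDA_VISIBLE_DEVICES: '0' }, // envs are now passed explicitly
    false // isEmbedding
  )
  console.log(`server for ${session.model_id} listening on port ${session.port}`)

  // unloadLlamaModel now resolves to an UnloadResult instead of void.
  const result = await unloadLlamaModel(session.pid)
  if (!result.success) {
    console.error('unload failed:', result.error)
  }
}
```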
10 changes: 4 additions & 6 deletions src-tauri/plugins/tauri-plugin-llamacpp/src/commands.rs
@@ -41,7 +41,6 @@ pub struct UnloadResult {
pub async fn load_llama_model<R: Runtime>(
app_handle: tauri::AppHandle<R>,
backend_path: &str,
library_path: Option<&str>,
mut args: Vec<String>,
envs: HashMap<String, String>,
is_embedding: bool,
@@ -52,7 +51,7 @@ pub async fn load_llama_model<R: Runtime>(
log::info!("Attempting to launch server at path: {:?}", backend_path);
log::info!("Using arguments: {:?}", args);

validate_binary_path(backend_path)?;
let bin_path = validate_binary_path(backend_path)?;

let port = parse_port_from_args(&args);
let model_path_pb = validate_model_path(&mut args)?;
@@ -83,11 +82,11 @@ pub async fn load_llama_model<R: Runtime>(
let model_id = extract_arg_value(&args, "-a");

// Configure the command to run the server
let mut command = Command::new(backend_path);
let mut command = Command::new(&bin_path);
command.args(args);
command.envs(envs);

setup_library_path(library_path, &mut command);
setup_library_path(bin_path.parent().and_then(|p| p.to_str()), &mut command);
command.stdout(Stdio::piped());
command.stderr(Stdio::piped());
setup_windows_process_flags(&mut command);
@@ -280,10 +279,9 @@ pub async fn unload_llama_model<R: Runtime>(
#[tauri::command]
pub async fn get_devices(
backend_path: &str,
library_path: Option<&str>,
envs: HashMap<String, String>,
) -> ServerResult<Vec<DeviceInfo>> {
get_devices_from_backend(backend_path, library_path, envs).await
get_devices_from_backend(backend_path, envs).await
}

/// Generate API key using HMAC-SHA256
9 changes: 4 additions & 5 deletions src-tauri/plugins/tauri-plugin-llamacpp/src/device.rs
@@ -19,20 +19,19 @@ pub struct DeviceInfo {

pub async fn get_devices_from_backend(
backend_path: &str,
library_path: Option<&str>,
envs: HashMap<String, String>,
) -> ServerResult<Vec<DeviceInfo>> {
log::info!("Getting devices from server at path: {:?}", backend_path);

validate_binary_path(backend_path)?;
let bin_path = validate_binary_path(backend_path)?;

// Configure the command to run the server with --list-devices
let mut command = Command::new(backend_path);
let mut command = Command::new(&bin_path);
command.arg("--list-devices");
command.envs(envs);

// Set up library path
setup_library_path(library_path, &mut command);
setup_library_path(bin_path.parent().and_then(|p| p.to_str()), &mut command);

command.stdout(Stdio::piped());
command.stderr(Stdio::piped());
@@ -410,4 +409,4 @@
assert_eq!(result[0].id, "Vulkan0");
assert_eq!(result[1].id, "CUDA0");
}
}
}
1 change: 0 additions & 1 deletion src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs
@@ -62,7 +62,6 @@ pub async fn estimate_kv_cache_internal(
ctx_size: Option<u64>,
) -> Result<KVCacheEstimate, KVCacheError> {
log::info!("Received ctx_size parameter: {:?}", ctx_size);
log::info!("Received model metadata:\n{:?}", &meta);
let arch = meta
.get("general.architecture")
.ok_or(KVCacheError::ArchitectureNotFound)?;