diff --git a/extensions/llamacpp-extension/settings.json b/extensions/llamacpp-extension/settings.json
index ce5fc62e4b..ac47068585 100644
--- a/extensions/llamacpp-extension/settings.json
+++ b/extensions/llamacpp-extension/settings.json
@@ -149,9 +149,14 @@
       "key": "flash_attn",
       "title": "Flash Attention",
       "description": "Enable Flash Attention for optimized performance.",
-      "controllerType": "checkbox",
+      "controllerType": "dropdown",
       "controllerProps": {
-        "value": false
+        "value": "auto",
+        "options": [
+          { "value": "auto", "name": "Auto" },
+          { "value": "on", "name": "ON" },
+          { "value": "off", "name": "OFF" }
+        ]
       }
     },
     {
diff --git a/extensions/llamacpp-extension/src/backend.ts b/extensions/llamacpp-extension/src/backend.ts
index bd05432275..97a8de381b 100644
--- a/extensions/llamacpp-extension/src/backend.ts
+++ b/extensions/llamacpp-extension/src/backend.ts
@@ -99,53 +99,28 @@ export async function listSupportedBackends(): Promise<
   let supportedBackends = []

   // NOTE: menloresearch's tags for llama.cpp builds are a bit different
-  // TODO: fetch versions from the server?
-  // TODO: select CUDA version based on driver version
   if (sysType == 'windows-x86_64') {
-    // NOTE: if a machine supports AVX2, should we include noavx and avx?
-    supportedBackends.push('win-noavx-x64')
-    if (features.avx) supportedBackends.push('win-avx-x64')
-    if (features.avx2) supportedBackends.push('win-avx2-x64')
-    if (features.avx512) supportedBackends.push('win-avx512-x64')
+    supportedBackends.push('win-common_cpus-x64')
     if (features.cuda11) {
-      if (features.avx512) supportedBackends.push('win-avx512-cuda-cu11.7-x64')
-      else if (features.avx2) supportedBackends.push('win-avx2-cuda-cu11.7-x64')
-      else if (features.avx) supportedBackends.push('win-avx-cuda-cu11.7-x64')
-      else supportedBackends.push('win-noavx-cuda-cu11.7-x64')
+      supportedBackends.push('win-cuda-11-common_cpus-x64')
     }
     if (features.cuda12) {
-      if (features.avx512) supportedBackends.push('win-avx512-cuda-cu12.0-x64')
-      else if (features.avx2) supportedBackends.push('win-avx2-cuda-cu12.0-x64')
-      else if (features.avx) supportedBackends.push('win-avx-cuda-cu12.0-x64')
-      else supportedBackends.push('win-noavx-cuda-cu12.0-x64')
+      supportedBackends.push('win-cuda-12-common_cpus-x64')
     }
-    if (features.vulkan) supportedBackends.push('win-vulkan-x64')
+    if (features.vulkan) supportedBackends.push('win-vulkan-common_cpus-x64')
   }
   // not available yet, placeholder for future
   else if (sysType === 'windows-aarch64' || sysType === 'windows-arm64') {
     supportedBackends.push('win-arm64')
   } else if (sysType === 'linux-x86_64' || sysType === 'linux-x86') {
-    supportedBackends.push('linux-noavx-x64')
-    if (features.avx) supportedBackends.push('linux-avx-x64')
-    if (features.avx2) supportedBackends.push('linux-avx2-x64')
-    if (features.avx512) supportedBackends.push('linux-avx512-x64')
+    supportedBackends.push('linux-common_cpus-x64')
     if (features.cuda11) {
-      if (features.avx512)
-        supportedBackends.push('linux-avx512-cuda-cu11.7-x64')
-      else if (features.avx2)
-        supportedBackends.push('linux-avx2-cuda-cu11.7-x64')
-      else if (features.avx) supportedBackends.push('linux-avx-cuda-cu11.7-x64')
-      else supportedBackends.push('linux-noavx-cuda-cu11.7-x64')
+      supportedBackends.push('linux-cuda-11-common_cpus-x64')
     }
     if (features.cuda12) {
-      if (features.avx512)
-        supportedBackends.push('linux-avx512-cuda-cu12.0-x64')
-      else if (features.avx2)
-        supportedBackends.push('linux-avx2-cuda-cu12.0-x64')
-      else if (features.avx) supportedBackends.push('linux-avx-cuda-cu12.0-x64')
-      else supportedBackends.push('linux-noavx-cuda-cu12.0-x64')
+      supportedBackends.push('linux-cuda-12-common_cpus-x64')
     }
-    if (features.vulkan) supportedBackends.push('linux-vulkan-x64')
+    if (features.vulkan) supportedBackends.push('linux-vulkan-common_cpus-x64')
   }
   // not available yet, placeholder for future
   else if (sysType === 'linux-aarch64' || sysType === 'linux-arm64') {
@@ -230,10 +205,7 @@ export async function downloadBackend(
   version: string,
   source: 'github' | 'cdn' = 'github'
 ): Promise<void> {
-  const janDataFolderPath = await getJanDataFolderPath()
-  const llamacppPath = await joinPath([janDataFolderPath, 'llamacpp'])
   const backendDir = await getBackendDir(backend, version)
-  const libDir = await joinPath([llamacppPath, 'lib'])

   const downloadManager = window.core.extensionManager.getByName(
     '@janhq/download-extension'
@@ -265,7 +237,7 @@ export async function downloadBackend(
         source === 'github'
           ? `https://github.com/menloresearch/llama.cpp/releases/download/${version}/cudart-llama-bin-${platformName}-cu11.7-x64.tar.gz`
           : `https://catalog.jan.ai/llama.cpp/releases/${version}/cudart-llama-bin-${platformName}-cu11.7-x64.tar.gz`,
-      save_path: await joinPath([libDir, 'cuda11.tar.gz']),
+      save_path: await joinPath([backendDir, 'build', 'bin', 'cuda11.tar.gz']),
       proxy: proxyConfig,
     })
   } else if (backend.includes('cu12.0') && !(await _isCudaInstalled('12.0'))) {
@@ -274,7 +246,7 @@ export async function downloadBackend(
         source === 'github'
           ? `https://github.com/menloresearch/llama.cpp/releases/download/${version}/cudart-llama-bin-${platformName}-cu12.0-x64.tar.gz`
           : `https://catalog.jan.ai/llama.cpp/releases/${version}/cudart-llama-bin-${platformName}-cu12.0-x64.tar.gz`,
-      save_path: await joinPath([libDir, 'cuda12.tar.gz']),
+      save_path: await joinPath([backendDir, 'build', 'bin', 'cuda12.tar.gz']),
       proxy: proxyConfig,
     })
   }
@@ -344,8 +316,8 @@ async function _getSupportedFeatures() {
   }

   // https://docs.nvidia.com/deploy/cuda-compatibility/#cuda-11-and-later-defaults-to-minor-version-compatibility
-  let minCuda11DriverVersion
-  let minCuda12DriverVersion
+  let minCuda11DriverVersion: string
+  let minCuda12DriverVersion: string
   if (sysInfo.os_type === 'linux') {
     minCuda11DriverVersion = '450.80.02'
     minCuda12DriverVersion = '525.60.13'
diff --git a/extensions/llamacpp-extension/src/index.ts b/extensions/llamacpp-extension/src/index.ts
index 8d4f277b69..9bb3e65593 100644
--- a/extensions/llamacpp-extension/src/index.ts
+++ b/extensions/llamacpp-extension/src/index.ts
@@ -38,10 +38,12 @@ import { invoke } from '@tauri-apps/api/core'
 import { getProxyConfig } from './util'
 import { basename } from '@tauri-apps/api/path'
 import {
+  loadLlamaModel,
   readGgufMetadata,
   getModelSize,
   isModelSupported,
   planModelLoadInternal,
+  unloadLlamaModel,
 } from '@janhq/tauri-plugin-llamacpp-api'
 import { getSystemUsage, getSystemInfo } from '@janhq/tauri-plugin-hardware-api'

@@ -69,7 +71,7 @@ type LlamacppConfig = {
   device: string
   split_mode: string
   main_gpu: number
-  flash_attn: boolean
+  flash_attn: string
   cont_batching: boolean
   no_mmap: boolean
   mlock: boolean
@@ -549,9 +551,9 @@ export default class llamacpp_extension extends AIEngine {

     // Helper to map backend string to a priority category
     const getBackendCategory = (backendString: string): string | undefined => {
-      if (backendString.includes('cu12.0')) return 'cuda-cu12.0'
-      if (backendString.includes('cu11.7')) return 'cuda-cu11.7'
-      if (backendString.includes('vulkan')) return 'vulkan'
+      if (backendString.includes('cuda-12-common_cpus')) return 'cuda-cu12.0'
+      if (backendString.includes('cuda-11-common_cpus')) return 'cuda-cu11.7'
+      if (backendString.includes('vulkan-common_cpus')) return 'vulkan'
       if (backendString.includes('avx512')) return 'avx512'
       if (backendString.includes('avx2')) return 'avx2'
       if (
@@ -1646,14 +1648,11 @@ export default class llamacpp_extension extends AIEngine {
       args.push('--split-mode', cfg.split_mode)
     if (cfg.main_gpu !== undefined && cfg.main_gpu != 0)
       args.push('--main-gpu', String(cfg.main_gpu))
+    // Note: Older llama.cpp versions are no longer supported
+    if (cfg.flash_attn !== undefined && cfg.flash_attn !== '') args.push('--flash-attn', String(cfg.flash_attn)) //default: auto = ON when supported

     // Boolean flags
     if (cfg.ctx_shift) args.push('--context-shift')
-    if (Number(version.replace(/^b/, '')) >= 6325) {
-      if (!cfg.flash_attn) args.push('--flash-attn', 'off') //default: auto = ON when supported
-    } else {
-      if (cfg.flash_attn) args.push('--flash-attn')
-    }
     if (cfg.cont_batching) args.push('--cont-batching')
     args.push('--no-mmap')
     if (cfg.mlock) args.push('--mlock')
@@ -1688,20 +1687,9 @@ export default class llamacpp_extension extends AIEngine {
     logger.info('Calling Tauri command llama_load with args:', args)

     const backendPath = await getBackendExePath(backend, version)
-    const libraryPath = await joinPath([await this.getProviderPath(), 'lib'])

     try {
-      // TODO: add LIBRARY_PATH
-      const sInfo = await invoke<SessionInfo>(
-        'plugin:llamacpp|load_llama_model',
-        {
-          backendPath,
-          libraryPath,
-          args,
-          envs,
-          isEmbedding,
-        }
-      )
+      const sInfo = await loadLlamaModel(backendPath, args, envs, isEmbedding)
       return sInfo
     } catch (error) {
       logger.error('Error in load command:\n', error)
@@ -1717,12 +1705,7 @@ export default class llamacpp_extension extends AIEngine {
     const pid = sInfo.pid
     try {
       // Pass the PID as the session_id
-      const result = await invoke<UnloadResult>(
-        'plugin:llamacpp|unload_llama_model',
-        {
-          pid: pid,
-        }
-      )
+      const result = await unloadLlamaModel(pid)

       // If successful, remove from active sessions
       if (result.success) {
@@ -2042,7 +2025,10 @@ export default class llamacpp_extension extends AIEngine {
       if (sysInfo?.os_type === 'linux' && Array.isArray(sysInfo.gpus)) {
         const usage = await getSystemUsage()
         if (usage && Array.isArray(usage.gpus)) {
-          const uuidToUsage: Record<string, any> = {}
+          const uuidToUsage: Record<
+            string,
+            { total_memory: number; used_memory: number }
+          > = {}
           for (const u of usage.gpus as any[]) {
             if (u && typeof u.uuid === 'string') {
               uuidToUsage[u.uuid] = u
@@ -2082,7 +2068,10 @@ export default class llamacpp_extension extends AIEngine {
               typeof u.used_memory === 'number'
             ) {
               const total = Math.max(0, Math.floor(u.total_memory))
-              const free = Math.max(0, Math.floor(u.total_memory - u.used_memory))
+              const free = Math.max(
+                0,
+                Math.floor(u.total_memory - u.used_memory)
+              )
               return { ...dev, mem: total, free }
             }
           }
diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts b/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts
index 7c0e3e4be2..c576803082 100644
--- a/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts
+++ b/src-tauri/plugins/tauri-plugin-llamacpp/guest-js/index.ts
@@ -2,11 +2,18 @@ import { invoke } from '@tauri-apps/api/core'

 // Types
 export interface SessionInfo {
-  pid: number
-  port: number
-  model_id: string
-  model_path: string
-  api_key: string
+  pid: number;
+  port: number;
+  model_id: string;
+  model_path: string;
+  is_embedding: boolean;
+  api_key: string;
+  mmproj_path?: string;
+}
+
+export interface UnloadResult {
+  success: boolean;
+  error?: string;
 }

 export interface DeviceInfo {
@@ -29,19 +36,19 @@ export async function cleanupLlamaProcesses(): Promise<void> {
 // LlamaCpp server commands
 export async function loadLlamaModel(
   backendPath: string,
-  libraryPath?: string,
-  args: string[] = [],
-  isEmbedding: boolean = false
+  args: string[],
+  envs: Record<string, string>,
+  isEmbedding: boolean
 ): Promise<SessionInfo> {
   return await invoke('plugin:llamacpp|load_llama_model', {
     backendPath,
-    libraryPath,
     args,
-    isEmbedding,
+    envs,
+    isEmbedding
   })
 }

-export async function unloadLlamaModel(pid: number): Promise<void> {
+export async function unloadLlamaModel(pid: number): Promise<UnloadResult> {
   return await invoke('plugin:llamacpp|unload_llama_model', { pid })
 }
diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/commands.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/commands.rs
index 1d898b4d97..2b14f5ca7a 100644
--- a/src-tauri/plugins/tauri-plugin-llamacpp/src/commands.rs
+++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/commands.rs
@@ -41,7 +41,6 @@ pub struct UnloadResult {
 pub async fn load_llama_model(
     app_handle: tauri::AppHandle,
     backend_path: &str,
-    library_path: Option<&str>,
     mut args: Vec<String>,
     envs: HashMap<String, String>,
     is_embedding: bool,
@@ -52,7 +51,7 @@ pub async fn load_llama_model(
     log::info!("Attempting to launch server at path: {:?}", backend_path);
     log::info!("Using arguments: {:?}", args);

-    validate_binary_path(backend_path)?;
+    let bin_path = validate_binary_path(backend_path)?;

     let port = parse_port_from_args(&args);
     let model_path_pb = validate_model_path(&mut args)?;
@@ -83,11 +82,11 @@
     let model_id = extract_arg_value(&args, "-a");

     // Configure the command to run the server
-    let mut command = Command::new(backend_path);
+    let mut command = Command::new(&bin_path);
     command.args(args);
     command.envs(envs);

-    setup_library_path(library_path, &mut command);
+    setup_library_path(bin_path.parent().and_then(|p| p.to_str()), &mut command);
     command.stdout(Stdio::piped());
     command.stderr(Stdio::piped());
     setup_windows_process_flags(&mut command);
@@ -280,10 +279,9 @@ pub async fn unload_llama_model(
 #[tauri::command]
 pub async fn get_devices(
     backend_path: &str,
-    library_path: Option<&str>,
     envs: HashMap<String, String>,
 ) -> ServerResult<Vec<DeviceInfo>> {
-    get_devices_from_backend(backend_path, library_path, envs).await
+    get_devices_from_backend(backend_path, envs).await
 }

 /// Generate API key using HMAC-SHA256
diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/device.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/device.rs
index 80b0293ac4..922e70c140 100644
--- a/src-tauri/plugins/tauri-plugin-llamacpp/src/device.rs
+++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/device.rs
@@ -19,20 +19,19 @@ pub struct DeviceInfo {

 pub async fn get_devices_from_backend(
     backend_path: &str,
-    library_path: Option<&str>,
     envs: HashMap<String, String>,
 ) -> ServerResult<Vec<DeviceInfo>> {
     log::info!("Getting devices from server at path: {:?}", backend_path);

-    validate_binary_path(backend_path)?;
+    let bin_path = validate_binary_path(backend_path)?;

     // Configure the command to run the server with --list-devices
-    let mut command = Command::new(backend_path);
+    let mut command = Command::new(&bin_path);
     command.arg("--list-devices");
     command.envs(envs);

     // Set up library path
-    setup_library_path(library_path, &mut command);
+    setup_library_path(bin_path.parent().and_then(|p| p.to_str()), &mut command);
     command.stdout(Stdio::piped());
     command.stderr(Stdio::piped());
@@ -410,4 +409,4 @@ AnotherInvalid
         assert_eq!(result[0].id, "Vulkan0");
         assert_eq!(result[1].id, "CUDA0");
     }
-}
\ No newline at end of file
+}
diff --git a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs
index cdbbf92d5c..10dc66f484 100644
--- a/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs
+++ b/src-tauri/plugins/tauri-plugin-llamacpp/src/gguf/utils.rs
@@ -62,7 +62,6 @@ pub async fn estimate_kv_cache_internal(
     ctx_size: Option,
 ) -> Result {
     log::info!("Received ctx_size parameter: {:?}", ctx_size);
-    log::info!("Received model metadata:\n{:?}", &meta);
     let arch = meta
         .get("general.architecture")
         .ok_or(KVCacheError::ArchitectureNotFound)?;
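
Usage sketch (not part of the patch): the index.ts hunks above replace raw invoke() calls with the guest-js helpers whose new signatures appear in the guest-js/index.ts diff. The snippet below is only an illustration of the updated call shape; the backend path, model path, and arguments are placeholders, and in the real extension they come from getBackendExePath() and the model config.

import {
  loadLlamaModel,
  unloadLlamaModel,
} from '@janhq/tauri-plugin-llamacpp-api'

async function demo(): Promise<void> {
  // Placeholder values for illustration only.
  const backendPath = '/path/to/llama-server'
  // flash_attn is now a string setting: 'auto' | 'on' | 'off'.
  const args = ['-m', '/path/to/model.gguf', '--flash-attn', 'auto']
  const envs: Record<string, string> = {}

  // libraryPath is gone; the Rust side now derives the library dir from the binary's parent folder.
  const sInfo = await loadLlamaModel(backendPath, args, envs, false)
  console.log('llama-server started on port', sInfo.port)

  // unloadLlamaModel now resolves to an UnloadResult instead of void.
  const result = await unloadLlamaModel(sInfo.pid)
  if (!result.success) console.error('unload failed:', result.error)
}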