-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathqlm
More file actions
63 lines (54 loc) · 2.74 KB
/
qlm
File metadata and controls
63 lines (54 loc) · 2.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#function qlm {
# qlm - run a local large language model through the inference engine named
# by $inferengine, optionally offloading fewer GPU layers when VRAM is short.
#
# Usage: qlm [-flags [args] -- ] [ModelName] ['Prompt']
#   -flags [args] --  Optional (overriding) options handed to the inference
#                     engine. They must always be terminated by "--".
#   ModelName         A key of the llmodels array; $defaultlm is used if omitted.
#   Prompt            The user prompt. If omitted, the prompt is read from the
#                     file $TPROMPTF, or failing that from `xsel -op`.
# Returns 1 on usage errors, unknown model, missing config or insufficient VRAM.
#
# This function body is to be autoloaded, along with its completion companion
# _qlm. Place this file in your fpath and `autoload -Uz qlm _qlm` from your
# .zshrc, or just `autoload -Uz full/path/to/qlm` (same for the completion
# function _qlm). Do not forget to register the completion function in .zshrc
# with `compdef _qlm qlm` (after compinit).
# The large language model names and filenames to select from can be
# declared/added in qlm.cfg.
#
# Names read below but not defined here (presumably set by qlm.cfg - confirm):
#   llmodels, defaultlm, gpulayers, temps, ctxsize, llmprompts, lastllmf,
#   inferengine, LLMDIR, TPROMPTF, plus one array per model name that is
#   indexed by layer count and compared against the free VRAM.

# Get config. ZDOTDIR falls back to HOME so an unset ZDOTDIR does not
# resolve to /.zfunc. Readability is tested up front because the exit status of
# `source` is that of the last command inside the file, not a load indicator.
local cfgfile="${ZDOTDIR:-$HOME}/.zfunc/qlm.cfg"
if [[ ! -r "$cfgfile" ]]; then
    printf '%s\n' "Error: cannot read configuration file '$cfgfile'." >&2
    return 1
fi
source "$cfgfile"

# Collect any (overriding) options that the user wants to pass directly to the
# inference engine. (Ie) yields the index of the last argument that is exactly
# "--", or 0 when there is none.
local posit=${argv[(Ie)--]}
if (( posit )); then
    # Everything before "--", joined by spaces; re-split with (z) at call time.
    # A plain 1-based subscript range is used instead of the ${argv:off:len}
    # form, whose indexing base for positional parameters is a special case.
    local llamacli_opts="${argv[1,posit-1]}"
    # Drop the options and the "--" itself; model name and prompt remain.
    shift $posit
fi

# Work out which model to run from the remaining (at most two) arguments.
local choice
if (( $# > 2 )); then
    printf '%s\n' "Usage: $0 [-flags [args] -- ] [ModelName] ['Prompt']" >&2
    return 1
elif (( $# >= 1 )) && [[ -n "${llmodels[$1]}" ]]; then
    # The first argument is a valid LLM; what is left in $1 (if anything) is the prompt.
    choice="$1"
    shift
elif (( $# == 2 )); then
    # Two arguments were given, so the first one had to be a model name.
    printf '%s\n' "Error: '$1' is not a valid large language model." >&2
    return 1
else
    # No arguments, or a single argument that is not a model name and therefore
    # must be the prompt. Load the default LLM:
    choice="$defaultlm"
fi

local maxlayers=$gpulayers[$choice]
# Dynamic adjustment of layers to offload (down, by not more than 7) if VRAM is
# tied up (only NVIDIA GPUs for now). Skipped when 99 or more layers are configured.
if [[ $maxlayers -lt 99 ]]; then
    # Declaration and assignment are split so a failing nvidia-smi is detected
    # instead of being reported as a lack of VRAM.
    # NOTE(review): with several GPUs nvidia-smi prints one line per GPU, which
    # the numeric comparison below cannot digest - confirm single-GPU use.
    local memory
    memory=$(nvidia-smi --query-gpu=memory.free --format=csv,nounits,noheader) || {
        printf '%s\n' "Error: nvidia-smi failed, cannot query free VRAM." >&2
        return 1
    }
    # (P) dereferences the array whose name equals the chosen model name;
    # element N is the free-memory threshold for offloading N layers
    # (presumably in the same unit nvidia-smi reports - confirm in qlm.cfg).
    local load=(${(P)choice})
    local lay fits=0
    for lay in {$maxlayers..$(( maxlayers-7 ))}; do
        if [[ $memory -gt $load[$lay] ]]; then
            maxlayers=$lay
            fits=1
            break
        fi
    done
    # An explicit flag is used: the loop variable itself can legitimately end at 0.
    if (( ! fits )); then
        printf '%s\n' "Insufficient VRAM, please free some. LLM run will be slow, exiting..." >&2
        return 1
    fi
fi
# End of dynamic adjustment.

# Remember the model used (written without a trailing newline) and report the layer count.
printf '%s' "${choice}" > "$lastllmf"
printf '%s\n' "${choice}: $maxlayers layers offloaded."

# Prompt: command line first, then the file $TPROMPTF, then `xsel -op`.
# The emptiness test keeps `cat` from silently reading stdin when TPROMPTF is unset.
local userinput="$1"
if [[ -z "$userinput" ]]; then
    userinput=$({ [[ -n "$TPROMPTF" ]] && cat -- "$TPROMPTF" 2>/dev/null; } || xsel -op)
fi
#(( 2 - ${#funcstack} )) &&
# Start a fresh transcript with the request. printf '%s' keeps backslash
# sequences in the prompt literal (echo would interpret them).
printf '\n%s\n\n' "${userinput}" > /dev/shm/reqlm

# The local inference engine ($inferengine) must be in the PATH.
# The 'UIUIUI' placeholder inside the per-model prompt template is replaced by
# the user input; it is used to keep config in a single file, that is sourced on startup.
# ${1:+...} adds --no-warmup and -no-cnv only when a prompt was given on the command line.
# The engine's stderr is discarded; its stdout is shown and appended to the transcript.
#GGML_CUDA_ENABLE_UNIFIED_MEMORY=1
$inferengine -t 8 --temp $temps[$choice] -fa --top-p 0.95 -mli --no-mmap --mlock \
    --no-display-prompt --no-perf ${1:+--no-warmup} -c $ctxsize[$choice] \
    -ngl $maxlayers -s $(date +%s) ${1:+-no-cnv} -m $LLMDIR/$llmodels[$choice] \
    -p "${llmprompts[$choice]//UIUIUI/"${userinput}"}" ${(z)llamacli_opts} \
    2>/dev/null | tee -a /dev/shm/reqlm
#}