Zshelf/qlm at main · QuantiusBenignus/Zshelf · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#function qlm {
#This function body is to be autoloaded, along with its completion companion _qlm.
#Place this file in your fpath and `autoload -Uz qlm _qlm` from your .zshrc
#or just `autoload -Uz full/path/to/qlm` (same for the completion function _qlm).
#Do not forget to register the completion function in .zshrc with `compdef _qlm qlm`, (after compinit).
#The Large language model names and filenames to select from can be declared/added in qlm.cfg.

#Get config:
#(The names read below but never set here - llmodels, gpulayers, temps, ctxsize, llmprompts,
# defaultlm, inferengine, LLMDIR, lastllmf, TPROMPTF - presumably come from this file; verify against qlm.cfg.)
source $ZDOTDIR/.zfunc/qlm.cfg

#This collects any (overriding) options that the user may want to pass directly to llama-cli
#These must always be followed by "--" to delimit from model name and prompt, which will follow.
#They are kept in an array (declared unconditionally, so a global of the same name is never picked up)
#to let option arguments containing spaces or quotes reach llama-cli intact.
local -a llamacli_opts
#(I) yields the index of the last "--" among the arguments, or 0 when there is none.
local -i posit=${argv[(Ie)--]}
if (( posit )); then
   llamacli_opts=("${(@)argv[1,posit-1]}")
   #Drop the pass-through options together with the "--" delimiter itself.
   shift $posit
fi

#What is left is at most: [ModelName] ['Prompt']
local choice
if (( $# > 2 )); then
    echo "Usage: $0 [-flags [args] -- ] [ModelName] ['Prompt']" >&2
    return 1
elif (( $# >= 1 )); then
   # Check if the first argument is a valid LLM.
   if [[ -n "${llmodels[$1]}" ]]; then
       choice="$1"
       shift
   else
       # Raise an error and exit if not found and there are 2 command-line arguments:
       if [[ -n "$2" ]]; then
           echo "Error: '$1' is not a valid large language model." >&2
           return 1
       fi
       #If the only argument is not a model name, it must be a prompt. Load the default model:
       choice="$defaultlm"
   fi
else
   #No arguments. Load the default LLM:
   choice="$defaultlm"
fi

local maxlayers=$gpulayers[$choice]

#Dynamic adjustment of layers to offload (down, by not more than 7) if VRAM is tied up (only NVIDIA gpus for now)
if [[ $maxlayers -lt 99 ]]; then
    local memory=$(nvidia-smi --query-gpu=memory.free --format=csv,nounits,noheader)
    #The array named after the chosen model is indexed by layer count; each entry is compared
    #against the free memory reported by nvidia-smi.
    local -a load
    load=(${(P)choice})
    #An explicit flag records success, so that a loop ending on layer 0 is not mistaken for a fit.
    local -i lay fits=0
    for (( lay = maxlayers; lay >= maxlayers - 7; lay-- )); do
        if [[ $memory -gt ${load[$lay]} ]]; then
            maxlayers=$lay
            fits=1
            break
        fi
    done
    if (( ! fits )); then
        echo "Insufficient VRAM, please free some. LLM run will be slow, exiting..." >&2
        return 1
    fi
fi
#End Dynamic adj.....

#Remember the model that was used last.
printf '%s' "${choice}" > $lastllmf
echo "${choice}: $maxlayers layers offloaded."
#The prompt is the remaining argument or, failing that, the contents of $TPROMPTF or else the X selection.
#$TPROMPTF is quoted so that an empty value makes cat fail (falling through to xsel) instead of reading stdin.
local userinput="${1:-$(cat -- "$TPROMPTF" 2>/dev/null || xsel -op)}"
#(( 2 - ${#funcstack} )) &&
#printf writes the prompt verbatim; echo would interpret backslash sequences contained in it.
printf '\n%s\n\n' "${userinput}" > /dev/shm/reqlm

#The local inference engine (defined above) must be in the PATH.
#The 'UIUIUI' placeholder is used to keep config in a single file, that is sourced on startup.
#GGML_CUDA_ENABLE_UNIFIED_MEMORY=1
#A prompt given on the command line (non-empty $1) adds --no-warmup and -no-cnv.
#User-supplied options go last on the command line; the model output is appended to the request log.
$inferengine -t 8 --temp ${temps[$choice]} -fa --top-p 0.95 -mli --no-mmap --mlock \
    --no-display-prompt --no-perf ${1:+--no-warmup} -c ${ctxsize[$choice]} -ngl $maxlayers \
    -s $(date +%s) ${1:+-no-cnv} -m $LLMDIR/${llmodels[$choice]} \
    -p "${llmprompts[$choice]//UIUIUI/"${userinput}"}" \
    "${llamacli_opts[@]}" 2>/dev/null | tee -a /dev/shm/reqlm

#}