Skip to content

Commit b8f888f

Browse files
authored
Logging Improvements (#88)
This change revamps the internal logging architecture to be more flexible and descriptive. Previously all logs from both llama-swap and upstream services were mixed together. This makes it harder to troubleshoot and identify problems. This PR adds these new endpoints: - `/logs/stream/proxy` - just llama-swap's logs - `/logs/stream/upstream` - stdout output from the upstream server
1 parent 192b2ae commit b8f888f

8 files changed

+254
-90
lines changed

README.md

+9-5
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
![GitHub Actions Workflow Status](https://img.shields.io/github/actions/workflow/status/mostlygeek/llama-swap/go-ci.yml)
44
![GitHub Repo stars](https://img.shields.io/github/stars/mostlygeek/llama-swap)
55

6-
7-
86
# llama-swap
97

108
llama-swap is a light weight, transparent proxy server that provides automatic model swapping to llama.cpp's server.
@@ -69,8 +67,8 @@ models:
6967
# Default (and minimum) is 15 seconds
7068
healthCheckTimeout: 60
7169

72-
# Write HTTP logs (useful for troubleshooting), defaults to false
73-
logRequests: true
70+
# Valid log levels: debug, info (default), warn, error
71+
logLevel: info
7472

7573
# define valid model values and the upstream server start
7674
models:
@@ -221,9 +219,15 @@ Of course, CLI access is also supported:
221219
# sends up to the last 10KB of logs
222220
curl http://host/logs'
223221

224-
# streams logs
222+
# streams combined logs
225223
curl -Ns 'http://host/logs/stream'
226224

225+
# just llama-swap's logs
226+
curl -Ns 'http://host/logs/stream/proxy'
227+
228+
# just upstream's logs
229+
curl -Ns 'http://host/logs/stream/upstream'
230+
227231
# stream and filter logs with linux pipes
228232
curl -Ns http://host/logs/stream | grep 'eval time'
229233

config.example.yaml

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# Seconds to wait for llama.cpp to be available to serve requests
22
# Default (and minimum): 15 seconds
3-
healthCheckTimeout: 15
3+
healthCheckTimeout: 90
44

5-
# Log HTTP requests helpful for troubleshoot, defaults to False
6-
logRequests: true
5+
# valid log levels: debug, info (default), warn, error
6+
logLevel: info
77

88
models:
99
"llama":

proxy/config.go

+1
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ func (m *ModelConfig) SanitizedCommand() ([]string, error) {
2727
type Config struct {
2828
HealthCheckTimeout int `yaml:"healthCheckTimeout"`
2929
LogRequests bool `yaml:"logRequests"`
30+
LogLevel string `yaml:"logLevel"`
3031
Models map[string]ModelConfig `yaml:"models"`
3132
Profiles map[string][]string `yaml:"profiles"`
3233

proxy/logMonitor.go

+90
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,21 @@ package proxy
22

33
import (
44
"container/ring"
5+
"fmt"
56
"io"
67
"os"
78
"sync"
89
)
910

11+
type LogLevel int
12+
13+
const (
14+
LevelDebug LogLevel = iota
15+
LevelInfo
16+
LevelWarn
17+
LevelError
18+
)
19+
1020
type LogMonitor struct {
1121
clients map[chan []byte]bool
1222
mu sync.RWMutex
@@ -15,6 +25,10 @@ type LogMonitor struct {
1525

1626
// typically this can be os.Stdout
1727
stdout io.Writer
28+
29+
// logging levels
30+
level LogLevel
31+
prefix string
1832
}
1933

2034
func NewLogMonitor() *LogMonitor {
@@ -26,6 +40,8 @@ func NewLogMonitorWriter(stdout io.Writer) *LogMonitor {
2640
clients: make(map[chan []byte]bool),
2741
buffer: ring.New(10 * 1024), // keep 10KB of buffered logs
2842
stdout: stdout,
43+
level: LevelInfo,
44+
prefix: "",
2945
}
3046
}
3147

@@ -94,3 +110,77 @@ func (w *LogMonitor) broadcast(msg []byte) {
94110
}
95111
}
96112
}
113+
114+
func (w *LogMonitor) SetPrefix(prefix string) {
115+
w.mu.Lock()
116+
defer w.mu.Unlock()
117+
w.prefix = prefix
118+
}
119+
120+
func (w *LogMonitor) SetLogLevel(level LogLevel) {
121+
w.mu.Lock()
122+
defer w.mu.Unlock()
123+
w.level = level
124+
}
125+
126+
func (w *LogMonitor) formatMessage(level string, msg string) []byte {
127+
prefix := ""
128+
if w.prefix != "" {
129+
prefix = fmt.Sprintf("[%s] ", w.prefix)
130+
}
131+
return []byte(fmt.Sprintf("%s[%s] %s\n", prefix, level, msg))
132+
}
133+
134+
func (w *LogMonitor) log(level LogLevel, msg string) {
135+
if level < w.level {
136+
return
137+
}
138+
w.Write(w.formatMessage(level.String(), msg))
139+
}
140+
141+
func (w *LogMonitor) Debug(msg string) {
142+
w.log(LevelDebug, msg)
143+
}
144+
145+
func (w *LogMonitor) Info(msg string) {
146+
w.log(LevelInfo, msg)
147+
}
148+
149+
func (w *LogMonitor) Warn(msg string) {
150+
w.log(LevelWarn, msg)
151+
}
152+
153+
func (w *LogMonitor) Error(msg string) {
154+
w.log(LevelError, msg)
155+
}
156+
157+
func (w *LogMonitor) Debugf(format string, args ...interface{}) {
158+
w.log(LevelDebug, fmt.Sprintf(format, args...))
159+
}
160+
161+
func (w *LogMonitor) Infof(format string, args ...interface{}) {
162+
w.log(LevelInfo, fmt.Sprintf(format, args...))
163+
}
164+
165+
func (w *LogMonitor) Warnf(format string, args ...interface{}) {
166+
w.log(LevelWarn, fmt.Sprintf(format, args...))
167+
}
168+
169+
func (w *LogMonitor) Errorf(format string, args ...interface{}) {
170+
w.log(LevelError, fmt.Sprintf(format, args...))
171+
}
172+
173+
func (l LogLevel) String() string {
174+
switch l {
175+
case LevelDebug:
176+
return "DEBUG"
177+
case LevelInfo:
178+
return "INFO"
179+
case LevelWarn:
180+
return "WARN"
181+
case LevelError:
182+
return "ERROR"
183+
default:
184+
return "UNKNOWN"
185+
}
186+
}

proxy/process.go

+33-22
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,12 @@ const (
3030
)
3131

3232
type Process struct {
33-
ID string
34-
config ModelConfig
35-
cmd *exec.Cmd
36-
logMonitor *LogMonitor
33+
ID string
34+
config ModelConfig
35+
cmd *exec.Cmd
36+
37+
processLogger *LogMonitor
38+
proxyLogger *LogMonitor
3739

3840
healthCheckTimeout int
3941
healthCheckLoopInterval time.Duration
@@ -53,13 +55,14 @@ type Process struct {
5355
shutdownCancel context.CancelFunc
5456
}
5557

56-
func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, logMonitor *LogMonitor) *Process {
58+
func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, processLogger *LogMonitor, proxyLogger *LogMonitor) *Process {
5759
ctx, cancel := context.WithCancel(context.Background())
5860
return &Process{
5961
ID: ID,
6062
config: config,
6163
cmd: nil,
62-
logMonitor: logMonitor,
64+
processLogger: processLogger,
65+
proxyLogger: proxyLogger,
6366
healthCheckTimeout: healthCheckTimeout,
6467
healthCheckLoopInterval: 5 * time.Second, /* default, can not be set by user - used for testing */
6568
state: StateStopped,
@@ -68,6 +71,11 @@ func NewProcess(ID string, healthCheckTimeout int, config ModelConfig, logMonito
6871
}
6972
}
7073

74+
// LogMonitor returns the log monitor associated with the process.
75+
func (p *Process) LogMonitor() *LogMonitor {
76+
return p.processLogger
77+
}
78+
7179
// custom error types for swapping state
7280
var (
7381
ErrExpectedStateMismatch = errors.New("expected state mismatch")
@@ -85,9 +93,11 @@ func (p *Process) swapState(expectedState, newState ProcessState) (ProcessState,
8593
}
8694

8795
if !isValidTransition(p.state, newState) {
96+
p.proxyLogger.Warnf("Invalid state transition from %s to %s", p.state, newState)
8897
return p.state, ErrInvalidStateTransition
8998
}
9099

100+
p.proxyLogger.Debugf("State transition from %s to %s", expectedState, newState)
91101
p.state = newState
92102
return p.state, nil
93103
}
@@ -152,8 +162,8 @@ func (p *Process) start() error {
152162
defer p.waitStarting.Done()
153163

154164
p.cmd = exec.Command(args[0], args[1:]...)
155-
p.cmd.Stdout = p.logMonitor
156-
p.cmd.Stderr = p.logMonitor
165+
p.cmd.Stdout = p.processLogger
166+
p.cmd.Stderr = p.processLogger
157167
p.cmd.Env = p.config.Env
158168

159169
err = p.cmd.Start()
@@ -214,15 +224,16 @@ func (p *Process) start() error {
214224
return errors.New("health check interrupted due to shutdown")
215225
default:
216226
if err := p.checkHealthEndpoint(healthURL); err == nil {
227+
p.proxyLogger.Infof("Health check passed on %s", healthURL)
217228
cancelHealthCheck()
218229
break loop
219230
} else {
220231
if strings.Contains(err.Error(), "connection refused") {
221232
endTime, _ := checkDeadline.Deadline()
222233
ttl := time.Until(endTime)
223-
fmt.Fprintf(p.logMonitor, "!!! Connection refused on %s, ttl %.0fs\n", healthURL, ttl.Seconds())
234+
p.proxyLogger.Infof("Connection refused on %s, retrying in %.0fs", healthURL, ttl.Seconds())
224235
} else {
225-
fmt.Fprintf(p.logMonitor, "!!! Health check error: %v\n", err)
236+
p.proxyLogger.Infof("Health check error on %s, %v", healthURL, err)
226237
}
227238
}
228239
}
@@ -246,7 +257,8 @@ func (p *Process) start() error {
246257
p.inFlightRequests.Wait()
247258

248259
if time.Since(p.lastRequestHandled) > maxDuration {
249-
fmt.Fprintf(p.logMonitor, "!!! Unloading model %s, TTL of %ds reached.\n", p.ID, p.config.UnloadAfter)
260+
261+
p.proxyLogger.Infof("Unloading model %s, TTL of %ds reached.", p.ID, p.config.UnloadAfter)
250262
p.Stop()
251263
return
252264
}
@@ -267,15 +279,15 @@ func (p *Process) Stop() {
267279

268280
// calling Stop() when state is invalid is a no-op
269281
if curState, err := p.swapState(StateReady, StateStopping); err != nil {
270-
fmt.Fprintf(p.logMonitor, "!!! Info - Stop() Ready -> StateStopping err: %v, current state: %v\n", err, curState)
282+
p.proxyLogger.Infof("Stop() Ready -> StateStopping err: %v, current state: %v", err, curState)
271283
return
272284
}
273285

274286
// stop the process with a graceful exit timeout
275287
p.stopCommand(5 * time.Second)
276288

277289
if curState, err := p.swapState(StateStopping, StateStopped); err != nil {
278-
fmt.Fprintf(p.logMonitor, "!!! Info - Stop() StateStopping -> StateStopped err: %v, current state: %v\n", err, curState)
290+
p.proxyLogger.Infof("Stop() StateStopping -> StateStopped err: %v, current state: %v", err, curState)
279291
}
280292
}
281293

@@ -300,33 +312,32 @@ func (p *Process) stopCommand(sigtermTTL time.Duration) {
300312
}()
301313

302314
if p.cmd == nil || p.cmd.Process == nil {
303-
fmt.Fprintf(p.logMonitor, "!!! process [%s] cmd or cmd.Process is nil", p.ID)
315+
p.proxyLogger.Warnf("Process [%s] cmd or cmd.Process is nil", p.ID)
304316
return
305317
}
306318

307319
if err := p.terminateProcess(); err != nil {
308-
fmt.Fprintf(p.logMonitor, "!!! failed to gracefully terminate process [%s]: %v\n", p.ID, err)
320+
p.proxyLogger.Infof("Failed to gracefully terminate process [%s]: %v", p.ID, err)
309321
}
310322

311323
select {
312324
case <-sigtermTimeout.Done():
313-
fmt.Fprintf(p.logMonitor, "!!! process [%s] timed out waiting to stop, sending KILL signal\n", p.ID)
325+
p.proxyLogger.Infof("Process [%s] timed out waiting to stop, sending KILL signal", p.ID)
314326
p.cmd.Process.Kill()
315327
case err := <-sigtermNormal:
316328
if err != nil {
317329
if errno, ok := err.(syscall.Errno); ok {
318-
fmt.Fprintf(p.logMonitor, "!!! process [%s] errno >> %v\n", p.ID, errno)
330+
p.proxyLogger.Errorf("Process [%s] errno >> %v", p.ID, errno)
319331
} else if exitError, ok := err.(*exec.ExitError); ok {
320332
if strings.Contains(exitError.String(), "signal: terminated") {
321-
fmt.Fprintf(p.logMonitor, "!!! process [%s] stopped OK\n", p.ID)
333+
p.proxyLogger.Infof("Process [%s] stopped OK", p.ID)
322334
} else if strings.Contains(exitError.String(), "signal: interrupt") {
323-
fmt.Fprintf(p.logMonitor, "!!! process [%s] interrupted OK\n", p.ID)
335+
p.proxyLogger.Infof("Process [%s] interrupted OK", p.ID)
324336
} else {
325-
fmt.Fprintf(p.logMonitor, "!!! process [%s] ExitError >> %v, exit code: %d\n", p.ID, exitError, exitError.ExitCode())
337+
p.proxyLogger.Warnf("Process [%s] ExitError >> %v, exit code: %d", p.ID, exitError, exitError.ExitCode())
326338
}
327-
328339
} else {
329-
fmt.Fprintf(p.logMonitor, "!!! process [%s] exited >> %v\n", p.ID, err)
340+
p.proxyLogger.Errorf("Process [%s] exited >> %v", p.ID, err)
330341
}
331342
}
332343
}

0 commit comments

Comments
 (0)