Skip to content

Commit 511db7e

Browse files
committed
Fix client hangs on stale SSH connections
Wrap ssh.DialNet in a goroutine so it respects context cancellation, skip retries on context/timeout errors, and invalidate the cached connection on failure.

Fixes: #28453
Signed-off-by: Jan Rodák <hony.com@seznam.cz>
1 parent 7fe6abf commit 511db7e

2 files changed

Lines changed: 27 additions & 5 deletions

File tree

pkg/bindings/connection.go

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ const (
4444
clientKey = valueKey("Client")
4545
versionKey = valueKey("ServiceVersion")
4646
machineModeKey = valueKey("MachineMode")
47+
4748
)
4849

4950
type ConnectError struct {
@@ -297,8 +298,22 @@ func sshClient(_url *url.URL, uri string, identity string, machine bool) (Connec
297298
val := strings.TrimSuffix(b.String(), "\n")
298299
_url.Path = val
299300
}
300-
dialContext := func(_ context.Context, _, _ string) (net.Conn, error) {
301-
return ssh.DialNet(conn, "unix", _url)
301+
dialContext := func(ctx context.Context, _, _ string) (net.Conn, error) {
302+
type result struct {
303+
conn net.Conn
304+
err error
305+
}
306+
ch := make(chan result, 1)
307+
go func() {
308+
c, err := ssh.DialNet(conn, "unix", _url)
309+
ch <- result{c, err}
310+
}()
311+
select {
312+
case r := <-ch:
313+
return r.conn, r.err
314+
case <-ctx.Done():
315+
return nil, ctx.Err()
316+
}
302317
}
303318
connection.Client = &http.Client{
304319
Transport: &http.Transport{
@@ -314,8 +329,8 @@ func tcpClient(_url *url.URL, opts Options) (Connection, error) {
314329
connection := Connection{
315330
URI: _url,
316331
}
317-
dialContext := func(_ context.Context, _, _ string) (net.Conn, error) {
318-
return net.Dial("tcp", _url.Host)
332+
dialContext := func(ctx context.Context, _, _ string) (net.Conn, error) {
333+
return (&net.Dialer{}).DialContext(ctx, "tcp", _url.Host)
319334
}
320335
// use proxy if env `CONTAINER_PROXY` set
321336
if proxyURI, found := os.LookupEnv("CONTAINER_PROXY"); found {
@@ -480,12 +495,17 @@ func (c *Connection) DoRequest(ctx context.Context, httpBody io.Reader, httpMeth
480495
}
481496
}
482497

483-
// Give the Do three chances in the case of a comm/service hiccup
498+
// Give the Do three chances in the case of a comm/service hiccup.
499+
// Don't retry on context or timeout errors — those won't recover.
484500
for i := 1; i <= 3; i++ {
485501
response, err = c.Client.Do(req) //nolint:bodyclose // The caller has to close the body.
486502
if err == nil {
487503
break
488504
}
505+
var netErr net.Error
506+
if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) || (errors.As(err, &netErr) && netErr.Timeout()) {
507+
break
508+
}
489509
time.Sleep(time.Duration(i*100) * time.Millisecond)
490510
}
491511
return &APIResponse{response, req}, err

pkg/domain/infra/runtime_tunnel.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ func newConnection(facts *entities.PodmanConfig, farmNodeName string) (context.C
2424
if connection == nil || farmNodeName != "" {
2525
ctx, err := newConnectionWithoutLock(context.Background(), facts)
2626
if err != nil {
27+
// Clear stale connection so the next call retries.
28+
connection = nil
2729
return ctx, err
2830
}
2931
connection = &ctx

0 commit comments

Comments
 (0)