From 771e474c959f88ef466ed981f105703770f0bc43 Mon Sep 17 00:00:00 2001 From: Tomas Valenta Date: Tue, 6 May 2025 00:41:27 +0200 Subject: [PATCH 1/8] Reconfigure api timeouts --- packages/api/main.go | 20 +++++++++++--------- packages/cluster/network/main.tf | 5 +++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/packages/api/main.go b/packages/api/main.go index 2bfaf3c38..3c822a311 100644 --- a/packages/api/main.go +++ b/packages/api/main.go @@ -45,9 +45,9 @@ const ( maxMultipartMemory = 1 << 27 // 128 MiB maxUploadLimit = 1 << 28 // 256 MiB - maxReadHeaderTimeout = 60 * time.Second - maxReadTimeout = 75 * time.Second - maxWriteTimeout = 75 * time.Second + maxReadTimeout = 75 * time.Second + maxWriteTimeout = 75 * time.Second + idleTimeout = 620 * time.Second defaultPort = 80 ) @@ -171,12 +171,14 @@ func NewGinServer(ctx context.Context, logger *zap.Logger, apiStore *handlers.AP r.MaxMultipartMemory = maxMultipartMemory s := &http.Server{ - Handler: r, - Addr: fmt.Sprintf("0.0.0.0:%d", port), - ReadHeaderTimeout: maxReadHeaderTimeout, - ReadTimeout: maxReadTimeout, - WriteTimeout: maxWriteTimeout, - BaseContext: func(net.Listener) context.Context { return ctx }, + Handler: r, + Addr: fmt.Sprintf("0.0.0.0:%d", port), + // Configure timeouts to be greater than the proxy timeouts. + // https://github.com/golang/go/issues/47007 + ReadTimeout: maxReadTimeout, + WriteTimeout: maxWriteTimeout, + IdleTimeout: idleTimeout, + BaseContext: func(net.Listener) context.Context { return ctx }, } return s diff --git a/packages/cluster/network/main.tf b/packages/cluster/network/main.tf index de76ad28e..240899318 100644 --- a/packages/cluster/network/main.tf +++ b/packages/cluster/network/main.tf @@ -310,8 +310,9 @@ resource "google_compute_url_map" "orch_map" { ### IPv4 block ### resource "google_compute_target_https_proxy" "default" { - name = "${var.prefix}https-proxy" - url_map = google_compute_url_map.orch_map.self_link + name = "${var.prefix}https-proxy" + url_map = google_compute_url_map.orch_map.self_link + http_keep_alive_timeout_sec = 540 certificate_map = "//certificatemanager.googleapis.com/${google_certificate_manager_certificate_map.certificate_map.id}" } From 6fca79ae149cb04b381a267e9bcf305d38e9abb8 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Thu, 8 May 2025 11:36:56 +0200 Subject: [PATCH 2/8] Add buffet between upstream and downstream --- packages/shared/pkg/proxy/proxy.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/packages/shared/pkg/proxy/proxy.go b/packages/shared/pkg/proxy/proxy.go index 08180fb0c..d7459de1c 100644 --- a/packages/shared/pkg/proxy/proxy.go +++ b/packages/shared/pkg/proxy/proxy.go @@ -12,6 +12,7 @@ import ( ) const maxClientConns = 8192 // Reasonably big number that is lower than the number of available ports. +const idleTimeoutBufferUpstreamDownstream = 10 type Proxy struct { http.Server @@ -33,10 +34,13 @@ func New( return &Proxy{ Server: http.Server{ - Addr: fmt.Sprintf(":%d", port), - ReadTimeout: 0, - WriteTimeout: 0, - IdleTimeout: idleTimeout, + Addr: fmt.Sprintf(":%d", port), + ReadTimeout: 0, + WriteTimeout: 0, + // Downstream idle > upstream idle, + // otherwise a new downstream connection can try to reuse an open upstream connection + // and reuse TCP connection which is already closed + IdleTimeout: idleTimeout + idleTimeoutBufferUpstreamDownstream, ReadHeaderTimeout: 0, Handler: handler(p, getDestination), }, From 7a033a1d0ff69ab8920e1f2fa4de871893f4c83b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Nov=C3=A1k?= Date: Thu, 8 May 2025 03:03:53 -0700 Subject: [PATCH 3/8] Update packages/shared/pkg/proxy/proxy.go --- packages/shared/pkg/proxy/proxy.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/shared/pkg/proxy/proxy.go b/packages/shared/pkg/proxy/proxy.go index d7459de1c..9131294ea 100644 --- a/packages/shared/pkg/proxy/proxy.go +++ b/packages/shared/pkg/proxy/proxy.go @@ -38,8 +38,8 @@ func New( ReadTimeout: 0, WriteTimeout: 0, // Downstream idle > upstream idle, - // otherwise a new downstream connection can try to reuse an open upstream connection - // and reuse TCP connection which is already closed + // otherwise a new downstream connection can try to reuse an upstream connection + // which is already in state `CLOSE_WAIT` resulting in error IdleTimeout: idleTimeout + idleTimeoutBufferUpstreamDownstream, ReadHeaderTimeout: 0, Handler: handler(p, getDestination), From a287207084e85b98065ff2b2d6f497679b7d4d70 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Thu, 8 May 2025 12:07:44 +0200 Subject: [PATCH 4/8] Improve comment --- packages/shared/pkg/proxy/proxy.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/shared/pkg/proxy/proxy.go b/packages/shared/pkg/proxy/proxy.go index 9131294ea..9eef8a207 100644 --- a/packages/shared/pkg/proxy/proxy.go +++ b/packages/shared/pkg/proxy/proxy.go @@ -37,9 +37,9 @@ func New( Addr: fmt.Sprintf(":%d", port), ReadTimeout: 0, WriteTimeout: 0, - // Downstream idle > upstream idle, - // otherwise a new downstream connection can try to reuse an upstream connection - // which is already in state `CLOSE_WAIT` resulting in error + // Downstream (client side) idle > upstream (server side) idle + // otherwise a new connection between downstream idle and upstream idle + // will try to reuse an upstream connection which is in `CLOSE_WAIT` state resulting in error IdleTimeout: idleTimeout + idleTimeoutBufferUpstreamDownstream, ReadHeaderTimeout: 0, Handler: handler(p, getDestination), From 7ec208f2a1220613dd6acdad60cfcf3491255f29 Mon Sep 17 00:00:00 2001 From: Jakub Novak Date: Thu, 8 May 2025 12:19:16 +0200 Subject: [PATCH 5/8] Add comments to idle timeouts --- packages/api/main.go | 4 +++- packages/client-proxy/internal/proxy/proxy.go | 5 ++++- packages/orchestrator/internal/proxy/proxy.go | 5 ++++- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/packages/api/main.go b/packages/api/main.go index 3c822a311..1084fe4ab 100644 --- a/packages/api/main.go +++ b/packages/api/main.go @@ -47,7 +47,9 @@ const ( maxReadTimeout = 75 * time.Second maxWriteTimeout = 75 * time.Second - idleTimeout = 620 * time.Second + // This timeout should be > 600 (GCP LB upstream idle timeout) to prevent race condition + // https://cloud.google.com/load-balancing/docs/https#timeouts_and_retries%23:~:text=The%20load%20balancer%27s%20backend%20keepalive,is%20greater%20than%20600%20seconds + idleTimeout = 620 * time.Second defaultPort = 80 ) diff --git a/packages/client-proxy/internal/proxy/proxy.go b/packages/client-proxy/internal/proxy/proxy.go index de937dd97..ce56c7e89 100644 --- a/packages/client-proxy/internal/proxy/proxy.go +++ b/packages/client-proxy/internal/proxy/proxy.go @@ -23,7 +23,10 @@ const ( maxRetries = 3 minOrchestratorProxyConns = 3 - idleTimeout = 620 * time.Second + // This timeout should be > 600 (GCP LB upstream idle timeout) to prevent race condition + // Also it's a good practice to set it to a value higher than the idle timeout of the backend service + // https://cloud.google.com/load-balancing/docs/https#timeouts_and_retries%23:~:text=The%20load%20balancer%27s%20backend%20keepalive,is%20greater%20than%20600%20seconds + idleTimeout = 620 * time.Second // We use a constant connection key, because we don't have to separate connection pools // as we need to do when connecting to sandboxes (from orchestrator proxy) to prevent reuse of pool connections diff --git a/packages/orchestrator/internal/proxy/proxy.go b/packages/orchestrator/internal/proxy/proxy.go index 1d6ddfac5..95d3c0bf1 100644 --- a/packages/orchestrator/internal/proxy/proxy.go +++ b/packages/orchestrator/internal/proxy/proxy.go @@ -19,7 +19,10 @@ import ( const ( minSandboxConns = 1 - idleTimeout = 630 * time.Second + // This timeout should be > 600 (GCP LB upstream idle timeout) to prevent race condition + // Also it's a good practice to set it to higher values as you progress in the stack + // https://cloud.google.com/load-balancing/docs/https#timeouts_and_retries%23:~:text=The%20load%20balancer%27s%20backend%20keepalive,is%20greater%20than%20600%20seconds + idleTimeout = 630 * time.Second ) func NewSandboxProxy(port uint, sandboxes *smap.Map[*sandbox.Sandbox]) (*reverse_proxy.Proxy, error) { From 83cfb4781fc79eb03862e82b8b09d48fa9f47df6 Mon Sep 17 00:00:00 2001 From: Tomas Valenta Date: Thu, 8 May 2025 06:35:09 -0700 Subject: [PATCH 6/8] Update packages/cluster/network/main.tf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jakub Novák --- packages/cluster/network/main.tf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/packages/cluster/network/main.tf b/packages/cluster/network/main.tf index 240899318..de76ad28e 100644 --- a/packages/cluster/network/main.tf +++ b/packages/cluster/network/main.tf @@ -310,9 +310,8 @@ resource "google_compute_url_map" "orch_map" { ### IPv4 block ### resource "google_compute_target_https_proxy" "default" { - name = "${var.prefix}https-proxy" - url_map = google_compute_url_map.orch_map.self_link - http_keep_alive_timeout_sec = 540 + name = "${var.prefix}https-proxy" + url_map = google_compute_url_map.orch_map.self_link certificate_map = "//certificatemanager.googleapis.com/${google_certificate_manager_certificate_map.certificate_map.id}" } From ca8fae5fcb8e9689900aba04a6771892450c562b Mon Sep 17 00:00:00 2001 From: Tomas Valenta Date: Thu, 8 May 2025 06:35:17 -0700 Subject: [PATCH 7/8] Update packages/api/main.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jakub Novák --- packages/api/main.go | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/api/main.go b/packages/api/main.go index 1084fe4ab..e2493026c 100644 --- a/packages/api/main.go +++ b/packages/api/main.go @@ -176,7 +176,6 @@ func NewGinServer(ctx context.Context, logger *zap.Logger, apiStore *handlers.AP Handler: r, Addr: fmt.Sprintf("0.0.0.0:%d", port), // Configure timeouts to be greater than the proxy timeouts. - // https://github.com/golang/go/issues/47007 ReadTimeout: maxReadTimeout, WriteTimeout: maxWriteTimeout, IdleTimeout: idleTimeout, From 04a5bd0ab85ba4b1274b55f5adfd2978908cd750 Mon Sep 17 00:00:00 2001 From: Tomas Valenta Date: Thu, 8 May 2025 06:43:39 -0700 Subject: [PATCH 8/8] Lower the idle timeouts --- packages/client-proxy/internal/proxy/proxy.go | 2 +- packages/orchestrator/internal/proxy/proxy.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/client-proxy/internal/proxy/proxy.go b/packages/client-proxy/internal/proxy/proxy.go index ce56c7e89..f857e2022 100644 --- a/packages/client-proxy/internal/proxy/proxy.go +++ b/packages/client-proxy/internal/proxy/proxy.go @@ -26,7 +26,7 @@ const ( // This timeout should be > 600 (GCP LB upstream idle timeout) to prevent race condition // Also it's a good practice to set it to a value higher than the idle timeout of the backend service // https://cloud.google.com/load-balancing/docs/https#timeouts_and_retries%23:~:text=The%20load%20balancer%27s%20backend%20keepalive,is%20greater%20than%20600%20seconds - idleTimeout = 620 * time.Second + idleTimeout = 610 * time.Second // We use a constant connection key, because we don't have to separate connection pools // as we need to do when connecting to sandboxes (from orchestrator proxy) to prevent reuse of pool connections diff --git a/packages/orchestrator/internal/proxy/proxy.go b/packages/orchestrator/internal/proxy/proxy.go index 95d3c0bf1..e5a111507 100644 --- a/packages/orchestrator/internal/proxy/proxy.go +++ b/packages/orchestrator/internal/proxy/proxy.go @@ -22,7 +22,7 @@ const ( // This timeout should be > 600 (GCP LB upstream idle timeout) to prevent race condition // Also it's a good practice to set it to higher values as you progress in the stack // https://cloud.google.com/load-balancing/docs/https#timeouts_and_retries%23:~:text=The%20load%20balancer%27s%20backend%20keepalive,is%20greater%20than%20600%20seconds - idleTimeout = 630 * time.Second + idleTimeout = 620 * time.Second ) func NewSandboxProxy(port uint, sandboxes *smap.Map[*sandbox.Sandbox]) (*reverse_proxy.Proxy, error) {