From d3e8d2b6ddb912f108c93bf258cfb745a6f1c902 Mon Sep 17 00:00:00 2001 From: serversidehannes Date: Tue, 30 Jun 2026 20:05:14 +0200 Subject: [PATCH] fix(chart): cap per-pod backend concurrency at the frontproxy (maxconn) The remaining concurrent-backup OOM originates below the app's memory limiter: uvicorn buffers each in-flight request body off the socket BEFORE our limiter runs, so a backup flood piles up request bodies in the HTTP server's C-level buffers (the governor reads ~64MB while RSS hits 512Mi+ -> OOMKilled, exit 137). This memory is invisible and ungovernable from the app layer. The load balancer is the right place to bound it. haproxy had only a global maxconn (4096) and no per-pod cap, so it could dump 100+ concurrent connections onto a single pod. Add `maxconn` per backend server (default 40) plus `timeout queue`: haproxy now caps in-flight requests per pod and QUEUES the excess (handing each queued request to whichever pod frees a slot first) instead of overrunning one pod's uvicorn buffers. The app's existing limiter then governs the admitted few. Verified locally at prod config (512Mi cap, 64MB budget, 2026.6.14 app): - direct 128x16MB PUT flood -> OOMKilled exit 137 (reproduces prod) - same flood via haproxy maxconn 40 -> 256/256 ok, pod peaks 335MiB, no OOM - harsh mixed upload+GET flood via haproxy -> 322MiB, no OOM haproxy queues rather than rejects, so clients mostly see success, not 503s; a 503 is returned only when a request waits in the queue longer than `timeout queue`. Validated the rendered haproxy.cfg with `haproxy -c` (exit 0).
--- chart/templates/frontproxy/configmap.yaml | 6 +++++- chart/values.yaml | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/chart/templates/frontproxy/configmap.yaml b/chart/templates/frontproxy/configmap.yaml index 315d125..d56e9ef 100644 --- a/chart/templates/frontproxy/configmap.yaml +++ b/chart/templates/frontproxy/configmap.yaml @@ -20,6 +20,7 @@ data: timeout connect {{ .Values.frontproxy.timeouts.connect }} timeout client {{ .Values.frontproxy.timeouts.client }} timeout server {{ .Values.frontproxy.timeouts.server }} + timeout queue {{ .Values.frontproxy.timeouts.queue | default "30s" }} frontend http_in bind *:{{ .Values.frontproxy.containerPort }} @@ -32,7 +33,10 @@ data: # Use the FQDN: HAProxy's resolver does NOT apply /etc/resolv.conf search # domains, so a bare service name NXDOMAINs and leaves 0 backends (503 ). option redispatch - server-template pod 1-{{ int .Values.replicaCount }} {{ .Chart.Name }}-headless.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.service.port }} check resolvers k8s init-addr none + # maxconn per pod bounds in-flight requests on each backend so uvicorn can't + # accumulate request-body buffers and OOM; excess requests wait (timeout + # queue) for any pod to free a slot instead of overrunning a pod. + server-template pod 1-{{ int .Values.replicaCount }} {{ .Chart.Name }}-headless.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.service.port }} check resolvers k8s init-addr none maxconn {{ int .Values.frontproxy.maxConnPerPod }} resolvers k8s parse-resolv-conf diff --git a/chart/values.yaml b/chart/values.yaml index fabc1d9..42866a6 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -209,6 +209,15 @@ frontproxy: client: "1h" server: "1h" connect: "10s" + # Max wait in HAProxy's queue for a free pod slot before a 503 (30s if unset). + queue: "30s" + # Max concurrent connections HAProxy sends to EACH backend pod.
This is the + # real bound on a pod's memory: uvicorn buffers each in-flight request body off + # the socket before our app's limiter runs, so without a per-pod cap a backup + # flood piles up bodies and OOMs the pod. Excess connections queue here (up to + # timeouts.queue) and go to the first pod with a free slot instead of failing. + # ~40 x worst-case part size stays well under the pod memory limit; 0 = no cap. + maxConnPerPod: 40 resources: requests: cpu: "50m"