ServerSideHannes · ServerSideHannes · Jun 30, 2026 · Jun 30, 2026
diff --git a/chart/templates/frontproxy/configmap.yaml b/chart/templates/frontproxy/configmap.yaml
@@ -20,6 +20,7 @@ data:
       timeout connect {{ .Values.frontproxy.timeouts.connect }}
       timeout client {{ .Values.frontproxy.timeouts.client }}
       timeout server {{ .Values.frontproxy.timeouts.server }}
+      timeout queue {{ .Values.frontproxy.timeouts.queue }}
 
     frontend http_in
       bind *:{{ .Values.frontproxy.containerPort }}
@@ -32,7 +33,10 @@ data:
       # Use the FQDN: HAProxy's resolver does NOT apply /etc/resolv.conf search
       # domains, so a bare service name NXDOMAINs and leaves 0 backends (503 <NOSRV>).
       option redispatch
-      server-template pod 1-{{ int .Values.replicaCount }} {{ .Chart.Name }}-headless.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.service.port }} check resolvers k8s init-addr none
+      # maxconn per pod bounds in-flight requests on each backend so uvicorn can't
+      # accumulate request-body buffers and OOM; excess requests wait in HAProxy's
+      # queue (up to `timeout queue`, then 503) instead of overrunning a pod.
+      server-template pod 1-{{ int .Values.replicaCount }} {{ .Chart.Name }}-headless.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.service.port }} check resolvers k8s init-addr none maxconn {{ int .Values.frontproxy.maxConnPerPod }}
 
     resolvers k8s
       parse-resolv-conf

diff --git a/chart/values.yaml b/chart/values.yaml
@@ -209,6 +209,15 @@ frontproxy:
     client: "1h"
     server: "1h"
     connect: "10s"
+    # How long a request waits in HAProxy's queue for a free pod slot before HAProxy returns 503.
+    queue: "30s"
+  # Max concurrent connections HAProxy sends to EACH backend pod. This is the
+  # real bound on a pod's memory: uvicorn buffers each in-flight request body off
+  # the socket before our app's limiter runs, so without a per-pod cap a backup
+  # flood piles up bodies and OOMs the pod. Excess requests wait in HAProxy's
+  # queue (up to timeouts.queue, then 503) for a free slot instead of overrunning it.
+  # ~40 x worst-case part size stays well under the pod memory limit.
+  maxConnPerPod: 40
   resources:
     requests:
       cpu: "50m"