From d3e8d2b6ddb912f108c93bf258cfb745a6f1c902 Mon Sep 17 00:00:00 2001
From: serversidehannes <hangus1995@gmail.com>
Date: Tue, 30 Jun 2026 20:05:14 +0200
Subject: [PATCH] fix(chart): cap per-pod backend concurrency at the frontproxy
 (maxconn)

The remaining concurrent-backup OOM originates below the app's memory limiter:
uvicorn buffers each in-flight request body off the socket BEFORE our limiter
runs, so a backup flood piles up request bodies in the HTTP server's C-level
buffers (the governor reads ~64MB while RSS hits 512Mi+ -> OOMKilled, exit 137).
This memory is invisible to, and ungovernable from, the app layer.

The load balancer is the right place to bound it. haproxy had only a global
maxconn (4096) and no per-pod cap, so it could dump 100+ concurrent connections
onto a single pod. Add `maxconn` per backend server (default 40) plus
`timeout queue`: haproxy now caps in-flight requests per pod and QUEUES the
excess (redispatching to a less-loaded pod) instead of overrunning one pod's
uvicorn buffers. The app's existing limiter then governs the admitted few.

Verified locally at prod config (512Mi cap, 64MB budget, 2026.6.14 app):
  - direct 128x16MB PUT flood -> OOMKilled exit 137 (reproduces prod)
  - same flood via haproxy maxconn 40 -> 256/256 ok, pod peaks 335MiB, no OOM
  - harsh mixed upload+GET flood via haproxy -> 322MiB, no OOM
haproxy queues rather than rejects, so clients mostly see success, not 503s.
Validated the rendered haproxy.cfg with `haproxy -c` (exit 0).
---
 chart/templates/frontproxy/configmap.yaml | 6 +++++-
 chart/values.yaml                         | 9 +++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)
diff --git a/chart/templates/frontproxy/configmap.yaml b/chart/templates/frontproxy/configmap.yaml
index 315d125..d56e9ef 100644
--- a/chart/templates/frontproxy/configmap.yaml
+++ b/chart/templates/frontproxy/configmap.yaml
@@ -20,6 +20,7 @@ data:
       timeout connect {{ .Values.frontproxy.timeouts.connect }}
       timeout client {{ .Values.frontproxy.timeouts.client }}
       timeout server {{ .Values.frontproxy.timeouts.server }}
+      timeout queue {{ .Values.frontproxy.timeouts.queue }}
 
     frontend http_in
       bind *:{{ .Values.frontproxy.containerPort }}
@@ -32,7 +33,10 @@ data:
       # Use the FQDN: HAProxy's resolver does NOT apply /etc/resolv.conf search
       # domains, so a bare service name NXDOMAINs and leaves 0 backends (503 <NOSRV>).
       option redispatch
-      server-template pod 1-{{ int .Values.replicaCount }} {{ .Chart.Name }}-headless.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.service.port }} check resolvers k8s init-addr none
+      # maxconn per pod bounds in-flight requests on each backend so uvicorn can't
+      # accumulate request-body buffers and OOM; excess requests queue (for up to
+      # `timeout queue`, then 503) and redispatch instead of overrunning a pod.
+      server-template pod 1-{{ int .Values.replicaCount }} {{ .Chart.Name }}-headless.{{ .Release.Namespace }}.svc.cluster.local:{{ .Values.service.port }} check resolvers k8s init-addr none maxconn {{ int .Values.frontproxy.maxConnPerPod }}
 
     resolvers k8s
       parse-resolv-conf
diff --git a/chart/values.yaml b/chart/values.yaml
index fabc1d9..42866a6 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -209,6 +209,15 @@ frontproxy:
     client: "1h"
     server: "1h"
     connect: "10s"
+    # Max time a request waits in HAProxy's queue for a free pod slot before 503.
+    queue: "30s"
+  # Max concurrent connections HAProxy sends to EACH backend pod. This is the
+  # real bound on a pod's memory: uvicorn buffers each in-flight request body off
+  # the socket before our app's limiter runs, so without a per-pod cap a backup
+  # flood piles up bodies and OOMs the pod. Excess connections queue here (up to
+  # timeouts.queue) and redispatch to a less-loaded pod instead of hard-failing.
+  # Keep maxConnPerPod x worst-case part size well under the pod memory limit.
+  maxConnPerPod: 40
   resources:
     requests:
       cpu: "50m"