From 8acf20f70bd341612d6fc06294957dc99ca1bd6c Mon Sep 17 00:00:00 2001
From: tithakka <tithakka@redhat.com>
Date: Tue, 30 Jun 2026 22:58:59 -0500
Subject: [PATCH] HYPERFLEET-1306 - fix: add preStop hook and rollout strategy
 to API deployment

---
 charts/README.md                 |  3 +++
 charts/templates/deployment.yaml | 11 +++++++++++
 charts/values.yaml               | 25 +++++++++++++++++++++++++
 3 files changed, 39 insertions(+)

diff --git a/charts/README.md b/charts/README.md
index 0567b239..e4e0312d 100644
--- a/charts/README.md
+++ b/charts/README.md
@@ -127,6 +127,9 @@ helm install hyperfleet-api oci://REGISTRY/hyperfleet-api \
 | service | object | `{"type":"ClusterIP"}` | Kubernetes Service configuration |
 | service.type | string | `"ClusterIP"` | Service type (`ClusterIP`, `LoadBalancer`, `NodePort`) |
 | resources | object | `{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"100m","memory":"128Mi"}}` | CPU and memory resource requests and limits |
+| lifecycle | object | `{"preStop":{"exec":{"command":["/bin/sh","-c","sleep 5"]}}}` | Container lifecycle hooks. Use `preStop` to delay SIGTERM during rolling updates, giving the LoadBalancer time to drain the old pod. See HYPERFLEET-1306. |
+| strategy | object | `{"rollingUpdate":{"maxSurge":1,"maxUnavailable":0},"type":"RollingUpdate"}` | Deployment rollout strategy. `maxUnavailable: 0` ensures zero-downtime during rolling updates — the old pod stays until the new one is Ready. Set to `null` to use the Kubernetes default (25% maxUnavailable, 25% maxSurge). |
+| terminationGracePeriodSeconds | int | `30` | Seconds Kubernetes allows for pod shutdown (preStop hook plus SIGTERM handling) before SIGKILL. Must be > preStop sleep (5s) + API server shutdown (10s) + buffer. The health server uses a separate 20s timeout for OTel cleanup. |
 | nodeSelector | object | `{}` | Node selector constraints for pod scheduling |
 | tolerations | list | `[]` | Tolerations for pod scheduling |
 | affinity | object | `{}` | Affinity rules for pod scheduling |
diff --git a/charts/templates/deployment.yaml b/charts/templates/deployment.yaml
index 4f40512b..819a14ef 100644
--- a/charts/templates/deployment.yaml
+++ b/charts/templates/deployment.yaml
@@ -9,6 +9,10 @@ spec:
   {{- if not .Values.autoscaling.enabled }}
   replicas: {{ .Values.replicaCount }}
   {{- end }}
+  {{- if .Values.strategy }}
+  strategy:
+    {{- toYaml .Values.strategy | nindent 4 }}
+  {{- end }}
   selector:
     matchLabels:
       {{- include "hyperfleet-api.selectorLabels" . | nindent 6 }}
@@ -44,6 +48,9 @@ spec:
         {{- toYaml . | nindent 8 }}
       {{- end }}
       serviceAccountName: {{ include "hyperfleet-api.serviceAccountName" . }}
+      {{- if not (kindIs "invalid" .Values.terminationGracePeriodSeconds) }}{{- /* nil check, not truthiness, so an explicit 0 still renders */}}
+      terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }}
+      {{- end }}
       securityContext:
         {{- toYaml .Values.podSecurityContext | nindent 8 }}
       {{- if and .Values.nativeSidecars (not (semverCompare ">=1.28.0-0" .Capabilities.KubeVersion.Version)) }}
@@ -151,6 +158,10 @@ spec:
           failureThreshold: 3
         resources:
           {{- toYaml .Values.resources | nindent 10 }}
+        {{- if .Values.lifecycle }}
+        lifecycle:
+          {{- toYaml .Values.lifecycle | nindent 10 }}
+        {{- end }}
         volumeMounts:
         # ConfigMap mount - generated from values or existingConfigMap
         - name: config
diff --git a/charts/values.yaml b/charts/values.yaml
index 45b53c95..d8bd2e4b 100644
--- a/charts/values.yaml
+++ b/charts/values.yaml
@@ -267,6 +267,31 @@ resources:
     cpu: 100m
     memory: 128Mi
 
+# -- Container lifecycle hooks. Use `preStop` to delay SIGTERM during
+# rolling updates, giving the LoadBalancer time to drain the old pod.
+# See HYPERFLEET-1306.
+lifecycle:
+  preStop:
+    exec:
+      command:
+        - "/bin/sh"
+        - "-c"
+        - "sleep 5"
+
+# -- Deployment rollout strategy. `maxUnavailable: 0` ensures zero-downtime
+# during rolling updates — the old pod stays until the new one is Ready.
+# Set to `null` to use the Kubernetes default (25% maxUnavailable, 25% maxSurge).
+strategy:
+  rollingUpdate:
+    maxSurge: 1
+    maxUnavailable: 0
+  type: RollingUpdate
+
+# -- Seconds Kubernetes allows for pod shutdown (preStop hook plus SIGTERM handling) before SIGKILL.
+# Must be > preStop sleep (5s) + API server shutdown (10s) + buffer.
+# The health server uses a separate 20s timeout for OTel cleanup.
+terminationGracePeriodSeconds: 30
+
 # -- Node selector constraints for pod scheduling
 nodeSelector: {}