#!/bin/sh
# graceful-drain.sh — preStop hook for IGW Pod hc container
# POSIX/ash compatible (alpine). Each step prints [N] description.
# Best-effort: individual curl failures are logged but do not abort the script.
#
# Environment (all optional):
#   HC_URL         base URL of the hc container      (default http://127.0.0.1:18180)
#   ENVOY_ADMIN    base URL of the Envoy admin API   (default http://127.0.0.1:15000)
#   DRAIN_TIMEOUT  max seconds to wait for in-flight requests (default 120)
#   POLL_INTERVAL  seconds between Envoy stats polls          (default 2)
#   LB_BUFFER      seconds to sleep after failing the LB health check (default 10)

set -u

HC_URL="${HC_URL:-http://127.0.0.1:18180}"
ENVOY_ADMIN="${ENVOY_ADMIN:-http://127.0.0.1:15000}"
DRAIN_TIMEOUT="${DRAIN_TIMEOUT:-120}"
POLL_INTERVAL="${POLL_INTERVAL:-2}"
LB_BUFFER="${LB_BUFFER:-10}"

START_TS=$(date +%s)

# log MESSAGE...
# Writes one line to stdout, prefixed with a UTC timestamp.
log() {
  printf "ts=%s %s\n" "$(date -u +%Y-%m-%dT%H:%M:%SZ)" "$*"
}

# do_curl CURL_ARGS...
# curl wrapper — logs failures, never exits.
# Outputs: whatever curl produced (newline-terminated, only when non-empty).
# Returns: curl's exit status, so callers decide whether a failure matters.
do_curl() {
  # POSIX sh has no `local`; prefixed names avoid clobbering script globals.
  # -S re-enables the error text that -s alone suppresses, so the warning
  # below actually carries the reason for the failure.
  _dc_out=$(curl -sS -f -m 5 "$@" 2>&1)
  _dc_rc=$?
  if [ "$_dc_rc" -ne 0 ]; then
    log "level=warn msg=curl_failed args=$* rc=${_dc_rc} output=${_dc_out}"
  fi
  # Terminate with a newline so the next log line does not get glued to the
  # response body.
  if [ -n "$_dc_out" ]; then
    printf '%s\n' "$_dc_out"
  fi
  return "$_dc_rc"
}

# sum_active_requests
# Reads "name: value" stat lines on stdin and prints the sum of every
# downstream_rq_active / upstream_rq_active value (0 when none match).
sum_active_requests() {
  awk -F': ' '
    /downstream_rq_active|upstream_rq_active/ { sum += $2 }
    END { print (sum+0) }
  '
}

# ---------------------------------------------------------------------------
# [1] Transition hc → DRAINING
# ---------------------------------------------------------------------------
log "[1] transitioning hc to DRAINING"
do_curl -X POST "${HC_URL}/drain/start" || true  # best-effort by design

# ---------------------------------------------------------------------------
# [2] Signal Envoy to drain listeners (graceful, don't exit)
# ---------------------------------------------------------------------------
log "[2] posting drain_listeners to Envoy admin"
do_curl -X POST "${ENVOY_ADMIN}/drain_listeners?graceful&skip_exit" || true

# ---------------------------------------------------------------------------
# [3] Poll active request counts until 0 or timeout
# ---------------------------------------------------------------------------
log "[3] polling Envoy stats for active requests (timeout=${DRAIN_TIMEOUT}s)"
DEADLINE=$((START_TS + DRAIN_TIMEOUT))

while true; do
  # Fetch stats. A failed fetch must NOT be read as "0 active requests":
  # empty input would sum to 0 and end the drain early, so the zero check
  # only runs when curl succeeded.
  STATS_RC=0
  STATS=$(curl -sf -m 5 \
    "${ENVOY_ADMIN}/stats?filter=downstream_rq_active%7Cupstream_rq_active" \
    2>/dev/null) || STATS_RC=$?

  if [ "$STATS_RC" -eq 0 ]; then
    ACTIVE=$(printf '%s\n' "$STATS" | sum_active_requests)
    log "level=info msg=drain_poll active_requests=${ACTIVE}"

    # 2>/dev/null: a non-integer ACTIVE makes the test fail quietly and we
    # simply keep polling.
    if [ "${ACTIVE}" -eq 0 ] 2>/dev/null; then
      log "level=info msg=all_requests_drained"
      break
    fi
  else
    log "level=warn msg=drain_poll_stats_unavailable rc=${STATS_RC}"
  fi

  # Sample the clock after the fetch (which may block for up to 5s) so the
  # deadline check and the reported elapsed time are not stale.
  NOW=$(date +%s)
  if [ "$NOW" -ge "$DEADLINE" ]; then
    log "level=warn msg=drain_timeout_exceeded elapsed=$((NOW - START_TS))s"
    break
  fi

  sleep "${POLL_INTERVAL}"
done

# ---------------------------------------------------------------------------
# [4] Flip /health_check.html → 503 (remove from HAProxy backend pool)
# ---------------------------------------------------------------------------
log "[4] transitioning hc to DRAINED_WAIT_LB (/health_check.html → 503)"
do_curl -X POST "${HC_URL}/drain/lb-fail" || true

# ---------------------------------------------------------------------------
# [5] Wait for HAProxy to detect the health failure and remove backend
# ---------------------------------------------------------------------------
log "[5] sleeping LB_BUFFER=${LB_BUFFER}s for HAProxy to drain backend"
sleep "${LB_BUFFER}"

# ---------------------------------------------------------------------------
# [6] Flip /health → 503 (K8s removes endpoint from Service)
# ---------------------------------------------------------------------------
log "[6] transitioning hc to TERMINATING (/health → 503)"
do_curl -X POST "${HC_URL}/disable-readiness" || true

# ---------------------------------------------------------------------------
# [7] Done
# ---------------------------------------------------------------------------
ELAPSED=$(( $(date +%s) - START_TS ))
log "[done] preStop completed in ${ELAPSED}s"