#!/usr/bin/env bash
# scripts/dns-flip-test.sh — harness for reproducing ServiceEntry resolution
# behavior (DNS vs DNS_ROUND_ROBIN, i.e. the "strict" and "logical" variants).
#
# Prerequisite: the lab must already be running (scripts/dns-lab-setup.sh).
#
# usage: bash scripts/dns-flip-test.sh <strict|logical> <mode1|mode2|mode3>
#   mode1  does an IP flip drop existing long-lived sessions?
#          (STRICT=drain vs LOGICAL=preserve)
#   mode2  traffic loss to a dead IP + recovery via outlier detection / retry
#          (mainly for STRICT)
#   mode3  LOGICAL trade-off: sessions survive the flip and only reconnect,
#          late, once the old IP actually dies
#
# Environment:
#   CTX  kubectl/istioctl context (default: homelab)
#   NS   namespace of the lab      (default: dns-lab)
#
# The report is written to docs/test-reports/. (Draft — the first live run
# will need filter/timing tuning.)
#
# Note: -e is intentionally not set; many probes (grep with no match, kubectl
# against a half-ready lab) are best-effort and must not abort the report.
set -uo pipefail

readonly USAGE='usage: dns-flip-test.sh <strict|logical> <mode1|mode2|mode3>'

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

VARIANT="${1:?$USAGE}"
MODE="${2:?$USAGE}"

# Validate arguments BEFORE touching the cluster or creating a report file.
# The report body runs as a pipeline stage (subshell), so an `exit` in there
# can neither stop this script nor set its exit status.
case "$VARIANT" in
  strict | logical) ;;
  *) die "unknown variant: $VARIANT — $USAGE" ;;
esac
case "$MODE" in
  mode1 | mode2 | mode3) ;;
  *) die "unknown mode: $MODE — $USAGE" ;;
esac

CTX="${CTX:-homelab}"
NS="${NS:-dns-lab}"
DIR="$(cd "$(dirname "$0")/.." && pwd)"
SC="$DIR/scenarios/50-dns-resolution"
CL="outbound|443||gslb.lab.internal"
STAMP="$(date +%Y-%m-%d_%H%M%S)"
OUT="$DIR/docs/test-reports/${STAMP}_dns-${VARIANT}-${MODE}.md"
mkdir -p "$(dirname "$OUT")" "$DIR/tmp/dns-lab"

# kubectl bound to the lab context and namespace.
k() { kubectl --context="$CTX" -n "$NS" "$@"; }

IP_A="$(k get svc backend-a -o jsonpath='{.spec.clusterIP}')"
IP_B="$(k get svc backend-b -o jsonpath='{.spec.clusterIP}')"
# An empty IP would make flip_single/flip_both write a malformed hosts entry.
[[ -n "$IP_A" && -n "$IP_B" ]] \
  || die "could not read backend-a/backend-b ClusterIP (ctx=$CTX ns=$NS); is the lab up? (scripts/dns-lab-setup.sh)"

# Point gslb.lab.internal at a single IP by rewriting /hosts/addn in the
# lab-dns writer container.   $1 - IP address
flip_single() {
  k exec deploy/lab-dns -c writer -- sh -c "printf '%s gslb.lab.internal\n' '$1' > /hosts/addn"
}

# Point gslb.lab.internal at two IPs.   $1, $2 - IP addresses
flip_both() {
  k exec deploy/lab-dns -c writer -- sh -c "printf '%s gslb.lab.internal\n%s gslb.lab.internal\n' '$1' '$2' > /hosts/addn"
}

# Current answer for gslb.lab.internal as seen from the netshoot pod,
# flattened onto one line.
digq() {
  k exec deploy/netshoot -- dig +short gslb.lab.internal 2>/dev/null | tr '\n' ' '
}

# Endpoints the fortio sidecar holds for the gslb cluster.
eps() {
  istioctl --context="$CTX" proxy-config endpoints "deploy/fortio.$NS" --cluster "$CL" 2>/dev/null
}

# Dump connection statistics for the gslb cluster from the fortio sidecar.
estat() {
  # (1) Cluster-level connection counters — the canonical metrics, included
  #     via the proxyStatsMatcher annotation. The upstream_cx_destroy delta is
  #     the key STRICT (jumps on flip) vs LOGICAL (unchanged on flip) signal.
  #     With pipefail, a grep that matches nothing fails the pipeline and
  #     triggers the "(none)" fallback.
  echo "[cluster|443 cx]"
  k exec deploy/fortio -c istio-proxy -- pilot-agent request GET "stats?filter=gslb" 2>/dev/null \
    | grep -E 'cluster\.outbound\|443.*(upstream_cx_total|upstream_cx_destroy|upstream_cx_active|upstream_cx_connect_fail|membership_change)' \
    | sed 's#cluster.outbound|443||gslb.lab.internal.##' || echo "(none)"
  # (2) Per-endpoint connection counters — always present regardless of the
  #     stats matcher. Shows which backend the connections are attached to.
  echo "[endpoint|443 cx]"
  k exec deploy/fortio -c istio-proxy -- pilot-agent request GET clusters 2>/dev/null \
    | grep -E 'outbound\|443\|\|gslb\.lab\.internal::[0-9]' \
    | grep -E 'cx_active|cx_total|rq_total|health_flags' \
    | sed 's#outbound|443||gslb.lab.internal::##'
}

# Request loop: for <sec> seconds, log the response body (who answered) and
# the HTTP status code with a timestamp, from inside the netshoot pod.
#   $1 - duration in seconds
#   $2 - output file (output is also echoed to stdout)
loop() {
  local secs=$1 outfile=$2
  k exec deploy/netshoot -- sh -c '
    end=$(( $(date +%s) + '"$secs"' ))
    while [ $(date +%s) -lt $end ]; do
      r=$(curl -s -m 2 -w "|%{http_code}" http://gslb.lab.internal/ 2>/dev/null)
      body=$(printf "%s" "${r%%|*}" | tr -d "\r\n")
      printf "%s who=%s code=%s\n" "$(date +%H:%M:%S)" "$body" "${r##*|}"
      sleep 0.4
    done' | tee "$outfile"
}

# Emit one fenced snapshot (dig answer, endpoints, stats).   $1 - heading
snap() {
  echo "### $1"
  echo '```'
  echo "dig : $(digq)"
  echo "-- endpoints --"
  eps
  echo "-- stats --"
  estat
  echo '```'
}

# mode1: does an IP flip drop existing long-lived sessions? Snapshots are
# taken before and after the flip while load is still running, so the
# cx_destroy delta can be isolated.
run_mode1() {
  local fortio_pid loop_pid
  flip_single "$IP_A"
  sleep 6
  echo "## 2. BEFORE (gslb -> backend-a, load 전)"
  snap before
  echo
  echo "## 3. keepalive 롱세션 유지 중 flip(A→B) — 전/후를 load 중에 스냅샷"
  # Long-lived sessions: fortio keepalive, 2 connections for 40s (background)
  # to warm upstream connections towards backend-a.
  k exec deploy/fortio -c fortio -- fortio load -c 2 -qps 8 -t 40s -keepalive -quiet http://gslb.lab.internal/ \
    > "$DIR/tmp/dns-lab/fortio-$MODE.txt" 2>&1 &
  fortio_pid=$!
  # netshoot who= loop for 40s (background, recorded to file only).
  (loop 40 "$DIR/tmp/dns-lab/loop-$MODE.txt" >/dev/null 2>&1) &
  loop_pid=$!
  sleep 8
  echo "### 3a. MID-PRE-FLIP (load 중; 연결이 backend-a 에 워밍됨)"
  snap mid_pre
  echo ">> FLIP A→B @ $(date +%H:%M:%S)"
  flip_single "$IP_B"
  sleep 10
  echo "### 3b. MID-POST-FLIP (load 여전히 진행 중 — 3a 대비 cx_destroy 델타가 정체)"
  snap mid_post
  wait "$fortio_pid" 2>/dev/null
  wait "$loop_pid" 2>/dev/null
  echo '```'
  echo "-- fortio(keepalive) --"
  grep -E 'Sockets used|Code |All done|error' "$DIR/tmp/dns-lab/fortio-$MODE.txt" 2>/dev/null
  echo '```'
  echo
  echo "### who= 타임라인 (netshoot; flip 전후 backend 전환, ~1s 간격 샘플)"
  echo '```'
  # Every other sample line only, to keep the timeline short.
  awk 'NR%2==1' "$DIR/tmp/dns-lab/loop-$MODE.txt" 2>/dev/null
  echo '```'
  echo
  echo "## 4. AFTER (load 종료 후)"
  snap after
  echo
  echo "## 5. 읽는 법"
  echo "- STRICT_DNS 기대: 3a→3b 에서 endpoint A→B 교체, membership_change +1, **upstream_cx_destroy 급증(기존 A 연결 drain)**, cx_total(B) 신규. who 는 flip 후 backend-b 로 전환."
  echo "- LOGICAL_DNS 기대: 3a→3b 에서 endpoint 논리 1개 유지, **cx_destroy 불변(기존 세션 보존)**, who 는 커넥션 재사용 동안 backend-a 잔존."
}

# mode2: loss towards a dead IP, then recovery (strict recommended).
run_mode2() {
  if [[ "$VARIANT" == logical ]]; then
    echo "> 주의: LOGICAL은 multi-IP A record에서 NACK 위험 — mode2는 strict 권장."
  fi
  flip_both "$IP_A" "$IP_B"
  sleep 6
  echo "## 2. BEFORE (gslb -> A,B 둘 다; STRICT는 endpoint 2개)"
  snap before
  echo
  echo "## 3. backend-a kill (DNS엔 A 잔존 = 죽은 IP)"
  k scale deploy/backend-a --replicas=0 >/dev/null
  sleep 4
  echo "### 3a. no-outlier/no-retry 구간 유실"
  loop 15 "$DIR/tmp/dns-lab/loop-$MODE-a.txt"
  echo
  echo "## 4. 복구: outlier(DR44) + retry(VS45) 적용"
  k apply -f "$SC/44-destinationrule-tls-outlier.yaml" >/dev/null
  k apply -f "$SC/45-virtualservice-80to443-retry.yaml" >/dev/null
  sleep 8
  echo "### 4a. 복구 후 구간"
  loop 15 "$DIR/tmp/dns-lab/loop-$MODE-b.txt"
  echo
  echo "## 5. AFTER"
  snap after
  k scale deploy/backend-a --replicas=1 >/dev/null
  echo
  echo "## 6. 읽는 법: 3a에서 code!=200(연결실패=유실) 비율 vs 4a에서 0 수렴."
}

# mode3: LOGICAL trade-off — the session survives the flip and only
# reconnects, late, once the old IP dies.
run_mode3() {
  local fortio_pid timeline_pid
  if [[ "$VARIANT" != logical ]]; then
    echo "> 주의: mode3는 LOGICAL 트레이드오프 시연 — variant=logical 로 실행 권장."
  fi
  flip_single "$IP_A"
  sleep 6
  echo "## 2. BEFORE (gslb -> backend-a)"
  snap before
  echo
  echo "## 3. keepalive 세션 유지 중 flip(A→B) — LOGICAL은 기존 세션 유지"
  k exec deploy/fortio -c fortio -- fortio load -c 1 -qps 5 -t 60s -keepalive -quiet http://gslb.lab.internal/ \
    > "$DIR/tmp/dns-lab/fortio-$MODE.txt" 2>&1 &
  fortio_pid=$!
  # Background timeline: flip at +10s, kill backend-a 20s later. Its markers
  # interleave with the foreground request loop output below.
  (
    sleep 10
    echo ">> FLIP A→B @ $(date +%H:%M:%S)"
    flip_single "$IP_B"
    sleep 20
    echo ">> KILL backend-a @ $(date +%H:%M:%S)"
    k scale deploy/backend-a --replicas=0 >/dev/null
  ) &
  timeline_pid=$!
  loop 60 "$DIR/tmp/dns-lab/loop-$MODE.txt"
  wait "$fortio_pid" 2>/dev/null
  # Make sure the kill step has finished before backend-a is restored, so a
  # delayed scale-to-0 cannot land after the scale-to-1 below.
  wait "$timeline_pid" 2>/dev/null
  echo '```'
  echo "-- fortio --"
  grep -E 'Sockets used|Code |error' "$DIR/tmp/dns-lab/fortio-$MODE.txt" 2>/dev/null
  echo '```'
  k scale deploy/backend-a --replicas=1 >/dev/null
  echo
  echo "## 4. AFTER"
  snap after
  echo
  echo "## 5. 읽는 법: flip(A→B) 직후에도 who=backend-a 지속(세션 유지) →"
  echo " backend-a KILL 시점에 잠깐 code!=200(재연결) → 이후 who=backend-b. liveness가 DNS가 아니라 '커넥션 단절'에 걸려 있음을 보여줌."
}

# Produce the whole markdown report on stdout: apply the chosen ServiceEntry
# variant plus the baseline VirtualService/DestinationRule, run the selected
# mode, then print restore hints. Returns non-zero on an unknown variant/mode.
run_report() {
  echo "# DNS resolution 재현 — variant=$VARIANT mode=$MODE"
  echo
  echo "**Date:** $STAMP · **ns:** $NS · **cluster:** $CTX · backend-a=$IP_A backend-b=$IP_B"
  echo
  echo "> 초안(construction 로그). 기대 vs 실제 판정표가 아니라 '무엇을 어떻게 했고 그때 무엇이 보였나' 기록."
  echo
  echo "## 1. 변형 적용"
  k delete serviceentry gslb-strict gslb-logical --ignore-not-found >/dev/null 2>&1
  case "$VARIANT" in
    strict) k apply -f "$SC/40-serviceentry-strict.yaml" >/dev/null ;;
    logical) k apply -f "$SC/41-serviceentry-logical.yaml" >/dev/null ;;
    *) # Already rejected up front; kept as a defensive fallback.
      echo "unknown variant"
      return 1
      ;;
  esac
  k apply -f "$SC/42-virtualservice-80to443.yaml" >/dev/null
  k apply -f "$SC/43-destinationrule-tls.yaml" >/dev/null
  echo "적용: SE=$VARIANT, VS=80to443(no-retry), DR=tls(no-outlier)"
  sleep 3
  echo

  case "$MODE" in
    mode1) run_mode1 ;;
    mode2) run_mode2 ;;
    mode3) run_mode3 ;;
    *) # Already rejected up front; kept as a defensive fallback.
      echo "unknown mode"
      return 1
      ;;
  esac

  echo
  echo "## 원상복구 힌트"
  echo '```'
  echo "kubectl -n $NS scale deploy/backend-a --replicas=1"
  echo "kubectl -n $NS apply -f $SC/42-virtualservice-80to443.yaml -f $SC/43-destinationrule-tls.yaml"
  echo '```'
}

run_report | tee "$OUT"
# The report stage ran in a subshell; pick up its status explicitly so a
# failure there becomes this script's exit status.
rc=${PIPESTATUS[0]}
echo
echo ">> 리포트 저장: $OUT"
exit "$rc"