From 13b671cc9fef832b2010bfc641d38030ee2b723b Mon Sep 17 00:00:00 2001 From: m1ngsama Date: Tue, 26 May 2026 14:20:07 +0800 Subject: [PATCH] Add slow-client backpressure regression --- Makefile | 6 +- README.md | 10 ++ docs/CHANGELOG.md | 5 + docs/CICD.md | 3 + docs/QUICKREF.md | 3 + docs/ROADMAP.md | 6 +- scripts/release_check.sh | 8 ++ tests/test_slow_client.sh | 223 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 259 insertions(+), 5 deletions(-) create mode 100755 tests/test_slow_client.sh diff --git a/Makefile b/Makefile index 52f4a88..db978bc 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ MANDIR ?= $(PREFIX)/share/man SYSTEMD_UNIT_DIR ?= $(PREFIX)/lib/systemd/system CI_TEST_PORT ?= $(if $(PORT),$(PORT),2222) -.PHONY: all clean install install-systemd uninstall uninstall-systemd debug release release-check release-check-strict asan valgrind check test test-advisory ci-test unit-test integration-test anonymous-access-test connection-limit-test security-test stress-test soak-test user-lifecycle-test info +.PHONY: all clean install install-systemd uninstall uninstall-systemd debug release release-check release-check-strict asan valgrind check test test-advisory ci-test unit-test integration-test anonymous-access-test connection-limit-test security-test stress-test soak-test slow-client-test user-lifecycle-test info all: $(TARGETS) @@ -148,6 +148,10 @@ soak-test: all @echo "Running soak tests..." @cd tests && PORT=$${PORT:-2222} ./test_soak.sh $${DURATION:-8} $${RECONNECTS:-5} +slow-client-test: all + @echo "Running slow-client tests..." + @cd tests && PORT=$${PORT:-2222} ./test_slow_client.sh $${DURATION:-8} $${BURST_CHARS:-1600} + user-lifecycle-test: all @echo "Running user lifecycle tests..." 
@cd tests && PORT=$${PORT:-2222} ./test_user_lifecycle.sh diff --git a/README.md b/README.md index e6a1cc3..8779303 100644 --- a/README.md +++ b/README.md @@ -238,6 +238,7 @@ make connection-limit-test # verify per-IP concurrency and rate limits make security-test # run security feature checks make stress-test # run configurable concurrent-client stress test make soak-test # run idle/reconnect/control-plane soak test +make slow-client-test # run slow interactive-client backpressure test make user-lifecycle-test # run a two-user TUI lifecycle test make ci-test # run the same checks as GitHub Actions @@ -249,6 +250,7 @@ cd tests ./test_connection_limits.sh # per-IP concurrency and rate limits ./test_stress.sh # stress test ./test_soak.sh # soak test +./test_slow_client.sh # slow-client backpressure ./test_user_lifecycle.sh # two-user TUI lifecycle ``` @@ -257,6 +259,8 @@ cd tests - Anonymous access: 2 tests - Security features: 12 tests - Stress test: configurable concurrent clients (`CLIENTS=20 DURATION=60 make stress-test`) +- Slow-client test: an unread interactive SSH client cannot block health, + stats, post, tail, or server survival checks ### Dependencies @@ -361,6 +365,12 @@ Before preparing a release locally: make release-check ``` +Longer local preflight can opt into runtime soak and slow-client coverage: + +```sh +RUN_SOAK=1 RUN_SLOW_CLIENT=1 make release-check +``` + Before publishing package recipes, replace placeholder checksums and run: ```sh diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 586430b..5033f81 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -19,6 +19,9 @@ - Added a VHS tape draft for recording the core TNT terminal-chat experience. - Added live `:inbox` refresh behavior: `r` refreshes the inbox manually, and an open inbox refreshes when a new private message arrives. 
+- Added `make slow-client-test`, an opt-in regression for an unread + interactive SSH client under backpressure while health, stats, post, tail, + and server survival stay responsive. ### Changed - `make install-systemd` now rewrites the installed unit's `ExecStart` to match @@ -51,6 +54,8 @@ direct slow-reader blocking path. - `make release-check` can now run the soak test with `RUN_SOAK=1`, keeping longer runtime checks opt-in for local release validation. +- `make release-check` can also run the slow-client backpressure test with + `RUN_SLOW_CLIENT=1`. - Room capacity and mention notification bookkeeping now follow `TNT_MAX_CONNECTIONS` instead of a hidden fixed 64-client array limit. - Updated the roadmap to reflect completed `tntctl`, stable exec contract, and diff --git a/docs/CICD.md b/docs/CICD.md index 96d05c6..fa62eab 100644 --- a/docs/CICD.md +++ b/docs/CICD.md @@ -41,6 +41,9 @@ Release policy: 2. Run the local preflight: make release-check + For a longer local runtime gate before publishing or production rollout: + RUN_SOAK=1 RUN_SLOW_CLIENT=1 make release-check + 3. Commit the release changes and create a local tag. Do not push the tag until strict checks pass: git tag v1.0.1 diff --git a/docs/QUICKREF.md b/docs/QUICKREF.md index 0f19bd0..efcb65d 100644 --- a/docs/QUICKREF.md +++ b/docs/QUICKREF.md @@ -15,6 +15,9 @@ TEST make connection-limit-test per-IP concurrency/rate-limit checks make security-test security feature checks make stress-test concurrent-client stress test + make soak-test idle/reconnect/control-plane soak test + make slow-client-test slow interactive-client backpressure test + make user-lifecycle-test two-user TUI lifecycle test make ci-test same checks as GitHub Actions DEBUG diff --git a/docs/ROADMAP.md b/docs/ROADMAP.md index dcc7fd9..2c85a04 100644 --- a/docs/ROADMAP.md +++ b/docs/ROADMAP.md @@ -94,7 +94,7 @@ Goal: make regressions harder to introduce. 
- add sanitizer jobs and targeted fuzzing for UTF-8, log parsing, and command parsing - ✅ add a configurable soak test for idle sessions, reconnects, and control interface availability -- add deeper slow-client soak coverage with a deliberately backpressured SSH +- ✅ add deeper slow-client coverage with a deliberately backpressured SSH client - keep deployment and test docs aligned with actual runtime behavior - require every user-visible interface change to update docs and tests in the same change set @@ -106,7 +106,5 @@ These are the next changes that should happen before new feature work expands th 1. Decide the daemon naming path: keep `tnt` as the server binary for 1.x, or introduce `tntd` later with a compatibility plan. 2. Finish untangling client-state ownership into a clearer release path. -3. Add deeper slow-client soak coverage with a deliberately backpressured SSH - client. -4. Replace remaining release placeholders with real maintainer metadata and +3. Replace remaining release placeholders with real maintainer metadata and source-archive checksums when cutting a public package release. 
diff --git a/scripts/release_check.sh b/scripts/release_check.sh index a35d7fe..a80c1ad 100755 --- a/scripts/release_check.sh +++ b/scripts/release_check.sh @@ -21,6 +21,7 @@ Default checks: Environment: RUN_INTEGRATION=1 also run full make test RUN_SOAK=1 also run the configurable soak test + RUN_SLOW_CLIENT=1 also run the slow-client backpressure test PORT=12720 base port for integration tests Strict checks additionally require a clean tree, a vX.Y.Z tag at HEAD, a @@ -123,6 +124,13 @@ if [ "${RUN_SOAK:-0}" = "1" ]; then DURATION="${SOAK_DURATION:-8}" RECONNECTS="${SOAK_RECONNECTS:-5}" fi +if [ "${RUN_SLOW_CLIENT:-0}" = "1" ]; then + step "running slow-client test" + make slow-client-test PORT="$((${PORT:-12720} + 40))" \ + DURATION="${SLOW_CLIENT_DURATION:-8}" \ + BURST_CHARS="${SLOW_CLIENT_BURST_CHARS:-1600}" +fi + tmpdir=$(mktemp -d "${TMPDIR:-/tmp}/tnt-release-check.XXXXXX") cleanup() { rm -rf "$tmpdir" diff --git a/tests/test_slow_client.sh b/tests/test_slow_client.sh new file mode 100755 index 0000000..7cce7e4 --- /dev/null +++ b/tests/test_slow_client.sh @@ -0,0 +1,223 @@ +#!/bin/sh +# Slow interactive-client regression test for TNT. 
+# Usage: ./test_slow_client.sh [hold_seconds] [burst_chars]
+
+PORT=${PORT:-2222}
+HOLD_SECONDS=${1:-8}
+BURST_CHARS=${2:-1600}
+BIN="../tnt"
+PASS=0
+FAIL=0
+STATE_DIR=$(mktemp -d "${TMPDIR:-/tmp}/tnt-slow-client-test.XXXXXX") || exit 1
+SERVER_PID=""
+SLOW_PID=""
+
+# EXIT trap: stop the slow client and the server, then delete the state dir.
+cleanup() {
+    if [ -n "$SLOW_PID" ]; then
+        kill "$SLOW_PID" 2>/dev/null || true
+        wait "$SLOW_PID" 2>/dev/null || true
+    fi
+    { exec 3>&-; } 2>/dev/null || true  # grouped: mute stderr for this close only
+    if [ -n "$SERVER_PID" ]; then
+        kill "$SERVER_PID" 2>/dev/null || true
+        wait "$SERVER_PID" 2>/dev/null || true
+    fi
+    rm -rf "$STATE_DIR"
+}
+
+trap cleanup EXIT
+
+case "$HOLD_SECONDS" in
+    ''|*[!0-9]*)
+        echo "Error: hold_seconds must be a positive integer"
+        exit 2
+        ;;
+esac
+
+case "$BURST_CHARS" in
+    ''|*[!0-9]*)
+        echo "Error: burst_chars must be a positive integer"
+        exit 2
+        ;;
+esac
+
+if [ "$HOLD_SECONDS" -lt 1 ] || [ "$BURST_CHARS" -lt 1 ]; then
+    echo "Error: hold_seconds and burst_chars must be positive"
+    exit 2
+fi
+
+if [ ! -f "$BIN" ]; then
+    echo "Error: Binary $BIN not found. Run make first."
+    exit 1
+fi
+
+SSH_EXEC_OPTS="-n -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o BatchMode=yes -o LogLevel=ERROR -o ConnectTimeout=5 -p $PORT"
+SSH_TTY_OPTS="-e none -tt -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o LogLevel=ERROR -o ConnectTimeout=5 -p $PORT"
+
+# run_ssh_timeout SECONDS OUTFILE SSH_ARGS...
+# Run a non-interactive ssh probe with stdout and stderr captured in OUTFILE.
+# Returns the probe's own exit status, or 124 if it still ran after SECONDS.
+run_ssh_timeout() {
+    seconds=$1
+    outfile=$2
+    shift 2
+    # SSH_EXEC_OPTS is an option list, so it is expanded unquoted on purpose.
+    ssh $SSH_EXEC_OPTS "$@" >"$outfile" 2>&1 &
+    cmd_pid=$!
+    elapsed=0
+    # Poll once per second; only a probe that is still alive can time out.
+    while kill -0 "$cmd_pid" 2>/dev/null; do
+        if [ "$elapsed" -ge "$seconds" ]; then
+            kill "$cmd_pid" 2>/dev/null || true
+            wait "$cmd_pid" 2>/dev/null || true
+            return 124
+        fi
+        sleep 1
+        elapsed=$((elapsed + 1))
+    done
+    # The probe ended on its own, even if in the final second: keep its status.
+    wait "$cmd_pid"
+}
+
+# Retry "health" up to 15 times, 1 s apart, until it answers exactly "ok".
+wait_for_health() {
+    for _ in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do
+        if [ -n "$SERVER_PID" ] && ! kill -0 "$SERVER_PID" 2>/dev/null; then
+            return 1
+        fi
+        [ "$(ssh $SSH_EXEC_OPTS localhost health 2>/dev/null)" = "ok" ] && return 0
+        sleep 1
+    done
+    return 1
+}
+
+# Retry "users --json" up to 15 times until its output contains "slow".
+wait_for_slow_user() {
+    for _ in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15; do
+        if [ -n "$SERVER_PID" ] && ! kill -0 "$SERVER_PID" 2>/dev/null; then
+            return 1
+        fi
+        ssh $SSH_EXEC_OPTS localhost users --json 2>/dev/null | grep -q '"slow"' && return 0
+        sleep 1
+    done
+    return 1
+}
+
+echo "=== TNT Slow Client Test ==="
+echo "hold=${HOLD_SECONDS}s burst_chars=$BURST_CHARS port=$PORT"
+
+TNT_LANG=en "$BIN" \
+    --bind 127.0.0.1 \
+    --public-host slow.local \
+    --max-connections 32 \
+    --max-conn-per-ip 32 \
+    --max-conn-rate-per-ip 64 \
+    --rate-limit 0 \
+    --idle-timeout 0 \
+    --ssh-log-level 1 \
+    -p "$PORT" \
+    -d "$STATE_DIR" >"$STATE_DIR/server.log" 2>&1 &
+SERVER_PID=$!
+
+if wait_for_health; then
+    echo "✓ server started"
+    PASS=$((PASS + 1))
+else
+    echo "✗ server failed to start"
+    sed -n '1,160p' "$STATE_DIR/server.log"
+    exit 1
+fi
+
+# fd 3 keeps the FIFO open but is never read, so the client's output backs up.
+SLOW_FIFO="$STATE_DIR/slow.out"
+mkfifo "$SLOW_FIFO" || exit 1
+exec 3<>"$SLOW_FIFO"
+
+(
+    printf 'slow\n'
+    sleep 2
+    i=0
+    while [ "$i" -lt "$BURST_CHARS" ]; do
+        printf 'x'
+        i=$((i + 1))
+    done
+    sleep "$HOLD_SECONDS"
+) | ssh $SSH_TTY_OPTS slow@127.0.0.1 >"$SLOW_FIFO" 2>"$STATE_DIR/slow.err" &
+SLOW_PID=$!  # PID of ssh, the last stage of the pipeline
+
+# pass MSG / fail MSG: print a ✓ or ✗ result line and bump PASS or FAIL.
+pass() {
+    echo "✓ $1"
+    PASS=$((PASS + 1))
+}
+fail() {
+    echo "✗ $1"
+    FAIL=$((FAIL + 1))
+}
+
+if wait_for_slow_user; then
+    pass "deliberately unread interactive client reached chat"
+else
+    fail "slow client did not reach chat"
+    sed -n '1,120p' "$STATE_DIR/slow.err"
+fi
+
+sleep 3
+
+if run_ssh_timeout 5 "$STATE_DIR/health.out" localhost health &&
+    grep -qx 'ok' "$STATE_DIR/health.out"; then
+    pass "health stayed responsive while slow client was pressured"
+else
+    fail "health blocked or returned unexpected output"
+    cat "$STATE_DIR/health.out" 2>/dev/null || true
+fi
+
+if run_ssh_timeout 5 "$STATE_DIR/stats.out" localhost stats --json &&
+    grep -q '"status":"ok"' "$STATE_DIR/stats.out"; then
+    pass "stats stayed responsive while slow client was pressured"
+else
+    fail "stats blocked or returned unexpected output"
+    cat "$STATE_DIR/stats.out" 2>/dev/null || true
+fi
+
+FLOOD_FAIL=0  # becomes 1 when one of the eight zero-padded posts below fails
+for i in 1 2 3 4 5 6 7 8; do
+    msg=$(printf 'slow-client responsive post %02d %0900d' "$i" 0)
+    if ! run_ssh_timeout 5 "$STATE_DIR/post-$i.out" probe@localhost post "$msg" ||
+        ! grep -qx 'posted' "$STATE_DIR/post-$i.out"; then
+        fail "post blocked or failed during slow-client pressure at $i/8"
+        cat "$STATE_DIR/post-$i.out" 2>/dev/null || true
+        FLOOD_FAIL=1
+        break
+    fi
+done
+
+[ "$FLOOD_FAIL" -ne 0 ] || pass "post path stayed responsive during slow-client pressure"
+
+if run_ssh_timeout 5 "$STATE_DIR/tail.out" localhost "tail -n 5" &&
+    grep -q 'slow-client responsive post 08' "$STATE_DIR/tail.out"; then
+    pass "tail sees messages posted during slow-client pressure"
+else
+    fail "tail missing slow-client pressure messages"
+    cat "$STATE_DIR/tail.out" 2>/dev/null || true
+fi
+
+# Advisory, not counted: a short hold can end before the probes above do.
+if ! run_ssh_timeout 5 "$STATE_DIR/users.out" localhost users --json ||
+    ! grep -q '"slow"' "$STATE_DIR/users.out"; then
+    echo "⚠ slow client no longer listed after the probes; raise hold_seconds"
+fi
+
+if kill -0 "$SERVER_PID" 2>/dev/null; then
+    pass "server survived slow-client pressure"
+else
+    fail "server exited during slow-client pressure"
+    sed -n '1,160p' "$STATE_DIR/server.log"
+fi
+
+echo ""
+echo "PASSED: $PASS"
+echo "FAILED: $FAIL"
+[ "$FAIL" -eq 0 ] && echo "All tests passed" || echo "Some tests failed"
+exit "$FAIL"