fix(ci): kill otelrecv with SIGKILL in cleanup, add timing logs, re-enable OTEL

Three changes:
- cleanup() now sends SIGKILL (kill -9) instead of the default SIGTERM (plain kill), so the
  following wait cannot hang if otelrecv's signal handler stalls
- adds [HH:MM:SS] log lines at key points so CI logs show exactly where time is spent
- restores OTEL env vars (via env VAR=val) since they were confirmed not to cause the hang

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Thomas SharedInbox
2026-05-20 21:00:20 +02:00
co-authored by Claude Sonnet 4.6
parent f187012f58
commit 320fbcabc3
+15 -7
View File
@@ -257,24 +257,25 @@ tasks:
- |
# Scratch files created with mktemp: run_dagger tees the command's output into
# DAGGER_OUT and stores the command's exit status in RC_FILE.
DAGGER_OUT=$(mktemp)
RC_FILE=$(mktemp)
# Run dagger with timeout; capture output for retry/teardown-hang detection.
# _ts: print the current UTC time as a bracketed [HH:MM:SS] stamp (used as a log prefix).
_ts() {
  date -u '+[%H:%M:%S]'
}
# run_dagger CMD [ARGS...]
# Runs CMD under `timeout` with a 600-second limit (and a follow-up KILL 10 seconds
# after the first signal), copying its combined stdout/stderr through `tee` into
# $DAGGER_OUT and writing its exit status to $RC_FILE.
# Returns that status, except that 124 is turned into 0 when the captured output
# contains "All tests passed".
# Globals: DAGGER_OUT, RC_FILE (read); RC (written, not local).
run_dagger() {
# Truncate both scratch files so nothing from an earlier call is left in them.
: > "$DAGGER_OUT"; : > "$RC_FILE"
echo "$(_ts) dagger: start" >&2
# The command's status is written to $RC_FILE from inside the group, which is the
# left-hand side of the pipeline into tee.
{ timeout --kill-after=10 600 "$@"; echo $? > "$RC_FILE"; } 2>&1 | tee "$DAGGER_OUT"
# The `|| echo 1` fallback applies only when cat itself fails.
# NOTE(review): if $RC_FILE exists but is empty, RC ends up empty and the numeric
# test and `return` below would error — confirm that case cannot occur.
RC=$(cat "$RC_FILE" 2>/dev/null || echo 1)
echo "$(_ts) dagger: pipeline done RC=$RC" >&2
# 124 is the status GNU timeout reports when the time limit was hit.
if [ "$RC" -eq 124 ] && grep -q "All tests passed" "$DAGGER_OUT"; then
# NOTE(review): the two echo lines below look like the removed and added versions of
# the same message in this diff view — confirm only one exists in the real file.
echo "Note: dagger hung in teardown after success; treating as exit 0." >&2
echo "$(_ts) dagger: hung in teardown after success; treating as exit 0." >&2
RC=0
fi
return "$RC"
}
# Retry on TCP-level failures; propagate real test failures immediately.
retry_dagger() {
for attempt in 1 2 3; do
run_dagger "$@" && return 0
RC=$?
if [ "$attempt" -lt 3 ] && grep -qE "connection reset|context canceled|connection refused" "$DAGGER_OUT"; then
echo "Network error on attempt $attempt/3, retrying..." >&2
echo "$(_ts) dagger: network error on attempt $attempt/3, retrying..." >&2
else
return "$RC"
fi
@@ -290,16 +291,23 @@ tasks:
# Start ci/otelrecv.py as a background job and remember its PID.
# NOTE(review): presumably the receiver writes its listening port to $PORTFILE
# (the script later waits for that file to become non-empty) — confirm in otelrecv.py.
python3 ci/otelrecv.py --port-file="$PORTFILE" &
RECV_PID=$!
# cleanup: stop the background otelrecv process ($RECV_PID), reap it, and delete
# the port file and the two scratch files. Logs a timestamped line before and after.
cleanup() {
# NOTE(review): this plain `kill` and the `kill -9` two lines down look like the
# removed/added pair from the diff view — confirm which one the real file contains.
kill "$RECV_PID" 2>/dev/null
echo "$(_ts) cleanup: killing otelrecv pid=$RECV_PID" >&2
# SIGKILL cannot be caught by the target; stderr is discarded, so a missing process is not reported.
kill -9 "$RECV_PID" 2>/dev/null
# Reap the background job; any error from wait is silenced.
wait "$RECV_PID" 2>/dev/null
echo "$(_ts) cleanup: done" >&2
rm -f "$PORTFILE" "$DAGGER_OUT" "$RC_FILE"
}
# Run cleanup whenever this shell exits.
trap cleanup EXIT
# Poll every 0.05 s until $PORTFILE exists and is non-empty, then read the port from it.
until [ -s "$PORTFILE" ]; do sleep 0.05; done
PORT=$(cat "$PORTFILE")
# NOTE(review): this span is a flattened diff view. `retry_dagger \` directly below and
# the later `exit $?` look like removed lines shown next to their replacements; taken
# literally, the trailing backslash would join `retry_dagger` with the echo line.
# Confirm against the real file.
retry_dagger \
echo "$(_ts) otelrecv: ready on port $PORT" >&2
# Run dagger through `env` so the two OTEL_* variables are set for that command only,
# pointing the OTLP exporter at the local receiver port over http/protobuf.
retry_dagger env \
OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:$PORT" \
OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf" \
dagger call --progress=plain -q -m ci --source=. check
exit $?
# Save the status first: the echo below would otherwise overwrite $?.
RC=$?
echo "$(_ts) dagger: final RC=$RC" >&2
exit $RC
integration-android:
desc: UI integration tests on a connected Android emulator (Stalwart on host, emulator reaches it via 10.0.2.2)