fix(ci): kill otelrecv with SIGKILL in cleanup, add timing logs, re-enable OTEL
Three changes:
- cleanup() now uses kill -9 (SIGKILL) instead of plain kill (SIGTERM), so that wait cannot hang if otelrecv's signal handler stalls
- adds [HH:MM:SS] log lines at key points so the CI logs show exactly where time is spent
- restores the OTEL env vars (passed via env VAR=val), since they were confirmed not to cause the hang

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
co-authored by
Claude Sonnet 4.6
parent
f187012f58
commit
320fbcabc3
+15
-7
@@ -257,24 +257,25 @@ tasks:
|
||||
- |
|
||||
DAGGER_OUT=$(mktemp)
|
||||
RC_FILE=$(mktemp)
|
||||
# Run dagger with timeout; capture output for retry/teardown-hang detection.
|
||||
# Emit the current UTC wall-clock time formatted as "[HH:MM:SS]",
# intended as a prefix for log lines.
_ts() {
  date -u +'[%H:%M:%S]'
}
|
||||
# Run the given command under a 600-second timeout, echoing its combined
# stdout/stderr while also saving a copy in $DAGGER_OUT.
# Arguments: "$@" - the command line to execute.
# Globals:   DAGGER_OUT, RC_FILE (truncated, then written); RC (set).
# Returns:   the command's exit status, except that a timeout (124) after
#            "All tests passed" appeared in the output is reported as 0.
# NOTE(review): this view is a diff rendered without +/- markers; adjacent
# near-duplicate lines are presumably the removed and the added version.
run_dagger() {
|
||||
: > "$DAGGER_OUT"; : > "$RC_FILE"
|
||||
echo "$(_ts) dagger: start" >&2
|
||||
# The status is written to $RC_FILE because the group is the left side of a
# pipe, so $? after the pipeline would not be the command's own status.
# --kill-after=10 follows up with KILL if the command survives the timeout
# signal for 10 more seconds.
{ timeout --kill-after=10 600 "$@"; echo $? > "$RC_FILE"; } 2>&1 | tee "$DAGGER_OUT"
|
||||
# Fall back to 1 when $RC_FILE cannot be read.
RC=$(cat "$RC_FILE" 2>/dev/null || echo 1)
|
||||
echo "$(_ts) dagger: pipeline done RC=$RC" >&2
|
||||
# 124 is the status timeout(1) reports when the time limit expired.
if [ "$RC" -eq 124 ] && grep -q "All tests passed" "$DAGGER_OUT"; then
|
||||
echo "Note: dagger hung in teardown after success; treating as exit 0." >&2
|
||||
echo "$(_ts) dagger: hung in teardown after success; treating as exit 0." >&2
|
||||
RC=0
|
||||
fi
|
||||
return "$RC"
|
||||
}
|
||||
# Retry on TCP-level failures; propagate real test failures immediately.
|
||||
retry_dagger() {
|
||||
for attempt in 1 2 3; do
|
||||
run_dagger "$@" && return 0
|
||||
RC=$?
|
||||
if [ "$attempt" -lt 3 ] && grep -qE "connection reset|context canceled|connection refused" "$DAGGER_OUT"; then
|
||||
echo "Network error on attempt $attempt/3, retrying..." >&2
|
||||
echo "$(_ts) dagger: network error on attempt $attempt/3, retrying..." >&2
|
||||
else
|
||||
return "$RC"
|
||||
fi
|
||||
@@ -290,16 +291,23 @@ tasks:
|
||||
# Start the OTEL receiver script in the background and remember its PID.
# NOTE(review): presumably otelrecv writes its listening port to $PORTFILE - confirm.
python3 ci/otelrecv.py --port-file="$PORTFILE" &
|
||||
RECV_PID=$!
|
||||
# Stop the process recorded in $RECV_PID and delete the temp files.
# Globals: RECV_PID, PORTFILE, DAGGER_OUT, RC_FILE (read).
# NOTE(review): this view is a diff rendered without +/- markers; the plain
# `kill` and the `kill -9` below are presumably the removed and the added line
# (the commit message describes switching to kill -9).
cleanup() {
|
||||
kill "$RECV_PID" 2>/dev/null
|
||||
echo "$(_ts) cleanup: killing otelrecv pid=$RECV_PID" >&2
|
||||
kill -9 "$RECV_PID" 2>/dev/null
|
||||
# Wait for the process to be reaped; any error output is discarded.
wait "$RECV_PID" 2>/dev/null
|
||||
echo "$(_ts) cleanup: done" >&2
|
||||
rm -f "$PORTFILE" "$DAGGER_OUT" "$RC_FILE"
|
||||
}
|
||||
# Run cleanup whenever the shell exits.
trap cleanup EXIT
|
||||
# Poll every 0.05s until $PORTFILE is non-empty, then read the port from it.
until [ -s "$PORTFILE" ]; do sleep 0.05; done
|
||||
PORT=$(cat "$PORTFILE")
|
||||
# NOTE(review): this view is a diff rendered without +/- markers; the bare
# `retry_dagger \` and `exit $?` lines are presumably the removed side, replaced
# by the `retry_dagger env ...` call and the RC / echo / exit lines.
retry_dagger \
|
||||
echo "$(_ts) otelrecv: ready on port $PORT" >&2
|
||||
# Invoke dagger through env so the OTEL exporter variables are set for that
# command only.
retry_dagger env \
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:$PORT" \
|
||||
OTEL_EXPORTER_OTLP_PROTOCOL="http/protobuf" \
|
||||
dagger call --progress=plain -q -m ci --source=. check
|
||||
exit $?
|
||||
# Capture the status first; the echo below would otherwise overwrite $?.
RC=$?
|
||||
echo "$(_ts) dagger: final RC=$RC" >&2
|
||||
exit $RC
|
||||
|
||||
integration-android:
|
||||
desc: UI integration tests on a connected Android emulator (Stalwart on host, emulator reaches it via 10.0.2.2)
|
||||
|
||||
Reference in New Issue
Block a user