fix: add timeouts to dagger query, docker info, and portfile loop (#347)

Three unguarded blocking calls caused CI to hang until the 60-min timeout:
- dagger query prune steps had no timeout; || true only catches errors, not hangs
- docker info (added in d905cd6) had no timeout if Docker socket is unresponsive
- until portfile loop in check-dagger spun forever if otel-receiver.py crashed

Fixes: timeout 120 on all dagger query prune calls, timeout 30 on docker info,
and a kill -0 process-alive guard on the portfile until loop with fallback.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Thomas SharedInbox
2026-06-01 21:43:07 +02:00
co-authored by Claude Sonnet 4.6
parent 968db75c69
commit ea5d119706
3 changed files with 15 additions and 6 deletions
+3 -3
View File
@@ -74,7 +74,7 @@ jobs:
# Try host Docker socket (DooD) if runner mounts it
if [ -S /var/run/docker.sock ]; then
if DOCKER_HOST=unix:///var/run/docker.sock docker info >/dev/null 2>&1; then
if DOCKER_HOST=unix:///var/run/docker.sock timeout 30 docker info >/dev/null 2>&1; then
echo "Docker available via host socket."
echo "DOCKER_HOST=unix:///var/run/docker.sock" >> "$GITHUB_ENV"
exit 0
@@ -92,7 +92,7 @@ jobs:
# prune(maxUsedSpace) also reclaims named cache volumes (gradle-cache, go-build-cache, etc.)
# when total cache exceeds the limit; without args only unreferenced entries are removed.
run: |
dagger query '{ engine { localCache { prune(maxUsedSpace: "75gb", targetSpace: "50gb") } } }' || true
timeout 120 dagger query '{ engine { localCache { prune(maxUsedSpace: "75gb", targetSpace: "50gb") } } }' || true
- name: Run Full Check Suite
env:
@@ -104,7 +104,7 @@ jobs:
env:
DAGGER_NO_NAG: "1"
run: |
dagger query '{ engine { localCache { prune(maxUsedSpace: "75gb", targetSpace: "50gb") } } }' || true
timeout 120 dagger query '{ engine { localCache { prune(maxUsedSpace: "75gb", targetSpace: "50gb") } } }' || true
- name: Cleanup TLS credentials
if: always()
+11 -2
View File
@@ -298,7 +298,7 @@ tasks:
echo "$(_ts) dagger: network error on attempt $attempt/3, retrying..." >&2
elif [ "$attempt" -lt 3 ] && grep -q "No space left on device" "$DAGGER_OUT"; then
echo "$(_ts) dagger: disk space error on attempt $attempt/3, pruning Dagger cache..." >&2
dagger query '{ engine { localCache { prune(targetSpace: "20gb") } } }' 2>/dev/null || true
timeout 120 dagger query '{ engine { localCache { prune(targetSpace: "20gb") } } }' 2>/dev/null || true
echo "$(_ts) dagger: waiting 90s for freed space to settle..." >&2
sleep 90
else
@@ -319,7 +319,16 @@ tasks:
rm -f "$PORTFILE" "$DAGGER_OUT" "$RC_FILE"
}
trap cleanup EXIT
until [ -s "$PORTFILE" ]; do sleep 0.05; done
until [ -s "$PORTFILE" ]; do
sleep 0.05
if ! kill -0 "$RECV_PID" 2>/dev/null; then
echo "$(_ts) otel-receiver.py died before writing port file; falling back to plain run" >&2
retry_dagger dagger call --progress=plain -q -m ci --source=. check
RC=$?
rm -f "$PORTFILE" "$DAGGER_OUT" "$RC_FILE"
exit $RC
fi
done
PORT=$(cat "$PORTFILE")
retry_dagger env \
OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:$PORT" \
+1 -1
View File
@@ -24,7 +24,7 @@ for attempt in $(seq 1 $MAX_PROBE_ATTEMPTS); do
fi
if [ "$attempt" -eq "$MAX_PROBE_ATTEMPTS" ]; then
echo "Warning: No Dagger server responded on $host:$port after $MAX_PROBE_ATTEMPTS attempts"
if ! docker info >/dev/null 2>&1; then
if ! timeout 30 docker info >/dev/null 2>&1; then
echo "Error: Remote Dagger engine is unavailable AND local Docker daemon is not running."
echo "Cannot proceed. Ensure either the remote server at $host:$port is accessible"
echo "or that Docker is running locally (check: sudo systemctl start docker)."