fix: add timeouts to dagger query, docker info, and portfile loop (#347)
Three unguarded blocking calls caused CI to hang until the 60-min timeout:
- dagger query prune steps had no timeout; `|| true` only swallows a non-zero exit status, it cannot interrupt a call that never returns
- docker info (added in d905cd6) had no timeout, so it blocked indefinitely when the Docker socket was unresponsive
- the `until [ -s "$PORTFILE" ]` wait loop in check-dagger spun forever if otel-receiver.py crashed before writing the port file
Fixes: `timeout 120` on all dagger query prune calls, `timeout 30` on docker info,
and a `kill -0` process-alive guard inside the port-file `until` loop that falls back to a plain `dagger call` run if the receiver has died.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
co-authored by
Claude Sonnet 4.6
parent
968db75c69
commit
ea5d119706
@@ -74,7 +74,7 @@ jobs:
|
||||
|
||||
# Try host Docker socket (DooD) if runner mounts it
|
||||
if [ -S /var/run/docker.sock ]; then
|
||||
if DOCKER_HOST=unix:///var/run/docker.sock docker info >/dev/null 2>&1; then
|
||||
if DOCKER_HOST=unix:///var/run/docker.sock timeout 30 docker info >/dev/null 2>&1; then
|
||||
echo "Docker available via host socket."
|
||||
echo "DOCKER_HOST=unix:///var/run/docker.sock" >> "$GITHUB_ENV"
|
||||
exit 0
|
||||
@@ -92,7 +92,7 @@ jobs:
|
||||
# prune(maxUsedSpace) also reclaims named cache volumes (gradle-cache, go-build-cache, etc.)
|
||||
# when total cache exceeds the limit; without args only unreferenced entries are removed.
|
||||
run: |
|
||||
dagger query '{ engine { localCache { prune(maxUsedSpace: "75gb", targetSpace: "50gb") } } }' || true
|
||||
timeout 120 dagger query '{ engine { localCache { prune(maxUsedSpace: "75gb", targetSpace: "50gb") } } }' || true
|
||||
|
||||
- name: Run Full Check Suite
|
||||
env:
|
||||
@@ -104,7 +104,7 @@ jobs:
|
||||
env:
|
||||
DAGGER_NO_NAG: "1"
|
||||
run: |
|
||||
dagger query '{ engine { localCache { prune(maxUsedSpace: "75gb", targetSpace: "50gb") } } }' || true
|
||||
timeout 120 dagger query '{ engine { localCache { prune(maxUsedSpace: "75gb", targetSpace: "50gb") } } }' || true
|
||||
|
||||
- name: Cleanup TLS credentials
|
||||
if: always()
|
||||
|
||||
+11
-2
@@ -298,7 +298,7 @@ tasks:
|
||||
echo "$(_ts) dagger: network error on attempt $attempt/3, retrying..." >&2
|
||||
elif [ "$attempt" -lt 3 ] && grep -q "No space left on device" "$DAGGER_OUT"; then
|
||||
echo "$(_ts) dagger: disk space error on attempt $attempt/3, pruning Dagger cache..." >&2
|
||||
dagger query '{ engine { localCache { prune(targetSpace: "20gb") } } }' 2>/dev/null || true
|
||||
timeout 120 dagger query '{ engine { localCache { prune(targetSpace: "20gb") } } }' 2>/dev/null || true
|
||||
echo "$(_ts) dagger: waiting 90s for freed space to settle..." >&2
|
||||
sleep 90
|
||||
else
|
||||
@@ -319,7 +319,16 @@ tasks:
|
||||
rm -f "$PORTFILE" "$DAGGER_OUT" "$RC_FILE"
|
||||
}
|
||||
trap cleanup EXIT
|
||||
until [ -s "$PORTFILE" ]; do sleep 0.05; done
|
||||
until [ -s "$PORTFILE" ]; do
|
||||
sleep 0.05
|
||||
if ! kill -0 "$RECV_PID" 2>/dev/null; then
|
||||
echo "$(_ts) otel-receiver.py died before writing port file; falling back to plain run" >&2
|
||||
retry_dagger dagger call --progress=plain -q -m ci --source=. check
|
||||
RC=$?
|
||||
rm -f "$PORTFILE" "$DAGGER_OUT" "$RC_FILE"
|
||||
exit $RC
|
||||
fi
|
||||
done
|
||||
PORT=$(cat "$PORTFILE")
|
||||
retry_dagger env \
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT="http://127.0.0.1:$PORT" \
|
||||
|
||||
@@ -24,7 +24,7 @@ for attempt in $(seq 1 $MAX_PROBE_ATTEMPTS); do
|
||||
fi
|
||||
if [ "$attempt" -eq "$MAX_PROBE_ATTEMPTS" ]; then
|
||||
echo "Warning: No Dagger server responded on $host:$port after $MAX_PROBE_ATTEMPTS attempts"
|
||||
if ! docker info >/dev/null 2>&1; then
|
||||
if ! timeout 30 docker info >/dev/null 2>&1; then
|
||||
echo "Error: Remote Dagger engine is unavailable AND local Docker daemon is not running."
|
||||
echo "Cannot proceed. Ensure either the remote server at $host:$port is accessible"
|
||||
echo "or that Docker is running locally (check: sudo systemctl start docker)."
|
||||
|
||||
Reference in New Issue
Block a user