From 1681fb920274b3607f5ec2884b2ff083406d0e72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bot=20of=20Thomas=20G=C3=BCttler?= Date: Wed, 3 Jun 2026 13:07:37 +0200 Subject: [PATCH] =?UTF-8?q?fix:=20fail=20fast=20in=20CI=20=E2=80=94=20para?= =?UTF-8?q?llel=20hygiene/layer=20checks,=20no=20spurious=20retries=20(#35?= =?UTF-8?q?0)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Closes #349 Two bugs prevented `check-dagger` from failing fast when checks failed: - **Hygiene + Layers checked sequentially** — they are cheap structural checks with no dependency on each other. Running them in parallel (`errgroup.Group`) means failures are reported sooner. - **Spurious retries from `errgroup.WithContext`** — the backend and integration tests previously shared a derived context via `errgroup.WithContext`. When one test failed, the context was cancelled, causing the sibling test to emit `"context canceled"` in Dagger's `--progress=plain` output. The `retry_dagger` function in `Taskfile.yml` matched that string as a transient network error and re-ran the entire pipeline up to 3 times — a real test failure could take 30+ minutes to be reported instead of ~10. **Fix in `ci/main.go`:** - Hygiene + layers now run in parallel with `errgroup.Group` - Backend + integration tests now use `errgroup.Group` (no shared cancel context), so a failure in one does not emit `"context canceled"` for the other **Fix in `Taskfile.yml`:** - Removed `context canceled` from the `retry_dagger` grep pattern; the remaining patterns (`connection reset`, `context deadline exceeded`, `connection refused`, `invalid return status code`) still cover genuine network/engine transients ## Test plan - [ ] Confirm the Forgejo CI run completes and, when a check fails, it fails fast (no 3× retry loop in logs) - [ ] Verify `task check-dagger` still retries on actual connection errors 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: Thomas SharedInbox Co-authored-by: guettli Reviewed-on: https://codeberg.org/guettli/sharedinbox/pulls/350 --- Taskfile.yml | 2 +- ci/main.go | 26 ++++++++++++++++++-------- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index 885e433..06c9718 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -294,7 +294,7 @@ tasks: for attempt in 1 2 3; do run_dagger "$@" && return 0 RC=$? - if [ "$attempt" -lt 3 ] && { grep -qE "connection reset|context canceled|context deadline exceeded|connection refused|invalid return status code" "$DAGGER_OUT" || [ "$RC" -eq 2 ]; }; then + if [ "$attempt" -lt 3 ] && { grep -qE "connection reset|context deadline exceeded|connection refused|invalid return status code" "$DAGGER_OUT" || [ "$RC" -eq 2 ]; }; then echo "$(_ts) dagger: network error on attempt $attempt/3, retrying..." >&2 elif [ "$attempt" -lt 3 ] && grep -q "No space left on device" "$DAGGER_OUT"; then echo "$(_ts) dagger: disk space error on attempt $attempt/3, pruning Dagger cache..." >&2 diff --git a/ci/main.go b/ci/main.go index ed10fa9..934e261 100644 --- a/ci/main.go +++ b/ci/main.go @@ -480,11 +480,18 @@ func (m *Ci) Check(ctx context.Context) (string, error) { ctx, cancel := context.WithTimeout(ctx, 30*time.Minute) defer cancel() - if _, err := m.CheckHygiene(ctx); err != nil { - return "Hygiene check failed", err - } - if _, err := m.CheckLayers(ctx); err != nil { - return "Layer check failed", err + // Run cheap structural checks in parallel for faster fail detection. + var fastEg errgroup.Group + fastEg.Go(func() error { + _, err := m.CheckHygiene(ctx) + return err + }) + fastEg.Go(func() error { + _, err := m.CheckLayers(ctx) + return err + }) + if err := fastEg.Wait(); err != nil { + return "", err } checkSetup := m.setup(m.checkSrc()) @@ -508,16 +515,19 @@ func (m *Ci) Check(ctx context.Context) (string, error) { return coverage, err } + // Use errgroup.Group (not WithContext) so a failing test does not cancel its + // sibling via context — which would surface as "context canceled" in dagger + // output and trigger spurious retries in check-dagger. var testBackend, testIntegration string - eg, egCtx := errgroup.WithContext(ctx) + var eg errgroup.Group eg.Go(func() error { var e error - testBackend, e = m.TestBackend(egCtx) + testBackend, e = m.TestBackend(ctx) return e }) eg.Go(func() error { var e error - testIntegration, e = m.TestIntegration(egCtx) + testIntegration, e = m.TestIntegration(ctx) return e }) if err := eg.Wait(); err != nil {