fix: fail fast in CI — parallel hygiene/layer checks, no spurious retries (#350)
## Summary

Closes #349

Two bugs prevented `check-dagger` from failing fast when checks failed:

- **Hygiene + Layers were checked sequentially** — they are cheap structural checks with no dependency on each other. Running them in parallel (`errgroup.Group`) means failures are reported sooner.
- **Spurious retries from `errgroup.WithContext`** — the backend and integration tests previously shared a derived context via `errgroup.WithContext`. When one test failed, the context was cancelled, causing the sibling test to emit `"context canceled"` in Dagger's `--progress=plain` output. The `retry_dagger` function in `Taskfile.yml` matched that string as a transient network error and re-ran the entire pipeline up to 3 times — a real test failure could take 30+ minutes to be reported instead of ~10.

**Fix in `ci/main.go`:**

- Hygiene + layers now run in parallel with `errgroup.Group`
- Backend + integration tests now use `errgroup.Group` (no shared cancel context), so a failure in one does not emit `"context canceled"` for the other

**Fix in `Taskfile.yml`:**

- Removed `context canceled` from the `retry_dagger` grep pattern; the remaining patterns (`connection reset`, `context deadline exceeded`, `connection refused`, `invalid return status code`) still cover genuine network/engine transients

## Test plan

- [ ] Confirm the Forgejo CI run completes and, when a check fails, it fails fast (no 3× retry loop in logs)
- [ ] Verify `task check-dagger` still retries on actual connection errors

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-authored-by: Thomas SharedInbox <sharedinbox@thomas-guettler.de>
Co-authored-by: guettli <guettli@noreply.codeberg.org>
Reviewed-on: https://codeberg.org/guettli/sharedinbox/pulls/350
This commit was merged in pull request #350.
This commit is contained in:
committed by
guettli
co-authored by
guettli
Thomas SharedInbox
parent
d7a9c2b4f8
commit
1681fb9202
+1
-1
@@ -294,7 +294,7 @@ tasks:
|
||||
for attempt in 1 2 3; do
|
||||
run_dagger "$@" && return 0
|
||||
RC=$?
|
||||
if [ "$attempt" -lt 3 ] && { grep -qE "connection reset|context canceled|context deadline exceeded|connection refused|invalid return status code" "$DAGGER_OUT" || [ "$RC" -eq 2 ]; }; then
|
||||
if [ "$attempt" -lt 3 ] && { grep -qE "connection reset|context deadline exceeded|connection refused|invalid return status code" "$DAGGER_OUT" || [ "$RC" -eq 2 ]; }; then
|
||||
echo "$(_ts) dagger: network error on attempt $attempt/3, retrying..." >&2
|
||||
elif [ "$attempt" -lt 3 ] && grep -q "No space left on device" "$DAGGER_OUT"; then
|
||||
echo "$(_ts) dagger: disk space error on attempt $attempt/3, pruning Dagger cache..." >&2
|
||||
|
||||
+18
-8
@@ -480,11 +480,18 @@ func (m *Ci) Check(ctx context.Context) (string, error) {
|
||||
ctx, cancel := context.WithTimeout(ctx, 30*time.Minute)
|
||||
defer cancel()
|
||||
|
||||
if _, err := m.CheckHygiene(ctx); err != nil {
|
||||
return "Hygiene check failed", err
|
||||
}
|
||||
if _, err := m.CheckLayers(ctx); err != nil {
|
||||
return "Layer check failed", err
|
||||
// Run cheap structural checks in parallel for faster fail detection.
|
||||
var fastEg errgroup.Group
|
||||
fastEg.Go(func() error {
|
||||
_, err := m.CheckHygiene(ctx)
|
||||
return err
|
||||
})
|
||||
fastEg.Go(func() error {
|
||||
_, err := m.CheckLayers(ctx)
|
||||
return err
|
||||
})
|
||||
if err := fastEg.Wait(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
checkSetup := m.setup(m.checkSrc())
|
||||
@@ -508,16 +515,19 @@ func (m *Ci) Check(ctx context.Context) (string, error) {
|
||||
return coverage, err
|
||||
}
|
||||
|
||||
// Use errgroup.Group (not WithContext) so a failing test does not cancel its
|
||||
// sibling via context — which would surface as "context canceled" in dagger
|
||||
// output and trigger spurious retries in check-dagger.
|
||||
var testBackend, testIntegration string
|
||||
eg, egCtx := errgroup.WithContext(ctx)
|
||||
var eg errgroup.Group
|
||||
eg.Go(func() error {
|
||||
var e error
|
||||
testBackend, e = m.TestBackend(egCtx)
|
||||
testBackend, e = m.TestBackend(ctx)
|
||||
return e
|
||||
})
|
||||
eg.Go(func() error {
|
||||
var e error
|
||||
testIntegration, e = m.TestIntegration(egCtx)
|
||||
testIntegration, e = m.TestIntegration(ctx)
|
||||
return e
|
||||
})
|
||||
if err := eg.Wait(); err != nil {
|
||||
|
||||
Reference in New Issue
Block a user