diff --git a/.forgejo/workflows/monitor.yml b/.forgejo/workflows/monitor.yml
new file mode 100644
index 0000000..c1205e1
--- /dev/null
+++ b/.forgejo/workflows/monitor.yml
@@ -0,0 +1,21 @@
+name: Monitor Agent Loop
+
+on:
+  schedule:
+    - cron: '0 */2 * * *'  # every 2 hours
+  workflow_dispatch:
+
+jobs:
+  monitor:
+    name: Check Agent Loop Health
+    runs-on: ubuntu-latest
+    timeout-minutes: 5
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # NOTE(review): the monitor reads the heartbeat from the home directory
+      # of the user running this job; confirm the runner shares that home
+      # with the host that runs the agent loop, otherwise it is always missing.
+      - name: Check agent loop heartbeat
+        run: python3 scripts/agent_loop.py monitor
diff --git a/scripts/agent_loop.py b/scripts/agent_loop.py
index 8d201b5..ba0bce1 100755
--- a/scripts/agent_loop.py
+++ b/scripts/agent_loop.py
@@ -57,7 +57,9 @@ os.environ["PATH"] = (
 REPO = "guettli/sharedinbox"
 REPO_URL = f"https://codeberg.org/{REPO}"
 STATE_FILE = Path.home() / ".sharedinbox-agent-state.json"
+HEARTBEAT_FILE = Path.home() / ".sharedinbox-agent-heartbeat"
 MAX_AGENT_AGE_SECONDS = 3600  # 1 hour
+MAX_HEARTBEAT_AGE_SECONDS = 7200  # 2 hours
 CLAUDE_PROJECTS_DIR = Path.home() / ".claude" / "projects" / (
     "-" + str(Path.home())[1:].replace("/", "-")
 )
@@ -309,6 +311,12 @@ def _clear_state() -> None:
     STATE_FILE.unlink(missing_ok=True)
 
 
+def _update_heartbeat() -> None:
+    """Record that the agent loop ran right now."""
+    HEARTBEAT_FILE.write_text(datetime.now(timezone.utc).isoformat())
+    HEARTBEAT_FILE.chmod(0o600)
+
+
 def _find_session_uuid(session_name: str) -> str | None:
     """Return the Claude session UUID for *session_name*, or None if not found.
 
@@ -478,12 +486,55 @@ def cmd_list() -> int:
     return 0
 
 
+# ── monitor subcommand ────────────────────────────────────────────────────────
+
+
+def cmd_monitor() -> int:
+    """Check that the agent loop has run within the last 2 hours.
+
+    Reads the ISO 8601 timestamp stored in HEARTBEAT_FILE and compares its
+    age with MAX_HEARTBEAT_AGE_SECONDS.
+
+    Returns 0 if healthy, 1 if the heartbeat is missing, corrupted or stale.
+    Intended to be called from a scheduled CI job or cron every 2 hours.
+    """
+    # EAFP: read directly instead of exists() + read, so a file removed in
+    # between cannot raise an unhandled FileNotFoundError.
+    try:
+        raw = HEARTBEAT_FILE.read_text()
+    except FileNotFoundError:
+        print(
+            f"WARNING: Agent loop heartbeat file missing — "
+            f"the loop may not have run yet or the file was deleted ({HEARTBEAT_FILE})."
+        )
+        return 1
+    try:
+        last_run = datetime.fromisoformat(raw.strip())
+    except ValueError:
+        last_run = None
+    # A timestamp without UTC offset parses fine but cannot be subtracted
+    # from an aware "now" (TypeError), so treat it like unparsable content.
+    if last_run is None or last_run.utcoffset() is None:
+        print(f"WARNING: Agent loop heartbeat file is corrupted: {HEARTBEAT_FILE}")
+        return 1
+    age = (datetime.now(timezone.utc) - last_run).total_seconds()
+    if age > MAX_HEARTBEAT_AGE_SECONDS:
+        print(
+            f"WARNING: Agent loop last ran {age / 3600:.1f}h ago "
+            f"(limit: {MAX_HEARTBEAT_AGE_SECONDS // 3600}h) — the loop may be stalled."
+        )
+        return 1
+    print(f"Agent loop is healthy. Last run: {age / 60:.0f} min ago.")
+    return 0
+
+
 # ── main flow ─────────────────────────────────────────────────────────────────
 
 
 def _run_loop() -> int:
     now = datetime.now(timezone.utc)
     print(f"---------------------- Starting {now.strftime('%Y-%m-%d %H:%MZ')}")
+    _update_heartbeat()
 
     state = _read_state()
 
@@ -884,10 +935,13 @@ def main() -> int:
     parser = argparse.ArgumentParser(prog="agent_loop")
     sub = parser.add_subparsers(dest="cmd")
     sub.add_parser("list", help="List recent agent sessions")
+    sub.add_parser("monitor", help="Check that the loop ran within the last 2 hours")
     args = parser.parse_args()
 
     if args.cmd == "list":
         return cmd_list()
+    if args.cmd == "monitor":
+        return cmd_monitor()
     return _run_loop()
 
 
diff --git a/scripts/test_agent_loop.py b/scripts/test_agent_loop.py
index 4750cbd..57e1ef7 100644
--- a/scripts/test_agent_loop.py
+++ b/scripts/test_agent_loop.py
@@ -6,6 +6,7 @@ import json
 import os
 import tempfile
 import unittest
+from datetime import datetime, timedelta, timezone
 from pathlib import Path
 from unittest.mock import MagicMock, patch
 
@@ -784,5 +785,79 @@ class TestCatchupSkipsQuestionIssues(unittest.TestCase):
         mock_merge.assert_called_once_with(50)
 
 
+class TestHeartbeat(unittest.TestCase):
+    """Tests for _update_heartbeat() and cmd_monitor()."""
+
+    def setUp(self):
+        self._tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".heartbeat")
+        self._tmp.close()
+        self._orig = agent_loop.HEARTBEAT_FILE
+        agent_loop.HEARTBEAT_FILE = Path(self._tmp.name)
+        Path(self._tmp.name).unlink()  # Start with no heartbeat file.
+
+    def tearDown(self):
+        agent_loop.HEARTBEAT_FILE = self._orig
+        Path(self._tmp.name).unlink(missing_ok=True)
+
+    def test_update_heartbeat_writes_timestamp(self):
+        agent_loop._update_heartbeat()
+        content = Path(self._tmp.name).read_text().strip()
+        dt = datetime.fromisoformat(content)
+        age = (datetime.now(timezone.utc) - dt).total_seconds()
+        self.assertLess(age, 5)
+
+    def test_update_heartbeat_creates_file(self):
+        self.assertFalse(Path(self._tmp.name).exists())
+        agent_loop._update_heartbeat()
+        self.assertTrue(Path(self._tmp.name).exists())
+
+    def test_monitor_healthy_when_recent(self):
+        agent_loop._update_heartbeat()
+        result = agent_loop.cmd_monitor()
+        self.assertEqual(result, 0)
+
+    def test_monitor_warns_when_heartbeat_missing(self):
+        buf = io.StringIO()
+        with contextlib.redirect_stdout(buf):
+            result = agent_loop.cmd_monitor()
+        self.assertEqual(result, 1)
+        self.assertIn("WARNING", buf.getvalue())
+
+    def test_monitor_warns_when_stale(self):
+        stale = (datetime.now(timezone.utc) - timedelta(hours=3)).isoformat()
+        Path(self._tmp.name).write_text(stale)
+        buf = io.StringIO()
+        with contextlib.redirect_stdout(buf):
+            result = agent_loop.cmd_monitor()
+        self.assertEqual(result, 1)
+        self.assertIn("WARNING", buf.getvalue())
+
+    def test_monitor_warns_when_corrupted(self):
+        Path(self._tmp.name).write_text("not-a-timestamp")
+        buf = io.StringIO()
+        with contextlib.redirect_stdout(buf):
+            result = agent_loop.cmd_monitor()
+        self.assertEqual(result, 1)
+        self.assertIn("WARNING", buf.getvalue())
+
+    def test_monitor_warns_when_timestamp_is_naive(self):
+        # No UTC offset: must be reported, not crash with TypeError.
+        Path(self._tmp.name).write_text(datetime.now().isoformat())
+        buf = io.StringIO()
+        with contextlib.redirect_stdout(buf):
+            result = agent_loop.cmd_monitor()
+        self.assertEqual(result, 1)
+        self.assertIn("WARNING", buf.getvalue())
+
+    def test_run_loop_updates_heartbeat(self):
+        self.assertFalse(Path(self._tmp.name).exists())
+        with patch("agent_loop._read_state", return_value=None), \
+             patch("agent_loop._open_issue_prs", return_value=[]), \
+             patch("agent_loop._latest_main_ci_run", return_value=None), \
+             patch("agent_loop._ready_issues", return_value=[]):
+            agent_loop._run_loop()
+        self.assertTrue(Path(self._tmp.name).exists())
+
+
 if __name__ == "__main__":
     unittest.main()