feat: monitor agent loop health every 2 hours (#217)

- Track a heartbeat timestamp in ~/.sharedinbox-agent-heartbeat at the
  start of each _run_loop() invocation so we can tell when it last ran.
- Add `agent_loop.py monitor` subcommand that exits 1 with a WARNING
  message if the heartbeat is missing, corrupted, or older than 2 hours.
- Add .forgejo/workflows/monitor.yml scheduled workflow that runs the
  monitor check every 2 hours on the self-hosted runner; a CI failure
  serves as the warning when the loop is stalled.
- Add 7 unit tests covering all monitor / heartbeat scenarios.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Thomas SharedInbox
2026-05-25 12:48:45 +02:00
co-authored by Claude Sonnet 4.6
parent e03c7708ba
commit 06df3ee200
3 changed files with 127 additions and 0 deletions
+66
View File
@@ -6,6 +6,7 @@ import json
import os
import tempfile
import unittest
from datetime import datetime, timedelta, timezone
from pathlib import Path
from unittest.mock import MagicMock, patch
@@ -784,5 +785,70 @@ class TestCatchupSkipsQuestionIssues(unittest.TestCase):
mock_merge.assert_called_once_with(50)
class TestHeartbeat(unittest.TestCase):
"""Tests for _update_heartbeat() and cmd_monitor()."""
def setUp(self):
self._tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".heartbeat")
self._tmp.close()
self._orig = agent_loop.HEARTBEAT_FILE
agent_loop.HEARTBEAT_FILE = Path(self._tmp.name)
Path(self._tmp.name).unlink() # Start with no heartbeat file.
def tearDown(self):
agent_loop.HEARTBEAT_FILE = self._orig
Path(self._tmp.name).unlink(missing_ok=True)
def test_update_heartbeat_writes_timestamp(self):
agent_loop._update_heartbeat()
content = Path(self._tmp.name).read_text().strip()
dt = datetime.fromisoformat(content)
age = (datetime.now(timezone.utc) - dt).total_seconds()
self.assertLess(age, 5)
def test_update_heartbeat_creates_file(self):
self.assertFalse(Path(self._tmp.name).exists())
agent_loop._update_heartbeat()
self.assertTrue(Path(self._tmp.name).exists())
def test_monitor_healthy_when_recent(self):
agent_loop._update_heartbeat()
result = agent_loop.cmd_monitor()
self.assertEqual(result, 0)
def test_monitor_warns_when_heartbeat_missing(self):
buf = io.StringIO()
with contextlib.redirect_stdout(buf):
result = agent_loop.cmd_monitor()
self.assertEqual(result, 1)
self.assertIn("WARNING", buf.getvalue())
def test_monitor_warns_when_stale(self):
stale = (datetime.now(timezone.utc) - timedelta(hours=3)).isoformat()
Path(self._tmp.name).write_text(stale)
buf = io.StringIO()
with contextlib.redirect_stdout(buf):
result = agent_loop.cmd_monitor()
self.assertEqual(result, 1)
self.assertIn("WARNING", buf.getvalue())
def test_monitor_warns_when_corrupted(self):
Path(self._tmp.name).write_text("not-a-timestamp")
buf = io.StringIO()
with contextlib.redirect_stdout(buf):
result = agent_loop.cmd_monitor()
self.assertEqual(result, 1)
self.assertIn("WARNING", buf.getvalue())
def test_run_loop_updates_heartbeat(self):
self.assertFalse(Path(self._tmp.name).exists())
with patch("agent_loop._read_state", return_value=None), \
patch("agent_loop._open_issue_prs", return_value=[]), \
patch("agent_loop._latest_main_ci_run", return_value=None), \
patch("agent_loop._ready_issues", return_value=[]):
agent_loop._run_loop()
self.assertTrue(Path(self._tmp.name).exists())
if __name__ == "__main__":
unittest.main()