fix(monitor): merge check results into live state, never overwrite
checkByID snapshotted a Site under RLock, ran a network check for seconds, then handleStatusChange wrote the entire stale struct back into liveState. Any concurrent mutation during the check — a user pause, a config edit, or a push heartbeat — was silently reverted. Worst case: a heartbeat set UP and an in-flight checkPush overwrote it with a stale DOWN, firing a false alert. Introduce applyState(id, mutate): a single read-modify-write helper that runs the mutator against the CURRENT live entry under the write lock, so config and Paused are preserved automatically and status transitions are computed from the true current status. Route handleStatusChange, RecordHeartbeat, ToggleSitePause and checkGroup through it. Logs and alerts now fire after the lock is released, off the critical section. Push false-DOWN is closed by a guard: a non-UP result whose snapshot LastCheck predates the live LastCheck is dropped, since a heartbeat (or newer check) superseded it. HTTP/probe stamp LastCheck=now before the call, so they are unaffected (and serial per site anyway). Also fixes a latent bug where RecordHeartbeat read StatusChangedAt after overwriting it, always reporting "was down 0s"; downSince is now captured before mutation. Adds regression tests for pause/config-edit/heartbeat-during-check and removed-site-dropped. Full suite green under -race.
This commit was merged in pull request #98.
This commit is contained in:
@@ -1077,6 +1077,96 @@ func TestConcurrent_RecordCheckAndGetHistory(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// --- Group 10: liveState merge (lost-update race) ---
|
||||
|
||||
// A pause that lands while a check is in flight must survive the check's
|
||||
// write-back. The old code snapshotted the site, ran the check, then wrote the
|
||||
// whole stale struct back — reverting the pause.
|
||||
func TestHandleStatusChange_PauseDuringCheckSurvives(t *testing.T) {
|
||||
ms := newMockStore()
|
||||
e := newTestEngine(ms)
|
||||
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0}
|
||||
injectSite(e, site)
|
||||
|
||||
// `site` is the stale snapshot the check ran against (Paused=false).
|
||||
// Meanwhile the user pauses the monitor.
|
||||
e.ToggleSitePause(1)
|
||||
|
||||
// Check completes and folds its result in using the stale snapshot.
|
||||
e.handleStatusChange(site, "DOWN", 500, 0, "boom")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if !s.Paused {
|
||||
t.Error("pause was reverted by a stale check write-back")
|
||||
}
|
||||
if s.Status != "DOWN" {
|
||||
t.Errorf("expected check result still applied (DOWN), got %s", s.Status)
|
||||
}
|
||||
}
|
||||
|
||||
// A config edit that lands while a check is in flight must survive; the check
|
||||
// must not resurrect the old config from its snapshot.
|
||||
func TestHandleStatusChange_ConfigEditDuringCheckSurvives(t *testing.T) {
|
||||
ms := newMockStore()
|
||||
e := newTestEngine(ms)
|
||||
site := models.Site{ID: 1, Name: "test", URL: "http://old.com", Type: "http", Status: "UP", MaxRetries: 0, Interval: 30}
|
||||
injectSite(e, site)
|
||||
|
||||
// Config changes mid-check.
|
||||
e.UpdateSiteConfig(models.Site{ID: 1, Name: "test", URL: "http://new.com", Type: "http", Interval: 60})
|
||||
|
||||
// Stale check (ran against http://old.com) folds its result in.
|
||||
e.handleStatusChange(site, "UP", 200, 5*time.Millisecond, "")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.URL != "http://new.com" {
|
||||
t.Errorf("config edit reverted: URL=%s", s.URL)
|
||||
}
|
||||
if s.Interval != 60 {
|
||||
t.Errorf("config edit reverted: Interval=%d", s.Interval)
|
||||
}
|
||||
}
|
||||
|
||||
// The classic push false-DOWN: a heartbeat marks the monitor UP while a
|
||||
// staleness evaluation (computed from the older LastCheck) is mid-flight.
|
||||
// The stale DOWN must not overwrite the fresh heartbeat.
|
||||
func TestHandleStatusChange_HeartbeatNotOverwrittenByStaleDown(t *testing.T) {
|
||||
ms := newMockStore()
|
||||
e := newTestEngine(ms)
|
||||
// Snapshot the engine would have taken before evaluating staleness:
|
||||
// LastCheck is old, so checkPush decided "DOWN".
|
||||
snap := models.Site{ID: 1, Name: "push", Type: "push", Token: "tok", Status: "UP", Interval: 10, LastCheck: time.Now().Add(-120 * time.Second)}
|
||||
injectSite(e, snap)
|
||||
|
||||
// A heartbeat lands first, advancing LastCheck and confirming UP.
|
||||
if !e.RecordHeartbeat("tok") {
|
||||
t.Fatal("heartbeat rejected")
|
||||
}
|
||||
|
||||
// Now the in-flight stale evaluation tries to write DOWN.
|
||||
e.handleStatusChange(snap, "DOWN", 0, 0, "heartbeat missed")
|
||||
|
||||
s, _ := getSite(e, 1)
|
||||
if s.Status != "UP" {
|
||||
t.Errorf("stale DOWN overwrote a fresh heartbeat: status=%s", s.Status)
|
||||
}
|
||||
}
|
||||
|
||||
// A check result for a site removed mid-check must be dropped, not recreate it.
|
||||
func TestHandleStatusChange_RemovedSiteDropped(t *testing.T) {
|
||||
ms := newMockStore()
|
||||
e := newTestEngine(ms)
|
||||
site := models.Site{ID: 1, Name: "test", Status: "UP", MaxRetries: 0}
|
||||
injectSite(e, site)
|
||||
|
||||
e.RemoveSite(1)
|
||||
e.handleStatusChange(site, "DOWN", 500, 0, "boom")
|
||||
|
||||
if _, ok := getSite(e, 1); ok {
|
||||
t.Error("removed site was recreated by a late check write-back")
|
||||
}
|
||||
}
|
||||
|
||||
// --- Utilities ---
|
||||
|
||||
func containsStr(s, substr string) bool {
|
||||
|
||||
Reference in New Issue
Block a user