fix(engine): six correctness fixes for the state machine

1. Group auto-pause trap: remove the one-way Paused=true mutation from checkGroup. monitorRoutine skips paused groups, so an auto-paused group could never be re-evaluated or auto-unpaused.
2. Retry logic: apply MaxRetries to all →DOWN transitions, not just UP→DOWN. New monitors (PENDING) no longer alert on the first transient failure when retries are configured.
3. Shutdown drain hole: track checker goroutines with checkerWG so Stop() waits for in-flight checks before draining the write queue. A final drainWrites() catches any writes enqueued after the writer's own drain.
4. Probe-ingest writer bypass: route SaveCheckFromNode through the engine's serialized dbWriter instead of writing directly to the store from the HTTP handler.
5. Dead-probe expiry: expire stale probe results (older than 3× the site interval, with a one-minute minimum) before aggregation so a dead probe can't poison status forever. Also clean up probeResults in RemoveSite.
6. Maintenance-cache N+1: replace the per-check DB query with a fully resolved in-memory cache that is refreshed every poll cycle: one GetActiveMaintenanceWindows() call instead of N IsMonitorInMaintenance() calls.

Additionally, ImportData now wipes check_history, state_changes, and alert_health so re-inserted IDs don't inherit stale history from prior occupants.
2026-06-11 13:40:31 -04:00
parent 61c28fac62
commit 5d5153351e
7 changed files with 335 additions and 39 deletions
@@ -60,6 +60,9 @@ type Engine struct {
 	recheckMu sync.RWMutex
 	recheck   map[int]chan struct{}

+	maintCacheMu sync.RWMutex
+	maintCache   map[int]bool
+
 	db                  store.Store
 	insecureSkipVerify  bool
 	allowPrivateTargets bool
@@ -67,10 +70,11 @@ type Engine struct {
 	strictClient        *http.Client
 	insecureClient      *http.Client

-	dbWrites chan dbWrite
-	writerWG sync.WaitGroup
-	cancel   context.CancelFunc
-	stopOnce sync.Once
+	dbWrites  chan dbWrite
+	writerWG  sync.WaitGroup
+	checkerWG sync.WaitGroup
+	cancel    context.CancelFunc
+	stopOnce  sync.Once
 }

 func NewEngine(s store.Store) *Engine {
@@ -231,7 +235,9 @@ func (e *Engine) Stop() {
 		if e.cancel != nil {
 			e.cancel()
 		}
+		e.checkerWG.Wait()
 		e.writerWG.Wait()
+		e.drainWrites()
 	})
 }

@@ -400,7 +406,9 @@ func (e *Engine) Start(ctx context.Context) {
 	e.writerWG.Add(1)
 	go e.dbWriter(ctx)

+	e.checkerWG.Add(1)
 	go func() {
+		defer e.checkerWG.Done()
 		for {
 			select {
 			case <-ctx.Done():
@@ -408,6 +416,8 @@ func (e *Engine) Start(ctx context.Context) {
 			default:
 			}

+			e.refreshMaintenanceCache()
+
 			sites, err := e.db.GetSites()
 			if err != nil {
 				e.AddLog(fmt.Sprintf("Failed to load sites: %v", err))
@@ -438,7 +448,11 @@ func (e *Engine) Start(ctx context.Context) {
 					e.liveState[s.ID] = s
 					e.addToTokenIndex(s)
 					e.mu.Unlock()
-					go e.monitorRoutine(ctx, s.ID)
+					e.checkerWG.Add(1)
+					go func(id int) {
+						defer e.checkerWG.Done()
+						e.monitorRoutine(ctx, id)
+					}(s.ID)
 				}
 			}

@@ -450,7 +464,11 @@ func (e *Engine) Start(ctx context.Context) {
 		}
 	}()

-	go e.maintenancePruner(ctx)
+	e.checkerWG.Add(1)
+	go func() {
+		defer e.checkerWG.Done()
+		e.maintenancePruner(ctx)
+	}()
 }

 func (e *Engine) maintenancePruner(ctx context.Context) {
@@ -529,6 +547,10 @@ func (e *Engine) RemoveSite(id int) {
 	e.mu.Unlock()
 	e.removeHistory(id)

+	e.probeResultsMu.Lock()
+	delete(e.probeResults, id)
+	e.probeResultsMu.Unlock()
+
 	e.recheckMu.Lock()
 	delete(e.recheck, id)
 	e.recheckMu.Unlock()
@@ -748,22 +770,22 @@ func (e *Engine) handleStatusChange(snap models.Site, rawStatus string, code int
 		}

 		// Status + failure-count transition, based on the CURRENT live status.
-		switch {
-		case prev == "UP" && rawStatus != "UP":
-			s.FailureCount++
+		if rawStatus == "UP" {
+			s.FailureCount = 0
+			s.Status = "UP"
+		} else {
+			if s.FailureCount <= s.MaxRetries {
+				s.FailureCount++
+			}
 			if s.FailureCount > s.MaxRetries {
+				if s.Status != rawStatus {
+					confirmedDown = true
+				}
 				s.Status = rawStatus
 				s.FailureCount = s.MaxRetries + 1
-				confirmedDown = true
 			} else {
 				failedCheck = true
 			}
-		case rawStatus == "UP":
-			s.FailureCount = 0
-			s.Status = "UP"
-		default:
-			s.Status = rawStatus
-			s.FailureCount = s.MaxRetries + 1
 		}
 		failCount = s.FailureCount

@@ -927,11 +949,39 @@ func (e *Engine) TestAlert(alertID int) error {
 }

 func (e *Engine) isInMaintenance(monitorID int) bool {
-	inMaint, err := e.db.IsMonitorInMaintenance(monitorID)
+	e.maintCacheMu.RLock()
+	defer e.maintCacheMu.RUnlock()
+	return e.maintCache[monitorID]
+}
+
+func (e *Engine) refreshMaintenanceCache() {
+	windows, err := e.db.GetActiveMaintenanceWindows()
 	if err != nil {
-		return false
+		return
 	}
-	return inMaint
+
+	directMaint := make(map[int]bool)
+	var globalMaint bool
+	for _, w := range windows {
+		if w.MonitorID == 0 {
+			globalMaint = true
+		} else {
+			directMaint[w.MonitorID] = true
+		}
+	}
+
+	resolved := make(map[int]bool)
+	e.mu.RLock()
+	for id, site := range e.liveState {
+		if globalMaint || directMaint[id] || (site.ParentID > 0 && directMaint[site.ParentID]) {
+			resolved[id] = true
+		}
+	}
+	e.mu.RUnlock()
+
+	e.maintCacheMu.Lock()
+	e.maintCache = resolved
+	e.maintCacheMu.Unlock()
 }

 func (e *Engine) GetDisplayStatus(site models.Site) string {
@@ -948,15 +998,11 @@ func (e *Engine) checkGroup(site models.Site) {
 	e.mu.RLock()
 	status := "UP"
 	hasChildren := false
-	allPaused := true
 	for _, child := range e.liveState {
 		if child.ParentID != site.ID || child.Type == "group" {
 			continue
 		}
 		hasChildren = true
-		if !child.Paused {
-			allPaused = false
-		}
 		if child.Paused || e.isInMaintenance(child.ID) {
 			continue
 		}
@@ -978,17 +1024,31 @@ func (e *Engine) checkGroup(site models.Site) {

 	e.applyState(site.ID, func(s *models.Site) {
 		s.Status = status
-		if hasChildren && allPaused {
-			s.Paused = true
-		}
 	})
 }

+func (e *Engine) EnqueueProbeCheck(siteID int, nodeID string, latencyNs int64, isUp bool) {
+	e.enqueueWrite(writeProbeCheck{siteID: siteID, nodeID: nodeID, latencyNs: latencyNs, isUp: isUp})
+}
+
 func (e *Engine) SetAggStrategy(strategy AggregationStrategy) {
 	e.aggStrategy = strategy
 }

 func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool, errorReason string) {
+	e.mu.RLock()
+	site, exists := e.liveState[siteID]
+	e.mu.RUnlock()
+	if !exists {
+		return
+	}
+
+	staleAfter := time.Duration(site.Interval) * time.Second * 3
+	if staleAfter < time.Minute {
+		staleAfter = time.Minute
+	}
+
+	now := time.Now()
 	e.probeResultsMu.Lock()
 	if e.probeResults[siteID] == nil {
 		e.probeResults[siteID] = make(map[string]NodeResult)
@@ -997,24 +1057,21 @@ func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, i
 		NodeID:      nodeID,
 		IsUp:        isUp,
 		LatencyNs:   latencyNs,
-		CheckedAt:   time.Now(),
+		CheckedAt:   now,
 		ErrorReason: errorReason,
 	}
 	results := make([]NodeResult, 0, len(e.probeResults[siteID]))
-	for _, r := range e.probeResults[siteID] {
+	for id, r := range e.probeResults[siteID] {
+		if now.Sub(r.CheckedAt) > staleAfter {
+			delete(e.probeResults[siteID], id)
+			continue
+		}
 		results = append(results, r)
 	}
 	e.probeResultsMu.Unlock()

 	aggUp, avgLatency := AggregateStatus(results, e.aggStrategy)

-	e.mu.RLock()
-	site, exists := e.liveState[siteID]
-	e.mu.RUnlock()
-	if !exists {
-		return
-	}
-
 	rawStatus := "UP"
 	if !aggUp {
 		rawStatus = "DOWN"