5d5153351e
1. Group auto-pause trap: remove the one-way Paused=true mutation from checkGroup — monitorRoutine skipped paused groups, so they could never re-evaluate or auto-unpause. 2. Retry logic: apply MaxRetries to all →DOWN transitions, not just UP→DOWN. New monitors (PENDING) no longer alert on first transient failure when retries are configured. 3. Shutdown drain hole: track checker goroutines with checkerWG so Stop() waits for in-flight checks before draining the write queue. Final drainWrites() catches any writes enqueued after the writer's own drain. 4. Probe-ingest writer bypass: route SaveCheckFromNode through the engine's serialized dbWriter instead of writing directly to the store from the HTTP handler. 5. Dead-probe expiry: expire stale probe results (>3× site interval) before aggregation so a dead probe can't poison status forever. Also clean probeResults in RemoveSite. 6. Maintenance-cache N+1: replace per-check DB query with a fully-resolved in-memory cache refreshed every poll cycle. One GetActiveMaintenanceWindows() call instead of N IsMonitorInMaintenance. ImportData now wipes check_history, state_changes, and alert_health so re-inserted IDs don't inherit stale history from prior occupants.
59 lines
1.7 KiB
Go
59 lines
1.7 KiB
Go
package monitor
|
|
|
|
import (
|
|
"gitea.lerkolabs.com/lerkolabs/uptop/internal/models"
|
|
"gitea.lerkolabs.com/lerkolabs/uptop/internal/store"
|
|
)
|
|
|
|
// dbWrite is a single unit of deferred persistence. The engine enqueues these
|
|
// onto a buffered channel; a single writer goroutine drains and executes them,
|
|
// serializing all writes through one connection and surfacing errors instead of
|
|
// discarding them. desc names the write for diagnostics on drop/failure.
|
|
type dbWrite interface {
|
|
exec(s store.Store) error
|
|
desc() string
|
|
}
|
|
|
|
type writeLog struct{ message string }
|
|
|
|
func (w writeLog) exec(s store.Store) error { return s.SaveLog(w.message) }
|
|
func (w writeLog) desc() string { return "log" }
|
|
|
|
type writeCheck struct {
|
|
siteID int
|
|
latencyNs int64
|
|
isUp bool
|
|
}
|
|
|
|
func (w writeCheck) exec(s store.Store) error { return s.SaveCheck(w.siteID, w.latencyNs, w.isUp) }
|
|
func (w writeCheck) desc() string { return "check" }
|
|
|
|
type writeStateChange struct {
|
|
siteID int
|
|
fromStatus string
|
|
toStatus string
|
|
reason string
|
|
}
|
|
|
|
func (w writeStateChange) exec(s store.Store) error {
|
|
return s.SaveStateChange(w.siteID, w.fromStatus, w.toStatus, w.reason)
|
|
}
|
|
func (w writeStateChange) desc() string { return "state-change" }
|
|
|
|
type writeAlertHealth struct{ rec models.AlertHealthRecord }
|
|
|
|
func (w writeAlertHealth) exec(s store.Store) error { return s.SaveAlertHealth(w.rec) }
|
|
func (w writeAlertHealth) desc() string { return "alert-health" }
|
|
|
|
type writeProbeCheck struct {
|
|
siteID int
|
|
nodeID string
|
|
latencyNs int64
|
|
isUp bool
|
|
}
|
|
|
|
func (w writeProbeCheck) exec(s store.Store) error {
|
|
return s.SaveCheckFromNode(w.siteID, w.nodeID, w.latencyNs, w.isUp)
|
|
}
|
|
func (w writeProbeCheck) desc() string { return "probe-check" }
|