Files
uptop/internal/monitor/history.go
T
lerko 8b39d4c1a1
CI / test (pull_request) Successful in 2m36s
CI / lint (pull_request) Successful in 56s
CI / vulncheck (pull_request) Successful in 51s
fix(monitor): serialize DB writes through a single drained writer
Every check spawned `go e.db.Save*(...)` with the error discarded: a
fire-and-forget goroutine per log line, check, state change, and alert
health update. SaveLog ran a full-table prune DELETE on every insert and
SaveCheck a COUNT + conditional prune on every check, so the hot path
amplified each write into several statements. Nothing tracked these
goroutines, so at shutdown they raced the store's Close() — writes to a
closing DB, silently swallowed.

Introduce a single writer goroutine that drains a buffered channel of
typed dbWrite values (log/check/state-change/alert-health). Writes are
enqueued without blocking; when the queue is saturated the write is
dropped and the drop is noted in the
in-memory log rather than blocking the check loop. Write errors are now
logged instead of discarded. Retention moves off the hot path: SaveLog
and SaveCheck become plain INSERTs, and PruneLogs/PruneCheckHistory/
PruneStateChanges run on a 10-minute timer inside the writer (single
keep-newest-N-per-site pass via a window function). state_changes was
previously never pruned — now bounded.

Add Engine.Stop(): cancels the engine's context, then waits for the
writer to drain every buffered write before returning. main wires it in
before the deferred store Close() so no write races a closed DB.

SQLite gains busy_timeout=5000 and synchronous=NORMAL, applied via the
DSN so every pooled connection inherits them (a post-open PRAGMA only
touches one connection); WAL moves to the DSN too. :memory: test DBs are
left as-is.

Tests: writer drains on Stop, Stop is idempotent, and the prune queries
keep newest-N per site / N logs on real SQLite. Full suite green under
-race.
2026-06-10 18:14:28 -04:00

90 lines
1.9 KiB
Go

package monitor
import "time"
// maxHistoryLen is the number of most recent samples kept in each
// SiteHistory's Latencies and Statuses slices. It is also the limit passed to
// the database when history is reloaded in InitHistory.
const maxHistoryLen = 60
// SiteHistory is the in-memory check history kept for a single site.
type SiteHistory struct {
// Latencies holds the latency of recent checks in the order they were
// appended; recordCheck trims it to the newest maxHistoryLen entries.
Latencies []time.Duration
// Statuses holds the up/down result of recent checks, appended alongside
// Latencies and trimmed to the same length.
Statuses []bool
// TotalChecks counts every check added to this history (records loaded by
// InitHistory plus checks recorded afterwards); it is not trimmed.
TotalChecks int
// UpChecks counts how many of those checks reported the site as up.
UpChecks int
}
// InitHistory rebuilds the in-memory per-site histories from the database.
//
// It calls LoadAllHistory with maxHistoryLen as the limit and, for every site
// returned, replaces that site's entry in e.histories with a fresh SiteHistory
// built from the loaded records. TotalChecks and UpChecks therefore reflect
// only the loaded records, not the site's lifetime totals.
//
// A load failure is reported through AddLog and leaves the existing histories
// untouched. A success message is logged only when at least one site was
// loaded.
func (e *Engine) InitHistory() {
	loaded, err := e.db.LoadAllHistory(maxHistoryLen)
	if err != nil {
		e.AddLog("Failed to load check history: " + err.Error())
		return
	}

	e.histMu.Lock()
	defer e.histMu.Unlock()

	for id, recs := range loaded {
		// Every loaded record counts as one check, so the total is known up front.
		hist := &SiteHistory{TotalChecks: len(recs)}
		for _, rec := range recs {
			hist.Latencies = append(hist.Latencies, time.Duration(rec.LatencyNs))
			hist.Statuses = append(hist.Statuses, rec.IsUp)
			if rec.IsUp {
				hist.UpChecks++
			}
		}
		e.histories[id] = hist
	}

	// Stay quiet when the database had nothing to restore.
	if len(loaded) == 0 {
		return
	}
	e.AddLog("Loaded check history from database")
}
// recordCheck adds one check result to the in-memory history of siteID and
// queues the same result for persistence.
//
// A history entry is created on first use. TotalChecks (and UpChecks when isUp
// is true) are incremented, and latency / isUp are appended to the sliding
// windows, which are then cut back to the newest maxHistoryLen samples. The
// write is enqueued while histMu is still held.
func (e *Engine) recordCheck(siteID int, latency time.Duration, isUp bool) {
	e.histMu.Lock()
	defer e.histMu.Unlock()

	hist, found := e.histories[siteID]
	if !found {
		hist = &SiteHistory{}
		e.histories[siteID] = hist
	}

	hist.TotalChecks++
	if isUp {
		hist.UpChecks++
	}

	// Append, then drop whatever overflows the window from the front.
	hist.Latencies = append(hist.Latencies, latency)
	if excess := len(hist.Latencies) - maxHistoryLen; excess > 0 {
		hist.Latencies = hist.Latencies[excess:]
	}

	hist.Statuses = append(hist.Statuses, isUp)
	if excess := len(hist.Statuses) - maxHistoryLen; excess > 0 {
		hist.Statuses = hist.Statuses[excess:]
	}

	pending := writeCheck{
		siteID:    siteID,
		latencyNs: latency.Nanoseconds(),
		isUp:      isUp,
	}
	e.enqueueWrite(pending)
}
// GetHistory returns a snapshot of the history recorded for siteID.
//
// The second result is false, and the first is the zero SiteHistory, when no
// history exists for the site. Otherwise the returned value carries the
// counters plus freshly allocated copies of the Latencies and Statuses slices,
// so the caller can keep or modify it without touching the engine's state.
func (e *Engine) GetHistory(siteID int) (SiteHistory, bool) {
	e.histMu.RLock()
	defer e.histMu.RUnlock()

	src, found := e.histories[siteID]
	if !found {
		return SiteHistory{}, false
	}

	// Copy the slices so the snapshot does not alias the live backing arrays.
	latencies := make([]time.Duration, len(src.Latencies))
	copy(latencies, src.Latencies)

	statuses := make([]bool, len(src.Statuses))
	copy(statuses, src.Statuses)

	snapshot := SiteHistory{
		Latencies:   latencies,
		Statuses:    statuses,
		TotalChecks: src.TotalChecks,
		UpChecks:    src.UpChecks,
	}
	return snapshot, true
}
// removeHistory deletes the in-memory history entry for siteID while holding
// histMu for writing. Deleting a siteID that has no entry is a no-op.
func (e *Engine) removeHistory(siteID int) {
e.histMu.Lock()
defer e.histMu.Unlock()
delete(e.histories, siteID)
}