8b39d4c1a1
Every check spawned `go e.db.Save*(...)` with the error discarded: a fire-and-forget goroutine per log line, check, state change, and alert health update. SaveLog ran a full-table prune DELETE on every insert and SaveCheck ran a COUNT + conditional prune on every check, so the hot path amplified each write into several statements. Nothing tracked these goroutines, so at shutdown they raced the store's Close() — writes to a closing DB, silently swallowed. Introduce a single writer goroutine that drains a buffered channel of typed dbWrite values (log/check/state-change/alert-health). Writes are enqueued non-blocking; a saturated queue drops the write and notes the drop in the in-memory log rather than blocking the check loop. Write errors are now logged instead of discarded. Retention moves off the hot path: SaveLog and SaveCheck become plain INSERTs, and PruneLogs/PruneCheckHistory/PruneStateChanges run on a 10-minute timer inside the writer (single keep-newest-N-per-site pass via a window function). state_changes was previously never pruned — now bounded. Add Engine.Stop(): cancels the engine's context, then waits for the writer to drain every buffered write before returning. main wires it in before the deferred store Close() so no write races a closed DB. SQLite gains busy_timeout=5000 and synchronous=NORMAL, applied via the DSN so every pooled connection inherits them (a post-open PRAGMA only touches one connection); WAL moves to the DSN too. :memory: test DBs are left as-is. Tests: writer drains on Stop, Stop is idempotent, and the prune queries keep newest-N per site / N logs on real SQLite. Full suite green under -race.
90 lines
1.9 KiB
Go
90 lines
1.9 KiB
Go
package monitor
|
|
|
|
import "time"
|
|
|
|
// maxHistoryLen is the number of most-recent samples recordCheck keeps in a
// site's Latencies and Statuses slices; it is also the argument InitHistory
// passes to LoadAllHistory.
const maxHistoryLen = 60
|
|
|
|
// SiteHistory is the in-memory check record the Engine keeps for one site.
type SiteHistory struct {
	// Latencies and Statuses are appended to together, one entry per
	// recorded check, and recordCheck trims both to the newest
	// maxHistoryLen entries.
	Latencies []time.Duration
	Statuses  []bool

	// TotalChecks is incremented for every recorded check and UpChecks for
	// every check reported as up. Trimming the slices above does not
	// decrement either counter.
	TotalChecks, UpChecks int
}
|
|
|
|
func (e *Engine) InitHistory() {
|
|
all, err := e.db.LoadAllHistory(maxHistoryLen)
|
|
if err != nil {
|
|
e.AddLog("Failed to load check history: " + err.Error())
|
|
return
|
|
}
|
|
e.histMu.Lock()
|
|
defer e.histMu.Unlock()
|
|
for siteID, records := range all {
|
|
h := &SiteHistory{}
|
|
for _, r := range records {
|
|
h.TotalChecks++
|
|
if r.IsUp {
|
|
h.UpChecks++
|
|
}
|
|
h.Latencies = append(h.Latencies, time.Duration(r.LatencyNs))
|
|
h.Statuses = append(h.Statuses, r.IsUp)
|
|
}
|
|
e.histories[siteID] = h
|
|
}
|
|
if len(all) > 0 {
|
|
e.AddLog("Loaded check history from database")
|
|
}
|
|
}
|
|
|
|
func (e *Engine) recordCheck(siteID int, latency time.Duration, isUp bool) {
|
|
e.histMu.Lock()
|
|
defer e.histMu.Unlock()
|
|
|
|
h, ok := e.histories[siteID]
|
|
if !ok {
|
|
h = &SiteHistory{}
|
|
e.histories[siteID] = h
|
|
}
|
|
|
|
h.TotalChecks++
|
|
if isUp {
|
|
h.UpChecks++
|
|
}
|
|
|
|
h.Latencies = append(h.Latencies, latency)
|
|
if len(h.Latencies) > maxHistoryLen {
|
|
h.Latencies = h.Latencies[len(h.Latencies)-maxHistoryLen:]
|
|
}
|
|
|
|
h.Statuses = append(h.Statuses, isUp)
|
|
if len(h.Statuses) > maxHistoryLen {
|
|
h.Statuses = h.Statuses[len(h.Statuses)-maxHistoryLen:]
|
|
}
|
|
|
|
e.enqueueWrite(writeCheck{siteID: siteID, latencyNs: latency.Nanoseconds(), isUp: isUp})
|
|
}
|
|
|
|
func (e *Engine) GetHistory(siteID int) (SiteHistory, bool) {
|
|
e.histMu.RLock()
|
|
defer e.histMu.RUnlock()
|
|
h, ok := e.histories[siteID]
|
|
if !ok {
|
|
return SiteHistory{}, false
|
|
}
|
|
cp := SiteHistory{
|
|
TotalChecks: h.TotalChecks,
|
|
UpChecks: h.UpChecks,
|
|
Latencies: make([]time.Duration, len(h.Latencies)),
|
|
Statuses: make([]bool, len(h.Statuses)),
|
|
}
|
|
copy(cp.Latencies, h.Latencies)
|
|
copy(cp.Statuses, h.Statuses)
|
|
return cp, true
|
|
}
|
|
|
|
func (e *Engine) removeHistory(siteID int) {
|
|
e.histMu.Lock()
|
|
defer e.histMu.Unlock()
|
|
delete(e.histories, siteID)
|
|
}
|