fix(monitor): serialize DB writes through a single drained writer
Every check spawned `go e.db.Save*(...)` with the error discarded: a fire-and-forget goroutine per log line, check, state change, and alert health update. SaveLog ran a full-table prune DELETE on every insert and SaveCheck ran a COUNT + conditional prune on every check, so the hot path amplified each write into several statements. Nothing tracked these goroutines, so at shutdown they raced the store's Close() — writes to a closing DB, silently swallowed. Introduce a single writer goroutine that drains a buffered channel of typed dbWrite values (log/check/state-change/alert-health). Writes are enqueued non-blocking; a saturated queue drops the write and notes the drop in the in-memory log rather than blocking the check loop. Write errors are now logged instead of discarded. Retention moves off the hot path: SaveLog and SaveCheck become plain INSERTs, and PruneLogs/PruneCheckHistory/PruneStateChanges run on a 10-minute timer inside the writer (single keep-newest-N-per-site pass via a window function). state_changes was previously never pruned — now bounded. Add Engine.Stop(): cancels the engine's context, then waits for the writer to drain every buffered write before returning. main wires it in before the deferred store Close() so no write races a closed DB. SQLite gains busy_timeout=5000 and synchronous=NORMAL, applied via the DSN so every pooled connection inherits them (a post-open PRAGMA only touches one connection); WAL moves to the DSN too. :memory: test DBs are left as-is. Tests: writer drains on Stop, Stop is idempotent, and the prune queries keep newest-N per site / N logs on real SQLite. Full suite green under -race.
This commit was merged in pull request #99.
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package store
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
@@ -316,6 +317,69 @@ func TestDeleteSiteCascade(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestPruneLogs(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
|
||||
for i := 0; i < maxLogRows+50; i++ {
|
||||
if err := s.SaveLog(fmt.Sprintf("log %d", i)); err != nil {
|
||||
t.Fatalf("SaveLog: %v", err)
|
||||
}
|
||||
}
|
||||
if err := s.PruneLogs(); err != nil {
|
||||
t.Fatalf("PruneLogs: %v", err)
|
||||
}
|
||||
|
||||
logs, err := s.LoadLogs(maxLogRows * 2)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadLogs: %v", err)
|
||||
}
|
||||
if len(logs) != maxLogRows {
|
||||
t.Errorf("expected %d logs after prune, got %d", maxLogRows, len(logs))
|
||||
}
|
||||
// Newest must survive; oldest must be gone (membership, not position —
|
||||
// LoadLogs ordering ties when rows share a created_at second).
|
||||
present := make(map[string]bool, len(logs))
|
||||
for _, l := range logs {
|
||||
present[l] = true
|
||||
}
|
||||
if !present[fmt.Sprintf("log %d", maxLogRows+50-1)] {
|
||||
t.Error("newest log was pruned")
|
||||
}
|
||||
if present["log 0"] {
|
||||
t.Error("oldest log survived prune")
|
||||
}
|
||||
}
|
||||
|
||||
func TestPruneCheckHistory(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
|
||||
for i := 0; i < maxCheckHistory+5; i++ {
|
||||
if err := s.SaveCheck(1, int64(i), true); err != nil {
|
||||
t.Fatalf("SaveCheck site 1: %v", err)
|
||||
}
|
||||
}
|
||||
for i := 0; i < 3; i++ {
|
||||
if err := s.SaveCheck(2, int64(i), true); err != nil {
|
||||
t.Fatalf("SaveCheck site 2: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := s.PruneCheckHistory(); err != nil {
|
||||
t.Fatalf("PruneCheckHistory: %v", err)
|
||||
}
|
||||
|
||||
history, err := s.LoadAllHistory(maxCheckHistory * 2)
|
||||
if err != nil {
|
||||
t.Fatalf("LoadAllHistory: %v", err)
|
||||
}
|
||||
if len(history[1]) != maxCheckHistory {
|
||||
t.Errorf("site 1: expected %d rows after prune, got %d", maxCheckHistory, len(history[1]))
|
||||
}
|
||||
if len(history[2]) != 3 {
|
||||
t.Errorf("site 2: expected 3 rows untouched, got %d", len(history[2]))
|
||||
}
|
||||
}
|
||||
|
||||
func TestPruneExpiredMaintenanceWindows(t *testing.T) {
|
||||
s := newTestStore(t)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user