fix(monitor): serialize DB writes through a single drained writer
Every check spawned `go e.db.Save*(...)` with the error discarded: a fire-and-forget goroutine per log line, check, state change, and alert health update. SaveLog ran a full-table prune DELETE on every insert and SaveCheck a COUNT + conditional prune on every check, so the hot path amplified each write into several statements. Nothing tracked these goroutines, so at shutdown they raced the store's Close() — writes to a closing DB, silently swallowed. Introduce a single writer goroutine that drains a buffered channel of typed dbWrite values (log/check/state-change/alert-health). Writes are enqueued non-blocking; a saturated queue drops and notes it in the in-memory log rather than blocking the check loop. Write errors are now logged instead of discarded. Retention moves off the hot path: SaveLog and SaveCheck become plain INSERTs, and PruneLogs/PruneCheckHistory/PruneStateChanges run on a 10-minute timer inside the writer (single keep-newest-N-per-site pass via a window function). state_changes was previously never pruned — now bounded. Add Engine.Stop(): cancels the engine's context, then waits for the writer to drain every buffered write before returning. main wires it in before the deferred store Close() so no write races a closed DB. SQLite gains busy_timeout=5000 and synchronous=NORMAL, applied via the DSN so every pooled connection inherits them (a post-open PRAGMA only touches one connection); WAL moves to the DSN too. :memory: test DBs are left as-is. Tests: writer drains on Stop, Stop is idempotent, and the prune queries keep newest-N per site / N logs on real SQLite. Full suite green under -race.
This commit was merged in pull request #99.
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
package monitor
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"sync"
|
||||
"testing"
|
||||
@@ -145,6 +146,10 @@ func (m *mockStore) LoadAllHistory(limit int) (map[int][]models.CheckRecord, err
|
||||
return m.history, nil
|
||||
}
|
||||
|
||||
func (m *mockStore) PruneLogs() error { return nil }
|
||||
func (m *mockStore) PruneCheckHistory() error { return nil }
|
||||
func (m *mockStore) PruneStateChanges() error { return nil }
|
||||
|
||||
// --- Helpers ---
|
||||
|
||||
func newTestEngine(ms *mockStore) *Engine {
|
||||
@@ -1167,6 +1172,51 @@ func TestHandleStatusChange_RemovedSiteDropped(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// --- Group 11: single DB writer ---
|
||||
|
||||
// Writes enqueued through the engine are persisted by the writer goroutine and
|
||||
// fully drained when the engine stops — no fire-and-forget, no lost writes.
|
||||
func TestDBWriter_DrainsOnStop(t *testing.T) {
|
||||
ms := newMockStore()
|
||||
e := newTestEngine(ms)
|
||||
e.Start(context.Background())
|
||||
|
||||
e.enqueueWrite(writeCheck{siteID: 7, latencyNs: 100, isUp: true})
|
||||
e.enqueueWrite(writeLog{message: "drain-me"})
|
||||
|
||||
e.Stop() // blocks until the writer has drained the queue
|
||||
|
||||
ms.mu.Lock()
|
||||
defer ms.mu.Unlock()
|
||||
gotCheck := false
|
||||
for _, c := range ms.savedChecks {
|
||||
if c.SiteID == 7 {
|
||||
gotCheck = true
|
||||
}
|
||||
}
|
||||
if !gotCheck {
|
||||
t.Error("check was not persisted before Stop returned")
|
||||
}
|
||||
gotLog := false
|
||||
for _, l := range ms.savedLogs {
|
||||
if l == "drain-me" {
|
||||
gotLog = true
|
||||
}
|
||||
}
|
||||
if !gotLog {
|
||||
t.Error("log was not persisted before Stop returned")
|
||||
}
|
||||
}
|
||||
|
||||
// TestEngineStop_Idempotent starts an engine and calls Stop twice in a row.
// The test passes as long as the repeated call neither panics nor blocks.
func TestEngineStop_Idempotent(t *testing.T) {
	engine := newTestEngine(newMockStore())
	engine.Start(context.Background())

	// The second iteration is the call under test.
	for i := 0; i < 2; i++ {
		engine.Stop()
	}
}
|
||||
|
||||
// --- Utilities ---
|
||||
|
||||
func containsStr(s, substr string) bool {
|
||||
|
||||
Reference in New Issue
Block a user