fix(monitor): serialize DB writes through a single drained writer
CI / test (pull_request) Successful in 2m36s
CI / lint (pull_request) Successful in 56s
CI / vulncheck (pull_request) Successful in 51s

Every check spawned `go e.db.Save*(...)` with the error discarded: a
fire-and-forget goroutine per log line, check, state change, and alert
health update. SaveLog ran a full-table prune DELETE on every insert and
SaveCheck a COUNT + conditional prune on every check, so the hot path
amplified each write into several statements. Nothing tracked these
goroutines, so at shutdown they raced the store's Close() — writes to a
closing DB, silently swallowed.

Introduce a single writer goroutine that drains a buffered channel of
typed dbWrite values (log/check/state-change/alert-health). Writes are
enqueued non-blocking; a saturated queue drops and notes it in the
in-memory log rather than blocking the check loop. Write errors are now
logged instead of discarded. Retention moves off the hot path: SaveLog
and SaveCheck become plain INSERTs, and PruneLogs/PruneCheckHistory/
PruneStateChanges run on a 10-minute timer inside the writer (single
keep-newest-N-per-site pass via a window function). state_changes was
previously never pruned — now bounded.

Add Engine.Stop(): cancels the engine's context, then waits for the
writer to drain every buffered write before returning. main wires it in
before the deferred store Close() so no write races a closed DB.

SQLite gains busy_timeout=5000 and synchronous=NORMAL, applied via the
DSN so every pooled connection inherits them (a post-open PRAGMA only
touches one connection); WAL moves to the DSN too. :memory: test DBs are
left as-is.

Tests: writer drains on Stop, Stop is idempotent, and the prune queries
keep newest-N per site / N logs on real SQLite. Full suite green under
-race.
This commit was merged in pull request #99.
This commit is contained in:
2026-06-10 17:11:12 -04:00
parent 5e7faf9ea7
commit 8b39d4c1a1
12 changed files with 344 additions and 39 deletions
+12 -4
View File
@@ -2,6 +2,7 @@ package store
import (
"database/sql"
"fmt"
"log"
_ "github.com/mattn/go-sqlite3"
@@ -10,13 +11,20 @@ import (
type SQLiteDialect struct{}
func NewSQLiteStore(path string) (*SQLStore, error) {
s, err := NewSQLStore("sqlite3", path, &SQLiteDialect{})
// Apply pragmas via the DSN so every pooled connection gets them — a
// post-open PRAGMA Exec only affects a single connection. WAL allows
// concurrent readers alongside the single writer goroutine; busy_timeout
// rides out brief lock contention; synchronous=NORMAL is durable under WAL
// and far faster than the FULL default. (:memory: is left untouched —
// these pragmas are no-ops or harmful for the in-memory test DB.)
dsn := path
if path != ":memory:" {
dsn = fmt.Sprintf("file:%s?_journal_mode=WAL&_busy_timeout=5000&_synchronous=NORMAL", path)
}
s, err := NewSQLStore("sqlite3", dsn, &SQLiteDialect{})
if err != nil {
return nil, err
}
if _, err := s.db.Exec("PRAGMA journal_mode=WAL"); err != nil {
log.Printf("WAL mode failed: %v", err)
}
return s, nil
}
+48 -23
View File
@@ -13,10 +13,11 @@ import (
)
const (
maxCheckHistory = 1000
checkHistoryPruneAt = 1100
maxMaintenanceExport = 1000
maxRequestBody = 1 << 20
maxCheckHistory = 1000
maxLogRows = 200
maxStateChangesPerSite = 5000
maxMaintenanceExport = 1000
maxRequestBody = 1 << 20
)
type SQLStore struct {
@@ -407,21 +408,39 @@ func (s *SQLStore) SaveCheck(siteID int, latencyNs int64, isUp bool) error {
return s.SaveCheckFromNode(siteID, "", latencyNs, isUp)
}
// SaveCheckFromNode inserts a single check row. Retention is handled out of
// band by PruneCheckHistory on a timer, not per-insert, to keep the write hot
// path a plain INSERT.
func (s *SQLStore) SaveCheckFromNode(siteID int, nodeID string, latencyNs int64, isUp bool) error {
_, err := s.db.Exec(s.q("INSERT INTO check_history (site_id, node_id, latency_ns, is_up) VALUES (?, ?, ?, ?)"), siteID, nodeID, latencyNs, isUp)
if err != nil {
return err
}
var count int
_ = s.db.QueryRow(s.q("SELECT COUNT(*) FROM check_history WHERE site_id = ?"), siteID).Scan(&count)
if count > checkHistoryPruneAt {
pruneQuery := fmt.Sprintf(`DELETE FROM check_history WHERE site_id = ? AND id NOT IN (
SELECT id FROM check_history WHERE site_id = ? ORDER BY checked_at DESC LIMIT %d
)`, maxCheckHistory)
_, err = s.db.Exec(s.q(pruneQuery), siteID, siteID)
return err
}
return nil
return err
}
// PruneCheckHistory trims check_history to the newest maxCheckHistory rows per
// site, across all sites, in one pass. Intended to run periodically.
func (s *SQLStore) PruneCheckHistory() error {
q := fmt.Sprintf(`DELETE FROM check_history WHERE id IN (
SELECT id FROM (
SELECT id, ROW_NUMBER() OVER (PARTITION BY site_id ORDER BY checked_at DESC, id DESC) AS rn
FROM check_history
) ranked WHERE rn > %d
)`, maxCheckHistory)
_, err := s.db.Exec(s.q(q))
return err
}
// PruneStateChanges trims state_changes to the newest maxStateChangesPerSite
// rows per site. Generous so realistic SLA windows are unaffected; bounds the
// otherwise unbounded growth of a flapping monitor's history.
func (s *SQLStore) PruneStateChanges() error {
q := fmt.Sprintf(`DELETE FROM state_changes WHERE id IN (
SELECT id FROM (
SELECT id, ROW_NUMBER() OVER (PARTITION BY site_id ORDER BY changed_at DESC, id DESC) AS rn
FROM state_changes
) ranked WHERE rn > %d
)`, maxStateChangesPerSite)
_, err := s.db.Exec(s.q(q))
return err
}
func (s *SQLStore) RegisterNode(node models.ProbeNode) error {
@@ -494,14 +513,20 @@ func (s *SQLStore) SaveAlertHealth(h models.AlertHealthRecord) error {
return err
}
// SaveLog inserts a single log row. Retention is handled by PruneLogs on a
// timer, not per-insert.
func (s *SQLStore) SaveLog(message string) error {
_, err := s.db.Exec(s.q("INSERT INTO logs (message) VALUES (?)"), message)
if err != nil {
return err
}
_, err = s.db.Exec(s.q(`DELETE FROM logs WHERE id NOT IN (
SELECT id FROM logs ORDER BY created_at DESC LIMIT 200
)`))
return err
}
// PruneLogs trims the logs table to the newest maxLogRows rows. The id DESC
// tiebreak keeps ordering deterministic when rows share a created_at second.
func (s *SQLStore) PruneLogs() error {
q := fmt.Sprintf(`DELETE FROM logs WHERE id NOT IN (
SELECT id FROM logs ORDER BY created_at DESC, id DESC LIMIT %d
)`, maxLogRows)
_, err := s.db.Exec(s.q(q))
return err
}
+64
View File
@@ -1,6 +1,7 @@
package store
import (
"fmt"
"testing"
"time"
@@ -316,6 +317,69 @@ func TestDeleteSiteCascade(t *testing.T) {
}
}
func TestPruneLogs(t *testing.T) {
s := newTestStore(t)
for i := 0; i < maxLogRows+50; i++ {
if err := s.SaveLog(fmt.Sprintf("log %d", i)); err != nil {
t.Fatalf("SaveLog: %v", err)
}
}
if err := s.PruneLogs(); err != nil {
t.Fatalf("PruneLogs: %v", err)
}
logs, err := s.LoadLogs(maxLogRows * 2)
if err != nil {
t.Fatalf("LoadLogs: %v", err)
}
if len(logs) != maxLogRows {
t.Errorf("expected %d logs after prune, got %d", maxLogRows, len(logs))
}
// Newest must survive; oldest must be gone (membership, not position —
// LoadLogs ordering ties when rows share a created_at second).
present := make(map[string]bool, len(logs))
for _, l := range logs {
present[l] = true
}
if !present[fmt.Sprintf("log %d", maxLogRows+50-1)] {
t.Error("newest log was pruned")
}
if present["log 0"] {
t.Error("oldest log survived prune")
}
}
func TestPruneCheckHistory(t *testing.T) {
s := newTestStore(t)
for i := 0; i < maxCheckHistory+5; i++ {
if err := s.SaveCheck(1, int64(i), true); err != nil {
t.Fatalf("SaveCheck site 1: %v", err)
}
}
for i := 0; i < 3; i++ {
if err := s.SaveCheck(2, int64(i), true); err != nil {
t.Fatalf("SaveCheck site 2: %v", err)
}
}
if err := s.PruneCheckHistory(); err != nil {
t.Fatalf("PruneCheckHistory: %v", err)
}
history, err := s.LoadAllHistory(maxCheckHistory * 2)
if err != nil {
t.Fatalf("LoadAllHistory: %v", err)
}
if len(history[1]) != maxCheckHistory {
t.Errorf("site 1: expected %d rows after prune, got %d", maxCheckHistory, len(history[1]))
}
if len(history[2]) != 3 {
t.Errorf("site 2: expected 3 rows untouched, got %d", len(history[2]))
}
}
func TestPruneExpiredMaintenanceWindows(t *testing.T) {
s := newTestStore(t)
+3
View File
@@ -39,11 +39,13 @@ type Store interface {
SaveCheck(siteID int, latencyNs int64, isUp bool) error
SaveCheckFromNode(siteID int, nodeID string, latencyNs int64, isUp bool) error
LoadAllHistory(limit int) (map[int][]models.CheckRecord, error)
PruneCheckHistory() error
// State Changes
SaveStateChange(siteID int, fromStatus, toStatus, errorReason string) error
GetStateChanges(siteID int, limit int) ([]models.StateChange, error)
GetStateChangesSince(siteID int, since time.Time) ([]models.StateChange, error)
PruneStateChanges() error
// Nodes
RegisterNode(node models.ProbeNode) error
@@ -59,6 +61,7 @@ type Store interface {
// Logs
SaveLog(message string) error
LoadLogs(limit int) ([]string, error)
PruneLogs() error
// Maintenance Windows
GetActiveMaintenanceWindows() ([]models.MaintenanceWindow, error)