0cad80c352
Backfill tool, crop tool, demo tape, seed data, and setup script extracted from the uptop repo for clean separation.
369 lines
11 KiB
Go
369 lines
11 KiB
Go
package main
|
|
|
|
import (
|
|
"database/sql"
|
|
"fmt"
|
|
"math/rand/v2"
|
|
"os"
|
|
"time"
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
)
|
|
|
|
func main() {
|
|
if len(os.Args) < 2 {
|
|
fmt.Fprintln(os.Stderr, "usage: backfill <db-path>")
|
|
os.Exit(1)
|
|
}
|
|
db, err := sql.Open("sqlite3", os.Args[1])
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "open: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
defer db.Close()
|
|
|
|
ids, err := loadSiteIDs(db)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "load site IDs: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
rng := rand.New(rand.NewPCG(42, 0)) //nolint:gosec // deterministic seed for reproducible demo data
|
|
now := time.Now().UTC()
|
|
|
|
if err := backfillHistory(db, rng, now, ids); err != nil {
|
|
fmt.Fprintf(os.Stderr, "history: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := backfillStateChanges(db, now, ids); err != nil {
|
|
fmt.Fprintf(os.Stderr, "state changes: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := backfillLogs(db, now); err != nil {
|
|
fmt.Fprintf(os.Stderr, "logs: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := backfillNodes(db, now); err != nil {
|
|
fmt.Fprintf(os.Stderr, "nodes: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := backfillMaintenance(db, now, ids); err != nil {
|
|
fmt.Fprintf(os.Stderr, "maintenance: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
alertIDs, err := loadAlertIDs(db)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "load alert IDs: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := backfillAlertHealth(db, now, alertIDs); err != nil {
|
|
fmt.Fprintf(os.Stderr, "alert health: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
var count int
|
|
_ = db.QueryRow("SELECT COUNT(*) FROM check_history").Scan(&count)
|
|
fmt.Printf("Backfill complete: %d check records\n", count)
|
|
|
|
var token string
|
|
if err := db.QueryRow("SELECT token FROM sites WHERE name='Nightly Backup'").Scan(&token); err == nil {
|
|
fmt.Printf("PUSH_TOKEN=%s\n", token)
|
|
}
|
|
}
|
|
|
|
func loadSiteIDs(db *sql.DB) (map[string]int, error) {
|
|
rows, err := db.Query("SELECT id, name FROM sites")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return scanNameIDs(rows)
|
|
}
|
|
|
|
func loadAlertIDs(db *sql.DB) (map[string]int, error) {
|
|
rows, err := db.Query("SELECT id, name FROM alerts")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return scanNameIDs(rows)
|
|
}
|
|
|
|
// scanNameIDs drains rows, which must yield (id, name) pairs in that column
// order, into a name -> id map. The rows are always closed before returning.
// A Scan failure returns a nil map and the error; an iteration error reported
// by rows.Err is returned together with the entries collected so far. When
// several rows share a name, the last one read wins.
func scanNameIDs(rows *sql.Rows) (map[string]int, error) {
	defer rows.Close()

	byName := map[string]int{}
	for rows.Next() {
		var (
			rowID   int
			rowName string
		)
		if scanErr := rows.Scan(&rowID, &rowName); scanErr != nil {
			return nil, scanErr
		}
		byName[rowName] = rowID
	}
	return byName, rows.Err()
}
|
|
|
|
// backfillAlertHealth writes one alert_health row per demo alert channel so the
// Alerts tab shows recent, healthy "last sent" times and green health dots
// instead of "never" across the board.
//
// Each row's last_send_at is now minus a fixed age, formatted as
// "2006-01-02 15:04:05". Channels whose name is absent from alertIDs are
// skipped. All rows are written in one transaction; the first failing
// statement aborts the function and the deferred rollback discards the rest.
func backfillAlertHealth(db *sql.DB, now time.Time, alertIDs map[string]int) error {
	const tsLayout = "2006-01-02 15:04:05"

	type channelHealth struct {
		channel string
		age     time.Duration // how long before now the last send happened
		healthy bool
		sent    int
		failed  int
	}
	seeds := []channelHealth{
		{channel: "Discord Homelab", age: 4 * time.Minute, healthy: true, sent: 37, failed: 0},
		{channel: "Slack Ops", age: 9 * time.Minute, healthy: true, sent: 21, failed: 1},
		{channel: "Ntfy Alerts", age: 1 * time.Hour, healthy: true, sent: 12, failed: 0},
		{channel: "Email Oncall", age: 3 * time.Hour, healthy: true, sent: 5, failed: 0},
	}

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	// Rollback after a successful Commit is a no-op whose error is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	insert, err := tx.Prepare("INSERT OR REPLACE INTO alert_health (alert_id, last_send_at, last_send_ok, last_error, send_count, fail_count) VALUES (?, ?, ?, ?, ?, ?)")
	if err != nil {
		return err
	}
	defer insert.Close()

	for _, seed := range seeds {
		alertID, known := alertIDs[seed.channel]
		if !known {
			continue
		}
		lastSend := now.Add(-seed.age).Format(tsLayout)
		// last_error is always seeded as the empty string.
		if _, err := insert.Exec(alertID, lastSend, seed.healthy, "", seed.sent, seed.failed); err != nil {
			return err
		}
	}
	return tx.Commit()
}
|
|
|
|
// monitorProfile describes the synthetic check history generated for one
// monitor: its name, the latency range of its healthy checks, and an optional
// window of check indices during which it is recorded as DOWN.
type monitorProfile struct {
	name         string // monitor name, used as the key into the site-ID map
	minMs, maxMs int    // healthy latency range in milliseconds: [minMs, maxMs)
	downFrom     int    // index of the first DOWN check; -1 means always up
	downTo       int    // exclusive end of the DOWN window; 60 (the total check count) means still down
}
|
|
|
|
func backfillHistory(db *sql.DB, rng *rand.Rand, now time.Time, ids map[string]int) error {
|
|
// Latency ranges reflect monitoring public services over the internet, so the
|
|
// detail histogram brackets the live latency the engine measures at capture time.
|
|
// 60 checks * 24m spacing = a 24h window; dip indices place outages within it.
|
|
profiles := []monitorProfile{
|
|
{"Nextcloud", 200, 600, 47, 48}, // brief blip ~5h ago, recovered
|
|
{"Jellyfin", 40, 180, 15, 16}, // brief blip ~18h ago, recovered
|
|
{"Home Assistant", 30, 120, -1, 0}, //
|
|
{"Gitea", 50, 200, -1, 0}, //
|
|
{"Traefik Dashboard", 60, 200, -1, 0}, //
|
|
{"Vaultwarden", 80, 250, -1, 0}, //
|
|
{"Personal Blog", 40, 160, -1, 0}, //
|
|
{"Immich", 60, 300, 30, 31}, // brief blip ~12h ago; periodic spikes below
|
|
{"Auth Portal", 30, 90, 40, 60}, // DOWN ~8h ago, still down
|
|
{"Edge Router", 5, 20, -1, 0}, // ping
|
|
{"Postgres", 1, 6, -1, 0}, // port
|
|
{"DNS Primary", 8, 30, -1, 0}, // dns
|
|
}
|
|
|
|
tx, err := db.Begin()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer func() { _ = tx.Rollback() }()
|
|
|
|
stmt, err := tx.Prepare("INSERT INTO check_history (site_id, latency_ns, is_up, checked_at) VALUES (?, ?, ?, ?)")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer stmt.Close()
|
|
|
|
const total = 60
|
|
for _, p := range profiles {
|
|
siteID, ok := ids[p.name]
|
|
if !ok {
|
|
continue
|
|
}
|
|
for i := 0; i < total; i++ {
|
|
minutesAgo := (total - i) * 24
|
|
checkedAt := now.Add(-time.Duration(minutesAgo) * time.Minute)
|
|
|
|
var latencyNs int64
|
|
isUp := true
|
|
|
|
if p.downFrom >= 0 && i >= p.downFrom && i < p.downTo {
|
|
latencyNs = 0
|
|
isUp = false
|
|
} else {
|
|
ms := p.minMs + rng.IntN(p.maxMs-p.minMs)
|
|
if p.name == "Immich" && i%17 == 0 {
|
|
ms = 250 + rng.IntN(100)
|
|
}
|
|
latencyNs = int64(ms) * 1_000_000
|
|
}
|
|
|
|
if _, err := stmt.Exec(siteID, latencyNs, isUp, checkedAt.Format("2006-01-02 15:04:05")); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return tx.Commit()
|
|
}
|
|
|
|
// backfillStateChanges inserts a fixed list of UP/DOWN transitions into the
// state_changes table in one transaction. Transitions for sites whose name is
// missing from ids are skipped. changed_at is now minus a fixed age, formatted
// as "2006-01-02 15:04:05".
func backfillStateChanges(db *sql.DB, now time.Time, ids map[string]int) error {
	const tsLayout = "2006-01-02 15:04:05"

	type transition struct {
		site, from, to, reason string
		age                    time.Duration // how long before now the change happened
	}
	// Timed to line up with the history dips (Nextcloud ~5h, Immich ~12h, Jellyfin ~18h)
	// and the still-down Auth Portal (~8h), so detail panels read coherently.
	transitions := []transition{
		{"Nextcloud", "UP", "DOWN", "read timeout", 5*time.Hour + 8*time.Minute},
		{"Nextcloud", "DOWN", "UP", "", 5 * time.Hour},
		{"Auth Portal", "UP", "DOWN", "no such host", 8 * time.Hour},
		{"Immich", "UP", "DOWN", "502 Bad Gateway", 12*time.Hour + 8*time.Minute},
		{"Immich", "DOWN", "UP", "", 12 * time.Hour},
		{"Jellyfin", "UP", "DOWN", "connection reset", 18*time.Hour + 5*time.Minute},
		{"Jellyfin", "DOWN", "UP", "", 18 * time.Hour},
	}

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	// Rollback after a successful Commit is a no-op whose error is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	insert, err := tx.Prepare("INSERT INTO state_changes (site_id, from_status, to_status, error_reason, changed_at) VALUES (?, ?, ?, ?, ?)")
	if err != nil {
		return err
	}
	defer insert.Close()

	for _, tr := range transitions {
		siteID, known := ids[tr.site]
		if !known {
			continue
		}
		changedAt := now.Add(-tr.age).Format(tsLayout)
		if _, err := insert.Exec(siteID, tr.from, tr.to, tr.reason, changedAt); err != nil {
			return err
		}
	}
	return tx.Commit()
}
|
|
|
|
// backfillLogs inserts a fixed set of engine log lines into the logs table in
// one transaction. Each message is prefixed with "[HH:MM] " derived from the
// entry's own timestamp, and created_at is that timestamp formatted as
// "2006-01-02 15:04:05".
func backfillLogs(db *sql.DB, now time.Time) error {
	const tsLayout = "2006-01-02 15:04:05"

	type seedLog struct {
		age  time.Duration // how long before now the line was logged
		text string
	}
	hms := func(h, m, s int) time.Duration {
		return time.Duration(h)*time.Hour + time.Duration(m)*time.Minute + time.Duration(s)*time.Second
	}
	// Ordered newest-first. The bracket time is derived from each entry's age (not
	// hardcoded), so the Logs view, which renders the leading [HH:MM], reads
	// chronologically. Outage times line up with the seeded state changes and
	// history dips.
	entries := []seedLog{
		{hms(5, 0, 0), "Monitor 'Nextcloud' recovered (was down 8m)"},
		{hms(5, 8, 0), "Monitor 'Nextcloud' confirmed DOWN: read timeout"},
		{hms(5, 8, 30), "Monitor 'Nextcloud' failed check 2/2"},
		{hms(5, 9, 0), "Monitor 'Nextcloud' failed check 1/2"},
		{hms(8, 0, 0), "Monitor 'Auth Portal' confirmed DOWN: no such host"},
		{hms(8, 0, 30), "Monitor 'Auth Portal' failed check 2/2"},
		{hms(8, 1, 0), "Monitor 'Auth Portal' failed check 1/2"},
		{hms(12, 0, 0), "Monitor 'Immich' recovered (was down 8m)"},
		{hms(12, 8, 0), "Monitor 'Immich' confirmed DOWN: 502 Bad Gateway"},
		{hms(12, 8, 30), "Monitor 'Immich' failed check 3/3"},
		{hms(12, 9, 0), "Monitor 'Immich' failed check 2/3"},
		{hms(12, 9, 30), "Monitor 'Immich' failed check 1/3"},
		{hms(18, 0, 0), "Monitor 'Jellyfin' recovered (was down 5m)"},
		{hms(18, 5, 0), "Monitor 'Jellyfin' confirmed DOWN: connection reset"},
		{hms(18, 5, 30), "Monitor 'Jellyfin' failed check 2/2"},
		{hms(18, 6, 0), "Monitor 'Jellyfin' failed check 1/2"},
		{hms(20, 0, 0), "SSL warning: certificate for 'Personal Blog' expires in 9 days"},
		{hms(22, 0, 0), "Engine RESUMED (Active)"},
		{hms(22, 0, 5), "Loaded check history from database"},
	}

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	// Rollback after a successful Commit is a no-op whose error is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	insert, err := tx.Prepare("INSERT INTO logs (message, created_at) VALUES (?, ?)")
	if err != nil {
		return err
	}
	defer insert.Close()

	for _, entry := range entries {
		loggedAt := now.Add(-entry.age)
		// Bracket in local time to match the engine's live AddLog timestamps;
		// created_at keeps the zone of now (UTC when called from main) to match
		// the store's CURRENT_TIMESTAMP ordering.
		message := fmt.Sprintf("[%s] %s", loggedAt.Local().Format("15:04"), entry.text)
		if _, err := insert.Exec(message, loggedAt.Format(tsLayout)); err != nil {
			return err
		}
	}
	return tx.Commit()
}
|
|
|
|
// backfillNodes upserts the three demo probe nodes into the nodes table, each
// with last_seen set to now (formatted as "2006-01-02 15:04:05") and a fixed
// version string.
//
// The rows are written through one prepared statement inside a single
// transaction, like the other seeders in this file, so a failure part-way
// through leaves no partial node set behind. It returns the first error from
// Begin, Prepare, Exec, or Commit.
func backfillNodes(db *sql.DB, now time.Time) error {
	// Multiple regions to show distributed probes. All seen "now" so they read ONLINE
	// for the whole capture window (kept under the 60s freshness threshold by the tape).
	nodes := []struct{ id, name, region string }{
		{"node-use1", "leader", "us-east"},
		{"node-euw1", "probe-eu", "eu-west"},
		{"node-apse1", "probe-ap", "ap-southeast"},
	}

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	// Rollback after a successful Commit is a no-op whose error is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare("INSERT OR REPLACE INTO nodes (id, name, region, last_seen, version) VALUES (?, ?, ?, ?, ?)")
	if err != nil {
		return err
	}
	defer stmt.Close()

	ts := now.Format("2006-01-02 15:04:05")
	for _, n := range nodes {
		if _, err := stmt.Exec(n.id, n.name, n.region, ts, "2026.05.1"); err != nil {
			return err
		}
	}
	return tx.Commit()
}
|
|
|
|
// backfillMaintenance seeds two maintenance windows in one transaction:
//
//   - a past two-hour "Jellyfin upgrade" window that started three days before
//     now, attached to the Jellyfin site; and
//   - a future four-hour "Network switch replacement" window that starts two
//     days after now, inserted with monitor_id 0.
//
// The Jellyfin window is skipped when ids has no "Jellyfin" entry, matching how
// the other seeders skip unknown sites. Without that check the missing key
// would read as 0 and the upgrade window would be stored under the same
// monitor_id as the switch window. Timestamps are formatted as
// "2006-01-02 15:04:05". It returns the first error from Begin, Prepare, Exec,
// or Commit.
func backfillMaintenance(db *sql.DB, now time.Time, ids map[string]int) error {
	const tsLayout = "2006-01-02 15:04:05"

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	// Rollback after a successful Commit is a no-op whose error is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare("INSERT INTO maintenance_windows (monitor_id, title, description, type, start_time, end_time, created_by) VALUES (?, ?, ?, ?, ?, ?, ?)")
	if err != nil {
		return err
	}
	defer stmt.Close()

	if jellyfinID, ok := ids["Jellyfin"]; ok {
		past := now.Add(-3 * 24 * time.Hour)
		if _, err := stmt.Exec(jellyfinID, "Jellyfin upgrade", "Upgrade to v10.10 + plugin updates", "maintenance",
			past.Format(tsLayout),
			past.Add(2*time.Hour).Format(tsLayout),
			"admin"); err != nil {
			return err
		}
	}

	// monitor_id 0 is not taken from ids; presumably it marks a window that is
	// not tied to one monitor (confirm against the schema).
	future := now.Add(2 * 24 * time.Hour)
	if _, err := stmt.Exec(0, "Network switch replacement", "Replacing core switch in rack 2", "maintenance",
		future.Format(tsLayout),
		future.Add(4*time.Hour).Format(tsLayout),
		"admin"); err != nil {
		return err
	}

	return tx.Commit()
}
|