03cbe283df
Rework the VHS demo so the README screenshots actually entice a download. Demo data / tooling: - seed.yaml: real, reachable service URLs (detail now shows nextcloud.com, not example.com); Auth Portal -> non-resolving home.arpa host so it reads as a believable, reliably-DOWN monitor - backfill: transient outages for Nextcloud/Jellyfin/Immich aligned with their state changes (uptime % now matches); log timestamps derived from now so the Logs view reads chronologically; real SSL warning; three probe nodes across regions; seeded alert send health - demo.tape: shorter warm-up, added Nodes + theme captures, ordered so every shot stays inside the 60s node-freshness window (consistent probe count) - vhs/crop: new tool to trim the empty terminal border around each screenshot - setup.sh: build backfill up front for deterministic timing; UPTOP_DEMO=1 Supporting code: - persist alert send health (new alert_health table, load on startup, best-effort save on send) so health/last-sent survive restarts - latency Min/Avg/Max ignore failed checks (no more "Min 0ms") - correct "probe"/"probes" pluralization - stable status dot instead of an animated spinner under UPTOP_DEMO
369 lines
11 KiB
Go
369 lines
11 KiB
Go
package main
|
|
|
|
import (
|
|
"database/sql"
|
|
"fmt"
|
|
"math/rand/v2"
|
|
"os"
|
|
"time"
|
|
|
|
_ "github.com/mattn/go-sqlite3"
|
|
)
|
|
|
|
func main() {
|
|
if len(os.Args) < 2 {
|
|
fmt.Fprintln(os.Stderr, "usage: backfill <db-path>")
|
|
os.Exit(1)
|
|
}
|
|
db, err := sql.Open("sqlite3", os.Args[1])
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "open: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
defer db.Close()
|
|
|
|
ids, err := loadSiteIDs(db)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "load site IDs: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
rng := rand.New(rand.NewPCG(42, 0)) //nolint:gosec // deterministic seed for reproducible demo data
|
|
now := time.Now().UTC()
|
|
|
|
if err := backfillHistory(db, rng, now, ids); err != nil {
|
|
fmt.Fprintf(os.Stderr, "history: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := backfillStateChanges(db, now, ids); err != nil {
|
|
fmt.Fprintf(os.Stderr, "state changes: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := backfillLogs(db, now); err != nil {
|
|
fmt.Fprintf(os.Stderr, "logs: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := backfillNodes(db, now); err != nil {
|
|
fmt.Fprintf(os.Stderr, "nodes: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := backfillMaintenance(db, now, ids); err != nil {
|
|
fmt.Fprintf(os.Stderr, "maintenance: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
alertIDs, err := loadAlertIDs(db)
|
|
if err != nil {
|
|
fmt.Fprintf(os.Stderr, "load alert IDs: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
if err := backfillAlertHealth(db, now, alertIDs); err != nil {
|
|
fmt.Fprintf(os.Stderr, "alert health: %v\n", err)
|
|
os.Exit(1)
|
|
}
|
|
|
|
var count int
|
|
_ = db.QueryRow("SELECT COUNT(*) FROM check_history").Scan(&count)
|
|
fmt.Printf("Backfill complete: %d check records\n", count)
|
|
|
|
var token string
|
|
if err := db.QueryRow("SELECT token FROM sites WHERE name='Nightly Backup'").Scan(&token); err == nil {
|
|
fmt.Printf("PUSH_TOKEN=%s\n", token)
|
|
}
|
|
}
|
|
|
|
func loadSiteIDs(db *sql.DB) (map[string]int, error) {
|
|
rows, err := db.Query("SELECT id, name FROM sites")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return scanNameIDs(rows)
|
|
}
|
|
|
|
func loadAlertIDs(db *sql.DB) (map[string]int, error) {
|
|
rows, err := db.Query("SELECT id, name FROM alerts")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return scanNameIDs(rows)
|
|
}
|
|
|
|
// scanNameIDs drains a two-column (id, name) result set into a map keyed by
// name and always closes rows before returning.
//
// If several rows share a name, the id of the last one read wins. A Scan
// failure returns a nil map; an iteration error surfaced by rows.Err is
// returned together with whatever entries were collected before it.
func scanNameIDs(rows *sql.Rows) (map[string]int, error) {
	defer rows.Close()

	var (
		id     int
		name   string
		byName = map[string]int{}
	)
	for rows.Next() {
		if scanErr := rows.Scan(&id, &name); scanErr != nil {
			return nil, scanErr
		}
		byName[name] = id
	}
	return byName, rows.Err()
}
|
|
|
|
// backfillAlertHealth writes one alert_health row per known demo alert so the
// Alerts tab shows recent "last sent" times and healthy status instead of
// "never" everywhere.
//
// now is the reference instant; each row's last_send_at is now minus a fixed
// age. alertIDs maps alert name -> id; seed entries whose name is not in the
// map are skipped. All rows are written in a single transaction with
// INSERT OR REPLACE, so an existing row for the same alert is overwritten.
func backfillAlertHealth(db *sql.DB, now time.Time, alertIDs map[string]int) error {
	type seed struct {
		alert     string
		age       time.Duration // how long before now the last send happened
		lastOK    bool
		sendCount int
		failCount int
	}
	seeds := [...]seed{
		{alert: "Discord Homelab", age: 4 * time.Minute, lastOK: true, sendCount: 37, failCount: 0},
		{alert: "Slack Ops", age: 9 * time.Minute, lastOK: true, sendCount: 21, failCount: 1},
		{alert: "Ntfy Alerts", age: 1 * time.Hour, lastOK: true, sendCount: 12, failCount: 0},
		{alert: "Email Oncall", age: 3 * time.Hour, lastOK: true, sendCount: 5, failCount: 0},
	}

	const upsert = "INSERT OR REPLACE INTO alert_health (alert_id, last_send_at, last_send_ok, last_error, send_count, fail_count) VALUES (?, ?, ?, ?, ?, ?)"

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	// Harmless after a successful Commit: Rollback then just reports that the
	// transaction is already finished, which is deliberately ignored.
	defer func() { _ = tx.Rollback() }()

	insert, err := tx.Prepare(upsert)
	if err != nil {
		return err
	}
	defer insert.Close()

	for i := range seeds {
		s := &seeds[i]
		alertID, known := alertIDs[s.alert]
		if !known {
			continue
		}
		lastSent := now.Add(-s.age).Format(time.DateTime)
		// last_error is seeded as the empty string for every alert.
		if _, err = insert.Exec(alertID, lastSent, s.lastOK, "", s.sendCount, s.failCount); err != nil {
			return err
		}
	}
	return tx.Commit()
}
|
|
|
|
// monitorProfile describes how synthetic check history is generated for one
// monitor: which site it belongs to, the latency band of successful checks,
// and an optional window of failed checks.
//
// Field order matters: profiles are written as positional composite literals.
type monitorProfile struct {
	// name is the site name used to look up the site ID.
	name string

	// minMs and maxMs bound the random latency of an UP check, in milliseconds.
	minMs, maxMs int

	// downFrom is the index of the first DOWN check (-1 = always up).
	downFrom int

	// downTo is the exclusive end of the DOWN window; use 60 (the total number
	// of checks) for a monitor that is still down.
	downTo int
}
|
|
|
|
func backfillHistory(db *sql.DB, rng *rand.Rand, now time.Time, ids map[string]int) error {
|
|
// Latency ranges reflect monitoring public services over the internet, so the
|
|
// detail histogram brackets the live latency the engine measures at capture time.
|
|
// 60 checks * 24m spacing = a 24h window; dip indices place outages within it.
|
|
profiles := []monitorProfile{
|
|
{"Nextcloud", 200, 600, 47, 48}, // brief blip ~5h ago, recovered
|
|
{"Jellyfin", 40, 180, 15, 16}, // brief blip ~18h ago, recovered
|
|
{"Home Assistant", 30, 120, -1, 0}, //
|
|
{"Gitea", 50, 200, -1, 0}, //
|
|
{"Traefik Dashboard", 60, 200, -1, 0}, //
|
|
{"Vaultwarden", 80, 250, -1, 0}, //
|
|
{"Personal Blog", 40, 160, -1, 0}, //
|
|
{"Immich", 60, 300, 30, 31}, // brief blip ~12h ago; periodic spikes below
|
|
{"Auth Portal", 30, 90, 40, 60}, // DOWN ~8h ago, still down
|
|
{"Edge Router", 5, 20, -1, 0}, // ping
|
|
{"Postgres", 1, 6, -1, 0}, // port
|
|
{"DNS Primary", 8, 30, -1, 0}, // dns
|
|
}
|
|
|
|
tx, err := db.Begin()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer func() { _ = tx.Rollback() }()
|
|
|
|
stmt, err := tx.Prepare("INSERT INTO check_history (site_id, latency_ns, is_up, checked_at) VALUES (?, ?, ?, ?)")
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer stmt.Close()
|
|
|
|
const total = 60
|
|
for _, p := range profiles {
|
|
siteID, ok := ids[p.name]
|
|
if !ok {
|
|
continue
|
|
}
|
|
for i := 0; i < total; i++ {
|
|
minutesAgo := (total - i) * 24
|
|
checkedAt := now.Add(-time.Duration(minutesAgo) * time.Minute)
|
|
|
|
var latencyNs int64
|
|
isUp := true
|
|
|
|
if p.downFrom >= 0 && i >= p.downFrom && i < p.downTo {
|
|
latencyNs = 0
|
|
isUp = false
|
|
} else {
|
|
ms := p.minMs + rng.IntN(p.maxMs-p.minMs)
|
|
if p.name == "Immich" && i%17 == 0 {
|
|
ms = 250 + rng.IntN(100)
|
|
}
|
|
latencyNs = int64(ms) * 1_000_000
|
|
}
|
|
|
|
if _, err := stmt.Exec(siteID, latencyNs, isUp, checkedAt.Format("2006-01-02 15:04:05")); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
return tx.Commit()
|
|
}
|
|
|
|
// backfillStateChanges inserts the UP<->DOWN transitions that accompany the
// seeded history: short outages for Nextcloud, Immich and Jellyfin, plus an
// Auth Portal outage with no recovery row.
//
// now is the reference instant and ids maps site name -> id; transitions for
// sites missing from ids are skipped. All rows go in one transaction.
func backfillStateChanges(db *sql.DB, now time.Time, ids map[string]int) error {
	type transition struct {
		site     string
		from, to string
		reason   string        // empty for recoveries
		age      time.Duration // how long before now the change happened
	}
	// Timed to line up with the history dips (Nextcloud ~5h, Immich ~12h, Jellyfin ~18h)
	// and the still-down Auth Portal (~8h), so detail panels read coherently.
	transitions := []transition{
		{"Nextcloud", "UP", "DOWN", "read timeout", 5*time.Hour + 8*time.Minute},
		{"Nextcloud", "DOWN", "UP", "", 5 * time.Hour},
		{"Auth Portal", "UP", "DOWN", "no such host", 8 * time.Hour},
		{"Immich", "UP", "DOWN", "502 Bad Gateway", 12*time.Hour + 8*time.Minute},
		{"Immich", "DOWN", "UP", "", 12 * time.Hour},
		{"Jellyfin", "UP", "DOWN", "connection reset", 18*time.Hour + 5*time.Minute},
		{"Jellyfin", "DOWN", "UP", "", 18 * time.Hour},
	}

	const insertSQL = "INSERT INTO state_changes (site_id, from_status, to_status, error_reason, changed_at) VALUES (?, ?, ?, ?, ?)"

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	defer func() { _ = tx.Rollback() }()

	insert, err := tx.Prepare(insertSQL)
	if err != nil {
		return err
	}
	defer insert.Close()

	for _, tr := range transitions {
		siteID, known := ids[tr.site]
		if !known {
			continue
		}
		changedAt := now.Add(-tr.age).Format(time.DateTime)
		if _, err = insert.Exec(siteID, tr.from, tr.to, tr.reason, changedAt); err != nil {
			return err
		}
	}
	return tx.Commit()
}
|
|
|
|
// backfillLogs inserts a fixed set of demo log lines into the logs table, each
// timestamped relative to now, in one transaction.
//
// Every stored message is prefixed with "[HH:MM] " computed from the entry's
// own timestamp converted to local time, while created_at is written from the
// unconverted timestamp.
func backfillLogs(db *sql.DB, now time.Time) error {
	type entry struct {
		age  time.Duration // how long before now the line was "logged"
		text string
	}
	hms := func(h, m, s int) time.Duration {
		return time.Duration(h)*time.Hour + time.Duration(m)*time.Minute + time.Duration(s)*time.Second
	}
	// Ordered newest-first. The bracket time is derived from each entry's timestamp
	// (not hardcoded), so the Logs view — which renders the leading [HH:MM] — reads
	// chronologically. Outage times line up with the seeded state changes and
	// history dips.
	entries := []entry{
		{hms(5, 0, 0), "Monitor 'Nextcloud' recovered (was down 8m)"},
		{hms(5, 8, 0), "Monitor 'Nextcloud' confirmed DOWN: read timeout"},
		{hms(5, 8, 30), "Monitor 'Nextcloud' failed check 2/2"},
		{hms(5, 9, 0), "Monitor 'Nextcloud' failed check 1/2"},
		{hms(8, 0, 0), "Monitor 'Auth Portal' confirmed DOWN: no such host"},
		{hms(8, 0, 30), "Monitor 'Auth Portal' failed check 2/2"},
		{hms(8, 1, 0), "Monitor 'Auth Portal' failed check 1/2"},
		{hms(12, 0, 0), "Monitor 'Immich' recovered (was down 8m)"},
		{hms(12, 8, 0), "Monitor 'Immich' confirmed DOWN: 502 Bad Gateway"},
		{hms(12, 8, 30), "Monitor 'Immich' failed check 3/3"},
		{hms(12, 9, 0), "Monitor 'Immich' failed check 2/3"},
		{hms(12, 9, 30), "Monitor 'Immich' failed check 1/3"},
		{hms(18, 0, 0), "Monitor 'Jellyfin' recovered (was down 5m)"},
		{hms(18, 5, 0), "Monitor 'Jellyfin' confirmed DOWN: connection reset"},
		{hms(18, 5, 30), "Monitor 'Jellyfin' failed check 2/2"},
		{hms(18, 6, 0), "Monitor 'Jellyfin' failed check 1/2"},
		{hms(20, 0, 0), "SSL warning: certificate for 'Personal Blog' expires in 9 days"},
		{hms(22, 0, 0), "Engine RESUMED (Active)"},
		{hms(22, 0, 5), "Loaded check history from database"},
	}

	const insertSQL = "INSERT INTO logs (message, created_at) VALUES (?, ?)"

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	defer func() { _ = tx.Rollback() }()

	insert, err := tx.Prepare(insertSQL)
	if err != nil {
		return err
	}
	defer insert.Close()

	for _, e := range entries {
		at := now.Add(-e.age)
		// Bracket in local time to match the engine's live AddLog timestamps;
		// created_at stays UTC to match the store's CURRENT_TIMESTAMP ordering.
		message := fmt.Sprintf("[%s] %s", at.Local().Format("15:04"), e.text)
		if _, err = insert.Exec(message, at.Format(time.DateTime)); err != nil {
			return err
		}
	}
	return tx.Commit()
}
|
|
|
|
// backfillNodes upserts three demo probe nodes, one per region, all with
// last_seen set to now and the same version string.
//
// Each node is written with its own INSERT OR REPLACE statement directly on db
// (no surrounding transaction); the first failure is returned immediately.
func backfillNodes(db *sql.DB, now time.Time) error {
	const (
		upsert  = "INSERT OR REPLACE INTO nodes (id, name, region, last_seen, version) VALUES (?, ?, ?, ?, ?)"
		version = "2026.05.1"
	)
	type node struct{ id, name, region string }

	// Multiple regions to show distributed probes. All seen "now" so they read ONLINE
	// for the whole capture window (kept under the 60s freshness threshold by the tape).
	fleet := [...]node{
		{id: "node-use1", name: "leader", region: "us-east"},
		{id: "node-euw1", name: "probe-eu", region: "eu-west"},
		{id: "node-apse1", name: "probe-ap", region: "ap-southeast"},
	}

	lastSeen := now.Format(time.DateTime)
	for i := range fleet {
		n := fleet[i]
		_, err := db.Exec(upsert, n.id, n.name, n.region, lastSeen, version)
		if err != nil {
			return err
		}
	}
	return nil
}
|
|
|
|
// backfillMaintenance seeds two maintenance windows in one transaction:
//
//   - a finished 2h "Jellyfin upgrade" window that started 3 days before now,
//     attached to the Jellyfin site;
//   - an upcoming 4h "Network switch replacement" window starting 2 days after
//     now, stored with monitor_id 0.
//
// ids maps site name -> id. When it has no "Jellyfin" entry the Jellyfin window
// is skipped, matching how the other backfill steps treat unknown sites.
// Previously the missing lookup yielded 0 and the window was inserted with the
// same monitor_id as the non-site-specific window.
// NOTE(review): monitor_id 0 presumably means "applies to all monitors" —
// confirm against the maintenance_windows schema.
func backfillMaintenance(db *sql.DB, now time.Time, ids map[string]int) error {
	const insertSQL = "INSERT INTO maintenance_windows (monitor_id, title, description, type, start_time, end_time, created_by) VALUES (?, ?, ?, ?, ?, ?, ?)"

	tx, err := db.Begin()
	if err != nil {
		return err
	}
	defer func() { _ = tx.Rollback() }()

	stmt, err := tx.Prepare(insertSQL)
	if err != nil {
		return err
	}
	defer stmt.Close()

	// addWindow inserts one "maintenance" window created by "admin" that runs
	// from start for the given length.
	addWindow := func(monitorID int, title, description string, start time.Time, length time.Duration) error {
		_, execErr := stmt.Exec(monitorID, title, description, "maintenance",
			start.Format(time.DateTime),
			start.Add(length).Format(time.DateTime),
			"admin")
		return execErr
	}

	if jellyfinID, ok := ids["Jellyfin"]; ok {
		past := now.Add(-3 * 24 * time.Hour)
		if err := addWindow(jellyfinID, "Jellyfin upgrade", "Upgrade to v10.10 + plugin updates", past, 2*time.Hour); err != nil {
			return err
		}
	}

	future := now.Add(2 * 24 * time.Hour)
	if err := addWindow(0, "Network switch replacement", "Replacing core switch in rack 2", future, 4*time.Hour); err != nil {
		return err
	}

	return tx.Commit()
}
|