feat(cluster): add probe execution mode, check extraction, and result aggregation

Phase 2 of distributed probing:
- Extract check logic into standalone RunCheck() for use by probes
- Add probe cluster mode: stateless nodes that fetch assignments, execute
  checks, and report results to the leader
- Add multi-node result aggregation with configurable strategy
  (any-down, majority-down, all-down)
- Leader ingests probe results into engine live state and triggers alerts
- New env vars: UPKEEP_NODE_ID, UPKEEP_NODE_NAME, UPKEEP_NODE_REGION,
  UPKEEP_AGG_STRATEGY
- Example docker-compose.probe.yml with leader + 2 regional probes
This commit is contained in:
2026-05-16 11:19:57 -04:00
parent ca9faa0acd
commit ca5a42314f
8 changed files with 592 additions and 215 deletions
+44
View File
@@ -0,0 +1,44 @@
package monitor
import "time"
type AggregationStrategy string
const (
AggAnyDown AggregationStrategy = "any-down"
AggMajorityDown AggregationStrategy = "majority-down"
AggAllDown AggregationStrategy = "all-down"
)
type NodeResult struct {
NodeID string
IsUp bool
LatencyNs int64
CheckedAt time.Time
}
func AggregateStatus(results []NodeResult, strategy AggregationStrategy) (isUp bool, avgLatencyNs int64) {
if len(results) == 0 {
return true, 0
}
upCount := 0
var totalLatency int64
for _, r := range results {
if r.IsUp {
upCount++
}
totalLatency += r.LatencyNs
}
avgLatencyNs = totalLatency / int64(len(results))
switch strategy {
case AggMajorityDown:
isUp = upCount > len(results)/2
case AggAllDown:
isUp = upCount > 0
default:
isUp = upCount == len(results)
}
return
}
+218
View File
@@ -0,0 +1,218 @@
package monitor
import (
"context"
"go-upkeep/internal/models"
"net"
"net/http"
"strconv"
"strings"
"time"
"github.com/miekg/dns"
probing "github.com/prometheus-community/pro-bing"
)
type CheckResult struct {
SiteID int
Status string // "UP", "DOWN", "SSL EXP"
StatusCode int
LatencyNs int64
HasSSL bool
CertExpiry time.Time
}
func RunCheck(site models.Site, strict, insecure *http.Client, globalInsecure bool) CheckResult {
switch site.Type {
case "http":
return runHTTPCheck(site, strict, insecure, globalInsecure)
case "ping":
return runPingCheck(site)
case "port":
return runPortCheck(site)
case "dns":
return runDNSCheck(site)
default:
return CheckResult{SiteID: site.ID, Status: "DOWN"}
}
}
func runHTTPCheck(site models.Site, strict, insecure *http.Client, globalInsecure bool) CheckResult {
method := site.Method
if method == "" {
method = "GET"
}
timeout := siteTimeout(site)
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
req, err := http.NewRequestWithContext(ctx, method, site.URL, nil)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN"}
}
client := strict
if globalInsecure || site.IgnoreTLS {
client = insecure
}
start := time.Now()
resp, err := client.Do(req)
latency := time.Since(start)
result := CheckResult{
SiteID: site.ID,
Status: "UP",
LatencyNs: latency.Nanoseconds(),
}
if err != nil {
result.Status = "DOWN"
return result
}
defer resp.Body.Close()
result.StatusCode = resp.StatusCode
if !isCodeAccepted(resp.StatusCode, site.AcceptedCodes) {
result.Status = "DOWN"
}
if site.CheckSSL && resp.TLS != nil && len(resp.TLS.PeerCertificates) > 0 {
result.HasSSL = true
cert := resp.TLS.PeerCertificates[0]
result.CertExpiry = cert.NotAfter
if time.Now().After(cert.NotAfter) {
result.Status = "SSL EXP"
}
}
return result
}
func runPingCheck(site models.Site) CheckResult {
host := site.Hostname
if host == "" {
host = site.URL
}
pinger, err := probing.NewPinger(host)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN"}
}
pinger.Count = 1
pinger.Timeout = siteTimeout(site)
pinger.SetPrivileged(false)
start := time.Now()
err = pinger.Run()
latency := time.Since(start)
if err != nil || pinger.Statistics().PacketsRecv == 0 {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
}
stats := pinger.Statistics()
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: stats.AvgRtt.Nanoseconds()}
}
func runPortCheck(site models.Site) CheckResult {
host := site.Hostname
if host == "" {
host = site.URL
}
addr := net.JoinHostPort(host, strconv.Itoa(site.Port))
timeout := siteTimeout(site)
start := time.Now()
conn, err := net.DialTimeout("tcp", addr, timeout)
latency := time.Since(start)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
}
conn.Close()
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()}
}
func runDNSCheck(site models.Site) CheckResult {
host := site.Hostname
if host == "" {
host = site.URL
}
server := site.DNSServer
if server == "" {
server = "1.1.1.1"
}
if _, _, err := net.SplitHostPort(server); err != nil {
server = net.JoinHostPort(server, "53")
}
qtype := dns.TypeA
switch site.DNSResolveType {
case "AAAA":
qtype = dns.TypeAAAA
case "MX":
qtype = dns.TypeMX
case "CNAME":
qtype = dns.TypeCNAME
case "TXT":
qtype = dns.TypeTXT
case "NS":
qtype = dns.TypeNS
case "SOA":
qtype = dns.TypeSOA
case "SRV":
qtype = dns.TypeSRV
case "PTR":
qtype = dns.TypePTR
}
m := new(dns.Msg)
m.SetQuestion(dns.Fqdn(host), qtype)
c := new(dns.Client)
c.Timeout = siteTimeout(site)
start := time.Now()
r, _, err := c.Exchange(m, server)
latency := time.Since(start)
if err != nil {
return CheckResult{SiteID: site.ID, Status: "DOWN", LatencyNs: latency.Nanoseconds()}
}
if r.Rcode != dns.RcodeSuccess {
return CheckResult{SiteID: site.ID, Status: "DOWN", StatusCode: r.Rcode, LatencyNs: latency.Nanoseconds()}
}
return CheckResult{SiteID: site.ID, Status: "UP", LatencyNs: latency.Nanoseconds()}
}
func siteTimeout(site models.Site) time.Duration {
if site.Timeout > 0 {
return time.Duration(site.Timeout) * time.Second
}
return 5 * time.Second
}
func isCodeAccepted(code int, accepted string) bool {
if accepted == "" {
return code >= 200 && code < 300
}
for _, part := range strings.Split(accepted, ",") {
part = strings.TrimSpace(part)
if strings.Contains(part, "-") {
bounds := strings.SplitN(part, "-", 2)
lo, err1 := strconv.Atoi(strings.TrimSpace(bounds[0]))
hi, err2 := strconv.Atoi(strings.TrimSpace(bounds[1]))
if err1 == nil && err2 == nil && code >= lo && code <= hi {
return true
}
} else {
if v, err := strconv.Atoi(part); err == nil && code == v {
return true
}
}
}
return false
}
+64 -215
View File
@@ -7,15 +7,9 @@ import (
"go-upkeep/internal/alert"
"go-upkeep/internal/models"
"go-upkeep/internal/store"
"net"
"net/http"
"strconv"
"strings"
"sync"
"time"
"github.com/miekg/dns"
probing "github.com/prometheus-community/pro-bing"
)
type Engine struct {
@@ -33,6 +27,10 @@ type Engine struct {
tokenIndex map[string]int
probeResultsMu sync.RWMutex
probeResults map[int]map[string]NodeResult
aggStrategy AggregationStrategy
db store.Store
insecureSkipVerify bool
strictClient *http.Client
@@ -41,11 +39,13 @@ type Engine struct {
func NewEngine(s store.Store) *Engine {
return &Engine{
liveState: make(map[int]models.Site),
histories: make(map[int]*SiteHistory),
tokenIndex: make(map[string]int),
isActive: true,
db: s,
liveState: make(map[int]models.Site),
histories: make(map[int]*SiteHistory),
tokenIndex: make(map[string]int),
probeResults: make(map[int]map[string]NodeResult),
aggStrategy: AggAnyDown,
isActive: true,
db: s,
strictClient: &http.Client{
Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: false}},
},
@@ -310,19 +310,20 @@ func (e *Engine) checkByID(id int) {
if !exists || site.Paused {
return
}
switch site.Type {
case "http":
e.checkHTTP(site)
case "push":
e.checkPush(site)
case "ping":
e.checkPing(site)
case "port":
e.checkPort(site)
case "dns":
e.checkDNS(site)
case "group":
e.checkGroup(site)
default:
result := RunCheck(site, e.strictClient, e.insecureClient, e.insecureSkipVerify)
updatedSite := site
updatedSite.HasSSL = result.HasSSL
updatedSite.CertExpiry = result.CertExpiry
updatedSite.Latency = time.Duration(result.LatencyNs)
updatedSite.LastCheck = time.Now()
e.handleStatusChange(updatedSite, result.Status, result.StatusCode, time.Duration(result.LatencyNs))
}
}
@@ -335,61 +336,6 @@ func (e *Engine) checkPush(site models.Site) {
}
}
func (e *Engine) checkHTTP(site models.Site) {
method := site.Method
if method == "" {
method = "GET"
}
timeout := siteTimeout(site)
ctx, cancel := context.WithTimeout(context.Background(), timeout)
defer cancel()
req, err := http.NewRequestWithContext(ctx, method, site.URL, nil)
if err != nil {
e.handleStatusChange(site, "DOWN", 0, 0)
return
}
client := e.strictClient
if e.insecureSkipVerify || site.IgnoreTLS {
client = e.insecureClient
}
start := time.Now()
resp, err := client.Do(req)
latency := time.Since(start)
rawStatus := "UP"
rawCode := 0
var certExpiry time.Time
hasSSL := false
if err != nil {
rawStatus = "DOWN"
} else {
defer resp.Body.Close()
rawCode = resp.StatusCode
if !isCodeAccepted(rawCode, site.AcceptedCodes) {
rawStatus = "DOWN"
}
if site.CheckSSL && resp.TLS != nil && len(resp.TLS.PeerCertificates) > 0 {
hasSSL = true
cert := resp.TLS.PeerCertificates[0]
certExpiry = cert.NotAfter
if time.Now().After(cert.NotAfter) {
rawStatus = "SSL EXP"
}
}
}
updatedSite := site
updatedSite.HasSSL = hasSSL
updatedSite.CertExpiry = certExpiry
updatedSite.Latency = latency
updatedSite.LastCheck = time.Now()
e.handleStatusChange(updatedSite, rawStatus, rawCode, latency)
}
func (e *Engine) handleStatusChange(site models.Site, rawStatus string, code int, latency time.Duration) {
if !e.IsActive() {
return
@@ -462,94 +408,6 @@ func (e *Engine) triggerAlert(alertID int, title, message string) {
}
}
func siteTimeout(site models.Site) time.Duration {
if site.Timeout > 0 {
return time.Duration(site.Timeout) * time.Second
}
return 5 * time.Second
}
func isCodeAccepted(code int, accepted string) bool {
if accepted == "" {
return code >= 200 && code < 300
}
for _, part := range strings.Split(accepted, ",") {
part = strings.TrimSpace(part)
if strings.Contains(part, "-") {
bounds := strings.SplitN(part, "-", 2)
lo, err1 := strconv.Atoi(strings.TrimSpace(bounds[0]))
hi, err2 := strconv.Atoi(strings.TrimSpace(bounds[1]))
if err1 == nil && err2 == nil && code >= lo && code <= hi {
return true
}
} else {
if v, err := strconv.Atoi(part); err == nil && code == v {
return true
}
}
}
return false
}
func (e *Engine) checkPing(site models.Site) {
host := site.Hostname
if host == "" {
host = site.URL
}
pinger, err := probing.NewPinger(host)
if err != nil {
e.handleStatusChange(site, "DOWN", 0, 0)
e.AddLog(fmt.Sprintf("Ping '%s' resolve failed: %v", site.Name, err))
return
}
pinger.Count = 1
pinger.Timeout = siteTimeout(site)
pinger.SetPrivileged(false)
start := time.Now()
err = pinger.Run()
latency := time.Since(start)
if err != nil || pinger.Statistics().PacketsRecv == 0 {
updatedSite := site
updatedSite.Latency = latency
updatedSite.LastCheck = time.Now()
e.handleStatusChange(updatedSite, "DOWN", 0, latency)
return
}
stats := pinger.Statistics()
updatedSite := site
updatedSite.Latency = stats.AvgRtt
updatedSite.LastCheck = time.Now()
e.handleStatusChange(updatedSite, "UP", 0, stats.AvgRtt)
}
func (e *Engine) checkPort(site models.Site) {
host := site.Hostname
if host == "" {
host = site.URL
}
addr := net.JoinHostPort(host, strconv.Itoa(site.Port))
timeout := siteTimeout(site)
start := time.Now()
conn, err := net.DialTimeout("tcp", addr, timeout)
latency := time.Since(start)
updatedSite := site
updatedSite.Latency = latency
updatedSite.LastCheck = time.Now()
if err != nil {
e.handleStatusChange(updatedSite, "DOWN", 0, latency)
return
}
conn.Close()
e.handleStatusChange(updatedSite, "UP", 0, latency)
}
func (e *Engine) checkGroup(site models.Site) {
e.mu.RLock()
status := "UP"
@@ -588,63 +446,54 @@ func (e *Engine) checkGroup(site models.Site) {
e.mu.Unlock()
}
func (e *Engine) checkDNS(site models.Site) {
host := site.Hostname
if host == "" {
host = site.URL
func (e *Engine) SetAggStrategy(strategy AggregationStrategy) {
e.aggStrategy = strategy
}
func (e *Engine) IngestProbeResult(nodeID string, siteID int, latencyNs int64, isUp bool) {
e.probeResultsMu.Lock()
if e.probeResults[siteID] == nil {
e.probeResults[siteID] = make(map[string]NodeResult)
}
e.probeResults[siteID][nodeID] = NodeResult{
NodeID: nodeID,
IsUp: isUp,
LatencyNs: latencyNs,
CheckedAt: time.Now(),
}
results := make([]NodeResult, 0, len(e.probeResults[siteID]))
for _, r := range e.probeResults[siteID] {
results = append(results, r)
}
e.probeResultsMu.Unlock()
aggUp, avgLatency := AggregateStatus(results, e.aggStrategy)
e.mu.RLock()
site, exists := e.liveState[siteID]
e.mu.RUnlock()
if !exists {
return
}
server := site.DNSServer
if server == "" {
server = "1.1.1.1"
rawStatus := "UP"
if !aggUp {
rawStatus = "DOWN"
}
if _, _, err := net.SplitHostPort(server); err != nil {
server = net.JoinHostPort(server, "53")
}
qtype := dns.TypeA
switch site.DNSResolveType {
case "AAAA":
qtype = dns.TypeAAAA
case "MX":
qtype = dns.TypeMX
case "CNAME":
qtype = dns.TypeCNAME
case "TXT":
qtype = dns.TypeTXT
case "NS":
qtype = dns.TypeNS
case "SOA":
qtype = dns.TypeSOA
case "SRV":
qtype = dns.TypeSRV
case "PTR":
qtype = dns.TypePTR
}
m := new(dns.Msg)
m.SetQuestion(dns.Fqdn(host), qtype)
c := new(dns.Client)
c.Timeout = siteTimeout(site)
start := time.Now()
r, _, err := c.Exchange(m, server)
latency := time.Since(start)
updatedSite := site
updatedSite.Latency = latency
updatedSite.Latency = time.Duration(avgLatency)
updatedSite.LastCheck = time.Now()
if err != nil {
e.handleStatusChange(updatedSite, "DOWN", 0, latency)
return
}
if r.Rcode != dns.RcodeSuccess {
e.handleStatusChange(updatedSite, "DOWN", r.Rcode, latency)
return
}
e.handleStatusChange(updatedSite, "UP", 0, latency)
e.handleStatusChange(updatedSite, rawStatus, 0, time.Duration(avgLatency))
}
func (e *Engine) GetProbeResults(siteID int) map[string]NodeResult {
e.probeResultsMu.RLock()
defer e.probeResultsMu.RUnlock()
src := e.probeResults[siteID]
cp := make(map[string]NodeResult, len(src))
for k, v := range src {
cp[k] = v
}
return cp
}