feat(cluster): add probe execution mode, check extraction, and result aggregation

Phase 2 of distributed probing:
- Extract check logic into standalone RunCheck() for use by probes
- Add probe cluster mode: stateless nodes that fetch assignments, execute
  checks, and report results to the leader
- Add multi-node result aggregation with configurable strategy
  (any-down, majority-down, all-down)
- Leader ingests probe results into engine live state and triggers alerts
- New env vars: UPKEEP_NODE_ID, UPKEEP_NODE_NAME, UPKEEP_NODE_REGION,
  UPKEEP_AGG_STRATEGY
- Example docker-compose.probe.yml with leader + 2 regional probes
This commit is contained in:
2026-05-16 11:19:57 -04:00
parent ca9faa0acd
commit ca5a42314f
8 changed files with 592 additions and 215 deletions
+44
View File
@@ -0,0 +1,44 @@
package monitor
import "time"
type AggregationStrategy string
const (
AggAnyDown AggregationStrategy = "any-down"
AggMajorityDown AggregationStrategy = "majority-down"
AggAllDown AggregationStrategy = "all-down"
)
type NodeResult struct {
NodeID string
IsUp bool
LatencyNs int64
CheckedAt time.Time
}
func AggregateStatus(results []NodeResult, strategy AggregationStrategy) (isUp bool, avgLatencyNs int64) {
if len(results) == 0 {
return true, 0
}
upCount := 0
var totalLatency int64
for _, r := range results {
if r.IsUp {
upCount++
}
totalLatency += r.LatencyNs
}
avgLatencyNs = totalLatency / int64(len(results))
switch strategy {
case AggMajorityDown:
isUp = upCount > len(results)/2
case AggAllDown:
isUp = upCount > 0
default:
isUp = upCount == len(results)
}
return
}