feat(cluster): add probe execution mode, check extraction, and result aggregation
Phase 2 of distributed probing:
- Extract check logic into standalone RunCheck() for use by probes
- Add probe cluster mode: stateless nodes that fetch assignments, execute checks, and report results to the leader
- Add multi-node result aggregation with configurable strategy (any-down, majority-down, all-down)
- Leader ingests probe results into engine live state and triggers alerts
- New env vars: UPKEEP_NODE_ID, UPKEEP_NODE_NAME, UPKEEP_NODE_REGION, UPKEEP_AGG_STRATEGY
- Example docker-compose.probe.yml with leader + 2 regional probes
This commit is contained in:
@@ -166,6 +166,44 @@ func runServe(args []string) {
|
||||
clusterKey = v
|
||||
}
|
||||
|
||||
nodeID := os.Getenv("UPKEEP_NODE_ID")
|
||||
nodeName := os.Getenv("UPKEEP_NODE_NAME")
|
||||
nodeRegion := os.Getenv("UPKEEP_NODE_REGION")
|
||||
aggStrategy := os.Getenv("UPKEEP_AGG_STRATEGY")
|
||||
|
||||
if clusterMode == "probe" {
|
||||
if nodeID == "" {
|
||||
fmt.Fprintln(os.Stderr, "UPKEEP_NODE_ID is required for probe mode")
|
||||
os.Exit(1)
|
||||
}
|
||||
if clusterPeer == "" {
|
||||
fmt.Fprintln(os.Stderr, "UPKEEP_PEER_URL is required for probe mode")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
fmt.Printf("Cluster: Running as PROBE (node=%s, region=%s)\n", nodeID, nodeRegion)
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
done := make(chan os.Signal, 1)
|
||||
signal.Notify(done, os.Interrupt, syscall.SIGINT, syscall.SIGTERM)
|
||||
go func() {
|
||||
<-done
|
||||
cancel()
|
||||
}()
|
||||
|
||||
if err := cluster.RunProbe(ctx, cluster.ProbeConfig{
|
||||
NodeID: nodeID,
|
||||
NodeName: nodeName,
|
||||
Region: nodeRegion,
|
||||
LeaderURL: clusterPeer,
|
||||
SharedKey: clusterKey,
|
||||
Interval: 30,
|
||||
}); err != nil {
|
||||
fmt.Fprintf(os.Stderr, "Probe error: %v\n", err)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
fs := flag.NewFlagSet("serve", flag.ExitOnError)
|
||||
port := fs.Int("port", portVal, "SSH Port")
|
||||
flagDBType := fs.String("db-type", dbType, "Database type")
|
||||
@@ -214,6 +252,9 @@ func runServe(args []string) {
|
||||
if os.Getenv("UPKEEP_INSECURE_SKIP_VERIFY") == "true" {
|
||||
eng.SetInsecureSkipVerify(true)
|
||||
}
|
||||
if aggStrategy != "" {
|
||||
eng.SetAggStrategy(monitor.AggregationStrategy(aggStrategy))
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
|
||||
Reference in New Issue
Block a user