diff --git a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go index c8cf593..56e0393 100644 --- a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go +++ b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go @@ -51,6 +51,7 @@ type smokeReport struct { FabricVPNPressureLevel string `json:"fabric_vpn_pressure_level"` FabricVPNPressureScore int `json:"fabric_vpn_pressure_score"` FabricVPNPressureReason []string `json:"fabric_vpn_pressure_reasons"` + FabricVPNPressureAction string `json:"fabric_vpn_pressure_action"` FabricVPNRouteRecovered bool `json:"fabric_vpn_route_recovered"` FabricVPNRouteSwitches uint64 `json:"fabric_vpn_route_switch_count"` FabricVPNRecoveryMS int64 `json:"fabric_vpn_route_recovery_ms"` @@ -160,7 +161,7 @@ func run(ctx context.Context) (smokeReport, error) { if err != nil { return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err) } - fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow, fabricVPNPressureLevel, fabricVPNPressureScore, fabricVPNPressureReasons := smokeVPNFlowSchedulerBulkPressure() + fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow, fabricVPNPressureLevel, fabricVPNPressureScore, fabricVPNPressureReasons, fabricVPNPressureAction := smokeVPNFlowSchedulerBulkPressure() fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS, fabricVPNRecoveryReason := smokeVPNFlowSchedulerRouteRecovery() fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx) if err != nil { @@ -190,6 +191,7 @@ func run(ctx context.Context) (smokeReport, error) { FabricVPNPressureLevel: fabricVPNPressureLevel, FabricVPNPressureScore: fabricVPNPressureScore, FabricVPNPressureReason: fabricVPNPressureReasons, + FabricVPNPressureAction: fabricVPNPressureAction, FabricVPNRouteRecovered: fabricVPNRouteRecovered, FabricVPNRouteSwitches: fabricVPNRouteSwitches, FabricVPNRecoveryMS: fabricVPNRecoveryMS, @@ -209,7 +211,7 @@ func run(ctx context.Context) (smokeReport, error) { }, nil } -func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int, string, int, []string) { +func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int, string, int, []string, string) { scheduler := vpnruntime.NewFabricFlowScheduler(32, 16) bulkPacket := []byte("bulk") interactivePacket := []byte("interactive-rdp-like") @@ -233,7 +235,39 @@ func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int, string, int, snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive], snapshot.PressureLevel, snapshot.PressureScore, - snapshot.PressureReasons + snapshot.PressureReasons, + smokeVPNPressureAction(snapshot) +} + +func smokeVPNPressureAction(snapshot vpnruntime.FabricFlowSchedulerSnapshot) string { + if containsSmokeString(snapshot.PressureReasons, "drops") || snapshot.QualityWindowDropCount > 0 { + return "shed_or_reroute" + } + if containsSmokeString(snapshot.PressureReasons, "route_failures") || snapshot.QualityWindowFailureCount > 0 || snapshot.FailingChannelCount > 0 { + return "rebuild_or_reroute" + } + if containsSmokeString(snapshot.PressureReasons, "route_recovery") || snapshot.RouteSwitchCount > 0 { + return "observe_recovery" + } + if containsSmokeString(snapshot.PressureReasons, "slow_channels") || snapshot.SlowChannelCount > 0 || snapshot.QualityWindowSlowCount > 0 { + return "prefer_faster_route" + } + if containsSmokeString(snapshot.PressureReasons, "bulk_pressure") || snapshot.BulkPressureActive { + return "throttle_bulk" + } + if snapshot.AdaptiveBackpressureActive || snapshot.BackpressureActive { + return "reduce_parallelism" + } + return "observe" +} + +func containsSmokeString(values []string, needle string) bool { + for _, value := range values { + if value == needle { + return true + } + } + return false } func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64, string) { diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index 22ecb55..19df906 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -475,6 +475,8 @@ recovery timing, reason counts, and recommended per-class windows. The `flow_pressure` summary includes a `recommended_action` such as `observe`, `throttle_bulk`, `reduce_parallelism`, `prefer_faster_route`, `observe_recovery`, `rebuild_or_reroute`, or `shed_or_reroute`. +`mesh-live-smoke` reports the recommended action for its mixed bulk/interactive +load scenario. Nodes advertise the `vpn_fabric_flow_pressure` capability when that heartbeat summary is available. When the VPN fabric ingress runtime has not been initialized yet, the heartbeat