diff --git a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go index bc6171f..b314827 100644 --- a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go +++ b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go @@ -53,6 +53,7 @@ type smokeReport struct { FabricVPNRecoveryMS int64 `json:"fabric_vpn_route_recovery_ms"` FabricVPNRecoveryMaxMS int64 `json:"fabric_vpn_route_recovery_max_ms"` FabricVPNRecoveryAvgMS int64 `json:"fabric_vpn_route_recovery_avg_ms"` + FabricVPNRecoveryReason string `json:"fabric_vpn_route_recovery_reason"` FabricQUICAccepted bool `json:"fabric_quic_accepted"` FabricQUICEndpoint string `json:"fabric_quic_endpoint"` FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"` @@ -157,7 +158,7 @@ func run(ctx context.Context) (smokeReport, error) { return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err) } fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure() - fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS := smokeVPNFlowSchedulerRouteRecovery() + fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS, fabricVPNRecoveryReason := smokeVPNFlowSchedulerRouteRecovery() fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx) if err != nil { return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err) @@ -188,6 +189,7 @@ func run(ctx context.Context) (smokeReport, error) { FabricVPNRecoveryMS: fabricVPNRecoveryMS, FabricVPNRecoveryMaxMS: fabricVPNRecoveryMaxMS, FabricVPNRecoveryAvgMS: fabricVPNRecoveryAvgMS, + FabricVPNRecoveryReason: fabricVPNRecoveryReason, FabricQUICAccepted: fabricQUICAccepted, FabricQUICEndpoint: fabricQUICEndpoint, FabricQUICPressure: fabricQUICPressure, @@ -225,7 +227,7 @@ func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int) { snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive] } -func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64) { +func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64, string) { scheduler := vpnruntime.NewFabricFlowScheduler(8, 16) channelID := "vpn-smoke-flow-0" scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond) @@ -239,7 +241,8 @@ func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64) { snapshot.RouteSwitchCount, stat.LastRouteRecoveryMillis, snapshot.RouteRecoveryMaxMillis, - snapshot.RouteRecoveryAvgMillis + snapshot.RouteRecoveryAvgMillis, + stat.LastRouteSwitchReason } func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) { diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index f26932d..db7c926 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -456,6 +456,8 @@ across recovered channels for quick load-test health checks. Route recovery telemetry now includes normalized switch reasons and aggregate reason counts, so load tests can distinguish peer failures, timeouts, and other route-break causes. +`mesh-live-smoke` reports the synthetic route-recovery reason beside recovery +timing and switch count. Endpoint ranking treats `capacity_limited` observations as a soft pressure penalty instead of a hard recent failure, enabling load spreading without marking the carrier unhealthy.