diff --git a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go index cfc09bf..bd91e57 100644 --- a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go +++ b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go @@ -50,6 +50,7 @@ type smokeReport struct { FabricVPNInteractiveWin int `json:"fabric_vpn_interactive_parallel_window"` FabricVPNRouteRecovered bool `json:"fabric_vpn_route_recovered"` FabricVPNRouteSwitches uint64 `json:"fabric_vpn_route_switch_count"` + FabricVPNRecoveryMS int64 `json:"fabric_vpn_route_recovery_ms"` FabricQUICAccepted bool `json:"fabric_quic_accepted"` FabricQUICEndpoint string `json:"fabric_quic_endpoint"` FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"` @@ -154,7 +155,7 @@ func run(ctx context.Context) (smokeReport, error) { return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err) } fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure() - fabricVPNRouteRecovered, fabricVPNRouteSwitches := smokeVPNFlowSchedulerRouteRecovery() + fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS := smokeVPNFlowSchedulerRouteRecovery() fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx) if err != nil { return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err) @@ -182,6 +183,7 @@ func run(ctx context.Context) (smokeReport, error) { FabricVPNInteractiveWin: fabricVPNInteractiveWindow, FabricVPNRouteRecovered: fabricVPNRouteRecovered, FabricVPNRouteSwitches: fabricVPNRouteSwitches, + FabricVPNRecoveryMS: fabricVPNRecoveryMS, FabricQUICAccepted: fabricQUICAccepted, FabricQUICEndpoint: fabricQUICEndpoint, FabricQUICPressure: fabricQUICPressure, @@ -219,17 +221,19 @@ func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int) { snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive] } -func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64) { +func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64) { scheduler := vpnruntime.NewFabricFlowScheduler(8, 16) channelID := "vpn-smoke-flow-0" scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond) + time.Sleep(time.Millisecond) scheduler.RecordRouteSuccess(channelID, "route-alternate", "node-alternate", time.Millisecond) snapshot := scheduler.Snapshot() stat := snapshot.ChannelStats[channelID] return stat.LastRecoveredFromRouteID == "route-primary" && stat.LastRouteID == "route-alternate" && snapshot.RouteRecoveredChannelCount == 1, - snapshot.RouteSwitchCount + snapshot.RouteSwitchCount, + stat.LastRouteRecoveryMillis } func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) { diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index 96274cd..f1956a5 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -445,6 +445,8 @@ the failed route a channel recovered from, and aggregate recovered-channel / switch counts, making alternate-route recovery measurable during load tests. `mesh-live-smoke` now also exercises a primary-route failure followed by an alternate-route success and reports the resulting route switch count. +The same smoke output reports measured route recovery milliseconds for the +synthetic failover path. Route recovery telemetry includes failure/switch timestamps and recovery duration in milliseconds for each recovered flow channel. Endpoint ranking treats `capacity_limited` observations as a soft pressure