From d43910d2c0979edf36d58a2c9e5621180b676f81 Mon Sep 17 00:00:00 2001 From: Mikhail Date: Sat, 16 May 2026 13:09:08 +0300 Subject: [PATCH] Smoke test VPN route recovery --- .../rap-node-agent/cmd/mesh-live-smoke/main.go | 18 ++++++++++++++++++ .../DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md | 2 ++ 2 files changed, 20 insertions(+) diff --git a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go index 411b400..cfc09bf 100644 --- a/agents/rap-node-agent/cmd/mesh-live-smoke/main.go +++ b/agents/rap-node-agent/cmd/mesh-live-smoke/main.go @@ -48,6 +48,8 @@ type smokeReport struct { FabricVPNInteractive int `json:"fabric_vpn_interactive_or_control_channels"` FabricVPNBulkWindow int `json:"fabric_vpn_bulk_parallel_window"` FabricVPNInteractiveWin int `json:"fabric_vpn_interactive_parallel_window"` + FabricVPNRouteRecovered bool `json:"fabric_vpn_route_recovered"` + FabricVPNRouteSwitches uint64 `json:"fabric_vpn_route_switch_count"` FabricQUICAccepted bool `json:"fabric_quic_accepted"` FabricQUICEndpoint string `json:"fabric_quic_endpoint"` FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"` @@ -152,6 +154,7 @@ func run(ctx context.Context) (smokeReport, error) { return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err) } fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure() + fabricVPNRouteRecovered, fabricVPNRouteSwitches := smokeVPNFlowSchedulerRouteRecovery() fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx) if err != nil { return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err) @@ -177,6 +180,8 @@ func run(ctx context.Context) (smokeReport, error) { FabricVPNInteractive: fabricVPNInteractiveChannels, FabricVPNBulkWindow: fabricVPNBulkWindow, FabricVPNInteractiveWin: fabricVPNInteractiveWindow, + FabricVPNRouteRecovered: fabricVPNRouteRecovered, + FabricVPNRouteSwitches: fabricVPNRouteSwitches, FabricQUICAccepted: fabricQUICAccepted, FabricQUICEndpoint: fabricQUICEndpoint, FabricQUICPressure: fabricQUICPressure, @@ -214,6 +219,19 @@ func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int) { snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive] } +func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64) { + scheduler := vpnruntime.NewFabricFlowScheduler(8, 16) + channelID := "vpn-smoke-flow-0" + scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond) + scheduler.RecordRouteSuccess(channelID, "route-alternate", "node-alternate", time.Millisecond) + snapshot := scheduler.Snapshot() + stat := snapshot.ChannelStats[channelID] + return stat.LastRecoveredFromRouteID == "route-primary" && + stat.LastRouteID == "route-alternate" && + snapshot.RouteRecoveredChannelCount == 1, + snapshot.RouteSwitchCount +} + func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) { server, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{ ListenAddr: "127.0.0.1:0", diff --git a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md index 0117ceb..aecbe18 100644 --- a/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md +++ b/docs/architecture/DISTRIBUTED_FABRIC_NODE_PROTOCOL_PLAN.md @@ -443,6 +443,8 @@ pressure activation plus bulk/interactive window recommendations. Flow-scheduler route recovery telemetry now records per-channel route switches, the failed route a channel recovered from, and aggregate recovered-channel / switch counts, making alternate-route recovery measurable during load tests. +`mesh-live-smoke` now also exercises a primary-route failure followed by an +alternate-route success and reports the resulting route switch count. Endpoint ranking treats `capacity_limited` observations as a soft pressure penalty instead of a hard recent failure, enabling load spreading without marking the carrier unhealthy.