Smoke test VPN route recovery

This commit is contained in:
2026-05-16 13:09:08 +03:00
parent d5c089d120
commit d43910d2c0
2 changed files with 20 additions and 0 deletions
@@ -48,6 +48,8 @@ type smokeReport struct {
FabricVPNInteractive int `json:"fabric_vpn_interactive_or_control_channels"` FabricVPNInteractive int `json:"fabric_vpn_interactive_or_control_channels"`
FabricVPNBulkWindow int `json:"fabric_vpn_bulk_parallel_window"` FabricVPNBulkWindow int `json:"fabric_vpn_bulk_parallel_window"`
FabricVPNInteractiveWin int `json:"fabric_vpn_interactive_parallel_window"` FabricVPNInteractiveWin int `json:"fabric_vpn_interactive_parallel_window"`
FabricVPNRouteRecovered bool `json:"fabric_vpn_route_recovered"`
FabricVPNRouteSwitches uint64 `json:"fabric_vpn_route_switch_count"`
FabricQUICAccepted bool `json:"fabric_quic_accepted"` FabricQUICAccepted bool `json:"fabric_quic_accepted"`
FabricQUICEndpoint string `json:"fabric_quic_endpoint"` FabricQUICEndpoint string `json:"fabric_quic_endpoint"`
FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"` FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"`
@@ -152,6 +154,7 @@ func run(ctx context.Context) (smokeReport, error) {
return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err) return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err)
} }
fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure() fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure()
fabricVPNRouteRecovered, fabricVPNRouteSwitches := smokeVPNFlowSchedulerRouteRecovery()
fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx) fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx)
if err != nil { if err != nil {
return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err) return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err)
@@ -177,6 +180,8 @@ func run(ctx context.Context) (smokeReport, error) {
FabricVPNInteractive: fabricVPNInteractiveChannels, FabricVPNInteractive: fabricVPNInteractiveChannels,
FabricVPNBulkWindow: fabricVPNBulkWindow, FabricVPNBulkWindow: fabricVPNBulkWindow,
FabricVPNInteractiveWin: fabricVPNInteractiveWindow, FabricVPNInteractiveWin: fabricVPNInteractiveWindow,
FabricVPNRouteRecovered: fabricVPNRouteRecovered,
FabricVPNRouteSwitches: fabricVPNRouteSwitches,
FabricQUICAccepted: fabricQUICAccepted, FabricQUICAccepted: fabricQUICAccepted,
FabricQUICEndpoint: fabricQUICEndpoint, FabricQUICEndpoint: fabricQUICEndpoint,
FabricQUICPressure: fabricQUICPressure, FabricQUICPressure: fabricQUICPressure,
@@ -214,6 +219,19 @@ func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int) {
snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive] snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive]
} }
func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64) {
scheduler := vpnruntime.NewFabricFlowScheduler(8, 16)
channelID := "vpn-smoke-flow-0"
scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond)
scheduler.RecordRouteSuccess(channelID, "route-alternate", "node-alternate", time.Millisecond)
snapshot := scheduler.Snapshot()
stat := snapshot.ChannelStats[channelID]
return stat.LastRecoveredFromRouteID == "route-primary" &&
stat.LastRouteID == "route-alternate" &&
snapshot.RouteRecoveredChannelCount == 1,
snapshot.RouteSwitchCount
}
func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) { func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) {
server, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{ server, err := mesh.StartQUICFabricServer(ctx, mesh.QUICFabricServerConfig{
ListenAddr: "127.0.0.1:0", ListenAddr: "127.0.0.1:0",
@@ -443,6 +443,8 @@ pressure activation plus bulk/interactive window recommendations.
Flow-scheduler route recovery telemetry now records per-channel route switches, Flow-scheduler route recovery telemetry now records per-channel route switches,
the failed route a channel recovered from, and aggregate recovered-channel / the failed route a channel recovered from, and aggregate recovered-channel /
switch counts, making alternate-route recovery measurable during load tests. switch counts, making alternate-route recovery measurable during load tests.
`mesh-live-smoke` now also exercises a primary-route failure followed by an
alternate-route success and reports the resulting route switch count.
Endpoint ranking treats `capacity_limited` observations as a soft pressure Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without penalty instead of a hard recent failure, enabling load spreading without
marking the carrier unhealthy. marking the carrier unhealthy.