Report VPN route recovery time in smoke

This commit is contained in:
2026-05-16 13:12:40 +03:00
parent c8e7bd3717
commit 0363bb8c9c
2 changed files with 9 additions and 3 deletions
@@ -50,6 +50,7 @@ type smokeReport struct {
FabricVPNInteractiveWin int `json:"fabric_vpn_interactive_parallel_window"` FabricVPNInteractiveWin int `json:"fabric_vpn_interactive_parallel_window"`
FabricVPNRouteRecovered bool `json:"fabric_vpn_route_recovered"` FabricVPNRouteRecovered bool `json:"fabric_vpn_route_recovered"`
FabricVPNRouteSwitches uint64 `json:"fabric_vpn_route_switch_count"` FabricVPNRouteSwitches uint64 `json:"fabric_vpn_route_switch_count"`
FabricVPNRecoveryMS int64 `json:"fabric_vpn_route_recovery_ms"`
FabricQUICAccepted bool `json:"fabric_quic_accepted"` FabricQUICAccepted bool `json:"fabric_quic_accepted"`
FabricQUICEndpoint string `json:"fabric_quic_endpoint"` FabricQUICEndpoint string `json:"fabric_quic_endpoint"`
FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"` FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"`
@@ -154,7 +155,7 @@ func run(ctx context.Context) (smokeReport, error) {
return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err) return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err)
} }
fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure() fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure()
fabricVPNRouteRecovered, fabricVPNRouteSwitches := smokeVPNFlowSchedulerRouteRecovery() fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS := smokeVPNFlowSchedulerRouteRecovery()
fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx) fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx)
if err != nil { if err != nil {
return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err) return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err)
@@ -182,6 +183,7 @@ func run(ctx context.Context) (smokeReport, error) {
FabricVPNInteractiveWin: fabricVPNInteractiveWindow, FabricVPNInteractiveWin: fabricVPNInteractiveWindow,
FabricVPNRouteRecovered: fabricVPNRouteRecovered, FabricVPNRouteRecovered: fabricVPNRouteRecovered,
FabricVPNRouteSwitches: fabricVPNRouteSwitches, FabricVPNRouteSwitches: fabricVPNRouteSwitches,
FabricVPNRecoveryMS: fabricVPNRecoveryMS,
FabricQUICAccepted: fabricQUICAccepted, FabricQUICAccepted: fabricQUICAccepted,
FabricQUICEndpoint: fabricQUICEndpoint, FabricQUICEndpoint: fabricQUICEndpoint,
FabricQUICPressure: fabricQUICPressure, FabricQUICPressure: fabricQUICPressure,
@@ -219,17 +221,19 @@ func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int) {
snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive] snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive]
} }
func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64) { func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64) {
scheduler := vpnruntime.NewFabricFlowScheduler(8, 16) scheduler := vpnruntime.NewFabricFlowScheduler(8, 16)
channelID := "vpn-smoke-flow-0" channelID := "vpn-smoke-flow-0"
scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond) scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond)
time.Sleep(time.Millisecond)
scheduler.RecordRouteSuccess(channelID, "route-alternate", "node-alternate", time.Millisecond) scheduler.RecordRouteSuccess(channelID, "route-alternate", "node-alternate", time.Millisecond)
snapshot := scheduler.Snapshot() snapshot := scheduler.Snapshot()
stat := snapshot.ChannelStats[channelID] stat := snapshot.ChannelStats[channelID]
return stat.LastRecoveredFromRouteID == "route-primary" && return stat.LastRecoveredFromRouteID == "route-primary" &&
stat.LastRouteID == "route-alternate" && stat.LastRouteID == "route-alternate" &&
snapshot.RouteRecoveredChannelCount == 1, snapshot.RouteRecoveredChannelCount == 1,
snapshot.RouteSwitchCount snapshot.RouteSwitchCount,
stat.LastRouteRecoveryMillis
} }
func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) { func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) {
@@ -445,6 +445,8 @@ the failed route a channel recovered from, and aggregate recovered-channel /
switch counts, making alternate-route recovery measurable during load tests.
`mesh-live-smoke` now also exercises a primary-route failure followed by an
alternate-route success and reports the resulting route switch count.
The same smoke output also reports the measured route recovery time, in
milliseconds, for the synthetic failover path.
Route recovery telemetry includes failure/switch timestamps and recovery
duration in milliseconds for each recovered flow channel.
Endpoint ranking treats `capacity_limited` observations as a soft pressure