Report route recovery reason in smoke

This commit is contained in:
2026-05-16 13:18:26 +03:00
parent 9ea49c8338
commit c97044cd34
2 changed files with 8 additions and 3 deletions
@@ -53,6 +53,7 @@ type smokeReport struct {
FabricVPNRecoveryMS int64 `json:"fabric_vpn_route_recovery_ms"` FabricVPNRecoveryMS int64 `json:"fabric_vpn_route_recovery_ms"`
FabricVPNRecoveryMaxMS int64 `json:"fabric_vpn_route_recovery_max_ms"` FabricVPNRecoveryMaxMS int64 `json:"fabric_vpn_route_recovery_max_ms"`
FabricVPNRecoveryAvgMS int64 `json:"fabric_vpn_route_recovery_avg_ms"` FabricVPNRecoveryAvgMS int64 `json:"fabric_vpn_route_recovery_avg_ms"`
FabricVPNRecoveryReason string `json:"fabric_vpn_route_recovery_reason"`
FabricQUICAccepted bool `json:"fabric_quic_accepted"` FabricQUICAccepted bool `json:"fabric_quic_accepted"`
FabricQUICEndpoint string `json:"fabric_quic_endpoint"` FabricQUICEndpoint string `json:"fabric_quic_endpoint"`
FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"` FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"`
@@ -157,7 +158,7 @@ func run(ctx context.Context) (smokeReport, error) {
return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err) return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err)
} }
fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure() fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure()
fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS := smokeVPNFlowSchedulerRouteRecovery() fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS, fabricVPNRecoveryReason := smokeVPNFlowSchedulerRouteRecovery()
fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx) fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx)
if err != nil { if err != nil {
return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err) return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err)
@@ -188,6 +189,7 @@ func run(ctx context.Context) (smokeReport, error) {
FabricVPNRecoveryMS: fabricVPNRecoveryMS, FabricVPNRecoveryMS: fabricVPNRecoveryMS,
FabricVPNRecoveryMaxMS: fabricVPNRecoveryMaxMS, FabricVPNRecoveryMaxMS: fabricVPNRecoveryMaxMS,
FabricVPNRecoveryAvgMS: fabricVPNRecoveryAvgMS, FabricVPNRecoveryAvgMS: fabricVPNRecoveryAvgMS,
FabricVPNRecoveryReason: fabricVPNRecoveryReason,
FabricQUICAccepted: fabricQUICAccepted, FabricQUICAccepted: fabricQUICAccepted,
FabricQUICEndpoint: fabricQUICEndpoint, FabricQUICEndpoint: fabricQUICEndpoint,
FabricQUICPressure: fabricQUICPressure, FabricQUICPressure: fabricQUICPressure,
@@ -225,7 +227,7 @@ func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int) {
snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive] snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive]
} }
func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64) { func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64, string) {
scheduler := vpnruntime.NewFabricFlowScheduler(8, 16) scheduler := vpnruntime.NewFabricFlowScheduler(8, 16)
channelID := "vpn-smoke-flow-0" channelID := "vpn-smoke-flow-0"
scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond) scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond)
@@ -239,7 +241,8 @@ func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64) {
snapshot.RouteSwitchCount, snapshot.RouteSwitchCount,
stat.LastRouteRecoveryMillis, stat.LastRouteRecoveryMillis,
snapshot.RouteRecoveryMaxMillis, snapshot.RouteRecoveryMaxMillis,
snapshot.RouteRecoveryAvgMillis snapshot.RouteRecoveryAvgMillis,
stat.LastRouteSwitchReason
} }
func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) { func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) {
@@ -456,6 +456,8 @@ across recovered channels for quick load-test health checks.
Route recovery telemetry now includes normalized switch reasons and aggregate
reason counts, so load tests can distinguish peer failures, timeouts, and other
route-break causes.
`mesh-live-smoke` reports the synthetic route-recovery reason beside recovery
timing and switch count.
Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without
marking the carrier unhealthy.