Report route recovery reason in smoke
This commit is contained in:
@@ -53,6 +53,7 @@ type smokeReport struct {
|
|||||||
FabricVPNRecoveryMS int64 `json:"fabric_vpn_route_recovery_ms"`
|
FabricVPNRecoveryMS int64 `json:"fabric_vpn_route_recovery_ms"`
|
||||||
FabricVPNRecoveryMaxMS int64 `json:"fabric_vpn_route_recovery_max_ms"`
|
FabricVPNRecoveryMaxMS int64 `json:"fabric_vpn_route_recovery_max_ms"`
|
||||||
FabricVPNRecoveryAvgMS int64 `json:"fabric_vpn_route_recovery_avg_ms"`
|
FabricVPNRecoveryAvgMS int64 `json:"fabric_vpn_route_recovery_avg_ms"`
|
||||||
|
FabricVPNRecoveryReason string `json:"fabric_vpn_route_recovery_reason"`
|
||||||
FabricQUICAccepted bool `json:"fabric_quic_accepted"`
|
FabricQUICAccepted bool `json:"fabric_quic_accepted"`
|
||||||
FabricQUICEndpoint string `json:"fabric_quic_endpoint"`
|
FabricQUICEndpoint string `json:"fabric_quic_endpoint"`
|
||||||
FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"`
|
FabricQUICPressure int `json:"fabric_quic_capacity_pressure_percent"`
|
||||||
@@ -157,7 +158,7 @@ func run(ctx context.Context) (smokeReport, error) {
|
|||||||
return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err)
|
return smokeReport{}, fmt.Errorf("fabric vpn packet session smoke: %w", err)
|
||||||
}
|
}
|
||||||
fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure()
|
fabricVPNBulkPressure, fabricVPNBulkChannels, fabricVPNInteractiveChannels, fabricVPNBulkWindow, fabricVPNInteractiveWindow := smokeVPNFlowSchedulerBulkPressure()
|
||||||
fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS := smokeVPNFlowSchedulerRouteRecovery()
|
fabricVPNRouteRecovered, fabricVPNRouteSwitches, fabricVPNRecoveryMS, fabricVPNRecoveryMaxMS, fabricVPNRecoveryAvgMS, fabricVPNRecoveryReason := smokeVPNFlowSchedulerRouteRecovery()
|
||||||
fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx)
|
fabricQUICAccepted, fabricQUICEndpoint, fabricQUICPressure, err := smokeQUICFabricSession(ctx)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err)
|
return smokeReport{}, fmt.Errorf("fabric quic smoke: %w", err)
|
||||||
@@ -188,6 +189,7 @@ func run(ctx context.Context) (smokeReport, error) {
|
|||||||
FabricVPNRecoveryMS: fabricVPNRecoveryMS,
|
FabricVPNRecoveryMS: fabricVPNRecoveryMS,
|
||||||
FabricVPNRecoveryMaxMS: fabricVPNRecoveryMaxMS,
|
FabricVPNRecoveryMaxMS: fabricVPNRecoveryMaxMS,
|
||||||
FabricVPNRecoveryAvgMS: fabricVPNRecoveryAvgMS,
|
FabricVPNRecoveryAvgMS: fabricVPNRecoveryAvgMS,
|
||||||
|
FabricVPNRecoveryReason: fabricVPNRecoveryReason,
|
||||||
FabricQUICAccepted: fabricQUICAccepted,
|
FabricQUICAccepted: fabricQUICAccepted,
|
||||||
FabricQUICEndpoint: fabricQUICEndpoint,
|
FabricQUICEndpoint: fabricQUICEndpoint,
|
||||||
FabricQUICPressure: fabricQUICPressure,
|
FabricQUICPressure: fabricQUICPressure,
|
||||||
@@ -225,7 +227,7 @@ func smokeVPNFlowSchedulerBulkPressure() (bool, int, int, int, int) {
|
|||||||
snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive]
|
snapshot.RecommendedParallelWindows[vpnruntime.FabricTrafficClassInteractive]
|
||||||
}
|
}
|
||||||
|
|
||||||
func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64) {
|
func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64, string) {
|
||||||
scheduler := vpnruntime.NewFabricFlowScheduler(8, 16)
|
scheduler := vpnruntime.NewFabricFlowScheduler(8, 16)
|
||||||
channelID := "vpn-smoke-flow-0"
|
channelID := "vpn-smoke-flow-0"
|
||||||
scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond)
|
scheduler.RecordRouteFailure(channelID, "route-primary", "node-primary", fmt.Errorf("smoke primary unavailable"), time.Millisecond)
|
||||||
@@ -239,7 +241,8 @@ func smokeVPNFlowSchedulerRouteRecovery() (bool, uint64, int64, int64, int64) {
|
|||||||
snapshot.RouteSwitchCount,
|
snapshot.RouteSwitchCount,
|
||||||
stat.LastRouteRecoveryMillis,
|
stat.LastRouteRecoveryMillis,
|
||||||
snapshot.RouteRecoveryMaxMillis,
|
snapshot.RouteRecoveryMaxMillis,
|
||||||
snapshot.RouteRecoveryAvgMillis
|
snapshot.RouteRecoveryAvgMillis,
|
||||||
|
stat.LastRouteSwitchReason
|
||||||
}
|
}
|
||||||
|
|
||||||
func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) {
|
func smokeQUICFabricSession(ctx context.Context) (bool, string, int, error) {
|
||||||
|
|||||||
@@ -456,6 +456,8 @@ across recovered channels for quick load-test health checks.
|
|||||||
Route recovery telemetry now includes normalized switch reasons and aggregate
|
Route recovery telemetry now includes normalized switch reasons and aggregate
|
||||||
reason counts, so load tests can distinguish peer failures, timeouts, and other
|
reason counts, so load tests can distinguish peer failures, timeouts, and other
|
||||||
route-break causes.
|
route-break causes.
|
||||||
|
`mesh-live-smoke` reports the synthetic route-recovery reason beside recovery
|
||||||
|
timing and switch count.
|
||||||
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
||||||
penalty instead of a hard recent failure, enabling load spreading without
|
penalty instead of a hard recent failure, enabling load spreading without
|
||||||
marking the carrier unhealthy.
|
marking the carrier unhealthy.
|
||||||
|
|||||||
Reference in New Issue
Block a user