Track VPN route switch reasons

This commit is contained in:
2026-05-16 13:17:35 +03:00
parent f23e11a8fd
commit 9ea49c8338
3 changed files with 41 additions and 1 deletions
@@ -187,6 +187,7 @@ type fabricFlowQueue struct {
LastRecoveredFromRouteID string LastRecoveredFromRouteID string
LastRecoveredNextHop string LastRecoveredNextHop string
LastRouteSwitchAt time.Time LastRouteSwitchAt time.Time
LastRouteSwitchReason string
LastRouteRecoveryMillis int64 LastRouteRecoveryMillis int64
RouteSwitchCount uint64 RouteSwitchCount uint64
LastError string LastError string
@@ -264,6 +265,7 @@ type FabricFlowSchedulerSnapshot struct {
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"` RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
RouteRecoveryMaxMillis int64 `json:"route_recovery_max_ms,omitempty"` RouteRecoveryMaxMillis int64 `json:"route_recovery_max_ms,omitempty"`
RouteRecoveryAvgMillis int64 `json:"route_recovery_avg_ms,omitempty"` RouteRecoveryAvgMillis int64 `json:"route_recovery_avg_ms,omitempty"`
RouteSwitchReasonCounts map[string]int `json:"route_switch_reason_counts,omitempty"`
SlowChannelCount int `json:"slow_channel_count"` SlowChannelCount int `json:"slow_channel_count"`
FailingChannelCount int `json:"failing_channel_count"` FailingChannelCount int `json:"failing_channel_count"`
QualityWindowSampleCount int `json:"quality_window_sample_count"` QualityWindowSampleCount int `json:"quality_window_sample_count"`
@@ -301,6 +303,7 @@ type FabricFlowStat struct {
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"` LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"` LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"` LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
LastRouteSwitchReason string `json:"last_route_switch_reason,omitempty"`
LastRouteRecoveryMillis int64 `json:"last_route_recovery_ms,omitempty"` LastRouteRecoveryMillis int64 `json:"last_route_recovery_ms,omitempty"`
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"` RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
LastError string `json:"last_error,omitempty"` LastError string `json:"last_error,omitempty"`
@@ -661,6 +664,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
QueueDepths: map[string]int{}, QueueDepths: map[string]int{},
TrafficClassCounts: map[string]int{}, TrafficClassCounts: map[string]int{},
RecommendedParallelWindows: map[string]int{}, RecommendedParallelWindows: map[string]int{},
RouteSwitchReasonCounts: map[string]int{},
ChannelStats: map[string]FabricFlowStat{}, ChannelStats: map[string]FabricFlowStat{},
} }
if s == nil { if s == nil {
@@ -710,6 +714,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
LastFailedRouteGeneration: queue.LastFailedRouteGeneration, LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID, LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
LastRecoveredNextHop: queue.LastRecoveredNextHop, LastRecoveredNextHop: queue.LastRecoveredNextHop,
LastRouteSwitchReason: queue.LastRouteSwitchReason,
LastRouteRecoveryMillis: queue.LastRouteRecoveryMillis, LastRouteRecoveryMillis: queue.LastRouteRecoveryMillis,
RouteSwitchCount: queue.RouteSwitchCount, RouteSwitchCount: queue.RouteSwitchCount,
LastError: queue.LastError, LastError: queue.LastError,
@@ -771,6 +776,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID, LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
LastRecoveredNextHop: stat.LastRecoveredNextHop, LastRecoveredNextHop: stat.LastRecoveredNextHop,
LastRouteSwitchAt: stat.LastRouteSwitchAt, LastRouteSwitchAt: stat.LastRouteSwitchAt,
LastRouteSwitchReason: stat.LastRouteSwitchReason,
LastRouteRecoveryMillis: stat.LastRouteRecoveryMillis, LastRouteRecoveryMillis: stat.LastRouteRecoveryMillis,
RouteSwitchCount: stat.RouteSwitchCount, RouteSwitchCount: stat.RouteSwitchCount,
LastError: stat.LastError, LastError: stat.LastError,
@@ -802,6 +808,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
snapshot.RouteSwitchCount += queue.RouteSwitchCount snapshot.RouteSwitchCount += queue.RouteSwitchCount
if queue.LastRecoveredFromRouteID != "" { if queue.LastRecoveredFromRouteID != "" {
snapshot.RouteRecoveredChannelCount++ snapshot.RouteRecoveredChannelCount++
if reason := strings.TrimSpace(queue.LastRouteSwitchReason); reason != "" {
snapshot.RouteSwitchReasonCounts[reason]++
}
if queue.LastRouteRecoveryMillis > snapshot.RouteRecoveryMaxMillis { if queue.LastRouteRecoveryMillis > snapshot.RouteRecoveryMaxMillis {
snapshot.RouteRecoveryMaxMillis = queue.LastRouteRecoveryMillis snapshot.RouteRecoveryMaxMillis = queue.LastRouteRecoveryMillis
} }
@@ -827,6 +836,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
if routeRecoverySamples > 0 { if routeRecoverySamples > 0 {
snapshot.RouteRecoveryAvgMillis = routeRecoveryTotalMillis / routeRecoverySamples snapshot.RouteRecoveryAvgMillis = routeRecoveryTotalMillis / routeRecoverySamples
} }
if len(snapshot.RouteSwitchReasonCounts) == 0 {
snapshot.RouteSwitchReasonCounts = nil
}
snapshot.BulkPressureChannelCount = snapshot.TrafficClassCounts[FabricTrafficClassBulk] snapshot.BulkPressureChannelCount = snapshot.TrafficClassCounts[FabricTrafficClassBulk]
snapshot.InteractiveOrControlCount = snapshot.TrafficClassCounts[FabricTrafficClassControl] + snapshot.TrafficClassCounts[FabricTrafficClassInteractive] snapshot.InteractiveOrControlCount = snapshot.TrafficClassCounts[FabricTrafficClassControl] + snapshot.TrafficClassCounts[FabricTrafficClassInteractive]
bulkPressureThreshold := s.adaptivePolicy.BulkPressureChannelThreshold bulkPressureThreshold := s.adaptivePolicy.BulkPressureChannelThreshold
@@ -912,6 +924,27 @@ func classWindowLimit(policy FabricServiceChannelAdaptivePolicy, trafficClass st
return maxWindow return maxWindow
} }
// normalizeFabricRouteSwitchReason converts a free-form failure description
// into a compact lower-case token suitable for use as a telemetry label or
// map key.
//
// The input is trimmed and lower-cased. Every run of separators (space, tab,
// newline, carriage return, ':', ';', ',' and '_') collapses into a single
// "_", and leading/trailing separators are dropped. If nothing remains, the
// fallback "route_failure" is returned. Results longer than 80 bytes are
// truncated on a UTF-8 rune boundary (never in the middle of a multi-byte
// character), and any "_" left dangling by the cut is removed.
func normalizeFabricRouteSwitchReason(reason string) string {
	const (
		fallbackReason = "route_failure"
		maxReasonBytes = 80
	)

	isSeparator := func(r rune) bool {
		switch r {
		case ' ', '\t', '\n', '\r', ':', ';', ',', '_':
			return true
		}
		return false
	}

	// Splitting on separators and re-joining with "_" collapses separator
	// runs and strips leading/trailing separators in a single pass.
	words := strings.FieldsFunc(strings.ToLower(strings.TrimSpace(reason)), isSeparator)
	reason = strings.Join(words, "_")
	if reason == "" {
		return fallbackReason
	}

	if len(reason) > maxReasonBytes {
		cut := maxReasonBytes
		// Back up over UTF-8 continuation bytes (10xxxxxx) so the cut never
		// lands inside a multi-byte character; a plain byte slice here would
		// leave invalid UTF-8 at the end of the reason.
		for cut > 0 && reason[cut]&0xC0 == 0x80 {
			cut--
		}
		reason = strings.TrimRight(reason[:cut], "_")
	}
	return reason
}
func clampFabricWindow(value, minValue, maxValue int) int { func clampFabricWindow(value, minValue, maxValue int) int {
if value < minValue { if value < minValue {
return minValue return minValue
@@ -1113,11 +1146,13 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
queue := s.ensureQueueLocked(channelID) queue := s.ensureQueueLocked(channelID)
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID) failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
failedNextHop := strings.TrimSpace(queue.LastNextHop) failedNextHop := strings.TrimSpace(queue.LastNextHop)
failedReason := normalizeFabricRouteSwitchReason(queue.LastError)
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) { if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
switchedAt := time.Now().UTC() switchedAt := time.Now().UTC()
queue.LastRecoveredFromRouteID = failedRouteID queue.LastRecoveredFromRouteID = failedRouteID
queue.LastRecoveredNextHop = failedNextHop queue.LastRecoveredNextHop = failedNextHop
queue.LastRouteSwitchAt = switchedAt queue.LastRouteSwitchAt = switchedAt
queue.LastRouteSwitchReason = failedReason
queue.LastRouteRecoveryMillis = 0 queue.LastRouteRecoveryMillis = 0
if !queue.LastRouteFailureAt.IsZero() { if !queue.LastRouteFailureAt.IsZero() {
queue.LastRouteRecoveryMillis = switchedAt.Sub(queue.LastRouteFailureAt).Milliseconds() queue.LastRouteRecoveryMillis = switchedAt.Sub(queue.LastRouteFailureAt).Milliseconds()
@@ -1532,6 +1532,7 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
} }
if statA.LastRecoveredFromRouteID != "route-primary" || if statA.LastRecoveredFromRouteID != "route-primary" ||
statA.LastRecoveredNextHop != "relay-primary" || statA.LastRecoveredNextHop != "relay-primary" ||
statA.LastRouteSwitchReason != "production_mesh_next_peer_is_unavailable" ||
statA.RouteSwitchCount != 1 || statA.RouteSwitchCount != 1 ||
statA.LastRouteFailureAt == "" || statA.LastRouteFailureAt == "" ||
statA.LastRouteSwitchAt == "" || statA.LastRouteSwitchAt == "" ||
@@ -1539,7 +1540,8 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 || snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
snapshot.FlowScheduler.RouteSwitchCount != 1 || snapshot.FlowScheduler.RouteSwitchCount != 1 ||
snapshot.FlowScheduler.RouteRecoveryMaxMillis != statA.LastRouteRecoveryMillis || snapshot.FlowScheduler.RouteRecoveryMaxMillis != statA.LastRouteRecoveryMillis ||
snapshot.FlowScheduler.RouteRecoveryAvgMillis != statA.LastRouteRecoveryMillis { snapshot.FlowScheduler.RouteRecoveryAvgMillis != statA.LastRouteRecoveryMillis ||
snapshot.FlowScheduler.RouteSwitchReasonCounts["production_mesh_next_peer_is_unavailable"] != 1 {
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler) t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
} }
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 { if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
@@ -453,6 +453,9 @@ Route recovery telemetry includes failure/switch timestamps and recovery
duration in milliseconds for each recovered flow channel. duration in milliseconds for each recovered flow channel.
Scheduler snapshots also aggregate route recovery max/average milliseconds Scheduler snapshots also aggregate route recovery max/average milliseconds
across recovered channels for quick load-test health checks. across recovered channels for quick load-test health checks.
Route recovery telemetry also includes a normalized switch reason for each
recovered flow channel, and scheduler snapshots count recovered channels by
their most recent switch reason, so load tests can distinguish peer failures,
timeouts, and other route-break causes.
Endpoint ranking treats `capacity_limited` observations as a soft pressure Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without penalty instead of a hard recent failure, enabling load spreading without
marking the carrier unhealthy. marking the carrier unhealthy.