Track VPN route switch reasons
This commit is contained in:
@@ -187,6 +187,7 @@ type fabricFlowQueue struct {
|
|||||||
LastRecoveredFromRouteID string
|
LastRecoveredFromRouteID string
|
||||||
LastRecoveredNextHop string
|
LastRecoveredNextHop string
|
||||||
LastRouteSwitchAt time.Time
|
LastRouteSwitchAt time.Time
|
||||||
|
LastRouteSwitchReason string
|
||||||
LastRouteRecoveryMillis int64
|
LastRouteRecoveryMillis int64
|
||||||
RouteSwitchCount uint64
|
RouteSwitchCount uint64
|
||||||
LastError string
|
LastError string
|
||||||
@@ -264,6 +265,7 @@ type FabricFlowSchedulerSnapshot struct {
|
|||||||
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
||||||
RouteRecoveryMaxMillis int64 `json:"route_recovery_max_ms,omitempty"`
|
RouteRecoveryMaxMillis int64 `json:"route_recovery_max_ms,omitempty"`
|
||||||
RouteRecoveryAvgMillis int64 `json:"route_recovery_avg_ms,omitempty"`
|
RouteRecoveryAvgMillis int64 `json:"route_recovery_avg_ms,omitempty"`
|
||||||
|
RouteSwitchReasonCounts map[string]int `json:"route_switch_reason_counts,omitempty"`
|
||||||
SlowChannelCount int `json:"slow_channel_count"`
|
SlowChannelCount int `json:"slow_channel_count"`
|
||||||
FailingChannelCount int `json:"failing_channel_count"`
|
FailingChannelCount int `json:"failing_channel_count"`
|
||||||
QualityWindowSampleCount int `json:"quality_window_sample_count"`
|
QualityWindowSampleCount int `json:"quality_window_sample_count"`
|
||||||
@@ -301,6 +303,7 @@ type FabricFlowStat struct {
|
|||||||
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
|
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
|
||||||
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
|
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
|
||||||
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
|
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
|
||||||
|
LastRouteSwitchReason string `json:"last_route_switch_reason,omitempty"`
|
||||||
LastRouteRecoveryMillis int64 `json:"last_route_recovery_ms,omitempty"`
|
LastRouteRecoveryMillis int64 `json:"last_route_recovery_ms,omitempty"`
|
||||||
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
||||||
LastError string `json:"last_error,omitempty"`
|
LastError string `json:"last_error,omitempty"`
|
||||||
@@ -661,6 +664,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
QueueDepths: map[string]int{},
|
QueueDepths: map[string]int{},
|
||||||
TrafficClassCounts: map[string]int{},
|
TrafficClassCounts: map[string]int{},
|
||||||
RecommendedParallelWindows: map[string]int{},
|
RecommendedParallelWindows: map[string]int{},
|
||||||
|
RouteSwitchReasonCounts: map[string]int{},
|
||||||
ChannelStats: map[string]FabricFlowStat{},
|
ChannelStats: map[string]FabricFlowStat{},
|
||||||
}
|
}
|
||||||
if s == nil {
|
if s == nil {
|
||||||
@@ -710,6 +714,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
|
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
|
||||||
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
|
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
|
||||||
LastRecoveredNextHop: queue.LastRecoveredNextHop,
|
LastRecoveredNextHop: queue.LastRecoveredNextHop,
|
||||||
|
LastRouteSwitchReason: queue.LastRouteSwitchReason,
|
||||||
LastRouteRecoveryMillis: queue.LastRouteRecoveryMillis,
|
LastRouteRecoveryMillis: queue.LastRouteRecoveryMillis,
|
||||||
RouteSwitchCount: queue.RouteSwitchCount,
|
RouteSwitchCount: queue.RouteSwitchCount,
|
||||||
LastError: queue.LastError,
|
LastError: queue.LastError,
|
||||||
@@ -771,6 +776,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
|
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
|
||||||
LastRecoveredNextHop: stat.LastRecoveredNextHop,
|
LastRecoveredNextHop: stat.LastRecoveredNextHop,
|
||||||
LastRouteSwitchAt: stat.LastRouteSwitchAt,
|
LastRouteSwitchAt: stat.LastRouteSwitchAt,
|
||||||
|
LastRouteSwitchReason: stat.LastRouteSwitchReason,
|
||||||
LastRouteRecoveryMillis: stat.LastRouteRecoveryMillis,
|
LastRouteRecoveryMillis: stat.LastRouteRecoveryMillis,
|
||||||
RouteSwitchCount: stat.RouteSwitchCount,
|
RouteSwitchCount: stat.RouteSwitchCount,
|
||||||
LastError: stat.LastError,
|
LastError: stat.LastError,
|
||||||
@@ -802,6 +808,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
snapshot.RouteSwitchCount += queue.RouteSwitchCount
|
snapshot.RouteSwitchCount += queue.RouteSwitchCount
|
||||||
if queue.LastRecoveredFromRouteID != "" {
|
if queue.LastRecoveredFromRouteID != "" {
|
||||||
snapshot.RouteRecoveredChannelCount++
|
snapshot.RouteRecoveredChannelCount++
|
||||||
|
if reason := strings.TrimSpace(queue.LastRouteSwitchReason); reason != "" {
|
||||||
|
snapshot.RouteSwitchReasonCounts[reason]++
|
||||||
|
}
|
||||||
if queue.LastRouteRecoveryMillis > snapshot.RouteRecoveryMaxMillis {
|
if queue.LastRouteRecoveryMillis > snapshot.RouteRecoveryMaxMillis {
|
||||||
snapshot.RouteRecoveryMaxMillis = queue.LastRouteRecoveryMillis
|
snapshot.RouteRecoveryMaxMillis = queue.LastRouteRecoveryMillis
|
||||||
}
|
}
|
||||||
@@ -827,6 +836,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
if routeRecoverySamples > 0 {
|
if routeRecoverySamples > 0 {
|
||||||
snapshot.RouteRecoveryAvgMillis = routeRecoveryTotalMillis / routeRecoverySamples
|
snapshot.RouteRecoveryAvgMillis = routeRecoveryTotalMillis / routeRecoverySamples
|
||||||
}
|
}
|
||||||
|
if len(snapshot.RouteSwitchReasonCounts) == 0 {
|
||||||
|
snapshot.RouteSwitchReasonCounts = nil
|
||||||
|
}
|
||||||
snapshot.BulkPressureChannelCount = snapshot.TrafficClassCounts[FabricTrafficClassBulk]
|
snapshot.BulkPressureChannelCount = snapshot.TrafficClassCounts[FabricTrafficClassBulk]
|
||||||
snapshot.InteractiveOrControlCount = snapshot.TrafficClassCounts[FabricTrafficClassControl] + snapshot.TrafficClassCounts[FabricTrafficClassInteractive]
|
snapshot.InteractiveOrControlCount = snapshot.TrafficClassCounts[FabricTrafficClassControl] + snapshot.TrafficClassCounts[FabricTrafficClassInteractive]
|
||||||
bulkPressureThreshold := s.adaptivePolicy.BulkPressureChannelThreshold
|
bulkPressureThreshold := s.adaptivePolicy.BulkPressureChannelThreshold
|
||||||
@@ -912,6 +924,27 @@ func classWindowLimit(policy FabricServiceChannelAdaptivePolicy, trafficClass st
|
|||||||
return maxWindow
|
return maxWindow
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// normalizeFabricRouteSwitchReason converts a free-form failure message into a
// compact, lowercase, underscore-separated label suitable for use as a
// telemetry value and as a key in aggregate reason counts.
//
// The input is trimmed and lowercased; spaces, tabs, newlines, carriage
// returns, colons, semicolons and commas become underscores; runs of
// underscores are collapsed; and leading/trailing underscores are removed.
// An input that is empty (before or after normalization) yields
// "route_failure". The result is limited to 80 bytes and is never cut in the
// middle of a multi-byte UTF-8 character.
func normalizeFabricRouteSwitchReason(reason string) string {
	const (
		fallbackReason = "route_failure"
		maxReasonBytes = 80
		// A UTF-8 encoded rune has at most three continuation bytes.
		maxContinuationBytes = 3
	)

	reason = strings.ToLower(strings.TrimSpace(reason))
	if reason == "" {
		return fallbackReason
	}

	replacer := strings.NewReplacer(" ", "_", "\t", "_", "\n", "_", "\r", "_", ":", "_", ";", "_", ",", "_")
	reason = replacer.Replace(reason)
	// Adjacent separators (for example ": ") produce "__"; collapse them.
	for strings.Contains(reason, "__") {
		reason = strings.ReplaceAll(reason, "__", "_")
	}
	reason = strings.Trim(reason, "_")
	if reason == "" {
		return fallbackReason
	}

	if len(reason) > maxReasonBytes {
		// Slicing at a fixed byte offset can split a multi-byte character and
		// leave invalid UTF-8 in the label. Step back while the byte at the cut
		// position is a UTF-8 continuation byte (bit pattern 10xxxxxx) so the
		// cut lands on a character boundary.
		cut := maxReasonBytes
		for cut > maxReasonBytes-maxContinuationBytes && reason[cut]&0xC0 == 0x80 {
			cut--
		}
		reason = reason[:cut]
		// Truncation may expose a separator at the new end.
		reason = strings.TrimRight(reason, "_")
		if reason == "" {
			return fallbackReason
		}
	}
	return reason
}
|
||||||
|
|
||||||
func clampFabricWindow(value, minValue, maxValue int) int {
|
func clampFabricWindow(value, minValue, maxValue int) int {
|
||||||
if value < minValue {
|
if value < minValue {
|
||||||
return minValue
|
return minValue
|
||||||
@@ -1113,11 +1146,13 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
|
|||||||
queue := s.ensureQueueLocked(channelID)
|
queue := s.ensureQueueLocked(channelID)
|
||||||
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
|
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
|
||||||
failedNextHop := strings.TrimSpace(queue.LastNextHop)
|
failedNextHop := strings.TrimSpace(queue.LastNextHop)
|
||||||
|
failedReason := normalizeFabricRouteSwitchReason(queue.LastError)
|
||||||
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
|
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
|
||||||
switchedAt := time.Now().UTC()
|
switchedAt := time.Now().UTC()
|
||||||
queue.LastRecoveredFromRouteID = failedRouteID
|
queue.LastRecoveredFromRouteID = failedRouteID
|
||||||
queue.LastRecoveredNextHop = failedNextHop
|
queue.LastRecoveredNextHop = failedNextHop
|
||||||
queue.LastRouteSwitchAt = switchedAt
|
queue.LastRouteSwitchAt = switchedAt
|
||||||
|
queue.LastRouteSwitchReason = failedReason
|
||||||
queue.LastRouteRecoveryMillis = 0
|
queue.LastRouteRecoveryMillis = 0
|
||||||
if !queue.LastRouteFailureAt.IsZero() {
|
if !queue.LastRouteFailureAt.IsZero() {
|
||||||
queue.LastRouteRecoveryMillis = switchedAt.Sub(queue.LastRouteFailureAt).Milliseconds()
|
queue.LastRouteRecoveryMillis = switchedAt.Sub(queue.LastRouteFailureAt).Milliseconds()
|
||||||
|
|||||||
@@ -1532,6 +1532,7 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
|||||||
}
|
}
|
||||||
if statA.LastRecoveredFromRouteID != "route-primary" ||
|
if statA.LastRecoveredFromRouteID != "route-primary" ||
|
||||||
statA.LastRecoveredNextHop != "relay-primary" ||
|
statA.LastRecoveredNextHop != "relay-primary" ||
|
||||||
|
statA.LastRouteSwitchReason != "production_mesh_next_peer_is_unavailable" ||
|
||||||
statA.RouteSwitchCount != 1 ||
|
statA.RouteSwitchCount != 1 ||
|
||||||
statA.LastRouteFailureAt == "" ||
|
statA.LastRouteFailureAt == "" ||
|
||||||
statA.LastRouteSwitchAt == "" ||
|
statA.LastRouteSwitchAt == "" ||
|
||||||
@@ -1539,7 +1540,8 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
|||||||
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
|
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
|
||||||
snapshot.FlowScheduler.RouteSwitchCount != 1 ||
|
snapshot.FlowScheduler.RouteSwitchCount != 1 ||
|
||||||
snapshot.FlowScheduler.RouteRecoveryMaxMillis != statA.LastRouteRecoveryMillis ||
|
snapshot.FlowScheduler.RouteRecoveryMaxMillis != statA.LastRouteRecoveryMillis ||
|
||||||
snapshot.FlowScheduler.RouteRecoveryAvgMillis != statA.LastRouteRecoveryMillis {
|
snapshot.FlowScheduler.RouteRecoveryAvgMillis != statA.LastRouteRecoveryMillis ||
|
||||||
|
snapshot.FlowScheduler.RouteSwitchReasonCounts["production_mesh_next_peer_is_unavailable"] != 1 {
|
||||||
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
|
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
|
||||||
}
|
}
|
||||||
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
|
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
|
||||||
|
|||||||
@@ -453,6 +453,9 @@ Route recovery telemetry includes failure/switch timestamps and recovery
|
|||||||
duration in milliseconds for each recovered flow channel.
|
duration in milliseconds for each recovered flow channel.
|
||||||
Scheduler snapshots also aggregate route recovery max/average milliseconds
|
Scheduler snapshots also aggregate route recovery max/average milliseconds
|
||||||
across recovered channels for quick load-test health checks.
|
across recovered channels for quick load-test health checks.
|
||||||
|
Route recovery telemetry now includes normalized switch reasons and aggregate
|
||||||
|
reason counts, so load tests can distinguish peer failures, timeouts, and other
|
||||||
|
route-break causes.
|
||||||
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
||||||
penalty instead of a hard recent failure, enabling load spreading without
|
penalty instead of a hard recent failure, enabling load spreading without
|
||||||
marking the carrier unhealthy.
|
marking the carrier unhealthy.
|
||||||
|
|||||||
Reference in New Issue
Block a user