Track VPN route switch reasons
This commit is contained in:
@@ -187,6 +187,7 @@ type fabricFlowQueue struct {
|
||||
LastRecoveredFromRouteID string
|
||||
LastRecoveredNextHop string
|
||||
LastRouteSwitchAt time.Time
|
||||
LastRouteSwitchReason string
|
||||
LastRouteRecoveryMillis int64
|
||||
RouteSwitchCount uint64
|
||||
LastError string
|
||||
@@ -264,6 +265,7 @@ type FabricFlowSchedulerSnapshot struct {
|
||||
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
||||
RouteRecoveryMaxMillis int64 `json:"route_recovery_max_ms,omitempty"`
|
||||
RouteRecoveryAvgMillis int64 `json:"route_recovery_avg_ms,omitempty"`
|
||||
RouteSwitchReasonCounts map[string]int `json:"route_switch_reason_counts,omitempty"`
|
||||
SlowChannelCount int `json:"slow_channel_count"`
|
||||
FailingChannelCount int `json:"failing_channel_count"`
|
||||
QualityWindowSampleCount int `json:"quality_window_sample_count"`
|
||||
@@ -301,6 +303,7 @@ type FabricFlowStat struct {
|
||||
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
|
||||
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
|
||||
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
|
||||
LastRouteSwitchReason string `json:"last_route_switch_reason,omitempty"`
|
||||
LastRouteRecoveryMillis int64 `json:"last_route_recovery_ms,omitempty"`
|
||||
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
@@ -661,6 +664,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
QueueDepths: map[string]int{},
|
||||
TrafficClassCounts: map[string]int{},
|
||||
RecommendedParallelWindows: map[string]int{},
|
||||
RouteSwitchReasonCounts: map[string]int{},
|
||||
ChannelStats: map[string]FabricFlowStat{},
|
||||
}
|
||||
if s == nil {
|
||||
@@ -710,6 +714,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
|
||||
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
|
||||
LastRecoveredNextHop: queue.LastRecoveredNextHop,
|
||||
LastRouteSwitchReason: queue.LastRouteSwitchReason,
|
||||
LastRouteRecoveryMillis: queue.LastRouteRecoveryMillis,
|
||||
RouteSwitchCount: queue.RouteSwitchCount,
|
||||
LastError: queue.LastError,
|
||||
@@ -771,6 +776,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
|
||||
LastRecoveredNextHop: stat.LastRecoveredNextHop,
|
||||
LastRouteSwitchAt: stat.LastRouteSwitchAt,
|
||||
LastRouteSwitchReason: stat.LastRouteSwitchReason,
|
||||
LastRouteRecoveryMillis: stat.LastRouteRecoveryMillis,
|
||||
RouteSwitchCount: stat.RouteSwitchCount,
|
||||
LastError: stat.LastError,
|
||||
@@ -802,6 +808,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
snapshot.RouteSwitchCount += queue.RouteSwitchCount
|
||||
if queue.LastRecoveredFromRouteID != "" {
|
||||
snapshot.RouteRecoveredChannelCount++
|
||||
if reason := strings.TrimSpace(queue.LastRouteSwitchReason); reason != "" {
|
||||
snapshot.RouteSwitchReasonCounts[reason]++
|
||||
}
|
||||
if queue.LastRouteRecoveryMillis > snapshot.RouteRecoveryMaxMillis {
|
||||
snapshot.RouteRecoveryMaxMillis = queue.LastRouteRecoveryMillis
|
||||
}
|
||||
@@ -827,6 +836,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
if routeRecoverySamples > 0 {
|
||||
snapshot.RouteRecoveryAvgMillis = routeRecoveryTotalMillis / routeRecoverySamples
|
||||
}
|
||||
if len(snapshot.RouteSwitchReasonCounts) == 0 {
|
||||
snapshot.RouteSwitchReasonCounts = nil
|
||||
}
|
||||
snapshot.BulkPressureChannelCount = snapshot.TrafficClassCounts[FabricTrafficClassBulk]
|
||||
snapshot.InteractiveOrControlCount = snapshot.TrafficClassCounts[FabricTrafficClassControl] + snapshot.TrafficClassCounts[FabricTrafficClassInteractive]
|
||||
bulkPressureThreshold := s.adaptivePolicy.BulkPressureChannelThreshold
|
||||
@@ -912,6 +924,27 @@ func classWindowLimit(policy FabricServiceChannelAdaptivePolicy, trafficClass st
|
||||
return maxWindow
|
||||
}
|
||||
|
||||
// normalizeFabricRouteSwitchReason converts a free-form failure description
// into a compact, lower-case, underscore-separated label.
//
// The input is trimmed and lower-cased; spaces, tabs, newlines, carriage
// returns, colons, semicolons and commas are each replaced by "_"; runs of
// underscores are collapsed into one; and leading/trailing underscores are
// removed. Labels longer than 80 bytes are truncated on a UTF-8 rune
// boundary, and any trailing underscore exposed by the cut is dropped.
// Whenever nothing usable remains, the fallback label "route_failure" is
// returned.
func normalizeFabricRouteSwitchReason(reason string) string {
	const (
		fallbackReason = "route_failure"
		maxReasonBytes = 80
	)

	reason = strings.ToLower(strings.TrimSpace(reason))
	if reason == "" {
		return fallbackReason
	}

	replacer := strings.NewReplacer(" ", "_", "\t", "_", "\n", "_", "\r", "_", ":", "_", ";", "_", ",", "_")
	reason = replacer.Replace(reason)
	// A single ReplaceAll pass only halves a run of underscores, so repeat
	// until no doubled underscore is left.
	for strings.Contains(reason, "__") {
		reason = strings.ReplaceAll(reason, "__", "_")
	}
	reason = strings.Trim(reason, "_")
	if reason == "" {
		return fallbackReason
	}

	if len(reason) > maxReasonBytes {
		// Slicing at a fixed byte offset can split a multi-byte rune and
		// leave invalid UTF-8 behind. UTF-8 continuation bytes have the bit
		// pattern 10xxxxxx, so step the cut point back while the byte right
		// after the cut is a continuation byte.
		cut := maxReasonBytes
		for cut > 0 && reason[cut]&0xC0 == 0x80 {
			cut--
		}
		reason = strings.TrimRight(reason[:cut], "_")
		if reason == "" {
			// Only reachable for malformed input made of continuation bytes.
			return fallbackReason
		}
	}
	return reason
}
|
||||
|
||||
func clampFabricWindow(value, minValue, maxValue int) int {
|
||||
if value < minValue {
|
||||
return minValue
|
||||
@@ -1113,11 +1146,13 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
|
||||
queue := s.ensureQueueLocked(channelID)
|
||||
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
|
||||
failedNextHop := strings.TrimSpace(queue.LastNextHop)
|
||||
failedReason := normalizeFabricRouteSwitchReason(queue.LastError)
|
||||
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
|
||||
switchedAt := time.Now().UTC()
|
||||
queue.LastRecoveredFromRouteID = failedRouteID
|
||||
queue.LastRecoveredNextHop = failedNextHop
|
||||
queue.LastRouteSwitchAt = switchedAt
|
||||
queue.LastRouteSwitchReason = failedReason
|
||||
queue.LastRouteRecoveryMillis = 0
|
||||
if !queue.LastRouteFailureAt.IsZero() {
|
||||
queue.LastRouteRecoveryMillis = switchedAt.Sub(queue.LastRouteFailureAt).Milliseconds()
|
||||
|
||||
@@ -1532,6 +1532,7 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
||||
}
|
||||
if statA.LastRecoveredFromRouteID != "route-primary" ||
|
||||
statA.LastRecoveredNextHop != "relay-primary" ||
|
||||
statA.LastRouteSwitchReason != "production_mesh_next_peer_is_unavailable" ||
|
||||
statA.RouteSwitchCount != 1 ||
|
||||
statA.LastRouteFailureAt == "" ||
|
||||
statA.LastRouteSwitchAt == "" ||
|
||||
@@ -1539,7 +1540,8 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
||||
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
|
||||
snapshot.FlowScheduler.RouteSwitchCount != 1 ||
|
||||
snapshot.FlowScheduler.RouteRecoveryMaxMillis != statA.LastRouteRecoveryMillis ||
|
||||
snapshot.FlowScheduler.RouteRecoveryAvgMillis != statA.LastRouteRecoveryMillis {
|
||||
snapshot.FlowScheduler.RouteRecoveryAvgMillis != statA.LastRouteRecoveryMillis ||
|
||||
snapshot.FlowScheduler.RouteSwitchReasonCounts["production_mesh_next_peer_is_unavailable"] != 1 {
|
||||
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
|
||||
}
|
||||
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
|
||||
|
||||
Reference in New Issue
Block a user