Expose VPN route recovery telemetry
This commit is contained in:
@@ -183,6 +183,10 @@ type fabricFlowQueue struct {
|
||||
LastFailedRouteID string
|
||||
LastFailedRoutePolicyVersion string
|
||||
LastFailedRouteGeneration string
|
||||
LastRecoveredFromRouteID string
|
||||
LastRecoveredNextHop string
|
||||
LastRouteSwitchAt time.Time
|
||||
RouteSwitchCount uint64
|
||||
LastError string
|
||||
ConsecutiveFailures uint64
|
||||
StallCount uint64
|
||||
@@ -254,6 +258,8 @@ type FabricFlowSchedulerSnapshot struct {
|
||||
BulkPressureActive bool `json:"bulk_pressure_active,omitempty"`
|
||||
BulkPressureChannelCount int `json:"bulk_pressure_channel_count,omitempty"`
|
||||
InteractiveOrControlCount int `json:"interactive_or_control_channel_count,omitempty"`
|
||||
RouteRecoveredChannelCount int `json:"route_recovered_channel_count,omitempty"`
|
||||
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
||||
SlowChannelCount int `json:"slow_channel_count"`
|
||||
FailingChannelCount int `json:"failing_channel_count"`
|
||||
QualityWindowSampleCount int `json:"quality_window_sample_count"`
|
||||
@@ -287,6 +293,10 @@ type FabricFlowStat struct {
|
||||
LastFailedRouteID string `json:"last_failed_route_id,omitempty"`
|
||||
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
|
||||
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
|
||||
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
|
||||
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
|
||||
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
|
||||
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
ConsecutiveFailures uint64 `json:"consecutive_failures"`
|
||||
StallCount uint64 `json:"stall_count"`
|
||||
@@ -690,6 +700,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
LastFailedRouteID: queue.LastFailedRouteID,
|
||||
LastFailedRoutePolicyVersion: queue.LastFailedRoutePolicyVersion,
|
||||
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
|
||||
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
|
||||
LastRecoveredNextHop: queue.LastRecoveredNextHop,
|
||||
RouteSwitchCount: queue.RouteSwitchCount,
|
||||
LastError: queue.LastError,
|
||||
ConsecutiveFailures: queue.ConsecutiveFailures,
|
||||
StallCount: queue.StallCount,
|
||||
@@ -717,6 +730,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
if !qualityStats.LastUpdatedAt.IsZero() {
|
||||
stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano)
|
||||
}
|
||||
if !queue.LastRouteSwitchAt.IsZero() {
|
||||
stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano)
|
||||
}
|
||||
snapshot.ChannelStats[channelID] = FabricFlowStat{
|
||||
Depth: stat.Depth,
|
||||
TrafficClass: stat.TrafficClass,
|
||||
@@ -739,6 +755,10 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
LastFailedRouteID: stat.LastFailedRouteID,
|
||||
LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion,
|
||||
LastFailedRouteGeneration: stat.LastFailedRouteGeneration,
|
||||
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
|
||||
LastRecoveredNextHop: stat.LastRecoveredNextHop,
|
||||
LastRouteSwitchAt: stat.LastRouteSwitchAt,
|
||||
RouteSwitchCount: stat.RouteSwitchCount,
|
||||
LastError: stat.LastError,
|
||||
ConsecutiveFailures: stat.ConsecutiveFailures,
|
||||
StallCount: stat.StallCount,
|
||||
@@ -765,6 +785,10 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
snapshot.QualityWindowFailureCount += qualityStats.FailureCount
|
||||
snapshot.QualityWindowSlowCount += qualityStats.SlowCount
|
||||
snapshot.QualityWindowDropCount += qualityStats.DropCount
|
||||
snapshot.RouteSwitchCount += queue.RouteSwitchCount
|
||||
if queue.LastRecoveredFromRouteID != "" {
|
||||
snapshot.RouteRecoveredChannelCount++
|
||||
}
|
||||
if queue.Depth >= s.queueCapacity || qualityStats.DropCount > 0 {
|
||||
snapshot.BackpressureActive = true
|
||||
}
|
||||
@@ -1065,6 +1089,14 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
queue := s.ensureQueueLocked(channelID)
|
||||
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
|
||||
failedNextHop := strings.TrimSpace(queue.LastNextHop)
|
||||
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
|
||||
queue.LastRecoveredFromRouteID = failedRouteID
|
||||
queue.LastRecoveredNextHop = failedNextHop
|
||||
queue.LastRouteSwitchAt = time.Now().UTC()
|
||||
queue.RouteSwitchCount++
|
||||
}
|
||||
queue.LastRouteID = routeID
|
||||
queue.RoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion)
|
||||
queue.RouteGeneration = strings.TrimSpace(provenance.Generation)
|
||||
|
||||
@@ -1530,6 +1530,14 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
||||
if statA.LastRouteID != "route-alternate" || statA.LastFailedRouteID != "" || statA.ConsecutiveFailures != 0 {
|
||||
t.Fatalf("channel A stat = %+v, want recovered on alternate route", statA)
|
||||
}
|
||||
if statA.LastRecoveredFromRouteID != "route-primary" ||
|
||||
statA.LastRecoveredNextHop != "relay-primary" ||
|
||||
statA.RouteSwitchCount != 1 ||
|
||||
statA.LastRouteSwitchAt == "" ||
|
||||
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
|
||||
snapshot.FlowScheduler.RouteSwitchCount != 1 {
|
||||
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
|
||||
}
|
||||
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
|
||||
t.Fatalf("channel B stat = %+v, want primary route memory preserved", statB)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user