Expose VPN route recovery telemetry

This commit is contained in:
2026-05-16 13:07:42 +03:00
parent 21fe965553
commit d5c089d120
3 changed files with 43 additions and 0 deletions
@@ -183,6 +183,10 @@ type fabricFlowQueue struct {
LastFailedRouteID string
LastFailedRoutePolicyVersion string
LastFailedRouteGeneration string
LastRecoveredFromRouteID string
LastRecoveredNextHop string
LastRouteSwitchAt time.Time
RouteSwitchCount uint64
LastError string
ConsecutiveFailures uint64
StallCount uint64
@@ -254,6 +258,8 @@ type FabricFlowSchedulerSnapshot struct {
BulkPressureActive bool `json:"bulk_pressure_active,omitempty"`
BulkPressureChannelCount int `json:"bulk_pressure_channel_count,omitempty"`
InteractiveOrControlCount int `json:"interactive_or_control_channel_count,omitempty"`
RouteRecoveredChannelCount int `json:"route_recovered_channel_count,omitempty"`
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
SlowChannelCount int `json:"slow_channel_count"`
FailingChannelCount int `json:"failing_channel_count"`
QualityWindowSampleCount int `json:"quality_window_sample_count"`
@@ -287,6 +293,10 @@ type FabricFlowStat struct {
LastFailedRouteID string `json:"last_failed_route_id,omitempty"`
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
LastError string `json:"last_error,omitempty"`
ConsecutiveFailures uint64 `json:"consecutive_failures"`
StallCount uint64 `json:"stall_count"`
@@ -690,6 +700,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
LastFailedRouteID: queue.LastFailedRouteID,
LastFailedRoutePolicyVersion: queue.LastFailedRoutePolicyVersion,
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
LastRecoveredNextHop: queue.LastRecoveredNextHop,
RouteSwitchCount: queue.RouteSwitchCount,
LastError: queue.LastError,
ConsecutiveFailures: queue.ConsecutiveFailures,
StallCount: queue.StallCount,
@@ -717,6 +730,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
if !qualityStats.LastUpdatedAt.IsZero() {
stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano)
}
if !queue.LastRouteSwitchAt.IsZero() {
stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano)
}
snapshot.ChannelStats[channelID] = FabricFlowStat{
Depth: stat.Depth,
TrafficClass: stat.TrafficClass,
@@ -739,6 +755,10 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
LastFailedRouteID: stat.LastFailedRouteID,
LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion,
LastFailedRouteGeneration: stat.LastFailedRouteGeneration,
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
LastRecoveredNextHop: stat.LastRecoveredNextHop,
LastRouteSwitchAt: stat.LastRouteSwitchAt,
RouteSwitchCount: stat.RouteSwitchCount,
LastError: stat.LastError,
ConsecutiveFailures: stat.ConsecutiveFailures,
StallCount: stat.StallCount,
@@ -765,6 +785,10 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
snapshot.QualityWindowFailureCount += qualityStats.FailureCount
snapshot.QualityWindowSlowCount += qualityStats.SlowCount
snapshot.QualityWindowDropCount += qualityStats.DropCount
snapshot.RouteSwitchCount += queue.RouteSwitchCount
if queue.LastRecoveredFromRouteID != "" {
snapshot.RouteRecoveredChannelCount++
}
if queue.Depth >= s.queueCapacity || qualityStats.DropCount > 0 {
snapshot.BackpressureActive = true
}
@@ -1065,6 +1089,14 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
s.mu.Lock()
defer s.mu.Unlock()
queue := s.ensureQueueLocked(channelID)
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
failedNextHop := strings.TrimSpace(queue.LastNextHop)
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
queue.LastRecoveredFromRouteID = failedRouteID
queue.LastRecoveredNextHop = failedNextHop
queue.LastRouteSwitchAt = time.Now().UTC()
queue.RouteSwitchCount++
}
queue.LastRouteID = routeID
queue.RoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion)
queue.RouteGeneration = strings.TrimSpace(provenance.Generation)
@@ -1530,6 +1530,14 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
if statA.LastRouteID != "route-alternate" || statA.LastFailedRouteID != "" || statA.ConsecutiveFailures != 0 {
t.Fatalf("channel A stat = %+v, want recovered on alternate route", statA)
}
if statA.LastRecoveredFromRouteID != "route-primary" ||
statA.LastRecoveredNextHop != "relay-primary" ||
statA.RouteSwitchCount != 1 ||
statA.LastRouteSwitchAt == "" ||
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
snapshot.FlowScheduler.RouteSwitchCount != 1 {
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
}
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
t.Fatalf("channel B stat = %+v, want primary route memory preserved", statB)
}