Expose VPN route recovery telemetry

This commit is contained in:
2026-05-16 13:07:42 +03:00
parent 21fe965553
commit d5c089d120
3 changed files with 43 additions and 0 deletions
@@ -183,6 +183,10 @@ type fabricFlowQueue struct {
LastFailedRouteID string LastFailedRouteID string
LastFailedRoutePolicyVersion string LastFailedRoutePolicyVersion string
LastFailedRouteGeneration string LastFailedRouteGeneration string
LastRecoveredFromRouteID string
LastRecoveredNextHop string
LastRouteSwitchAt time.Time
RouteSwitchCount uint64
LastError string LastError string
ConsecutiveFailures uint64 ConsecutiveFailures uint64
StallCount uint64 StallCount uint64
@@ -254,6 +258,8 @@ type FabricFlowSchedulerSnapshot struct {
BulkPressureActive bool `json:"bulk_pressure_active,omitempty"` BulkPressureActive bool `json:"bulk_pressure_active,omitempty"`
BulkPressureChannelCount int `json:"bulk_pressure_channel_count,omitempty"` BulkPressureChannelCount int `json:"bulk_pressure_channel_count,omitempty"`
InteractiveOrControlCount int `json:"interactive_or_control_channel_count,omitempty"` InteractiveOrControlCount int `json:"interactive_or_control_channel_count,omitempty"`
RouteRecoveredChannelCount int `json:"route_recovered_channel_count,omitempty"`
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
SlowChannelCount int `json:"slow_channel_count"` SlowChannelCount int `json:"slow_channel_count"`
FailingChannelCount int `json:"failing_channel_count"` FailingChannelCount int `json:"failing_channel_count"`
QualityWindowSampleCount int `json:"quality_window_sample_count"` QualityWindowSampleCount int `json:"quality_window_sample_count"`
@@ -287,6 +293,10 @@ type FabricFlowStat struct {
LastFailedRouteID string `json:"last_failed_route_id,omitempty"` LastFailedRouteID string `json:"last_failed_route_id,omitempty"`
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"` LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"` LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
LastError string `json:"last_error,omitempty"` LastError string `json:"last_error,omitempty"`
ConsecutiveFailures uint64 `json:"consecutive_failures"` ConsecutiveFailures uint64 `json:"consecutive_failures"`
StallCount uint64 `json:"stall_count"` StallCount uint64 `json:"stall_count"`
@@ -690,6 +700,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
LastFailedRouteID: queue.LastFailedRouteID, LastFailedRouteID: queue.LastFailedRouteID,
LastFailedRoutePolicyVersion: queue.LastFailedRoutePolicyVersion, LastFailedRoutePolicyVersion: queue.LastFailedRoutePolicyVersion,
LastFailedRouteGeneration: queue.LastFailedRouteGeneration, LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
LastRecoveredNextHop: queue.LastRecoveredNextHop,
RouteSwitchCount: queue.RouteSwitchCount,
LastError: queue.LastError, LastError: queue.LastError,
ConsecutiveFailures: queue.ConsecutiveFailures, ConsecutiveFailures: queue.ConsecutiveFailures,
StallCount: queue.StallCount, StallCount: queue.StallCount,
@@ -717,6 +730,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
if !qualityStats.LastUpdatedAt.IsZero() { if !qualityStats.LastUpdatedAt.IsZero() {
stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano) stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano)
} }
if !queue.LastRouteSwitchAt.IsZero() {
stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano)
}
snapshot.ChannelStats[channelID] = FabricFlowStat{ snapshot.ChannelStats[channelID] = FabricFlowStat{
Depth: stat.Depth, Depth: stat.Depth,
TrafficClass: stat.TrafficClass, TrafficClass: stat.TrafficClass,
@@ -739,6 +755,10 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
LastFailedRouteID: stat.LastFailedRouteID, LastFailedRouteID: stat.LastFailedRouteID,
LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion, LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion,
LastFailedRouteGeneration: stat.LastFailedRouteGeneration, LastFailedRouteGeneration: stat.LastFailedRouteGeneration,
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
LastRecoveredNextHop: stat.LastRecoveredNextHop,
LastRouteSwitchAt: stat.LastRouteSwitchAt,
RouteSwitchCount: stat.RouteSwitchCount,
LastError: stat.LastError, LastError: stat.LastError,
ConsecutiveFailures: stat.ConsecutiveFailures, ConsecutiveFailures: stat.ConsecutiveFailures,
StallCount: stat.StallCount, StallCount: stat.StallCount,
@@ -765,6 +785,10 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
snapshot.QualityWindowFailureCount += qualityStats.FailureCount snapshot.QualityWindowFailureCount += qualityStats.FailureCount
snapshot.QualityWindowSlowCount += qualityStats.SlowCount snapshot.QualityWindowSlowCount += qualityStats.SlowCount
snapshot.QualityWindowDropCount += qualityStats.DropCount snapshot.QualityWindowDropCount += qualityStats.DropCount
snapshot.RouteSwitchCount += queue.RouteSwitchCount
if queue.LastRecoveredFromRouteID != "" {
snapshot.RouteRecoveredChannelCount++
}
if queue.Depth >= s.queueCapacity || qualityStats.DropCount > 0 { if queue.Depth >= s.queueCapacity || qualityStats.DropCount > 0 {
snapshot.BackpressureActive = true snapshot.BackpressureActive = true
} }
@@ -1065,6 +1089,14 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
s.mu.Lock() s.mu.Lock()
defer s.mu.Unlock() defer s.mu.Unlock()
queue := s.ensureQueueLocked(channelID) queue := s.ensureQueueLocked(channelID)
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
failedNextHop := strings.TrimSpace(queue.LastNextHop)
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
queue.LastRecoveredFromRouteID = failedRouteID
queue.LastRecoveredNextHop = failedNextHop
queue.LastRouteSwitchAt = time.Now().UTC()
queue.RouteSwitchCount++
}
queue.LastRouteID = routeID queue.LastRouteID = routeID
queue.RoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion) queue.RoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion)
queue.RouteGeneration = strings.TrimSpace(provenance.Generation) queue.RouteGeneration = strings.TrimSpace(provenance.Generation)
@@ -1530,6 +1530,14 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
if statA.LastRouteID != "route-alternate" || statA.LastFailedRouteID != "" || statA.ConsecutiveFailures != 0 { if statA.LastRouteID != "route-alternate" || statA.LastFailedRouteID != "" || statA.ConsecutiveFailures != 0 {
t.Fatalf("channel A stat = %+v, want recovered on alternate route", statA) t.Fatalf("channel A stat = %+v, want recovered on alternate route", statA)
} }
if statA.LastRecoveredFromRouteID != "route-primary" ||
statA.LastRecoveredNextHop != "relay-primary" ||
statA.RouteSwitchCount != 1 ||
statA.LastRouteSwitchAt == "" ||
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
snapshot.FlowScheduler.RouteSwitchCount != 1 {
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
}
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 { if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
t.Fatalf("channel B stat = %+v, want primary route memory preserved", statB) t.Fatalf("channel B stat = %+v, want primary route memory preserved", statB)
} }
@@ -440,6 +440,9 @@ bulk and interactive/control channel counts, making mixed browser/RDP load
diagnosis explicit when bulk windows are reduced to protect interactive traffic.
`mesh-live-smoke` now exercises that mixed-load scheduler path and reports bulk
pressure activation plus bulk/interactive window recommendations.
Flow-scheduler route recovery telemetry now records per-channel route switch
counts and the time of the last switch, the failed route and next hop a channel
recovered from, and aggregate recovered-channel and route-switch counts, making
alternate-route recovery measurable during load tests.
Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without
marking the carrier unhealthy.