Measure VPN route recovery time

This commit is contained in:
2026-05-16 13:11:09 +03:00
parent d43910d2c0
commit c8e7bd3717
3 changed files with 24 additions and 1 deletions
@@ -183,9 +183,11 @@ type fabricFlowQueue struct {
LastFailedRouteID string LastFailedRouteID string
LastFailedRoutePolicyVersion string LastFailedRoutePolicyVersion string
LastFailedRouteGeneration string LastFailedRouteGeneration string
LastRouteFailureAt time.Time
LastRecoveredFromRouteID string LastRecoveredFromRouteID string
LastRecoveredNextHop string LastRecoveredNextHop string
LastRouteSwitchAt time.Time LastRouteSwitchAt time.Time
LastRouteRecoveryMillis int64
RouteSwitchCount uint64 RouteSwitchCount uint64
LastError string LastError string
ConsecutiveFailures uint64 ConsecutiveFailures uint64
@@ -293,9 +295,11 @@ type FabricFlowStat struct {
LastFailedRouteID string `json:"last_failed_route_id,omitempty"` LastFailedRouteID string `json:"last_failed_route_id,omitempty"`
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"` LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"` LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
LastRouteFailureAt string `json:"last_route_failure_at,omitempty"`
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"` LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"` LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"` LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
LastRouteRecoveryMillis int64 `json:"last_route_recovery_ms,omitempty"`
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"` RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
LastError string `json:"last_error,omitempty"` LastError string `json:"last_error,omitempty"`
ConsecutiveFailures uint64 `json:"consecutive_failures"` ConsecutiveFailures uint64 `json:"consecutive_failures"`
@@ -702,6 +706,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
LastFailedRouteGeneration: queue.LastFailedRouteGeneration, LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID, LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
LastRecoveredNextHop: queue.LastRecoveredNextHop, LastRecoveredNextHop: queue.LastRecoveredNextHop,
LastRouteRecoveryMillis: queue.LastRouteRecoveryMillis,
RouteSwitchCount: queue.RouteSwitchCount, RouteSwitchCount: queue.RouteSwitchCount,
LastError: queue.LastError, LastError: queue.LastError,
ConsecutiveFailures: queue.ConsecutiveFailures, ConsecutiveFailures: queue.ConsecutiveFailures,
@@ -730,6 +735,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
if !qualityStats.LastUpdatedAt.IsZero() { if !qualityStats.LastUpdatedAt.IsZero() {
stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano) stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano)
} }
if !queue.LastRouteFailureAt.IsZero() {
stat.LastRouteFailureAt = queue.LastRouteFailureAt.UTC().Format(time.RFC3339Nano)
}
if !queue.LastRouteSwitchAt.IsZero() { if !queue.LastRouteSwitchAt.IsZero() {
stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano) stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano)
} }
@@ -755,9 +763,11 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
LastFailedRouteID: stat.LastFailedRouteID, LastFailedRouteID: stat.LastFailedRouteID,
LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion, LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion,
LastFailedRouteGeneration: stat.LastFailedRouteGeneration, LastFailedRouteGeneration: stat.LastFailedRouteGeneration,
LastRouteFailureAt: stat.LastRouteFailureAt,
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID, LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
LastRecoveredNextHop: stat.LastRecoveredNextHop, LastRecoveredNextHop: stat.LastRecoveredNextHop,
LastRouteSwitchAt: stat.LastRouteSwitchAt, LastRouteSwitchAt: stat.LastRouteSwitchAt,
LastRouteRecoveryMillis: stat.LastRouteRecoveryMillis,
RouteSwitchCount: stat.RouteSwitchCount, RouteSwitchCount: stat.RouteSwitchCount,
LastError: stat.LastError, LastError: stat.LastError,
ConsecutiveFailures: stat.ConsecutiveFailures, ConsecutiveFailures: stat.ConsecutiveFailures,
@@ -1092,9 +1102,17 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID) failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
failedNextHop := strings.TrimSpace(queue.LastNextHop) failedNextHop := strings.TrimSpace(queue.LastNextHop)
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) { if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
switchedAt := time.Now().UTC()
queue.LastRecoveredFromRouteID = failedRouteID queue.LastRecoveredFromRouteID = failedRouteID
queue.LastRecoveredNextHop = failedNextHop queue.LastRecoveredNextHop = failedNextHop
queue.LastRouteSwitchAt = time.Now().UTC() queue.LastRouteSwitchAt = switchedAt
queue.LastRouteRecoveryMillis = 0
if !queue.LastRouteFailureAt.IsZero() {
queue.LastRouteRecoveryMillis = switchedAt.Sub(queue.LastRouteFailureAt).Milliseconds()
if queue.LastRouteRecoveryMillis < 0 {
queue.LastRouteRecoveryMillis = 0
}
}
queue.RouteSwitchCount++ queue.RouteSwitchCount++
} }
queue.LastRouteID = routeID queue.LastRouteID = routeID
@@ -1140,6 +1158,7 @@ func (s *FabricFlowScheduler) RecordRouteFailureWithProvenance(channelID string,
queue.LastFailedRouteID = routeID queue.LastFailedRouteID = routeID
queue.LastFailedRoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion) queue.LastFailedRoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion)
queue.LastFailedRouteGeneration = strings.TrimSpace(provenance.Generation) queue.LastFailedRouteGeneration = strings.TrimSpace(provenance.Generation)
queue.LastRouteFailureAt = time.Now().UTC()
if fp := strings.TrimSpace(provenance.RecoveryPolicyFingerprint); fp != "" { if fp := strings.TrimSpace(provenance.RecoveryPolicyFingerprint); fp != "" {
queue.RecoveryPolicyFingerprint = fp queue.RecoveryPolicyFingerprint = fp
} }
@@ -1533,7 +1533,9 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
if statA.LastRecoveredFromRouteID != "route-primary" || if statA.LastRecoveredFromRouteID != "route-primary" ||
statA.LastRecoveredNextHop != "relay-primary" || statA.LastRecoveredNextHop != "relay-primary" ||
statA.RouteSwitchCount != 1 || statA.RouteSwitchCount != 1 ||
statA.LastRouteFailureAt == "" ||
statA.LastRouteSwitchAt == "" || statA.LastRouteSwitchAt == "" ||
statA.LastRouteRecoveryMillis < 0 ||
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 || snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
snapshot.FlowScheduler.RouteSwitchCount != 1 { snapshot.FlowScheduler.RouteSwitchCount != 1 {
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler) t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
@@ -445,6 +445,8 @@ the failed route a channel recovered from, and aggregate recovered-channel /
switch counts, making alternate-route recovery measurable during load tests.
`mesh-live-smoke` now also exercises a primary-route failure followed by an
alternate-route success and reports the resulting route switch count.
Route recovery telemetry includes the failure and switch timestamps
(`last_route_failure_at`, `last_route_switch_at`) and the recovery duration in
milliseconds (`last_route_recovery_ms`) for each recovered flow channel.
Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without
marking the carrier unhealthy.