Measure VPN route recovery time

This commit is contained in:
2026-05-16 13:11:09 +03:00
parent d43910d2c0
commit c8e7bd3717
3 changed files with 24 additions and 1 deletions
@@ -183,9 +183,11 @@ type fabricFlowQueue struct {
LastFailedRouteID string
LastFailedRoutePolicyVersion string
LastFailedRouteGeneration string
LastRouteFailureAt time.Time
LastRecoveredFromRouteID string
LastRecoveredNextHop string
LastRouteSwitchAt time.Time
LastRouteRecoveryMillis int64
RouteSwitchCount uint64
LastError string
ConsecutiveFailures uint64
@@ -293,9 +295,11 @@ type FabricFlowStat struct {
LastFailedRouteID string `json:"last_failed_route_id,omitempty"`
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
LastRouteFailureAt string `json:"last_route_failure_at,omitempty"`
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
LastRouteRecoveryMillis int64 `json:"last_route_recovery_ms,omitempty"`
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
LastError string `json:"last_error,omitempty"`
ConsecutiveFailures uint64 `json:"consecutive_failures"`
@@ -702,6 +706,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
LastRecoveredNextHop: queue.LastRecoveredNextHop,
LastRouteRecoveryMillis: queue.LastRouteRecoveryMillis,
RouteSwitchCount: queue.RouteSwitchCount,
LastError: queue.LastError,
ConsecutiveFailures: queue.ConsecutiveFailures,
@@ -730,6 +735,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
if !qualityStats.LastUpdatedAt.IsZero() {
stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano)
}
if !queue.LastRouteFailureAt.IsZero() {
stat.LastRouteFailureAt = queue.LastRouteFailureAt.UTC().Format(time.RFC3339Nano)
}
if !queue.LastRouteSwitchAt.IsZero() {
stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano)
}
@@ -755,9 +763,11 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
LastFailedRouteID: stat.LastFailedRouteID,
LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion,
LastFailedRouteGeneration: stat.LastFailedRouteGeneration,
LastRouteFailureAt: stat.LastRouteFailureAt,
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
LastRecoveredNextHop: stat.LastRecoveredNextHop,
LastRouteSwitchAt: stat.LastRouteSwitchAt,
LastRouteRecoveryMillis: stat.LastRouteRecoveryMillis,
RouteSwitchCount: stat.RouteSwitchCount,
LastError: stat.LastError,
ConsecutiveFailures: stat.ConsecutiveFailures,
@@ -1092,9 +1102,17 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
failedNextHop := strings.TrimSpace(queue.LastNextHop)
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
switchedAt := time.Now().UTC()
queue.LastRecoveredFromRouteID = failedRouteID
queue.LastRecoveredNextHop = failedNextHop
queue.LastRouteSwitchAt = time.Now().UTC()
queue.LastRouteSwitchAt = switchedAt
queue.LastRouteRecoveryMillis = 0
if !queue.LastRouteFailureAt.IsZero() {
queue.LastRouteRecoveryMillis = switchedAt.Sub(queue.LastRouteFailureAt).Milliseconds()
if queue.LastRouteRecoveryMillis < 0 {
queue.LastRouteRecoveryMillis = 0
}
}
queue.RouteSwitchCount++
}
queue.LastRouteID = routeID
@@ -1140,6 +1158,7 @@ func (s *FabricFlowScheduler) RecordRouteFailureWithProvenance(channelID string,
queue.LastFailedRouteID = routeID
queue.LastFailedRoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion)
queue.LastFailedRouteGeneration = strings.TrimSpace(provenance.Generation)
queue.LastRouteFailureAt = time.Now().UTC()
if fp := strings.TrimSpace(provenance.RecoveryPolicyFingerprint); fp != "" {
queue.RecoveryPolicyFingerprint = fp
}
@@ -1533,7 +1533,9 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
if statA.LastRecoveredFromRouteID != "route-primary" ||
statA.LastRecoveredNextHop != "relay-primary" ||
statA.RouteSwitchCount != 1 ||
statA.LastRouteFailureAt == "" ||
statA.LastRouteSwitchAt == "" ||
statA.LastRouteRecoveryMillis < 0 ||
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
snapshot.FlowScheduler.RouteSwitchCount != 1 {
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
@@ -445,6 +445,8 @@ the failed route a channel recovered from, and aggregate recovered-channel /
switch counts, making alternate-route recovery measurable during load tests.
`mesh-live-smoke` now also exercises a primary-route failure followed by an
alternate-route success and reports the resulting route switch count.
Route recovery telemetry includes the last route-failure and route-switch
timestamps and the recovery duration in milliseconds (measured from the last
recorded failure to the switch) for each recovered flow channel.
Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without
marking the carrier unhealthy.