Measure VPN route recovery time
This commit is contained in:
@@ -183,9 +183,11 @@ type fabricFlowQueue struct {
|
||||
LastFailedRouteID string
|
||||
LastFailedRoutePolicyVersion string
|
||||
LastFailedRouteGeneration string
|
||||
LastRouteFailureAt time.Time
|
||||
LastRecoveredFromRouteID string
|
||||
LastRecoveredNextHop string
|
||||
LastRouteSwitchAt time.Time
|
||||
LastRouteRecoveryMillis int64
|
||||
RouteSwitchCount uint64
|
||||
LastError string
|
||||
ConsecutiveFailures uint64
|
||||
@@ -293,9 +295,11 @@ type FabricFlowStat struct {
|
||||
LastFailedRouteID string `json:"last_failed_route_id,omitempty"`
|
||||
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
|
||||
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
|
||||
LastRouteFailureAt string `json:"last_route_failure_at,omitempty"`
|
||||
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
|
||||
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
|
||||
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
|
||||
LastRouteRecoveryMillis int64 `json:"last_route_recovery_ms,omitempty"`
|
||||
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
||||
LastError string `json:"last_error,omitempty"`
|
||||
ConsecutiveFailures uint64 `json:"consecutive_failures"`
|
||||
@@ -702,6 +706,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
|
||||
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
|
||||
LastRecoveredNextHop: queue.LastRecoveredNextHop,
|
||||
LastRouteRecoveryMillis: queue.LastRouteRecoveryMillis,
|
||||
RouteSwitchCount: queue.RouteSwitchCount,
|
||||
LastError: queue.LastError,
|
||||
ConsecutiveFailures: queue.ConsecutiveFailures,
|
||||
@@ -730,6 +735,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
if !qualityStats.LastUpdatedAt.IsZero() {
|
||||
stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano)
|
||||
}
|
||||
if !queue.LastRouteFailureAt.IsZero() {
|
||||
stat.LastRouteFailureAt = queue.LastRouteFailureAt.UTC().Format(time.RFC3339Nano)
|
||||
}
|
||||
if !queue.LastRouteSwitchAt.IsZero() {
|
||||
stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano)
|
||||
}
|
||||
@@ -755,9 +763,11 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
||||
LastFailedRouteID: stat.LastFailedRouteID,
|
||||
LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion,
|
||||
LastFailedRouteGeneration: stat.LastFailedRouteGeneration,
|
||||
LastRouteFailureAt: stat.LastRouteFailureAt,
|
||||
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
|
||||
LastRecoveredNextHop: stat.LastRecoveredNextHop,
|
||||
LastRouteSwitchAt: stat.LastRouteSwitchAt,
|
||||
LastRouteRecoveryMillis: stat.LastRouteRecoveryMillis,
|
||||
RouteSwitchCount: stat.RouteSwitchCount,
|
||||
LastError: stat.LastError,
|
||||
ConsecutiveFailures: stat.ConsecutiveFailures,
|
||||
@@ -1092,9 +1102,17 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
|
||||
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
|
||||
failedNextHop := strings.TrimSpace(queue.LastNextHop)
|
||||
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
|
||||
switchedAt := time.Now().UTC()
|
||||
queue.LastRecoveredFromRouteID = failedRouteID
|
||||
queue.LastRecoveredNextHop = failedNextHop
|
||||
queue.LastRouteSwitchAt = time.Now().UTC()
|
||||
queue.LastRouteSwitchAt = switchedAt
|
||||
queue.LastRouteRecoveryMillis = 0
|
||||
if !queue.LastRouteFailureAt.IsZero() {
|
||||
queue.LastRouteRecoveryMillis = switchedAt.Sub(queue.LastRouteFailureAt).Milliseconds()
|
||||
if queue.LastRouteRecoveryMillis < 0 {
|
||||
queue.LastRouteRecoveryMillis = 0
|
||||
}
|
||||
}
|
||||
queue.RouteSwitchCount++
|
||||
}
|
||||
queue.LastRouteID = routeID
|
||||
@@ -1140,6 +1158,7 @@ func (s *FabricFlowScheduler) RecordRouteFailureWithProvenance(channelID string,
|
||||
queue.LastFailedRouteID = routeID
|
||||
queue.LastFailedRoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion)
|
||||
queue.LastFailedRouteGeneration = strings.TrimSpace(provenance.Generation)
|
||||
queue.LastRouteFailureAt = time.Now().UTC()
|
||||
if fp := strings.TrimSpace(provenance.RecoveryPolicyFingerprint); fp != "" {
|
||||
queue.RecoveryPolicyFingerprint = fp
|
||||
}
|
||||
|
||||
@@ -1533,7 +1533,9 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
||||
if statA.LastRecoveredFromRouteID != "route-primary" ||
|
||||
statA.LastRecoveredNextHop != "relay-primary" ||
|
||||
statA.RouteSwitchCount != 1 ||
|
||||
statA.LastRouteFailureAt == "" ||
|
||||
statA.LastRouteSwitchAt == "" ||
|
||||
statA.LastRouteRecoveryMillis < 0 ||
|
||||
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
|
||||
snapshot.FlowScheduler.RouteSwitchCount != 1 {
|
||||
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
|
||||
|
||||
Reference in New Issue
Block a user