Measure VPN route recovery time
This commit is contained in:
@@ -183,9 +183,11 @@ type fabricFlowQueue struct {
|
|||||||
LastFailedRouteID string
|
LastFailedRouteID string
|
||||||
LastFailedRoutePolicyVersion string
|
LastFailedRoutePolicyVersion string
|
||||||
LastFailedRouteGeneration string
|
LastFailedRouteGeneration string
|
||||||
|
LastRouteFailureAt time.Time
|
||||||
LastRecoveredFromRouteID string
|
LastRecoveredFromRouteID string
|
||||||
LastRecoveredNextHop string
|
LastRecoveredNextHop string
|
||||||
LastRouteSwitchAt time.Time
|
LastRouteSwitchAt time.Time
|
||||||
|
LastRouteRecoveryMillis int64
|
||||||
RouteSwitchCount uint64
|
RouteSwitchCount uint64
|
||||||
LastError string
|
LastError string
|
||||||
ConsecutiveFailures uint64
|
ConsecutiveFailures uint64
|
||||||
@@ -293,9 +295,11 @@ type FabricFlowStat struct {
|
|||||||
LastFailedRouteID string `json:"last_failed_route_id,omitempty"`
|
LastFailedRouteID string `json:"last_failed_route_id,omitempty"`
|
||||||
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
|
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
|
||||||
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
|
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
|
||||||
|
LastRouteFailureAt string `json:"last_route_failure_at,omitempty"`
|
||||||
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
|
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
|
||||||
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
|
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
|
||||||
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
|
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
|
||||||
|
LastRouteRecoveryMillis int64 `json:"last_route_recovery_ms,omitempty"`
|
||||||
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
||||||
LastError string `json:"last_error,omitempty"`
|
LastError string `json:"last_error,omitempty"`
|
||||||
ConsecutiveFailures uint64 `json:"consecutive_failures"`
|
ConsecutiveFailures uint64 `json:"consecutive_failures"`
|
||||||
@@ -702,6 +706,7 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
|
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
|
||||||
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
|
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
|
||||||
LastRecoveredNextHop: queue.LastRecoveredNextHop,
|
LastRecoveredNextHop: queue.LastRecoveredNextHop,
|
||||||
|
LastRouteRecoveryMillis: queue.LastRouteRecoveryMillis,
|
||||||
RouteSwitchCount: queue.RouteSwitchCount,
|
RouteSwitchCount: queue.RouteSwitchCount,
|
||||||
LastError: queue.LastError,
|
LastError: queue.LastError,
|
||||||
ConsecutiveFailures: queue.ConsecutiveFailures,
|
ConsecutiveFailures: queue.ConsecutiveFailures,
|
||||||
@@ -730,6 +735,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
if !qualityStats.LastUpdatedAt.IsZero() {
|
if !qualityStats.LastUpdatedAt.IsZero() {
|
||||||
stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano)
|
stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano)
|
||||||
}
|
}
|
||||||
|
if !queue.LastRouteFailureAt.IsZero() {
|
||||||
|
stat.LastRouteFailureAt = queue.LastRouteFailureAt.UTC().Format(time.RFC3339Nano)
|
||||||
|
}
|
||||||
if !queue.LastRouteSwitchAt.IsZero() {
|
if !queue.LastRouteSwitchAt.IsZero() {
|
||||||
stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano)
|
stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano)
|
||||||
}
|
}
|
||||||
@@ -755,9 +763,11 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
LastFailedRouteID: stat.LastFailedRouteID,
|
LastFailedRouteID: stat.LastFailedRouteID,
|
||||||
LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion,
|
LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion,
|
||||||
LastFailedRouteGeneration: stat.LastFailedRouteGeneration,
|
LastFailedRouteGeneration: stat.LastFailedRouteGeneration,
|
||||||
|
LastRouteFailureAt: stat.LastRouteFailureAt,
|
||||||
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
|
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
|
||||||
LastRecoveredNextHop: stat.LastRecoveredNextHop,
|
LastRecoveredNextHop: stat.LastRecoveredNextHop,
|
||||||
LastRouteSwitchAt: stat.LastRouteSwitchAt,
|
LastRouteSwitchAt: stat.LastRouteSwitchAt,
|
||||||
|
LastRouteRecoveryMillis: stat.LastRouteRecoveryMillis,
|
||||||
RouteSwitchCount: stat.RouteSwitchCount,
|
RouteSwitchCount: stat.RouteSwitchCount,
|
||||||
LastError: stat.LastError,
|
LastError: stat.LastError,
|
||||||
ConsecutiveFailures: stat.ConsecutiveFailures,
|
ConsecutiveFailures: stat.ConsecutiveFailures,
|
||||||
@@ -1092,9 +1102,17 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
|
|||||||
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
|
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
|
||||||
failedNextHop := strings.TrimSpace(queue.LastNextHop)
|
failedNextHop := strings.TrimSpace(queue.LastNextHop)
|
||||||
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
|
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
|
||||||
|
switchedAt := time.Now().UTC()
|
||||||
queue.LastRecoveredFromRouteID = failedRouteID
|
queue.LastRecoveredFromRouteID = failedRouteID
|
||||||
queue.LastRecoveredNextHop = failedNextHop
|
queue.LastRecoveredNextHop = failedNextHop
|
||||||
queue.LastRouteSwitchAt = time.Now().UTC()
|
queue.LastRouteSwitchAt = switchedAt
|
||||||
|
queue.LastRouteRecoveryMillis = 0
|
||||||
|
if !queue.LastRouteFailureAt.IsZero() {
|
||||||
|
queue.LastRouteRecoveryMillis = switchedAt.Sub(queue.LastRouteFailureAt).Milliseconds()
|
||||||
|
if queue.LastRouteRecoveryMillis < 0 {
|
||||||
|
queue.LastRouteRecoveryMillis = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
queue.RouteSwitchCount++
|
queue.RouteSwitchCount++
|
||||||
}
|
}
|
||||||
queue.LastRouteID = routeID
|
queue.LastRouteID = routeID
|
||||||
@@ -1140,6 +1158,7 @@ func (s *FabricFlowScheduler) RecordRouteFailureWithProvenance(channelID string,
|
|||||||
queue.LastFailedRouteID = routeID
|
queue.LastFailedRouteID = routeID
|
||||||
queue.LastFailedRoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion)
|
queue.LastFailedRoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion)
|
||||||
queue.LastFailedRouteGeneration = strings.TrimSpace(provenance.Generation)
|
queue.LastFailedRouteGeneration = strings.TrimSpace(provenance.Generation)
|
||||||
|
queue.LastRouteFailureAt = time.Now().UTC()
|
||||||
if fp := strings.TrimSpace(provenance.RecoveryPolicyFingerprint); fp != "" {
|
if fp := strings.TrimSpace(provenance.RecoveryPolicyFingerprint); fp != "" {
|
||||||
queue.RecoveryPolicyFingerprint = fp
|
queue.RecoveryPolicyFingerprint = fp
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1533,7 +1533,9 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
|||||||
if statA.LastRecoveredFromRouteID != "route-primary" ||
|
if statA.LastRecoveredFromRouteID != "route-primary" ||
|
||||||
statA.LastRecoveredNextHop != "relay-primary" ||
|
statA.LastRecoveredNextHop != "relay-primary" ||
|
||||||
statA.RouteSwitchCount != 1 ||
|
statA.RouteSwitchCount != 1 ||
|
||||||
|
statA.LastRouteFailureAt == "" ||
|
||||||
statA.LastRouteSwitchAt == "" ||
|
statA.LastRouteSwitchAt == "" ||
|
||||||
|
statA.LastRouteRecoveryMillis < 0 ||
|
||||||
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
|
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
|
||||||
snapshot.FlowScheduler.RouteSwitchCount != 1 {
|
snapshot.FlowScheduler.RouteSwitchCount != 1 {
|
||||||
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
|
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
|
||||||
|
|||||||
@@ -445,6 +445,8 @@ the failed route a channel recovered from, and aggregate recovered-channel /
|
|||||||
switch counts, making alternate-route recovery measurable during load tests.
|
switch counts, making alternate-route recovery measurable during load tests.
|
||||||
`mesh-live-smoke` now also exercises a primary-route failure followed by an
|
`mesh-live-smoke` now also exercises a primary-route failure followed by an
|
||||||
alternate-route success and reports the resulting route switch count.
|
alternate-route success and reports the resulting route switch count.
|
||||||
|
Route recovery telemetry includes the route failure (`last_route_failure_at`)
and route switch (`last_route_switch_at`) timestamps and the recovery
|
||||||
|
duration in milliseconds (`last_route_recovery_ms`, omitted from JSON when
zero) for each recovered flow channel.
|
||||||
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
||||||
penalty instead of a hard recent failure, enabling load spreading without
|
penalty instead of a hard recent failure, enabling load spreading without
|
||||||
marking the carrier unhealthy.
|
marking the carrier unhealthy.
|
||||||
|
|||||||
Reference in New Issue
Block a user