Expose VPN route recovery telemetry
This commit is contained in:
@@ -183,6 +183,10 @@ type fabricFlowQueue struct {
|
|||||||
LastFailedRouteID string
|
LastFailedRouteID string
|
||||||
LastFailedRoutePolicyVersion string
|
LastFailedRoutePolicyVersion string
|
||||||
LastFailedRouteGeneration string
|
LastFailedRouteGeneration string
|
||||||
|
LastRecoveredFromRouteID string
|
||||||
|
LastRecoveredNextHop string
|
||||||
|
LastRouteSwitchAt time.Time
|
||||||
|
RouteSwitchCount uint64
|
||||||
LastError string
|
LastError string
|
||||||
ConsecutiveFailures uint64
|
ConsecutiveFailures uint64
|
||||||
StallCount uint64
|
StallCount uint64
|
||||||
@@ -254,6 +258,8 @@ type FabricFlowSchedulerSnapshot struct {
|
|||||||
BulkPressureActive bool `json:"bulk_pressure_active,omitempty"`
|
BulkPressureActive bool `json:"bulk_pressure_active,omitempty"`
|
||||||
BulkPressureChannelCount int `json:"bulk_pressure_channel_count,omitempty"`
|
BulkPressureChannelCount int `json:"bulk_pressure_channel_count,omitempty"`
|
||||||
InteractiveOrControlCount int `json:"interactive_or_control_channel_count,omitempty"`
|
InteractiveOrControlCount int `json:"interactive_or_control_channel_count,omitempty"`
|
||||||
|
RouteRecoveredChannelCount int `json:"route_recovered_channel_count,omitempty"`
|
||||||
|
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
||||||
SlowChannelCount int `json:"slow_channel_count"`
|
SlowChannelCount int `json:"slow_channel_count"`
|
||||||
FailingChannelCount int `json:"failing_channel_count"`
|
FailingChannelCount int `json:"failing_channel_count"`
|
||||||
QualityWindowSampleCount int `json:"quality_window_sample_count"`
|
QualityWindowSampleCount int `json:"quality_window_sample_count"`
|
||||||
@@ -287,6 +293,10 @@ type FabricFlowStat struct {
|
|||||||
LastFailedRouteID string `json:"last_failed_route_id,omitempty"`
|
LastFailedRouteID string `json:"last_failed_route_id,omitempty"`
|
||||||
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
|
LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
|
||||||
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
|
LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
|
||||||
|
LastRecoveredFromRouteID string `json:"last_recovered_from_route_id,omitempty"`
|
||||||
|
LastRecoveredNextHop string `json:"last_recovered_next_hop,omitempty"`
|
||||||
|
LastRouteSwitchAt string `json:"last_route_switch_at,omitempty"`
|
||||||
|
RouteSwitchCount uint64 `json:"route_switch_count,omitempty"`
|
||||||
LastError string `json:"last_error,omitempty"`
|
LastError string `json:"last_error,omitempty"`
|
||||||
ConsecutiveFailures uint64 `json:"consecutive_failures"`
|
ConsecutiveFailures uint64 `json:"consecutive_failures"`
|
||||||
StallCount uint64 `json:"stall_count"`
|
StallCount uint64 `json:"stall_count"`
|
||||||
@@ -690,6 +700,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
LastFailedRouteID: queue.LastFailedRouteID,
|
LastFailedRouteID: queue.LastFailedRouteID,
|
||||||
LastFailedRoutePolicyVersion: queue.LastFailedRoutePolicyVersion,
|
LastFailedRoutePolicyVersion: queue.LastFailedRoutePolicyVersion,
|
||||||
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
|
LastFailedRouteGeneration: queue.LastFailedRouteGeneration,
|
||||||
|
LastRecoveredFromRouteID: queue.LastRecoveredFromRouteID,
|
||||||
|
LastRecoveredNextHop: queue.LastRecoveredNextHop,
|
||||||
|
RouteSwitchCount: queue.RouteSwitchCount,
|
||||||
LastError: queue.LastError,
|
LastError: queue.LastError,
|
||||||
ConsecutiveFailures: queue.ConsecutiveFailures,
|
ConsecutiveFailures: queue.ConsecutiveFailures,
|
||||||
StallCount: queue.StallCount,
|
StallCount: queue.StallCount,
|
||||||
@@ -717,6 +730,9 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
if !qualityStats.LastUpdatedAt.IsZero() {
|
if !qualityStats.LastUpdatedAt.IsZero() {
|
||||||
stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano)
|
stat.QualityWindowLastUpdatedAt = qualityStats.LastUpdatedAt.UTC().Format(time.RFC3339Nano)
|
||||||
}
|
}
|
||||||
|
if !queue.LastRouteSwitchAt.IsZero() {
|
||||||
|
stat.LastRouteSwitchAt = queue.LastRouteSwitchAt.UTC().Format(time.RFC3339Nano)
|
||||||
|
}
|
||||||
snapshot.ChannelStats[channelID] = FabricFlowStat{
|
snapshot.ChannelStats[channelID] = FabricFlowStat{
|
||||||
Depth: stat.Depth,
|
Depth: stat.Depth,
|
||||||
TrafficClass: stat.TrafficClass,
|
TrafficClass: stat.TrafficClass,
|
||||||
@@ -739,6 +755,10 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
LastFailedRouteID: stat.LastFailedRouteID,
|
LastFailedRouteID: stat.LastFailedRouteID,
|
||||||
LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion,
|
LastFailedRoutePolicyVersion: stat.LastFailedRoutePolicyVersion,
|
||||||
LastFailedRouteGeneration: stat.LastFailedRouteGeneration,
|
LastFailedRouteGeneration: stat.LastFailedRouteGeneration,
|
||||||
|
LastRecoveredFromRouteID: stat.LastRecoveredFromRouteID,
|
||||||
|
LastRecoveredNextHop: stat.LastRecoveredNextHop,
|
||||||
|
LastRouteSwitchAt: stat.LastRouteSwitchAt,
|
||||||
|
RouteSwitchCount: stat.RouteSwitchCount,
|
||||||
LastError: stat.LastError,
|
LastError: stat.LastError,
|
||||||
ConsecutiveFailures: stat.ConsecutiveFailures,
|
ConsecutiveFailures: stat.ConsecutiveFailures,
|
||||||
StallCount: stat.StallCount,
|
StallCount: stat.StallCount,
|
||||||
@@ -765,6 +785,10 @@ func (s *FabricFlowScheduler) Snapshot() FabricFlowSchedulerSnapshot {
|
|||||||
snapshot.QualityWindowFailureCount += qualityStats.FailureCount
|
snapshot.QualityWindowFailureCount += qualityStats.FailureCount
|
||||||
snapshot.QualityWindowSlowCount += qualityStats.SlowCount
|
snapshot.QualityWindowSlowCount += qualityStats.SlowCount
|
||||||
snapshot.QualityWindowDropCount += qualityStats.DropCount
|
snapshot.QualityWindowDropCount += qualityStats.DropCount
|
||||||
|
snapshot.RouteSwitchCount += queue.RouteSwitchCount
|
||||||
|
if queue.LastRecoveredFromRouteID != "" {
|
||||||
|
snapshot.RouteRecoveredChannelCount++
|
||||||
|
}
|
||||||
if queue.Depth >= s.queueCapacity || qualityStats.DropCount > 0 {
|
if queue.Depth >= s.queueCapacity || qualityStats.DropCount > 0 {
|
||||||
snapshot.BackpressureActive = true
|
snapshot.BackpressureActive = true
|
||||||
}
|
}
|
||||||
@@ -1065,6 +1089,14 @@ func (s *FabricFlowScheduler) RecordRouteSuccessWithProvenance(channelID string,
|
|||||||
s.mu.Lock()
|
s.mu.Lock()
|
||||||
defer s.mu.Unlock()
|
defer s.mu.Unlock()
|
||||||
queue := s.ensureQueueLocked(channelID)
|
queue := s.ensureQueueLocked(channelID)
|
||||||
|
failedRouteID := strings.TrimSpace(queue.LastFailedRouteID)
|
||||||
|
failedNextHop := strings.TrimSpace(queue.LastNextHop)
|
||||||
|
if failedRouteID != "" && strings.TrimSpace(routeID) != "" && failedRouteID != strings.TrimSpace(routeID) {
|
||||||
|
queue.LastRecoveredFromRouteID = failedRouteID
|
||||||
|
queue.LastRecoveredNextHop = failedNextHop
|
||||||
|
queue.LastRouteSwitchAt = time.Now().UTC()
|
||||||
|
queue.RouteSwitchCount++
|
||||||
|
}
|
||||||
queue.LastRouteID = routeID
|
queue.LastRouteID = routeID
|
||||||
queue.RoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion)
|
queue.RoutePolicyVersion = strings.TrimSpace(provenance.PolicyVersion)
|
||||||
queue.RouteGeneration = strings.TrimSpace(provenance.Generation)
|
queue.RouteGeneration = strings.TrimSpace(provenance.Generation)
|
||||||
|
|||||||
@@ -1530,6 +1530,14 @@ func TestFabricClientPacketIngressIsolatesRouteFailoverPerLogicalChannel(t *test
|
|||||||
if statA.LastRouteID != "route-alternate" || statA.LastFailedRouteID != "" || statA.ConsecutiveFailures != 0 {
|
if statA.LastRouteID != "route-alternate" || statA.LastFailedRouteID != "" || statA.ConsecutiveFailures != 0 {
|
||||||
t.Fatalf("channel A stat = %+v, want recovered on alternate route", statA)
|
t.Fatalf("channel A stat = %+v, want recovered on alternate route", statA)
|
||||||
}
|
}
|
||||||
|
if statA.LastRecoveredFromRouteID != "route-primary" ||
|
||||||
|
statA.LastRecoveredNextHop != "relay-primary" ||
|
||||||
|
statA.RouteSwitchCount != 1 ||
|
||||||
|
statA.LastRouteSwitchAt == "" ||
|
||||||
|
snapshot.FlowScheduler.RouteRecoveredChannelCount != 1 ||
|
||||||
|
snapshot.FlowScheduler.RouteSwitchCount != 1 {
|
||||||
|
t.Fatalf("route recovery telemetry = stat:%+v scheduler:%+v", statA, snapshot.FlowScheduler)
|
||||||
|
}
|
||||||
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
|
if statB.LastRouteID != "route-primary" || statB.LastFailedRouteID != "" || statB.ConsecutiveFailures != 0 {
|
||||||
t.Fatalf("channel B stat = %+v, want primary route memory preserved", statB)
|
t.Fatalf("channel B stat = %+v, want primary route memory preserved", statB)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -440,6 +440,9 @@ bulk and interactive/control channel counts, making mixed browser/RDP load
|
|||||||
diagnosis explicit when bulk windows are reduced to protect interactive traffic.
|
diagnosis explicit when bulk windows are reduced to protect interactive traffic.
|
||||||
`mesh-live-smoke` now exercises that mixed-load scheduler path and reports bulk
|
`mesh-live-smoke` now exercises that mixed-load scheduler path and reports bulk
|
||||||
pressure activation plus bulk/interactive window recommendations.
|
pressure activation plus bulk/interactive window recommendations.
|
||||||
|
Flow-scheduler route recovery telemetry now records per-channel route switches,
|
||||||
|
the failed route and next hop a channel recovered from, the time of the last
|
||||||
|
switch, and aggregate recovered-channel and route-switch counts, making alternate-route recovery measurable during load tests.
|
||||||
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
||||||
penalty instead of a hard recent failure, enabling load spreading without
|
penalty instead of a hard recent failure, enabling load spreading without
|
||||||
marking the carrier unhealthy.
|
marking the carrier unhealthy.
|
||||||
|
|||||||
Reference in New Issue
Block a user