Summarize QUIC pressure in heartbeat

This commit is contained in:
2026-05-16 13:00:24 +03:00
parent 66597a5b15
commit ebdae833fc
3 changed files with 92 additions and 0 deletions
@@ -49,6 +49,7 @@ const (
maxVPNFabricEndpointHealthReportEntries = 32 maxVPNFabricEndpointHealthReportEntries = 32
maxVPNFabricEndpointObservationEntries = 256 maxVPNFabricEndpointObservationEntries = 256
maxVPNFabricCapacityCounterEntries = 32 maxVPNFabricCapacityCounterEntries = 32
maxVPNFabricQUICPressureReportEntries = 16
vpnFabricEndpointObservationMaxAge = 30 * time.Minute vpnFabricEndpointObservationMaxAge = 30 * time.Minute
meshRendezvousLeaseReportSchema = "c17z18.mesh_rendezvous_lease_report.v1" meshRendezvousLeaseReportSchema = "c17z18.mesh_rendezvous_lease_report.v1"
meshRendezvousLeaseTelemetryCapability = "mesh_rendezvous_lease_telemetry" meshRendezvousLeaseTelemetryCapability = "mesh_rendezvous_lease_telemetry"
@@ -442,6 +443,16 @@ type vpnFabricCapacityCounter struct {
LastSeenUnixSec int64 `json:"last_seen_unix_sec"` LastSeenUnixSec int64 `json:"last_seen_unix_sec"`
} }
// vpnFabricQUICPressureEntry is one row of the "quic_capacity_pressure"
// summary attached to the heartbeat VPN fabric report. Each field is copied
// verbatim from the same-named field of a QUIC connection snapshot by
// vpnFabricQUICPressureReport.
type vpnFabricQUICPressureEntry struct {
// PeerID and Endpoint identify the connection; both are omitted from the
// JSON when empty.
PeerID string `json:"peer_id,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
// ActiveStreams, MaxStreams and CapacityPressurePercent carry no omitempty
// tag, so they are always serialized, even when zero.
ActiveStreams int `json:"active_streams"`
MaxStreams int `json:"max_streams"`
// NOTE(review): presumably a 0-100 ratio of active to max streams; the
// value is computed by the transport snapshot, not here — confirm.
CapacityPressurePercent int `json:"capacity_pressure_percent"`
// Saturated is omitted from the JSON when false.
Saturated bool `json:"saturated,omitempty"`
// NOTE(review): name suggests Unix seconds of last use — confirm against
// the snapshot producer. Omitted from the JSON when zero.
LastUsedUnixSec int64 `json:"last_used_unix_sec,omitempty"`
}
type vpnFabricEndpointObservationStore struct { type vpnFabricEndpointObservationStore struct {
reporterNodeID string reporterNodeID string
mu sync.Mutex mu sync.Mutex
@@ -855,6 +866,43 @@ func (s *vpnFabricSessionDialStats) Report(observedAt time.Time) map[string]any
return report return report
} }
// vpnFabricQUICPressureReport condenses a QUIC transport snapshot into a
// ranked list of the connections that currently show load.
//
// A connection is skipped when it reports no capacity pressure, no active
// streams, and is not saturated. The remaining entries are ordered by
// descending CapacityPressurePercent, then descending ActiveStreams, then
// ascending PeerID and Endpoint; connections that tie on all four keys keep
// their snapshot order. A positive maxEntries cuts the list to at most that
// many entries, while zero or a negative value leaves it uncut. The result is
// never nil, so it encodes as a JSON array rather than null.
func vpnFabricQUICPressureReport(snapshot mesh.QUICFabricTransportSnapshot, maxEntries int) []vpnFabricQUICPressureEntry {
	total := len(snapshot.Connections)
	if total == 0 {
		return []vpnFabricQUICPressureEntry{}
	}

	ranked := make([]vpnFabricQUICPressureEntry, 0, total)
	for _, c := range snapshot.Connections {
		// Keep only connections that show some sign of load.
		loaded := c.CapacityPressurePercent > 0 || c.ActiveStreams > 0 || c.Saturated
		if !loaded {
			continue
		}
		var entry vpnFabricQUICPressureEntry
		entry.PeerID = c.PeerID
		entry.Endpoint = c.Endpoint
		entry.ActiveStreams = c.ActiveStreams
		entry.MaxStreams = c.MaxStreams
		entry.CapacityPressurePercent = c.CapacityPressurePercent
		entry.Saturated = c.Saturated
		entry.LastUsedUnixSec = c.LastUsedUnixSec
		ranked = append(ranked, entry)
	}

	// Busiest first; identifiers act as tie-breakers so the output is stable
	// from one report to the next.
	sort.SliceStable(ranked, func(i, j int) bool {
		a, b := &ranked[i], &ranked[j]
		switch {
		case a.CapacityPressurePercent != b.CapacityPressurePercent:
			return a.CapacityPressurePercent > b.CapacityPressurePercent
		case a.ActiveStreams != b.ActiveStreams:
			return a.ActiveStreams > b.ActiveStreams
		case a.PeerID != b.PeerID:
			return a.PeerID < b.PeerID
		default:
			return a.Endpoint < b.Endpoint
		}
	})

	// A non-positive maxEntries means "no cap".
	limit := len(ranked)
	if maxEntries > 0 && maxEntries < limit {
		limit = maxEntries
	}
	return ranked[:limit]
}
func newFabricServiceChannelAccessStats() *fabricServiceChannelAccessStats { func newFabricServiceChannelAccessStats() *fabricServiceChannelAccessStats {
return &fabricServiceChannelAccessStats{} return &fabricServiceChannelAccessStats{}
} }
@@ -3170,6 +3218,7 @@ func heartbeatPayload(cfg config.Config, identity state.Identity, meshState *syn
if meshState != nil && meshState.VPNFabricQUICTransport != nil { if meshState != nil && meshState.VPNFabricQUICTransport != nil {
quicSnapshot := meshState.VPNFabricQUICTransport.Snapshot() quicSnapshot := meshState.VPNFabricQUICTransport.Snapshot()
report["quic_sessions"] = quicSnapshot report["quic_sessions"] = quicSnapshot
report["quic_capacity_pressure"] = vpnFabricQUICPressureReport(quicSnapshot, maxVPNFabricQUICPressureReportEntries)
report["quic_max_streams_per_conn"] = meshState.VPNFabricQUICTransport.MaxStreamsPerConn report["quic_max_streams_per_conn"] = meshState.VPNFabricQUICTransport.MaxStreamsPerConn
report["quic_idle_ttl_seconds"] = int(meshState.VPNFabricQUICTransport.IdleTTL.Seconds()) report["quic_idle_ttl_seconds"] = int(meshState.VPNFabricQUICTransport.IdleTTL.Seconds())
} }
@@ -765,6 +765,8 @@ func TestHeartbeatPayloadIncludesMeshEndpointReport(t *testing.T) {
t.Fatalf("vpn fabric session report missing: %+v", payload.Metadata) t.Fatalf("vpn fabric session report missing: %+v", payload.Metadata)
} else if report["quic_sessions"] == nil || report["quic_max_streams_per_conn"] != 24 { } else if report["quic_sessions"] == nil || report["quic_max_streams_per_conn"] != 24 {
t.Fatalf("vpn fabric quic session report missing: %+v", report) t.Fatalf("vpn fabric quic session report missing: %+v", report)
} else if report["quic_capacity_pressure"] == nil {
t.Fatalf("vpn fabric quic pressure report missing: %+v", report)
} }
if payload.Capabilities["vpn_fabric_session_stream_shards"] != true { if payload.Capabilities["vpn_fabric_session_stream_shards"] != true {
t.Fatalf("vpn fabric stream shard capability missing: %+v", payload.Capabilities) t.Fatalf("vpn fabric stream shard capability missing: %+v", payload.Capabilities)
@@ -1185,6 +1187,44 @@ func TestMergeEndpointCapacityPressureKeepsStrongerSignal(t *testing.T) {
} }
} }
func TestVPNFabricQUICPressureReportRanksBusyConnections(t *testing.T) {
report := vpnFabricQUICPressureReport(mesh.QUICFabricTransportSnapshot{
Connections: []mesh.QUICFabricConnSnapshot{
{
PeerID: "node-c",
Endpoint: "node-c.example.test:19443",
ActiveStreams: 1,
MaxStreams: 10,
CapacityPressurePercent: 10,
},
{
PeerID: "node-b",
Endpoint: "node-b.example.test:19443",
ActiveStreams: 9,
MaxStreams: 10,
CapacityPressurePercent: 90,
Saturated: true,
LastUsedUnixSec: 100,
},
{
PeerID: "idle",
Endpoint: "idle.example.test:19443",
ActiveStreams: 0,
MaxStreams: 10,
},
},
}, 1)
if len(report) != 1 {
t.Fatalf("report count = %d, want 1: %+v", len(report), report)
}
if report[0].PeerID != "node-b" ||
report[0].CapacityPressurePercent != 90 ||
!report[0].Saturated ||
report[0].LastUsedUnixSec != 100 {
t.Fatalf("unexpected pressure report: %+v", report[0])
}
}
func TestMergedEndpointCandidateObservationsKeepsNewest(t *testing.T) { func TestMergedEndpointCandidateObservationsKeepsNewest(t *testing.T) {
now := time.Now().UTC() now := time.Now().UTC()
merged := mergedEndpointCandidateObservations( merged := mergedEndpointCandidateObservations(
@@ -432,6 +432,9 @@ heartbeats and diagnostics stay stable across reports.
When local live QUIC pressure and recent capacity-limit counters overlap, the
ranking input keeps the stronger pressure signal rather than allowing a weak
fresh sample to hide a saturated endpoint.
Heartbeat VPN fabric reports now include a bounded `quic_capacity_pressure`
summary that lists cached QUIC connections from busiest to least busy, so
overload can be diagnosed without digging through the full carrier snapshot.
Endpoint ranking treats `capacity_limited` observations as a soft pressure
penalty instead of a hard recent failure, enabling load spreading without
marking the carrier unhealthy.