Report capacity pressure by endpoint
This commit is contained in:
@@ -48,6 +48,7 @@ const (
|
|||||||
maxMeshRendezvousLeaseReportEntries = 20
|
maxMeshRendezvousLeaseReportEntries = 20
|
||||||
maxVPNFabricEndpointHealthReportEntries = 32
|
maxVPNFabricEndpointHealthReportEntries = 32
|
||||||
maxVPNFabricEndpointObservationEntries = 256
|
maxVPNFabricEndpointObservationEntries = 256
|
||||||
|
maxVPNFabricCapacityCounterEntries = 32
|
||||||
vpnFabricEndpointObservationMaxAge = 30 * time.Minute
|
vpnFabricEndpointObservationMaxAge = 30 * time.Minute
|
||||||
meshRendezvousLeaseReportSchema = "c17z18.mesh_rendezvous_lease_report.v1"
|
meshRendezvousLeaseReportSchema = "c17z18.mesh_rendezvous_lease_report.v1"
|
||||||
meshRendezvousLeaseTelemetryCapability = "mesh_rendezvous_lease_telemetry"
|
meshRendezvousLeaseTelemetryCapability = "mesh_rendezvous_lease_telemetry"
|
||||||
@@ -429,6 +430,16 @@ type vpnFabricSessionDialStats struct {
|
|||||||
LastSelectedUnixSec atomic.Int64
|
LastSelectedUnixSec atomic.Int64
|
||||||
LastCapacityUnixSec atomic.Int64
|
LastCapacityUnixSec atomic.Int64
|
||||||
LastFailureUnixSec atomic.Int64
|
LastFailureUnixSec atomic.Int64
|
||||||
|
capacityMu sync.Mutex
|
||||||
|
capacityByEndpoint map[string]vpnFabricCapacityCounter
|
||||||
|
}
|
||||||
|
|
||||||
|
type vpnFabricCapacityCounter struct {
|
||||||
|
EndpointID string `json:"endpoint_id,omitempty"`
|
||||||
|
Endpoint string `json:"endpoint,omitempty"`
|
||||||
|
Transport string `json:"transport,omitempty"`
|
||||||
|
Count int64 `json:"count"`
|
||||||
|
LastSeenUnixSec int64 `json:"last_seen_unix_sec"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type vpnFabricEndpointObservationStore struct {
|
type vpnFabricEndpointObservationStore struct {
|
||||||
@@ -438,7 +449,9 @@ type vpnFabricEndpointObservationStore struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func newVPNFabricSessionDialStats() *vpnFabricSessionDialStats {
|
func newVPNFabricSessionDialStats() *vpnFabricSessionDialStats {
|
||||||
return &vpnFabricSessionDialStats{}
|
return &vpnFabricSessionDialStats{
|
||||||
|
capacityByEndpoint: map[string]vpnFabricCapacityCounter{},
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func newVPNFabricEndpointObservationStore(reporterNodeID ...string) *vpnFabricEndpointObservationStore {
|
func newVPNFabricEndpointObservationStore(reporterNodeID ...string) *vpnFabricEndpointObservationStore {
|
||||||
@@ -637,13 +650,102 @@ func (s *vpnFabricSessionDialStats) ObserveCapacityLimited(target mesh.FabricTra
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
s.ObserveCandidateFailure("capacity_limited")
|
s.ObserveCandidateFailure("capacity_limited")
|
||||||
s.LastCapacityEndpoint.Store(strings.TrimSpace(target.Endpoint))
|
endpoint := strings.TrimSpace(target.Endpoint)
|
||||||
|
endpointID := strings.TrimSpace(target.EndpointID)
|
||||||
|
s.LastCapacityEndpoint.Store(endpoint)
|
||||||
transport := strings.TrimSpace(target.Transport)
|
transport := strings.TrimSpace(target.Transport)
|
||||||
if transport == "" {
|
if transport == "" {
|
||||||
transport = "legacy_peer_endpoint"
|
transport = "legacy_peer_endpoint"
|
||||||
}
|
}
|
||||||
s.LastCapacityTransport.Store(transport)
|
s.LastCapacityTransport.Store(transport)
|
||||||
s.LastCapacityUnixSec.Store(time.Now().UTC().Unix())
|
observedAt := time.Now().UTC().Unix()
|
||||||
|
s.LastCapacityUnixSec.Store(observedAt)
|
||||||
|
key := endpointID
|
||||||
|
if key == "" {
|
||||||
|
key = transport + "|" + endpoint
|
||||||
|
}
|
||||||
|
if key == "|" {
|
||||||
|
key = "unknown"
|
||||||
|
}
|
||||||
|
s.capacityMu.Lock()
|
||||||
|
if s.capacityByEndpoint == nil {
|
||||||
|
s.capacityByEndpoint = map[string]vpnFabricCapacityCounter{}
|
||||||
|
}
|
||||||
|
counter := s.capacityByEndpoint[key]
|
||||||
|
counter.EndpointID = endpointID
|
||||||
|
counter.Endpoint = endpoint
|
||||||
|
counter.Transport = transport
|
||||||
|
counter.Count++
|
||||||
|
counter.LastSeenUnixSec = observedAt
|
||||||
|
s.capacityByEndpoint[key] = counter
|
||||||
|
s.pruneCapacityCountersLocked(maxVPNFabricCapacityCounterEntries)
|
||||||
|
s.capacityMu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *vpnFabricSessionDialStats) pruneCapacityCountersLocked(maxEntries int) {
|
||||||
|
if s == nil || maxEntries <= 0 || len(s.capacityByEndpoint) <= maxEntries {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
values := make([]vpnFabricCapacityCounter, 0, len(s.capacityByEndpoint))
|
||||||
|
for _, counter := range s.capacityByEndpoint {
|
||||||
|
values = append(values, counter)
|
||||||
|
}
|
||||||
|
sort.SliceStable(values, func(i, j int) bool {
|
||||||
|
if values[i].LastSeenUnixSec != values[j].LastSeenUnixSec {
|
||||||
|
return values[i].LastSeenUnixSec > values[j].LastSeenUnixSec
|
||||||
|
}
|
||||||
|
if values[i].Count != values[j].Count {
|
||||||
|
return values[i].Count > values[j].Count
|
||||||
|
}
|
||||||
|
return values[i].Endpoint < values[j].Endpoint
|
||||||
|
})
|
||||||
|
keep := make(map[string]struct{}, maxEntries)
|
||||||
|
for _, counter := range values[:maxEntries] {
|
||||||
|
key := strings.TrimSpace(counter.EndpointID)
|
||||||
|
if key == "" {
|
||||||
|
key = strings.TrimSpace(counter.Transport) + "|" + strings.TrimSpace(counter.Endpoint)
|
||||||
|
}
|
||||||
|
if key == "|" {
|
||||||
|
key = "unknown"
|
||||||
|
}
|
||||||
|
keep[key] = struct{}{}
|
||||||
|
}
|
||||||
|
for key := range s.capacityByEndpoint {
|
||||||
|
if _, ok := keep[key]; !ok {
|
||||||
|
delete(s.capacityByEndpoint, key)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (s *vpnFabricSessionDialStats) capacityCountersSnapshot(maxEntries int) []vpnFabricCapacityCounter {
|
||||||
|
if s == nil {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
s.capacityMu.Lock()
|
||||||
|
defer s.capacityMu.Unlock()
|
||||||
|
if len(s.capacityByEndpoint) == 0 {
|
||||||
|
return []vpnFabricCapacityCounter{}
|
||||||
|
}
|
||||||
|
values := make([]vpnFabricCapacityCounter, 0, len(s.capacityByEndpoint))
|
||||||
|
for _, counter := range s.capacityByEndpoint {
|
||||||
|
values = append(values, counter)
|
||||||
|
}
|
||||||
|
sort.SliceStable(values, func(i, j int) bool {
|
||||||
|
if values[i].LastSeenUnixSec != values[j].LastSeenUnixSec {
|
||||||
|
return values[i].LastSeenUnixSec > values[j].LastSeenUnixSec
|
||||||
|
}
|
||||||
|
if values[i].Count != values[j].Count {
|
||||||
|
return values[i].Count > values[j].Count
|
||||||
|
}
|
||||||
|
if values[i].EndpointID != values[j].EndpointID {
|
||||||
|
return values[i].EndpointID < values[j].EndpointID
|
||||||
|
}
|
||||||
|
return values[i].Endpoint < values[j].Endpoint
|
||||||
|
})
|
||||||
|
if maxEntries <= 0 || maxEntries > len(values) {
|
||||||
|
maxEntries = len(values)
|
||||||
|
}
|
||||||
|
return values[:maxEntries]
|
||||||
}
|
}
|
||||||
|
|
||||||
func (s *vpnFabricSessionDialStats) ObserveAllCandidatesFailed() {
|
func (s *vpnFabricSessionDialStats) ObserveAllCandidatesFailed() {
|
||||||
@@ -702,6 +804,7 @@ func (s *vpnFabricSessionDialStats) Report(observedAt time.Time) map[string]any
|
|||||||
"last_selected_unix_sec": s.LastSelectedUnixSec.Load(),
|
"last_selected_unix_sec": s.LastSelectedUnixSec.Load(),
|
||||||
"last_capacity_unix_sec": s.LastCapacityUnixSec.Load(),
|
"last_capacity_unix_sec": s.LastCapacityUnixSec.Load(),
|
||||||
"last_failure_unix_sec": s.LastFailureUnixSec.Load(),
|
"last_failure_unix_sec": s.LastFailureUnixSec.Load(),
|
||||||
|
"capacity_pressure": s.capacityCountersSnapshot(maxVPNFabricCapacityCounterEntries),
|
||||||
}
|
}
|
||||||
if value, ok := s.LastTransport.Load().(string); ok && value != "" {
|
if value, ok := s.LastTransport.Load().(string); ok && value != "" {
|
||||||
report["last_transport"] = value
|
report["last_transport"] = value
|
||||||
|
|||||||
@@ -777,8 +777,14 @@ func TestVPNFabricSessionDialStatsReport(t *testing.T) {
|
|||||||
stats := newVPNFabricSessionDialStats()
|
stats := newVPNFabricSessionDialStats()
|
||||||
stats.Attempts.Add(1)
|
stats.Attempts.Add(1)
|
||||||
stats.ObserveCapacityLimited(mesh.FabricTransportTarget{
|
stats.ObserveCapacityLimited(mesh.FabricTransportTarget{
|
||||||
Endpoint: "quic://node-b.example.test:19443",
|
EndpointID: "node-b-quic",
|
||||||
Transport: "direct_quic",
|
Endpoint: "quic://node-b.example.test:19443",
|
||||||
|
Transport: "direct_quic",
|
||||||
|
})
|
||||||
|
stats.ObserveCapacityLimited(mesh.FabricTransportTarget{
|
||||||
|
EndpointID: "node-b-quic",
|
||||||
|
Endpoint: "quic://node-b.example.test:19443",
|
||||||
|
Transport: "direct_quic",
|
||||||
})
|
})
|
||||||
stats.ObserveCandidateFailure("session_open_failed")
|
stats.ObserveCandidateFailure("session_open_failed")
|
||||||
stats.ObserveSelected(mesh.FabricTransportTarget{
|
stats.ObserveSelected(mesh.FabricTransportTarget{
|
||||||
@@ -790,8 +796,8 @@ func TestVPNFabricSessionDialStatsReport(t *testing.T) {
|
|||||||
report := stats.Report(time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC))
|
report := stats.Report(time.Date(2026, 5, 16, 12, 0, 0, 0, time.UTC))
|
||||||
if report["attempts"] != int64(1) ||
|
if report["attempts"] != int64(1) ||
|
||||||
report["selected"] != int64(1) ||
|
report["selected"] != int64(1) ||
|
||||||
report["candidate_failures"] != int64(2) ||
|
report["candidate_failures"] != int64(3) ||
|
||||||
report["capacity_limited"] != int64(1) ||
|
report["capacity_limited"] != int64(2) ||
|
||||||
report["session_open_failures"] != int64(1) ||
|
report["session_open_failures"] != int64(1) ||
|
||||||
report["quic_selected"] != int64(1) ||
|
report["quic_selected"] != int64(1) ||
|
||||||
report["pinned_cert_selected"] != int64(1) ||
|
report["pinned_cert_selected"] != int64(1) ||
|
||||||
@@ -802,6 +808,17 @@ func TestVPNFabricSessionDialStatsReport(t *testing.T) {
|
|||||||
report["last_failure_reason"] != "session_open_failed" {
|
report["last_failure_reason"] != "session_open_failed" {
|
||||||
t.Fatalf("unexpected dial stats report: %+v", report)
|
t.Fatalf("unexpected dial stats report: %+v", report)
|
||||||
}
|
}
|
||||||
|
capacityPressure, ok := report["capacity_pressure"].([]vpnFabricCapacityCounter)
|
||||||
|
if !ok || len(capacityPressure) != 1 {
|
||||||
|
t.Fatalf("capacity pressure counters missing: %+v", report["capacity_pressure"])
|
||||||
|
}
|
||||||
|
if capacityPressure[0].EndpointID != "node-b-quic" ||
|
||||||
|
capacityPressure[0].Endpoint != "quic://node-b.example.test:19443" ||
|
||||||
|
capacityPressure[0].Transport != "direct_quic" ||
|
||||||
|
capacityPressure[0].Count != 2 ||
|
||||||
|
capacityPressure[0].LastSeenUnixSec <= 0 {
|
||||||
|
t.Fatalf("unexpected capacity pressure counter: %+v", capacityPressure[0])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestFabricSessionOpenFailureReasonClassifiesCapacity(t *testing.T) {
|
func TestFabricSessionOpenFailureReasonClassifiesCapacity(t *testing.T) {
|
||||||
|
|||||||
@@ -374,6 +374,9 @@ but saturated carrier.
|
|||||||
VPN fabric dial telemetry records the last capacity-limited endpoint and
|
VPN fabric dial telemetry records the last capacity-limited endpoint and
|
||||||
transport, making stream saturation visible without poisoning endpoint health
|
transport, making stream saturation visible without poisoning endpoint health
|
||||||
observations.
|
observations.
|
||||||
|
The same dial telemetry now keeps bounded per-endpoint capacity-pressure
|
||||||
|
counters, so operators can see whether stream saturation is occasional or
|
||||||
|
concentrated on a specific QUIC carrier.
|
||||||
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
Endpoint ranking treats `capacity_limited` observations as a soft pressure
|
||||||
penalty instead of a hard recent failure, enabling load spreading without
|
penalty instead of a hard recent failure, enabling load spreading without
|
||||||
marking the carrier unhealthy.
|
marking the carrier unhealthy.
|
||||||
|
|||||||
Reference in New Issue
Block a user