Files
rdp-proxy/agents/rap-node-agent/internal/mesh/peer_connection_manager.go
T
2026-05-18 21:33:39 +03:00

517 lines
19 KiB
Go

package mesh
import (
"context"
"fmt"
"net/http"
"strings"
"sync"
"time"
)
const (
PeerConnectionProbeReachable = "reachable"
PeerConnectionProbeUnreachable = "unreachable"
PeerConnectionProbeDeferred = "deferred"
PeerConnectionProbeSkipped = "skipped"
)
const (
DefaultPeerConnectionProbeTimeout = 2 * time.Second
)
type PeerConnectionManagerConfig struct {
Local PeerIdentity
PeerCache *PeerCache
Tracker *PeerConnectionTracker
RendezvousLeases []PeerRendezvousLease
HTTPClient *http.Client
QUICTransport *QUICFabricTransport
PreferredRegion string
ProbeTimeout time.Duration
Now func() time.Time
}
type PeerConnectionManager struct {
local PeerIdentity
peerCache *PeerCache
tracker *PeerConnectionTracker
rendezvousLeases []PeerRendezvousLease
httpClient *http.Client
quicTransport *QUICFabricTransport
preferredRegion string
probeTimeout time.Duration
now func() time.Time
mu sync.Mutex
lastCycle PeerConnectionManagerCycle
}
type PeerConnectionManagerCycle struct {
Mode string `json:"mode"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
ProbeTimeoutMs int `json:"probe_timeout_ms"`
IntentCount int `json:"intent_count"`
Attempted int `json:"attempted"`
Succeeded int `json:"succeeded"`
Failed int `json:"failed"`
Deferred int `json:"deferred"`
Skipped int `json:"skipped"`
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RelayControlCount int `json:"relay_control_count"`
RecoveryPlan PeerRecoveryPlan `json:"recovery_plan"`
IntentPlan PeerConnectionIntentPlan `json:"intent_plan"`
Results []PeerConnectionProbeResult `json:"results,omitempty"`
}
type PeerConnectionManagerSnapshot struct {
LastCycle PeerConnectionManagerCycle `json:"last_cycle"`
}
type PeerConnectionProbeResult struct {
NodeID string `json:"node_id"`
LinkStatus string `json:"link_status"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
SelectedCandidateID string `json:"selected_candidate_id,omitempty"`
SelectedEndpoint string `json:"selected_endpoint,omitempty"`
ConnectionState PeerConnectionState `json:"connection_state"`
TransportMode string `json:"transport_mode"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
LatencyMs int `json:"latency_ms,omitempty"`
FailureReason string `json:"failure_reason,omitempty"`
CandidateResults []PeerConnectionCandidateProbeResult `json:"candidate_results,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
type PeerConnectionCandidateProbeResult struct {
CandidateID string `json:"candidate_id,omitempty"`
Endpoint string `json:"endpoint"`
Transport string `json:"transport,omitempty"`
LinkStatus string `json:"link_status"`
LatencyMs int `json:"latency_ms,omitempty"`
FailureReason string `json:"failure_reason,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
type peerConnectionProbeTarget struct {
CandidateID string
Endpoint string
Transport string
PeerCertSHA256 string
}
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
probeTimeout := cfg.ProbeTimeout
if probeTimeout <= 0 {
probeTimeout = DefaultPeerConnectionProbeTimeout
}
httpClient := cfg.HTTPClient
if httpClient == nil {
httpClient = &http.Client{
Transport: &http.Transport{
MaxIdleConns: 64,
MaxIdleConnsPerHost: 8,
IdleConnTimeout: 90 * time.Second,
},
Timeout: probeTimeout + time.Second,
}
}
now := cfg.Now
if now == nil {
now = func() time.Time { return time.Now().UTC() }
}
return &PeerConnectionManager{
local: cfg.Local,
peerCache: cfg.PeerCache,
tracker: cfg.Tracker,
rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...),
httpClient: httpClient,
quicTransport: cfg.QUICTransport,
preferredRegion: strings.TrimSpace(cfg.PreferredRegion),
probeTimeout: probeTimeout,
now: now,
}
}
func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionManagerCycle {
peerCache, rendezvousLeases := m.peerConfigSnapshot()
if m == nil || peerCache == nil || m.tracker == nil {
return PeerConnectionManagerCycle{}
}
startedAt := normalizedNow(m.now())
peerSnapshot := peerCache.Snapshot()
recoveryPlan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: peerSnapshot,
Connections: m.tracker.Snapshot(),
TargetReadyPeers: DefaultStablePeerTarget,
MaxProbeCandidates: DefaultRecoveryProbeLimit,
Now: startedAt,
})
intentPlan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: peerSnapshot,
RecoveryPlan: recoveryPlan,
RendezvousLeases: rendezvousLeases,
PreferredRegion: m.preferredRegion,
Now: startedAt,
})
entriesByNode := map[string]PeerCacheEntry{}
for _, entry := range peerSnapshot.Entries {
entriesByNode[entry.NodeID] = entry
}
cycle := PeerConnectionManagerCycle{
Mode: recoveryPlan.Mode,
StartedAt: startedAt,
ProbeTimeoutMs: int(m.probeTimeout.Milliseconds()),
IntentCount: intentPlan.IntentCount,
RendezvousRequiredCount: intentPlan.RendezvousRequiredCount,
RendezvousResolvedCount: intentPlan.RendezvousResolvedCount,
RelayControlCount: intentPlan.RelayControlCount,
RecoveryPlan: recoveryPlan,
IntentPlan: intentPlan,
Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
}
for _, intent := range intentPlan.Intents {
result := m.probeIntent(ctx, intent, entriesByNode[intent.NodeID])
cycle.Results = append(cycle.Results, result)
switch result.LinkStatus {
case PeerConnectionProbeReachable:
cycle.Attempted++
cycle.Succeeded++
case PeerConnectionProbeUnreachable:
cycle.Attempted++
cycle.Failed++
case PeerConnectionProbeDeferred:
cycle.Deferred++
case PeerConnectionProbeSkipped:
cycle.Skipped++
}
}
cycle.CompletedAt = normalizedNow(m.now())
m.mu.Lock()
m.lastCycle = cycle
m.mu.Unlock()
return cycle
}
func (m *PeerConnectionManager) Snapshot() PeerConnectionManagerSnapshot {
if m == nil {
return PeerConnectionManagerSnapshot{}
}
m.mu.Lock()
defer m.mu.Unlock()
return PeerConnectionManagerSnapshot{LastCycle: m.lastCycle}
}
func (m *PeerConnectionManager) UpdatePeerConfig(peerCache *PeerCache, rendezvousLeases []PeerRendezvousLease) {
if m == nil {
return
}
m.mu.Lock()
defer m.mu.Unlock()
m.peerCache = peerCache
m.rendezvousLeases = append([]PeerRendezvousLease{}, rendezvousLeases...)
}
func (m *PeerConnectionManager) UpdateQUICTransport(transport *QUICFabricTransport) {
if m == nil {
return
}
m.mu.Lock()
defer m.mu.Unlock()
m.quicTransport = transport
}
func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvousLease) {
if m == nil {
return nil, nil
}
m.mu.Lock()
defer m.mu.Unlock()
return m.peerCache, append([]PeerRendezvousLease{}, m.rendezvousLeases...)
}
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent, cacheEntry PeerCacheEntry) PeerConnectionProbeResult {
startedAt := normalizedNow(m.now())
result := PeerConnectionProbeResult{
NodeID: intent.NodeID,
Action: intent.Action,
Reason: intent.Reason,
Endpoint: intent.Endpoint,
TransportMode: intent.TransportMode,
RequiresRendezvous: intent.RequiresRendezvous,
RendezvousResolved: intent.RendezvousResolved,
DirectCandidate: intent.DirectCandidate,
RelayCandidate: intent.RelayCandidate,
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
StartedAt: startedAt,
}
peer := PeerCacheEntry{
NodeID: intent.NodeID,
Endpoint: intent.Endpoint,
Warm: true,
WarmReason: intent.Reason,
RecoverySeed: intent.RecoverySeed,
BestCandidateID: intent.BestCandidateID,
BestTransport: intent.Transport,
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
RelayControl: intent.RelayCandidate,
BestPeerCertSHA256: firstNonEmpty(intent.BestPeerCertSHA256, cacheEntry.BestPeerCertSHA256),
}
if intent.RequiresRendezvous {
result.LinkStatus = PeerConnectionProbeDeferred
result.FailureReason = "rendezvous_required"
result.ConnectionState = m.tracker.RecordDeferred(peer, result.FailureReason, startedAt)
result.CompletedAt = normalizedNow(m.now())
return result
}
if strings.TrimSpace(intent.Endpoint) == "" || (!intent.DirectCandidate && !intent.RelayCandidate) {
result.LinkStatus = PeerConnectionProbeDeferred
result.FailureReason = "direct_candidate_unavailable"
if intent.RelayCandidate {
result.FailureReason = "relay_candidate_unavailable"
}
result.ConnectionState = m.tracker.RecordDeferred(peer, result.FailureReason, startedAt)
result.CompletedAt = normalizedNow(m.now())
return result
}
if !m.tracker.ShouldProbe(intent.NodeID, startedAt) {
result.LinkStatus = PeerConnectionProbeSkipped
result.FailureReason = "backoff_active"
result.ConnectionState = m.connectionState(intent.NodeID)
result.CompletedAt = normalizedNow(m.now())
return result
}
target := PeerIdentity{
ClusterID: m.local.ClusterID,
NodeID: intent.NodeID,
}
target.NodeID = peerConnectionProbeTargetNodeID(intent, m.local.NodeID)
targets := []peerConnectionProbeTarget{{
CandidateID: intent.BestCandidateID,
Endpoint: intent.Endpoint,
Transport: intent.Transport,
PeerCertSHA256: intent.BestPeerCertSHA256,
}}
if intent.DirectCandidate || peerConnectionShouldProbeDirectUpgrade(intent, cacheEntry) {
targets = peerConnectionProbeTargets(intent, cacheEntry)
}
var lastFailure string
for _, probeTarget := range targets {
probePeer := peer
probePeer.Endpoint = strings.TrimRight(strings.TrimSpace(probeTarget.Endpoint), "/")
probePeer.BestCandidateID = strings.TrimSpace(probeTarget.CandidateID)
probePeer.BestCandidateAddr = probePeer.Endpoint
probePeer.BestTransport = strings.TrimSpace(probeTarget.Transport)
probePeer.BestPeerCertSHA256 = firstNonEmpty(probeTarget.PeerCertSHA256, probePeer.BestPeerCertSHA256)
if probePeer.Endpoint == "" {
continue
}
candidateStartedAt := normalizedNow(m.now())
m.tracker.BeginProbe(probePeer, candidateStartedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
err := m.probePeerTarget(probeCtx, probePeer, target)
cancel()
completedAt := normalizedNow(m.now())
candidateResult := PeerConnectionCandidateProbeResult{
CandidateID: probePeer.BestCandidateID,
Endpoint: probePeer.Endpoint,
Transport: probePeer.BestTransport,
StartedAt: candidateStartedAt,
CompletedAt: completedAt,
}
if err != nil {
lastFailure = err.Error()
candidateResult.LinkStatus = PeerConnectionProbeUnreachable
candidateResult.FailureReason = lastFailure
result.CandidateResults = append(result.CandidateResults, candidateResult)
continue
}
latency := int(completedAt.Sub(candidateStartedAt).Milliseconds())
if latency < 0 {
latency = 0
}
candidateResult.LinkStatus = PeerConnectionProbeReachable
candidateResult.LatencyMs = latency
result.CandidateResults = append(result.CandidateResults, candidateResult)
result.LinkStatus = PeerConnectionProbeReachable
result.Endpoint = probePeer.Endpoint
result.SelectedCandidateID = probePeer.BestCandidateID
result.SelectedEndpoint = probePeer.Endpoint
result.LatencyMs = latency
if probeTargetUsesDirectQUIC(probeTarget) {
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
} else if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(probePeer, latency, completedAt)
} else {
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
}
result.CompletedAt = completedAt
return result
}
completedAt := normalizedNow(m.now())
if lastFailure == "" {
lastFailure = "no_probe_endpoint_available"
}
result.LinkStatus = PeerConnectionProbeUnreachable
result.FailureReason = lastFailure
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, lastFailure, completedAt)
result.CompletedAt = completedAt
return result
}
func peerConnectionProbeTargetNodeID(intent PeerConnectionIntent, localNodeID string) string {
if intent.RelayCandidate && strings.TrimSpace(intent.RelayNodeID) != "" && strings.TrimSpace(intent.RelayNodeID) != strings.TrimSpace(localNodeID) {
return intent.RelayNodeID
}
return intent.NodeID
}
func (m *PeerConnectionManager) probePeerTarget(ctx context.Context, probePeer PeerCacheEntry, target PeerIdentity) error {
endpoint := strings.TrimRight(strings.TrimSpace(probePeer.Endpoint), "/")
transport := strings.TrimSpace(probePeer.BestTransport)
if hasLegacyEndpointScheme(endpoint) {
return fmt.Errorf("non_quic_probe_rejected")
}
if peerConnectionTargetIsQUIC(transport, endpoint) {
carrier, selectedTarget, err := FabricTransportForTarget(FabricTransportTarget{
EndpointID: probePeer.BestCandidateID,
PeerID: target.NodeID,
Endpoint: endpoint,
Transport: transport,
Timeout: m.probeTimeout,
PeerCertSHA256: strings.TrimSpace(probePeer.BestPeerCertSHA256),
}, m.quicTransport)
if err != nil {
return err
}
session, err := carrier.Connect(ctx, selectedTarget)
if err != nil {
return err
}
return session.Close()
}
return fmt.Errorf("non_quic_probe_rejected")
}
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
seen := map[string]struct{}{}
out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1)
fallbackPeerCertSHA256 := firstNonEmpty(
strings.TrimSpace(cacheEntry.BestPeerCertSHA256),
strings.TrimSpace(intent.BestPeerCertSHA256),
)
add := func(candidateID, endpoint, transport, peerCertSHA256 string) {
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
if endpoint == "" {
return
}
if endpointHasUnspecifiedHost(endpoint) {
return
}
key := candidateID + "|" + endpoint
if _, ok := seen[key]; ok {
return
}
seen[key] = struct{}{}
if strings.TrimSpace(peerCertSHA256) == "" {
peerCertSHA256 = fallbackPeerCertSHA256
}
out = append(out, peerConnectionProbeTarget{
CandidateID: strings.TrimSpace(candidateID),
Endpoint: endpoint,
Transport: strings.TrimSpace(transport),
PeerCertSHA256: strings.TrimSpace(peerCertSHA256),
})
}
for _, candidate := range cacheEntry.EndpointCandidates {
if !candidateUsableForDirectProbe(candidate) {
continue
}
add(candidate.EndpointID, candidate.Address, candidate.Transport, candidatePeerCertSHA256(candidate))
}
add(intent.BestCandidateID, intent.Endpoint, intent.Transport, cacheEntry.BestPeerCertSHA256)
return out
}
func peerConnectionShouldProbeDirectUpgrade(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) bool {
if intent.DirectCandidate {
return true
}
if strings.TrimSpace(intent.ConnectionState) != PeerConnectionRelayReady &&
!intent.RelayCandidate &&
strings.TrimSpace(intent.TransportMode) != PeerTransportModeRelayControl {
return false
}
for _, candidate := range cacheEntry.EndpointCandidates {
if candidateUsableForDirectProbe(candidate) {
return true
}
}
return false
}
func probeTargetUsesDirectQUIC(target peerConnectionProbeTarget) bool {
transport := strings.ToLower(strings.TrimSpace(target.Transport))
if strings.Contains(transport, "relay") || strings.Contains(transport, "reverse") || strings.Contains(transport, "outbound") {
return false
}
return peerConnectionTargetIsQUIC(target.Transport, target.Endpoint)
}
func peerConnectionTargetIsQUIC(transport string, endpoint string) bool {
return isQUICOnlyCandidateTransport(transport) || strings.HasPrefix(strings.ToLower(strings.TrimSpace(endpoint)), "quic://")
}
func candidateUsableForDirectProbe(candidate PeerEndpointCandidate) bool {
endpoint := strings.TrimSpace(candidate.Address)
if endpoint == "" || strings.HasPrefix(endpoint, "relay://") || strings.HasPrefix(endpoint, "outbound://") {
return false
}
if endpointHasUnspecifiedHost(endpoint) {
return false
}
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
if connectivity == "outbound_only" || connectivity == "relay_required" || reachability == "outbound_only" || reachability == "relay" {
return false
}
return transport == "" ||
strings.Contains(transport, "direct_quic") ||
transport == "quic" ||
transport == "lan_quic" ||
transport == "ice_quic" ||
strings.HasPrefix(endpoint, "quic://")
}
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
snapshot := m.tracker.Snapshot()
for _, entry := range snapshot.Entries {
if entry.NodeID == nodeID {
return entry
}
}
return PeerConnectionState{NodeID: nodeID, State: PeerConnectionDisconnected}
}
func (c Client) withHTTPClient(httpClient *http.Client) Client {
c.HTTPClient = httpClient
return c
}