Record project continuation changes

This commit is contained in:
2026-05-12 21:02:29 +03:00
parent 3059d1d7a3
commit 8f69d53193
339 changed files with 101111 additions and 1769 deletions
@@ -63,10 +63,12 @@ const (
ProductionChannelVPNPacket = "vpn_packet"
ProductionMessageVPNPacketBatch = "vpn.packet_batch"
FabricServiceClassVPNPackets = "vpn_packets"
FabricServiceClassRemoteWorkspace = "remote_workspace"
FabricServiceChannelBulk = "bulk"
FabricServiceChannelControl = "control"
FabricServiceChannelInteractive = "interactive"
FabricServiceChannelReliable = "reliable"
FabricServiceChannelDroppable = "droppable"
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionVPNPacketPayloadBytes = 256 * 1024
MaxProductionEnvelopeFutureSkew = time.Minute
@@ -59,9 +59,9 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
reasons := []string{"base"}
switch candidate.Transport {
case "direct_tcp_tls":
case "direct_tcp_tls", "direct_http", "direct_https":
score += 35
reasons = append(reasons, "transport:direct_tcp_tls")
reasons = append(reasons, "transport:direct")
case "wss":
score += 25
reasons = append(reasons, "transport:wss")
@@ -37,27 +37,28 @@ type PeerCacheSnapshot struct {
}
type PeerCacheEntry struct {
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
BestCandidateScore int `json:"best_candidate_score,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
BestCandidateScore int `json:"best_candidate_score,omitempty"`
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
}
type peerCacheBuildEntry struct {
@@ -117,6 +118,10 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
MaxVerificationAge: time.Hour,
})
if len(scored) > 0 {
entry.EndpointCandidates = make([]PeerEndpointCandidate, 0, len(scored))
for _, scoredCandidate := range scored {
entry.EndpointCandidates = append(entry.EndpointCandidates, scoredCandidate.Candidate)
}
entry.BestCandidateID = scored[0].Candidate.EndpointID
entry.BestCandidateAddr = scored[0].Candidate.Address
entry.BestTransport = scored[0].Candidate.Transport
@@ -66,24 +66,44 @@ type PeerConnectionManagerSnapshot struct {
}
type PeerConnectionProbeResult struct {
NodeID string `json:"node_id"`
LinkStatus string `json:"link_status"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
ConnectionState PeerConnectionState `json:"connection_state"`
TransportMode string `json:"transport_mode"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
LatencyMs int `json:"latency_ms,omitempty"`
FailureReason string `json:"failure_reason,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
NodeID string `json:"node_id"`
LinkStatus string `json:"link_status"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
SelectedCandidateID string `json:"selected_candidate_id,omitempty"`
SelectedEndpoint string `json:"selected_endpoint,omitempty"`
ConnectionState PeerConnectionState `json:"connection_state"`
TransportMode string `json:"transport_mode"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
LatencyMs int `json:"latency_ms,omitempty"`
FailureReason string `json:"failure_reason,omitempty"`
CandidateResults []PeerConnectionCandidateProbeResult `json:"candidate_results,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
type PeerConnectionCandidateProbeResult struct {
CandidateID string `json:"candidate_id,omitempty"`
Endpoint string `json:"endpoint"`
Transport string `json:"transport,omitempty"`
LinkStatus string `json:"link_status"`
LatencyMs int `json:"latency_ms,omitempty"`
FailureReason string `json:"failure_reason,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
type peerConnectionProbeTarget struct {
CandidateID string
Endpoint string
Transport string
}
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
@@ -137,6 +157,10 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
RendezvousLeases: rendezvousLeases,
Now: startedAt,
})
entriesByNode := map[string]PeerCacheEntry{}
for _, entry := range peerSnapshot.Entries {
entriesByNode[entry.NodeID] = entry
}
cycle := PeerConnectionManagerCycle{
Mode: recoveryPlan.Mode,
StartedAt: startedAt,
@@ -150,7 +174,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
}
for _, intent := range intentPlan.Intents {
result := m.probeIntent(ctx, intent)
result := m.probeIntent(ctx, intent, entriesByNode[intent.NodeID])
cycle.Results = append(cycle.Results, result)
switch result.LinkStatus {
case PeerConnectionProbeReachable:
@@ -200,7 +224,7 @@ func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvo
return m.peerCache, append([]PeerRendezvousLease{}, m.rendezvousLeases...)
}
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent) PeerConnectionProbeResult {
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent, cacheEntry PeerCacheEntry) PeerConnectionProbeResult {
startedAt := normalizedNow(m.now())
result := PeerConnectionProbeResult{
NodeID: intent.NodeID,
@@ -254,9 +278,6 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
result.CompletedAt = normalizedNow(m.now())
return result
}
m.tracker.BeginProbe(peer, startedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
defer cancel()
target := PeerIdentity{
ClusterID: m.local.ClusterID,
NodeID: intent.NodeID,
@@ -264,30 +285,118 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
if intent.RelayCandidate && intent.RelayNodeID != "" {
target.NodeID = intent.RelayNodeID
}
_, err := NewClient(strings.TrimRight(intent.Endpoint, "/")).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
completedAt := normalizedNow(m.now())
if err != nil {
result.LinkStatus = PeerConnectionProbeUnreachable
result.FailureReason = err.Error()
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, err.Error(), completedAt)
targets := []peerConnectionProbeTarget{{
CandidateID: intent.BestCandidateID,
Endpoint: intent.Endpoint,
Transport: intent.Transport,
}}
if intent.DirectCandidate {
targets = peerConnectionProbeTargets(intent, cacheEntry)
}
var lastFailure string
for _, probeTarget := range targets {
probePeer := peer
probePeer.Endpoint = strings.TrimRight(strings.TrimSpace(probeTarget.Endpoint), "/")
probePeer.BestCandidateID = strings.TrimSpace(probeTarget.CandidateID)
probePeer.BestCandidateAddr = probePeer.Endpoint
probePeer.BestTransport = strings.TrimSpace(probeTarget.Transport)
if probePeer.Endpoint == "" {
continue
}
candidateStartedAt := normalizedNow(m.now())
m.tracker.BeginProbe(probePeer, candidateStartedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
_, err := NewClient(probePeer.Endpoint).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
cancel()
completedAt := normalizedNow(m.now())
candidateResult := PeerConnectionCandidateProbeResult{
CandidateID: probePeer.BestCandidateID,
Endpoint: probePeer.Endpoint,
Transport: probePeer.BestTransport,
StartedAt: candidateStartedAt,
CompletedAt: completedAt,
}
if err != nil {
lastFailure = err.Error()
candidateResult.LinkStatus = PeerConnectionProbeUnreachable
candidateResult.FailureReason = lastFailure
result.CandidateResults = append(result.CandidateResults, candidateResult)
continue
}
latency := int(completedAt.Sub(candidateStartedAt).Milliseconds())
if latency < 0 {
latency = 0
}
candidateResult.LinkStatus = PeerConnectionProbeReachable
candidateResult.LatencyMs = latency
result.CandidateResults = append(result.CandidateResults, candidateResult)
result.LinkStatus = PeerConnectionProbeReachable
result.Endpoint = probePeer.Endpoint
result.SelectedCandidateID = probePeer.BestCandidateID
result.SelectedEndpoint = probePeer.Endpoint
result.LatencyMs = latency
if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(probePeer, latency, completedAt)
} else {
result.ConnectionState = m.tracker.RecordSuccessForPeer(probePeer, latency, completedAt)
}
result.CompletedAt = completedAt
return result
}
latency := int(completedAt.Sub(startedAt).Milliseconds())
if latency < 0 {
latency = 0
}
result.LinkStatus = PeerConnectionProbeReachable
result.LatencyMs = latency
if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(peer, latency, completedAt)
} else {
result.ConnectionState = m.tracker.RecordSuccess(intent.NodeID, latency, completedAt)
completedAt := normalizedNow(m.now())
if lastFailure == "" {
lastFailure = "no_probe_endpoint_available"
}
result.LinkStatus = PeerConnectionProbeUnreachable
result.FailureReason = lastFailure
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, lastFailure, completedAt)
result.CompletedAt = completedAt
return result
}
func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCacheEntry) []peerConnectionProbeTarget {
seen := map[string]struct{}{}
out := make([]peerConnectionProbeTarget, 0, len(cacheEntry.EndpointCandidates)+1)
add := func(candidateID, endpoint, transport string) {
endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/")
if endpoint == "" {
return
}
key := candidateID + "|" + endpoint
if _, ok := seen[key]; ok {
return
}
seen[key] = struct{}{}
out = append(out, peerConnectionProbeTarget{
CandidateID: strings.TrimSpace(candidateID),
Endpoint: endpoint,
Transport: strings.TrimSpace(transport),
})
}
for _, candidate := range cacheEntry.EndpointCandidates {
if !candidateUsableForDirectProbe(candidate) {
continue
}
add(candidate.EndpointID, candidate.Address, candidate.Transport)
}
add(intent.BestCandidateID, intent.Endpoint, intent.Transport)
return out
}
func candidateUsableForDirectProbe(candidate PeerEndpointCandidate) bool {
endpoint := strings.TrimSpace(candidate.Address)
if endpoint == "" || strings.HasPrefix(endpoint, "relay://") || strings.HasPrefix(endpoint, "outbound://") {
return false
}
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
transport := strings.ToLower(strings.TrimSpace(candidate.Transport))
if connectivity == "outbound_only" || connectivity == "relay_required" || reachability == "outbound_only" || reachability == "relay" {
return false
}
return transport == "" || strings.Contains(transport, "direct") || transport == "wss" || strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://")
}
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
snapshot := m.tracker.Snapshot()
for _, entry := range snapshot.Entries {
@@ -188,3 +188,71 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
}
}
func TestPeerConnectionManagerFallsBackAcrossEndpointCandidates(t *testing.T) {
now := time.Date(2026, 4, 30, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
}.Handler())
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-dead",
NodeID: "node-b",
Transport: "direct_http",
Address: "http://127.0.0.1:1",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
},
{
EndpointID: "node-b-live",
NodeID: "node-b",
Transport: "direct_http",
Address: server.URL,
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 2,
},
},
},
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
HTTPClient: &http.Client{Timeout: 100 * time.Millisecond},
ProbeTimeout: 100 * time.Millisecond,
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 1 || cycle.Succeeded != 1 || cycle.Failed != 0 || len(cycle.Results) != 1 {
t.Fatalf("unexpected cycle: %+v", cycle)
}
result := cycle.Results[0]
if result.LinkStatus != PeerConnectionProbeReachable || result.SelectedCandidateID != "node-b-live" || result.SelectedEndpoint != server.URL {
t.Fatalf("fallback did not select live candidate: %+v", result)
}
if len(result.CandidateResults) != 2 ||
result.CandidateResults[0].LinkStatus != PeerConnectionProbeUnreachable ||
result.CandidateResults[1].LinkStatus != PeerConnectionProbeReachable {
t.Fatalf("candidate probe trail mismatch: %+v", result.CandidateResults)
}
snapshot := tracker.Snapshot()
if snapshot.Ready != 1 || len(snapshot.Entries) != 1 || snapshot.Entries[0].BestCandidateID != "node-b-live" || snapshot.Entries[0].Endpoint != server.URL {
t.Fatalf("tracker did not retain selected candidate: %+v", snapshot)
}
}
@@ -138,6 +138,32 @@ func (t *PeerConnectionTracker) RecordSuccess(nodeID string, latencyMs int, now
return entry
}
func (t *PeerConnectionTracker) RecordSuccessForPeer(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
if t == nil {
return PeerConnectionState{}
}
t.mu.Lock()
defer t.mu.Unlock()
now = normalizedNow(now)
entry := t.entry(peer, now)
entry.ConsecutiveSuccesses++
entry.ConsecutiveFailures = 0
entry.LastLatencyMs = latencyMs
entry.LastFailureReason = ""
entry.LastProbeAt = now
entry.BackoffUntil = time.Time{}
nextState := PeerConnectionReady
if latencyMs >= 500 {
nextState = PeerConnectionDegraded
}
if entry.State != nextState {
entry.State = nextState
entry.LastTransitionAt = now
}
t.entries[peer.NodeID] = entry
return entry
}
func (t *PeerConnectionTracker) RecordRelayReady(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
if t == nil {
return PeerConnectionState{}
@@ -34,12 +34,20 @@ func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope,
return err
}
}
if envelope.ChannelClass != ProductionChannelFabricControl {
maxPayloadBytes := MaxProductionEnvelopePayloadBytes
switch envelope.ChannelClass {
case ProductionChannelFabricControl:
if envelope.MessageType != ProductionMessageFabricControl {
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
}
case ProductionChannelVPNPacket:
if envelope.MessageType != ProductionMessageVPNPacketBatch {
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
}
maxPayloadBytes = MaxProductionVPNPacketPayloadBytes
default:
return ErrUnauthorizedChannel
}
if envelope.MessageType != ProductionMessageFabricControl {
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
}
if envelope.TTL <= 0 {
return ErrTTLExhausted
}
@@ -58,8 +66,8 @@ func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope,
if envelope.PayloadLength != len(envelope.Payload) {
return fmt.Errorf("%w: payload_length mismatch", ErrForwardEnvelopeInvalid)
}
if envelope.PayloadLength > MaxProductionEnvelopePayloadBytes {
return fmt.Errorf("%w: payload exceeds fabric-control limit", ErrForwardEnvelopeInvalid)
if envelope.PayloadLength > maxPayloadBytes {
return fmt.Errorf("%w: payload exceeds channel limit", ErrForwardEnvelopeInvalid)
}
if envelope.PayloadHash == "" {
return fmt.Errorf("%w: payload_hash is required", ErrForwardEnvelopeInvalid)
@@ -22,7 +22,7 @@ func ValidateProductionEnvelopeRouteConfig(local PeerIdentity, envelope Producti
if route.ExpiresAt.IsZero() || !route.ExpiresAt.After(now.UTC()) || envelope.ExpiresAt.After(route.ExpiresAt) {
return ErrRouteExpired
}
if !contains(route.AllowedChannels, ProductionChannelFabricControl) {
if !contains(route.AllowedChannels, envelope.ChannelClass) {
return ErrUnauthorizedChannel
}
path := routePath(route)
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff