рабочий вариант, но скорость 10 МБит
build / backend (push) Has been cancelled
build / node-agent (push) Has been cancelled
build / worker (push) Has been cancelled

This commit is contained in:
2026-05-22 21:46:49 +03:00
parent 469fa0e860
commit 20d361a886
280 changed files with 954890 additions and 18524 deletions
@@ -1,111 +0,0 @@
package mesh
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"time"
)
// Client is a small HTTP client for the mesh endpoints under /mesh/v1/.
// Its methods use value receivers, so a Client can be copied freely.
type Client struct {
// BaseURL is the prefix that each endpoint path (for example
// "/mesh/v1/health") is appended to verbatim.
BaseURL string
// HTTPClient performs the requests. When it is nil the Send methods fall
// back to http.DefaultClient.
HTTPClient *http.Client
}
// NewClient returns a Client that targets baseURL and uses a dedicated
// http.Client with a five second overall request timeout.
func NewClient(baseURL string) Client {
	httpClient := &http.Client{Timeout: 5 * time.Second}
	return Client{BaseURL: baseURL, HTTPClient: httpClient}
}
// SendHealth POSTs message as JSON to BaseURL + "/mesh/v1/health" and returns
// the acknowledgement decoded from the response body.
//
// A status code outside the 2xx range is reported as an error. When
// c.HTTPClient is nil the request is sent through http.DefaultClient. On any
// failure the returned HealthAck is the zero value.
func (c Client) SendHealth(ctx context.Context, message HealthMessage) (HealthAck, error) {
	var none HealthAck

	body, err := json.Marshal(message)
	if err != nil {
		return none, err
	}

	request, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/health", bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}

	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh health rejected with status %d", code)
	}

	var ack HealthAck
	if err := json.NewDecoder(response.Body).Decode(&ack); err != nil {
		// Discard whatever was partially decoded.
		return none, err
	}
	return ack, nil
}
// SendSynthetic POSTs envelope as JSON to BaseURL + "/mesh/v1/synthetic/probe"
// and returns the envelope decoded from the response body.
//
// A status code outside the 2xx range is reported as an error. When
// c.HTTPClient is nil the request is sent through http.DefaultClient. On any
// failure the returned SyntheticEnvelope is the zero value.
func (c Client) SendSynthetic(ctx context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	var none SyntheticEnvelope

	body, err := json.Marshal(envelope)
	if err != nil {
		return none, err
	}

	request, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/synthetic/probe", bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}

	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh synthetic probe rejected with status %d", code)
	}

	var reply SyntheticEnvelope
	if err := json.NewDecoder(response.Body).Decode(&reply); err != nil {
		// Discard whatever was partially decoded.
		return none, err
	}
	return reply, nil
}
// SendProduction POSTs envelope as JSON to BaseURL + "/mesh/v1/forward" and
// returns the forward result decoded from the response body.
//
// A status code outside the 2xx range is reported as an error. When
// c.HTTPClient is nil the request is sent through http.DefaultClient. On any
// failure the returned ProductionForwardResult is the zero value.
func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	var none ProductionForwardResult

	body, err := json.Marshal(envelope)
	if err != nil {
		return none, err
	}

	request, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/forward", bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}

	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh production forward rejected with status %d", code)
	}

	var forwarded ProductionForwardResult
	if err := json.NewDecoder(response.Body).Decode(&forwarded); err != nil {
		// Discard whatever was partially decoded.
		return none, err
	}
	return forwarded, nil
}
@@ -70,7 +70,7 @@ const (
FabricServiceChannelReliable = "reliable"
FabricServiceChannelDroppable = "droppable"
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionVPNPacketPayloadBytes = 256 * 1024
MaxProductionVPNPacketPayloadBytes = 8 * 1024 * 1024
MaxProductionEnvelopeFutureSkew = time.Minute
ProductionForwardQUICStreamID = 1
WebIngressForwardQUICStreamID = 2
@@ -203,22 +203,6 @@ type SyntheticRelayQueueMetrics struct {
QueueDepths map[string]int `json:"queue_depths"`
}
// HealthMessage is the JSON body that Client.SendHealth posts to the
// /mesh/v1/health endpoint.
type HealthMessage struct {
ProtocolVersion string `json:"protocol_version"`
// From and To carry the two peer identities the message refers to.
// NOTE(review): presumably sender and receiver of the report — confirm.
From PeerIdentity `json:"from"`
To PeerIdentity `json:"to"`
ObservedAt time.Time `json:"observed_at"`
LinkStatus string `json:"link_status"`
// LatencyMs and QualityScore are optional: a nil pointer is omitted from
// the encoded JSON.
LatencyMs *int `json:"latency_ms,omitempty"`
QualityScore *int `json:"quality_score,omitempty"`
}
// HealthAck is the JSON response body that Client.SendHealth decodes after a
// successful (2xx) post to the /mesh/v1/health endpoint.
type HealthAck struct {
ProtocolVersion string `json:"protocol_version"`
// Accepted is always present in the encoded JSON (no omitempty).
Accepted bool `json:"accepted"`
// By carries a peer identity. NOTE(review): presumably the peer that
// produced the acknowledgement — confirm.
By PeerIdentity `json:"by"`
}
type ProductionEnvelope struct {
FabricProtocolVersion string `json:"fabric_protocol_version"`
MessageID string `json:"message_id"`
@@ -1,6 +1,7 @@
package mesh
import (
"encoding/json"
"sort"
"strings"
"time"
@@ -9,6 +10,9 @@ import (
type EndpointCandidateScoreOptions struct {
ChannelClass string
PreferredRegion string
SiteID string
LocalityGroupID string
LocalNATGroupID string
Now time.Time
MaxVerificationAge time.Duration
Observations map[string]EndpointCandidateHealthObservation
@@ -21,6 +25,7 @@ type EndpointCandidateHealthObservation struct {
EndpointID string `json:"endpoint_id"`
Source string `json:"source,omitempty"`
ReporterNodeID string `json:"reporter_node_id,omitempty"`
ReporterRegion string `json:"reporter_region,omitempty"`
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
SuccessCount uint64 `json:"success_count,omitempty"`
FailureCount uint64 `json:"failure_count,omitempty"`
@@ -114,6 +119,9 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
case "direct":
score += 30
reasons = append(reasons, "connectivity:direct")
case "private_lan":
score += 36
reasons = append(reasons, "connectivity:private_lan")
case "outbound_only":
score += 5
reasons = append(reasons, "connectivity:outbound_only")
@@ -167,6 +175,7 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
score += 18
reasons = append(reasons, "policy:private-lan")
}
score, reasons = applyLocalityPreferences(candidate, opts, score, reasons)
if hasPolicyTag(candidate.PolicyTags, "costly") {
score -= 10
reasons = append(reasons, "policy:costly")
@@ -193,7 +202,7 @@ func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCa
}
}
if observation, ok := opts.Observations[candidate.EndpointID]; ok {
observationScore, observationReasons := scoreEndpointCandidateObservation(observation, opts)
observationScore, observationReasons := scoreEndpointCandidateObservation(candidate, observation, opts)
score += observationScore
reasons = append(reasons, observationReasons...)
}
@@ -225,7 +234,7 @@ func scoreEndpointCandidateCapacityPressure(pressure EndpointCandidateCapacityPr
return -penalty, []string{"capacity:pressure"}
}
func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) {
func scoreEndpointCandidateObservation(candidate PeerEndpointCandidate, observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) {
score := 0
reasons := []string{"observation:present"}
if !opts.Now.IsZero() && !observation.ObservedAt.IsZero() && opts.MaxObservationAge > 0 {
@@ -236,6 +245,18 @@ func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObserv
score += 6
reasons = append(reasons, "observation:fresh")
}
observationScope := endpointCandidateObservationScope(candidate, observation, opts)
if observationScope != "" {
reasons = append(reasons, "observation_scope:"+observationScope)
}
if endpointRequiresExternalNetworkVerification(candidate) && (observationScope == "self" || observationScope == "same_area") {
reasons = append(reasons, "observation:non_authoritative_same_area_public")
if strings.TrimSpace(observation.LastFailureReason) == "capacity_limited" {
score -= 4
reasons = append(reasons, "capacity:limited")
}
return score, reasons
}
switch {
case observation.LastLatencyMs > 0 && observation.LastLatencyMs <= 50:
score += 24
@@ -286,6 +307,118 @@ func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObserv
return score, reasons
}
// endpointCandidateObservationScope classifies where an observation was made
// relative to the candidate it describes.
//
// It returns:
//   - "self" when the reporter node ID equals the candidate's node ID,
//   - "same_area" when reporter and candidate regions match,
//   - "cross_area" when both regions are known and differ,
//   - "" when either region is unknown.
//
// All comparisons trim whitespace and ignore case. When the observation has
// no reporter region but its source is "local_vpn_fabric_session", the
// reporter region is taken from opts.PreferredRegion.
func endpointCandidateObservationScope(candidate PeerEndpointCandidate, observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) string {
	reporterNode := strings.TrimSpace(observation.ReporterNodeID)
	candidateNode := strings.TrimSpace(candidate.NodeID)
	if reporterNode != "" && candidateNode != "" && strings.EqualFold(reporterNode, candidateNode) {
		return "self"
	}

	reporterRegion := strings.TrimSpace(observation.ReporterRegion)
	if reporterRegion == "" {
		source := strings.TrimSpace(observation.Source)
		if strings.EqualFold(source, "local_vpn_fabric_session") {
			reporterRegion = strings.TrimSpace(opts.PreferredRegion)
		}
	}
	candidateRegion := strings.TrimSpace(candidate.Region)

	switch {
	case reporterRegion == "" || candidateRegion == "":
		return ""
	case strings.EqualFold(reporterRegion, candidateRegion):
		return "same_area"
	default:
		return "cross_area"
	}
}
// endpointRequiresExternalNetworkVerification reports whether candidate is a
// public endpoint whose metadata declares
// "verification_scope": "external-network-required".
//
// Reachability and the scope value are compared case-insensitively after
// trimming whitespace. Empty, malformed or wrongly typed metadata yields
// false.
func endpointRequiresExternalNetworkVerification(candidate PeerEndpointCandidate) bool {
	if !strings.EqualFold(strings.TrimSpace(candidate.Reachability), "public") {
		return false
	}
	if len(candidate.Metadata) == 0 {
		return false
	}
	var metadata struct {
		VerificationScope string `json:"verification_scope,omitempty"`
	}
	// json.Unmarshal validates the whole document before decoding, so a
	// separate json.Valid pass would only scan the payload a second time.
	if err := json.Unmarshal(candidate.Metadata, &metadata); err != nil {
		return false
	}
	return strings.EqualFold(strings.TrimSpace(metadata.VerificationScope), "external-network-required")
}
// applyLocalityPreferences adjusts score according to the locality class that
// endpointCandidateLocality assigns to candidate and appends the matching
// "locality:*" reason.
//
// Adjustments: local_segment +65, same_nat +45, private_scoped +20,
// private_unscoped -35, private_foreign -90, public_fallback -5. Any other
// class leaves score and reasons untouched.
func applyLocalityPreferences(candidate PeerEndpointCandidate, opts EndpointCandidateScoreOptions, score int, reasons []string) (int, []string) {
	var (
		delta  int
		reason string
	)
	switch endpointCandidateLocality(candidate, opts) {
	case "local_segment":
		delta, reason = 65, "locality:local_segment"
	case "same_nat":
		delta, reason = 45, "locality:same_nat"
	case "private_scoped":
		delta, reason = 20, "locality:private_scoped"
	case "private_unscoped":
		delta, reason = -35, "locality:private_unscoped"
	case "private_foreign":
		delta, reason = -90, "locality:private_foreign"
	case "public_fallback":
		delta, reason = -5, "locality:public_fallback"
	default:
		// No locality signal: nothing to add.
		return score, reasons
	}
	return score + delta, append(reasons, reason)
}
// endpointCandidateLocality assigns candidate to a locality class relative to
// the local node described by opts.
//
// A candidate counts as private when its reachability is "private", its
// connectivity mode is "private_lan", or endpointHasPrivateHost accepts its
// address. Non-private candidates yield "public_fallback" when they are public
// and require external network verification, and "" otherwise.
//
// Private candidates are classified by the first rule that matches:
//  1. "local_segment"    - metadata locality group equals opts.LocalityGroupID
//  2. "same_nat"         - metadata NAT group equals opts.LocalNATGroupID
//  3. "private_scoped"   - metadata site equals opts.SiteID
//  4. "private_scoped"   - policy tag private-lan, corp-lan or same-site
//  5. "private_foreign"  - metadata names some group or site that did not match
//  6. "private_unscoped" - no locality information at all
func endpointCandidateLocality(candidate PeerEndpointCandidate, opts EndpointCandidateScoreOptions) string {
	reach := strings.ToLower(strings.TrimSpace(candidate.Reachability))
	mode := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))

	private := reach == "private" || mode == "private_lan" || endpointHasPrivateHost(candidate.Address)
	if !private {
		if reach == "public" && endpointRequiresExternalNetworkVerification(candidate) {
			return "public_fallback"
		}
		return ""
	}

	meta := decodeEndpointCandidateLocalityMetadata(candidate.Metadata)
	wantGroup := strings.TrimSpace(opts.LocalityGroupID)

	switch {
	case wantGroup != "" &&
		strings.TrimSpace(meta.LocalityGroupID) != "" &&
		strings.EqualFold(strings.TrimSpace(meta.LocalityGroupID), wantGroup):
		return "local_segment"
	case opts.LocalNATGroupID != "" &&
		meta.NATGroupID != "" &&
		strings.EqualFold(meta.NATGroupID, strings.TrimSpace(opts.LocalNATGroupID)):
		return "same_nat"
	case strings.TrimSpace(opts.SiteID) != "" &&
		meta.SiteID != "" &&
		strings.EqualFold(meta.SiteID, strings.TrimSpace(opts.SiteID)):
		return "private_scoped"
	case hasPolicyTag(candidate.PolicyTags, "private-lan") ||
		hasPolicyTag(candidate.PolicyTags, "corp-lan") ||
		hasPolicyTag(candidate.PolicyTags, "same-site"):
		return "private_scoped"
	case meta.LocalityGroupID != "" || meta.SiteID != "" || meta.NATGroupID != "":
		return "private_foreign"
	default:
		return "private_unscoped"
	}
}
// endpointCandidateLocalityMetadata is the subset of an endpoint candidate's
// JSON metadata that describes where the endpoint lives: its site, locality
// group and NAT group.
type endpointCandidateLocalityMetadata struct {
	SiteID          string `json:"site_id,omitempty"`
	LocalityGroupID string `json:"locality_group_id,omitempty"`
	NATGroupID      string `json:"nat_group_id,omitempty"`
}

// decodeEndpointCandidateLocalityMetadata extracts the locality fields from
// raw candidate metadata and trims surrounding whitespace from each of them.
//
// Empty, malformed or wrongly typed metadata yields the zero value; a partial
// decode is never returned.
func decodeEndpointCandidateLocalityMetadata(raw json.RawMessage) endpointCandidateLocalityMetadata {
	if len(raw) == 0 {
		return endpointCandidateLocalityMetadata{}
	}
	var metadata endpointCandidateLocalityMetadata
	// json.Unmarshal validates the whole document before decoding, so a
	// separate json.Valid pass would only scan the payload a second time.
	if err := json.Unmarshal(raw, &metadata); err != nil {
		// Unmarshal may have filled some fields before failing (for example on
		// a type mismatch); discard them.
		return endpointCandidateLocalityMetadata{}
	}
	metadata.SiteID = strings.TrimSpace(metadata.SiteID)
	metadata.LocalityGroupID = strings.TrimSpace(metadata.LocalityGroupID)
	metadata.NATGroupID = strings.TrimSpace(metadata.NATGroupID)
	return metadata
}
func hasPolicyTag(tags []string, needle string) bool {
for _, tag := range tags {
if strings.EqualFold(strings.TrimSpace(tag), needle) {
@@ -1,6 +1,7 @@
package mesh
import (
"encoding/json"
"testing"
"time"
)
@@ -526,6 +527,161 @@ func TestRankPeerEndpointCandidatesSpreadsFreshCapacityPressure(t *testing.T) {
}
}
// TestRankPeerEndpointCandidatesIgnoresSameAreaPublicVerificationFailures
// checks that failures reported from the candidate's own region against a
// public endpoint marked "external-network-required" are flagged as
// non-authoritative and do not add failure reasons.
func TestRankPeerEndpointCandidatesIgnoresSameAreaPublicVerificationFailures(t *testing.T) {
	observedAt := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)

	publicEndpoint := PeerEndpointCandidate{
		EndpointID:       "test-1-public",
		NodeID:           "test-1",
		Transport:        "direct_quic",
		Address:          "quic://94.141.118.222:19191",
		Reachability:     "public",
		NATType:          "port_restricted",
		ConnectivityMode: "direct",
		Region:           "home-test",
		Priority:         2,
		Metadata:         json.RawMessage(`{"verification_scope":"external-network-required"}`),
	}
	sameAreaFailure := EndpointCandidateHealthObservation{
		EndpointID:        "test-1-public",
		ReporterNodeID:    "home-1",
		ReporterRegion:    "home-test",
		FailureCount:      4,
		LastFailureReason: "context_deadline_exceeded",
		ReliabilityScore:  20,
		ObservedAt:        observedAt,
	}

	results := RankPeerEndpointCandidates([]PeerEndpointCandidate{publicEndpoint}, EndpointCandidateScoreOptions{
		PreferredRegion:   "home-test",
		Now:               observedAt,
		MaxObservationAge: time.Minute,
		Observations: map[string]EndpointCandidateHealthObservation{
			"test-1-public": sameAreaFailure,
		},
	})
	if got := len(results); got != 1 {
		t.Fatalf("ranked length = %d, want 1", got)
	}

	reasons := results[0].Reasons
	if !containsReason(reasons, "observation:non_authoritative_same_area_public") {
		t.Fatalf("same-area public observation should be non-authoritative: %+v", reasons)
	}
	for _, demotion := range []string{"history:failure", "failure:recent"} {
		if containsReason(reasons, demotion) {
			t.Fatalf("same-area public failures should not demote candidate: %+v", reasons)
		}
	}
}
// TestRankPeerEndpointCandidatesUsesCrossAreaPublicVerificationFailures checks
// that failures reported from a different region against a public endpoint
// marked "external-network-required" are scoped as cross-area and do add the
// failure reasons.
func TestRankPeerEndpointCandidatesUsesCrossAreaPublicVerificationFailures(t *testing.T) {
	observedAt := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)

	publicEndpoint := PeerEndpointCandidate{
		EndpointID:       "test-1-public",
		NodeID:           "test-1",
		Transport:        "direct_quic",
		Address:          "quic://94.141.118.222:19191",
		Reachability:     "public",
		NATType:          "port_restricted",
		ConnectivityMode: "direct",
		Region:           "home-test",
		Priority:         2,
		Metadata:         json.RawMessage(`{"verification_scope":"external-network-required"}`),
	}
	crossAreaFailure := EndpointCandidateHealthObservation{
		EndpointID:        "test-1-public",
		ReporterNodeID:    "usa-los-1",
		ReporterRegion:    "usa",
		FailureCount:      4,
		LastFailureReason: "context_deadline_exceeded",
		ReliabilityScore:  20,
		ObservedAt:        observedAt,
	}

	results := RankPeerEndpointCandidates([]PeerEndpointCandidate{publicEndpoint}, EndpointCandidateScoreOptions{
		PreferredRegion:   "usa",
		Now:               observedAt,
		MaxObservationAge: time.Minute,
		Observations: map[string]EndpointCandidateHealthObservation{
			"test-1-public": crossAreaFailure,
		},
	})
	if got := len(results); got != 1 {
		t.Fatalf("ranked length = %d, want 1", got)
	}

	reasons := results[0].Reasons
	if !containsReason(reasons, "observation_scope:cross_area") {
		t.Fatalf("cross-area scope missing: %+v", reasons)
	}
	for _, demotion := range []string{"history:failure", "failure:recent"} {
		if !containsReason(reasons, demotion) {
			t.Fatalf("cross-area public failures should demote candidate: %+v", reasons)
		}
	}
}
func TestRankPeerEndpointCandidatesPrefersScopedPrivateLANOverPublic(t *testing.T) {
now := time.Date(2026, 5, 19, 13, 0, 0, 0, time.UTC)
ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://94.141.118.222:19191",
Reachability: "public",
ConnectivityMode: "direct",
NATType: "port_restricted",
Priority: 2,
},
{
EndpointID: "node-b-private",
NodeID: "node-b",
Transport: "lan_quic",
Address: "quic://192.168.200.61:19134",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
Metadata: json.RawMessage(`{"locality_group_id":"home-test","nat_group_id":"home-router"}`),
},
}, EndpointCandidateScoreOptions{
PreferredRegion: "home-test",
LocalityGroupID: "home-test",
LocalNATGroupID: "home-router",
Now: now,
})
if ranked[0].Candidate.EndpointID != "node-b-private" {
t.Fatalf("top endpoint = %q, want node-b-private: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if !containsReason(ranked[0].Reasons, "locality:local_segment") {
t.Fatalf("missing locality group reason: %+v", ranked[0].Reasons)
}
}
func TestRankPeerEndpointCandidatesPenalizesForeignPrivateEndpoint(t *testing.T) {
now := time.Date(2026, 5, 19, 13, 0, 0, 0, time.UTC)
ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://94.141.118.222:19191",
Reachability: "public",
ConnectivityMode: "direct",
Priority: 2,
},
{
EndpointID: "node-b-private-foreign",
NodeID: "node-b",
Transport: "lan_quic",
Address: "quic://10.24.10.20:19443",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
Metadata: json.RawMessage(`{"locality_group_id":"other-site","nat_group_id":"other-nat"}`),
},
}, EndpointCandidateScoreOptions{
PreferredRegion: "home-test",
LocalityGroupID: "home-test",
LocalNATGroupID: "home-router",
Now: now,
})
if ranked[0].Candidate.EndpointID != "node-b-public" {
t.Fatalf("top endpoint = %q, want node-b-public: %+v", ranked[0].Candidate.EndpointID, ranked)
}
if !containsReason(ranked[1].Reasons, "locality:private_foreign") {
t.Fatalf("missing foreign private reason: %+v", ranked[1].Reasons)
}
}
func containsReason(reasons []string, reason string) bool {
for _, item := range reasons {
if item == reason {
@@ -23,7 +23,7 @@ func FabricTransportTargetFromRegistryEndpoint(endpoint FabricRegistryEndpoint)
return FabricTransportTarget{
EndpointID: strings.TrimSpace(endpoint.EndpointID),
PeerID: strings.TrimSpace(endpoint.EndpointID),
Endpoint: strings.TrimSpace(endpoint.Address),
Endpoint: fabricControlEndpointAddress(endpoint),
Transport: strings.TrimSpace(endpoint.Transport),
PeerCertSHA256: strings.TrimSpace(endpoint.PeerCertSHA256),
Timeout: 5 * time.Second,
@@ -32,6 +32,28 @@ func FabricTransportTargetFromRegistryEndpoint(endpoint FabricRegistryEndpoint)
}
}
// fabricControlEndpointAddress picks the address used to reach endpoint for
// fabric control traffic.
//
// A non-empty "maps_to" string in the endpoint metadata takes precedence over
// endpoint.Address; when it carries no URL scheme ("://") it is prefixed with
// "quic://". Otherwise the trimmed endpoint.Address is returned.
func fabricControlEndpointAddress(endpoint FabricRegistryEndpoint) string {
	mapped := fabricControlMetadataString(endpoint.Metadata, "maps_to")
	switch {
	case mapped == "":
		return strings.TrimSpace(endpoint.Address)
	case strings.Contains(mapped, "://"):
		return mapped
	default:
		return "quic://" + mapped
	}
}
// fabricControlMetadataString returns the whitespace-trimmed string stored
// under key in the JSON object raw.
//
// It returns "" when raw is empty, is not a JSON object, lacks the key, or
// holds a non-string value under it.
func fabricControlMetadataString(raw json.RawMessage, key string) string {
	if len(raw) == 0 {
		return ""
	}

	var fields map[string]any
	if json.Unmarshal(raw, &fields) != nil {
		return ""
	}

	text, isString := fields[key].(string)
	if !isString {
		return ""
	}
	return strings.TrimSpace(text)
}
func SendFabricControlForward(ctx context.Context, transport FabricTransport, endpoint FabricRegistryEndpoint, payload []byte, timeout time.Duration) (FabricControlForwardResult, error) {
if transport == nil {
return FabricControlForwardResult{}, fmt.Errorf("fabric control transport is unavailable")
@@ -137,7 +137,7 @@ type FabricAdjacency struct {
PressurePercent int
Healthy bool
PassiveOutbound bool
LocalSegmentID string
LocalityGroupID string
NATGroupID string
LastObservedAt time.Time
LastFailureReason string
@@ -0,0 +1,74 @@
package mesh
import (
"context"
"fmt"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/fabricproto"
)
// ProbeFabricTarget opens a fabric session to target, sends a single ping
// frame and waits for the pong carrying the same sequence number.
//
// The returned duration covers the ping/pong exchange only; the clock starts
// after the session has been established. Non-positive Timeout, InboundBuffer
// and ErrorBuffer values on target are replaced by 2s, 2 and 2 respectively.
// The session and transport created for the probe are closed before returning.
//
// An error is returned when the transport cannot be built or connected, the
// ping cannot be sent, the session reports an error, either session channel is
// closed, or ctx is done before the pong arrives.
func ProbeFabricTarget(ctx context.Context, target FabricTransportTarget) (time.Duration, error) {
	target.Timeout = positiveDurationOr(target.Timeout, 2*time.Second)
	target.InboundBuffer = positiveIntOr(target.InboundBuffer, 2)
	target.ErrorBuffer = positiveIntOr(target.ErrorBuffer, 2)

	transport, dialTarget, err := FabricTransportForTarget(target, nil)
	if err != nil {
		return 0, err
	}
	// Registered first so it runs last: the transport is closed after the
	// session on success, and on its own when Connect fails.
	defer func() { _ = transport.Close() }()

	session, err := transport.Connect(ctx, dialTarget)
	if err != nil {
		return 0, err
	}
	defer func() { _ = session.Close() }()

	sentAt := time.Now()
	// The send timestamp doubles as the sequence used to recognise our pong.
	sequence := uint64(sentAt.UnixNano())
	ping := fabricproto.Frame{
		Type:         fabricproto.FramePing,
		TrafficClass: fabricproto.TrafficClassReliable,
		Sequence:     sequence,
		Payload:      []byte("fabric-live-probe"),
	}
	if err := session.Send(ctx, ping); err != nil {
		return 0, err
	}

	for {
		select {
		case <-ctx.Done():
			return 0, ctx.Err()
		case probeErr, open := <-session.Errors():
			if !open {
				return 0, fmt.Errorf("fabric live probe error channel closed")
			}
			if probeErr != nil {
				return 0, probeErr
			}
		case frame, open := <-session.Frames():
			if !open {
				return 0, fmt.Errorf("fabric live probe session closed")
			}
			// Frames other than the matching pong are ignored.
			if frame.Type == fabricproto.FramePong && frame.Sequence == sequence {
				return time.Since(sentAt), nil
			}
		}
	}
}
// positiveDurationOr returns value when it is strictly positive and fallback
// otherwise. fallback is returned as given, even if it is not positive.
func positiveDurationOr(value time.Duration, fallback time.Duration) time.Duration {
	if value <= 0 {
		return fallback
	}
	return value
}
// positiveIntOr returns value when it is strictly positive and fallback
// otherwise. fallback is returned as given, even if it is not positive.
func positiveIntOr(value int, fallback int) int {
	if value <= 0 {
		return fallback
	}
	return value
}
@@ -59,7 +59,7 @@ func StartQUICFabricServer(ctx context.Context, cfg QUICFabricServerConfig) (*QU
if len(tlsConfig.NextProtos) == 0 {
tlsConfig.NextProtos = []string{fabricQUICNextProto}
}
listener, err := quic.ListenAddr(cfg.ListenAddr, tlsConfig, cfg.QUICConfig)
listener, err := quic.ListenAddr(cfg.ListenAddr, tlsConfig, defaultQUICFabricConfig(cfg.QUICConfig))
if err != nil {
return nil, err
}
@@ -132,7 +132,7 @@ func (s *QUICFabricServer) handleConn(ctx context.Context, conn *quic.Conn) {
func (s *QUICFabricServer) handleStream(ctx context.Context, conn *quic.Conn, stream *quic.Stream) {
session := fabricproto.NewSession(fabricproto.SessionConfig{})
sender := quicStreamFrameSender{stream: stream}
sender := &quicStreamFrameSender{stream: stream}
defer func() { _ = stream.Close() }()
s.logFabricSession(FabricSessionEventLogEntry{
Event: "fabric_session_quic_stream_opened",
@@ -207,7 +207,7 @@ type quicStreamFrameSender struct {
mu sync.Mutex
}
func (s quicStreamFrameSender) SendFrame(ctx context.Context, frame fabricproto.Frame) error {
func (s *quicStreamFrameSender) SendFrame(ctx context.Context, frame fabricproto.Frame) error {
if s.stream == nil {
return fmt.Errorf("quic fabric stream is closed")
}
@@ -22,6 +22,9 @@ const fabricQUICNextProto = "rap-fabric-data-session-v1"
const fabricQUICReverseHelloPrefix = "rap-fabric-reverse-hello-v1:"
const defaultQUICFabricConnIdleTTL = 5 * time.Minute
const defaultQUICFabricMaxStreamsPerConn = 64
const defaultQUICFabricHandshakeIdleTimeout = 8 * time.Second
const defaultQUICFabricMaxIdleTimeout = 90 * time.Second
const defaultQUICFabricKeepAlivePeriod = 15 * time.Second
const ErrQUICFabricStreamLimitReached = quicFabricError("quic fabric stream limit reached")
type quicFabricError string
@@ -31,20 +34,20 @@ func (e quicFabricError) Error() string {
}
type QUICFabricTransport struct {
Config *quic.Config
LocalPeerID string
IdleTTL time.Duration
MaxStreamsPerConn int
DialAddr func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error)
mu sync.Mutex
conns map[string]*quicFabricConnEntry
reverseConns map[string]*quicFabricConnEntry
inboundProductionHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
inboundWebIngressHandler func(context.Context, []byte) ([]byte, error)
Config *quic.Config
LocalPeerID string
IdleTTL time.Duration
MaxStreamsPerConn int
DialAddr func(context.Context, string, *tls.Config, *quic.Config) (*quic.Conn, error)
mu sync.Mutex
conns map[string]*quicFabricConnEntry
reverseConns map[string]*quicFabricConnEntry
inboundProductionHandler func(context.Context, ProductionEnvelope) (ProductionForwardResult, error)
inboundWebIngressHandler func(context.Context, []byte) ([]byte, error)
inboundFabricControlHandler func(context.Context, []byte) ([]byte, error)
inboundSyntheticHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
logger FabricSessionEventLogger
stats QUICFabricTransportStats
inboundSyntheticHandler func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error)
logger FabricSessionEventLogger
stats QUICFabricTransportStats
}
type QUICFabricTransportStats struct {
@@ -109,7 +112,25 @@ type quicFabricConnEntry struct {
}
func NewQUICFabricTransport(config *quic.Config) *QUICFabricTransport {
return &QUICFabricTransport{Config: config, IdleTTL: defaultQUICFabricConnIdleTTL, MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn, conns: map[string]*quicFabricConnEntry{}, reverseConns: map[string]*quicFabricConnEntry{}}
return &QUICFabricTransport{Config: defaultQUICFabricConfig(config), IdleTTL: defaultQUICFabricConnIdleTTL, MaxStreamsPerConn: defaultQUICFabricMaxStreamsPerConn, conns: map[string]*quicFabricConnEntry{}, reverseConns: map[string]*quicFabricConnEntry{}}
}
// defaultQUICFabricConfig returns a quic.Config based on config with the
// fabric defaults filled in.
//
// The input is copied, never modified; a nil config starts from the zero
// quic.Config. Non-positive HandshakeIdleTimeout, MaxIdleTimeout and
// KeepAlivePeriod are replaced by defaultQUICFabricHandshakeIdleTimeout,
// defaultQUICFabricMaxIdleTimeout and defaultQUICFabricKeepAlivePeriod.
func defaultQUICFabricConfig(config *quic.Config) *quic.Config {
	var effective quic.Config
	if config != nil {
		effective = *config
	}

	if effective.HandshakeIdleTimeout <= 0 {
		effective.HandshakeIdleTimeout = defaultQUICFabricHandshakeIdleTimeout
	}
	if effective.MaxIdleTimeout <= 0 {
		effective.MaxIdleTimeout = defaultQUICFabricMaxIdleTimeout
	}
	if effective.KeepAlivePeriod <= 0 {
		effective.KeepAlivePeriod = defaultQUICFabricKeepAlivePeriod
	}
	return &effective
}
func (t *QUICFabricTransport) SetInboundHandlers(production func(context.Context, ProductionEnvelope) (ProductionForwardResult, error), synthetic func(context.Context, SyntheticEnvelope) (SyntheticEnvelope, error), logger FabricSessionEventLogger) {
@@ -150,6 +171,7 @@ func quicTLSConfigForTarget(target FabricTransportTarget) *tls.Config {
expectedFingerprint := normalizeCertSHA256(target.PeerCertSHA256)
config := &tls.Config{NextProtos: []string{fabricQUICNextProto}}
if expectedFingerprint == "" {
config.InsecureSkipVerify = true
return config
}
config.InsecureSkipVerify = true
@@ -198,9 +220,12 @@ func (t *QUICFabricTransport) Connect(ctx context.Context, target FabricTranspor
stream, err := conn.OpenStreamSync(ctx)
if err != nil {
t.releaseStream(connKey)
t.evictConnByKey(connKey, conn)
t.evictConn(target, conn)
if closeConn {
_ = conn.CloseWithError(1, "open stream failed")
} else {
_ = conn.CloseWithError(1, "cached stream open failed")
}
return nil, err
}
@@ -680,8 +705,28 @@ func (t *QUICFabricTransport) evictConn(target FabricTransportTarget, conn *quic
t.mu.Unlock()
}
// evictConnByKey removes the cached connection stored under key, but only if
// the cached entry still refers to conn, and counts the removal in
// stats.ClosedEvicted.
//
// Keys starting with "reverse\x00" address the reverse-connection table by the
// peer ID that follows the prefix; all other keys address the regular
// connection table. The connection itself is not closed here. A nil receiver,
// empty key or nil conn is a no-op.
func (t *QUICFabricTransport) evictConnByKey(key string, conn *quic.Conn) {
	if t == nil || key == "" || conn == nil {
		return
	}
	const reversePrefix = "reverse\x00"

	t.mu.Lock()
	defer t.mu.Unlock()

	table, id := t.conns, key
	if strings.HasPrefix(key, reversePrefix) {
		table, id = t.reverseConns, key[len(reversePrefix):]
	}

	entry := table[id]
	if entry == nil || entry.conn != conn {
		// Missing, or already replaced by a newer connection.
		return
	}
	delete(table, id)
	t.stats.ClosedEvicted++
}
func (t *QUICFabricTransport) pruneIdleLocked(now time.Time) {
if t == nil || len(t.conns) == 0 {
if t == nil {
return
}
ttl := t.IdleTTL
@@ -897,7 +942,13 @@ func (s *quicFabricSession) Send(ctx context.Context, frame fabricproto.Frame) e
s.writeMu.Lock()
defer s.writeMu.Unlock()
s.applyWriteDeadline(ctx)
return fabricproto.WriteFrame(s.stream, frame)
if err := fabricproto.WriteFrame(s.stream, frame); err != nil {
if s.transport != nil && s.conn != nil {
s.transport.evictConnByKey(s.connKey, s.conn)
}
return err
}
return nil
}
func (s *quicFabricSession) Frames() <-chan fabricproto.Frame {
@@ -21,7 +21,7 @@ const (
type FabricRoutePlannerConfig struct {
ClusterID string
LocalNodeID string
LocalSegmentID string
LocalityGroupID string
LocalNATGroupID string
DefaultCapacity int
RelayCapacity int
@@ -34,13 +34,13 @@ type FabricRoutePlannerConfig struct {
}
type FabricCandidateMetadata struct {
LocalSegmentID string `json:"local_segment_id,omitempty"`
NATGroupID string `json:"nat_group_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ViaNodeID string `json:"via_node_id,omitempty"`
STUNServer string `json:"stun_server,omitempty"`
ICEFoundation string `json:"ice_foundation,omitempty"`
LocalityGroupID string `json:"locality_group_id,omitempty"`
NATGroupID string `json:"nat_group_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ViaNodeID string `json:"via_node_id,omitempty"`
STUNServer string `json:"stun_server,omitempty"`
ICEFoundation string `json:"ice_foundation,omitempty"`
}
func FabricRouteSetForPeerEndpointCandidates(targetNodeID string, candidates []PeerEndpointCandidate, cfg FabricRoutePlannerConfig) FabricRouteSet {
@@ -141,7 +141,7 @@ func fabricRouteModeForPeerEndpointCandidate(candidate PeerEndpointCandidate, me
}
reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability))
connectivity := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode))
if sameLocalSegment(metadata, cfg) || sameNATGroup(metadata, cfg) {
if sameLocalityGroup(metadata, cfg) || sameNATGroup(metadata, cfg) {
return FabricRouteLAN
}
if reachability == FabricCandidateReachabilityRelay || connectivity == FabricConnectivityRelayRequired || strings.TrimSpace(metadata.RelayEndpoint) != "" {
@@ -240,12 +240,12 @@ func candidatePressureCount(endpointID string, cfg FabricRoutePlannerConfig) int
return 0
}
func sameLocalSegment(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
localSegment := strings.TrimSpace(cfg.LocalSegmentID)
if localSegment == "" {
func sameLocalityGroup(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
localityGroup := strings.TrimSpace(cfg.LocalityGroupID)
if localityGroup == "" {
return false
}
return strings.EqualFold(strings.TrimSpace(metadata.LocalSegmentID), localSegment)
return strings.EqualFold(strings.TrimSpace(metadata.LocalityGroupID), localityGroup)
}
func sameNATGroup(metadata FabricCandidateMetadata, cfg FabricRoutePlannerConfig) bool {
@@ -7,7 +7,7 @@ import (
)
func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
metadata, _ := json.Marshal(FabricCandidateMetadata{LocalSegmentID: "site-a", NATGroupID: "nat-a"})
metadata, _ := json.Marshal(FabricCandidateMetadata{LocalityGroupID: "home-lan", NATGroupID: "nat-a"})
routeSet := FabricRouteSetForPeerEndpointCandidates("node-b", []PeerEndpointCandidate{
{
EndpointID: "node-b-public",
@@ -31,7 +31,7 @@ func TestFabricRouteSetForPeerEndpointCandidatesPrefersLocalLAN(t *testing.T) {
}, FabricRoutePlannerConfig{
ClusterID: "cluster-1",
LocalNodeID: "node-a",
LocalSegmentID: "site-a",
LocalityGroupID: "home-lan",
DefaultCapacity: 200,
Now: time.Unix(100, 0).UTC(),
})
@@ -172,7 +172,7 @@ func TestFabricRouteSetForPeerEndpointCandidatesRejectsNonQUIC(t *testing.T) {
ConnectivityMode: "direct",
},
{
EndpointID: "node-b-legacy-relay",
EndpointID: "node-b-compat-relay",
NodeID: "node-b",
Transport: "relay",
Address: "quic://node-r:19443",
@@ -180,7 +180,7 @@ func TestFabricRouteSetForPeerEndpointCandidatesRejectsNonQUIC(t *testing.T) {
ConnectivityMode: "relay_required",
},
{
EndpointID: "node-b-legacy-reverse",
EndpointID: "node-b-compat-reverse",
NodeID: "node-b",
Transport: "outbound_reverse",
Address: "quic://node-b:19443",
@@ -4,7 +4,6 @@ import (
"context"
"crypto/tls"
"fmt"
"net/http"
"strings"
"time"
@@ -30,7 +29,6 @@ type FabricTransportTarget struct {
Endpoint string
Transport string
Token string
Header http.Header
TLSConfig *tls.Config
PeerCertSHA256 string
Timeout time.Duration
@@ -11,6 +11,8 @@ const DefaultWarmPeerLimit = 8
type PeerCacheConfig struct {
Local PeerIdentity
LocalityGroupID string
LocalNATGroupID string
PeerEndpoints map[string]string
PeerEndpointCandidates map[string][]PeerEndpointCandidate
PeerEndpointObservations map[string]EndpointCandidateHealthObservation
@@ -59,11 +61,12 @@ type PeerCacheEntry struct {
BestCandidateScore int `json:"best_candidate_score,omitempty"`
BestScoreReasons []string `json:"best_score_reasons,omitempty"`
BestPeerCertSHA256 string `json:"best_peer_cert_sha256,omitempty"`
PublicIngressCount int `json:"public_ingress_count,omitempty"`
EndpointCandidates []PeerEndpointCandidate `json:"endpoint_candidates,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
RelayQUIC bool `json:"relay_quic"`
}
type peerCacheBuildEntry struct {
@@ -119,6 +122,8 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
scored := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
ChannelClass: SyntheticChannelFabricControl,
PreferredRegion: cfg.PreferredRegion,
LocalityGroupID: cfg.LocalityGroupID,
LocalNATGroupID: cfg.LocalNATGroupID,
Now: now,
MaxVerificationAge: time.Hour,
Observations: cfg.PeerEndpointObservations,
@@ -129,6 +134,7 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
for _, scoredCandidate := range scored {
entry.EndpointCandidates = append(entry.EndpointCandidates, scoredCandidate.Candidate)
}
entry.PublicIngressCount = publicIngressCountFromCandidates(entry.EndpointCandidates)
entry.BestCandidateID = scored[0].Candidate.EndpointID
entry.BestCandidateAddr = scored[0].Candidate.Address
entry.BestTransport = scored[0].Candidate.Transport
@@ -197,9 +203,9 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
entry.RendezvousLeaseID = lease.LeaseID
entry.RelayNodeID = lease.RelayNodeID
entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
entry.RelayControl = true
entry.RelayQUIC = true
entry.CandidateCount = maxInt(entry.CandidateCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"})
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_quic"})
if useLeaseEndpoint {
if localRelay {
entry.BestTransport = "reverse_quic"
@@ -225,7 +231,7 @@ func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
entry.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
}
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_control"})
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_quic"})
}
}
out := make([]peerCacheBuildEntry, 0, len(entries))
@@ -334,13 +340,37 @@ func warmPeerPriority(entry peerCacheBuildEntry) int {
if entry.bestScore > 0 {
score += entry.bestScore
}
if entry.RelayControl {
if entry.RelayQUIC {
score += 300
}
if entry.PublicIngressCount > 0 {
score += entry.PublicIngressCount * 75
}
score += entry.CandidateCount
return score
}
// publicIngressCountFromCandidates returns the number of distinct addresses
// among candidates that are publicly reachable over a QUIC transport.
//
// A candidate is counted when its Reachability, trimmed and lower-cased,
// equals "public", its Transport contains "quic" (case-insensitively) and its
// Address is non-empty. Addresses are stripped of surrounding whitespace and
// trailing slashes before de-duplication, matching how endpoints are
// normalised elsewhere in this package, so the same endpoint spelled with and
// without a trailing "/" is counted only once.
func publicIngressCountFromCandidates(candidates []PeerEndpointCandidate) int {
	if len(candidates) == 0 {
		return 0
	}
	distinct := make(map[string]struct{}, len(candidates))
	for _, candidate := range candidates {
		if strings.ToLower(strings.TrimSpace(candidate.Reachability)) != "public" {
			continue
		}
		if !strings.Contains(strings.ToLower(strings.TrimSpace(candidate.Transport)), "quic") {
			continue
		}
		// Normalise before de-duplicating: a trailing slash does not make a
		// different ingress point and must not inflate the count.
		address := strings.TrimRight(strings.TrimSpace(candidate.Address), "/")
		if address == "" {
			continue
		}
		distinct[address] = struct{}{}
	}
	return len(distinct)
}
func warmPeerReason(entry peerCacheBuildEntry) string {
if entry.adjacentRoutePeer {
return "route_adjacent"
@@ -348,7 +378,7 @@ func warmPeerReason(entry peerCacheBuildEntry) string {
if entry.RecoverySeed {
return "recovery_seed"
}
if entry.RelayControl {
if entry.RelayQUIC {
return "rendezvous_lease"
}
if entry.BestCandidateID != "" {
@@ -98,6 +98,9 @@ func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
if entry.BestCandidateID != "node-b-public" || !entry.Warm {
t.Fatalf("unexpected candidate selection: %+v", entry)
}
if entry.PublicIngressCount != 1 {
t.Fatalf("public ingress count = %d, want 1", entry.PublicIngressCount)
}
}
func TestPeerCacheAppliesEndpointHealthObservations(t *testing.T) {
@@ -224,3 +227,12 @@ func peerCacheEntryByID(snapshot PeerCacheSnapshot, nodeID string) (PeerCacheEnt
}
return PeerCacheEntry{}, false
}
// containsString reports whether want is equal to at least one element of
// values. A nil or empty slice never contains anything.
func containsString(values []string, want string) bool {
	found := false
	for idx := 0; idx < len(values) && !found; idx++ {
		found = values[idx] == want
	}
	return found
}
@@ -21,7 +21,7 @@ const (
PeerTransportModeCorporateLAN = "corporate_lan"
PeerTransportModeOutboundOnly = "outbound_only"
PeerTransportModeRelayRequired = "relay_required"
PeerTransportModeRelayControl = "relay_control"
PeerTransportModeRelayQUIC = "relay_quic"
PeerTransportModeUnknown = "unknown"
)
@@ -44,7 +44,7 @@ type PeerConnectionIntentPlan struct {
CorporateLANCount int `json:"corporate_lan_count"`
OutboundOnlyCount int `json:"outbound_only_count"`
RelayRequiredCount int `json:"relay_required_count"`
RelayControlCount int `json:"relay_control_count"`
RelayQUICCount int `json:"relay_quic_count"`
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
@@ -113,8 +113,8 @@ func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectio
RendezvousLeaseID: entry.RendezvousLeaseID,
RelayNodeID: entry.RelayNodeID,
RelayEndpoint: entry.RelayEndpoint,
RelayCandidate: entry.RelayControl,
ControlPlaneOnly: entry.RelayControl,
RelayCandidate: entry.RelayQUIC,
ControlPlaneOnly: entry.RelayQUIC,
RecoverySeed: candidate.RecoverySeed || entry.RecoverySeed,
Priority: candidate.Priority,
GeneratedAt: now,
@@ -163,8 +163,8 @@ func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectio
plan.OutboundOnlyCount++
case PeerTransportModeRelayRequired:
plan.RelayRequiredCount++
case PeerTransportModeRelayControl:
plan.RelayControlCount++
case PeerTransportModeRelayQUIC:
plan.RelayQUICCount++
}
if intent.RequiresRendezvous {
plan.RendezvousRequiredCount++
@@ -266,7 +266,7 @@ func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLeas
} else {
intent.Transport = firstNonEmpty(lease.Transport, "relay_quic")
}
intent.TransportMode = PeerTransportModeRelayControl
intent.TransportMode = PeerTransportModeRelayQUIC
intent.RequiresRendezvous = false
intent.RendezvousResolved = true
intent.DirectCandidate = false
@@ -170,11 +170,11 @@ func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
Now: now,
})
if plan.IntentCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 {
if plan.IntentCount != 1 || plan.RelayQUICCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.TransportMode != PeerTransportModeRelayControl ||
if intent.TransportMode != PeerTransportModeRelayQUIC ||
intent.Endpoint != "quic://node-r:19443" ||
intent.RelayNodeID != "node-r" ||
intent.RendezvousLeaseID != "lease-node-b-via-node-r" ||
@@ -239,7 +239,7 @@ func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T
Now: now,
})
if plan.RendezvousResolvedCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousRequiredCount != 0 {
if plan.RendezvousResolvedCount != 1 || plan.RelayQUICCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected reselected plan counts: %+v", plan)
}
intent := plan.Intents[0]
@@ -3,7 +3,6 @@ package mesh
import (
"context"
"fmt"
"net/http"
"strings"
"sync"
"time"
@@ -25,7 +24,6 @@ type PeerConnectionManagerConfig struct {
PeerCache *PeerCache
Tracker *PeerConnectionTracker
RendezvousLeases []PeerRendezvousLease
HTTPClient *http.Client
QUICTransport *QUICFabricTransport
PreferredRegion string
ProbeTimeout time.Duration
@@ -37,7 +35,6 @@ type PeerConnectionManager struct {
peerCache *PeerCache
tracker *PeerConnectionTracker
rendezvousLeases []PeerRendezvousLease
httpClient *http.Client
quicTransport *QUICFabricTransport
preferredRegion string
probeTimeout time.Duration
@@ -60,7 +57,7 @@ type PeerConnectionManagerCycle struct {
Skipped int `json:"skipped"`
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RelayControlCount int `json:"relay_control_count"`
RelayQUICCount int `json:"relay_quic_count"`
RecoveryPlan PeerRecoveryPlan `json:"recovery_plan"`
IntentPlan PeerConnectionIntentPlan `json:"intent_plan"`
Results []PeerConnectionProbeResult `json:"results,omitempty"`
@@ -117,17 +114,6 @@ func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionMa
if probeTimeout <= 0 {
probeTimeout = DefaultPeerConnectionProbeTimeout
}
httpClient := cfg.HTTPClient
if httpClient == nil {
httpClient = &http.Client{
Transport: &http.Transport{
MaxIdleConns: 64,
MaxIdleConnsPerHost: 8,
IdleConnTimeout: 90 * time.Second,
},
Timeout: probeTimeout + time.Second,
}
}
now := cfg.Now
if now == nil {
now = func() time.Time { return time.Now().UTC() }
@@ -137,7 +123,6 @@ func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionMa
peerCache: cfg.PeerCache,
tracker: cfg.Tracker,
rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...),
httpClient: httpClient,
quicTransport: cfg.QUICTransport,
preferredRegion: strings.TrimSpace(cfg.PreferredRegion),
probeTimeout: probeTimeout,
@@ -157,6 +142,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
Connections: m.tracker.Snapshot(),
TargetReadyPeers: DefaultStablePeerTarget,
MaxProbeCandidates: DefaultRecoveryProbeLimit,
PreferredRegion: m.preferredRegion,
Now: startedAt,
})
intentPlan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
@@ -177,7 +163,7 @@ func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionMan
IntentCount: intentPlan.IntentCount,
RendezvousRequiredCount: intentPlan.RendezvousRequiredCount,
RendezvousResolvedCount: intentPlan.RendezvousResolvedCount,
RelayControlCount: intentPlan.RelayControlCount,
RelayQUICCount: intentPlan.RelayQUICCount,
RecoveryPlan: recoveryPlan,
IntentPlan: intentPlan,
Results: make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
@@ -270,7 +256,7 @@ func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConn
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
RelayControl: intent.RelayCandidate,
RelayQUIC: intent.RelayCandidate,
BestPeerCertSHA256: firstNonEmpty(intent.BestPeerCertSHA256, cacheEntry.BestPeerCertSHA256),
}
if intent.RequiresRendezvous {
@@ -385,7 +371,7 @@ func peerConnectionProbeTargetNodeID(intent PeerConnectionIntent, localNodeID st
func (m *PeerConnectionManager) probePeerTarget(ctx context.Context, probePeer PeerCacheEntry, target PeerIdentity) error {
endpoint := strings.TrimRight(strings.TrimSpace(probePeer.Endpoint), "/")
transport := strings.TrimSpace(probePeer.BestTransport)
if hasLegacyEndpointScheme(endpoint) {
if hasUnsupportedEndpointScheme(endpoint) {
return fmt.Errorf("non_quic_probe_rejected")
}
if peerConnectionTargetIsQUIC(transport, endpoint) {
@@ -445,7 +431,7 @@ func peerConnectionProbeTargets(intent PeerConnectionIntent, cacheEntry PeerCach
}
add(candidate.EndpointID, candidate.Address, candidate.Transport, candidatePeerCertSHA256(candidate))
}
add(intent.BestCandidateID, intent.Endpoint, intent.Transport, cacheEntry.BestPeerCertSHA256)
add(intent.BestCandidateID, intent.Endpoint, intent.Transport, intent.BestPeerCertSHA256)
return out
}
@@ -455,7 +441,7 @@ func peerConnectionShouldProbeDirectUpgrade(intent PeerConnectionIntent, cacheEn
}
if strings.TrimSpace(intent.ConnectionState) != PeerConnectionRelayReady &&
!intent.RelayCandidate &&
strings.TrimSpace(intent.TransportMode) != PeerTransportModeRelayControl {
strings.TrimSpace(intent.TransportMode) != PeerTransportModeRelayQUIC {
return false
}
for _, candidate := range cacheEntry.EndpointCandidates {
@@ -509,8 +495,3 @@ func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionSta
}
return PeerConnectionState{NodeID: nodeID, State: PeerConnectionDisconnected}
}
func (c Client) withHTTPClient(httpClient *http.Client) Client {
c.HTTPClient = httpClient
return c
}
@@ -3,7 +3,6 @@ package mesh
import (
"context"
"encoding/json"
"net/http"
"testing"
"time"
)
@@ -90,7 +89,7 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-b": "http://127.0.0.1:1",
"node-b": "quic://127.0.0.1:1",
},
WarmPeerLimit: 1,
Now: now,
@@ -100,7 +99,6 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
Local: local,
PeerCache: cache,
Tracker: tracker,
HTTPClient: &http.Client{Timeout: 20 * time.Millisecond},
ProbeTimeout: 20 * time.Millisecond,
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
@@ -121,7 +119,7 @@ func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testin
}
}
func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
func TestPeerConnectionManagerProbesRelayQUICLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
tlsConfig := testQUICTLSConfig(t)
@@ -188,7 +186,7 @@ func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
if cycle.Attempted != 1 ||
cycle.Succeeded != 1 ||
cycle.Deferred != 0 ||
cycle.RelayControlCount != 1 ||
cycle.RelayQUICCount != 1 ||
cycle.RendezvousResolvedCount != 1 ||
cycle.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control cycle: %+v", cycle)
@@ -227,11 +225,11 @@ func TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256(t *testing.T) {
BestPeerCertSHA256: "intent-cert",
}
cacheEntry := PeerCacheEntry{
NodeID: "node-b",
BestPeerCertSHA256: "cache-cert",
BestCandidateID: "node-b-best",
BestTransport: "direct_quic",
Endpoint: "quic://94.141.118.222:19199",
NodeID: "node-b",
BestPeerCertSHA256: "cache-cert",
BestCandidateID: "node-b-best",
BestTransport: "direct_quic",
Endpoint: "quic://94.141.118.222:19199",
EndpointCandidates: []PeerEndpointCandidate{
{
EndpointID: "node-b-public",
@@ -259,6 +257,49 @@ func TestPeerConnectionProbeTargetsFallsBackToBestPeerCertSHA256(t *testing.T) {
}
}
func TestPeerConnectionProbeTargetsUsesRelayLeaseCertForRelayEndpoint(t *testing.T) {
intent := PeerConnectionIntent{
NodeID: "node-b",
BestCandidateID: "lease-node-b-via-node-r",
Endpoint: "quic://195.123.240.88:19131",
Transport: "relay_quic",
BestPeerCertSHA256: "relay-cert",
RelayCandidate: true,
ConnectionState: PeerConnectionBackoff,
}
cacheEntry := PeerCacheEntry{
NodeID: "node-b",
BestPeerCertSHA256: "direct-cert",
EndpointCandidates: []PeerEndpointCandidate{
{
EndpointID: "node-b-private",
NodeID: "node-b",
Transport: "direct_quic",
Address: "quic://192.168.200.61:19132",
Reachability: "private",
ConnectivityMode: "private_lan",
Priority: 1,
Metadata: peerConnectionProbeMetadata(t, "direct-cert"),
},
},
}
targets := peerConnectionProbeTargets(intent, cacheEntry)
if len(targets) != 2 {
t.Fatalf("target count = %d, want 2", len(targets))
}
for _, target := range targets {
if target.Endpoint != "quic://195.123.240.88:19131" {
continue
}
if target.PeerCertSHA256 != "relay-cert" {
t.Fatalf("relay endpoint cert = %q, want relay-cert", target.PeerCertSHA256)
}
return
}
t.Fatalf("relay endpoint target not found: %+v", targets)
}
func TestPeerConnectionProbeTargetsUpgradeRelayReadyPeerToDirectQUIC(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
current := now
@@ -36,7 +36,7 @@ type PeerConnectionState struct {
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
RelayQUIC bool `json:"relay_quic"`
ConsecutiveSuccesses int `json:"consecutive_successes"`
ConsecutiveFailures int `json:"consecutive_failures"`
LastLatencyMs int `json:"last_latency_ms,omitempty"`
@@ -287,7 +287,7 @@ func (t *PeerConnectionTracker) entry(peer PeerCacheEntry, now time.Time) PeerCo
entry.RendezvousLeaseID = peer.RendezvousLeaseID
entry.RelayNodeID = peer.RelayNodeID
entry.RelayEndpoint = peer.RelayEndpoint
entry.RelayControl = peer.RelayControl
entry.RelayQUIC = peer.RelayQUIC
return entry
}
@@ -21,6 +21,7 @@ type PeerRecoveryPlanConfig struct {
Connections PeerConnectionSnapshot
TargetReadyPeers int
MaxProbeCandidates int
PreferredRegion string
Now time.Time
}
@@ -42,6 +43,7 @@ type PeerRecoveryPlan struct {
type PeerRecoveryCandidate struct {
NodeID string `json:"node_id"`
Endpoint string `json:"endpoint,omitempty"`
Region string `json:"region,omitempty"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
@@ -57,6 +59,7 @@ type PeerRecoveryCandidate struct {
type peerRecoveryCandidateBuild struct {
PeerRecoveryCandidate
PublicIngressCount int
}
func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
@@ -96,6 +99,7 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
ready := 0
degraded := 0
backoff := 0
readyExternalRegions := map[string]struct{}{}
for nodeID, connection := range connectionByNode {
entry, ok := entryByNode[nodeID]
if !ok || strings.TrimSpace(entry.Endpoint) == "" {
@@ -104,6 +108,10 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
switch connection.State {
case PeerConnectionReady:
ready++
region := strings.TrimSpace(entry.BestRegion)
if region != "" && (strings.TrimSpace(cfg.PreferredRegion) == "" || !strings.EqualFold(region, cfg.PreferredRegion)) {
readyExternalRegions[strings.ToLower(region)] = struct{}{}
}
case PeerConnectionRelayReady:
// Relay-ready peers remain valuable for control-plane reachability,
// but they do not satisfy the target for direct-ready transport paths.
@@ -125,6 +133,7 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
if mode == PeerRecoveryModeSteady {
limit = target
}
missingExternalRegions := missingPeerRecoveryExternalRegions(cfg.PeerCache, cfg.PreferredRegion, readyExternalRegions, target)
candidates := make([]peerRecoveryCandidateBuild, 0, len(cfg.PeerCache.Entries))
for _, entry := range cfg.PeerCache.Entries {
@@ -138,13 +147,14 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
if connection.State == PeerConnectionBackoff && connection.BackoffUntil.After(now) {
continue
}
reason, ok := peerRecoveryCandidateReason(mode, entry, connection)
reason, ok := peerRecoveryCandidateReason(mode, entry, connection, missingExternalRegions, cfg.PreferredRegion)
if !ok {
continue
}
candidate := PeerRecoveryCandidate{
NodeID: entry.NodeID,
Endpoint: strings.TrimSpace(entry.Endpoint),
Region: strings.TrimSpace(entry.BestRegion),
Warm: entry.Warm,
WarmReason: entry.WarmReason,
RecoverySeed: entry.RecoverySeed,
@@ -155,9 +165,12 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
LastLatencyMs: connection.LastLatencyMs,
BackoffUntil: connection.BackoffUntil,
Reason: reason,
Priority: peerRecoveryCandidatePriority(entry, connection, reason),
Priority: peerRecoveryCandidatePriority(entry, connection, reason, cfg.PreferredRegion),
}
candidates = append(candidates, peerRecoveryCandidateBuild{PeerRecoveryCandidate: candidate})
candidates = append(candidates, peerRecoveryCandidateBuild{
PeerRecoveryCandidate: candidate,
PublicIngressCount: entry.PublicIngressCount,
})
}
sort.SliceStable(candidates, func(i, j int) bool {
if candidates[i].Priority != candidates[j].Priority {
@@ -166,7 +179,7 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
return candidates[i].NodeID < candidates[j].NodeID
})
if len(candidates) > limit {
candidates = candidates[:limit]
candidates = trimPeerRecoveryCandidates(candidates, limit, cfg.PreferredRegion)
}
outCandidates := make([]PeerRecoveryCandidate, 0, len(candidates))
@@ -194,11 +207,143 @@ func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
}
}
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState) (string, bool) {
// missingPeerRecoveryExternalRegions returns the lower-cased external regions
// present in snapshot that are absent from readyExternalRegions, or nil when
// nothing needs recovering.
//
// A region is external when it is non-empty and, if preferredRegion is set,
// does not equal it case-insensitively. The number of external regions wanted
// is the number available, capped at 2 and, when target is positive, at
// target. If readyExternalRegions already holds at least that many entries
// the result is nil; otherwise every available region not yet ready is
// returned. Keys of readyExternalRegions are looked up as lower-cased region
// names.
func missingPeerRecoveryExternalRegions(snapshot PeerCacheSnapshot, preferredRegion string, readyExternalRegions map[string]struct{}, target int) map[string]struct{} {
	home := strings.TrimSpace(preferredRegion)

	known := make(map[string]struct{})
	for idx := range snapshot.Entries {
		name := strings.TrimSpace(snapshot.Entries[idx].BestRegion)
		if name == "" || (home != "" && strings.EqualFold(name, home)) {
			continue
		}
		known[strings.ToLower(name)] = struct{}{}
	}
	if len(known) == 0 {
		return nil
	}

	// Aim for at most two external regions, never more than the overall target.
	wanted := len(known)
	if wanted > 2 {
		wanted = 2
	}
	if target > 0 && wanted > target {
		wanted = target
	}
	if len(readyExternalRegions) >= wanted {
		return nil
	}

	// Allocate lazily so an empty result stays nil.
	var gaps map[string]struct{}
	for name := range known {
		if _, ready := readyExternalRegions[name]; ready {
			continue
		}
		if gaps == nil {
			gaps = make(map[string]struct{})
		}
		gaps[name] = struct{}{}
	}
	return gaps
}
// trimPeerRecoveryCandidates reduces candidates to at most limit entries while
// keeping region diversity. The input is returned untouched when limit is not
// positive or already satisfied.
//
// A candidate is external when its Region is non-empty and, if preferredRegion
// is set, differs from it case-insensitively. When fewer than two distinct
// external regions are present, the first limit candidates are returned as-is.
// Otherwise selection runs in three passes over the input order:
//  1. the first candidate of each distinct external region;
//  2. if room remains and nothing selected so far has public ingress, the
//     first not-yet-selected candidate that has it;
//  3. remaining not-yet-selected candidates until limit is reached.
//
// The result lists pass-1 picks first, so it is not necessarily in the input
// order.
func trimPeerRecoveryCandidates(candidates []peerRecoveryCandidateBuild, limit int, preferredRegion string) []peerRecoveryCandidateBuild {
	if limit <= 0 || len(candidates) <= limit {
		return candidates
	}
	preferredRegion = strings.TrimSpace(preferredRegion)

	// externalKey yields the lower-cased region of an external candidate.
	externalKey := func(c peerRecoveryCandidateBuild) (string, bool) {
		region := strings.TrimSpace(c.Region)
		if region == "" {
			return "", false
		}
		if preferredRegion != "" && strings.EqualFold(region, preferredRegion) {
			return "", false
		}
		return strings.ToLower(region), true
	}

	seenRegions := map[string]struct{}{}
	for idx := range candidates {
		if key, external := externalKey(candidates[idx]); external {
			seenRegions[key] = struct{}{}
		}
	}
	if len(seenRegions) < 2 {
		return candidates[:limit]
	}

	picked := make([]peerRecoveryCandidateBuild, 0, limit)
	pickedNodes := map[string]struct{}{}
	take := func(c peerRecoveryCandidateBuild) {
		picked = append(picked, c)
		pickedNodes[c.NodeID] = struct{}{}
	}
	alreadyTaken := func(c peerRecoveryCandidateBuild) bool {
		_, taken := pickedNodes[c.NodeID]
		return taken
	}

	// Pass 1: one representative per external region.
	pickedRegions := map[string]struct{}{}
	for _, c := range candidates {
		if len(picked) >= limit {
			break
		}
		key, external := externalKey(c)
		if !external {
			continue
		}
		if _, dup := pickedRegions[key]; dup {
			continue
		}
		take(c)
		pickedRegions[key] = struct{}{}
	}

	// Pass 2: guarantee a single public-ingress peer when none was picked.
	if len(picked) < limit && !selectedHasPublicIngress(picked) {
		for _, c := range candidates {
			if alreadyTaken(c) || candidatePublicIngressCount(c) <= 0 {
				continue
			}
			take(c)
			break
		}
	}

	// Pass 3: fill the remaining slots in input order.
	for _, c := range candidates {
		if len(picked) >= limit {
			break
		}
		if alreadyTaken(c) {
			continue
		}
		take(c)
	}
	return picked
}
// selectedHasPublicIngress reports whether at least one of candidates has a
// positive public ingress count, as given by candidatePublicIngressCount.
func selectedHasPublicIngress(candidates []peerRecoveryCandidateBuild) bool {
	for idx := range candidates {
		if candidatePublicIngressCount(candidates[idx]) >= 1 {
			return true
		}
	}
	return false
}
// candidatePublicIngressCount returns the PublicIngressCount recorded on the
// given recovery candidate build entry.
func candidatePublicIngressCount(candidate peerRecoveryCandidateBuild) int {
	count := candidate.PublicIngressCount
	return count
}
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState, missingExternalRegions map[string]struct{}, preferredRegion string) (string, bool) {
if mode == PeerRecoveryModeSteady {
if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady {
return "maintain_ready", true
}
region := strings.ToLower(strings.TrimSpace(entry.BestRegion))
if region != "" && len(missingExternalRegions) > 0 {
if _, ok := missingExternalRegions[region]; ok {
if preferredRegion == "" || !strings.EqualFold(strings.TrimSpace(entry.BestRegion), preferredRegion) {
if connection.State == PeerConnectionDegraded {
return "recover_external_area", true
}
if entry.Warm || entry.RecoverySeed || connection.State == PeerConnectionDisconnected || connection.State == PeerConnectionConnecting {
return "recover_external_area", true
}
}
}
}
return "", false
}
if connection.State == PeerConnectionReady || connection.State == PeerConnectionRelayReady {
@@ -216,7 +361,7 @@ func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection P
return "recover_peer", true
}
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string) int {
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string, preferredRegion string) int {
score := 0
if entry.Warm {
score += 1000
@@ -237,6 +382,17 @@ func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnecti
if entry.BestCandidateID != "" {
score += 150
}
if entry.PublicIngressCount > 0 {
score += entry.PublicIngressCount * 90
}
preferredRegion = strings.TrimSpace(preferredRegion)
entryRegion := strings.TrimSpace(entry.BestRegion)
switch {
case preferredRegion != "" && entryRegion != "" && !strings.EqualFold(entryRegion, preferredRegion):
score += 275
case preferredRegion != "" && entryRegion != "" && strings.EqualFold(entryRegion, preferredRegion):
score += 25
}
score += entry.BestCandidateScore / 10
switch connection.State {
case PeerConnectionReady, PeerConnectionRelayReady:
@@ -251,6 +407,8 @@ func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnecti
switch reason {
case "maintain_ready":
score += 500
case "recover_external_area":
score += 450
case "recover_degraded":
score += 300
case "recover_seed":
@@ -82,7 +82,7 @@ func TestPeerRecoveryPlanTreatsRelayReadyPeersAsRecoveryGap(t *testing.T) {
RendezvousLeaseID: "lease-1",
RelayNodeID: "node-r",
RelayEndpoint: "quic://relay:19443",
RelayControl: true,
RelayQUIC: true,
},
},
},
@@ -121,6 +121,129 @@ func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
}
}
func TestPeerRecoveryPlanPrefersExternalRegionsWhenTrimmingReadyPeers(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-home-a", Endpoint: "quic://node-home-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-b", Endpoint: "quic://node-home-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-usa", Endpoint: "quic://node-usa:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa"},
{NodeID: "node-ifcm", Endpoint: "quic://node-ifcm:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "ifcm"},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-home-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-ifcm", State: PeerConnectionReady, LastLatencyMs: 20},
}},
PreferredRegion: "home",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-usa", "maintain_ready") || !recoveryPlanHasCandidate(plan, "node-ifcm", "maintain_ready") {
t.Fatalf("expected external-region peers to be retained: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanPrefersPublicIngressAtSameRegion(t *testing.T) {
now := time.Date(2026, 5, 18, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-home-private-a", Endpoint: "quic://10.0.0.2:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-private-b", Endpoint: "quic://10.0.0.3:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-public", Endpoint: "quic://94.141.118.222:19199", Warm: true, WarmReason: "route_adjacent", BestRegion: "home", PublicIngressCount: 1},
{NodeID: "node-usa", Endpoint: "quic://195.123.240.88:19131", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-home-private-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-private-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-public", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa", State: PeerConnectionReady, LastLatencyMs: 20},
}},
PreferredRegion: "home",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-home-public", "maintain_ready") {
t.Fatalf("expected public-ingress home peer to be retained: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanRetainsDistinctExternalRegionsWhenAvailable(t *testing.T) {
now := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-home-a", Endpoint: "quic://node-home-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-b", Endpoint: "quic://node-home-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-home-c", Endpoint: "quic://node-home-c:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home"},
{NodeID: "node-usa-a", Endpoint: "quic://node-usa-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
{NodeID: "node-usa-b", Endpoint: "quic://node-usa-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
{NodeID: "node-ifcm", Endpoint: "quic://node-ifcm:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "ifcm", PublicIngressCount: 1},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-home-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-home-c", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-usa-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-ifcm", State: PeerConnectionReady, LastLatencyMs: 20},
}},
PreferredRegion: "home",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-usa-a", "maintain_ready") && !recoveryPlanHasCandidate(plan, "node-usa-b", "maintain_ready") {
t.Fatalf("expected at least one usa candidate to be retained: %+v", plan.Candidates)
}
if !recoveryPlanHasCandidate(plan, "node-ifcm", "maintain_ready") {
t.Fatalf("expected ifcm candidate to be retained for area diversity: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanSteadyModeAddsMissingExternalAreaCandidate(t *testing.T) {
now := time.Date(2026, 5, 19, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-test-a", Endpoint: "quic://node-test-a:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "test"},
{NodeID: "node-test-b", Endpoint: "quic://node-test-b:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "test"},
{NodeID: "node-usa", Endpoint: "quic://node-usa:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "usa", PublicIngressCount: 1},
{NodeID: "node-home", Endpoint: "quic://node-home:19443", Warm: true, WarmReason: "route_adjacent", BestRegion: "home", PublicIngressCount: 1},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-test-a", State: PeerConnectionReady, LastLatencyMs: 10},
{NodeID: "node-test-b", State: PeerConnectionReady, LastLatencyMs: 10},
{NodeID: "node-usa", State: PeerConnectionReady, LastLatencyMs: 10},
{NodeID: "node-home", State: PeerConnectionDegraded, LastLatencyMs: 20},
}},
PreferredRegion: "test",
Now: now,
})
if len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("candidate count = %d, want %d", len(plan.Candidates), DefaultStablePeerTarget)
}
if !recoveryPlanHasCandidate(plan, "node-home", "recover_external_area") {
t.Fatalf("expected missing external area candidate to be retained: %+v", plan.Candidates)
}
}
func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry {
return PeerCacheEntry{
NodeID: nodeID,
@@ -280,6 +280,9 @@ func (t *QUICProductionForwardTransport) sendProductionOnSession(ctx context.Con
return fabricproto.Frame{}, 0, ErrForwardPeerUnavailable
}
if err != nil {
if frame, ok := drainProductionResponseFrame(session, sequence); ok {
return frame, time.Since(started).Milliseconds(), nil
}
return fabricproto.Frame{}, 0, err
}
case frame, ok := <-session.Frames():
@@ -294,6 +297,25 @@ func (t *QUICProductionForwardTransport) sendProductionOnSession(ctx context.Con
}
}
// drainProductionResponseFrame makes a non-blocking pass over the frames that
// are already available on session.Frames(), looking for the data frame on
// ProductionForwardQUICStreamID whose Sequence equals sequence.
//
// It returns (frame, true) for the first match. It returns a zero frame and
// false when session is nil, when the frames channel is closed, or when no
// further frame is immediately available. Frames received during the pass that
// do not match are consumed and not returned.
func drainProductionResponseFrame(session FabricTransportSession, sequence uint64) (fabricproto.Frame, bool) {
	var none fabricproto.Frame
	if session == nil {
		return none, false
	}
	for {
		select {
		case candidate, open := <-session.Frames():
			if !open {
				return none, false
			}
			// Skip anything that is not the awaited production-forward response.
			if candidate.Type != fabricproto.FrameData {
				continue
			}
			if candidate.StreamID != ProductionForwardQUICStreamID || candidate.Sequence != sequence {
				continue
			}
			return candidate, true
		default:
			// Nothing is queued right now; never block waiting for more.
			return none, false
		}
	}
}
func decodeQUICProductionForwardResponse(payload []byte) (ProductionForwardResult, error) {
var response quicProductionForwardResponse
if err := json.Unmarshal(payload, &response); err != nil {
@@ -283,12 +283,28 @@ func (r *FabricRegistry) ResolveService(req FabricRegistryResolveRequest) Fabric
return FabricRegistryResolvedService{Found: false, Reason: "service_required"}
}
scopeOrder := fabricRegistryScopeResolutionOrder(req.Scope, req.OrganizationID)
if resolved := r.resolveServiceFromRecords(req, service, scopeOrder, false); resolved.Found || resolved.Reason == "no_usable_endpoints" {
return resolved
}
if resolved := r.resolveServiceFromRecords(req, service, scopeOrder, true); resolved.Found || resolved.Reason == "no_usable_endpoints" {
return resolved
}
return FabricRegistryResolvedService{Found: false, Service: service, Reason: "no_active_record"}
}
func (r *FabricRegistry) resolveServiceFromRecords(req FabricRegistryResolveRequest, service string, scopeOrder []string, candidateOnly bool) FabricRegistryResolvedService {
for _, scope := range scopeOrder {
organizationID := strings.TrimSpace(req.OrganizationID)
if scope != FabricRegistryScopeOrganization {
organizationID = ""
}
record, ok := r.Active(req.ClusterID, service, scope, organizationID, req.Now)
var record FabricRegistryGossipRecord
var ok bool
if candidateOnly {
record, ok = r.Candidate(req.ClusterID, service, scope, organizationID, req.Now)
} else {
record, ok = r.Active(req.ClusterID, service, scope, organizationID, req.Now)
}
if !ok {
continue
}
@@ -306,9 +322,28 @@ func (r *FabricRegistry) ResolveService(req FabricRegistryResolveRequest) Fabric
RecordEpoch: record.Epoch,
RecordHash: hex.EncodeToString(sum[:]),
Endpoints: endpoints,
Reason: fabricRegistryResolveReason(candidateOnly),
}
}
return FabricRegistryResolvedService{Found: false, Service: service, Reason: "no_active_record"}
return FabricRegistryResolvedService{Found: false, Service: service}
}
// Candidate looks up the candidate entry stored under the key that
// fabricRegistryKey derives from clusterID, service, scope and organizationID.
//
// It returns the entry's record and true only when the entry exists, its State
// is FabricRegistryCandidate, and its ExpiresAt is strictly after
// registryNow(now). In every other case, including a nil receiver, it returns
// a zero record and false.
func (r *FabricRegistry) Candidate(clusterID, service, scope, organizationID string, now time.Time) (FabricRegistryGossipRecord, bool) {
	if r == nil {
		return FabricRegistryGossipRecord{}, false
	}
	key := fabricRegistryKey(clusterID, service, scope, organizationID)
	entry, found := r.candidates[key]
	if !found || entry.State != FabricRegistryCandidate {
		return FabricRegistryGossipRecord{}, false
	}
	// A record expiring exactly at the reference instant counts as expired.
	if !entry.Record.ExpiresAt.After(registryNow(now)) {
		return FabricRegistryGossipRecord{}, false
	}
	return entry.Record, true
}
// fabricRegistryResolveReason returns the reason string for a resolved
// service: "candidate_record_pending_live_verification" when candidateOnly is
// true, and the empty string otherwise.
func fabricRegistryResolveReason(candidateOnly bool) string {
	reason := ""
	if candidateOnly {
		reason = "candidate_record_pending_live_verification"
	}
	return reason
}
func (r *FabricRegistry) Snapshot(now time.Time) FabricRegistrySnapshot {
@@ -507,7 +542,7 @@ func validateFabricRegistryGossipRecord(record FabricRegistryGossipRecord, polic
if strings.TrimSpace(endpoint.EndpointID) == "" || strings.TrimSpace(endpoint.Address) == "" || strings.TrimSpace(endpoint.Transport) == "" {
return fmt.Errorf("fabric registry gossip record contains invalid endpoint")
}
if !isQUICOnlyCandidateTransport(endpoint.Transport) || hasLegacyEndpointScheme(endpoint.Address) {
if !isQUICOnlyCandidateTransport(endpoint.Transport) || hasUnsupportedEndpointScheme(endpoint.Address) {
return fmt.Errorf("fabric registry gossip endpoint must be QUIC-only")
}
if len(endpoint.Metadata) > 0 && !json.Valid(endpoint.Metadata) {
@@ -605,7 +640,7 @@ func selectFabricRegistryEndpoints(endpoints []FabricRegistryEndpoint, preferred
preferredRegion = strings.TrimSpace(preferredRegion)
out := make([]FabricRegistryEndpoint, 0, len(endpoints))
for _, endpoint := range endpoints {
if strings.TrimSpace(endpoint.Address) == "" || !isQUICOnlyCandidateTransport(endpoint.Transport) || hasLegacyEndpointScheme(endpoint.Address) {
if strings.TrimSpace(endpoint.Address) == "" || !isQUICOnlyCandidateTransport(endpoint.Transport) || hasUnsupportedEndpointScheme(endpoint.Address) {
continue
}
out = append(out, endpoint)
@@ -636,16 +671,10 @@ func probeFabricRegistryEndpoint(ctx context.Context, transport FabricTransport,
if timeout <= 0 {
timeout = 2 * time.Second
}
target := FabricTransportTarget{
EndpointID: endpoint.EndpointID,
PeerID: endpoint.EndpointID,
Endpoint: endpoint.Address,
Transport: endpoint.Transport,
PeerCertSHA256: endpoint.PeerCertSHA256,
Timeout: timeout,
InboundBuffer: 2,
ErrorBuffer: 2,
}
target := FabricTransportTargetFromRegistryEndpoint(endpoint)
target.Timeout = timeout
target.InboundBuffer = 2
target.ErrorBuffer = 2
startedAt := time.Now()
session, err := transport.Connect(ctx, target)
if err != nil {
@@ -45,7 +45,7 @@ func TestFabricRegistryGossipRecordRequiresTrustedSignature(t *testing.T) {
}
}
func TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord(t *testing.T) {
func TestFabricRegistryRejectsDisallowedEndpointAndExpiredRecord(t *testing.T) {
now := time.Date(2026, 5, 18, 10, 0, 0, 0, time.UTC)
publicKey, privateKey, err := ed25519.GenerateKey(nil)
if err != nil {
@@ -65,7 +65,7 @@ func TestFabricRegistryRejectsLegacyEndpointAndExpiredRecord(t *testing.T) {
},
Now: now,
}); err == nil {
t.Fatal("legacy HTTP endpoint was accepted")
t.Fatal("compat HTTP endpoint was accepted")
}
expired := testFabricRegistryGossipRecord(now.Add(-2*time.Hour), 11)
expired.ExpiresAt = now.Add(-time.Minute)
@@ -523,7 +523,7 @@ func (s *RemoteWorkspaceFrameProbeSink) AcceptRemoteWorkspaceFrameBatchProbe(_ c
AckedFrames: acceptedFrames,
Backpressure: false,
DropPolicy: "drop_droppable_overflow_ack_accepted",
DeliverySequence: s.sequence,
DeliverySequence: uint64(s.sequence),
DeliveredAt: now.Format(time.RFC3339Nano),
}
s.last = receipt
@@ -695,6 +695,24 @@ func isValidRemoteWorkspaceAdapterSessionID(adapterSessionID string) bool {
return true
}
// isValidRemoteWorkspaceAdapterMailboxConsumerID reports whether consumerID,
// after surrounding whitespace is trimmed, is a well-formed mailbox consumer
// identifier: between 1 and 128 bytes long and made up only of ASCII letters,
// ASCII digits, and the characters '-', '_', '.' and ':'.
func isValidRemoteWorkspaceAdapterMailboxConsumerID(consumerID string) bool {
	trimmed := strings.TrimSpace(consumerID)
	if n := len(trimmed); n == 0 || n > 128 {
		return false
	}
	// disallowed is true for any rune outside the permitted alphabet,
	// including the replacement rune produced by invalid UTF-8.
	disallowed := func(r rune) bool {
		if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') {
			return false
		}
		return !strings.ContainsRune("-_.:", r)
	}
	return strings.IndexFunc(trimmed, disallowed) < 0
}
func actionToAdapterSessionState(action string) string {
switch action {
case "expire":
@@ -106,7 +106,7 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
if strings.TrimSpace(nodeID) == "" || strings.TrimSpace(endpoint) == "" {
return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint")
}
if hasLegacyEndpointScheme(endpoint) {
if hasUnsupportedEndpointScheme(endpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint")
}
}
@@ -124,7 +124,7 @@ func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
strings.TrimSpace(candidate.ConnectivityMode) == "" {
return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate")
}
if !isQUICOnlyCandidateTransport(candidate.Transport) || hasLegacyEndpointScheme(candidate.Address) {
if !isQUICOnlyCandidateTransport(candidate.Transport) || hasUnsupportedEndpointScheme(candidate.Address) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC peer endpoint candidate")
}
}
@@ -185,12 +185,12 @@ func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) err
return nil
}
func hasLegacyEndpointScheme(endpoint string) bool {
func hasUnsupportedEndpointScheme(endpoint string) bool {
endpoint = strings.ToLower(strings.TrimSpace(endpoint))
return strings.HasPrefix(endpoint, "http://") ||
strings.HasPrefix(endpoint, "https://") ||
strings.HasPrefix(endpoint, "ws://") ||
strings.HasPrefix(endpoint, "wss://")
if endpoint == "" || !strings.Contains(endpoint, "://") {
return false
}
return !strings.HasPrefix(endpoint, "quic://")
}
func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
@@ -205,7 +205,7 @@ func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
strings.TrimSpace(seed.Transport) == "" {
return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed")
}
if !isQUICOnlyCandidateTransport(seed.Transport) || hasLegacyEndpointScheme(seed.Endpoint) {
if !isQUICOnlyCandidateTransport(seed.Transport) || hasUnsupportedEndpointScheme(seed.Endpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC recovery seed")
}
if _, duplicate := seen[key]; duplicate {
@@ -241,7 +241,7 @@ func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRo
(len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)) {
return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease")
}
if !isQUICOnlyCandidateTransport(lease.Transport) || hasLegacyEndpointScheme(lease.RelayEndpoint) {
if !isQUICOnlyCandidateTransport(lease.Transport) || hasUnsupportedEndpointScheme(lease.RelayEndpoint) {
return fmt.Errorf("scoped synthetic mesh config contains non-QUIC rendezvous lease")
}
if _, duplicate := seen[lease.LeaseID]; duplicate {
@@ -174,7 +174,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpoint(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedPeerEndpoint(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -189,7 +189,7 @@ func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpoint(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateTransport(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedPeerEndpointCandidateTransport(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -215,7 +215,7 @@ func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateTransport(t
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyPeerEndpointCandidateScheme(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedPeerEndpointCandidateScheme(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -295,7 +295,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyRecoverySeed(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedRecoverySeed(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
@@ -337,7 +337,7 @@ func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
}
}
func TestLoadScopedSyntheticConfigRejectsLegacyRendezvousLease(t *testing.T) {
func TestLoadScopedSyntheticConfigRejectsDisallowedRendezvousLease(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17z12.synthetic.v1",
ClusterID: "cluster-1",
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -12,6 +12,21 @@ import (
type VPNPacketBatchPayload struct {
SchemaVersion string `json:"schema_version"`
VPNConnectionID string `json:"vpn_connection_id"`
TunnelID string `json:"tunnel_id,omitempty"`
PoolID string `json:"pool_id,omitempty"`
ServiceID string `json:"service_id,omitempty"`
LocalServiceID string `json:"local_service_id,omitempty"`
RemoteServiceID string `json:"remote_service_id,omitempty"`
ServiceKind string `json:"service_kind,omitempty"`
ServiceClass string `json:"service_class,omitempty"`
ServiceRole string `json:"service_role,omitempty"`
RouteLeaseID string `json:"route_lease_id,omitempty"`
RouteGeneration string `json:"route_generation,omitempty"`
DataPlane string `json:"data_plane,omitempty"`
TransportOwner string `json:"transport_owner,omitempty"`
RouteVisibility string `json:"route_visibility,omitempty"`
TrafficClasses []string `json:"traffic_classes,omitempty"`
StreamShards int `json:"stream_shards,omitempty"`
Direction string `json:"direction"`
Packets [][]byte `json:"packets"`
SentAt time.Time `json:"sent_at"`