Initial project snapshot

This commit is contained in:
2026-04-28 22:29:50 +03:00
commit 8ba0561f4f
365 changed files with 91832 additions and 0 deletions
@@ -0,0 +1,111 @@
package mesh
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"time"
)
// Client sends mesh control messages to a single peer over HTTP. Each Send
// method POSTs JSON to a fixed path appended to BaseURL.
type Client struct {
// BaseURL is prepended verbatim to the endpoint path (for example
// "/mesh/v1/health"), so it should not end with a trailing slash.
BaseURL string
// HTTPClient performs the requests. When nil, the Send methods fall back to
// http.DefaultClient.
HTTPClient *http.Client
}
// NewClient returns a Client for the peer at baseURL. The embedded HTTP client
// applies a five-second timeout to every request.
func NewClient(baseURL string) Client {
	httpClient := &http.Client{Timeout: 5 * time.Second}

	var client Client
	client.BaseURL = baseURL
	client.HTTPClient = httpClient
	return client
}
// SendHealth POSTs message as JSON to the peer's /mesh/v1/health endpoint and
// decodes the JSON response body into a HealthAck.
//
// A response outside the 2xx range yields an error naming the status code.
// Failures while encoding, building, sending, or decoding are wrapped with the
// step that failed; the underlying error stays reachable through errors.Is and
// errors.As. On any error the returned HealthAck is the zero value.
func (c Client) SendHealth(ctx context.Context, message HealthMessage) (HealthAck, error) {
	payload, err := json.Marshal(message)
	if err != nil {
		return HealthAck{}, fmt.Errorf("encoding mesh health message: %w", err)
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/health", bytes.NewReader(payload))
	if err != nil {
		return HealthAck{}, fmt.Errorf("building mesh health request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	// A Client that was not built by NewClient may carry no HTTP client.
	httpClient := c.HTTPClient
	if httpClient == nil {
		httpClient = http.DefaultClient
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		return HealthAck{}, fmt.Errorf("sending mesh health message: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return HealthAck{}, fmt.Errorf("mesh health rejected with status %d", resp.StatusCode)
	}
	var ack HealthAck
	if err := json.NewDecoder(resp.Body).Decode(&ack); err != nil {
		return HealthAck{}, fmt.Errorf("decoding mesh health ack: %w", err)
	}
	return ack, nil
}
// SendSynthetic POSTs envelope as JSON to the peer's /mesh/v1/synthetic/probe
// endpoint and decodes the JSON response body into a SyntheticEnvelope.
//
// A response outside the 2xx range yields an error naming the status code.
// Failures while encoding, building, sending, or decoding are wrapped with the
// step that failed; the underlying error stays reachable through errors.Is and
// errors.As. On any error the returned envelope is the zero value.
func (c Client) SendSynthetic(ctx context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	payload, err := json.Marshal(envelope)
	if err != nil {
		return SyntheticEnvelope{}, fmt.Errorf("encoding mesh synthetic envelope: %w", err)
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/synthetic/probe", bytes.NewReader(payload))
	if err != nil {
		return SyntheticEnvelope{}, fmt.Errorf("building mesh synthetic probe request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	// A Client that was not built by NewClient may carry no HTTP client.
	httpClient := c.HTTPClient
	if httpClient == nil {
		httpClient = http.DefaultClient
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		return SyntheticEnvelope{}, fmt.Errorf("sending mesh synthetic probe: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return SyntheticEnvelope{}, fmt.Errorf("mesh synthetic probe rejected with status %d", resp.StatusCode)
	}
	var ack SyntheticEnvelope
	if err := json.NewDecoder(resp.Body).Decode(&ack); err != nil {
		return SyntheticEnvelope{}, fmt.Errorf("decoding mesh synthetic probe response: %w", err)
	}
	return ack, nil
}
// SendProduction POSTs envelope as JSON to the peer's /mesh/v1/forward
// endpoint and decodes the JSON response body into a ProductionForwardResult.
//
// A response outside the 2xx range yields an error naming the status code.
// Failures while encoding, building, sending, or decoding are wrapped with the
// step that failed; the underlying error stays reachable through errors.Is and
// errors.As. On any error the returned result is the zero value.
func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	payload, err := json.Marshal(envelope)
	if err != nil {
		return ProductionForwardResult{}, fmt.Errorf("encoding mesh production envelope: %w", err)
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.BaseURL+"/mesh/v1/forward", bytes.NewReader(payload))
	if err != nil {
		return ProductionForwardResult{}, fmt.Errorf("building mesh production forward request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")

	// A Client that was not built by NewClient may carry no HTTP client.
	httpClient := c.HTTPClient
	if httpClient == nil {
		httpClient = http.DefaultClient
	}
	resp, err := httpClient.Do(req)
	if err != nil {
		return ProductionForwardResult{}, fmt.Errorf("sending mesh production envelope: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return ProductionForwardResult{}, fmt.Errorf("mesh production forward rejected with status %d", resp.StatusCode)
	}
	var result ProductionForwardResult
	if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
		return ProductionForwardResult{}, fmt.Errorf("decoding mesh production forward result: %w", err)
	}
	return result, nil
}
@@ -0,0 +1,288 @@
package mesh
import (
"encoding/json"
"errors"
"time"
)
// ProtocolVersion names the mesh control protocol version.
// NOTE(review): presumably the value carried in the protocol_version message
// fields — confirm against the senders.
const ProtocolVersion = "mesh-control-v1"
// Sentinel errors of the mesh package. They are plain errors.New values, so
// match them with errors.Is rather than by message text.
var (
// Peer identity validation (see ValidatePeer).
ErrClusterMismatch = errors.New("mesh peer cluster mismatch")
ErrNodeMismatch = errors.New("mesh peer node mismatch")
// Production forwarding.
ErrForwardDisabled = errors.New("production payload forwarding is disabled by mesh production gate")
ErrForwardRuntimeUnavailable = errors.New("production mesh forwarding runtime is unavailable for this route or stage")
ErrForwardPeerUnavailable = errors.New("production mesh next peer is unavailable")
ErrForwardEnvelopeInvalid = errors.New("production mesh envelope is invalid")
ErrForwardObservationFailed = errors.New("production mesh envelope observation failed")
// Synthetic runtime, routing, relay queue, and test service.
ErrMeshRuntimeDisabled = errors.New("mesh synthetic runtime is disabled")
ErrUnsupportedSyntheticMessage = errors.New("unsupported synthetic mesh message")
ErrRouteIDRequired = errors.New("mesh synthetic route id is required")
ErrRouteNotFound = errors.New("mesh synthetic route not found")
ErrInvalidRoutePath = errors.New("mesh synthetic route path is invalid")
ErrRouteExpired = errors.New("mesh synthetic route is expired")
ErrTTLExhausted = errors.New("mesh synthetic route ttl exhausted")
ErrLoopDetected = errors.New("mesh synthetic route loop detected")
ErrUnauthorizedChannel = errors.New("mesh synthetic channel is not authorized")
ErrSyntheticPeerUnavailable = errors.New("mesh synthetic next peer is unavailable")
ErrNoHealthySyntheticRoute = errors.New("mesh synthetic no healthy route available")
ErrSyntheticRelayQueueFull = errors.New("mesh synthetic relay queue is full")
ErrSyntheticRelayQueueEmpty = errors.New("mesh synthetic relay queue is empty")
ErrSyntheticPayloadTooLarge = errors.New("mesh synthetic payload is too large")
ErrSyntheticOrganizationMismatch = errors.New("mesh synthetic organization mismatch")
ErrUnsupportedSyntheticService = errors.New("unsupported synthetic test service")
ErrSyntheticRequestInvalid = errors.New("mesh synthetic request is invalid")
)
// Wire-level names and limits for synthetic and production mesh traffic.
const (
// Synthetic message type names.
SyntheticMessageProbe = "fabric.probe"
SyntheticMessageProbeAck = "fabric.probe_ack"
SyntheticMessageRouteHealth = "fabric.route_health"
SyntheticMessageRouteHealthAck = "fabric.route_health_ack"
SyntheticMessageTelemetry = "fabric.telemetry"
SyntheticMessageTestService = "fabric.test_service"
SyntheticMessageTestServiceAck = "fabric.test_service_ack"
// Synthetic test service name and defaults.
SyntheticTestServiceType = "synthetic.echo"
SyntheticDefaultTestOrganizationID = "org-test"
SyntheticDefaultMaxTestPayloadBytes = 4096
// Synthetic channel names.
SyntheticChannelFabricControl = "fabric_control"
SyntheticChannelRouteControl = "route_control"
SyntheticChannelTelemetry = "telemetry"
// Synthetic route health states.
SyntheticRouteStateUnknown = "unknown"
SyntheticRouteStateHealthy = "healthy"
SyntheticRouteStateDegraded = "degraded"
SyntheticRouteStateFailed = "failed"
// Production channel and message names; the channel value equals
// SyntheticChannelFabricControl.
ProductionChannelFabricControl = "fabric_control"
ProductionMessageFabricControl = "fabric.control"
// Production envelope limits: a byte count and a time.Duration.
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionEnvelopeFutureSkew = time.Minute
)
// PeerIdentity names a mesh node by the cluster it belongs to and its node ID.
type PeerIdentity struct {
ClusterID string `json:"cluster_id"`
NodeID string `json:"node_id"`
}
// SyntheticRoute describes a route for synthetic mesh traffic: its endpoints,
// hop list, permitted channels, expiry, TTL and hop limits, and optional
// version labels.
type SyntheticRoute struct {
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
// NOTE(review): Hops looks like the ordered node IDs from source to
// destination — confirm against the route validation code.
Hops []string `json:"hops"`
AllowedChannels []string `json:"allowed_channels"`
ExpiresAt time.Time `json:"expires_at"`
MaxTTL int `json:"max_ttl"`
MaxHops int `json:"max_hops"`
// Version labels; each is omitted from JSON when empty.
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticEnvelope is the JSON message used for synthetic mesh traffic. It
// carries route and peer addressing, channel and message type, TTL and hop
// bookkeeping, and an opaque payload.
type SyntheticEnvelope struct {
ProtocolVersion string `json:"protocol_version"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
From PeerIdentity `json:"from"`
To PeerIdentity `json:"to"`
Channel string `json:"channel"`
MessageType string `json:"message_type"`
TTL int `json:"ttl"`
HopCount int `json:"hop_count"`
Visited []string `json:"visited"`
Sequence uint64 `json:"sequence"`
SentAt time.Time `json:"sent_at"`
// Payload is kept as raw JSON and omitted when empty.
// NOTE(review): its shape presumably depends on MessageType — confirm.
Payload json.RawMessage `json:"payload,omitempty"`
}
// SyntheticProbePayload identifies a probe and records when it was sent.
// NOTE(review): presumably the payload of a SyntheticMessageProbe envelope —
// confirm against the runtime.
type SyntheticProbePayload struct {
ProbeID string `json:"probe_id"`
SentAt time.Time `json:"sent_at"`
}
// SyntheticProbeAckPayload reports the probe being acknowledged, a list of
// node IDs in Path, and the acceptance time.
// NOTE(review): presumably the payload of a SyntheticMessageProbeAck envelope,
// with Path holding the nodes the probe traversed — confirm.
type SyntheticProbeAckPayload struct {
ProbeID string `json:"probe_id"`
Path []string `json:"path"`
AcceptedAt time.Time `json:"accepted_at"`
}
// SyntheticRouteObservation summarizes the observed health of one route: its
// state, success and failure counters, the most recent success and failure,
// the last latency, and the version labels in effect.
type SyntheticRouteObservation struct {
RouteID string `json:"route_id"`
State string `json:"state"`
// NOTE(review): encoding/json's omitempty never treats a time.Time (a struct)
// as empty, so zero LastSuccessAt/LastFailureAt values are still serialized.
LastSuccessAt time.Time `json:"last_success_at,omitempty"`
LastFailureAt time.Time `json:"last_failure_at,omitempty"`
LastFailureReason string `json:"last_failure_reason,omitempty"`
SuccessCount uint64 `json:"success_count"`
FailureCount uint64 `json:"failure_count"`
// LastLatencyMs is omitted from JSON when zero.
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticRouteHealthResult bundles the outcome of a route health check: the
// route that was requested, the route that was selected, whether a fallback
// was used, the acknowledgement envelope, and the route observation.
type SyntheticRouteHealthResult struct {
RequestedRouteID string `json:"requested_route_id"`
SelectedRouteID string `json:"selected_route_id"`
FallbackUsed bool `json:"fallback_used"`
Ack SyntheticEnvelope `json:"ack"`
Observation SyntheticRouteObservation `json:"observation"`
}
// SyntheticTestServiceRequest is a request to a synthetic test service,
// carrying the caller's organization, the service type, and a string payload.
type SyntheticTestServiceRequest struct {
RequestID string `json:"request_id"`
OrganizationID string `json:"organization_id"`
ServiceType string `json:"service_type"`
Payload string `json:"payload"`
SentAt time.Time `json:"sent_at"`
}
// SyntheticTestServiceResponse is the reply to a SyntheticTestServiceRequest.
// It repeats the request, organization, and service identifiers and adds an
// echo payload, a list of node IDs in Path, and the acceptance time.
type SyntheticTestServiceResponse struct {
RequestID string `json:"request_id"`
OrganizationID string `json:"organization_id"`
ServiceType string `json:"service_type"`
EchoPayload string `json:"echo_payload"`
Path []string `json:"path"`
AcceptedAt time.Time `json:"accepted_at"`
}
// SyntheticTestServiceResult bundles the outcome of a synthetic test service
// call: requested and selected route IDs, whether a fallback was used, the
// acknowledgement envelope, the decoded response, and the route observation.
type SyntheticTestServiceResult struct {
RequestedRouteID string `json:"requested_route_id"`
SelectedRouteID string `json:"selected_route_id"`
FallbackUsed bool `json:"fallback_used"`
Ack SyntheticEnvelope `json:"ack"`
Response SyntheticTestServiceResponse `json:"response"`
Observation SyntheticRouteObservation `json:"observation"`
}
// SyntheticRouteCacheVersion groups the route, policy, and peer directory
// version labels. Each label is omitted from JSON when empty.
type SyntheticRouteCacheVersion struct {
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticRelayQueuePolicy configures the relay queue of one channel: its
// capacity and whether its entries may be dropped.
// NOTE(review): the exact drop behavior is implemented elsewhere — confirm
// there before relying on Droppable.
type SyntheticRelayQueuePolicy struct {
Channel string `json:"channel"`
Capacity int `json:"capacity"`
Droppable bool `json:"droppable"`
}
// SyntheticRelayEnqueueResult reports the outcome of enqueueing on a relay
// queue: the channel, queue depth and capacity, the accepted sequence number,
// and whether an entry was dropped.
type SyntheticRelayEnqueueResult struct {
Channel string `json:"channel"`
QueueDepth int `json:"queue_depth"`
QueueCapacity int `json:"queue_capacity"`
Dropped bool `json:"dropped"`
// DroppedSequence is omitted from JSON when zero.
DroppedSequence uint64 `json:"dropped_sequence,omitempty"`
AcceptedSequence uint64 `json:"accepted_sequence"`
}
// SyntheticRelayQueueMetrics holds relay queue counters, the most recent
// rejection reason, and a map of queue depths.
type SyntheticRelayQueueMetrics struct {
Enqueued uint64 `json:"enqueued"`
Dequeued uint64 `json:"dequeued"`
Dropped uint64 `json:"dropped"`
Rejected uint64 `json:"rejected"`
LastRejectReason string `json:"last_reject_reason,omitempty"`
// NOTE(review): QueueDepths is presumably keyed by channel name — confirm.
QueueDepths map[string]int `json:"queue_depths"`
}
// HealthMessage is the JSON body Client.SendHealth posts to a peer. It reports
// a link status observed between the From and To peers at ObservedAt.
type HealthMessage struct {
ProtocolVersion string `json:"protocol_version"`
From PeerIdentity `json:"from"`
To PeerIdentity `json:"to"`
ObservedAt time.Time `json:"observed_at"`
LinkStatus string `json:"link_status"`
// Optional measurements; a nil pointer is omitted from JSON.
LatencyMs *int `json:"latency_ms,omitempty"`
QualityScore *int `json:"quality_score,omitempty"`
}
// HealthAck is the JSON response Client.SendHealth decodes: the protocol
// version, whether the health message was accepted, and the identity in By.
type HealthAck struct {
ProtocolVersion string `json:"protocol_version"`
Accepted bool `json:"accepted"`
By PeerIdentity `json:"by"`
}
// ProductionEnvelope is the JSON body Client.SendProduction posts to a peer's
// forward endpoint. It carries message and route identity, hop-by-hop
// addressing, channel and message type, TTL and hop bookkeeping, creation and
// expiry times, and a raw payload with its declared length and hash.
type ProductionEnvelope struct {
FabricProtocolVersion string `json:"fabric_protocol_version"`
MessageID string `json:"message_id"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
CurrentHopNodeID string `json:"current_hop_node_id"`
NextHopNodeID string `json:"next_hop_node_id"`
RoutePath []string `json:"route_path,omitempty"`
VisitedNodeIDs []string `json:"visited_node_ids,omitempty"`
ChannelClass string `json:"channel_class"`
MessageType string `json:"message_type"`
TTL int `json:"ttl"`
HopCount int `json:"hop_count"`
CreatedAt time.Time `json:"created_at"`
ExpiresAt time.Time `json:"expires_at"`
// NOTE(review): PayloadLength and PayloadHash presumably describe Payload;
// the hash algorithm is not visible here — confirm in the validator.
PayloadLength int `json:"payload_length"`
PayloadHash string `json:"payload_hash"`
// Payload is kept as raw JSON and omitted when empty.
Payload json.RawMessage `json:"payload,omitempty"`
}
// ProductionEnvelopeObservation records metadata about a production envelope.
// It has the same fields as ProductionEnvelope except FabricProtocolVersion,
// CreatedAt, ExpiresAt, and Payload, and adds ObservedAt.
type ProductionEnvelopeObservation struct {
MessageID string `json:"message_id"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
CurrentHopNodeID string `json:"current_hop_node_id"`
NextHopNodeID string `json:"next_hop_node_id"`
RoutePath []string `json:"route_path,omitempty"`
VisitedNodeIDs []string `json:"visited_node_ids,omitempty"`
ChannelClass string `json:"channel_class"`
MessageType string `json:"message_type"`
TTL int `json:"ttl"`
HopCount int `json:"hop_count"`
PayloadLength int `json:"payload_length"`
PayloadHash string `json:"payload_hash"`
ObservedAt time.Time `json:"observed_at"`
}
// ProductionForwardResult is the JSON response Client.SendProduction decodes.
// It carries accepted, delivered, and forwarded flags, the identity in By, and
// the message and route IDs.
type ProductionForwardResult struct {
Accepted bool `json:"accepted"`
Delivered bool `json:"delivered"`
Forwarded bool `json:"forwarded"`
By PeerIdentity `json:"by"`
MessageID string `json:"message_id"`
RouteID string `json:"route_id"`
// NextNodeID is omitted from JSON when empty.
NextNodeID string `json:"next_node_id,omitempty"`
}
// ProductionForwardLogEntry is a structured log record for a production
// forwarding event. Only Event and OccurredAt are always serialized; every
// other field uses omitempty, so empty strings and zero integers (including a
// TTL or HopCount of 0) are left out of the JSON.
type ProductionForwardLogEntry struct {
Event string `json:"event"`
RouteID string `json:"route_id,omitempty"`
MessageID string `json:"message_id,omitempty"`
ClusterID string `json:"cluster_id,omitempty"`
LocalNodeID string `json:"local_node_id,omitempty"`
SourceNodeID string `json:"source_node_id,omitempty"`
DestinationNodeID string `json:"destination_node_id,omitempty"`
CurrentHopNodeID string `json:"current_hop_node_id,omitempty"`
NextHopNodeID string `json:"next_hop_node_id,omitempty"`
ChannelClass string `json:"channel_class,omitempty"`
MessageType string `json:"message_type,omitempty"`
Reason string `json:"reason,omitempty"`
StatusCode int `json:"status_code,omitempty"`
TTL int `json:"ttl,omitempty"`
HopCount int `json:"hop_count,omitempty"`
RoutePathLength int `json:"route_path_length,omitempty"`
VisitedCount int `json:"visited_count,omitempty"`
PayloadLength int `json:"payload_length,omitempty"`
OccurredAt time.Time `json:"occurred_at"`
}
// ValidatePeer checks that remote may talk to local. It returns
// ErrClusterMismatch unless both cluster IDs are non-empty and equal, then
// ErrNodeMismatch when remote has no node ID, and nil otherwise.
func ValidatePeer(local PeerIdentity, remote PeerIdentity) error {
	// Equality with a non-empty local ID implies the remote ID is non-empty too.
	sameCluster := local.ClusterID != "" && local.ClusterID == remote.ClusterID
	switch {
	case !sameCluster:
		return ErrClusterMismatch
	case remote.NodeID == "":
		return ErrNodeMismatch
	default:
		return nil
	}
}
@@ -0,0 +1,258 @@
package mesh
import (
"sort"
"strings"
"time"
)
// EndpointCandidateScoreOptions tunes how RankPeerEndpointCandidates scores
// candidates. Every field is optional; a zero value disables the adjustment
// that depends on it.
type EndpointCandidateScoreOptions struct {
// ChannelClass enables control-channel adjustments when it equals
// SyntheticChannelFabricControl or SyntheticChannelRouteControl.
ChannelClass string
// PreferredRegion is compared case-insensitively with a candidate's region.
PreferredRegion string
// Now is the reference time for the age checks; a zero value skips them.
Now time.Time
// MaxVerificationAge bounds how old a candidate's LastVerifiedAt may be to
// count as fresh; values <= 0 skip the check.
MaxVerificationAge time.Duration
// Observations holds health observations looked up by candidate EndpointID.
Observations map[string]EndpointCandidateHealthObservation
// MaxObservationAge bounds how old an observation may be before it is
// treated as stale; values <= 0 skip the check.
MaxObservationAge time.Duration
}
// EndpointCandidateHealthObservation is health data for one endpoint that is
// layered onto its score: last latency, success and failure counts, the last
// failure reason, a reliability score, and the observation time.
type EndpointCandidateHealthObservation struct {
EndpointID string `json:"endpoint_id"`
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
SuccessCount uint64 `json:"success_count,omitempty"`
FailureCount uint64 `json:"failure_count,omitempty"`
LastFailureReason string `json:"last_failure_reason,omitempty"`
// The scorer treats ReliabilityScore >= 90 as high and >= 70 as moderate,
// and ignores values <= 0.
ReliabilityScore int `json:"reliability_score,omitempty"`
// NOTE(review): encoding/json's omitempty never treats a time.Time (a struct)
// as empty, so a zero ObservedAt is still serialized.
ObservedAt time.Time `json:"observed_at,omitempty"`
}
// ScoredPeerEndpointCandidate pairs a candidate with its computed score and
// the reason labels of every adjustment that contributed to it.
type ScoredPeerEndpointCandidate struct {
Candidate PeerEndpointCandidate `json:"candidate"`
Score int `json:"score"`
Reasons []string `json:"reasons,omitempty"`
}
// RankPeerEndpointCandidates scores every candidate and returns them ordered
// best first. Ties are broken by ascending Priority, then NodeID, then
// EndpointID. An empty input yields nil.
func RankPeerEndpointCandidates(candidates []PeerEndpointCandidate, opts EndpointCandidateScoreOptions) []ScoredPeerEndpointCandidate {
	if len(candidates) == 0 {
		return nil
	}

	ranked := make([]ScoredPeerEndpointCandidate, len(candidates))
	for idx := range candidates {
		ranked[idx] = scorePeerEndpointCandidate(candidates[idx], opts)
	}

	sort.SliceStable(ranked, func(a, b int) bool {
		left, right := &ranked[a], &ranked[b]
		switch {
		case left.Score != right.Score:
			// Higher score wins.
			return left.Score > right.Score
		case left.Candidate.Priority != right.Candidate.Priority:
			return left.Candidate.Priority < right.Candidate.Priority
		case left.Candidate.NodeID != right.Candidate.NodeID:
			return left.Candidate.NodeID < right.Candidate.NodeID
		default:
			return left.Candidate.EndpointID < right.Candidate.EndpointID
		}
	})
	return ranked
}
// scorePeerEndpointCandidate computes the score of one candidate. Starting
// from a base of 100 it applies, in order, adjustments for transport,
// reachability, connectivity mode, NAT type, priority, region, policy tags,
// control-channel preference, verification freshness, and any health
// observation registered for the candidate's EndpointID. Each adjustment adds
// a label to the returned Reasons.
func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCandidateScoreOptions) ScoredPeerEndpointCandidate {
	total := 100
	why := []string{"base"}
	// adjust records one score change together with its reason label.
	adjust := func(delta int, reason string) {
		total += delta
		why = append(why, reason)
	}

	switch candidate.Transport {
	case "direct_tcp_tls":
		adjust(35, "transport:direct_tcp_tls")
	case "wss":
		adjust(25, "transport:wss")
	case "outbound_reverse":
		adjust(10, "transport:outbound_reverse")
	case "relay":
		adjust(5, "transport:relay")
	default:
		adjust(-100, "transport:unknown")
	}

	switch candidate.Reachability {
	case "public":
		adjust(30, "reachability:public")
	case "private":
		adjust(15, "reachability:private")
	case "relay":
		adjust(5, "reachability:relay")
	case "outbound_only":
		adjust(-5, "reachability:outbound_only")
	default:
		adjust(-15, "reachability:unknown")
	}

	switch candidate.ConnectivityMode {
	case "direct":
		adjust(30, "connectivity:direct")
	case "outbound_only":
		adjust(5, "connectivity:outbound_only")
	case "relay_required":
		adjust(-5, "connectivity:relay_required")
	default:
		adjust(-10, "connectivity:unknown")
	}

	switch candidate.NATType {
	case "", "none":
		// An unspecified NAT type is scored like no NAT.
		adjust(15, "nat:none")
	case "full_cone":
		adjust(10, "nat:full_cone")
	case "restricted", "port_restricted":
		adjust(3, "nat:restricted")
	case "symmetric":
		adjust(-20, "nat:symmetric")
	case "blocked":
		adjust(-60, "nat:blocked")
	default:
		adjust(-8, "nat:unknown")
	}

	// A positive priority value is subtracted, so lower numbers rank higher.
	if candidate.Priority > 0 {
		adjust(-candidate.Priority, "priority")
	}

	// Region only matters when both sides declare one.
	if opts.PreferredRegion != "" && candidate.Region != "" {
		if strings.EqualFold(candidate.Region, opts.PreferredRegion) {
			adjust(12, "region:preferred")
		} else {
			adjust(-4, "region:remote")
		}
	}

	if hasPolicyTag(candidate.PolicyTags, "fast-path") {
		adjust(10, "policy:fast-path")
	}
	// Any one of the LAN-style tags earns the bonus, and only once.
	for _, lanTag := range []string{"private-lan", "corp-lan", "same-site"} {
		if hasPolicyTag(candidate.PolicyTags, lanTag) {
			adjust(18, "policy:private-lan")
			break
		}
	}
	if hasPolicyTag(candidate.PolicyTags, "costly") {
		adjust(-10, "policy:costly")
	}

	switch opts.ChannelClass {
	case SyntheticChannelFabricControl, SyntheticChannelRouteControl:
		if candidate.ConnectivityMode == "direct" {
			adjust(8, "channel:control-direct")
		}
		if candidate.Transport == "relay" {
			adjust(-8, "channel:control-relay-penalty")
		}
	}

	// Freshness needs a reference time, a verification stamp, and an age limit.
	if !opts.Now.IsZero() && candidate.LastVerifiedAt != nil && opts.MaxVerificationAge > 0 {
		age := opts.Now.Sub(candidate.LastVerifiedAt.UTC())
		// A stamp in the future counts as stale, as does one past the limit.
		if age < 0 || age > opts.MaxVerificationAge {
			adjust(-12, "verified:stale")
		} else {
			adjust(8, "verified:fresh")
		}
	}

	if observation, found := opts.Observations[candidate.EndpointID]; found {
		delta, labels := scoreEndpointCandidateObservation(observation, opts)
		total += delta
		why = append(why, labels...)
	}

	return ScoredPeerEndpointCandidate{
		Candidate: candidate,
		Score:     total,
		Reasons:   why,
	}
}
// scoreEndpointCandidateObservation converts a health observation into a score
// delta and the reason labels that explain it.
//
// When opts.Now, observation.ObservedAt, and opts.MaxObservationAge are all
// set, an observation dated in the future or older than the limit is stale: it
// returns a flat -12 with the single reason "observation:stale" and none of
// its measurements are used. Otherwise latency, reliability, success and
// failure history, and the last failure reason each contribute.
func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) {
	score := 0
	reasons := []string{"observation:present"}
	if !opts.Now.IsZero() && !observation.ObservedAt.IsZero() && opts.MaxObservationAge > 0 {
		age := opts.Now.Sub(observation.ObservedAt.UTC())
		if age < 0 || age > opts.MaxObservationAge {
			return -12, []string{"observation:stale"}
		}
		score += 6
		reasons = append(reasons, "observation:fresh")
	}

	// A non-positive latency carries no measurement (the field is omitted from
	// JSON when zero), so it must earn neither a bonus nor a penalty.
	if latency := observation.LastLatencyMs; latency > 0 {
		switch {
		case latency <= 50:
			score += 18
			reasons = append(reasons, "latency:low")
		case latency <= 150:
			score += 8
			reasons = append(reasons, "latency:moderate")
		default:
			score -= 10
			reasons = append(reasons, "latency:high")
		}
	}

	if observation.ReliabilityScore > 0 {
		switch {
		case observation.ReliabilityScore >= 90:
			score += 15
			reasons = append(reasons, "reliability:high")
		case observation.ReliabilityScore >= 70:
			score += 5
			reasons = append(reasons, "reliability:moderate")
		default:
			score -= 12
			reasons = append(reasons, "reliability:low")
		}
	}

	// The counters are clamped while still unsigned so that very large values
	// cannot wrap negative when converted to int.
	if successes := observation.SuccessCount; successes > 0 {
		if successes > 10 {
			successes = 10
		}
		score += int(successes) // bonus of 1..10
		reasons = append(reasons, "history:success")
	}
	if failures := observation.FailureCount; failures > 0 {
		if failures > 5 {
			failures = 5
		}
		score -= int(failures) * 6 // penalty of 6..30
		reasons = append(reasons, "history:failure")
	}

	if strings.TrimSpace(observation.LastFailureReason) != "" {
		score -= 8
		reasons = append(reasons, "failure:recent")
	}
	return score, reasons
}
// hasPolicyTag reports whether any entry of tags equals needle, ignoring case
// and any whitespace around the entry. needle itself is compared as given.
func hasPolicyTag(tags []string, needle string) bool {
	for idx := 0; idx < len(tags); idx++ {
		trimmed := strings.TrimSpace(tags[idx])
		if !strings.EqualFold(trimmed, needle) {
			continue
		}
		return true
	}
	return false
}
// boundedInt clamps value to the range [minValue, maxValue]. The lower bound
// is checked first, so it wins if the bounds are inverted.
func boundedInt(value, minValue, maxValue int) int {
	switch {
	case value < minValue:
		return minValue
	case value > maxValue:
		return maxValue
	}
	return value
}
@@ -0,0 +1,278 @@
package mesh
import (
"testing"
"time"
)
// TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath checks that a
// freshly verified direct public endpoint tagged fast-path outranks both a
// relay endpoint and a stale private endpoint.
func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
	now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
	verifiedRecently := now.Add(-time.Minute)
	verifiedLongAgo := now.Add(-2 * time.Hour)

	relay := PeerEndpointCandidate{
		EndpointID:       "node-b-relay",
		NodeID:           "node-b",
		Transport:        "relay",
		Address:          "relay.example.test/node-b",
		Reachability:     "relay",
		NATType:          "symmetric",
		ConnectivityMode: "relay_required",
		Region:           "us",
		Priority:         1,
		LastVerifiedAt:   &verifiedRecently,
	}
	public := PeerEndpointCandidate{
		EndpointID:       "node-b-public",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "203.0.113.20:443",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Region:           "eu",
		Priority:         10,
		PolicyTags:       []string{"fast-path"},
		LastVerifiedAt:   &verifiedRecently,
	}
	privateStale := PeerEndpointCandidate{
		EndpointID:       "node-b-private-stale",
		NodeID:           "node-b",
		Transport:        "wss",
		Address:          "10.0.0.5:443",
		Reachability:     "private",
		NATType:          "restricted",
		ConnectivityMode: "direct",
		Region:           "eu",
		Priority:         5,
		LastVerifiedAt:   &verifiedLongAgo,
	}
	opts := EndpointCandidateScoreOptions{
		ChannelClass:       SyntheticChannelFabricControl,
		PreferredRegion:    "eu",
		Now:                now,
		MaxVerificationAge: time.Hour,
	}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{relay, public, privateStale}, opts)

	if got := len(ranked); got != 3 {
		t.Fatalf("ranked length = %d, want 3", got)
	}
	best, runnerUp := ranked[0], ranked[1]
	if best.Candidate.EndpointID != "node-b-public" {
		t.Fatalf("top endpoint = %q, want node-b-public: %+v", best.Candidate.EndpointID, ranked)
	}
	if best.Score <= runnerUp.Score {
		t.Fatalf("top score = %d, second = %d", best.Score, runnerUp.Score)
	}
	for _, want := range []string{"policy:fast-path", "verified:fresh"} {
		if !containsReason(best.Reasons, want) {
			t.Fatalf("top reasons missing expected hints: %+v", best.Reasons)
		}
	}
}
// TestRankPeerEndpointCandidatesUsesDeterministicTieBreak checks that two
// candidates with identical scores, priorities, and node IDs are ordered by
// EndpointID regardless of input order.
func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) {
	candidates := []PeerEndpointCandidate{
		{
			EndpointID:       "endpoint-b",
			NodeID:           "node-b",
			Transport:        "direct_tcp_tls",
			Address:          "203.0.113.21:443",
			Reachability:     "public",
			NATType:          "none",
			ConnectivityMode: "direct",
			Priority:         10,
		},
		{
			EndpointID:       "endpoint-a",
			NodeID:           "node-b",
			Transport:        "direct_tcp_tls",
			Address:          "203.0.113.20:443",
			Reachability:     "public",
			NATType:          "none",
			ConnectivityMode: "direct",
			Priority:         10,
		},
	}
	ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{})
	// Fail cleanly instead of panicking on the index below if ranking ever
	// drops candidates.
	if len(ranked) != len(candidates) {
		t.Fatalf("ranked length = %d, want %d", len(ranked), len(candidates))
	}
	if ranked[0].Candidate.EndpointID != "endpoint-a" {
		t.Fatalf("tie top endpoint = %q, want endpoint-a", ranked[0].Candidate.EndpointID)
	}
}
// TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint checks that a
// private endpoint tagged corp-lan/same-site in the preferred region outranks
// a public endpoint in the same region.
func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T) {
	now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
	candidates := []PeerEndpointCandidate{
		{
			EndpointID:       "node-b-public",
			NodeID:           "node-b",
			Transport:        "direct_tcp_tls",
			Address:          "203.0.113.20:443",
			Reachability:     "public",
			NATType:          "none",
			ConnectivityMode: "direct",
			Region:           "corp-eu",
			Priority:         10,
		},
		{
			EndpointID:       "node-b-corp-lan",
			NodeID:           "node-b",
			Transport:        "direct_tcp_tls",
			Address:          "10.24.10.20:19001",
			Reachability:     "private",
			NATType:          "none",
			ConnectivityMode: "direct",
			Region:           "corp-eu",
			Priority:         1,
			PolicyTags:       []string{"corp-lan", "same-site"},
		},
	}
	ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
		ChannelClass:    SyntheticChannelFabricControl,
		PreferredRegion: "corp-eu",
		Now:             now,
	})
	// Fail cleanly instead of panicking on the index below if ranking ever
	// drops candidates.
	if len(ranked) != len(candidates) {
		t.Fatalf("ranked length = %d, want %d", len(ranked), len(candidates))
	}
	if ranked[0].Candidate.EndpointID != "node-b-corp-lan" {
		t.Fatalf("top endpoint = %q, want node-b-corp-lan: %+v", ranked[0].Candidate.EndpointID, ranked)
	}
	if !containsReason(ranked[0].Reasons, "policy:private-lan") || !containsReason(ranked[0].Reasons, "region:preferred") {
		t.Fatalf("corp LAN reasons missing: %+v", ranked[0].Reasons)
	}
}
// TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback checks that
// heavily penalized outbound-only and relay candidates are still returned, each
// with its endpoint identity intact.
func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T) {
	outbound := PeerEndpointCandidate{
		EndpointID:       "node-b-outbound",
		NodeID:           "node-b",
		Transport:        "outbound_reverse",
		Address:          "node-b.reverse.local",
		Reachability:     "outbound_only",
		NATType:          "symmetric",
		ConnectivityMode: "outbound_only",
		Priority:         20,
	}
	relay := PeerEndpointCandidate{
		EndpointID:       "node-b-relay",
		NodeID:           "node-b",
		Transport:        "relay",
		Address:          "relay.example.test/node-b",
		Reachability:     "relay",
		NATType:          "blocked",
		ConnectivityMode: "relay_required",
		Priority:         30,
	}
	opts := EndpointCandidateScoreOptions{ChannelClass: SyntheticChannelRouteControl}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{outbound, relay}, opts)

	if got := len(ranked); got != 2 {
		t.Fatalf("ranked length = %d, want 2", got)
	}
	for idx := range ranked {
		if ranked[idx].Candidate.EndpointID != "" {
			continue
		}
		t.Fatalf("ranked candidate lost identity: %+v", ranked[idx])
	}
}
// TestRankPeerEndpointCandidatesUsesHealthObservationOverlay checks that fresh
// health observations can reorder candidates: a healthy wss endpoint must
// outrank a direct endpoint with high latency and recent failures.
func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
	now := time.Date(2026, 4, 28, 13, 0, 0, 0, time.UTC)
	candidates := []PeerEndpointCandidate{
		{
			EndpointID:       "node-b-direct",
			NodeID:           "node-b",
			Transport:        "direct_tcp_tls",
			Address:          "203.0.113.20:443",
			Reachability:     "public",
			NATType:          "none",
			ConnectivityMode: "direct",
			Priority:         10,
		},
		{
			EndpointID:       "node-b-wss",
			NodeID:           "node-b",
			Transport:        "wss",
			Address:          "node-b.example.test",
			Reachability:     "public",
			NATType:          "restricted",
			ConnectivityMode: "direct",
			Priority:         10,
		},
	}
	ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
		Now:               now,
		MaxObservationAge: 5 * time.Minute,
		Observations: map[string]EndpointCandidateHealthObservation{
			"node-b-direct": {
				EndpointID:        "node-b-direct",
				LastLatencyMs:     240,
				FailureCount:      3,
				LastFailureReason: "connect_timeout",
				ReliabilityScore:  50,
				ObservedAt:        now.Add(-time.Minute),
			},
			"node-b-wss": {
				EndpointID:       "node-b-wss",
				LastLatencyMs:    35,
				SuccessCount:     8,
				ReliabilityScore: 95,
				ObservedAt:       now.Add(-time.Minute),
			},
		},
	})
	// Both ranked[0] and ranked[1] are indexed below; fail cleanly instead of
	// panicking if ranking ever drops candidates.
	if len(ranked) != len(candidates) {
		t.Fatalf("ranked length = %d, want %d", len(ranked), len(candidates))
	}
	if ranked[0].Candidate.EndpointID != "node-b-wss" {
		t.Fatalf("top endpoint = %q, want node-b-wss: %+v", ranked[0].Candidate.EndpointID, ranked)
	}
	if !containsReason(ranked[0].Reasons, "latency:low") || !containsReason(ranked[0].Reasons, "reliability:high") {
		t.Fatalf("top reasons missing health hints: %+v", ranked[0].Reasons)
	}
	if !containsReason(ranked[1].Reasons, "history:failure") || !containsReason(ranked[1].Reasons, "failure:recent") {
		t.Fatalf("failed endpoint reasons missing failure hints: %+v", ranked[1].Reasons)
	}
}
// TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty checks that an
// observation older than MaxObservationAge is reported as stale and that its
// latency measurement does not contribute to the score.
func TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty(t *testing.T) {
	now := time.Date(2026, 4, 28, 13, 0, 0, 0, time.UTC)
	candidates := []PeerEndpointCandidate{
		{
			EndpointID:       "node-b-direct",
			NodeID:           "node-b",
			Transport:        "direct_tcp_tls",
			Address:          "203.0.113.20:443",
			Reachability:     "public",
			NATType:          "none",
			ConnectivityMode: "direct",
			Priority:         10,
		},
	}
	ranked := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
		Now:               now,
		MaxObservationAge: 5 * time.Minute,
		Observations: map[string]EndpointCandidateHealthObservation{
			"node-b-direct": {
				EndpointID:    "node-b-direct",
				LastLatencyMs: 20,
				ObservedAt:    now.Add(-time.Hour),
			},
		},
	})
	// Fail cleanly instead of panicking on the index below if ranking ever
	// drops candidates.
	if len(ranked) != len(candidates) {
		t.Fatalf("ranked length = %d, want %d", len(ranked), len(candidates))
	}
	if !containsReason(ranked[0].Reasons, "observation:stale") {
		t.Fatalf("reasons missing stale observation: %+v", ranked[0].Reasons)
	}
	if containsReason(ranked[0].Reasons, "latency:low") {
		t.Fatalf("stale observation should not contribute latency: %+v", ranked[0].Reasons)
	}
}
// containsReason reports whether reasons holds an entry exactly equal to
// reason.
func containsReason(reasons []string, reason string) bool {
	found := false
	for idx := 0; idx < len(reasons) && !found; idx++ {
		found = reasons[idx] == reason
	}
	return found
}
@@ -0,0 +1,42 @@
package mesh
import (
"context"
"net/http"
"strings"
)
// HTTPPeerTransport sends synthetic mesh envelopes to explicitly configured
// peer endpoints. It is intentionally narrow: production forwarding remains
// disabled and only SyntheticRuntime messages use this transport.
type HTTPPeerTransport struct {
// PeerURLs maps a peer node ID to the base URL of its mesh HTTP endpoint.
// A node ID with no entry, or a blank one, is treated as unavailable.
PeerURLs map[string]string
// HTTPClient, when non-nil, replaces the HTTP client of the per-call Client
// built by NewClient.
HTTPClient *http.Client
}
// NewHTTPPeerTransport builds a transport from a node-ID-to-base-URL map. Keys
// and URLs are stripped of surrounding whitespace, URLs also lose trailing
// slashes, and entries left with an empty key or URL are discarded. The input
// map is not retained.
func NewHTTPPeerTransport(peerURLs map[string]string) *HTTPPeerTransport {
	transport := &HTTPPeerTransport{PeerURLs: make(map[string]string, len(peerURLs))}
	for rawID, rawURL := range peerURLs {
		id := strings.TrimSpace(rawID)
		endpoint := strings.TrimRight(strings.TrimSpace(rawURL), "/")
		if id == "" || endpoint == "" {
			continue
		}
		transport.PeerURLs[id] = endpoint
	}
	return transport
}
// SendSynthetic delivers envelope to the peer registered under nextNodeID and
// returns that peer's response envelope. It returns
// ErrSyntheticPeerUnavailable when the transport is nil or has no usable URL
// for nextNodeID. The request goes through a Client created by NewClient,
// using t.HTTPClient instead of the default HTTP client when one is set.
func (t *HTTPPeerTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	var baseURL string
	if t != nil {
		// Normalized again here because PeerURLs may have been set directly.
		baseURL = strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/")
	}
	if baseURL == "" {
		return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
	}

	client := NewClient(baseURL)
	if override := t.HTTPClient; override != nil {
		client.HTTPClient = override
	}
	return client.SendSynthetic(ctx, envelope)
}
@@ -0,0 +1,130 @@
package mesh
import (
"context"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestHTTPPeerTransportDirectSyntheticProbe sends a probe over real HTTP test
// servers along a two-node route and expects a probe ack whose path lists both
// nodes in order.
func TestHTTPPeerTransportDirectSyntheticProbe(t *testing.T) {
	source := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
	defer source.Close()
	destination := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
	defer destination.Close()

	route := liveSyntheticRoute("route-direct", []string{"node-a", "node-b"})
	routes := []SyntheticRoute{route}
	// Runtimes are attached after the servers exist because each node needs the
	// URL of its next hop.
	source.Runtime = newLiveRuntime(source.Local, routes, map[string]string{"node-b": destination.URL})
	destination.Runtime = newLiveRuntime(destination.Local, routes, map[string]string{})

	ack, err := source.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-direct")
	if err != nil {
		t.Fatalf("send live direct probe: %v", err)
	}
	if got, want := ack.MessageType, SyntheticMessageProbeAck; got != want {
		t.Fatalf("MessageType = %q, want %q", got, want)
	}

	gotPath := decodeAckPayload(t, ack).Path
	wantPath := []string{"node-a", "node-b"}
	if !sameStrings(gotPath, wantPath) {
		t.Fatalf("path = %v, want %v", gotPath, wantPath)
	}
}
// TestHTTPPeerTransportSingleRelaySyntheticProbe sends a probe over real HTTP
// test servers along a three-node route with one relay and expects a probe ack
// whose path lists all three nodes in order.
func TestHTTPPeerTransportSingleRelaySyntheticProbe(t *testing.T) {
	source := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
	defer source.Close()
	relay := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"})
	defer relay.Close()
	destination := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
	defer destination.Close()

	route := liveSyntheticRoute("route-relay", []string{"node-a", "node-r", "node-b"})
	routes := []SyntheticRoute{route}
	// Each node only knows the URL of its own next hop.
	source.Runtime = newLiveRuntime(source.Local, routes, map[string]string{"node-r": relay.URL})
	relay.Runtime = newLiveRuntime(relay.Local, routes, map[string]string{"node-b": destination.URL})
	destination.Runtime = newLiveRuntime(destination.Local, routes, map[string]string{})

	ack, err := source.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-relay")
	if err != nil {
		t.Fatalf("send live relay probe: %v", err)
	}
	if got, want := ack.MessageType, SyntheticMessageProbeAck; got != want {
		t.Fatalf("MessageType = %q, want %q", got, want)
	}

	gotPath := decodeAckPayload(t, ack).Path
	wantPath := []string{"node-a", "node-r", "node-b"}
	if !sameStrings(gotPath, wantPath) {
		t.Fatalf("path = %v, want %v", gotPath, wantPath)
	}
}
// TestHTTPPeerTransportMissingPeer checks that sending to a node ID with no
// configured URL fails with ErrSyntheticPeerUnavailable.
func TestHTTPPeerTransportMissingPeer(t *testing.T) {
	noPeers := map[string]string{}
	transport := NewHTTPPeerTransport(noPeers)

	_, err := transport.SendSynthetic(context.Background(), "node-missing", SyntheticEnvelope{})
	if errors.Is(err, ErrSyntheticPeerUnavailable) {
		return
	}
	t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err)
}
// liveSyntheticNode is a test fixture pairing a peer identity with an
// httptest server that serves that peer's mesh handler.
type liveSyntheticNode struct {
// Local is the identity the node's Server is built with.
Local PeerIdentity
// Runtime is assigned by the test after construction; the HTTP handler
// reads it on every request, so it may be set once the URL is known.
Runtime *SyntheticRuntime
// URL is the base URL of the backing httptest server.
URL string
// server is the backing test server; closed by Close.
server *httptest.Server
}
// newLiveSyntheticNode starts an httptest server for the given identity and
// returns the fixture wrapping it. The handler builds a fresh Server on every
// request from the fixture's current Local and Runtime fields, so Runtime can
// be assigned after this function returns.
func newLiveSyntheticNode(t *testing.T, local PeerIdentity) *liveSyntheticNode {
	t.Helper()

	live := &liveSyntheticNode{Local: local}
	serve := func(w http.ResponseWriter, r *http.Request) {
		srv := Server{Local: live.Local, SyntheticRuntime: live.Runtime}
		srv.Handler().ServeHTTP(w, r)
	}
	live.server = httptest.NewServer(http.HandlerFunc(serve))
	live.URL = live.server.URL
	return live
}
// Close shuts down the backing httptest server, if one was started.
func (n *liveSyntheticNode) Close() {
	if n.server == nil {
		return
	}
	n.server.Close()
}
// newLiveRuntime builds an enabled SyntheticRuntime for local that knows the
// given routes and reaches peers through an HTTP transport over the peers map.
func newLiveRuntime(local PeerIdentity, routes []SyntheticRoute, peers map[string]string) *SyntheticRuntime {
	cfg := SyntheticRuntimeConfig{
		Enabled: true,
		Local:   local,
		Routes:  routes,
	}
	cfg.Transport = NewHTTPPeerTransport(peers)
	return NewSyntheticRuntime(cfg)
}
// liveSyntheticRoute returns a fabric-control route in cluster-1 that runs
// from the first hop to the last hop and expires one hour from now.
// hops must be non-empty; the slice is stored in the route without copying.
func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute {
	source, destination := hops[0], hops[len(hops)-1]

	route := SyntheticRoute{
		RouteID:   routeID,
		ClusterID: "cluster-1",
		Hops:      hops,
	}
	route.SourceNodeID = source
	route.DestinationNodeID = destination
	route.AllowedChannels = []string{SyntheticChannelFabricControl}
	route.MaxTTL = 8
	route.MaxHops = 8
	route.ExpiresAt = time.Now().UTC().Add(time.Hour)
	route.RouteVersion = "route-v1"
	route.PolicyVersion = "policy-v1"
	route.PeerDirectoryVersion = "peers-v1"
	return route
}
// sameStrings reports whether left and right have the same length and hold
// equal strings at every index. Nil and empty slices compare equal.
func sameStrings(left, right []string) bool {
	count := len(left)
	if count != len(right) {
		return false
	}
	for idx := 0; idx < count; idx++ {
		if right[idx] != left[idx] {
			return false
		}
	}
	return true
}
@@ -0,0 +1,374 @@
package mesh
import (
"sort"
"strings"
"time"
)
// DefaultWarmPeerLimit is the warm-peer cap NewPeerCache applies when
// PeerCacheConfig.WarmPeerLimit is zero or negative.
const DefaultWarmPeerLimit = 8
// PeerCacheConfig is the input to NewPeerCache. Every source is optional;
// entries whose node ID equals Local.NodeID are ignored.
type PeerCacheConfig struct {
// Local identifies this node; its ClusterID and NodeID are copied into the snapshot.
Local PeerIdentity
// PeerEndpoints maps node ID to a single endpoint; keys and values are trimmed.
PeerEndpoints map[string]string
// PeerEndpointCandidates maps node ID to endpoint candidates, which are
// ranked; the top-ranked candidate fills the entry's Best* fields.
PeerEndpointCandidates map[string][]PeerEndpointCandidate
// PeerDirectory contributes route IDs, counts, connectivity modes and the
// recovery-seed flag per node.
PeerDirectory []PeerDirectoryEntry
// RecoverySeeds marks peers as recovery seeds and supplies a fallback endpoint.
RecoverySeeds []PeerRecoverySeed
// RendezvousLeases attach relay information to peers; only leases accepted
// by leaseUsableForPeerCache are used.
RendezvousLeases []PeerRendezvousLease
// Routes contribute route IDs for routes that include the local node and
// flag the local node's direct neighbours on each route.
Routes []SyntheticRoute
// WarmPeerLimit caps how many peers are marked warm; <= 0 selects DefaultWarmPeerLimit.
WarmPeerLimit int
// PreferredRegion is passed to candidate ranking.
PreferredRegion string
// Now is the build time; the zero value selects time.Now().UTC().
Now time.Time
}
// PeerCache holds the snapshot computed once by NewPeerCache. No code visible
// in this file modifies it after construction.
type PeerCache struct {
snapshot PeerCacheSnapshot
}
// PeerCacheSnapshot is the JSON-serialisable result of building a peer cache.
type PeerCacheSnapshot struct {
// ClusterID and LocalNodeID are copied from PeerCacheConfig.Local.
ClusterID string `json:"cluster_id"`
LocalNodeID string `json:"local_node_id"`
// PeerCount is len(Entries).
PeerCount int `json:"peer_count"`
// WarmPeerCount is the number of entries marked Warm.
WarmPeerCount int `json:"warm_peer_count"`
// RecoverySeedCount is the number of entries with RecoverySeed set.
RecoverySeedCount int `json:"recovery_seed_count"`
// RendezvousLeaseCount is the number of configured leases that passed
// leaseUsableForPeerCache (counted per lease, not per peer).
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
// BuiltAt is the build time used for lease expiry and candidate ranking.
BuiltAt time.Time `json:"built_at"`
// Entries holds one entry per known peer, sorted by NodeID.
Entries []PeerCacheEntry `json:"entries"`
}
// PeerCacheEntry describes one remote peer as assembled by NewPeerCache from
// the directory, endpoints, candidates, routes, recovery seeds and leases.
type PeerCacheEntry struct {
NodeID string `json:"node_id"`
// RouteIDs is the sorted, de-duplicated set of routes referencing this peer.
RouteIDs []string `json:"route_ids,omitempty"`
// Endpoint is the address chosen for the peer: the configured endpoint,
// overridden by the best candidate's address or a relay endpoint when applicable.
Endpoint string `json:"endpoint,omitempty"`
// EndpointCount and CandidateCount are the largest counts seen across sources.
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
// ConnectivityModes is the sorted, de-duplicated set of modes seen for the peer.
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
// RecoverySeed is true when the directory or a recovery seed marks the peer.
RecoverySeed bool `json:"recovery_seed"`
// Warm and WarmReason are set for peers selected within the warm-peer limit.
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
// Best* fields mirror the top-ranked endpoint candidate; a rendezvous lease
// may overwrite the ID, address, transport, reachability and connectivity.
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
// BestCandidateScore is the ranking score of the top candidate; it is not
// updated when a rendezvous lease takes over the endpoint.
BestCandidateScore int `json:"best_candidate_score,omitempty"`
// Rendezvous/relay fields are filled from a usable lease naming this peer.
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
// RelayControl is true when a usable rendezvous lease exists for the peer.
RelayControl bool `json:"relay_control"`
}
// peerCacheBuildEntry is the working form of an entry during NewPeerCache; the
// extra fields feed warm-peer ranking and are not part of the snapshot.
type peerCacheBuildEntry struct {
PeerCacheEntry
// adjacentRoutePeer is true when the peer sits directly before or after the
// local node on at least one route.
adjacentRoutePeer bool
// bestScore is the top candidate score, raised to at least 500 when a
// rendezvous lease endpoint is adopted.
bestScore int
}
// NewPeerCache builds an immutable peer cache from cfg.
//
// Sources are folded into one entry per remote node in a fixed order — peer
// directory, static endpoints, endpoint candidates, routes, recovery seeds,
// rendezvous leases — and the order matters because later steps override or
// fall back on what earlier steps stored. Peers are then ranked by
// warmPeerPriority, the top cfg.WarmPeerLimit (DefaultWarmPeerLimit when <= 0)
// with a positive priority are marked warm, and the entries are returned
// sorted by NodeID.
func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
// Resolve the build time; a zero cfg.Now means "now".
now := cfg.Now.UTC()
if now.IsZero() {
now = time.Now().UTC()
}
limit := cfg.WarmPeerLimit
if limit <= 0 {
limit = DefaultWarmPeerLimit
}
entries := map[string]*peerCacheBuildEntry{}
// 1. Peer directory: merge route IDs, counts, modes and the seed flag.
for _, item := range cfg.PeerDirectory {
nodeID := strings.TrimSpace(item.NodeID)
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.RouteIDs = mergeStrings(entry.RouteIDs, item.RouteIDs)
entry.EndpointCount = maxInt(entry.EndpointCount, item.EndpointCount)
entry.CandidateCount = maxInt(entry.CandidateCount, item.CandidateCount)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, item.ConnectivityModes)
entry.RecoverySeed = entry.RecoverySeed || item.RecoverySeed
}
// 2. Static endpoints: one configured address per peer.
for nodeID, endpoint := range cfg.PeerEndpoints {
nodeID = strings.TrimSpace(nodeID)
endpoint = strings.TrimSpace(endpoint)
if nodeID == "" || nodeID == cfg.Local.NodeID || endpoint == "" {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.Endpoint = endpoint
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
}
// 3. Endpoint candidates: rank them and record the winner.
for nodeID, candidates := range cfg.PeerEndpointCandidates {
nodeID = strings.TrimSpace(nodeID)
if nodeID == "" || nodeID == cfg.Local.NodeID || len(candidates) == 0 {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.CandidateCount = maxInt(entry.CandidateCount, len(candidates))
for _, candidate := range candidates {
if strings.TrimSpace(candidate.ConnectivityMode) != "" {
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{candidate.ConnectivityMode})
}
}
// Ranking is done for the fabric-control channel with a one-hour
// verification-age limit; scored[0] is taken as the best candidate.
scored := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
ChannelClass: SyntheticChannelFabricControl,
PreferredRegion: cfg.PreferredRegion,
Now: now,
MaxVerificationAge: time.Hour,
})
if len(scored) > 0 {
entry.BestCandidateID = scored[0].Candidate.EndpointID
entry.BestCandidateAddr = scored[0].Candidate.Address
entry.BestTransport = scored[0].Candidate.Transport
entry.BestReachability = scored[0].Candidate.Reachability
entry.BestConnectivity = scored[0].Candidate.ConnectivityMode
entry.BestNATType = scored[0].Candidate.NATType
// Copy the tags so the entry does not alias the candidate's slice.
entry.BestPolicyTags = append([]string{}, scored[0].Candidate.PolicyTags...)
entry.BestCandidateScore = scored[0].Score
entry.bestScore = scored[0].Score
// A non-empty best-candidate address replaces any static endpoint.
if strings.TrimSpace(scored[0].Candidate.Address) != "" {
entry.Endpoint = strings.TrimSpace(scored[0].Candidate.Address)
}
}
}
// 4. Routes: only routes whose path contains the local node contribute.
for _, route := range cfg.Routes {
path := routePath(route)
localIndex := indexOf(path, cfg.Local.NodeID)
if localIndex < 0 {
continue
}
// Every other node on the route gets the route ID.
for _, nodeID := range path {
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.RouteIDs = mergeStrings(entry.RouteIDs, []string{route.RouteID})
}
// The previous and next hop relative to the local node are "adjacent".
for _, adjacentIndex := range []int{localIndex - 1, localIndex + 1} {
if adjacentIndex < 0 || adjacentIndex >= len(path) {
continue
}
nodeID := path[adjacentIndex]
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
peerCacheEntry(entries, nodeID).adjacentRoutePeer = true
}
}
// 5. Recovery seeds: flag the peer; the seed endpoint is only a fallback.
for _, seed := range cfg.RecoverySeeds {
nodeID := strings.TrimSpace(seed.NodeID)
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.RecoverySeed = true
if entry.Endpoint == "" {
entry.Endpoint = strings.TrimSpace(seed.Endpoint)
}
if strings.TrimSpace(seed.ConnectivityMode) != "" {
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{seed.ConnectivityMode})
}
}
// 6. Rendezvous leases: attach relay info to the leased peer, or register
// the relay itself when the lease is for the local node.
// NOTE(review): lease.PeerNodeID / lease.RelayNodeID are used as map keys
// without trimming, unlike the node IDs in the loops above — confirm
// leases arrive pre-normalised.
rendezvousLeases := 0
for _, lease := range cfg.RendezvousLeases {
if !leaseUsableForPeerCache(lease, cfg.Local.NodeID, now) {
continue
}
rendezvousLeases++
if lease.PeerNodeID != cfg.Local.NodeID {
entry := peerCacheEntry(entries, lease.PeerNodeID)
// Decide before mutating the entry, based on what earlier steps stored.
useLeaseEndpoint := shouldUseRendezvousEndpoint(*entry)
entry.RendezvousLeaseID = lease.LeaseID
entry.RelayNodeID = lease.RelayNodeID
entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
entry.RelayControl = true
entry.CandidateCount = maxInt(entry.CandidateCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"})
if useLeaseEndpoint {
// The relay endpoint becomes the peer's effective endpoint.
entry.BestTransport = firstNonEmpty(lease.Transport, "relay_control")
entry.BestReachability = "relay"
entry.BestConnectivity = firstNonEmpty(lease.ConnectivityMode, "relay_required")
entry.Endpoint = entry.RelayEndpoint
entry.BestCandidateID = lease.LeaseID
entry.BestCandidateAddr = entry.RelayEndpoint
entry.bestScore = maxInt(entry.bestScore, 500)
}
}
if lease.PeerNodeID == cfg.Local.NodeID && lease.RelayNodeID != "" && lease.RelayNodeID != cfg.Local.NodeID {
// The local node is the leased peer: make sure the relay is a known peer.
entry := peerCacheEntry(entries, lease.RelayNodeID)
if entry.Endpoint == "" {
entry.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
}
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_control"})
}
}
// Flatten the map, normalising slice order and counting seeds.
out := make([]peerCacheBuildEntry, 0, len(entries))
recoverySeeds := 0
for _, entry := range entries {
sort.Strings(entry.RouteIDs)
sort.Strings(entry.ConnectivityModes)
if entry.RecoverySeed {
recoverySeeds++
}
out = append(out, *entry)
}
// Rank by warm priority (descending), breaking ties by NodeID so the
// result does not depend on map iteration order.
sort.SliceStable(out, func(i, j int) bool {
left := warmPeerPriority(out[i])
right := warmPeerPriority(out[j])
if left != right {
return left > right
}
return out[i].NodeID < out[j].NodeID
})
// Mark up to limit peers as warm; peers with non-positive priority never qualify.
warm := 0
for i := range out {
if warm >= limit {
break
}
if warmPeerPriority(out[i]) <= 0 {
continue
}
out[i].Warm = true
out[i].WarmReason = warmPeerReason(out[i])
warm++
}
// Final, externally visible order is by NodeID.
sort.SliceStable(out, func(i, j int) bool {
return out[i].NodeID < out[j].NodeID
})
snapshotEntries := make([]PeerCacheEntry, 0, len(out))
for _, entry := range out {
snapshotEntries = append(snapshotEntries, entry.PeerCacheEntry)
}
return &PeerCache{snapshot: PeerCacheSnapshot{
ClusterID: cfg.Local.ClusterID,
LocalNodeID: cfg.Local.NodeID,
PeerCount: len(snapshotEntries),
WarmPeerCount: warm,
RecoverySeedCount: recoverySeeds,
RendezvousLeaseCount: rendezvousLeases,
BuiltAt: now,
Entries: snapshotEntries,
}}
}
// Snapshot returns a copy of the cache contents. A nil receiver yields the
// zero snapshot. The Entries slice is copied, but slices inside each entry
// still share backing storage with the cache.
func (c *PeerCache) Snapshot() PeerCacheSnapshot {
	var copied PeerCacheSnapshot
	if c != nil {
		copied = c.snapshot
		copied.Entries = append([]PeerCacheEntry{}, c.snapshot.Entries...)
	}
	return copied
}
// WarmPeerIDs returns the node IDs of all entries marked warm, in snapshot
// order. The result is non-nil even when no peer is warm.
func (c *PeerCache) WarmPeerIDs() []string {
	snap := c.Snapshot()
	ids := make([]string, 0, snap.WarmPeerCount)
	for i := range snap.Entries {
		if !snap.Entries[i].Warm {
			continue
		}
		ids = append(ids, snap.Entries[i].NodeID)
	}
	return ids
}
// peerCacheEntry returns the build entry stored under nodeID, creating and
// registering a fresh one (with only NodeID set) when none exists yet.
func peerCacheEntry(entries map[string]*peerCacheBuildEntry, nodeID string) *peerCacheBuildEntry {
	existing, found := entries[nodeID]
	if !found {
		existing = &peerCacheBuildEntry{}
		existing.NodeID = nodeID
		entries[nodeID] = existing
	}
	return existing
}
// warmPeerPriority computes the ranking score used to pick warm peers.
// Contributions: +1000 route-adjacent, +500 recovery seed, +300 relay control,
// +100 any endpoint, plus a positive bestScore and the candidate count.
func warmPeerPriority(entry peerCacheBuildEntry) int {
	priority := entry.CandidateCount
	if entry.bestScore > 0 {
		priority += entry.bestScore
	}
	if entry.adjacentRoutePeer {
		priority += 1000
	}
	if entry.RecoverySeed {
		priority += 500
	}
	if entry.RelayControl {
		priority += 300
	}
	if entry.Endpoint != "" {
		priority += 100
	}
	return priority
}
// warmPeerReason names the strongest reason an entry qualifies as warm.
// The first matching condition wins, checked in this order: route adjacency,
// recovery seed, rendezvous lease, endpoint candidate, plain endpoint.
func warmPeerReason(entry peerCacheBuildEntry) string {
	switch {
	case entry.adjacentRoutePeer:
		return "route_adjacent"
	case entry.RecoverySeed:
		return "recovery_seed"
	case entry.RelayControl:
		return "rendezvous_lease"
	case entry.BestCandidateID != "":
		return "endpoint_candidate"
	case entry.Endpoint != "":
		return "peer_endpoint"
	default:
		return "scoped_peer"
	}
}
// leaseUsableForPeerCache reports whether a rendezvous lease may be used when
// building the peer cache: all identifying fields must be non-blank, the lease
// must be control-plane-only and unexpired at now, and it must not name the
// local node as both peer and relay.
func leaseUsableForPeerCache(lease PeerRendezvousLease, localNodeID string, now time.Time) bool {
	for _, required := range []string{lease.LeaseID, lease.PeerNodeID, lease.RelayNodeID, lease.RelayEndpoint} {
		if strings.TrimSpace(required) == "" {
			return false
		}
	}
	if lease.ExpiresAt.IsZero() || !lease.ExpiresAt.After(now) {
		return false
	}
	if !lease.ControlPlaneOnly {
		return false
	}
	selfRelayed := lease.PeerNodeID == localNodeID && lease.RelayNodeID == localNodeID
	return !selfRelayed
}
// shouldUseRendezvousEndpoint reports whether a lease's relay endpoint should
// replace the entry's endpoint: either the entry has no endpoint yet, or its
// current best transport/reachability/connectivity already indicates a
// relay or outbound-only path.
func shouldUseRendezvousEndpoint(entry peerCacheBuildEntry) bool {
	if strings.TrimSpace(entry.Endpoint) == "" {
		return true
	}

	normalize := func(value string) string {
		return strings.ToLower(strings.TrimSpace(value))
	}
	transport := normalize(entry.BestTransport)
	reachability := normalize(entry.BestReachability)
	connectivity := normalize(entry.BestConnectivity)

	switch {
	case strings.Contains(transport, "relay"), strings.Contains(transport, "outbound"):
		return true
	case reachability == "relay", reachability == "outbound_only":
		return true
	case connectivity == "relay_required", connectivity == "outbound_only":
		return true
	}
	return false
}
// mergeStrings returns the trimmed, non-empty, de-duplicated values of
// existing followed by incoming, preserving first-seen order. The result is
// always a freshly allocated, non-nil slice.
//
// Neither argument is modified. The inputs are iterated separately rather
// than joined with append(existing, incoming...), because that append writes
// into existing's backing array whenever it has spare capacity and would
// silently overwrite elements of any caller slice sharing that array.
func mergeStrings(existing []string, incoming []string) []string {
	total := len(existing) + len(incoming)
	seen := make(map[string]struct{}, total)
	out := make([]string, 0, total)
	for _, group := range [2][]string{existing, incoming} {
		for _, value := range group {
			value = strings.TrimSpace(value)
			if value == "" {
				continue
			}
			if _, dup := seen[value]; dup {
				continue
			}
			seen[value] = struct{}{}
			out = append(out, value)
		}
	}
	return out
}
// maxInt returns the larger of left and right.
func maxInt(left, right int) int {
	if right >= left {
		return right
	}
	return left
}
@@ -0,0 +1,170 @@
package mesh
import (
"testing"
"time"
)
// TestPeerCacheSelectsAdjacentWarmPeersWithinLimit checks that with a warm
// limit of 2 the two route neighbours of the local node win over a
// non-adjacent route peer and a recovery seed.
func TestPeerCacheSelectsAdjacentWarmPeersWithinLimit(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	cfg := PeerCacheConfig{
		Local:         self,
		WarmPeerLimit: 2,
		Now:           time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
	}
	cfg.PeerEndpoints = map[string]string{
		"node-a": "http://node-a:19000",
		"node-r": "http://node-r:19000",
		"node-c": "http://node-c:19000",
	}
	cfg.Routes = []SyntheticRoute{
		peerCacheRoute("route-1", []string{"node-a", self.NodeID, "node-r", "node-c"}),
	}
	cfg.RecoverySeeds = []PeerRecoverySeed{
		{NodeID: "node-seed", Endpoint: "https://seed.example.test", Transport: "direct_tcp_tls", Priority: 10},
	}
	cache := NewPeerCache(cfg)

	warm := cache.WarmPeerIDs()
	if !(len(warm) == 2 && warm[0] == "node-a" && warm[1] == "node-r") {
		t.Fatalf("warm peers = %+v, want adjacent node-a/node-r", warm)
	}

	// node-a, node-r, node-c and node-seed are cached; only node-seed is a seed.
	snapshot := cache.Snapshot()
	if snapshot.PeerCount != 4 || snapshot.RecoverySeedCount != 1 {
		t.Fatalf("unexpected snapshot counts: %+v", snapshot)
	}
}
// TestPeerCachePromotesRecoverySeedAfterRoutePeers checks that a recovery seed
// becomes warm once the limit leaves room after both route neighbours, and
// that its warm reason is "recovery_seed".
func TestPeerCachePromotesRecoverySeedAfterRoutePeers(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	seed := PeerRecoverySeed{NodeID: "node-seed", Endpoint: "wss://seed.example.test/mesh", Transport: "wss", ConnectivityMode: "direct", Priority: 1}

	cache := NewPeerCache(PeerCacheConfig{
		Local:         self,
		Routes:        []SyntheticRoute{peerCacheRoute("route-1", []string{"node-a", self.NodeID, "node-r"})},
		RecoverySeeds: []PeerRecoverySeed{seed},
		WarmPeerLimit: 3,
		Now:           time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
	})

	warm := cache.WarmPeerIDs()
	if !(len(warm) == 3 && warm[0] == "node-a" && warm[1] == "node-r" && warm[2] == "node-seed") {
		t.Fatalf("warm peers = %+v, want adjacent peers then seed", warm)
	}

	seedEntry, found := peerCacheEntryByID(cache.Snapshot(), "node-seed")
	if !found || !seedEntry.RecoverySeed || seedEntry.WarmReason != "recovery_seed" {
		t.Fatalf("unexpected seed entry: %+v", seedEntry)
	}
}
func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Address: "relay.example.test",
Reachability: "relay",
ConnectivityMode: "relay_required",
Priority: 20,
},
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Priority: 1,
LastVerifiedAt: &now,
},
},
},
WarmPeerLimit: 1,
Now: now,
})
entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b")
if !ok {
t.Fatal("node-b missing from cache")
}
if entry.BestCandidateID != "node-b-public" || !entry.Warm {
t.Fatalf("unexpected candidate selection: %+v", entry)
}
}
func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-b": "https://node-b.public.example.test:443",
},
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "https://node-b.public.example.test:443",
Reachability: "public",
NATType: "none",
ConnectivityMode: "direct",
Region: "corp-eu",
Priority: 10,
},
{
EndpointID: "node-b-corp-lan",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "http://10.24.10.20:19001",
Reachability: "private",
NATType: "none",
ConnectivityMode: "direct",
Region: "corp-eu",
Priority: 1,
PolicyTags: []string{"corp-lan"},
},
},
},
PreferredRegion: "corp-eu",
WarmPeerLimit: 1,
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
})
entry, ok := peerCacheEntryByID(cache.Snapshot(), "node-b")
if !ok {
t.Fatal("node-b missing from peer cache")
}
if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "http://10.24.10.20:19001" {
t.Fatalf("peer cache did not choose corp LAN endpoint: %+v", entry)
}
}
// peerCacheRoute builds a fabric-control route in cluster-1 from the first to
// the last hop, expiring in one hour. hops must be non-empty; the hop list is
// copied into the route.
func peerCacheRoute(routeID string, hops []string) SyntheticRoute {
	path := append([]string{}, hops...)
	route := SyntheticRoute{RouteID: routeID, ClusterID: "cluster-1"}
	route.SourceNodeID = hops[0]
	route.DestinationNodeID = hops[len(hops)-1]
	route.Hops = path
	route.AllowedChannels = []string{SyntheticChannelFabricControl}
	route.ExpiresAt = time.Now().UTC().Add(time.Hour)
	return route
}
// peerCacheEntryByID returns the first snapshot entry whose NodeID equals
// nodeID, or the zero entry and false when there is none.
func peerCacheEntryByID(snapshot PeerCacheSnapshot, nodeID string) (PeerCacheEntry, bool) {
	for i := range snapshot.Entries {
		if candidate := snapshot.Entries[i]; candidate.NodeID == nodeID {
			return candidate, true
		}
	}
	return PeerCacheEntry{}, false
}
@@ -0,0 +1,303 @@
package mesh
import (
"net"
"net/netip"
"net/url"
"sort"
"strings"
"time"
)
// Intent actions assigned by connectionIntentAction from a recovery
// candidate's Reason.
const (
// PeerConnectionIntentMaintain is used for the "maintain_ready" reason.
PeerConnectionIntentMaintain = "maintain"
// PeerConnectionIntentProbe is the fallback for any unrecognised reason.
PeerConnectionIntentProbe = "probe"
// PeerConnectionIntentRecover is used for the "recover_*" reasons.
PeerConnectionIntentRecover = "recover"
)
// Transport modes produced by classifyPeerTransport, plus relay_control which
// is only set by applyRendezvousLease once a lease resolves the rendezvous.
const (
PeerTransportModeDirect = "direct"
PeerTransportModePrivateLAN = "private_lan"
PeerTransportModeCorporateLAN = "corporate_lan"
PeerTransportModeOutboundOnly = "outbound_only"
PeerTransportModeRelayRequired = "relay_required"
PeerTransportModeRelayControl = "relay_control"
// PeerTransportModeUnknown is returned when no classification hint matches.
PeerTransportModeUnknown = "unknown"
)
// PeerConnectionIntentPlanConfig is the input to PlanPeerConnectionIntents.
type PeerConnectionIntentPlanConfig struct {
// PeerCache supplies per-peer transport hints, looked up by NodeID.
PeerCache PeerCacheSnapshot
// RecoveryPlan supplies the candidates; one intent is produced per
// candidate with a non-blank NodeID.
RecoveryPlan PeerRecoveryPlan
// RendezvousLeases are consulted for intents that require rendezvous.
RendezvousLeases []PeerRendezvousLease
// Now is passed through normalizedNow and stamped on the plan and intents.
Now time.Time
}
// PeerConnectionIntentPlan is the result of PlanPeerConnectionIntents: the
// ordered intents plus per-action and per-transport-mode tallies.
type PeerConnectionIntentPlan struct {
// Mode is copied from the recovery plan.
Mode string `json:"mode"`
// IntentCount is len(Intents).
IntentCount int `json:"intent_count"`
// Per-action tallies.
MaintainCount int `json:"maintain_count"`
ProbeCount int `json:"probe_count"`
RecoverCount int `json:"recover_count"`
// Per-transport-mode tallies; intents in "unknown" mode are not counted.
DirectCount int `json:"direct_count"`
PrivateLANCount int `json:"private_lan_count"`
CorporateLANCount int `json:"corporate_lan_count"`
OutboundOnlyCount int `json:"outbound_only_count"`
RelayRequiredCount int `json:"relay_required_count"`
RelayControlCount int `json:"relay_control_count"`
// RendezvousRequiredCount counts intents still needing rendezvous;
// RendezvousResolvedCount counts those resolved through a lease.
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
// RendezvousLeaseCount counts intents carrying a non-empty lease ID
// (not the number of configured leases).
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
GeneratedAt time.Time `json:"generated_at"`
// Intents are sorted by Priority descending, then NodeID ascending.
Intents []PeerConnectionIntent `json:"intents,omitempty"`
}
// PeerConnectionIntent describes what should be done about one peer
// connection and over which kind of transport.
type PeerConnectionIntent struct {
NodeID string `json:"node_id"`
// Action is one of the PeerConnectionIntent* constants.
Action string `json:"action"`
// Reason, Endpoint, ConnectionState and Priority come from the recovery
// candidate; Endpoint is replaced by the relay endpoint when a lease applies.
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
ConnectionState string `json:"connection_state"`
// Transport prefers the candidate's value and falls back to the peer cache.
Transport string `json:"transport,omitempty"`
// TransportMode is one of the PeerTransportMode* constants.
TransportMode string `json:"transport_mode"`
// Reachability, ConnectivityMode, NATType and PolicyTags come from the
// peer cache entry for the node.
Reachability string `json:"reachability,omitempty"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
NATType string `json:"nat_type,omitempty"`
PolicyTags []string `json:"policy_tags,omitempty"`
// RequiresRendezvous is true for relay-required / outbound-only peers that
// have no usable lease; RendezvousResolved is true once a lease is applied.
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
// DirectCandidate is true for direct, private-LAN and corporate-LAN modes.
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
// Lease/relay fields come from the cache entry or the applied lease.
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ControlPlaneOnly bool `json:"control_plane_only"`
// RecoverySeed is true if either the candidate or the cache entry says so.
RecoverySeed bool `json:"recovery_seed"`
Priority int `json:"priority"`
GeneratedAt time.Time `json:"generated_at"`
}
// PlanPeerConnectionIntents turns recovery candidates into connection intents.
//
// Each candidate with a non-blank NodeID is enriched with the peer-cache entry
// of the same NodeID (if any), classified by transport, and — when it needs a
// rendezvous — resolved against the best usable lease. Intents are ordered by
// Priority descending then NodeID ascending, and the plan's counters are
// tallied from the final intents.
func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectionIntentPlan {
	generatedAt := normalizedNow(cfg.Now)

	// Index cache entries by node ID; on duplicates the last one wins.
	cached := map[string]PeerCacheEntry{}
	for _, cacheEntry := range cfg.PeerCache.Entries {
		if strings.TrimSpace(cacheEntry.NodeID) != "" {
			cached[cacheEntry.NodeID] = cacheEntry
		}
	}

	planned := make([]PeerConnectionIntent, 0, len(cfg.RecoveryPlan.Candidates))
	for _, cand := range cfg.RecoveryPlan.Candidates {
		if strings.TrimSpace(cand.NodeID) == "" {
			continue
		}
		// A missing cache entry yields the zero value, i.e. no hints.
		known := cached[cand.NodeID]

		next := PeerConnectionIntent{
			NodeID:            cand.NodeID,
			Action:            connectionIntentAction(cand),
			Reason:            cand.Reason,
			Endpoint:          cand.Endpoint,
			ConnectionState:   cand.ConnectionState,
			Transport:         firstNonEmpty(cand.BestTransport, known.BestTransport),
			Reachability:      known.BestReachability,
			ConnectivityMode:  known.BestConnectivity,
			NATType:           known.BestNATType,
			PolicyTags:        append([]string{}, known.BestPolicyTags...),
			BestCandidateID:   firstNonEmpty(cand.BestCandidateID, known.BestCandidateID),
			RendezvousLeaseID: known.RendezvousLeaseID,
			RelayNodeID:       known.RelayNodeID,
			RelayEndpoint:     known.RelayEndpoint,
			RelayCandidate:    known.RelayControl,
			ControlPlaneOnly:  known.RelayControl,
			RecoverySeed:      cand.RecoverySeed || known.RecoverySeed,
			Priority:          cand.Priority,
			GeneratedAt:       generatedAt,
		}
		next.TransportMode, next.RequiresRendezvous, next.DirectCandidate = classifyPeerTransport(next)

		if next.RequiresRendezvous {
			if lease, found := rendezvousLeaseForPeer(cfg.RendezvousLeases, next.NodeID, generatedAt); found {
				applyRendezvousLease(&next, lease)
			}
		}
		planned = append(planned, next)
	}

	sort.SliceStable(planned, func(a, b int) bool {
		if planned[a].Priority == planned[b].Priority {
			return planned[a].NodeID < planned[b].NodeID
		}
		return planned[a].Priority > planned[b].Priority
	})

	plan := PeerConnectionIntentPlan{
		Mode:        cfg.RecoveryPlan.Mode,
		IntentCount: len(planned),
		GeneratedAt: generatedAt,
		Intents:     planned,
	}
	for idx := range planned {
		current := &planned[idx]

		switch current.Action {
		case PeerConnectionIntentMaintain:
			plan.MaintainCount++
		case PeerConnectionIntentProbe:
			plan.ProbeCount++
		case PeerConnectionIntentRecover:
			plan.RecoverCount++
		}

		switch current.TransportMode {
		case PeerTransportModeDirect:
			plan.DirectCount++
		case PeerTransportModePrivateLAN:
			plan.PrivateLANCount++
		case PeerTransportModeCorporateLAN:
			plan.CorporateLANCount++
		case PeerTransportModeOutboundOnly:
			plan.OutboundOnlyCount++
		case PeerTransportModeRelayRequired:
			plan.RelayRequiredCount++
		case PeerTransportModeRelayControl:
			plan.RelayControlCount++
		}

		if current.RequiresRendezvous {
			plan.RendezvousRequiredCount++
		}
		if current.RendezvousResolved {
			plan.RendezvousResolvedCount++
		}
		if current.RendezvousLeaseID != "" {
			plan.RendezvousLeaseCount++
		}
	}
	return plan
}
// connectionIntentAction maps a recovery candidate's Reason to an intent
// action: "maintain_ready" maintains, the four "recover_*" reasons recover,
// and anything else is probed.
func connectionIntentAction(candidate PeerRecoveryCandidate) string {
	reason := candidate.Reason
	if reason == "maintain_ready" {
		return PeerConnectionIntentMaintain
	}
	if reason == "recover_degraded" || reason == "recover_seed" || reason == "recover_warm" || reason == "recover_peer" {
		return PeerConnectionIntentRecover
	}
	return PeerConnectionIntentProbe
}
// classifyPeerTransport derives (transport mode, requires rendezvous, direct
// candidate) from an intent's transport hints. Rules are tried in order and
// the first match wins: relay, outbound-only, corporate LAN tags, private LAN
// (tag, reachability or private endpoint host), then direct; otherwise the
// mode is unknown. All comparisons are case-insensitive and trimmed.
func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) {
	normalize := func(value string) string {
		return strings.ToLower(strings.TrimSpace(value))
	}
	transport := normalize(intent.Transport)
	connectivity := normalize(intent.ConnectivityMode)
	reachability := normalize(intent.Reachability)
	tags := lowerStringSet(intent.PolicyTags)

	switch {
	case strings.Contains(transport, "relay"), connectivity == "relay_required", reachability == "relay":
		return PeerTransportModeRelayRequired, true, false
	case connectivity == "outbound_only", reachability == "outbound_only":
		return PeerTransportModeOutboundOnly, true, false
	case tags["corp-lan"], tags["same-site"]:
		return PeerTransportModeCorporateLAN, false, true
	case tags["private-lan"], reachability == "private", endpointHasPrivateHost(intent.Endpoint):
		return PeerTransportModePrivateLAN, false, true
	case strings.Contains(transport, "direct"), reachability == "public", connectivity == "direct":
		return PeerTransportModeDirect, false, true
	default:
		return PeerTransportModeUnknown, false, false
	}
}
// rendezvousLeaseForPeer picks the best usable lease for peerNodeID.
//
// A lease is usable when it names the peer, has a relay node and endpoint, is
// control-plane-only and expires after now. Among usable leases the lowest
// Priority wins (non-positive priorities count as 100), then the latest
// expiry, then the smallest LeaseID. The second result is false when no lease
// qualifies.
func rendezvousLeaseForPeer(leases []PeerRendezvousLease, peerNodeID string, now time.Time) (PeerRendezvousLease, bool) {
	now = normalizedNow(now)

	usable := make([]PeerRendezvousLease, 0, len(leases))
	for _, lease := range leases {
		forPeer := strings.TrimSpace(lease.PeerNodeID) == peerNodeID
		hasRelay := strings.TrimSpace(lease.RelayEndpoint) != "" && strings.TrimSpace(lease.RelayNodeID) != ""
		unexpired := !lease.ExpiresAt.IsZero() && lease.ExpiresAt.After(now)
		if forPeer && hasRelay && lease.ControlPlaneOnly && unexpired {
			usable = append(usable, lease)
		}
	}
	if len(usable) == 0 {
		return PeerRendezvousLease{}, false
	}

	sort.SliceStable(usable, func(a, b int) bool {
		first, second := usable[a], usable[b]

		firstPriority, secondPriority := first.Priority, second.Priority
		if firstPriority <= 0 {
			firstPriority = 100
		}
		if secondPriority <= 0 {
			secondPriority = 100
		}

		switch {
		case firstPriority != secondPriority:
			return firstPriority < secondPriority
		case !first.ExpiresAt.Equal(second.ExpiresAt):
			return first.ExpiresAt.After(second.ExpiresAt)
		default:
			return first.LeaseID < second.LeaseID
		}
	})
	return usable[0], true
}
// applyRendezvousLease rewrites intent to go through the lease's relay: the
// endpoint becomes the relay endpoint (trimmed, trailing slashes removed), the
// mode becomes relay_control, and the rendezvous is marked resolved. The
// lease's connectivity mode replaces the intent's only when it is non-empty.
func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease) {
	relayEndpoint := strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")

	intent.Endpoint = relayEndpoint
	intent.RelayEndpoint = relayEndpoint
	intent.RelayNodeID = lease.RelayNodeID
	intent.RendezvousLeaseID = lease.LeaseID

	intent.Transport = firstNonEmpty(lease.Transport, "relay_control")
	intent.TransportMode = PeerTransportModeRelayControl
	if lease.ConnectivityMode != "" {
		intent.ConnectivityMode = lease.ConnectivityMode
	}

	intent.RequiresRendezvous = false
	intent.RendezvousResolved = true
	intent.DirectCandidate = false
	intent.RelayCandidate = true
	intent.ControlPlaneOnly = true
}
// endpointHasPrivateHost reports whether the endpoint's host is a literal IP
// address that is private, loopback or link-local unicast. The endpoint may be
// a URL, a host:port pair or a bare address; host names are never resolved and
// always yield false.
func endpointHasPrivateHost(rawEndpoint string) bool {
	endpoint := strings.TrimSpace(rawEndpoint)
	if endpoint == "" {
		return false
	}

	// Prefer the URL authority when the endpoint parses as a URL with a host.
	hostPort := endpoint
	if parsed, err := url.Parse(endpoint); err == nil && parsed.Host != "" {
		hostPort = parsed.Host
	}

	// Strip a port if present; otherwise keep the value as-is.
	bareHost := hostPort
	if withoutPort, _, err := net.SplitHostPort(hostPort); err == nil {
		bareHost = withoutPort
	}

	ip, err := netip.ParseAddr(strings.Trim(bareHost, "[]"))
	if err != nil {
		return false
	}
	switch {
	case ip.IsPrivate(), ip.IsLoopback(), ip.IsLinkLocalUnicast():
		return true
	}
	return false
}
// lowerStringSet returns a set of the trimmed, lower-cased, non-empty values.
// The result is never nil.
func lowerStringSet(values []string) map[string]bool {
	set := map[string]bool{}
	for _, raw := range values {
		key := strings.ToLower(strings.TrimSpace(raw))
		if key == "" {
			continue
		}
		set[key] = true
	}
	return set
}
// firstNonEmpty returns the first value that is non-empty after trimming
// whitespace, in trimmed form, or "" when every value is blank.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
@@ -0,0 +1,234 @@
package mesh
import (
"testing"
"time"
)
func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "http://10.24.10.20:19001",
BestTransport: "direct_tcp_tls",
BestReachability: "private",
BestConnectivity: "direct",
BestPolicyTags: []string{"corp-lan", "same-site"},
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeSteady,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "http://10.24.10.20:19001",
ConnectionState: PeerConnectionReady,
Reason: "maintain_ready",
Priority: 100,
},
},
},
Now: now,
})
if plan.IntentCount != 1 || plan.MaintainCount != 1 || plan.CorporateLANCount != 1 {
t.Fatalf("unexpected plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.Action != PeerConnectionIntentMaintain || intent.TransportMode != PeerTransportModeCorporateLAN || intent.RequiresRendezvous {
t.Fatalf("unexpected corporate intent: %+v", intent)
}
}
func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
BestTransport: "direct_tcp_tls",
BestReachability: "outbound_only",
BestConnectivity: "outbound_only",
},
{
NodeID: "node-c",
Endpoint: "relay://fabric-relay/node-c",
BestTransport: "relay",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_warm",
Priority: 90,
},
{
NodeID: "node-c",
Endpoint: "relay://fabric-relay/node-c",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_seed",
Priority: 80,
},
},
},
Now: now,
})
if plan.RecoverCount != 2 || plan.OutboundOnlyCount != 1 || plan.RelayRequiredCount != 1 || plan.RendezvousRequiredCount != 2 {
t.Fatalf("unexpected rendezvous counts: %+v", plan)
}
if plan.Intents[0].Action != PeerConnectionIntentRecover || plan.Intents[1].Action != PeerConnectionIntentRecover {
t.Fatalf("unexpected actions: %+v", plan.Intents)
}
}
func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
BestTransport: "relay",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_warm",
Priority: 100,
},
},
},
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
},
},
Now: now,
})
if plan.IntentCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.TransportMode != PeerTransportModeRelayControl ||
intent.Endpoint != "http://node-r:19000" ||
intent.RelayNodeID != "node-r" ||
intent.RendezvousLeaseID != "lease-node-b-via-node-r" ||
!intent.RelayCandidate ||
!intent.RendezvousResolved ||
intent.RequiresRendezvous {
t.Fatalf("unexpected resolved rendezvous intent: %+v", intent)
}
}
func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
BestTransport: "relay",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
ConnectionState: PeerConnectionWaiting,
Reason: "recover_warm",
Priority: 100,
},
},
},
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-expired-preferred",
PeerNodeID: "node-b",
RelayNodeID: "node-r-old",
RelayEndpoint: "http://node-r-old:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 1,
ControlPlaneOnly: true,
IssuedAt: now.Add(-10 * time.Minute),
ExpiresAt: now.Add(-time.Second),
},
{
LeaseID: "lease-active-reselected",
PeerNodeID: "node-b",
RelayNodeID: "node-r-new",
RelayEndpoint: "http://node-r-new:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 20,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
},
},
Now: now,
})
if plan.RendezvousResolvedCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected reselected plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.RendezvousLeaseID != "lease-active-reselected" ||
intent.RelayNodeID != "node-r-new" ||
intent.Endpoint != "http://node-r-new:19000" {
t.Fatalf("expired lease was not skipped: %+v", intent)
}
}
func TestPeerConnectionIntentsClassifyPrivateEndpointWithoutCandidateHints(t *testing.T) {
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{NodeID: "node-b", Endpoint: "http://192.168.10.20:19001"},
}},
RecoveryPlan: PeerRecoveryPlan{Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "http://192.168.10.20:19001",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_peer",
Priority: 10,
},
}},
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
})
if plan.PrivateLANCount != 1 || plan.Intents[0].TransportMode != PeerTransportModePrivateLAN || !plan.Intents[0].DirectCandidate {
t.Fatalf("unexpected private endpoint classification: %+v", plan)
}
}
@@ -0,0 +1,304 @@
package mesh
import (
"context"
"net/http"
"strings"
"sync"
"time"
)
// Link status values stored in PeerConnectionProbeResult.LinkStatus.
const (
// PeerConnectionProbeReachable is reported when the health request to the
// intent's endpoint returned without error.
PeerConnectionProbeReachable = "reachable"
// PeerConnectionProbeUnreachable is reported when the health request failed.
PeerConnectionProbeUnreachable = "unreachable"
// PeerConnectionProbeDeferred is reported when no request was sent because the
// intent still requires rendezvous or has no usable endpoint/candidate.
PeerConnectionProbeDeferred = "deferred"
// PeerConnectionProbeSkipped is reported when the tracker declined the probe
// (the probe path records the reason "backoff_active").
PeerConnectionProbeSkipped = "skipped"
)
const (
// DefaultPeerConnectionProbeTimeout is the per-probe timeout used by
// NewPeerConnectionManager when the configured ProbeTimeout is zero or negative.
DefaultPeerConnectionProbeTimeout = 2 * time.Second
)
// PeerConnectionManagerConfig carries the dependencies and tunables consumed by
// NewPeerConnectionManager.
type PeerConnectionManagerConfig struct {
// Local is the identity of this node; it is the sender of health probes.
Local PeerIdentity
// PeerCache supplies the peers to plan probes for. ProbeOnce is a no-op
// while it is nil.
PeerCache *PeerCache
// Tracker records probe outcomes. ProbeOnce is a no-op while it is nil.
Tracker *PeerConnectionTracker
// RendezvousLeases is copied by the constructor and passed to intent planning.
RendezvousLeases []PeerRendezvousLease
// HTTPClient is used for probes; when nil the constructor builds one.
HTTPClient *http.Client
// ProbeTimeout bounds a single probe; values <= 0 select
// DefaultPeerConnectionProbeTimeout.
ProbeTimeout time.Duration
// Now is the clock; when nil the constructor uses time.Now in UTC.
Now func() time.Time
}
// PeerConnectionManager runs probe cycles against planned peer connection
// intents and remembers the most recent cycle.
type PeerConnectionManager struct {
local PeerIdentity
peerCache *PeerCache
tracker *PeerConnectionTracker
rendezvousLeases []PeerRendezvousLease
httpClient *http.Client
probeTimeout time.Duration
now func() time.Time
// mu is held while reading or replacing peerCache, rendezvousLeases and
// lastCycle.
mu sync.Mutex
lastCycle PeerConnectionManagerCycle
}
// PeerConnectionManagerCycle is the report produced by one ProbeOnce call: the
// plans it was derived from, per-intent results, and outcome counters.
type PeerConnectionManagerCycle struct {
// Mode is copied from the recovery plan's mode.
Mode string `json:"mode"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
ProbeTimeoutMs int `json:"probe_timeout_ms"`
// IntentCount and the three rendezvous/relay counters below are copied from
// the intent plan.
IntentCount int `json:"intent_count"`
// Attempted counts results that were reachable or unreachable, i.e. probes
// that were actually sent; Succeeded and Failed split that total.
Attempted int `json:"attempted"`
Succeeded int `json:"succeeded"`
Failed int `json:"failed"`
// Deferred and Skipped count results with the corresponding link status.
Deferred int `json:"deferred"`
Skipped int `json:"skipped"`
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RelayControlCount int `json:"relay_control_count"`
RecoveryPlan PeerRecoveryPlan `json:"recovery_plan"`
IntentPlan PeerConnectionIntentPlan `json:"intent_plan"`
// Results holds one entry per planned intent, in plan order.
Results []PeerConnectionProbeResult `json:"results,omitempty"`
}
// PeerConnectionManagerSnapshot is the read-only view returned by
// PeerConnectionManager.Snapshot.
type PeerConnectionManagerSnapshot struct {
// LastCycle is the cycle stored by the most recent ProbeOnce; it is the zero
// value until a cycle has completed.
LastCycle PeerConnectionManagerCycle `json:"last_cycle"`
}
// PeerConnectionProbeResult describes what happened to a single connection
// intent during a probe cycle. Most fields are copied from the intent; the
// outcome fields are LinkStatus, ConnectionState, LatencyMs and FailureReason.
type PeerConnectionProbeResult struct {
NodeID string `json:"node_id"`
// LinkStatus is one of the PeerConnectionProbe* constants.
LinkStatus string `json:"link_status"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
// ConnectionState is the tracker state returned for the peer after the
// outcome was recorded (or looked up, for skipped probes).
ConnectionState PeerConnectionState `json:"connection_state"`
TransportMode string `json:"transport_mode"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
// LatencyMs is the clock difference between probe start and completion,
// clamped at zero; it is only set for reachable results.
LatencyMs int `json:"latency_ms,omitempty"`
// FailureReason is the error text for unreachable results, or a fixed reason
// code for deferred and skipped ones.
FailureReason string `json:"failure_reason,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
// NewPeerConnectionManager builds a manager from cfg, filling in defaults:
//   - ProbeTimeout <= 0 becomes DefaultPeerConnectionProbeTimeout;
//   - a nil HTTPClient is replaced by a client with a small idle-connection
//     pool and an overall timeout one second longer than the probe timeout;
//   - a nil Now becomes time.Now in UTC.
//
// The rendezvous leases are copied so later changes to cfg.RendezvousLeases do
// not affect the manager.
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
	manager := &PeerConnectionManager{
		local:        cfg.Local,
		peerCache:    cfg.PeerCache,
		tracker:      cfg.Tracker,
		httpClient:   cfg.HTTPClient,
		probeTimeout: cfg.ProbeTimeout,
		now:          cfg.Now,
	}

	manager.rendezvousLeases = make([]PeerRendezvousLease, len(cfg.RendezvousLeases))
	copy(manager.rendezvousLeases, cfg.RendezvousLeases)

	if manager.probeTimeout <= 0 {
		manager.probeTimeout = DefaultPeerConnectionProbeTimeout
	}
	if manager.httpClient == nil {
		transport := &http.Transport{
			MaxIdleConns:        64,
			MaxIdleConnsPerHost: 8,
			IdleConnTimeout:     90 * time.Second,
		}
		manager.httpClient = &http.Client{
			Transport: transport,
			// Leave headroom above the per-probe context deadline.
			Timeout: manager.probeTimeout + time.Second,
		}
	}
	if manager.now == nil {
		manager.now = func() time.Time { return time.Now().UTC() }
	}
	return manager
}
// ProbeOnce runs a single probe cycle: it snapshots the peer cache and tracker,
// derives a recovery plan and connection intents, probes each intent in plan
// order, tallies the outcomes, stores the cycle as the last cycle and returns
// it. A nil manager, nil peer cache or nil tracker yields the zero cycle.
func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionManagerCycle {
	cache, leases := m.peerConfigSnapshot()
	if m == nil || cache == nil || m.tracker == nil {
		return PeerConnectionManagerCycle{}
	}

	cycleStart := normalizedNow(m.now())
	cacheView := cache.Snapshot()
	recovery := PlanPeerRecovery(PeerRecoveryPlanConfig{
		PeerCache:          cacheView,
		Connections:        m.tracker.Snapshot(),
		TargetReadyPeers:   DefaultStablePeerTarget,
		MaxProbeCandidates: DefaultRecoveryProbeLimit,
		Now:                cycleStart,
	})
	intents := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
		PeerCache:        cacheView,
		RecoveryPlan:     recovery,
		RendezvousLeases: leases,
		Now:              cycleStart,
	})

	cycle := PeerConnectionManagerCycle{
		Mode:                    recovery.Mode,
		StartedAt:               cycleStart,
		ProbeTimeoutMs:          int(m.probeTimeout.Milliseconds()),
		IntentCount:             intents.IntentCount,
		RendezvousRequiredCount: intents.RendezvousRequiredCount,
		RendezvousResolvedCount: intents.RendezvousResolvedCount,
		RelayControlCount:       intents.RelayControlCount,
		RecoveryPlan:            recovery,
		IntentPlan:              intents,
		Results:                 make([]PeerConnectionProbeResult, 0, len(intents.Intents)),
	}

	for _, intent := range intents.Intents {
		outcome := m.probeIntent(ctx, intent)
		cycle.Results = append(cycle.Results, outcome)

		switch outcome.LinkStatus {
		case PeerConnectionProbeReachable, PeerConnectionProbeUnreachable:
			// Only these two statuses mean a request was actually sent.
			cycle.Attempted++
			if outcome.LinkStatus == PeerConnectionProbeReachable {
				cycle.Succeeded++
			} else {
				cycle.Failed++
			}
		case PeerConnectionProbeDeferred:
			cycle.Deferred++
		case PeerConnectionProbeSkipped:
			cycle.Skipped++
		}
	}
	cycle.CompletedAt = normalizedNow(m.now())

	m.mu.Lock()
	m.lastCycle = cycle
	m.mu.Unlock()
	return cycle
}
// Snapshot returns the most recently stored probe cycle. A nil manager yields
// the zero snapshot.
func (m *PeerConnectionManager) Snapshot() PeerConnectionManagerSnapshot {
	var snapshot PeerConnectionManagerSnapshot
	if m != nil {
		m.mu.Lock()
		snapshot.LastCycle = m.lastCycle
		m.mu.Unlock()
	}
	return snapshot
}
// UpdatePeerConfig replaces the peer cache and the rendezvous leases used by
// subsequent probe cycles. The lease slice is copied, so the caller keeps
// ownership of rendezvousLeases. Calling it on a nil manager does nothing.
func (m *PeerConnectionManager) UpdatePeerConfig(peerCache *PeerCache, rendezvousLeases []PeerRendezvousLease) {
	if m == nil {
		return
	}
	leasesCopy := make([]PeerRendezvousLease, len(rendezvousLeases))
	copy(leasesCopy, rendezvousLeases)

	m.mu.Lock()
	m.peerCache, m.rendezvousLeases = peerCache, leasesCopy
	m.mu.Unlock()
}
// peerConfigSnapshot returns the current peer cache pointer and a private copy
// of the rendezvous leases, both read under the manager's mutex. A nil manager
// yields (nil, nil).
func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvousLease) {
	if m == nil {
		return nil, nil
	}
	m.mu.Lock()
	defer m.mu.Unlock()

	leases := make([]PeerRendezvousLease, len(m.rendezvousLeases))
	copy(leases, m.rendezvousLeases)
	return m.peerCache, leases
}
// probeIntent evaluates one connection intent and records the outcome in the
// tracker. In order, it:
//  1. defers the intent when rendezvous is still required;
//  2. defers it when there is no endpoint or no direct/relay candidate;
//  3. skips it when the tracker says the peer should not be probed yet;
//  4. otherwise sends a health message to the intent's endpoint, bounded by the
//     manager's probe timeout, and records success or failure.
//
// The clock is read once at the start and once on whichever path completes.
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent) PeerConnectionProbeResult {
	startedAt := normalizedNow(m.now())

	result := PeerConnectionProbeResult{
		NodeID:             intent.NodeID,
		Action:             intent.Action,
		Reason:             intent.Reason,
		Endpoint:           intent.Endpoint,
		TransportMode:      intent.TransportMode,
		RequiresRendezvous: intent.RequiresRendezvous,
		RendezvousResolved: intent.RendezvousResolved,
		DirectCandidate:    intent.DirectCandidate,
		RelayCandidate:     intent.RelayCandidate,
		RendezvousLeaseID:  intent.RendezvousLeaseID,
		RelayNodeID:        intent.RelayNodeID,
		RelayEndpoint:      intent.RelayEndpoint,
		StartedAt:          startedAt,
	}
	// Cache-entry view of the intent, in the shape the tracker expects.
	peer := PeerCacheEntry{
		NodeID:            intent.NodeID,
		Endpoint:          intent.Endpoint,
		Warm:              true,
		WarmReason:        intent.Reason,
		RecoverySeed:      intent.RecoverySeed,
		BestCandidateID:   intent.BestCandidateID,
		BestTransport:     intent.Transport,
		RendezvousLeaseID: intent.RendezvousLeaseID,
		RelayNodeID:       intent.RelayNodeID,
		RelayEndpoint:     intent.RelayEndpoint,
		RelayControl:      intent.RelayCandidate,
	}

	// deferProbe finalises the result as deferred for the given reason.
	deferProbe := func(reason string) PeerConnectionProbeResult {
		result.LinkStatus = PeerConnectionProbeDeferred
		result.FailureReason = reason
		result.ConnectionState = m.tracker.RecordDeferred(peer, reason, startedAt)
		result.CompletedAt = normalizedNow(m.now())
		return result
	}

	endpointMissing := strings.TrimSpace(intent.Endpoint) == ""
	hasCandidate := intent.DirectCandidate || intent.RelayCandidate
	switch {
	case intent.RequiresRendezvous:
		return deferProbe("rendezvous_required")
	case endpointMissing || !hasCandidate:
		if intent.RelayCandidate {
			return deferProbe("relay_candidate_unavailable")
		}
		return deferProbe("direct_candidate_unavailable")
	case !m.tracker.ShouldProbe(intent.NodeID, startedAt):
		result.LinkStatus = PeerConnectionProbeSkipped
		result.FailureReason = "backoff_active"
		result.ConnectionState = m.connectionState(intent.NodeID)
		result.CompletedAt = normalizedNow(m.now())
		return result
	}

	m.tracker.BeginProbe(peer, startedAt)
	probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
	defer cancel()

	// A relay-control probe is addressed to the relay node, not the final peer.
	targetNodeID := intent.NodeID
	if intent.RelayCandidate && intent.RelayNodeID != "" {
		targetNodeID = intent.RelayNodeID
	}
	target := PeerIdentity{
		ClusterID: m.local.ClusterID,
		NodeID:    targetNodeID,
	}

	client := NewClient(strings.TrimRight(intent.Endpoint, "/")).withHTTPClient(m.httpClient)
	_, err := client.SendHealth(probeCtx, NewHealthMessage(m.local, target))
	completedAt := normalizedNow(m.now())
	result.CompletedAt = completedAt

	if err != nil {
		result.LinkStatus = PeerConnectionProbeUnreachable
		result.FailureReason = err.Error()
		result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, err.Error(), completedAt)
		return result
	}

	elapsedMs := int(completedAt.Sub(startedAt).Milliseconds())
	if elapsedMs < 0 {
		// An injected clock may step backwards; never report negative latency.
		elapsedMs = 0
	}
	result.LinkStatus = PeerConnectionProbeReachable
	result.LatencyMs = elapsedMs
	if intent.RelayCandidate {
		result.ConnectionState = m.tracker.RecordRelayReady(peer, elapsedMs, completedAt)
		return result
	}
	result.ConnectionState = m.tracker.RecordSuccess(intent.NodeID, elapsedMs, completedAt)
	return result
}
// connectionState looks up nodeID in a fresh tracker snapshot and returns its
// state. When the tracker has no entry for the node, a disconnected
// placeholder carrying only the node ID is returned.
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
	tracked := m.tracker.Snapshot().Entries
	for i := range tracked {
		if tracked[i].NodeID == nodeID {
			return tracked[i]
		}
	}
	return PeerConnectionState{State: PeerConnectionDisconnected, NodeID: nodeID}
}
// withHTTPClient returns a copy of c that sends requests through httpClient.
// The receiver is a value, so the caller's Client is left unchanged.
func (c Client) withHTTPClient(httpClient *http.Client) Client {
	derived := c
	derived.HTTPClient = httpClient
	return derived
}
@@ -0,0 +1,190 @@
package mesh
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestPeerConnectionManagerProbesDirectAndDefersRendezvous runs one probe cycle
// over two peers: node-b, reachable directly through a local test server, and
// node-c, which only advertises a relay endpoint. It expects node-b to be
// probed successfully and node-c to be deferred pending rendezvous.
func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
	start := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)

	// Stepping clock: every read advances time by 10ms.
	clock := start
	tick := func() time.Time {
		clock = clock.Add(10 * time.Millisecond)
		return clock
	}

	server := httptest.NewServer(Server{
		Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
	}.Handler())
	defer server.Close()

	local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
	directCandidate := PeerEndpointCandidate{
		EndpointID:       "node-b-direct",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          server.URL,
		Reachability:     "private",
		ConnectivityMode: "direct",
		PolicyTags:       []string{"corp-lan", "same-site"},
		Priority:         1,
	}
	relayCandidate := PeerEndpointCandidate{
		EndpointID:       "node-c-relay",
		NodeID:           "node-c",
		Transport:        "relay",
		Address:          "relay://fabric/node-c",
		Reachability:     "relay",
		ConnectivityMode: "relay_required",
		Priority:         1,
	}
	cache := NewPeerCache(PeerCacheConfig{
		Local: local,
		PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
			"node-b": {directCandidate},
			"node-c": {relayCandidate},
		},
		WarmPeerLimit: 2,
		Now:           start,
	})
	tracker := NewPeerConnectionTracker(cache.Snapshot(), start)
	manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
		Local:        local,
		PeerCache:    cache,
		Tracker:      tracker,
		ProbeTimeout: time.Second,
		Now:          tick,
	})

	cycle := manager.ProbeOnce(context.Background())
	cycleOK := cycle.Attempted == 1 &&
		cycle.Succeeded == 1 &&
		cycle.Deferred == 1 &&
		cycle.RendezvousRequiredCount == 1
	if !cycleOK {
		t.Fatalf("unexpected cycle: %+v", cycle)
	}

	tracked := tracker.Snapshot()
	if !(tracked.Ready == 1 && tracked.Waiting == 1) {
		t.Fatalf("unexpected tracker snapshot: %+v", tracked)
	}

	first := cycle.Results[0]
	if !(first.NodeID == "node-b" && first.LinkStatus == PeerConnectionProbeReachable) {
		t.Fatalf("direct peer was not probed first: %+v", cycle.Results)
	}
	second := cycle.Results[1]
	if !(second.NodeID == "node-c" && second.LinkStatus == PeerConnectionProbeDeferred) {
		t.Fatalf("relay peer was not deferred: %+v", cycle.Results)
	}
}
// TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff probes a
// peer at 127.0.0.1:1 three times, expects the tracker to put it in backoff,
// and then expects a fourth cycle to contain no results for it at all.
func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testing.T) {
	start := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)

	// Stepping clock: every read advances time by 10ms.
	clock := start
	tick := func() time.Time {
		clock = clock.Add(10 * time.Millisecond)
		return clock
	}

	local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
	cache := NewPeerCache(PeerCacheConfig{
		Local:         local,
		PeerEndpoints: map[string]string{"node-b": "http://127.0.0.1:1"},
		WarmPeerLimit: 1,
		Now:           start,
	})
	tracker := NewPeerConnectionTracker(cache.Snapshot(), start)
	manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
		Local:        local,
		PeerCache:    cache,
		Tracker:      tracker,
		HTTPClient:   &http.Client{Timeout: 20 * time.Millisecond},
		ProbeTimeout: 20 * time.Millisecond,
		Now:          tick,
	})

	// Three cycles, each expected to record one failure for node-b.
	for attempt := 0; attempt < 3; attempt++ {
		manager.ProbeOnce(context.Background())
	}

	tracked := tracker.Snapshot()
	if tracked.Backoff != 1 {
		t.Fatalf("expected backoff after repeated failures: %+v", tracked)
	}

	cycle := manager.ProbeOnce(context.Background())
	if !(cycle.Attempted == 0 && len(cycle.Results) == 0) {
		t.Fatalf("active backoff peer should not be attempted: %+v", cycle)
	}
}
// TestPeerConnectionManagerProbesRelayControlLease gives the manager a
// relay-only peer (node-b) plus a valid rendezvous lease through node-r, whose
// endpoint is a local test server. It expects the cycle to probe the relay
// successfully and the tracker to mark node-b as relay-ready.
func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
	start := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)

	// Stepping clock: every read advances time by 10ms.
	clock := start
	tick := func() time.Time {
		clock = clock.Add(10 * time.Millisecond)
		return clock
	}

	relayServer := httptest.NewServer(Server{
		Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
	}.Handler())
	defer relayServer.Close()

	local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
	lease := PeerRendezvousLease{
		LeaseID:          "lease-node-b-via-node-r",
		PeerNodeID:       "node-b",
		RelayNodeID:      "node-r",
		RelayEndpoint:    relayServer.URL,
		Transport:        "relay_control",
		ConnectivityMode: "relay_required",
		Priority:         10,
		ControlPlaneOnly: true,
		IssuedAt:         start.Add(-time.Minute),
		ExpiresAt:        start.Add(time.Minute),
	}
	leases := []PeerRendezvousLease{lease}

	relayOnly := PeerEndpointCandidate{
		EndpointID:       "node-b-relay",
		NodeID:           "node-b",
		Transport:        "relay",
		Address:          "relay://fabric/node-b",
		Reachability:     "relay",
		ConnectivityMode: "relay_required",
		Priority:         10,
	}
	cache := NewPeerCache(PeerCacheConfig{
		Local: local,
		PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
			"node-b": {relayOnly},
		},
		RendezvousLeases: leases,
		WarmPeerLimit:    1,
		Now:              start,
	})
	tracker := NewPeerConnectionTracker(cache.Snapshot(), start)
	manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
		Local:            local,
		PeerCache:        cache,
		Tracker:          tracker,
		RendezvousLeases: leases,
		ProbeTimeout:     time.Second,
		Now:              tick,
	})

	cycle := manager.ProbeOnce(context.Background())
	countsOK := cycle.Attempted == 1 &&
		cycle.Succeeded == 1 &&
		cycle.Deferred == 0 &&
		cycle.RelayControlCount == 1 &&
		cycle.RendezvousResolvedCount == 1 &&
		cycle.RendezvousRequiredCount == 0
	if !countsOK {
		t.Fatalf("unexpected relay-control cycle: %+v", cycle)
	}

	// Short-circuits on the length so Results[0] is only read when it exists.
	resultOK := len(cycle.Results) == 1 &&
		cycle.Results[0].NodeID == "node-b" &&
		cycle.Results[0].RelayNodeID == "node-r" &&
		cycle.Results[0].ConnectionState.State == PeerConnectionRelayReady
	if !resultOK {
		t.Fatalf("unexpected relay-control result: %+v", cycle.Results)
	}

	tracked := tracker.Snapshot()
	if !(tracked.RelayReady == 1 && tracked.Waiting == 0) {
		t.Fatalf("unexpected tracker snapshot: %+v", tracked)
	}
}
@@ -0,0 +1,284 @@
package mesh
import (
"sort"
"sync"
"time"
)
// Connection states stored in PeerConnectionState.State.
const (
// PeerConnectionDisconnected is the initial state of every tracked peer.
PeerConnectionDisconnected = "disconnected"
// PeerConnectionConnecting is set by BeginProbe while a probe is in flight.
PeerConnectionConnecting = "connecting"
// PeerConnectionReady is set by RecordSuccess for latencies below 500ms.
PeerConnectionReady = "ready"
// PeerConnectionRelayReady is set by RecordRelayReady.
PeerConnectionRelayReady = "relay_ready"
// PeerConnectionDegraded is set by RecordSuccess for latencies of 500ms or
// more, and by RecordFailure for the first two consecutive failures.
PeerConnectionDegraded = "degraded"
// PeerConnectionBackoff is set by RecordFailure from the third consecutive
// failure onwards.
PeerConnectionBackoff = "backoff"
// PeerConnectionWaiting is set by RecordDeferred.
PeerConnectionWaiting = "waiting_rendezvous"
)
// Backoff tuning used by peerConnectionBackoffDuration.
const (
// peerConnectionBackoffBase is the backoff step: the delay grows by this
// amount for each consecutive failure beyond the second.
peerConnectionBackoffBase = 5 * time.Second
// peerConnectionBackoffMax caps the computed backoff.
peerConnectionBackoffMax = time.Minute
)
// PeerConnectionTracker keeps the per-peer connection state machine. Its
// exported methods take mu before touching entries and tolerate a nil receiver.
type PeerConnectionTracker struct {
mu sync.Mutex
// entries maps node ID to that peer's current state.
entries map[string]PeerConnectionState
}
// PeerConnectionState is the tracked connection state of one peer.
type PeerConnectionState struct {
NodeID string `json:"node_id"`
// State is one of the PeerConnection* state constants.
State string `json:"state"`
// Warm through RelayControl mirror the peer cache entry most recently passed
// to the tracker for this node.
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
// ConsecutiveSuccesses and ConsecutiveFailures are mutually resetting
// streak counters: recording one kind of outcome zeroes the other.
ConsecutiveSuccesses int `json:"consecutive_successes"`
ConsecutiveFailures int `json:"consecutive_failures"`
LastLatencyMs int `json:"last_latency_ms,omitempty"`
LastFailureReason string `json:"last_failure_reason,omitempty"`
LastTransitionAt time.Time `json:"last_transition_at"`
LastProbeAt time.Time `json:"last_probe_at,omitempty"`
// BackoffUntil is set when the peer enters backoff and cleared on success
// or deferral; ShouldProbe refuses probes while it lies in the future.
BackoffUntil time.Time `json:"backoff_until,omitempty"`
}
// PeerConnectionSnapshot is a point-in-time copy of the tracker's contents with
// per-state totals.
type PeerConnectionSnapshot struct {
// Total is the number of tracked peers.
Total int `json:"total"`
// The per-state fields below repeat the matching StateCounts values.
Ready int `json:"ready"`
RelayReady int `json:"relay_ready"`
Degraded int `json:"degraded"`
Backoff int `json:"backoff"`
Waiting int `json:"waiting_rendezvous"`
Connecting int `json:"connecting"`
Disconnected int `json:"disconnected"`
// StateCounts maps each state string to the number of peers in it; the
// known states are always present, even when zero.
StateCounts map[string]int `json:"state_counts"`
// Entries is sorted by node ID.
Entries []PeerConnectionState `json:"entries"`
// LastTransitionAt is the latest LastTransitionAt across all entries.
LastTransitionAt time.Time `json:"last_transition_at,omitempty"`
}
// NewPeerConnectionTracker creates a tracker seeded with one disconnected
// entry per warm peer in peerSnapshot. Peers that are not warm or have an empty
// node ID are ignored. now (normalised to UTC, or the current time when zero)
// becomes each seeded entry's LastTransitionAt.
func NewPeerConnectionTracker(peerSnapshot PeerCacheSnapshot, now time.Time) *PeerConnectionTracker {
	seededAt := normalizedNow(now)
	seeded := map[string]PeerConnectionState{}
	for _, peer := range peerSnapshot.Entries {
		if peer.Warm && peer.NodeID != "" {
			seeded[peer.NodeID] = PeerConnectionState{
				NodeID:           peer.NodeID,
				State:            PeerConnectionDisconnected,
				Warm:             peer.Warm,
				WarmReason:       peer.WarmReason,
				Endpoint:         peer.Endpoint,
				BestCandidateID:  peer.BestCandidateID,
				LastTransitionAt: seededAt,
			}
		}
	}
	return &PeerConnectionTracker{entries: seeded}
}
// ShouldProbe reports whether nodeID may be probed at now. It returns false
// only when the peer is in the backoff state with a BackoffUntil that is still
// in the future; a nil tracker or an untracked node is always probeable.
func (t *PeerConnectionTracker) ShouldProbe(nodeID string, now time.Time) bool {
	if t == nil {
		return true
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	state, tracked := t.entries[nodeID]
	if !tracked || state.State != PeerConnectionBackoff {
		return true
	}
	if state.BackoffUntil.IsZero() {
		return true
	}
	// The deadline itself counts as expired.
	return !state.BackoffUntil.After(normalizedNow(now))
}
// BeginProbe records that a probe of peer starts at now and returns the
// updated state. The peer's cached attributes are refreshed from peer and
// LastProbeAt is stamped.
//
// A peer that already has an established link (ready, relay_ready or degraded)
// keeps its state while it is re-probed; any other state moves to connecting.
// relay_ready is kept because the recovery planner treats it as equivalent to
// ready, and flipping it to connecting on every re-probe would bump
// LastTransitionAt twice per cycle without any real transition.
//
// A nil tracker returns the zero state.
func (t *PeerConnectionTracker) BeginProbe(peer PeerCacheEntry, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	now = normalizedNow(now)
	entry := t.entry(peer, now)
	switch entry.State {
	case PeerConnectionReady, PeerConnectionRelayReady, PeerConnectionDegraded:
		// Established link: keep reporting it until the probe says otherwise.
	default:
		entry.State = PeerConnectionConnecting
		entry.LastTransitionAt = now
	}
	entry.LastProbeAt = now
	t.entries[peer.NodeID] = entry
	return entry
}
// RecordSuccess records a successful direct probe of nodeID at now and returns
// the updated state. It extends the success streak, clears the failure streak,
// failure reason and backoff deadline, and moves the peer to ready, or to
// degraded when latencyMs is 500 or more. LastTransitionAt changes only when
// the state changes. An untracked node gets a new entry; a nil tracker returns
// the zero state.
func (t *PeerConnectionTracker) RecordSuccess(nodeID string, latencyMs int, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	at := normalizedNow(now)
	state := t.entries[nodeID]
	state.NodeID = nodeID
	state.ConsecutiveFailures = 0
	state.ConsecutiveSuccesses++
	state.LastFailureReason = ""
	state.BackoffUntil = time.Time{}
	state.LastLatencyMs = latencyMs
	state.LastProbeAt = at

	target := PeerConnectionReady
	if latencyMs >= 500 {
		// Reachable but slow.
		target = PeerConnectionDegraded
	}
	if state.State != target {
		state.State = target
		state.LastTransitionAt = at
	}

	t.entries[nodeID] = state
	return state
}
// RecordRelayReady records a successful relay-control probe for peer at now and
// returns the updated state. The peer's cached attributes are refreshed, the
// success streak is extended, failure bookkeeping is cleared and the state
// becomes relay_ready. LastTransitionAt changes only when the state changes.
// A nil tracker returns the zero state.
func (t *PeerConnectionTracker) RecordRelayReady(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	at := normalizedNow(now)
	state := t.entry(peer, at)
	state.ConsecutiveFailures = 0
	state.ConsecutiveSuccesses++
	state.LastFailureReason = ""
	state.BackoffUntil = time.Time{}
	state.LastLatencyMs = latencyMs
	state.LastProbeAt = at

	if alreadyRelayReady := state.State == PeerConnectionRelayReady; !alreadyRelayReady {
		state.State = PeerConnectionRelayReady
		state.LastTransitionAt = at
	}

	t.entries[peer.NodeID] = state
	return state
}
// RecordFailure records a failed probe of nodeID at now and returns the updated
// state. It extends the failure streak, clears the success streak and stores
// reason. The first two consecutive failures leave the peer degraded; from the
// third onwards it enters backoff with a deadline derived from the streak
// length. LastTransitionAt changes only when the state changes. An untracked
// node gets a new entry; a nil tracker returns the zero state.
func (t *PeerConnectionTracker) RecordFailure(nodeID string, reason string, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	at := normalizedNow(now)
	state := t.entries[nodeID]
	state.NodeID = nodeID
	state.ConsecutiveSuccesses = 0
	state.ConsecutiveFailures++
	state.LastFailureReason = reason
	state.LastProbeAt = at

	target := PeerConnectionDegraded
	if state.ConsecutiveFailures >= 3 {
		target = PeerConnectionBackoff
		state.BackoffUntil = at.Add(peerConnectionBackoffDuration(state.ConsecutiveFailures))
	}
	if state.State != target {
		state.State = target
		state.LastTransitionAt = at
	}

	t.entries[nodeID] = state
	return state
}
// RecordDeferred records that peer could not be probed (for example because it
// still needs a rendezvous lease) and returns the updated state. The peer's
// cached attributes are refreshed, the state becomes waiting_rendezvous, reason
// is stored as the last failure reason, and LastProbeAt and BackoffUntil are
// cleared because no probe was sent.
//
// LastTransitionAt is stamped only when the peer actually enters the waiting
// state, matching the other Record* methods. A peer that stays deferred cycle
// after cycle therefore keeps its original transition time instead of pushing
// the snapshot-wide LastTransitionAt forward on every cycle.
//
// A nil tracker returns the zero state.
func (t *PeerConnectionTracker) RecordDeferred(peer PeerCacheEntry, reason string, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	now = normalizedNow(now)
	entry := t.entry(peer, now)
	if entry.State != PeerConnectionWaiting {
		entry.State = PeerConnectionWaiting
		entry.LastTransitionAt = now
	}
	entry.LastFailureReason = reason
	entry.LastProbeAt = time.Time{}
	entry.BackoffUntil = time.Time{}
	t.entries[peer.NodeID] = entry
	return entry
}
// Snapshot returns a copy of all tracked peers sorted by node ID, together with
// per-state counts and the most recent transition time across all peers. Every
// known state has a key in StateCounts even when its count is zero. A nil
// tracker yields a snapshot with an empty, non-nil StateCounts map.
func (t *PeerConnectionTracker) Snapshot() PeerConnectionSnapshot {
	if t == nil {
		return PeerConnectionSnapshot{StateCounts: map[string]int{}}
	}

	stateCounts := map[string]int{
		PeerConnectionDisconnected: 0,
		PeerConnectionConnecting:   0,
		PeerConnectionReady:        0,
		PeerConnectionRelayReady:   0,
		PeerConnectionDegraded:     0,
		PeerConnectionBackoff:      0,
		PeerConnectionWaiting:      0,
	}

	t.mu.Lock()
	defer t.mu.Unlock()

	states := make([]PeerConnectionState, 0, len(t.entries))
	var newestTransition time.Time
	for _, state := range t.entries {
		states = append(states, state)
		stateCounts[state.State]++
		if state.LastTransitionAt.After(newestTransition) {
			newestTransition = state.LastTransitionAt
		}
	}
	// Map iteration order is random; sort for a deterministic snapshot.
	sort.SliceStable(states, func(a, b int) bool {
		return states[a].NodeID < states[b].NodeID
	})

	snapshot := PeerConnectionSnapshot{
		Total:            len(states),
		StateCounts:      stateCounts,
		Entries:          states,
		LastTransitionAt: newestTransition,
	}
	snapshot.Ready = stateCounts[PeerConnectionReady]
	snapshot.RelayReady = stateCounts[PeerConnectionRelayReady]
	snapshot.Degraded = stateCounts[PeerConnectionDegraded]
	snapshot.Backoff = stateCounts[PeerConnectionBackoff]
	snapshot.Waiting = stateCounts[PeerConnectionWaiting]
	snapshot.Connecting = stateCounts[PeerConnectionConnecting]
	snapshot.Disconnected = stateCounts[PeerConnectionDisconnected]
	return snapshot
}
// entry returns the tracked state for peer with its cached attributes (warmth,
// endpoint, best candidate and relay/rendezvous details) overwritten from peer.
// When the peer is not tracked yet, a disconnected state stamped with now is
// used as the starting point. The result is not stored and the method does no
// locking itself; every caller in this file holds t.mu and writes the entry back.
func (t *PeerConnectionTracker) entry(peer PeerCacheEntry, now time.Time) PeerConnectionState {
	state, tracked := t.entries[peer.NodeID]
	if !tracked {
		// state is the zero value here, so only the seed fields need setting.
		state.NodeID = peer.NodeID
		state.State = PeerConnectionDisconnected
		state.LastTransitionAt = now
	}

	state.Warm, state.WarmReason = peer.Warm, peer.WarmReason
	state.Endpoint = peer.Endpoint
	state.BestCandidateID = peer.BestCandidateID
	state.RendezvousLeaseID = peer.RendezvousLeaseID
	state.RelayNodeID, state.RelayEndpoint = peer.RelayNodeID, peer.RelayEndpoint
	state.RelayControl = peer.RelayControl
	return state
}
// peerConnectionBackoffDuration returns how long a peer should be left alone
// after the given number of consecutive failures. Fewer than three failures
// yield no backoff; from the third failure on the delay grows linearly by
// peerConnectionBackoffBase per extra failure, capped at
// peerConnectionBackoffMax.
func peerConnectionBackoffDuration(failures int) time.Duration {
	if failures < 3 {
		return 0
	}
	steps := time.Duration(failures - 2)
	if delay := peerConnectionBackoffBase * steps; delay <= peerConnectionBackoffMax {
		return delay
	}
	return peerConnectionBackoffMax
}
// normalizedNow converts now to UTC. A zero time is treated as "not supplied"
// and replaced by the current wall-clock time in UTC.
func normalizedNow(now time.Time) time.Time {
	instant := now
	if instant.IsZero() {
		instant = time.Now()
	}
	return instant.UTC()
}
@@ -0,0 +1,76 @@
package mesh
import (
"testing"
"time"
)
func TestPeerConnectionTrackerTransitionsReadyAndDegraded(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "http://node-b:19000"},
},
}, now)
begin := tracker.BeginProbe(PeerCacheEntry{NodeID: "node-b", Warm: true}, now.Add(time.Second))
if begin.State != PeerConnectionConnecting {
t.Fatalf("begin state = %q, want connecting", begin.State)
}
ready := tracker.RecordSuccess("node-b", 42, now.Add(2*time.Second))
if ready.State != PeerConnectionReady || ready.ConsecutiveSuccesses != 1 || ready.ConsecutiveFailures != 0 {
t.Fatalf("ready state unexpected: %+v", ready)
}
degraded := tracker.RecordSuccess("node-b", 800, now.Add(3*time.Second))
if degraded.State != PeerConnectionDegraded || degraded.LastLatencyMs != 800 {
t.Fatalf("degraded state unexpected: %+v", degraded)
}
}
// TestPeerConnectionTrackerBackoffAfterRepeatedFailures checks that the first
// failure degrades a peer, the third puts it into backoff, ShouldProbe honours
// the backoff deadline, and a later success fully recovers the peer.
func TestPeerConnectionTrackerBackoffAfterRepeatedFailures(t *testing.T) {
	start := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
	tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
		Entries: []PeerCacheEntry{{NodeID: "node-b", Warm: true}},
	}, start)

	afterFirst := tracker.RecordFailure("node-b", "timeout", start.Add(time.Second))
	if afterFirst.State != PeerConnectionDegraded {
		t.Fatalf("first failure state = %q, want degraded", afterFirst.State)
	}

	_ = tracker.RecordFailure("node-b", "timeout", start.Add(2*time.Second))
	afterThird := tracker.RecordFailure("node-b", "timeout", start.Add(3*time.Second))
	if !(afterThird.State == PeerConnectionBackoff && !afterThird.BackoffUntil.IsZero()) {
		t.Fatalf("third failure did not enter backoff: %+v", afterThird)
	}

	duringBackoff := start.Add(4 * time.Second)
	if tracker.ShouldProbe("node-b", duringBackoff) {
		t.Fatal("ShouldProbe returned true during backoff")
	}
	justAfterBackoff := afterThird.BackoffUntil.Add(time.Millisecond)
	if !tracker.ShouldProbe("node-b", justAfterBackoff) {
		t.Fatal("ShouldProbe returned false after backoff")
	}

	recovered := tracker.RecordSuccess("node-b", 12, afterThird.BackoffUntil.Add(time.Second))
	recoveredOK := recovered.State == PeerConnectionReady &&
		recovered.ConsecutiveFailures == 0 &&
		recovered.BackoffUntil.IsZero()
	if !recoveredOK {
		t.Fatalf("success did not recover from backoff: %+v", recovered)
	}
}
// TestPeerConnectionTrackerSnapshotCountsStates drives three peers into ready,
// degraded and backoff respectively and checks the snapshot's per-state totals.
func TestPeerConnectionTrackerSnapshotCountsStates(t *testing.T) {
	start := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
	tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
		Entries: []PeerCacheEntry{
			{NodeID: "node-a", Warm: true},
			{NodeID: "node-b", Warm: true},
			{NodeID: "node-c", Warm: true},
		},
	}, start)

	// node-a: one success -> ready.
	tracker.RecordSuccess("node-a", 25, start.Add(time.Second))
	// node-b: one failure -> degraded.
	tracker.RecordFailure("node-b", "timeout", start.Add(time.Second))
	// node-c: three failures in a row -> backoff.
	for second := 1; second <= 3; second++ {
		tracker.RecordFailure("node-c", "timeout", start.Add(time.Duration(second)*time.Second))
	}

	snapshot := tracker.Snapshot()
	countsOK := snapshot.Total == 3 &&
		snapshot.Ready == 1 &&
		snapshot.Degraded == 1 &&
		snapshot.Backoff == 1
	if !countsOK {
		t.Fatalf("unexpected snapshot: %+v", snapshot)
	}
}
@@ -0,0 +1,276 @@
package mesh
import (
"sort"
"strings"
"time"
)
// Recovery plan modes stored in PeerRecoveryPlan.Mode.
const (
// PeerRecoveryModeSteady is used when the ready-peer target is met.
PeerRecoveryModeSteady = "steady"
// PeerRecoveryModeRecovery is used when fewer peers are ready than targeted.
PeerRecoveryModeRecovery = "recovery"
)
// Defaults applied by PlanPeerRecovery when the config leaves a value at zero
// or below.
const (
// DefaultStablePeerTarget is the default number of peers to keep ready.
DefaultStablePeerTarget = 3
// DefaultRecoveryProbeLimit is the default cap on probe candidates per plan.
DefaultRecoveryProbeLimit = 6
)
// PeerRecoveryPlanConfig is the input to PlanPeerRecovery.
type PeerRecoveryPlanConfig struct {
// PeerCache lists the known peers; only entries with both a node ID and an
// endpoint are considered connectable.
PeerCache PeerCacheSnapshot
// Connections supplies the current tracked state per peer.
Connections PeerConnectionSnapshot
// TargetReadyPeers is the desired number of ready peers; values <= 0 select
// DefaultStablePeerTarget.
TargetReadyPeers int
// MaxProbeCandidates caps the candidate list; values <= 0 select
// DefaultRecoveryProbeLimit.
MaxProbeCandidates int
// Now is the planning time; a zero value means the current time.
Now time.Time
}
// PeerRecoveryPlan is the output of PlanPeerRecovery: a health summary plus the
// ordered list of peers worth probing.
type PeerRecoveryPlan struct {
// Mode is PeerRecoveryModeSteady or PeerRecoveryModeRecovery.
Mode string `json:"mode"`
// Healthy is true when Deficit is zero.
Healthy bool `json:"healthy"`
// TargetReadyPeers is the effective target after it has been capped at the
// number of connectable peers.
TargetReadyPeers int `json:"target_ready_peers"`
// ReadyPeerCount counts peers in the ready or relay_ready state.
ReadyPeerCount int `json:"ready_peer_count"`
DegradedPeerCount int `json:"degraded_peer_count"`
BackoffPeerCount int `json:"backoff_peer_count"`
// ConnectablePeerCount counts cache entries with a node ID and an endpoint.
ConnectablePeerCount int `json:"connectable_peer_count"`
// Deficit is the target minus the ready count, never negative.
Deficit int `json:"deficit"`
// ProbeCandidateCount is len(Candidates).
ProbeCandidateCount int `json:"probe_candidate_count"`
// RecoverySeedCandidateCount counts candidates flagged as recovery seeds.
RecoverySeedCandidateCount int `json:"recovery_seed_candidate_count"`
GeneratedAt time.Time `json:"generated_at"`
// Candidates is ordered by descending priority, then by node ID.
Candidates []PeerRecoveryCandidate `json:"candidates,omitempty"`
}
// PeerRecoveryCandidate is one peer selected for probing, combining attributes
// from its cache entry with its current connection state.
type PeerRecoveryCandidate struct {
NodeID string `json:"node_id"`
// Endpoint is the cache entry's endpoint with surrounding whitespace removed.
Endpoint string `json:"endpoint,omitempty"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
// ConnectionState is the tracked state string; peers without a tracked state
// are reported as disconnected.
ConnectionState string `json:"connection_state"`
ConsecutiveFailures int `json:"consecutive_failures,omitempty"`
LastLatencyMs int `json:"last_latency_ms,omitempty"`
BackoffUntil time.Time `json:"backoff_until,omitempty"`
// Reason says why the peer was selected, e.g. "maintain_ready" or
// "recover_warm".
Reason string `json:"reason"`
// Priority is the selection score; higher values are probed first.
Priority int `json:"priority"`
}
// peerRecoveryCandidateBuild is an internal wrapper that embeds
// PeerRecoveryCandidate and adds no fields of its own.
type peerRecoveryCandidateBuild struct {
PeerRecoveryCandidate
}
// PlanPeerRecovery decides which peers should be probed next.
//
// It first settles the effective ready-peer target (capped at the number of
// connectable peers) and the candidate limit (never below the target). It then
// counts ready, degraded and backoff peers among tracked connections whose
// cache entry has an endpoint. If fewer peers are ready than targeted the plan
// is in recovery mode; otherwise it is steady and the limit shrinks to the
// target.
//
// Candidates are cache entries with a node ID and endpoint that are not in an
// unexpired backoff and for which peerRecoveryCandidateReason yields a reason.
// They are ordered by descending priority, ties broken by node ID, and
// truncated to the limit.
func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
	generatedAt := normalizedNow(cfg.Now)

	targetReady := cfg.TargetReadyPeers
	if targetReady <= 0 {
		targetReady = DefaultStablePeerTarget
	}
	probeLimit := cfg.MaxProbeCandidates
	if probeLimit <= 0 {
		probeLimit = DefaultRecoveryProbeLimit
	}
	connectableCount := connectablePeerCount(cfg.PeerCache)
	if connectableCount < targetReady {
		targetReady = connectableCount
	}
	if probeLimit < targetReady {
		probeLimit = targetReady
	}

	// Index connections and cache entries by node ID; later duplicates win.
	stateByNode := map[string]PeerConnectionState{}
	for _, state := range cfg.Connections.Entries {
		if strings.TrimSpace(state.NodeID) != "" {
			stateByNode[state.NodeID] = state
		}
	}
	cachedByNode := map[string]PeerCacheEntry{}
	for _, cached := range cfg.PeerCache.Entries {
		if strings.TrimSpace(cached.NodeID) != "" {
			cachedByNode[cached.NodeID] = cached
		}
	}

	var readyCount, degradedCount, backoffCount int
	for nodeID, state := range stateByNode {
		cached, known := cachedByNode[nodeID]
		if !known || strings.TrimSpace(cached.Endpoint) == "" {
			continue
		}
		switch state.State {
		case PeerConnectionReady, PeerConnectionRelayReady:
			readyCount++
		case PeerConnectionDegraded:
			degradedCount++
		case PeerConnectionBackoff:
			backoffCount++
		}
	}

	deficit := targetReady - readyCount
	if deficit < 0 {
		deficit = 0
	}
	mode := PeerRecoveryModeRecovery
	if deficit == 0 {
		// Target met: only keep as many candidates as the target itself.
		mode = PeerRecoveryModeSteady
		probeLimit = targetReady
	}

	ranked := make([]peerRecoveryCandidateBuild, 0, len(cfg.PeerCache.Entries))
	for _, cached := range cfg.PeerCache.Entries {
		if strings.TrimSpace(cached.NodeID) == "" || strings.TrimSpace(cached.Endpoint) == "" {
			continue
		}
		state := stateByNode[cached.NodeID]
		if state.State == "" {
			// Untracked peers count as disconnected.
			state.State = PeerConnectionDisconnected
		}
		if state.State == PeerConnectionBackoff && state.BackoffUntil.After(generatedAt) {
			continue
		}
		reason, selected := peerRecoveryCandidateReason(mode, cached, state)
		if !selected {
			continue
		}
		ranked = append(ranked, peerRecoveryCandidateBuild{
			PeerRecoveryCandidate: PeerRecoveryCandidate{
				NodeID:              cached.NodeID,
				Endpoint:            strings.TrimSpace(cached.Endpoint),
				Warm:                cached.Warm,
				WarmReason:          cached.WarmReason,
				RecoverySeed:        cached.RecoverySeed,
				BestCandidateID:     cached.BestCandidateID,
				BestTransport:       cached.BestTransport,
				ConnectionState:     state.State,
				ConsecutiveFailures: state.ConsecutiveFailures,
				LastLatencyMs:       state.LastLatencyMs,
				BackoffUntil:        state.BackoffUntil,
				Reason:              reason,
				Priority:            peerRecoveryCandidatePriority(cached, state, reason),
			},
		})
	}

	sort.SliceStable(ranked, func(a, b int) bool {
		if ranked[a].Priority == ranked[b].Priority {
			return ranked[a].NodeID < ranked[b].NodeID
		}
		return ranked[a].Priority > ranked[b].Priority
	})
	if probeLimit < len(ranked) {
		ranked = ranked[:probeLimit]
	}

	chosen := make([]PeerRecoveryCandidate, 0, len(ranked))
	seedCount := 0
	for i := range ranked {
		chosen = append(chosen, ranked[i].PeerRecoveryCandidate)
		if ranked[i].RecoverySeed {
			seedCount++
		}
	}

	return PeerRecoveryPlan{
		Mode:                       mode,
		Healthy:                    deficit == 0,
		TargetReadyPeers:           targetReady,
		ReadyPeerCount:             readyCount,
		DegradedPeerCount:          degradedCount,
		BackoffPeerCount:           backoffCount,
		ConnectablePeerCount:       connectableCount,
		Deficit:                    deficit,
		ProbeCandidateCount:        len(chosen),
		RecoverySeedCandidateCount: seedCount,
		GeneratedAt:                generatedAt,
		Candidates:                 chosen,
	}
}
// peerRecoveryCandidateReason returns why a peer should be probed and whether
// it should be probed at all. Ready and relay-ready peers are always kept with
// "maintain_ready". In steady mode nothing else is selected. In recovery mode
// every other peer is selected, labelled by the first matching rule: degraded
// connection, warm entry, recovery seed, or plain peer.
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState) (string, bool) {
	switch {
	case connection.State == PeerConnectionReady, connection.State == PeerConnectionRelayReady:
		return "maintain_ready", true
	case mode == PeerRecoveryModeSteady:
		return "", false
	case connection.State == PeerConnectionDegraded:
		return "recover_degraded", true
	case entry.Warm:
		return "recover_warm", true
	case entry.RecoverySeed:
		return "recover_seed", true
	default:
		return "recover_peer", true
	}
}
// peerRecoveryCandidatePriority computes the non-negative ordering score of a
// recovery candidate; higher scores sort first in the plan.
//
// The score is a sum of independent bonuses taken from the cache entry (warm
// flag, warm reason, recovery-seed flag, known best candidate and a tenth of
// its score), from the current connection state, and from the selection
// reason. A tenth of the last measured latency is then subtracted, and the
// result is clamped at zero.
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string) int {
	priority := 0

	// Bonus for why the peer is being selected.
	switch reason {
	case "maintain_ready":
		priority += 500
	case "recover_degraded":
		priority += 300
	case "recover_seed":
		priority += 250
	case "recover_warm":
		priority += 150
	}

	// Bonus for how close the connection already is to usable.
	switch connection.State {
	case PeerConnectionReady, PeerConnectionRelayReady:
		priority += 600
	case PeerConnectionDegraded:
		priority += 350
	case PeerConnectionConnecting:
		priority += 200
	case PeerConnectionDisconnected:
		priority += 100
	}

	// Bonuses derived from the peer cache entry.
	if entry.Warm {
		priority += 1000
	}
	switch entry.WarmReason {
	case "route_adjacent":
		priority += 500
	case "recovery_seed":
		priority += 350
	case "endpoint_candidate":
		priority += 200
	case "peer_endpoint":
		priority += 100
	}
	if entry.RecoverySeed {
		priority += 250
	}
	if entry.BestCandidateID != "" {
		priority += 150
	}
	priority += entry.BestCandidateScore / 10

	// Slower peers lose a little priority; unmeasured latency costs nothing.
	if latency := connection.LastLatencyMs; latency > 0 {
		priority -= latency / 10
	}

	if priority < 0 {
		return 0
	}
	return priority
}
// connectablePeerCount returns how many cache entries have both a non-blank
// node ID and a non-blank endpoint.
func connectablePeerCount(snapshot PeerCacheSnapshot) int {
	total := 0
	for i := range snapshot.Entries {
		peer := &snapshot.Entries[i]
		hasNodeID := strings.TrimSpace(peer.NodeID) != ""
		hasEndpoint := strings.TrimSpace(peer.Endpoint) != ""
		if hasNodeID && hasEndpoint {
			total++
		}
	}
	return total
}
@@ -0,0 +1,139 @@
package mesh
import (
"testing"
"time"
)
func TestPeerRecoveryPlanMaintainsBoundedReadyPeers(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
recoveryPlanPeer("node-a", true, false, "route_adjacent"),
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
recoveryPlanPeer("node-c", true, false, "peer_endpoint"),
recoveryPlanPeer("node-d", true, false, "peer_endpoint"),
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-a", State: PeerConnectionReady, LastLatencyMs: 40},
{NodeID: "node-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-c", State: PeerConnectionReady, LastLatencyMs: 30},
{NodeID: "node-d", State: PeerConnectionReady, LastLatencyMs: 10},
}},
Now: now,
})
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy {
t.Fatalf("unexpected plan health: %+v", plan)
}
if plan.TargetReadyPeers != DefaultStablePeerTarget || len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("unexpected bounded candidates: %+v", plan)
}
for _, candidate := range plan.Candidates {
if candidate.Reason != "maintain_ready" {
t.Fatalf("unexpected candidate reason: %+v", candidate)
}
}
}
func TestPeerRecoveryPlanAddsRecoverySeedWhenReadyDeficit(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
recoveryPlanPeer("node-a", true, false, "route_adjacent"),
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
recoveryPlanPeer("node-seed", false, true, ""),
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-b", State: PeerConnectionBackoff, BackoffUntil: now.Add(time.Minute)},
}},
Now: now,
})
if plan.Mode != PeerRecoveryModeRecovery || plan.Healthy {
t.Fatalf("unexpected recovery mode: %+v", plan)
}
if plan.Deficit != 2 || plan.RecoverySeedCandidateCount != 1 {
t.Fatalf("unexpected deficit/seed count: %+v", plan)
}
if !recoveryPlanHasCandidate(plan, "node-seed", "recover_seed") {
t.Fatalf("recovery seed was not selected: %+v", plan.Candidates)
}
if recoveryPlanHasCandidate(plan, "node-b", "") {
t.Fatalf("active backoff peer should not be selected: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{
NodeID: "node-c",
Endpoint: "http://relay:19001",
Warm: true,
WarmReason: "rendezvous_lease",
RendezvousLeaseID: "lease-1",
RelayNodeID: "node-r",
RelayEndpoint: "http://relay:19001",
RelayControl: true,
},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-c", State: PeerConnectionRelayReady, LastLatencyMs: 15},
}},
Now: now,
})
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy {
t.Fatalf("unexpected steady plan: %+v", plan)
}
if !recoveryPlanHasCandidate(plan, "node-c", "maintain_ready") {
t.Fatalf("relay-ready peer was not maintained: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{NodeID: "node-a", Warm: true, WarmReason: "route_adjacent"},
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
}},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-b", State: PeerConnectionReady},
}},
Now: now,
})
if plan.TargetReadyPeers != 1 || !plan.Healthy {
t.Fatalf("target should be capped by connectable peers: %+v", plan)
}
}
// recoveryPlanPeer builds a peer cache entry for tests. The endpoint is
// derived from the node ID as "http://<nodeID>:19001"; the remaining
// arguments are copied into the matching fields.
func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry {
	var peer PeerCacheEntry
	peer.NodeID = nodeID
	peer.Endpoint = "http://" + nodeID + ":19001"
	peer.Warm = warm
	peer.WarmReason = warmReason
	peer.RecoverySeed = recoverySeed
	return peer
}
// recoveryPlanHasCandidate reports whether the plan contains a candidate for
// nodeID. Only the first candidate with that node ID is inspected: an empty
// reason matches it unconditionally, otherwise its Reason must equal reason.
func recoveryPlanHasCandidate(plan PeerRecoveryPlan, nodeID string, reason string) bool {
	for i := range plan.Candidates {
		if plan.Candidates[i].NodeID == nodeID {
			if reason == "" {
				return true
			}
			return plan.Candidates[i].Reason == reason
		}
	}
	return false
}
@@ -0,0 +1,149 @@
package mesh
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"time"
)
// ValidateProductionEnvelope checks that envelope is a well-formed
// fabric-control message that the local node may process at time now.
//
// Checks run in a fixed order and the first failure is returned:
// protocol version, required identifiers, cluster match, source/destination
// presence, current-hop match, next-hop presence, route path (only when one
// is supplied), channel class, message type, TTL, hop count, timestamps
// (presence, future skew, expiry), and finally payload length, size limit
// and SHA-256 hash.
//
// Structural problems wrap ErrForwardEnvelopeInvalid; the other failures are
// reported as ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel,
// ErrTTLExhausted, ErrRouteExpired, or whatever validateProductionRoutePath
// returns. A nil result means every check passed.
func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope, now time.Time) error {
	// invalid wraps ErrForwardEnvelopeInvalid with a human-readable detail.
	invalid := func(detail string) error {
		return fmt.Errorf("%w: %s", ErrForwardEnvelopeInvalid, detail)
	}
	nowUTC := now.UTC()

	// Identity and addressing.
	switch {
	case envelope.FabricProtocolVersion != ProtocolVersion:
		return invalid("unsupported fabric_protocol_version")
	case envelope.MessageID == "":
		return invalid("message_id is required")
	case envelope.RouteID == "":
		return invalid("route_id is required")
	case envelope.ClusterID == "" || envelope.ClusterID != local.ClusterID:
		return ErrClusterMismatch
	case envelope.SourceNodeID == "" || envelope.DestinationNodeID == "":
		return invalid("source_node_id and destination_node_id are required")
	case envelope.CurrentHopNodeID != local.NodeID:
		return ErrNodeMismatch
	case envelope.NextHopNodeID == "":
		return invalid("next_hop_node_id is required")
	}

	// An explicit route path is optional, but must be consistent if present.
	if len(envelope.RoutePath) != 0 {
		if pathErr := validateProductionRoutePath(local, envelope); pathErr != nil {
			return pathErr
		}
	}

	// Channel, lifetime and payload bookkeeping.
	switch {
	case envelope.ChannelClass != ProductionChannelFabricControl:
		return ErrUnauthorizedChannel
	case envelope.MessageType != ProductionMessageFabricControl:
		return invalid("unsupported message_type")
	case envelope.TTL <= 0:
		return ErrTTLExhausted
	case envelope.HopCount < 0:
		return invalid("hop_count must not be negative")
	case envelope.CreatedAt.IsZero() || envelope.ExpiresAt.IsZero():
		return invalid("created_at and expires_at are required")
	case envelope.CreatedAt.After(nowUTC.Add(MaxProductionEnvelopeFutureSkew)):
		return invalid("created_at exceeds allowed future skew")
	case !envelope.ExpiresAt.After(nowUTC):
		return ErrRouteExpired
	case envelope.PayloadLength != len(envelope.Payload):
		return invalid("payload_length mismatch")
	case envelope.PayloadLength > MaxProductionEnvelopePayloadBytes:
		return invalid("payload exceeds fabric-control limit")
	case envelope.PayloadHash == "":
		return invalid("payload_hash is required")
	}

	// The declared hash must be the lowercase hex SHA-256 of the payload.
	digest := sha256.Sum256(envelope.Payload)
	if hex.EncodeToString(digest[:]) != envelope.PayloadHash {
		return invalid("payload_hash mismatch")
	}
	return nil
}
// validateProductionRoutePath checks the explicit route path carried by an
// envelope against the local node's position on it.
//
// The path needs at least two hops, must start at the source and end at the
// destination, and may not contain empty or repeated node IDs. The local node
// must appear on the path and be the envelope's current hop, must not already
// be listed as visited, and every visited node must itself be on the path.
// At the destination the next hop must be the local node; elsewhere it must
// be the hop that follows the local node on the path.
//
// It returns ErrInvalidRoutePath, ErrLoopDetected or ErrNodeMismatch on
// failure and nil on success.
func validateProductionRoutePath(local PeerIdentity, envelope ProductionEnvelope) error {
	path := envelope.RoutePath
	last := len(path) - 1
	if last < 1 {
		return ErrInvalidRoutePath
	}
	if path[0] != envelope.SourceNodeID || path[last] != envelope.DestinationNodeID {
		return ErrInvalidRoutePath
	}

	// Walk the path once: reject blanks and repeats, and find ourselves.
	localIndex := -1
	onPath := map[string]struct{}{}
	for i, hop := range path {
		if hop == "" {
			return ErrInvalidRoutePath
		}
		if _, repeated := onPath[hop]; repeated {
			return ErrLoopDetected
		}
		onPath[hop] = struct{}{}
		if hop == local.NodeID {
			localIndex = i
		}
	}
	if localIndex < 0 || envelope.CurrentHopNodeID != local.NodeID {
		return ErrNodeMismatch
	}

	// Having visited the local node before means the envelope has looped.
	if containsProductionNodeID(envelope.VisitedNodeIDs, local.NodeID) {
		return ErrLoopDetected
	}
	for _, visited := range envelope.VisitedNodeIDs {
		if visited == "" || !containsProductionNodeID(path, visited) {
			return ErrInvalidRoutePath
		}
	}

	if envelope.DestinationNodeID == local.NodeID {
		// Final hop: the envelope must be addressed to this node.
		if envelope.NextHopNodeID != local.NodeID {
			return ErrInvalidRoutePath
		}
		return nil
	}
	// Intermediate hop: the next hop must be our successor on the path.
	if localIndex >= last || envelope.NextHopNodeID != path[localIndex+1] {
		return ErrInvalidRoutePath
	}
	return nil
}
// containsProductionNodeID reports whether needle is an element of values.
// A nil or empty slice never contains anything.
func containsProductionNodeID(values []string, needle string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == needle {
			return true
		}
	}
	return false
}
// NewProductionEnvelopeObservation captures the routing and payload metadata
// of envelope as an observation stamped with observedAt converted to UTC.
//
// RoutePath and VisitedNodeIDs are copied into fresh, non-nil slices so the
// observation does not alias the envelope's backing arrays.
func NewProductionEnvelopeObservation(envelope ProductionEnvelope, observedAt time.Time) ProductionEnvelopeObservation {
	// cloneIDs returns an independent, always non-nil copy of ids.
	cloneIDs := func(ids []string) []string {
		out := make([]string, len(ids))
		copy(out, ids)
		return out
	}

	var observation ProductionEnvelopeObservation
	observation.MessageID = envelope.MessageID
	observation.RouteID = envelope.RouteID
	observation.ClusterID = envelope.ClusterID
	observation.SourceNodeID = envelope.SourceNodeID
	observation.DestinationNodeID = envelope.DestinationNodeID
	observation.CurrentHopNodeID = envelope.CurrentHopNodeID
	observation.NextHopNodeID = envelope.NextHopNodeID
	observation.RoutePath = cloneIDs(envelope.RoutePath)
	observation.VisitedNodeIDs = cloneIDs(envelope.VisitedNodeIDs)
	observation.ChannelClass = envelope.ChannelClass
	observation.MessageType = envelope.MessageType
	observation.TTL = envelope.TTL
	observation.HopCount = envelope.HopCount
	observation.PayloadLength = envelope.PayloadLength
	observation.PayloadHash = envelope.PayloadHash
	observation.ObservedAt = observedAt.UTC()
	return observation
}
@@ -0,0 +1,81 @@
package mesh
import (
"context"
"sync"
)
// ProductionEnvelopeObservationSink is a bounded, mutex-guarded in-memory
// buffer of production envelope observations. Once the buffer is full each
// new observation evicts the oldest one.
//
// The zero value is usable and behaves like a sink of capacity 1. Because it
// contains a sync.Mutex it must not be copied after first use.
type ProductionEnvelopeObservationSink struct {
	mu       sync.Mutex
	capacity int                             // configured bound; values below 1 are treated as 1
	items    []ProductionEnvelopeObservation // retained observations, oldest first
	accepted uint64                          // total observations passed to Observe
	dropped  uint64                          // observations evicted to make room
}

// ProductionEnvelopeObservationSinkMetrics is a point-in-time summary of a
// sink's counters, as returned by Metrics.
type ProductionEnvelopeObservationSinkMetrics struct {
	Capacity      int    `json:"capacity"`
	CurrentDepth  int    `json:"current_depth"`
	AcceptedTotal uint64 `json:"accepted_total"`
	DroppedOldest uint64 `json:"dropped_oldest"`
}

// NewProductionEnvelopeObservationSink returns an empty sink that retains at
// most capacity observations. A capacity below 1 is raised to 1.
func NewProductionEnvelopeObservationSink(capacity int) *ProductionEnvelopeObservationSink {
	if capacity < 1 {
		capacity = 1
	}
	return &ProductionEnvelopeObservationSink{
		capacity: capacity,
		items:    make([]ProductionEnvelopeObservation, 0, capacity),
	}
}

// effectiveCapacity returns the bound actually enforced by the sink. A sink
// declared as a zero value has capacity 0; it is clamped to 1, the same way
// the constructor does. Callers must hold s.mu.
func (s *ProductionEnvelopeObservationSink) effectiveCapacity() int {
	if s.capacity < 1 {
		return 1
	}
	return s.capacity
}

// Observe records observation and always returns nil. When the sink is full
// the oldest retained observation is discarded and counted as dropped. The
// context argument is ignored.
func (s *ProductionEnvelopeObservationSink) Observe(_ context.Context, observation ProductionEnvelopeObservation) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.accepted++
	// Compare against the clamped capacity: with a raw capacity of 0 an empty
	// buffer would look "full" and the shift below would slice out of range.
	if len(s.items) >= s.effectiveCapacity() {
		// Shift everything one slot towards the front, overwriting the oldest.
		copy(s.items, s.items[1:])
		s.items[len(s.items)-1] = observation
		s.dropped++
		return nil
	}
	s.items = append(s.items, observation)
	return nil
}

// Snapshot returns a copy of the retained observations, oldest first. The
// returned slice is independent of the sink's internal buffer.
func (s *ProductionEnvelopeObservationSink) Snapshot() []ProductionEnvelopeObservation {
	s.mu.Lock()
	defer s.mu.Unlock()
	out := make([]ProductionEnvelopeObservation, len(s.items))
	copy(out, s.items)
	return out
}

// Len returns the number of observations currently retained.
func (s *ProductionEnvelopeObservationSink) Len() int {
	s.mu.Lock()
	defer s.mu.Unlock()
	return len(s.items)
}

// Capacity returns the maximum number of observations the sink retains.
func (s *ProductionEnvelopeObservationSink) Capacity() int {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.effectiveCapacity()
}

// Metrics returns the sink's capacity, current depth and lifetime counters,
// all read under a single lock acquisition.
func (s *ProductionEnvelopeObservationSink) Metrics() ProductionEnvelopeObservationSinkMetrics {
	s.mu.Lock()
	defer s.mu.Unlock()
	return ProductionEnvelopeObservationSinkMetrics{
		Capacity:      s.effectiveCapacity(),
		CurrentDepth:  len(s.items),
		AcceptedTotal: s.accepted,
		DroppedOldest: s.dropped,
	}
}
@@ -0,0 +1,80 @@
package mesh
import (
"fmt"
"time"
)
// ValidateProductionEnvelopeRouteConfig checks envelope against the
// configured route with the same RouteID. With no configured routes it
// accepts everything and returns nil.
//
// Otherwise the route must exist, share the cluster of both the envelope and
// the local node, have the envelope's source and destination, still be
// unexpired at now and outlive the envelope, and allow the fabric-control
// channel. The route's path must run from source to destination; an envelope
// path, when present, must equal it, and is mandatory for routes longer than
// two hops. The local node must be on the path and be the current hop, the
// next hop must be the local node's successor (or the local node itself at
// the destination), and TTL and hop count must respect the route's positive
// limits.
func ValidateProductionEnvelopeRouteConfig(local PeerIdentity, envelope ProductionEnvelope, routes []SyntheticRoute, now time.Time) error {
	if len(routes) == 0 {
		return nil
	}
	route, found := productionRouteByID(routes, envelope.RouteID)
	if !found {
		return ErrRouteNotFound
	}

	nowUTC := now.UTC()
	switch {
	case route.ClusterID != envelope.ClusterID || route.ClusterID != local.ClusterID:
		return ErrClusterMismatch
	case route.SourceNodeID != envelope.SourceNodeID || route.DestinationNodeID != envelope.DestinationNodeID:
		return ErrInvalidRoutePath
	case route.ExpiresAt.IsZero() || !route.ExpiresAt.After(nowUTC) || envelope.ExpiresAt.After(route.ExpiresAt):
		return ErrRouteExpired
	case !contains(route.AllowedChannels, ProductionChannelFabricControl):
		return ErrUnauthorizedChannel
	}

	configured := routePath(route)
	if len(configured) < 2 || configured[0] != route.SourceNodeID || configured[len(configured)-1] != route.DestinationNodeID {
		return ErrInvalidRoutePath
	}
	if len(envelope.RoutePath) == 0 {
		// Only direct two-node routes may omit the explicit path.
		if len(configured) > 2 {
			return ErrInvalidRoutePath
		}
	} else if !sameNodePath(envelope.RoutePath, configured) {
		return ErrInvalidRoutePath
	}

	localIndex := indexOf(configured, local.NodeID)
	if localIndex < 0 || envelope.CurrentHopNodeID != local.NodeID {
		return ErrNodeMismatch
	}

	// At the destination the envelope points at this node; otherwise it must
	// point at the node that follows this one on the configured path.
	wantNextHop := local.NodeID
	if local.NodeID != envelope.DestinationNodeID {
		if localIndex+1 >= len(configured) {
			return ErrInvalidRoutePath
		}
		wantNextHop = configured[localIndex+1]
	}
	if envelope.NextHopNodeID != wantNextHop {
		return ErrInvalidRoutePath
	}

	// A non-positive limit on the route means "no limit".
	if limit := route.MaxTTL; limit > 0 && envelope.TTL > limit {
		return fmt.Errorf("%w: ttl exceeds configured route max_ttl", ErrForwardEnvelopeInvalid)
	}
	if limit := route.MaxHops; limit > 0 && envelope.HopCount > limit {
		return fmt.Errorf("%w: hop_count exceeds configured route max_hops", ErrForwardEnvelopeInvalid)
	}
	return nil
}
// productionRouteByID returns the first route whose RouteID equals routeID
// and true, or the zero SyntheticRoute and false when there is none.
func productionRouteByID(routes []SyntheticRoute, routeID string) (SyntheticRoute, bool) {
	for i := range routes {
		if routes[i].RouteID == routeID {
			return routes[i], true
		}
	}
	return SyntheticRoute{}, false
}
// sameNodePath reports whether a and b list the same node IDs in the same
// order. Two empty (or nil) paths are considered equal.
func sameNodePath(a []string, b []string) bool {
	n := len(a)
	if n != len(b) {
		return false
	}
	for i := 0; i < n; i++ {
		if a[i] != b[i] {
			return false
		}
	}
	return true
}
@@ -0,0 +1,43 @@
package mesh
import (
"context"
"net/http"
"strings"
)
// ProductionForwardTransport delivers a production envelope to a peer node.
type ProductionForwardTransport interface {
// SendProduction sends envelope to the peer identified by nextNodeID and
// returns that peer's forward result, or an error when delivery fails.
SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error)
}
// HTTPProductionForwardTransport is a ProductionForwardTransport that
// resolves the target node to a base URL and sends the envelope through a
// mesh Client.
type HTTPProductionForwardTransport struct {
// PeerURLs maps a peer node ID to that peer's base URL.
PeerURLs map[string]string
// HTTPClient, when non-nil, replaces the HTTP client of the per-call mesh
// Client created by SendProduction.
HTTPClient *http.Client
}
// NewHTTPProductionForwardTransport builds a transport from a node-ID to
// base-URL map. Node IDs and URLs are trimmed of surrounding whitespace,
// trailing slashes are removed from URLs, and pairs where either side ends up
// empty are dropped. The input map is not retained.
func NewHTTPProductionForwardTransport(peerURLs map[string]string) *HTTPProductionForwardTransport {
	transport := &HTTPProductionForwardTransport{
		PeerURLs: make(map[string]string, len(peerURLs)),
	}
	for rawID, rawURL := range peerURLs {
		peerID := strings.TrimSpace(rawID)
		if peerID == "" {
			continue
		}
		endpoint := strings.TrimRight(strings.TrimSpace(rawURL), "/")
		if endpoint == "" {
			continue
		}
		transport.PeerURLs[peerID] = endpoint
	}
	return transport
}
// SendProduction forwards envelope to the peer registered under nextNodeID.
//
// It returns ErrForwardPeerUnavailable when the transport is nil or when no
// non-empty base URL is known for that node. Otherwise it creates a mesh
// Client for the peer's URL, substitutes the transport's HTTPClient when one
// is set, and returns whatever the client's SendProduction reports.
func (t *HTTPProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	var none ProductionForwardResult
	if t == nil {
		return none, ErrForwardPeerUnavailable
	}

	// Normalise again here: PeerURLs is exported and may have been populated
	// without going through the constructor.
	peerURL := strings.TrimSpace(t.PeerURLs[nextNodeID])
	peerURL = strings.TrimRight(peerURL, "/")
	if peerURL == "" {
		return none, ErrForwardPeerUnavailable
	}

	peer := NewClient(peerURL)
	if override := t.HTTPClient; override != nil {
		peer.HTTPClient = override
	}
	return peer.SendProduction(ctx, envelope)
}
@@ -0,0 +1,241 @@
package mesh
import (
"encoding/json"
"fmt"
"os"
"strings"
"time"
)
// ScopedSyntheticConfig is the JSON mesh configuration loaded for one local
// node by LoadScopedSyntheticConfig and checked by Validate.
type ScopedSyntheticConfig struct {
// SchemaVersion is required; Validate rejects an empty value.
SchemaVersion string `json:"schema_version"`
// ClusterID must be non-empty and equal the local identity's cluster.
ClusterID string `json:"cluster_id"`
// LocalNodeID must be non-empty and equal the local identity's node.
LocalNodeID string `json:"local_node_id"`
ConfigVersion string `json:"config_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
// PeerEndpoints maps a peer node ID to an endpoint; blank keys or values
// are rejected by Validate.
PeerEndpoints map[string]string `json:"peer_endpoints"`
// PeerEndpointCandidates is keyed by peer node ID; every candidate's NodeID
// must equal its map key.
PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates,omitempty"`
PeerDirectory []PeerDirectoryEntry `json:"peer_directory,omitempty"`
RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds,omitempty"`
RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases,omitempty"`
// Routes must belong to ClusterID, have a path of at least two nodes that
// includes LocalNodeID, and carry an expiry that is still in the future.
Routes []SyntheticRoute `json:"routes"`
}
// PeerDirectoryEntry is one peer's entry in the configuration's peer
// directory.
type PeerDirectoryEntry struct {
// NodeID must be non-blank, must not be the local node, and must be unique
// within the directory.
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
// EndpointCount and CandidateCount must not be negative.
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
}
// PeerRecoverySeed is a recovery seed entry of the configuration.
// Validation requires NodeID, Endpoint and Transport to be non-blank and the
// (NodeID, Endpoint) pair to be unique among the seeds.
type PeerRecoverySeed struct {
NodeID string `json:"node_id"`
Endpoint string `json:"endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
Region string `json:"region,omitempty"`
Priority int `json:"priority"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// PeerRendezvousLease is a rendezvous lease entry of the configuration,
// naming a peer node together with a relay node and relay endpoint.
//
// Validation requires LeaseID, PeerNodeID, RelayNodeID, RelayEndpoint and
// Transport to be non-blank, the peer and relay to differ, ControlPlaneOnly
// to be true, ExpiresAt to be in the future, and Metadata (when present) to
// be valid JSON. LeaseID must be unique among the leases.
type PeerRendezvousLease struct {
LeaseID string `json:"lease_id"`
PeerNodeID string `json:"peer_node_id"`
RelayNodeID string `json:"relay_node_id"`
RelayEndpoint string `json:"relay_endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
// RouteIDs, when non-empty, must all name configured routes, and at least
// one of those routes must contain the local node, the peer and the relay.
RouteIDs []string `json:"route_ids,omitempty"`
AllowedChannels []string `json:"allowed_channels,omitempty"`
Priority int `json:"priority"`
ControlPlaneOnly bool `json:"control_plane_only"`
IssuedAt time.Time `json:"issued_at"`
ExpiresAt time.Time `json:"expires_at"`
Reason string `json:"reason,omitempty"`
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// PeerEndpointCandidate describes one candidate endpoint of a peer.
// Validation requires EndpointID, NodeID, Transport, Address, Reachability
// and ConnectivityMode to be non-blank, and NodeID to equal the key the
// candidate is listed under in the configuration.
type PeerEndpointCandidate struct {
EndpointID string `json:"endpoint_id"`
NodeID string `json:"node_id"`
Transport string `json:"transport"`
Address string `json:"address"`
AddressFamily string `json:"address_family,omitempty"`
Reachability string `json:"reachability"`
NATType string `json:"nat_type,omitempty"`
ConnectivityMode string `json:"connectivity_mode"`
Region string `json:"region,omitempty"`
Priority int `json:"priority"`
PolicyTags []string `json:"policy_tags,omitempty"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// LoadScopedSyntheticConfig reads the JSON file at path, decodes it into a
// ScopedSyntheticConfig and validates it against the local identity.
//
// A read failure is returned unchanged, a decode failure is wrapped with
// "parse scoped synthetic mesh config", and a validation failure is returned
// as produced by Validate. On any error the zero config is returned.
func LoadScopedSyntheticConfig(path string, local PeerIdentity) (ScopedSyntheticConfig, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return ScopedSyntheticConfig{}, readErr
	}

	var loaded ScopedSyntheticConfig
	if decodeErr := json.Unmarshal(raw, &loaded); decodeErr != nil {
		return ScopedSyntheticConfig{}, fmt.Errorf("parse scoped synthetic mesh config: %w", decodeErr)
	}

	if validateErr := loaded.Validate(local); validateErr != nil {
		return ScopedSyntheticConfig{}, validateErr
	}
	return loaded, nil
}
// Validate checks the configuration against the local identity and returns
// the first problem found, or nil when the configuration is acceptable.
//
// It requires a schema version, a cluster ID and local node ID that match
// local, non-blank peer endpoints, well-formed endpoint candidates listed
// under their own node ID, and a valid peer directory, recovery seeds and
// rendezvous leases. Every route must belong to the cluster, have a path of
// at least two nodes that includes the local node, and have an expiry that
// is set and still in the future.
//
// Expiry is judged against the wall clock at the time of the call.
func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
	if cfg.SchemaVersion == "" {
		return fmt.Errorf("scoped synthetic mesh config schema_version is required")
	}
	if cfg.ClusterID == "" || cfg.ClusterID != local.ClusterID {
		return ErrClusterMismatch
	}
	if cfg.LocalNodeID == "" || cfg.LocalNodeID != local.NodeID {
		return ErrNodeMismatch
	}
	for nodeID, endpoint := range cfg.PeerEndpoints {
		if strings.TrimSpace(nodeID) == "" || strings.TrimSpace(endpoint) == "" {
			return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint")
		}
	}
	for nodeID, candidates := range cfg.PeerEndpointCandidates {
		if strings.TrimSpace(nodeID) == "" {
			return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint candidate node")
		}
		for _, candidate := range candidates {
			if strings.TrimSpace(candidate.EndpointID) == "" ||
				strings.TrimSpace(candidate.NodeID) == "" ||
				candidate.NodeID != nodeID ||
				strings.TrimSpace(candidate.Transport) == "" ||
				strings.TrimSpace(candidate.Address) == "" ||
				strings.TrimSpace(candidate.Reachability) == "" ||
				strings.TrimSpace(candidate.ConnectivityMode) == "" {
				return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate")
			}
		}
	}
	if err := validatePeerDirectory(cfg.PeerDirectory, cfg.LocalNodeID); err != nil {
		return err
	}
	if err := validateRecoverySeeds(cfg.RecoverySeeds); err != nil {
		return err
	}
	if err := validateRendezvousLeases(cfg.RendezvousLeases, cfg.Routes, cfg.LocalNodeID); err != nil {
		return err
	}
	// Read the clock once so every route is judged against the same instant
	// instead of a slightly later one per iteration.
	now := time.Now().UTC()
	for _, route := range cfg.Routes {
		if route.ClusterID != cfg.ClusterID {
			return ErrClusterMismatch
		}
		path := routePath(route)
		if len(path) < 2 {
			return ErrInvalidRoutePath
		}
		if !contains(path, cfg.LocalNodeID) {
			return ErrNodeMismatch
		}
		if route.ExpiresAt.IsZero() {
			return fmt.Errorf("scoped synthetic route %q expires_at is required", route.RouteID)
		}
		if !route.ExpiresAt.After(now) {
			return ErrRouteExpired
		}
	}
	return nil
}
// validatePeerDirectory checks the peer directory entries in order. Each
// entry needs a non-blank node ID (after trimming) that is not the local
// node and has not appeared before, and non-negative endpoint and candidate
// counts. The first violation is returned as an error.
func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) error {
	known := map[string]struct{}{}
	for i := range entries {
		entry := &entries[i]
		peerID := strings.TrimSpace(entry.NodeID)
		_, alreadyListed := known[peerID]
		switch {
		case peerID == "" || peerID == localNodeID:
			return fmt.Errorf("scoped synthetic mesh config contains invalid peer directory entry")
		case alreadyListed:
			return fmt.Errorf("scoped synthetic mesh config contains duplicate peer directory entry")
		case entry.EndpointCount < 0 || entry.CandidateCount < 0:
			return fmt.Errorf("scoped synthetic mesh config contains invalid peer directory count")
		}
		known[peerID] = struct{}{}
	}
	return nil
}
// validateRecoverySeeds checks the recovery seed list. At most 20 seeds are
// allowed; each needs a non-blank node ID, endpoint and transport, and the
// trimmed (node ID, endpoint) pair must be unique.
func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
	const maxRecoverySeeds = 20
	if len(seeds) > maxRecoverySeeds {
		return fmt.Errorf("scoped synthetic mesh config contains too many recovery seeds")
	}

	unique := map[string]struct{}{}
	for i := range seeds {
		nodeID := strings.TrimSpace(seeds[i].NodeID)
		endpoint := strings.TrimSpace(seeds[i].Endpoint)
		transport := strings.TrimSpace(seeds[i].Transport)
		if nodeID == "" || endpoint == "" || transport == "" {
			return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed")
		}

		// NUL separates the two parts of the identity key.
		identity := nodeID + "\x00" + endpoint
		if _, repeated := unique[identity]; repeated {
			return fmt.Errorf("scoped synthetic mesh config contains duplicate recovery seed")
		}
		unique[identity] = struct{}{}
	}
	return nil
}
// validateRendezvousLeases checks the rendezvous lease list against the
// configured routes and the local node.
//
// At most 20 leases are allowed. Each lease needs non-blank lease, peer and
// relay IDs, a relay endpoint and a transport; the peer and relay must
// differ; the lease must be control-plane only, carry an expiry in the
// future, and have valid JSON metadata if any. Lease IDs must be unique.
// A lease that lists route IDs may only reference configured routes, and at
// least one of them must contain the local node, the peer and the relay.
//
// Expiry is judged against the wall clock at the time of the call.
func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRoute, localNodeID string) error {
	const maxRendezvousLeases = 20
	if len(leases) > maxRendezvousLeases {
		return fmt.Errorf("scoped synthetic mesh config contains too many rendezvous leases")
	}

	// Index routes that have a usable ID.
	configured := map[string]SyntheticRoute{}
	for _, route := range routes {
		if strings.TrimSpace(route.RouteID) == "" {
			continue
		}
		configured[route.RouteID] = route
	}

	blank := func(value string) bool { return strings.TrimSpace(value) == "" }
	leaseIDs := map[string]struct{}{}
	now := time.Now().UTC()

	for _, lease := range leases {
		missingField := blank(lease.LeaseID) ||
			blank(lease.PeerNodeID) ||
			blank(lease.RelayNodeID) ||
			blank(lease.RelayEndpoint) ||
			blank(lease.Transport)
		badMetadata := len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)
		expired := lease.ExpiresAt.IsZero() || !lease.ExpiresAt.After(now)
		if missingField ||
			lease.PeerNodeID == lease.RelayNodeID ||
			!lease.ControlPlaneOnly ||
			expired ||
			badMetadata {
			return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease")
		}

		if _, repeated := leaseIDs[lease.LeaseID]; repeated {
			return fmt.Errorf("scoped synthetic mesh config contains duplicate rendezvous lease")
		}
		leaseIDs[lease.LeaseID] = struct{}{}

		// Leases without route IDs are not scoped to particular routes.
		if len(lease.RouteIDs) == 0 {
			continue
		}

		// Every referenced route must exist, so keep scanning even after one
		// route has already put the lease in scope.
		inScope := false
		for _, routeID := range lease.RouteIDs {
			route, known := configured[routeID]
			if !known {
				return fmt.Errorf("scoped synthetic mesh config contains rendezvous lease for unknown route")
			}
			hops := routePath(route)
			if contains(hops, localNodeID) && contains(hops, lease.PeerNodeID) && contains(hops, lease.RelayNodeID) {
				inScope = true
			}
		}
		if !inScope {
			return fmt.Errorf("scoped synthetic mesh config contains out-of-scope rendezvous lease")
		}
	}
	return nil
}
@@ -0,0 +1,235 @@
package mesh
import (
"encoding/json"
"errors"
"os"
"path/filepath"
"testing"
"time"
)
func TestLoadScopedSyntheticConfig(t *testing.T) {
expiresAt := time.Now().UTC().Add(time.Hour)
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
ConfigVersion: "config-v1",
PeerDirectoryVersion: "peers-v1",
PolicyVersion: "policy-v1",
PeerEndpoints: map[string]string{"node-b": "http://127.0.0.1:19002"},
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
NATType: "restricted",
ConnectivityMode: "direct",
Priority: 10,
},
},
},
PeerDirectory: []PeerDirectoryEntry{
{
NodeID: "node-b",
RouteIDs: []string{"route-a-b"},
EndpointCount: 1,
CandidateCount: 1,
ConnectivityModes: []string{"direct"},
RecoverySeed: true,
},
},
RecoverySeeds: []PeerRecoverySeed{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
Transport: "direct_tcp_tls",
ConnectivityMode: "direct",
Priority: 10,
},
},
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
RouteIDs: []string{"route-a-b"},
AllowedChannels: []string{"fabric_control", "route_control"},
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: expiresAt.Add(-time.Minute),
ExpiresAt: expiresAt,
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
})
cfg, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err != nil {
t.Fatalf("load scoped config: %v", err)
}
if cfg.ConfigVersion != "config-v1" || cfg.PeerEndpoints["node-b"] == "" || len(cfg.Routes) != 1 {
t.Fatalf("unexpected config: %+v", cfg)
}
if got := cfg.PeerEndpointCandidates["node-b"]; len(got) != 1 || got[0].EndpointID != "node-b-public" {
t.Fatalf("unexpected endpoint candidates: %+v", cfg.PeerEndpointCandidates)
}
if len(cfg.PeerDirectory) != 1 || cfg.PeerDirectory[0].NodeID != "node-b" || !cfg.PeerDirectory[0].RecoverySeed {
t.Fatalf("unexpected peer directory: %+v", cfg.PeerDirectory)
}
if len(cfg.RecoverySeeds) != 1 || cfg.RecoverySeeds[0].NodeID != "node-b" {
t.Fatalf("unexpected recovery seeds: %+v", cfg.RecoverySeeds)
}
if len(cfg.RendezvousLeases) != 1 || cfg.RendezvousLeases[0].RelayNodeID != "node-r" {
t.Fatalf("unexpected rendezvous leases: %+v", cfg.RendezvousLeases)
}
}
func TestLoadScopedSyntheticConfigRejectsWrongCluster(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-2",
LocalNodeID: "node-a",
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if !errors.Is(err, ErrClusterMismatch) {
t.Fatalf("err = %v, want ErrClusterMismatch", err)
}
}
func TestLoadScopedSyntheticConfigRejectsWrongNode(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-x",
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if !errors.Is(err, ErrNodeMismatch) {
t.Fatalf("err = %v, want ErrNodeMismatch", err)
}
}
// TestLoadScopedSyntheticConfigRejectsExpiredRoute checks that a config
// containing a route that expired a minute ago fails with ErrRouteExpired.
func TestLoadScopedSyntheticConfigRejectsExpiredRoute(t *testing.T) {
	expired := liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})
	expired.ExpiresAt = time.Now().UTC().Add(-time.Minute)

	stale := ScopedSyntheticConfig{
		SchemaVersion: "c17f.synthetic.v1",
		ClusterID:     "cluster-1",
		LocalNodeID:   "node-a",
		Routes:        []SyntheticRoute{expired},
	}
	local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}

	_, err := LoadScopedSyntheticConfig(writeScopedConfig(t, stale), local)
	if matched := errors.Is(err, ErrRouteExpired); !matched {
		t.Fatalf("err = %v, want ErrRouteExpired", err)
	}
}
func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-c",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
ConnectivityMode: "direct",
},
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid peer endpoint candidate error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidPeerDirectory(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerDirectory: []PeerDirectoryEntry{
{NodeID: "node-a"},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid peer directory error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-b", Endpoint: "", Transport: "direct_tcp_tls"},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid recovery seed error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17z12.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
RouteIDs: []string{"route-a-b"},
ExpiresAt: time.Now().UTC().Add(time.Hour),
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid rendezvous lease error")
}
}
// writeScopedConfig marshals cfg to JSON, writes it as "mesh-config.json"
// inside the test's temporary directory with mode 0600, and returns the file
// path. Any failure aborts the calling test.
func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string {
	t.Helper()

	encoded, marshalErr := json.Marshal(cfg)
	if marshalErr != nil {
		t.Fatalf("marshal config: %v", marshalErr)
	}

	target := filepath.Join(t.TempDir(), "mesh-config.json")
	writeErr := os.WriteFile(target, encoded, 0o600)
	if writeErr != nil {
		t.Fatalf("write config: %v", writeErr)
	}
	return target
}
@@ -0,0 +1,291 @@
package mesh
import (
"context"
"encoding/json"
"net/http"
"time"
)
// ProductionEnvelopeObserver is a callback that receives a production
// envelope observation and may report an error.
// NOTE(review): when the server invokes it is not visible in this part of
// the file; confirm against handleForward.
type ProductionEnvelopeObserver func(context.Context, ProductionEnvelopeObservation) error
// ProductionForwardLogger is a callback that receives one structured log
// entry describing a production forward event, such as a rejected request.
type ProductionForwardLogger func(ProductionForwardLogEntry)
// Server exposes the mesh HTTP endpoints (health, forward and synthetic
// probe) on behalf of one local node; see Handler.
type Server struct {
// Local is this node's identity; incoming messages are validated against it.
Local PeerIdentity
SyntheticRuntime *SyntheticRuntime
// ProductionForwardingEnabled gates the forward endpoint; while false,
// forward requests are rejected with ErrForwardDisabled.
ProductionForwardingEnabled bool
ProductionEnvelopeObserver ProductionEnvelopeObserver
ProductionForwardTransport ProductionForwardTransport
ProductionForwardLogger ProductionForwardLogger
// ProductionRoutes are the configured routes that forwarded envelopes are
// validated against.
ProductionRoutes []SyntheticRoute
}
// Handler returns an http.Handler that serves the server's three mesh
// endpoints: /mesh/v1/health, /mesh/v1/forward and /mesh/v1/synthetic/probe.
func (s Server) Handler() http.Handler {
	endpoints := []struct {
		pattern string
		serve   func(http.ResponseWriter, *http.Request)
	}{
		{"/mesh/v1/health", s.handleHealth},
		{"/mesh/v1/forward", s.handleForward},
		{"/mesh/v1/synthetic/probe", s.handleSyntheticProbe},
	}

	router := http.NewServeMux()
	for _, endpoint := range endpoints {
		router.HandleFunc(endpoint.pattern, endpoint.serve)
	}
	return router
}
// handleHealth serves POST /mesh/v1/health.
//
// It decodes a HealthMessage from the request body and answers with a JSON
// HealthAck accepted by the local node. Requests are rejected with:
//   - 405 (plus an Allow header) for any method other than POST,
//   - 400 for an undecodable body or an unsupported protocol version,
//   - 403 when the sender fails ValidatePeer or the message is addressed to
//     a different node.
func (s Server) handleHealth(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		// RFC 9110 requires a 405 response to list the supported methods.
		w.Header().Set("Allow", http.MethodPost)
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}
	var message HealthMessage
	if err := json.NewDecoder(r.Body).Decode(&message); err != nil {
		http.Error(w, "invalid health message", http.StatusBadRequest)
		return
	}
	if message.ProtocolVersion != ProtocolVersion {
		http.Error(w, "unsupported mesh protocol version", http.StatusBadRequest)
		return
	}
	if err := ValidatePeer(s.Local, message.From); err != nil {
		http.Error(w, err.Error(), http.StatusForbidden)
		return
	}
	// An empty recipient is accepted; a named one must be this node.
	if message.To.NodeID != "" && message.To.NodeID != s.Local.NodeID {
		http.Error(w, ErrNodeMismatch.Error(), http.StatusForbidden)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	// The status line is already committed once encoding starts, so a write
	// failure here cannot be reported to the client and is ignored.
	_ = json.NewEncoder(w).Encode(HealthAck{
		ProtocolVersion: ProtocolVersion,
		Accepted:        true,
		By:              s.Local,
	})
}
// handleForward serves POST /mesh/v1/forward.
//
// The request passes through these stages, each of which may reject it and
// emit a "production_forward_rejected" log entry:
//  1. Method check: anything but POST yields 405 (with an Allow header).
//  2. Feature gate: when ProductionForwardingEnabled is false, 501.
//  3. Decode: a body that is not a valid ProductionEnvelope yields 400.
//  4. Validation: ValidateProductionEnvelope and
//     ValidateProductionEnvelopeRouteConfig; the status comes from
//     forwardStatusCode.
//  5. Observation: the optional observer is invoked; failure yields 500.
//  6. Delivery: when this node is the destination, respond 200 with
//     Delivered set.
//  7. Forwarding: otherwise hand the envelope to ProductionForwardTransport
//     with hop bookkeeping updated, and respond with the transport's result
//     annotated with this node's identity.
func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		// RFC 9110 requires a 405 response to list the supported methods.
		w.Header().Set("Allow", http.MethodPost)
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}
	if !s.ProductionForwardingEnabled {
		s.logProductionForward(ProductionForwardLogEntry{
			Event:       "production_forward_rejected",
			ClusterID:   s.Local.ClusterID,
			LocalNodeID: s.Local.NodeID,
			Reason:      ErrForwardDisabled.Error(),
			StatusCode:  http.StatusNotImplemented,
			OccurredAt:  time.Now().UTC(),
		})
		http.Error(w, ErrForwardDisabled.Error(), http.StatusNotImplemented)
		return
	}
	var envelope ProductionEnvelope
	if err := json.NewDecoder(r.Body).Decode(&envelope); err != nil {
		// No envelope fields are available here, so the log entry carries only
		// the local identity.
		s.logProductionForward(ProductionForwardLogEntry{
			Event:       "production_forward_rejected",
			ClusterID:   s.Local.ClusterID,
			LocalNodeID: s.Local.NodeID,
			Reason:      "invalid production mesh envelope",
			StatusCode:  http.StatusBadRequest,
			OccurredAt:  time.Now().UTC(),
		})
		http.Error(w, "invalid production mesh envelope", http.StatusBadRequest)
		return
	}
	if err := ValidateProductionEnvelope(s.Local, envelope, time.Now().UTC()); err != nil {
		s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
		return
	}
	if err := ValidateProductionEnvelopeRouteConfig(s.Local, envelope, s.ProductionRoutes, time.Now().UTC()); err != nil {
		s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
		return
	}
	// "accepted" only means validation passed; the request may still be
	// rejected by the stages below.
	s.logProductionForward(productionForwardLogEntry("production_forward_accepted", s.Local, envelope, "", 0))
	if s.ProductionEnvelopeObserver != nil {
		observation := NewProductionEnvelopeObservation(envelope, time.Now().UTC())
		if err := observeProductionEnvelope(r.Context(), s.ProductionEnvelopeObserver, observation); err != nil {
			// Fail closed: the client always sees the generic observation
			// failure, never the observer's own error text.
			s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, ErrForwardObservationFailed.Error(), http.StatusInternalServerError))
			http.Error(w, ErrForwardObservationFailed.Error(), http.StatusInternalServerError)
			return
		}
	}
	if envelope.DestinationNodeID == s.Local.NodeID {
		s.logProductionForward(productionForwardLogEntry("production_forward_delivered", s.Local, envelope, "", http.StatusOK))
		writeProductionForwardResult(w, ProductionForwardResult{
			Accepted:  true,
			Delivered: true,
			By:        s.Local,
			MessageID: envelope.MessageID,
			RouteID:   envelope.RouteID,
		})
		return
	}
	// Not the destination, yet asked to forward to ourselves: treat as a loop.
	if envelope.NextHopNodeID == s.Local.NodeID {
		s.rejectProductionForward(w, envelope, ErrLoopDetected, forwardStatusCode(ErrLoopDetected))
		return
	}
	// Without a route path only a direct hop to the destination is supported.
	if len(envelope.RoutePath) == 0 && envelope.NextHopNodeID != envelope.DestinationNodeID {
		s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
		return
	}
	if s.ProductionForwardTransport == nil {
		s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
		return
	}
	// The forwarded copy carries TTL-1, so a TTL of 1 or less cannot be forwarded.
	if envelope.TTL <= 1 {
		s.rejectProductionForward(w, envelope, ErrTTLExhausted, forwardStatusCode(ErrTTLExhausted))
		return
	}
	forwarded := envelope
	forwarded.CurrentHopNodeID = envelope.NextHopNodeID
	forwarded.NextHopNodeID = nextProductionHopAfter(envelope.RoutePath, envelope.NextHopNodeID, envelope.DestinationNodeID)
	forwarded.TTL = envelope.TTL - 1
	forwarded.HopCount = envelope.HopCount + 1
	// Copy before appending so the incoming envelope's slice is never mutated.
	forwarded.VisitedNodeIDs = append(append([]string{}, envelope.VisitedNodeIDs...), s.Local.NodeID)
	result, err := s.ProductionForwardTransport.SendProduction(r.Context(), envelope.NextHopNodeID, forwarded)
	if err != nil {
		s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
		return
	}
	s.logProductionForward(productionForwardLogEntry("production_forward_forwarded", s.Local, envelope, "", http.StatusOK))
	// Fields set by the downstream node (e.g. Delivered) are kept; the fields
	// below are overwritten to describe this hop.
	result.Accepted = true
	result.Forwarded = true
	result.By = s.Local
	result.MessageID = envelope.MessageID
	result.RouteID = envelope.RouteID
	result.NextNodeID = envelope.NextHopNodeID
	writeProductionForwardResult(w, result)
}
// rejectProductionForward records a "production_forward_rejected" log entry
// for envelope and replies to the client with err's message and statusCode.
func (s Server) rejectProductionForward(w http.ResponseWriter, envelope ProductionEnvelope, err error, statusCode int) {
	reason := err.Error()
	entry := productionForwardLogEntry("production_forward_rejected", s.Local, envelope, reason, statusCode)
	s.logProductionForward(entry)
	http.Error(w, reason, statusCode)
}
// logProductionForward hands entry to the configured ProductionForwardLogger.
// It is a no-op when no logger is set. An entry without a timestamp is stamped
// with the current UTC time before being passed on.
func (s Server) logProductionForward(entry ProductionForwardLogEntry) {
	if emit := s.ProductionForwardLogger; emit != nil {
		if entry.OccurredAt.IsZero() {
			entry.OccurredAt = time.Now().UTC()
		}
		emit(entry)
	}
}
// productionForwardLogEntry builds a log entry for event from the envelope's
// routing metadata. LocalNodeID comes from local, ClusterID from the envelope,
// and the entry is timestamped with the current UTC time. Only the lengths of
// the route path and visited list are recorded, not their contents.
func productionForwardLogEntry(event string, local PeerIdentity, envelope ProductionEnvelope, reason string, statusCode int) ProductionForwardLogEntry {
	var entry ProductionForwardLogEntry

	// Outcome of the forwarding step.
	entry.Event = event
	entry.Reason = reason
	entry.StatusCode = statusCode
	entry.OccurredAt = time.Now().UTC()

	// Identity of the message and the nodes involved.
	entry.RouteID = envelope.RouteID
	entry.MessageID = envelope.MessageID
	entry.ClusterID = envelope.ClusterID
	entry.LocalNodeID = local.NodeID
	entry.SourceNodeID = envelope.SourceNodeID
	entry.DestinationNodeID = envelope.DestinationNodeID
	entry.CurrentHopNodeID = envelope.CurrentHopNodeID
	entry.NextHopNodeID = envelope.NextHopNodeID

	// Classification and hop bookkeeping.
	entry.ChannelClass = envelope.ChannelClass
	entry.MessageType = envelope.MessageType
	entry.TTL = envelope.TTL
	entry.HopCount = envelope.HopCount
	entry.RoutePathLength = len(envelope.RoutePath)
	entry.VisitedCount = len(envelope.VisitedNodeIDs)
	entry.PayloadLength = envelope.PayloadLength

	return entry
}
// nextProductionHopAfter returns the node that follows currentNodeID in
// routePath.
//
// It returns destinationNodeID when routePath is empty or does not contain
// currentNodeID, and currentNodeID itself when that node is the last element
// of the path. Only the first occurrence of currentNodeID is considered.
func nextProductionHopAfter(routePath []string, currentNodeID string, destinationNodeID string) string {
	last := len(routePath) - 1
	for position := 0; position <= last; position++ {
		if routePath[position] != currentNodeID {
			continue
		}
		if position == last {
			// Already at the end of the path: there is no further hop.
			return currentNodeID
		}
		return routePath[position+1]
	}
	return destinationNodeID
}
// writeProductionForwardResult writes result to w as a JSON body with the
// application/json content type. No explicit status is set, so the response
// uses net/http's default of 200.
func writeProductionForwardResult(w http.ResponseWriter, result ProductionForwardResult) {
	headers := w.Header()
	headers.Set("Content-Type", "application/json")

	encoder := json.NewEncoder(w)
	// Best effort: once the body is being written there is no way to report an
	// encoding failure to the client.
	_ = encoder.Encode(result)
}
// observeProductionEnvelope invokes observer with observation and returns its
// error. A nil observer is treated as success. If the observer panics, the
// panic is recovered and ErrForwardObservationFailed is returned instead.
func observeProductionEnvelope(ctx context.Context, observer ProductionEnvelopeObserver, observation ProductionEnvelopeObservation) (err error) {
	if observer != nil {
		// The named result lets the deferred recover replace the return value.
		defer func() {
			if recovered := recover(); recovered != nil {
				err = ErrForwardObservationFailed
			}
		}()
		err = observer(ctx, observation)
	}
	return err
}
// handleSyntheticProbe serves POST /mesh/v1/synthetic/probe.
//
// It decodes a SyntheticEnvelope, passes it to SyntheticRuntime.Receive and
// writes the returned value back as JSON. Responses:
//   - 405 for any method other than POST (with an Allow header),
//   - 503 when no SyntheticRuntime is configured,
//   - 400 when the body is not a valid envelope,
//   - the status chosen by syntheticStatusCode when Receive fails,
//   - 200 with the JSON-encoded result otherwise.
func (s Server) handleSyntheticProbe(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		// RFC 9110 requires a 405 response to list the supported methods.
		w.Header().Set("Allow", http.MethodPost)
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}
	if s.SyntheticRuntime == nil {
		http.Error(w, ErrMeshRuntimeDisabled.Error(), http.StatusServiceUnavailable)
		return
	}
	var envelope SyntheticEnvelope
	if err := json.NewDecoder(r.Body).Decode(&envelope); err != nil {
		http.Error(w, "invalid synthetic mesh envelope", http.StatusBadRequest)
		return
	}
	ack, err := s.SyntheticRuntime.Receive(r.Context(), envelope)
	if err != nil {
		http.Error(w, err.Error(), syntheticStatusCode(err))
		return
	}
	w.Header().Set("Content-Type", "application/json")
	// Best effort: an encode/write failure cannot be reported once the
	// response has started.
	_ = json.NewEncoder(w).Encode(ack)
}
// NewHealthMessage builds a health message from one peer to another. The
// message uses the current ProtocolVersion, is stamped with the current UTC
// time and always reports the link status "reachable".
func NewHealthMessage(from, to PeerIdentity) HealthMessage {
	var message HealthMessage
	message.ProtocolVersion = ProtocolVersion
	message.From = from
	message.To = to
	message.ObservedAt = time.Now().UTC()
	message.LinkStatus = "reachable"
	return message
}
// syntheticStatusCode maps an error from the synthetic runtime to the HTTP
// status returned by the synthetic probe endpoint.
//
// Sentinels are matched with errors.Is, so an error that wraps a sentinel
// (for example via fmt.Errorf with %w) maps to the same status as the bare
// sentinel. Groups are checked in the order listed and the first match wins.
// Any unrecognised error, including nil, maps to 400 Bad Request.
func syntheticStatusCode(err error) int {
	// matches reports whether err is, or wraps, any of the given sentinels.
	matches := func(sentinels ...error) bool {
		for _, sentinel := range sentinels {
			if errors.Is(err, sentinel) {
				return true
			}
		}
		return false
	}

	switch {
	case matches(ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel, ErrLoopDetected):
		return http.StatusForbidden
	case matches(ErrMeshRuntimeDisabled):
		return http.StatusServiceUnavailable
	case matches(ErrRouteExpired, ErrTTLExhausted, ErrInvalidRoutePath, ErrUnsupportedSyntheticMessage, ErrRouteIDRequired):
		return http.StatusBadRequest
	case matches(ErrRouteNotFound, ErrSyntheticPeerUnavailable):
		return http.StatusNotFound
	default:
		return http.StatusBadRequest
	}
}
// forwardStatusCode maps a production forwarding error to the HTTP status
// returned by the forward endpoint.
//
// Sentinels are matched with errors.Is, so an error that wraps a sentinel
// (for example a transport error wrapping ErrForwardPeerUnavailable) maps to
// the same status as the bare sentinel. Groups are checked in the order listed
// and the first match wins. Any unrecognised error, including nil, maps to
// 400 Bad Request.
func forwardStatusCode(err error) int {
	// matches reports whether err is, or wraps, any of the given sentinels.
	matches := func(sentinels ...error) bool {
		for _, sentinel := range sentinels {
			if errors.Is(err, sentinel) {
				return true
			}
		}
		return false
	}

	switch {
	case matches(ErrClusterMismatch, ErrNodeMismatch, ErrUnauthorizedChannel, ErrLoopDetected):
		return http.StatusForbidden
	case matches(ErrRouteExpired, ErrTTLExhausted, ErrInvalidRoutePath, ErrRouteIDRequired):
		return http.StatusBadRequest
	case matches(ErrForwardRuntimeUnavailable):
		return http.StatusNotImplemented
	case matches(ErrRouteNotFound):
		return http.StatusNotFound
	case matches(ErrForwardPeerUnavailable):
		return http.StatusBadGateway
	default:
		return http.StatusBadRequest
	}
}
@@ -0,0 +1,802 @@
package mesh
import (
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestMeshHealthAcceptsSameCluster sends a health message between two nodes of
// the same cluster through the mesh client and expects an accepting ack signed
// by the receiving node.
func TestMeshHealthAcceptsSameCluster(t *testing.T) {
	receiver := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	sender := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}

	srv := httptest.NewServer(Server{Local: receiver}.Handler())
	defer srv.Close()

	ack, err := NewClient(srv.URL).SendHealth(context.Background(), NewHealthMessage(sender, receiver))
	if err != nil {
		t.Fatalf("send health: %v", err)
	}
	if ok := ack.Accepted && ack.By.NodeID == "node-b"; !ok {
		t.Fatalf("unexpected ack: %+v", ack)
	}
}
// TestMeshHealthRejectsClusterMismatch posts a health message whose sender
// belongs to a different cluster and expects 403 Forbidden.
func TestMeshHealthRejectsClusterMismatch(t *testing.T) {
	receiver := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	srv := httptest.NewServer(Server{Local: receiver}.Handler())
	defer srv.Close()

	foreignSender := PeerIdentity{ClusterID: "cluster-2", NodeID: "node-a"}
	body, err := json.Marshal(NewHealthMessage(foreignSender, receiver))
	if err != nil {
		t.Fatalf("marshal message: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/health", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post health: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusForbidden; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
func TestMeshForwardingDisabled(t *testing.T) {
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
defer server.Close()
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/octet-stream", bytes.NewReader([]byte("payload")))
if err != nil {
t.Fatalf("post forward: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusNotImplemented {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented)
}
}
// TestMeshForwardingGateEnabledStillHasNoProductionRuntime enables the
// forwarding gate without configuring a transport and expects a valid envelope
// bound for another node to be answered with 501 Not Implemented.
func TestMeshForwardingGateEnabledStillHasNoProductionRuntime(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	gate := Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
	}
	srv := httptest.NewServer(gate.Handler())
	defer srv.Close()

	body, err := json.Marshal(validProductionEnvelope(identity))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusNotImplemented; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateDeliversFabricControlAtDestination posts an envelope
// whose destination is the receiving node and expects a 200 response marked
// accepted and delivered (not forwarded), plus "accepted" and "delivered" log
// events.
func TestMeshForwardingGateDeliversFabricControlAtDestination(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}

	var logged []ProductionForwardLogEntry
	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
			logged = append(logged, entry)
		},
	}.Handler())
	defer srv.Close()

	// Address the envelope to the receiving node itself.
	envelope := validProductionEnvelope(identity)
	envelope.SourceNodeID = "node-a"
	envelope.DestinationNodeID = identity.NodeID
	envelope.CurrentHopNodeID = identity.NodeID
	envelope.NextHopNodeID = identity.NodeID

	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusOK; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}

	var outcome ProductionForwardResult
	if err := json.NewDecoder(res.Body).Decode(&outcome); err != nil {
		t.Fatalf("decode result: %v", err)
	}
	deliveredLocally := outcome.Accepted && outcome.Delivered && !outcome.Forwarded && outcome.By.NodeID == identity.NodeID
	if !deliveredLocally {
		t.Fatalf("unexpected result: %+v", outcome)
	}

	for _, event := range []string{"production_forward_accepted", "production_forward_delivered"} {
		if !hasProductionForwardEvent(logged, event) {
			t.Fatalf("missing production forward events: %+v", logged)
		}
	}
}
// TestMeshForwardingGateForwardsDirectFabricControlToNextHop wires node-b to
// node-c over the HTTP forward transport, posts an envelope for node-c to
// node-b, and expects node-b to report it as forwarded and delivered while
// node-c's observer sees the forwarded envelope.
func TestMeshForwardingGateForwardsDirectFabricControlToNextHop(t *testing.T) {
	nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}

	var seenAtDestination ProductionEnvelopeObservation
	destination := httptest.NewServer(Server{
		Local:                       nodeC,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
			seenAtDestination = observation
			return nil
		},
	}.Handler())
	defer destination.Close()

	nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	peers := map[string]string{nodeC.NodeID: destination.URL}
	entry := httptest.NewServer(Server{
		Local:                       nodeB,
		ProductionForwardingEnabled: true,
		ProductionForwardTransport:  NewHTTPProductionForwardTransport(peers),
	}.Handler())
	defer entry.Close()

	envelope := validProductionEnvelope(nodeB)
	envelope.SourceNodeID = "node-a"
	envelope.DestinationNodeID = nodeC.NodeID
	envelope.CurrentHopNodeID = nodeB.NodeID
	envelope.NextHopNodeID = nodeC.NodeID

	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(entry.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusOK; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}

	var outcome ProductionForwardResult
	if err := json.NewDecoder(res.Body).Decode(&outcome); err != nil {
		t.Fatalf("decode result: %v", err)
	}
	relayed := outcome.Accepted && outcome.Forwarded && outcome.Delivered &&
		outcome.NextNodeID == nodeC.NodeID && outcome.By.NodeID == nodeB.NodeID
	if !relayed {
		t.Fatalf("unexpected forward result: %+v", outcome)
	}

	observedForwarded := seenAtDestination.CurrentHopNodeID == nodeC.NodeID && seenAtDestination.MessageID == envelope.MessageID
	if !observedForwarded {
		t.Fatalf("destination did not observe forwarded envelope: %+v", seenAtDestination)
	}
}
// TestMeshForwardingGateForwardsMultiHopFabricControlByRoutePath chains three
// servers (node-b -> node-r -> node-c) and posts an envelope carrying an
// explicit route path to node-b. It checks the result reported by node-b, the
// observation made at node-c (final hop and visited list) and that both relays
// logged a "forwarded" event.
func TestMeshForwardingGateForwardsMultiHopFabricControlByRoutePath(t *testing.T) {
	nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}

	var (
		seenAtDestination ProductionEnvelopeObservation
		relayRLog         []ProductionForwardLogEntry
		relayBLog         []ProductionForwardLogEntry
	)

	destination := httptest.NewServer(Server{
		Local:                       nodeC,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
			seenAtDestination = observation
			return nil
		},
	}.Handler())
	defer destination.Close()

	nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
	relayR := httptest.NewServer(Server{
		Local:                       nodeR,
		ProductionForwardingEnabled: true,
		ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
			nodeC.NodeID: destination.URL,
		}),
		ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
			relayRLog = append(relayRLog, entry)
		},
	}.Handler())
	defer relayR.Close()

	nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	relayB := httptest.NewServer(Server{
		Local:                       nodeB,
		ProductionForwardingEnabled: true,
		ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
			nodeR.NodeID: relayR.URL,
		}),
		ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
			relayBLog = append(relayBLog, entry)
		},
	}.Handler())
	defer relayB.Close()

	envelope := validProductionEnvelope(nodeB)
	envelope.SourceNodeID = "node-a"
	envelope.DestinationNodeID = nodeC.NodeID
	envelope.CurrentHopNodeID = nodeB.NodeID
	envelope.NextHopNodeID = nodeR.NodeID
	envelope.RoutePath = []string{"node-a", nodeB.NodeID, nodeR.NodeID, nodeC.NodeID}

	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(relayB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusOK; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}

	var outcome ProductionForwardResult
	if err := json.NewDecoder(res.Body).Decode(&outcome); err != nil {
		t.Fatalf("decode result: %v", err)
	}
	relayed := outcome.Accepted && outcome.Forwarded && outcome.Delivered &&
		outcome.NextNodeID == nodeR.NodeID && outcome.By.NodeID == nodeB.NodeID
	if !relayed {
		t.Fatalf("unexpected multi-hop result: %+v", outcome)
	}

	// At the destination both the current and the next hop name node-c.
	atFinalHop := seenAtDestination.CurrentHopNodeID == nodeC.NodeID && seenAtDestination.NextHopNodeID == nodeC.NodeID
	if !atFinalHop {
		t.Fatalf("destination did not observe final hop: %+v", seenAtDestination)
	}

	visited := seenAtDestination.VisitedNodeIDs
	if len(visited) != 2 || visited[0] != nodeB.NodeID || visited[1] != nodeR.NodeID {
		t.Fatalf("visited path not propagated: %+v", visited)
	}

	bothForwarded := hasProductionForwardEvent(relayBLog, "production_forward_forwarded") &&
		hasProductionForwardEvent(relayRLog, "production_forward_forwarded")
	if !bothForwarded {
		t.Fatalf("missing relay forward events: nodeB=%+v nodeR=%+v", relayBLog, relayRLog)
	}
}
// TestMeshForwardingGateForwardsConfiguredProductionRoute configures the same
// production route on three chained servers (node-b -> node-r -> node-c),
// posts an envelope following that route to node-b and expects node-c's
// observer to see it.
func TestMeshForwardingGateForwardsConfiguredProductionRoute(t *testing.T) {
	nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
	route := configuredProductionRoute("route-1", []string{"node-a", "node-b", "node-r", nodeC.NodeID})

	var seenAtDestination ProductionEnvelopeObservation
	destination := httptest.NewServer(Server{
		Local:                       nodeC,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            []SyntheticRoute{route},
		ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
			seenAtDestination = observation
			return nil
		},
	}.Handler())
	defer destination.Close()

	nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
	relayR := httptest.NewServer(Server{
		Local:                       nodeR,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            []SyntheticRoute{route},
		ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
			nodeC.NodeID: destination.URL,
		}),
	}.Handler())
	defer relayR.Close()

	nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	relayB := httptest.NewServer(Server{
		Local:                       nodeB,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            []SyntheticRoute{route},
		ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
			nodeR.NodeID: relayR.URL,
		}),
	}.Handler())
	defer relayB.Close()

	envelope := validProductionEnvelope(nodeB)
	envelope.SourceNodeID = "node-a"
	envelope.DestinationNodeID = nodeC.NodeID
	envelope.CurrentHopNodeID = nodeB.NodeID
	envelope.NextHopNodeID = nodeR.NodeID
	envelope.RoutePath = route.Hops

	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(relayB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusOK; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}

	delivered := seenAtDestination.RouteID == route.RouteID && seenAtDestination.CurrentHopNodeID == nodeC.NodeID
	if !delivered {
		t.Fatalf("configured route was not delivered: %+v", seenAtDestination)
	}
}
// TestMeshForwardingGateRejectsUnknownConfiguredProductionRoute configures
// only "route-other" on the server and posts the default valid envelope, which
// names a different route; it expects 404 Not Found.
func TestMeshForwardingGateRejectsUnknownConfiguredProductionRoute(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	otherRoute := configuredProductionRoute("route-other", []string{"node-a", identity.NodeID, "node-c"})

	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            []SyntheticRoute{otherRoute},
	}.Handler())
	defer srv.Close()

	body, err := json.Marshal(validProductionEnvelope(identity))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusNotFound; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsConfiguredProductionRouteWrongNextHop posts an
// envelope that follows a configured route but names node-c as next hop where
// the route lists node-r after node-b; it expects 400 Bad Request.
func TestMeshForwardingGateRejectsConfiguredProductionRouteWrongNextHop(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	route := configuredProductionRoute("route-1", []string{"node-a", identity.NodeID, "node-r", "node-c"})

	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            []SyntheticRoute{route},
	}.Handler())
	defer srv.Close()

	envelope := validProductionEnvelope(identity)
	envelope.SourceNodeID = "node-a"
	envelope.DestinationNodeID = "node-c"
	envelope.CurrentHopNodeID = identity.NodeID
	envelope.NextHopNodeID = "node-c" // skips node-r
	envelope.RoutePath = route.Hops

	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusBadRequest; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsRoutePathWrongNextHop posts an envelope whose
// next hop (node-x) does not appear in its own route path and expects 400 Bad
// Request together with a "rejected" log event.
func TestMeshForwardingGateRejectsRoutePathWrongNextHop(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	var logged []ProductionForwardLogEntry
	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
			logged = append(logged, entry)
		},
	}.Handler())
	defer srv.Close()

	envelope := validProductionEnvelope(identity)
	envelope.SourceNodeID = "node-a"
	envelope.DestinationNodeID = "node-c"
	envelope.CurrentHopNodeID = identity.NodeID
	envelope.NextHopNodeID = "node-x"
	envelope.RoutePath = []string{"node-a", identity.NodeID, "node-r", "node-c"}

	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusBadRequest; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if rejected := hasProductionForwardEvent(logged, "production_forward_rejected"); !rejected {
		t.Fatalf("missing reject event: %+v", logged)
	}
}
// TestMeshForwardingGateRejectsRoutePathLoop posts an envelope whose route
// path lists the receiving node twice and expects 403 Forbidden.
func TestMeshForwardingGateRejectsRoutePathLoop(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
	}.Handler())
	defer srv.Close()

	envelope := validProductionEnvelope(identity)
	envelope.SourceNodeID = "node-a"
	envelope.DestinationNodeID = "node-c"
	envelope.CurrentHopNodeID = identity.NodeID
	envelope.NextHopNodeID = "node-r"
	// The local node appears both before and after node-r.
	envelope.RoutePath = []string{"node-a", identity.NodeID, "node-r", identity.NodeID, "node-c"}

	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusForbidden; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsInvalidProductionEnvelope corrupts the payload
// hash of an otherwise valid envelope and expects 400 Bad Request.
func TestMeshForwardingGateRejectsInvalidProductionEnvelope(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
	}.Handler())
	defer srv.Close()

	tampered := validProductionEnvelope(identity)
	tampered.PayloadHash = "bad-hash"

	body, err := json.Marshal(tampered)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusBadRequest; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsOversizedProductionEnvelopePayload sends an
// envelope whose payload is a JSON string of
// MaxProductionEnvelopePayloadBytes+1 characters, with matching length and
// hash, and expects 400 Bad Request without the observer being invoked.
func TestMeshForwardingGateRejectsOversizedProductionEnvelopePayload(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	observerCalls := 0
	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error {
			observerCalls++
			return nil
		},
	}.Handler())
	defer srv.Close()

	// Build `"aaaa…a"` — a quoted JSON string one byte over the limit.
	oversized := make([]byte, 0, MaxProductionEnvelopePayloadBytes+3)
	oversized = append(oversized, '"')
	oversized = append(oversized, bytes.Repeat([]byte("a"), MaxProductionEnvelopePayloadBytes+1)...)
	oversized = append(oversized, '"')

	envelope := validProductionEnvelope(identity)
	envelope.Payload = json.RawMessage(oversized)
	digest := sha256.Sum256(envelope.Payload)
	envelope.PayloadLength = len(envelope.Payload)
	envelope.PayloadHash = hex.EncodeToString(digest[:])

	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusBadRequest; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if observerCalls != 0 {
		t.Fatal("observer called for oversized envelope")
	}
}
// TestMeshForwardingGateRejectsFutureCreatedAt sends an envelope created one
// second beyond MaxProductionEnvelopeFutureSkew in the future and expects 400
// Bad Request without the observer being invoked.
func TestMeshForwardingGateRejectsFutureCreatedAt(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	observerCalls := 0
	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error {
			observerCalls++
			return nil
		},
	}.Handler())
	defer srv.Close()

	envelope := validProductionEnvelope(identity)
	tooFarAhead := time.Now().UTC().Add(MaxProductionEnvelopeFutureSkew + time.Second)
	envelope.CreatedAt = tooFarAhead
	envelope.ExpiresAt = tooFarAhead.Add(time.Minute)

	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusBadRequest; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if observerCalls != 0 {
		t.Fatal("observer called for future-created envelope")
	}
}
// TestMeshForwardingGateObservesValidEnvelopeWithoutPayload posts a valid
// envelope to a server with an observer but no transport. The request ends in
// 501, yet the observer must have received the envelope's message ID, route
// ID, payload hash and payload length.
func TestMeshForwardingGateObservesValidEnvelopeWithoutPayload(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	var captured ProductionEnvelopeObservation
	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
			captured = observation
			return nil
		},
	}.Handler())
	defer srv.Close()

	envelope := validProductionEnvelope(identity)
	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusNotImplemented; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}

	sameIdentity := captured.MessageID == envelope.MessageID && captured.RouteID == envelope.RouteID
	if !sameIdentity {
		t.Fatalf("unexpected observation: %+v", captured)
	}
	samePayloadMeta := captured.PayloadHash == envelope.PayloadHash && captured.PayloadLength == envelope.PayloadLength
	if !samePayloadMeta {
		t.Fatalf("payload metadata missing from observation: %+v", captured)
	}
}
// TestMeshForwardingGateDoesNotObserveRejectedEnvelope posts an envelope for
// the wrong cluster and expects 403 Forbidden without the observer being
// invoked.
func TestMeshForwardingGateDoesNotObserveRejectedEnvelope(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	observerCalls := 0
	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error {
			observerCalls++
			return nil
		},
	}.Handler())
	defer srv.Close()

	foreign := validProductionEnvelope(identity)
	foreign.ClusterID = "wrong-cluster"

	body, err := json.Marshal(foreign)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusForbidden; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if observerCalls != 0 {
		t.Fatal("observer called for rejected envelope")
	}
}
// TestMeshForwardingGateFailsClosedWhenObservationFails installs an observer
// that always returns an error and expects a valid envelope to be answered
// with 500 Internal Server Error.
func TestMeshForwardingGateFailsClosedWhenObservationFails(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	failingObserver := func(context.Context, ProductionEnvelopeObservation) error {
		return errors.New("observer down")
	}

	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  failingObserver,
	}.Handler())
	defer srv.Close()

	body, err := json.Marshal(validProductionEnvelope(identity))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusInternalServerError; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateFailsClosedWhenObservationPanics installs an observer
// that panics and expects a valid envelope to be answered with 500 Internal
// Server Error rather than a dropped connection.
func TestMeshForwardingGateFailsClosedWhenObservationPanics(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	panickingObserver := func(context.Context, ProductionEnvelopeObservation) error {
		panic("observer panic")
	}

	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  panickingObserver,
	}.Handler())
	defer srv.Close()

	body, err := json.Marshal(validProductionEnvelope(identity))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusInternalServerError; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
func TestObserveProductionEnvelopeAllowsNilObserver(t *testing.T) {
if err := observeProductionEnvelope(context.Background(), nil, ProductionEnvelopeObservation{}); err != nil {
t.Fatalf("observeProductionEnvelope nil observer err = %v", err)
}
}
// TestProductionEnvelopeObservationSinkKeepsBoundedMetadata feeds three
// envelopes through a server whose observer is a sink of capacity 2. It
// expects the sink to retain only the two most recent observations, with
// payload metadata, and to report matching metrics.
func TestProductionEnvelopeObservationSinkKeepsBoundedMetadata(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	sink := NewProductionEnvelopeObservationSink(2)

	srv := httptest.NewServer(Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  sink.Observe,
	}.Handler())
	defer srv.Close()

	// Send message-1, message-2 and message-3 in order.
	for _, digit := range []rune{'1', '2', '3'} {
		envelope := validProductionEnvelope(identity)
		envelope.MessageID = "message-" + string(digit)

		body, err := json.Marshal(envelope)
		if err != nil {
			t.Fatalf("marshal envelope: %v", err)
		}
		res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
		if err != nil {
			t.Fatalf("post forward: %v", err)
		}
		res.Body.Close()
		if got, want := res.StatusCode, http.StatusNotImplemented; got != want {
			t.Fatalf("status = %d, want %d", got, want)
		}
	}

	kept := sink.Snapshot()
	if len(kept) != 2 {
		t.Fatalf("observation count = %d, want 2", len(kept))
	}
	newestTwo := kept[0].MessageID == "message-2" && kept[1].MessageID == "message-3"
	if !newestTwo {
		t.Fatalf("unexpected bounded observations: %+v", kept)
	}
	if kept[0].PayloadHash == "" || kept[0].PayloadLength == 0 {
		t.Fatalf("payload metadata missing from bounded observation: %+v", kept[0])
	}

	stats := sink.Metrics()
	statsOK := stats.Capacity == 2 && stats.CurrentDepth == 2 && stats.AcceptedTotal == 3 && stats.DroppedOldest == 1
	if !statsOK {
		t.Fatalf("unexpected sink metrics: %+v", stats)
	}
}
// TestProductionEnvelopeObservationSinkMetricsStartEmpty checks that a sink
// created with capacity 3 reports that capacity and zero for depth, accepted
// total and dropped-oldest before anything has been observed.
func TestProductionEnvelopeObservationSinkMetricsStartEmpty(t *testing.T) {
	metrics := NewProductionEnvelopeObservationSink(3).Metrics()
	untouched := metrics.Capacity == 3 &&
		metrics.CurrentDepth == 0 &&
		metrics.AcceptedTotal == 0 &&
		metrics.DroppedOldest == 0
	if !untouched {
		t.Fatalf("unexpected empty metrics: %+v", metrics)
	}
}
// TestMeshForwardingGateRejectsServiceChannel posts an otherwise valid
// production envelope whose ChannelClass is "render" and expects the forward
// endpoint to answer 403 Forbidden.
func TestMeshForwardingGateRejectsServiceChannel(t *testing.T) {
	identity := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	handler := Server{
		Local:                       identity,
		ProductionForwardingEnabled: true,
	}.Handler()
	server := httptest.NewServer(handler)
	defer server.Close()

	envelope := validProductionEnvelope(identity)
	envelope.ChannelClass = "render"
	body, err := json.Marshal(envelope)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer resp.Body.Close()

	if got, want := resp.StatusCode, http.StatusForbidden; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
func TestMeshForwardingRequiresPost(t *testing.T) {
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
defer server.Close()
resp, err := http.Get(server.URL + "/mesh/v1/forward")
if err != nil {
t.Fatalf("get forward: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusMethodNotAllowed {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusMethodNotAllowed)
}
}
// validProductionEnvelope builds the baseline envelope used by the forwarding
// tests: a fabric-control message on route "route-1" from node-a to node-c
// whose current hop is the given local node. The payload is a small fixed JSON
// document, and PayloadLength / PayloadHash are derived from it (hex-encoded
// SHA-256). The envelope is created "now" in UTC and expires one minute later.
func validProductionEnvelope(local PeerIdentity) ProductionEnvelope {
	body := json.RawMessage(`{"kind":"control"}`)
	digest := sha256.Sum256(body)
	createdAt := time.Now().UTC()

	var envelope ProductionEnvelope

	// Identity and routing.
	envelope.FabricProtocolVersion = ProtocolVersion
	envelope.MessageID = "message-1"
	envelope.RouteID = "route-1"
	envelope.ClusterID = local.ClusterID
	envelope.SourceNodeID = "node-a"
	envelope.DestinationNodeID = "node-c"
	envelope.CurrentHopNodeID = local.NodeID
	envelope.NextHopNodeID = "node-c"

	// Classification and hop budget.
	envelope.ChannelClass = ProductionChannelFabricControl
	envelope.MessageType = ProductionMessageFabricControl
	envelope.TTL = 4
	envelope.HopCount = 1

	// Validity window.
	envelope.CreatedAt = createdAt
	envelope.ExpiresAt = createdAt.Add(time.Minute)

	// Payload plus the metadata computed from it.
	envelope.PayloadLength = len(body)
	envelope.PayloadHash = hex.EncodeToString(digest[:])
	envelope.Payload = body

	return envelope
}
// configuredProductionRoute returns a route in "cluster-1" over the given hops
// that allows only the production fabric-control channel, expires one hour
// from now and caps TTL and hop count at 8. The first and last hops become
// the source and destination; hops is copied so the route does not share the
// caller's slice. hops must be non-empty: the function indexes hops[0].
func configuredProductionRoute(routeID string, hops []string) SyntheticRoute {
	source := hops[0]
	destination := hops[len(hops)-1]

	path := make([]string, len(hops))
	copy(path, hops)

	route := SyntheticRoute{
		RouteID:           routeID,
		ClusterID:         "cluster-1",
		SourceNodeID:      source,
		DestinationNodeID: destination,
		Hops:              path,
		AllowedChannels:   []string{ProductionChannelFabricControl},
		ExpiresAt:         time.Now().UTC().Add(time.Hour),
		MaxTTL:            8,
		MaxHops:           8,
	}
	return route
}
// hasProductionForwardEvent reports whether any entry in events has its Event
// field equal to event.
func hasProductionForwardEvent(events []ProductionForwardLogEntry, event string) bool {
	for i := range events {
		if events[i].Event != event {
			continue
		}
		return true
	}
	return false
}
func TestSyntheticEndpointDisabledByDefault(t *testing.T) {
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
defer server.Close()
resp, err := http.Post(server.URL+"/mesh/v1/synthetic/probe", "application/json", bytes.NewReader([]byte(`{}`)))
if err != nil {
t.Fatalf("post synthetic probe: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusServiceUnavailable {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusServiceUnavailable)
}
}
@@ -0,0 +1,280 @@
package mesh
import (
"sync"
"time"
)
// SyntheticRelaySchedulerConfig holds the settings that
// NewSyntheticRelayScheduler reads when building a scheduler.
type SyntheticRelaySchedulerConfig struct {
	// Enabled switches the scheduler on; a scheduler built with false
	// answers Enqueue and Dequeue with ErrMeshRuntimeDisabled.
	Enabled bool

	// Local is the identity of this node. Envelopes are validated against
	// its cluster and node IDs.
	Local PeerIdentity

	// QueuePolicies defines one queue per channel. Slice order is the
	// dequeue priority order. When empty, a built-in default set is used.
	QueuePolicies []SyntheticRelayQueuePolicy

	// AllowedChannels adds extra names to the allowed-channel set. A channel
	// still needs a queue policy for its envelopes to be accepted.
	AllowedChannels []string

	// AllowedMessageTypes lists accepted message types. When empty, a
	// built-in default list is used.
	AllowedMessageTypes []string

	// Now supplies timestamps for log entries. When nil, the current UTC
	// time is used.
	Now func() time.Time

	// Logger receives scheduler log entries. It may be nil, in which case
	// nothing is logged.
	Logger func(SyntheticLogEntry)
}
// SyntheticRelayScheduler queues synthetic envelopes per channel and hands
// them back in channel-priority order. Build one with
// NewSyntheticRelayScheduler. It contains a mutex, so use it through a
// pointer and do not copy it.
type SyntheticRelayScheduler struct {
	// Configuration fixed by the constructor.
	enabled             bool
	local               PeerIdentity
	policies            map[string]SyntheticRelayQueuePolicy // keyed by channel
	allowedChannels     map[string]struct{}
	allowedMessageTypes map[string]struct{}
	priorityOrder       []string // channels in dequeue priority order
	now                 func() time.Time
	logger              func(SyntheticLogEntry) // may be nil

	// mu is held by Enqueue, Dequeue, reject and SnapshotQueueMetrics while
	// they touch queues and metrics.
	mu      sync.Mutex
	queues  map[string][]SyntheticEnvelope // keyed by channel, oldest first
	metrics SyntheticRelayQueueMetrics
}
// NewSyntheticRelayScheduler builds a scheduler from cfg.
//
// Queue policies: when cfg.QueuePolicies is empty, three defaults are used
// (fabric control and route control with capacity 64 and not droppable,
// telemetry with capacity 16 and droppable). Policies with an empty Channel
// are skipped and a non-positive Capacity is raised to 1. The order of the
// policies becomes the dequeue priority order, and each policy's channel is
// added to the allowed-channel set.
//
// Non-empty entries of cfg.AllowedChannels are added to the allowed set as
// well. When cfg.AllowedMessageTypes is empty, a built-in list of synthetic
// message types is used. When cfg.Now is nil, the clock defaults to the
// current UTC time. cfg.Logger is stored as given and may be nil.
func NewSyntheticRelayScheduler(cfg SyntheticRelaySchedulerConfig) *SyntheticRelayScheduler {
	queuePolicies := cfg.QueuePolicies
	if len(queuePolicies) == 0 {
		queuePolicies = []SyntheticRelayQueuePolicy{
			{Channel: SyntheticChannelFabricControl, Capacity: 64, Droppable: false},
			{Channel: SyntheticChannelRouteControl, Capacity: 64, Droppable: false},
			{Channel: SyntheticChannelTelemetry, Capacity: 16, Droppable: true},
		}
	}

	acceptedTypes := cfg.AllowedMessageTypes
	if len(acceptedTypes) == 0 {
		acceptedTypes = []string{
			SyntheticMessageProbe,
			SyntheticMessageProbeAck,
			SyntheticMessageRouteHealth,
			SyntheticMessageRouteHealthAck,
			SyntheticMessageTelemetry,
			SyntheticMessageTestService,
			SyntheticMessageTestServiceAck,
		}
	}

	clock := cfg.Now
	if clock == nil {
		clock = func() time.Time { return time.Now().UTC() }
	}

	scheduler := &SyntheticRelayScheduler{
		enabled:             cfg.Enabled,
		local:               cfg.Local,
		policies:            map[string]SyntheticRelayQueuePolicy{},
		allowedChannels:     map[string]struct{}{},
		allowedMessageTypes: map[string]struct{}{},
		priorityOrder:       make([]string, 0, len(queuePolicies)),
		now:                 clock,
		logger:              cfg.Logger,
		queues:              map[string][]SyntheticEnvelope{},
		metrics: SyntheticRelayQueueMetrics{
			QueueDepths: map[string]int{},
		},
	}

	for _, policy := range queuePolicies {
		if policy.Channel == "" {
			continue
		}
		// A queue that can hold nothing would reject or drop every
		// envelope, so clamp the capacity to at least one slot.
		if policy.Capacity <= 0 {
			policy.Capacity = 1
		}
		scheduler.policies[policy.Channel] = policy
		scheduler.allowedChannels[policy.Channel] = struct{}{}
		scheduler.priorityOrder = append(scheduler.priorityOrder, policy.Channel)
	}

	for _, channel := range cfg.AllowedChannels {
		if channel == "" {
			continue
		}
		scheduler.allowedChannels[channel] = struct{}{}
	}

	for _, messageType := range acceptedTypes {
		if messageType == "" {
			continue
		}
		scheduler.allowedMessageTypes[messageType] = struct{}{}
	}

	return scheduler
}
// Enqueue validates envelope and appends it to the queue for its channel.
//
// A validation failure is recorded through reject and returned unchanged.
// When the channel queue is already at capacity:
//   - a non-droppable channel rejects the envelope with
//     ErrSyntheticRelayQueueFull, counts the rejection and logs
//     "fabric_relay_rejected";
//   - a droppable channel discards its oldest envelope, reports it in the
//     result (Dropped, DroppedSequence) and counts the drop.
//
// On success the result carries the channel, its capacity, the accepted
// sequence and the new queue depth, and "fabric_relay_enqueued" is logged.
// Logging happens after the mutex is released.
func (s *SyntheticRelayScheduler) Enqueue(envelope SyntheticEnvelope) (SyntheticRelayEnqueueResult, error) {
	if err := s.validateEnvelope(envelope); err != nil {
		s.reject(envelope, err)
		return SyntheticRelayEnqueueResult{}, err
	}

	channel := envelope.Channel
	policy := s.policies[channel]

	s.mu.Lock()
	pending := s.queues[channel]
	atCapacity := len(pending) >= policy.Capacity

	if atCapacity && !policy.Droppable {
		s.metrics.Rejected++
		s.metrics.LastRejectReason = ErrSyntheticRelayQueueFull.Error()
		s.mu.Unlock()

		s.log(SyntheticLogEntry{
			Event:         "fabric_relay_rejected",
			RouteID:       envelope.RouteID,
			ClusterID:     envelope.ClusterID,
			LocalNodeID:   s.local.NodeID,
			Channel:       channel,
			MessageType:   envelope.MessageType,
			Reason:        ErrSyntheticRelayQueueFull.Error(),
			QueueDepth:    len(pending),
			QueueCapacity: policy.Capacity,
			OccurredAt:    s.now(),
		})
		return SyntheticRelayEnqueueResult{}, ErrSyntheticRelayQueueFull
	}

	outcome := SyntheticRelayEnqueueResult{
		Channel:          channel,
		QueueCapacity:    policy.Capacity,
		AcceptedSequence: envelope.Sequence,
	}
	if atCapacity {
		// Droppable channel: make room by evicting the oldest envelope.
		outcome.Dropped = true
		outcome.DroppedSequence = pending[0].Sequence
		pending = pending[1:]
		s.metrics.Dropped++
	}

	pending = append(pending, envelope)
	s.queues[channel] = pending
	outcome.QueueDepth = len(pending)
	s.metrics.Enqueued++
	s.metrics.QueueDepths[channel] = outcome.QueueDepth
	s.mu.Unlock()

	s.log(SyntheticLogEntry{
		Event:           "fabric_relay_enqueued",
		RouteID:         envelope.RouteID,
		ClusterID:       envelope.ClusterID,
		LocalNodeID:     s.local.NodeID,
		Channel:         channel,
		MessageType:     envelope.MessageType,
		QueueDepth:      outcome.QueueDepth,
		QueueCapacity:   outcome.QueueCapacity,
		Dropped:         outcome.Dropped,
		DroppedSequence: outcome.DroppedSequence,
		OccurredAt:      s.now(),
	})
	return outcome, nil
}
// Dequeue removes and returns the oldest envelope from the highest-priority
// non-empty channel queue, where priority is the order of the configured
// queue policies.
//
// It returns ErrMeshRuntimeDisabled when the scheduler is nil or disabled
// (the nil case matches how Enqueue treats a nil scheduler) and
// ErrSyntheticRelayQueueEmpty when every queue is empty. A successful dequeue
// updates the dequeue counter and the channel depth, then logs
// "fabric_relay_dequeued" after the mutex is released.
func (s *SyntheticRelayScheduler) Dequeue() (SyntheticEnvelope, error) {
	if s == nil || !s.enabled {
		return SyntheticEnvelope{}, ErrMeshRuntimeDisabled
	}
	s.mu.Lock()
	for _, channel := range s.priorityOrder {
		queue := s.queues[channel]
		if len(queue) == 0 {
			continue
		}
		envelope := queue[0]
		// Clear the vacated slot so the backing array no longer keeps the
		// dequeued envelope (and its payload) reachable.
		queue[0] = SyntheticEnvelope{}
		queue = queue[1:]
		s.queues[channel] = queue
		s.metrics.Dequeued++
		s.metrics.QueueDepths[channel] = len(queue)
		s.mu.Unlock()
		s.log(SyntheticLogEntry{
			Event:         "fabric_relay_dequeued",
			RouteID:       envelope.RouteID,
			ClusterID:     envelope.ClusterID,
			LocalNodeID:   s.local.NodeID,
			Channel:       envelope.Channel,
			MessageType:   envelope.MessageType,
			QueueDepth:    len(queue),
			QueueCapacity: s.policies[channel].Capacity,
			OccurredAt:    s.now(),
		})
		return envelope, nil
	}
	s.mu.Unlock()
	return SyntheticEnvelope{}, ErrSyntheticRelayQueueEmpty
}
// SnapshotQueueMetrics returns a copy of the scheduler counters taken under
// the mutex. QueueDepths in the result is a fresh map: it starts from the
// recorded depths and is then overwritten with the current length of every
// live queue, so later scheduler activity does not change the snapshot.
func (s *SyntheticRelayScheduler) SnapshotQueueMetrics() SyntheticRelayQueueMetrics {
	s.mu.Lock()
	defer s.mu.Unlock()

	depthByChannel := make(map[string]int, len(s.metrics.QueueDepths))
	for channel, recorded := range s.metrics.QueueDepths {
		depthByChannel[channel] = recorded
	}
	// Live queue lengths take precedence over the recorded values.
	for channel := range s.queues {
		depthByChannel[channel] = len(s.queues[channel])
	}

	var snapshot SyntheticRelayQueueMetrics
	snapshot.Enqueued = s.metrics.Enqueued
	snapshot.Dequeued = s.metrics.Dequeued
	snapshot.Dropped = s.metrics.Dropped
	snapshot.Rejected = s.metrics.Rejected
	snapshot.LastRejectReason = s.metrics.LastRejectReason
	snapshot.QueueDepths = depthByChannel
	return snapshot
}
// validateEnvelope decides whether envelope may be queued by this scheduler.
// Checks run in a fixed order and the first failure determines the error:
//
//  1. nil or disabled scheduler            -> ErrMeshRuntimeDisabled
//  2. protocol version mismatch            -> ErrUnsupportedSyntheticMessage
//  3. empty route ID                       -> ErrRouteIDRequired
//  4. empty or foreign cluster ID          -> ErrClusterMismatch
//  5. sender in another cluster or unnamed -> ErrNodeMismatch
//  6. recipient is not this node           -> ErrNodeMismatch
//  7. TTL not positive                     -> ErrTTLExhausted
//  8. hop count not positive               -> ErrInvalidRoutePath
//  9. local node already in Visited        -> ErrLoopDetected
//  10. channel not allowed or without a queue policy -> ErrUnauthorizedChannel
//  11. message type not allowed            -> ErrUnsupportedSyntheticMessage
//
// It returns nil when every check passes.
func (s *SyntheticRelayScheduler) validateEnvelope(envelope SyntheticEnvelope) error {
	if s == nil || !s.enabled {
		return ErrMeshRuntimeDisabled
	}

	local := s.local
	switch {
	case envelope.ProtocolVersion != ProtocolVersion:
		return ErrUnsupportedSyntheticMessage
	case envelope.RouteID == "":
		return ErrRouteIDRequired
	case envelope.ClusterID == "" || envelope.ClusterID != local.ClusterID:
		return ErrClusterMismatch
	case envelope.From.ClusterID != local.ClusterID || envelope.From.NodeID == "":
		return ErrNodeMismatch
	case envelope.To.ClusterID != local.ClusterID || envelope.To.NodeID != local.NodeID:
		return ErrNodeMismatch
	case envelope.TTL <= 0:
		return ErrTTLExhausted
	case envelope.HopCount <= 0:
		return ErrInvalidRoutePath
	case contains(envelope.Visited, local.NodeID):
		return ErrLoopDetected
	}

	// A channel must be both explicitly allowed and backed by a queue policy.
	_, channelAllowed := s.allowedChannels[envelope.Channel]
	_, hasPolicy := s.policies[envelope.Channel]
	if !channelAllowed || !hasPolicy {
		return ErrUnauthorizedChannel
	}

	if _, typeAllowed := s.allowedMessageTypes[envelope.MessageType]; !typeAllowed {
		return ErrUnsupportedSyntheticMessage
	}
	return nil
}
// reject records a rejected envelope: it increments the Rejected counter,
// stores the error text (empty when err is nil) as LastRejectReason, and logs
// a "fabric_relay_rejected" entry after releasing the mutex. It does nothing
// on a nil scheduler.
func (s *SyntheticRelayScheduler) reject(envelope SyntheticEnvelope, err error) {
	var reason string
	if err != nil {
		reason = err.Error()
	}
	if s == nil {
		return
	}

	s.mu.Lock()
	s.metrics.Rejected++
	s.metrics.LastRejectReason = reason
	s.mu.Unlock()

	entry := SyntheticLogEntry{
		Event:       "fabric_relay_rejected",
		RouteID:     envelope.RouteID,
		ClusterID:   envelope.ClusterID,
		LocalNodeID: s.local.NodeID,
		Channel:     envelope.Channel,
		MessageType: envelope.MessageType,
		Reason:      reason,
		OccurredAt:  s.now(),
	}
	s.log(entry)
}
// log passes entry to the configured logger, or does nothing when no logger
// was configured.
func (s *SyntheticRelayScheduler) log(entry SyntheticLogEntry) {
	if s.logger == nil {
		return
	}
	s.logger(entry)
}
@@ -0,0 +1,213 @@
package mesh
import (
"errors"
"testing"
)
// TestSyntheticRelaySchedulerDequeuesByQoSPriority enqueues one envelope per
// channel in reverse priority order (telemetry, route control, fabric
// control) and expects Dequeue to return them as fabric control, route
// control, telemetry.
func TestSyntheticRelaySchedulerDequeuesByQoSPriority(t *testing.T) {
	scheduler := testRelayScheduler()

	lowPriority := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 1)
	midPriority := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageRouteHealth, 2)
	highPriority := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 3)

	_, err := scheduler.Enqueue(lowPriority)
	if err != nil {
		t.Fatalf("enqueue telemetry: %v", err)
	}
	_, err = scheduler.Enqueue(midPriority)
	if err != nil {
		t.Fatalf("enqueue route control: %v", err)
	}
	_, err = scheduler.Enqueue(highPriority)
	if err != nil {
		t.Fatalf("enqueue fabric control: %v", err)
	}

	// Drain everything first, then compare the observed order.
	got1, err := scheduler.Dequeue()
	if err != nil {
		t.Fatalf("dequeue first: %v", err)
	}
	got2, err := scheduler.Dequeue()
	if err != nil {
		t.Fatalf("dequeue second: %v", err)
	}
	got3, err := scheduler.Dequeue()
	if err != nil {
		t.Fatalf("dequeue third: %v", err)
	}

	if got1.Channel != SyntheticChannelFabricControl {
		t.Fatalf("first channel = %q, want fabric_control", got1.Channel)
	}
	if got2.Channel != SyntheticChannelRouteControl {
		t.Fatalf("second channel = %q, want route_control", got2.Channel)
	}
	if got3.Channel != SyntheticChannelTelemetry {
		t.Fatalf("third channel = %q, want telemetry", got3.Channel)
	}
}
// TestSyntheticRelaySchedulerDropsOldestTelemetryOnly fills the capacity-1
// telemetry queue and enqueues a second envelope. The second enqueue must
// succeed and report that sequence 1 was dropped, the next dequeue must
// return sequence 2, and the metrics must show one drop and two enqueues.
func TestSyntheticRelaySchedulerDropsOldestTelemetryOnly(t *testing.T) {
	scheduler := testRelayScheduler()
	older := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 1)
	newer := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 2)

	firstResult, firstErr := scheduler.Enqueue(older)
	if firstErr != nil || firstResult.Dropped {
		t.Fatalf("enqueue first result=%+v err=%v", firstResult, firstErr)
	}

	result, err := scheduler.Enqueue(newer)
	if err != nil {
		t.Fatalf("enqueue second: %v", err)
	}
	if droppedOldest := result.Dropped && result.DroppedSequence == 1; !droppedOldest {
		t.Fatalf("result = %+v, want dropped sequence 1", result)
	}

	survivor, err := scheduler.Dequeue()
	if err != nil {
		t.Fatalf("dequeue: %v", err)
	}
	if survivor.Sequence != 2 {
		t.Fatalf("dequeued sequence = %d, want 2", survivor.Sequence)
	}

	metrics := scheduler.SnapshotQueueMetrics()
	if !(metrics.Dropped == 1 && metrics.Enqueued == 2) {
		t.Fatalf("metrics = %+v, want one drop and two enqueues", metrics)
	}
}
// TestSyntheticRelaySchedulerRejectsFullReliableQueue fills the capacity-1
// fabric-control queue (not droppable) and enqueues again. The second enqueue
// must fail with ErrSyntheticRelayQueueFull, the original envelope must still
// be the one dequeued, and the metrics must show one rejection and no drops.
func TestSyntheticRelaySchedulerRejectsFullReliableQueue(t *testing.T) {
	scheduler := testRelayScheduler()
	accepted := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
	overflow := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 2)

	_, err := scheduler.Enqueue(accepted)
	if err != nil {
		t.Fatalf("enqueue first: %v", err)
	}

	_, err = scheduler.Enqueue(overflow)
	if !errors.Is(err, ErrSyntheticRelayQueueFull) {
		t.Fatalf("err = %v, want ErrSyntheticRelayQueueFull", err)
	}

	head, err := scheduler.Dequeue()
	if err != nil {
		t.Fatalf("dequeue: %v", err)
	}
	if head.Sequence != 1 {
		t.Fatalf("dequeued sequence = %d, want 1", head.Sequence)
	}

	metrics := scheduler.SnapshotQueueMetrics()
	if !(metrics.Dropped == 0 && metrics.Rejected == 1) {
		t.Fatalf("metrics = %+v, want no drop and one rejection", metrics)
	}
}
// TestSyntheticRelaySchedulerRejectsInvalidEnvelopes is a table test. Each
// case starts from a valid fabric-control probe envelope, breaks exactly one
// property, and expects Enqueue on a fresh scheduler to fail with the
// matching sentinel error.
func TestSyntheticRelaySchedulerRejectsInvalidEnvelopes(t *testing.T) {
	type rejection struct {
		label   string
		corrupt func(*SyntheticEnvelope)
		wantErr error
	}

	cases := []rejection{
		{
			label:   "wrong cluster",
			corrupt: func(e *SyntheticEnvelope) { e.ClusterID = "cluster-2" },
			wantErr: ErrClusterMismatch,
		},
		{
			label:   "wrong node",
			corrupt: func(e *SyntheticEnvelope) { e.To.NodeID = "node-x" },
			wantErr: ErrNodeMismatch,
		},
		{
			label:   "unauthorized channel",
			corrupt: func(e *SyntheticEnvelope) { e.Channel = "rdp_render" },
			wantErr: ErrUnauthorizedChannel,
		},
		{
			label:   "unsupported message",
			corrupt: func(e *SyntheticEnvelope) { e.MessageType = "rdp.input" },
			wantErr: ErrUnsupportedSyntheticMessage,
		},
		{
			label:   "ttl exhausted",
			corrupt: func(e *SyntheticEnvelope) { e.TTL = 0 },
			wantErr: ErrTTLExhausted,
		},
		{
			label: "loop detected",
			// "node-r" is the scheduler's own node ID in testRelayScheduler.
			corrupt: func(e *SyntheticEnvelope) { e.Visited = append(e.Visited, "node-r") },
			wantErr: ErrLoopDetected,
		},
	}

	for _, tc := range cases {
		t.Run(tc.label, func(t *testing.T) {
			envelope := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
			tc.corrupt(&envelope)

			_, err := testRelayScheduler().Enqueue(envelope)
			if !errors.Is(err, tc.wantErr) {
				t.Fatalf("err = %v, want %v", err, tc.wantErr)
			}
		})
	}
}
func TestSyntheticRelaySchedulerDisabledRejects(t *testing.T) {
scheduler := NewSyntheticRelayScheduler(SyntheticRelaySchedulerConfig{
Enabled: false,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
})
_, err := scheduler.Enqueue(testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1))
if !errors.Is(err, ErrMeshRuntimeDisabled) {
t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
if _, err := scheduler.Dequeue(); !errors.Is(err, ErrMeshRuntimeDisabled) {
t.Fatalf("dequeue err = %v, want ErrMeshRuntimeDisabled", err)
}
}
// TestSyntheticRelaySchedulerQueueDepthSnapshot enqueues one envelope on the
// fabric-control channel and one on the route-control channel, then expects
// the metrics snapshot to report a depth of 1 for each.
func TestSyntheticRelaySchedulerQueueDepthSnapshot(t *testing.T) {
	scheduler := testRelayScheduler()

	fabric := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
	_, err := scheduler.Enqueue(fabric)
	if err != nil {
		t.Fatalf("enqueue fabric control: %v", err)
	}
	routeControl := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageRouteHealth, 2)
	_, err = scheduler.Enqueue(routeControl)
	if err != nil {
		t.Fatalf("enqueue route control: %v", err)
	}

	depths := scheduler.SnapshotQueueMetrics().QueueDepths
	if got := depths[SyntheticChannelFabricControl]; got != 1 {
		t.Fatalf("fabric_control depth = %d, want 1", got)
	}
	if got := depths[SyntheticChannelRouteControl]; got != 1 {
		t.Fatalf("route_control depth = %d, want 1", got)
	}
}
// testRelayScheduler returns an enabled scheduler for node-r in cluster-1.
// All three queues have capacity 1 so a second enqueue on any channel hits
// the full-queue path; only telemetry is droppable.
func testRelayScheduler() *SyntheticRelayScheduler {
	policies := []SyntheticRelayQueuePolicy{
		{Channel: SyntheticChannelFabricControl, Capacity: 1, Droppable: false},
		{Channel: SyntheticChannelRouteControl, Capacity: 1, Droppable: false},
		{Channel: SyntheticChannelTelemetry, Capacity: 1, Droppable: true},
	}
	cfg := SyntheticRelaySchedulerConfig{
		Enabled:       true,
		Local:         PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
		QueuePolicies: policies,
	}
	return NewSyntheticRelayScheduler(cfg)
}
// testRelayEnvelope returns an envelope sent from node-a to node-r on the
// route node-a -> node-r -> node-b, with the given channel, message type and
// sequence number overriding the testEnvelope defaults.
func testRelayEnvelope(channel string, messageType string, sequence uint64) SyntheticEnvelope {
	hops := []string{"node-a", "node-r", "node-b"}
	envelope := testEnvelope(testRoute("route-relay-scheduler", hops), "node-a", "node-r")

	envelope.Channel, envelope.MessageType = channel, messageType
	envelope.Sequence = sequence
	return envelope
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,432 @@
package mesh
import (
"context"
"encoding/json"
"errors"
"testing"
"time"
)
// syntheticTestTransport is an in-memory transport for the runtime tests. It
// delivers envelopes by calling Receive on the target runtime directly.
type syntheticTestTransport struct {
	// nodes maps a node ID to the runtime that plays that node.
	nodes map[string]*SyntheticRuntime
}
// SendSynthetic hands envelope to the runtime registered under nextNodeID and
// returns whatever its Receive returns. When no runtime is registered for
// that ID it returns ErrSyntheticPeerUnavailable.
func (t syntheticTestTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	if target := t.nodes[nextNodeID]; target != nil {
		return target.Receive(ctx, envelope)
	}
	return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
}
// TestSyntheticRuntimeDirectProbe sends a probe from node-a to node-b over a
// two-hop route. It expects a probe ack addressed from node-b back to node-a
// whose payload path is node-a -> node-b, and expects node-b to have counted
// one created ack.
func TestSyntheticRuntimeDirectProbe(t *testing.T) {
	route := testRoute("route-direct", []string{"node-a", "node-b"})

	// The transport shares this map, so registering runtimes after they are
	// built is visible to it.
	registry := map[string]*SyntheticRuntime{}
	transport := syntheticTestTransport{nodes: registry}
	nodeA := testRuntime("node-a", transport, route)
	nodeB := testRuntime("node-b", transport, route)
	registry["node-a"] = nodeA
	registry["node-b"] = nodeB

	ack, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-direct")
	if err != nil {
		t.Fatalf("send probe: %v", err)
	}
	if got, want := ack.MessageType, SyntheticMessageProbeAck; got != want {
		t.Fatalf("MessageType = %q, want %q", got, want)
	}
	if !(ack.From.NodeID == "node-b" && ack.To.NodeID == "node-a") {
		t.Fatalf("unexpected ack peers: from=%+v to=%+v", ack.From, ack.To)
	}

	path := decodeAckPayload(t, ack).Path
	if len(path) != 2 || path[0] != "node-a" || path[1] != "node-b" {
		t.Fatalf("Path = %#v, want node-a -> node-b", path)
	}

	if created := nodeB.SnapshotMetrics().ProbeAcksCreated; created != 1 {
		t.Fatalf("ProbeAcksCreated = %d, want 1", created)
	}
}
// TestSyntheticRuntimeSingleRelayProbe sends a probe from node-a to node-b
// through the relay node-r. It expects the ack payload path to be
// node-a -> node-r -> node-b and node-r to have counted one forwarded probe.
func TestSyntheticRuntimeSingleRelayProbe(t *testing.T) {
	route := testRoute("route-relay", []string{"node-a", "node-r", "node-b"})

	registry := map[string]*SyntheticRuntime{}
	transport := syntheticTestTransport{nodes: registry}
	nodeA := testRuntime("node-a", transport, route)
	nodeR := testRuntime("node-r", transport, route)
	nodeB := testRuntime("node-b", transport, route)
	registry["node-a"] = nodeA
	registry["node-r"] = nodeR
	registry["node-b"] = nodeB

	ack, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-relay")
	if err != nil {
		t.Fatalf("send probe: %v", err)
	}

	path := decodeAckPayload(t, ack).Path
	if len(path) != 3 || path[0] != "node-a" || path[1] != "node-r" || path[2] != "node-b" {
		t.Fatalf("Path = %#v, want node-a -> node-r -> node-b", path)
	}

	if forwarded := nodeR.SnapshotMetrics().ProbesForwarded; forwarded != 1 {
		t.Fatalf("ProbesForwarded = %d, want 1", forwarded)
	}
}
// TestSyntheticRuntimeDisabledRejectsProbe checks that SendProbe on a runtime
// built with Enabled=false fails with ErrMeshRuntimeDisabled.
func TestSyntheticRuntimeDisabledRejectsProbe(t *testing.T) {
	route := testRoute("route-disabled", []string{"node-a", "node-b"})
	cfg := SyntheticRuntimeConfig{
		Enabled: false,
		Local:   PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:  []SyntheticRoute{route},
	}
	disabled := NewSyntheticRuntime(cfg)

	_, err := disabled.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-disabled")
	if errors.Is(err, ErrMeshRuntimeDisabled) {
		return
	}
	t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
// TestSyntheticRuntimeRejectsWrongCluster delivers an envelope whose
// ClusterID is "cluster-2" to node-b and expects ErrClusterMismatch.
func TestSyntheticRuntimeRejectsWrongCluster(t *testing.T) {
	route := testRoute("route-wrong-cluster", []string{"node-a", "node-b"})
	receiver := testRuntime("node-b", syntheticTestTransport{}, route)

	foreign := testEnvelope(route, "node-a", "node-b")
	foreign.ClusterID = "cluster-2"

	_, err := receiver.Receive(context.Background(), foreign)
	if errors.Is(err, ErrClusterMismatch) {
		return
	}
	t.Fatalf("err = %v, want ErrClusterMismatch", err)
}
// TestSyntheticRuntimeRejectsWrongNode delivers an envelope addressed to
// node-c to the runtime for node-b and expects ErrNodeMismatch.
func TestSyntheticRuntimeRejectsWrongNode(t *testing.T) {
	route := testRoute("route-wrong-node", []string{"node-a", "node-b"})
	receiver := testRuntime("node-b", syntheticTestTransport{}, route)

	misaddressed := testEnvelope(route, "node-a", "node-c")

	_, err := receiver.Receive(context.Background(), misaddressed)
	if errors.Is(err, ErrNodeMismatch) {
		return
	}
	t.Fatalf("err = %v, want ErrNodeMismatch", err)
}
// TestSyntheticRuntimeRejectsUnauthorizedChannel sends a probe on the
// "rdp_render" channel, which the test route does not list among its allowed
// channels, and expects ErrUnauthorizedChannel.
func TestSyntheticRuntimeRejectsUnauthorizedChannel(t *testing.T) {
	route := testRoute("route-unauthorized", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	_, err := sender.SendProbe(context.Background(), route.RouteID, "rdp_render", "probe-unauthorized")
	if errors.Is(err, ErrUnauthorizedChannel) {
		return
	}
	t.Fatalf("err = %v, want ErrUnauthorizedChannel", err)
}
// TestSyntheticRuntimeRejectsExpiredRoute sends a probe over a route whose
// ExpiresAt is one minute in the past and expects ErrRouteExpired.
func TestSyntheticRuntimeRejectsExpiredRoute(t *testing.T) {
	route := testRoute("route-expired", []string{"node-a", "node-b"})
	route.ExpiresAt = time.Now().UTC().Add(-time.Minute)
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	_, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-expired")
	if errors.Is(err, ErrRouteExpired) {
		return
	}
	t.Fatalf("err = %v, want ErrRouteExpired", err)
}
// TestSyntheticRuntimeRejectsTTLExhaustion sends a probe over a three-hop
// route whose MaxTTL is 1 and expects ErrTTLExhausted. Only node-a and node-r
// are registered with the transport.
func TestSyntheticRuntimeRejectsTTLExhaustion(t *testing.T) {
	route := testRoute("route-ttl", []string{"node-a", "node-r", "node-b"})
	route.MaxTTL = 1

	registry := map[string]*SyntheticRuntime{}
	transport := syntheticTestTransport{nodes: registry}
	nodeA := testRuntime("node-a", transport, route)
	nodeR := testRuntime("node-r", transport, route)
	registry["node-a"] = nodeA
	registry["node-r"] = nodeR

	_, err := nodeA.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-ttl")
	if errors.Is(err, ErrTTLExhausted) {
		return
	}
	t.Fatalf("err = %v, want ErrTTLExhausted", err)
}
// TestSyntheticRuntimeRejectsLoop delivers to node-b an envelope whose
// Visited list already contains node-b and expects ErrLoopDetected.
func TestSyntheticRuntimeRejectsLoop(t *testing.T) {
	route := testRoute("route-loop", []string{"node-a", "node-b"})
	receiver := testRuntime("node-b", syntheticTestTransport{}, route)

	looped := testEnvelope(route, "node-a", "node-b")
	looped.Visited = []string{"node-a", "node-b"}

	_, err := receiver.Receive(context.Background(), looped)
	if errors.Is(err, ErrLoopDetected) {
		return
	}
	t.Fatalf("err = %v, want ErrLoopDetected", err)
}
// TestSyntheticRuntimeRejectsUnavailablePeer sends a probe through a
// transport that has no runtimes registered and expects
// ErrSyntheticPeerUnavailable.
func TestSyntheticRuntimeRejectsUnavailablePeer(t *testing.T) {
	route := testRoute("route-missing-peer", []string{"node-a", "node-b"})
	emptyTransport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", emptyTransport, route)

	_, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-missing-peer")
	if errors.Is(err, ErrSyntheticPeerUnavailable) {
		return
	}
	t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err)
}
// TestSyntheticRuntimeRouteHealthProbeRecordsSuccess sends a route-health
// probe over a direct route. It expects a route-health ack with no fallback,
// a healthy observation with one success that carries the route's policy,
// peer-directory and route versions, and sender metrics showing one health
// probe sent and one successful delivery.
func TestSyntheticRuntimeRouteHealthProbeRecordsSuccess(t *testing.T) {
	route := testRoute("route-health", []string{"node-a", "node-b"})

	registry := map[string]*SyntheticRuntime{}
	transport := syntheticTestTransport{nodes: registry}
	nodeA := testRuntime("node-a", transport, route)
	nodeB := testRuntime("node-b", transport, route)
	registry["node-a"] = nodeA
	registry["node-b"] = nodeB

	result, err := nodeA.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-health")
	if err != nil {
		t.Fatalf("send route health probe: %v", err)
	}
	if got, want := result.Ack.MessageType, SyntheticMessageRouteHealthAck; got != want {
		t.Fatalf("MessageType = %q, want %q", got, want)
	}
	if result.FallbackUsed {
		t.Fatal("FallbackUsed = true, want false")
	}

	observation, found := nodeA.SnapshotRouteObservation(route.RouteID)
	if !found {
		t.Fatal("route observation missing")
	}
	if !(observation.State == SyntheticRouteStateHealthy && observation.SuccessCount == 1) {
		t.Fatalf("observation = %+v, want healthy success", observation)
	}
	versionsMatch := observation.PolicyVersion == "policy-v1" &&
		observation.PeerDirectoryVersion == "peers-v1" &&
		observation.RouteVersion == "route-v1"
	if !versionsMatch {
		t.Fatalf("observation versions = %+v", observation)
	}

	metrics := nodeA.SnapshotMetrics()
	if !(metrics.RouteHealthProbesSent == 1 && metrics.RouteDeliveriesSucceeded == 1) {
		t.Fatalf("metrics = %+v, want health probe success", metrics)
	}
}
// TestSyntheticRuntimeRouteHealthUsesDedicatedRouteConfig configures every
// node with two routes that share one route ID: the regular route goes via
// node-old, the route-health route goes via node-new and has RouteVersion
// "decision-v1". It expects a route-health probe to travel through node-new
// only and record the "decision-v1" version, and a regular probe sent
// afterwards to travel through node-old.
func TestSyntheticRuntimeRouteHealthUsesDedicatedRouteConfig(t *testing.T) {
	base := testRoute("route-effective-health", []string{"node-a", "node-old", "node-b"})
	effective := testRoute("route-effective-health", []string{"node-a", "node-new", "node-b"})
	effective.RouteVersion = "decision-v1"

	registry := map[string]*SyntheticRuntime{}
	transport := syntheticTestTransport{nodes: registry}
	build := func(nodeID string) *SyntheticRuntime {
		return testRuntimeWithRouteHealth(nodeID, transport, []SyntheticRoute{base}, []SyntheticRoute{effective})
	}
	nodeA := build("node-a")
	nodeOld := build("node-old")
	nodeNew := build("node-new")
	nodeB := build("node-b")
	registry["node-a"] = nodeA
	registry["node-old"] = nodeOld
	registry["node-new"] = nodeNew
	registry["node-b"] = nodeB

	// Route-health traffic must follow the dedicated route.
	health, err := nodeA.SendRouteHealthProbe(context.Background(), base.RouteID, SyntheticChannelFabricControl, "probe-health-effective")
	if err != nil {
		t.Fatalf("send route health probe: %v", err)
	}
	healthPath := decodeAckPayload(t, health.Ack).Path
	wantHealthPath := []string{"node-a", "node-new", "node-b"}
	if !sameStrings(healthPath, wantHealthPath) {
		t.Fatalf("route health path = %v, want %v", healthPath, wantHealthPath)
	}
	if forwarded := nodeNew.SnapshotMetrics().ProbesForwarded; forwarded != 1 {
		t.Fatalf("node-new forwarded = %d, want 1", forwarded)
	}
	if forwarded := nodeOld.SnapshotMetrics().ProbesForwarded; forwarded != 0 {
		t.Fatalf("node-old forwarded = %d, want 0 before regular probe", forwarded)
	}
	observation, found := nodeA.SnapshotRouteObservation(base.RouteID)
	if !(found && observation.RouteVersion == "decision-v1") {
		t.Fatalf("route health observation = %+v, want decision route version", observation)
	}

	// Regular probes must keep using the base route.
	probe, err := nodeA.SendProbe(context.Background(), base.RouteID, SyntheticChannelFabricControl, "probe-regular")
	if err != nil {
		t.Fatalf("send regular probe: %v", err)
	}
	probePath := decodeAckPayload(t, probe).Path
	wantProbePath := []string{"node-a", "node-old", "node-b"}
	if !sameStrings(probePath, wantProbePath) {
		t.Fatalf("regular probe path = %v, want %v", probePath, wantProbePath)
	}
	if forwarded := nodeOld.SnapshotMetrics().ProbesForwarded; forwarded != 1 {
		t.Fatalf("node-old forwarded = %d, want 1 after regular probe", forwarded)
	}
}
// TestSyntheticRuntimeRouteHealthUsesFallbackWhenPreferredUnavailable probes
// a preferred route through node-r, which is never registered with the
// transport, with a direct route as fallback. It expects the fallback to be
// selected, the preferred route to be observed as failed once, the fallback
// as healthy once, and the metrics to show one fallback used, one warm route
// promoted and one failed delivery.
func TestSyntheticRuntimeRouteHealthUsesFallbackWhenPreferredUnavailable(t *testing.T) {
	preferred := testRoute("route-preferred", []string{"node-a", "node-r", "node-b"})
	fallback := testRoute("route-fallback", []string{"node-a", "node-b"})

	registry := map[string]*SyntheticRuntime{}
	transport := syntheticTestTransport{nodes: registry}
	nodeA := testRuntime("node-a", transport, preferred, fallback)
	nodeB := testRuntime("node-b", transport, preferred, fallback)
	registry["node-a"] = nodeA
	registry["node-b"] = nodeB

	fallbackIDs := []string{fallback.RouteID}
	result, err := nodeA.SendRouteHealthProbeWithFallback(
		context.Background(),
		preferred.RouteID,
		fallbackIDs,
		SyntheticChannelFabricControl,
		"probe-fallback",
	)
	if err != nil {
		t.Fatalf("send route health probe with fallback: %v", err)
	}
	if !result.FallbackUsed {
		t.Fatal("FallbackUsed = false, want true")
	}
	if got, want := result.SelectedRouteID, fallback.RouteID; got != want {
		t.Fatalf("SelectedRouteID = %q, want %q", got, want)
	}

	preferredObservation, found := nodeA.SnapshotRouteObservation(preferred.RouteID)
	if !found {
		t.Fatal("preferred route observation missing")
	}
	if !(preferredObservation.State == SyntheticRouteStateFailed && preferredObservation.FailureCount == 1) {
		t.Fatalf("preferred observation = %+v, want failed", preferredObservation)
	}

	fallbackObservation, found := nodeA.SnapshotRouteObservation(fallback.RouteID)
	if !found {
		t.Fatal("fallback route observation missing")
	}
	if !(fallbackObservation.State == SyntheticRouteStateHealthy && fallbackObservation.SuccessCount == 1) {
		t.Fatalf("fallback observation = %+v, want healthy", fallbackObservation)
	}

	metrics := nodeA.SnapshotMetrics()
	promoted := metrics.FallbackRoutesUsed == 1 &&
		metrics.WarmRoutesPromoted == 1 &&
		metrics.RouteDeliveriesFailed == 1
	if !promoted {
		t.Fatalf("metrics = %+v, want fallback promotion and one failed delivery", metrics)
	}
}
// TestSyntheticRuntimeRouteCacheInvalidatesOnVersionChange records a route
// observation with a health probe, then invalidates the cache with policy
// version "policy-v2". It expects exactly one entry to be invalidated, the
// observation to be gone, and the invalidation counter to read 1.
func TestSyntheticRuntimeRouteCacheInvalidatesOnVersionChange(t *testing.T) {
	route := testRoute("route-cache", []string{"node-a", "node-b"})

	registry := map[string]*SyntheticRuntime{}
	transport := syntheticTestTransport{nodes: registry}
	nodeA := testRuntime("node-a", transport, route)
	nodeB := testRuntime("node-b", transport, route)
	registry["node-a"] = nodeA
	registry["node-b"] = nodeB

	_, err := nodeA.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-cache")
	if err != nil {
		t.Fatalf("send route health probe: %v", err)
	}
	if _, cached := nodeA.SnapshotRouteObservation(route.RouteID); !cached {
		t.Fatal("route observation missing before invalidation")
	}

	newVersion := SyntheticRouteCacheVersion{PolicyVersion: "policy-v2"}
	if invalidated := nodeA.InvalidateRouteCache("policy_changed", newVersion); invalidated != 1 {
		t.Fatalf("invalidated = %d, want 1", invalidated)
	}
	if _, cached := nodeA.SnapshotRouteObservation(route.RouteID); cached {
		t.Fatal("route observation still present after invalidation")
	}
	if count := nodeA.SnapshotMetrics().RouteCacheInvalidations; count != 1 {
		t.Fatalf("RouteCacheInvalidations = %d, want 1", count)
	}
}
// TestSyntheticRuntimeRouteCacheKeepsCurrentVersion records a route
// observation and then invalidates the cache with the same route, policy and
// peer-directory versions the route already carries. It expects nothing to be
// invalidated and the observation to remain.
func TestSyntheticRuntimeRouteCacheKeepsCurrentVersion(t *testing.T) {
	route := testRoute("route-cache-current", []string{"node-a", "node-b"})

	registry := map[string]*SyntheticRuntime{}
	transport := syntheticTestTransport{nodes: registry}
	nodeA := testRuntime("node-a", transport, route)
	nodeB := testRuntime("node-b", transport, route)
	registry["node-a"] = nodeA
	registry["node-b"] = nodeB

	_, err := nodeA.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-cache-current")
	if err != nil {
		t.Fatalf("send route health probe: %v", err)
	}

	unchanged := SyntheticRouteCacheVersion{
		RouteVersion:         "route-v1",
		PolicyVersion:        "policy-v1",
		PeerDirectoryVersion: "peers-v1",
	}
	if invalidated := nodeA.InvalidateRouteCache("same_versions", unchanged); invalidated != 0 {
		t.Fatalf("invalidated = %d, want 0", invalidated)
	}
	if _, cached := nodeA.SnapshotRouteObservation(route.RouteID); !cached {
		t.Fatal("route observation missing after same-version invalidation")
	}
}
// TestSyntheticRuntimeRouteHealthDisabledRejects checks that
// SendRouteHealthProbeWithFallback on a runtime built with Enabled=false
// fails with ErrMeshRuntimeDisabled.
func TestSyntheticRuntimeRouteHealthDisabledRejects(t *testing.T) {
	route := testRoute("route-health-disabled", []string{"node-a", "node-b"})
	cfg := SyntheticRuntimeConfig{
		Enabled: false,
		Local:   PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:  []SyntheticRoute{route},
	}
	disabled := NewSyntheticRuntime(cfg)

	_, err := disabled.SendRouteHealthProbeWithFallback(
		context.Background(),
		route.RouteID,
		[]string{"route-fallback"},
		SyntheticChannelFabricControl,
		"probe-disabled-health",
	)
	if errors.Is(err, ErrMeshRuntimeDisabled) {
		return
	}
	t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
// testRuntime builds an enabled runtime for nodeID in cluster-1 that uses the
// given transport and routes, with MaxTTL and MaxHops both set to 8.
func testRuntime(nodeID string, transport SyntheticTransport, routes ...SyntheticRoute) *SyntheticRuntime {
	var cfg SyntheticRuntimeConfig
	cfg.Enabled = true
	cfg.Local = PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID}
	cfg.Routes = routes
	cfg.Transport = transport
	cfg.MaxTTL = 8
	cfg.MaxHops = 8
	return NewSyntheticRuntime(cfg)
}
// testRuntimeWithRouteHealth returns an enabled SyntheticRuntime for nodeID in
// "cluster-1" configured with both routes and routeHealthRoutes, the given
// transport, and MaxTTL and MaxHops set to 8.
func testRuntimeWithRouteHealth(nodeID string, transport SyntheticTransport, routes []SyntheticRoute, routeHealthRoutes []SyntheticRoute) *SyntheticRuntime {
	local := PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID}
	cfg := SyntheticRuntimeConfig{
		Enabled:           true,
		Local:             local,
		Transport:         transport,
		Routes:            routes,
		RouteHealthRoutes: routeHealthRoutes,
	}
	cfg.MaxTTL = 8
	cfg.MaxHops = 8
	return NewSyntheticRuntime(cfg)
}
// testRoute builds a SyntheticRoute in "cluster-1" whose source is the first
// hop and whose destination is the last hop. The route allows only
// SyntheticChannelFabricControl, expires one hour from now (UTC), and carries
// the version strings "route-v1", "policy-v1" and "peers-v1".
//
// hops must contain at least one element; indexing an empty slice panics.
func testRoute(routeID string, hops []string) SyntheticRoute {
	source, destination := hops[0], hops[len(hops)-1]

	route := SyntheticRoute{
		RouteID:           routeID,
		ClusterID:         "cluster-1",
		SourceNodeID:      source,
		DestinationNodeID: destination,
		Hops:              hops,
		AllowedChannels:   []string{SyntheticChannelFabricControl},
		MaxTTL:            8,
		MaxHops:           8,
	}
	route.ExpiresAt = time.Now().UTC().Add(time.Hour)
	route.RouteVersion = "route-v1"
	route.PolicyVersion = "policy-v1"
	route.PeerDirectoryVersion = "peers-v1"
	return route
}
// testEnvelope builds a probe envelope on SyntheticChannelFabricControl for the
// given route, addressed from fromNodeID to toNodeID within the route's
// cluster. The envelope starts with TTL 8, HopCount 1, Sequence 1 and a
// Visited list containing only fromNodeID. Its payload is a JSON-encoded
// SyntheticProbePayload with ProbeID "probe-test".
//
// It panics if the payload cannot be marshalled, so a broken fixture fails
// loudly instead of producing an envelope with an empty payload.
func testEnvelope(route SyntheticRoute, fromNodeID string, toNodeID string) SyntheticEnvelope {
	payload, err := json.Marshal(SyntheticProbePayload{
		ProbeID: "probe-test",
		SentAt:  time.Now().UTC(),
	})
	if err != nil {
		// Same policy as mustMarshalTestServiceRequest: a marshal failure in a
		// test fixture is a programmer error.
		panic(err)
	}
	return SyntheticEnvelope{
		ProtocolVersion: ProtocolVersion,
		RouteID:         route.RouteID,
		ClusterID:       route.ClusterID,
		From:            PeerIdentity{ClusterID: route.ClusterID, NodeID: fromNodeID},
		To:              PeerIdentity{ClusterID: route.ClusterID, NodeID: toNodeID},
		Channel:         SyntheticChannelFabricControl,
		MessageType:     SyntheticMessageProbe,
		TTL:             8,
		HopCount:        1,
		Visited:         []string{fromNodeID},
		Sequence:        1,
		SentAt:          time.Now().UTC(),
		Payload:         payload,
	}
}
// decodeAckPayload unmarshals envelope.Payload as JSON into a
// SyntheticProbeAckPayload and fails the test immediately if decoding fails.
func decodeAckPayload(t *testing.T, envelope SyntheticEnvelope) SyntheticProbeAckPayload {
	t.Helper()

	ack := SyntheticProbeAckPayload{}
	err := json.Unmarshal(envelope.Payload, &ack)
	if err != nil {
		t.Fatalf("decode ack payload: %v", err)
	}
	return ack
}
@@ -0,0 +1,235 @@
package mesh
import (
"context"
"encoding/json"
"errors"
"strings"
"testing"
)
// TestSyntheticRuntimeTestServiceDirectRoute sends a test-service request over
// a two-hop route (node-a -> node-b) and checks the ack message type, the
// echoed payload, the reported path and node-a's test-service counters.
func TestSyntheticRuntimeTestServiceDirectRoute(t *testing.T) {
	route := testServiceRoute("route-test-service-direct", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: make(map[string]*SyntheticRuntime)}
	sender := testRuntime("node-a", transport, route)
	receiver := testRuntime("node-b", transport, route)
	transport.nodes["node-a"] = sender
	transport.nodes["node-b"] = receiver

	request := testServiceRequest("request-direct", "hello")
	result, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
	if err != nil {
		t.Fatalf("send test service: %v", err)
	}

	if got := result.Ack.MessageType; got != SyntheticMessageTestServiceAck {
		t.Fatalf("MessageType = %q, want %q", got, SyntheticMessageTestServiceAck)
	}
	if echoed := result.Response.EchoPayload; echoed != "hello" {
		t.Fatalf("EchoPayload = %q, want hello", echoed)
	}

	path := result.Response.Path
	switch {
	// Cases are evaluated in order, so the length check guards the indexing.
	case len(path) != 2, path[0] != "node-a", path[1] != "node-b":
		t.Fatalf("Path = %#v, want node-a -> node-b", path)
	}

	metrics := sender.SnapshotMetrics()
	oneSuccess := metrics.TestServiceRequestsSent == 1 && metrics.TestServiceDeliveriesSucceeded == 1
	if !oneSuccess {
		t.Fatalf("metrics = %+v, want one test service success", metrics)
	}
}
// TestSyntheticRuntimeTestServiceSingleRelayRoute sends a test-service request
// over a three-hop route (node-a -> node-r -> node-b) and checks both the
// reported path and that the relay node counted exactly one forwarded probe.
func TestSyntheticRuntimeTestServiceSingleRelayRoute(t *testing.T) {
	route := testServiceRoute("route-test-service-relay", []string{"node-a", "node-r", "node-b"})
	transport := syntheticTestTransport{nodes: make(map[string]*SyntheticRuntime)}
	sender := testRuntime("node-a", transport, route)
	relay := testRuntime("node-r", transport, route)
	receiver := testRuntime("node-b", transport, route)
	transport.nodes["node-a"] = sender
	transport.nodes["node-r"] = relay
	transport.nodes["node-b"] = receiver

	request := testServiceRequest("request-relay", "relay")
	result, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
	if err != nil {
		t.Fatalf("send test service: %v", err)
	}

	path := result.Response.Path
	switch {
	// Cases are evaluated in order, so the length check guards the indexing.
	case len(path) != 3, path[0] != "node-a", path[1] != "node-r", path[2] != "node-b":
		t.Fatalf("Path = %#v, want node-a -> node-r -> node-b", path)
	}

	if forwarded := relay.SnapshotMetrics().ProbesForwarded; forwarded != 1 {
		t.Fatalf("ProbesForwarded = %d, want 1", forwarded)
	}
}
// TestSyntheticRuntimeTestServiceUsesForcedFallback configures a preferred
// route through node-r and a direct fallback route, then expects
// SendTestServiceWithFallback to report the fallback route as selected.
//
// Only node-a and node-b are registered with the transport; node-r never is,
// which is presumably what makes the preferred route fail.
func TestSyntheticRuntimeTestServiceUsesForcedFallback(t *testing.T) {
	preferred := testServiceRoute("route-test-service-preferred", []string{"node-a", "node-r", "node-b"})
	fallback := testServiceRoute("route-test-service-fallback", []string{"node-a", "node-b"})

	transport := syntheticTestTransport{nodes: make(map[string]*SyntheticRuntime)}
	sender := testRuntime("node-a", transport, preferred, fallback)
	receiver := testRuntime("node-b", transport, preferred, fallback)
	transport.nodes["node-a"] = sender
	transport.nodes["node-b"] = receiver

	fallbackRouteIDs := []string{fallback.RouteID}
	request := testServiceRequest("request-fallback", "fallback")
	result, err := sender.SendTestServiceWithFallback(context.Background(), preferred.RouteID, fallbackRouteIDs, SyntheticChannelRouteControl, request)
	if err != nil {
		t.Fatalf("send test service with fallback: %v", err)
	}

	if !result.FallbackUsed {
		t.Fatal("FallbackUsed = false, want true")
	}
	if selected := result.SelectedRouteID; selected != fallback.RouteID {
		t.Fatalf("SelectedRouteID = %q, want %q", selected, fallback.RouteID)
	}
	if echoed := result.Response.EchoPayload; echoed != "fallback" {
		t.Fatalf("EchoPayload = %q, want fallback", echoed)
	}

	// One fallback used, one failed delivery and one successful delivery.
	metrics := sender.SnapshotMetrics()
	asExpected := metrics.TestServiceFallbacksUsed == 1 &&
		metrics.TestServiceDeliveriesFailed == 1 &&
		metrics.TestServiceDeliveriesSucceeded == 1
	if !asExpected {
		t.Fatalf("metrics = %+v, want fallback success with one preferred failure", metrics)
	}
}
// TestSyntheticRuntimeTestServiceRejectsWrongOrganization overrides the
// request's OrganizationID with "org-other" and expects SendTestService to
// return an error matching ErrSyntheticOrganizationMismatch.
func TestSyntheticRuntimeTestServiceRejectsWrongOrganization(t *testing.T) {
	route := testServiceRoute("route-test-service-wrong-org", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	req := testServiceRequest("request-wrong-org", "hello")
	req.OrganizationID = "org-other"

	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, req)
	if errors.Is(err, ErrSyntheticOrganizationMismatch) {
		return
	}
	t.Fatalf("err = %v, want ErrSyntheticOrganizationMismatch", err)
}
// TestSyntheticRuntimeTestServiceRejectsUnsupportedService overrides the
// request's ServiceType with "rdp" and expects SendTestService to return an
// error matching ErrUnsupportedSyntheticService.
func TestSyntheticRuntimeTestServiceRejectsUnsupportedService(t *testing.T) {
	route := testServiceRoute("route-test-service-unsupported", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	req := testServiceRequest("request-unsupported", "hello")
	req.ServiceType = "rdp"

	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, req)
	if errors.Is(err, ErrUnsupportedSyntheticService) {
		return
	}
	t.Fatalf("err = %v, want ErrUnsupportedSyntheticService", err)
}
// TestSyntheticRuntimeTestServiceRejectsOversizedPayload configures
// MaxTestPayloadBytes as 4, sends the five-character payload "12345", and
// expects SendTestService to return an error matching
// ErrSyntheticPayloadTooLarge.
func TestSyntheticRuntimeTestServiceRejectsOversizedPayload(t *testing.T) {
	route := testServiceRoute("route-test-service-oversized", []string{"node-a", "node-b"})

	cfg := SyntheticRuntimeConfig{
		Enabled:             true,
		Local:               PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:              []SyntheticRoute{route},
		MaxTestPayloadBytes: 4,
	}
	sender := NewSyntheticRuntime(cfg)

	req := testServiceRequest("request-oversized", "12345")
	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, req)
	if errors.Is(err, ErrSyntheticPayloadTooLarge) {
		return
	}
	t.Fatalf("err = %v, want ErrSyntheticPayloadTooLarge", err)
}
// TestSyntheticRuntimeTestServiceRejectsUnauthorizedChannel sends on
// SyntheticChannelFabricControl over a route whose AllowedChannels (set by
// testServiceRoute) contains only SyntheticChannelRouteControl, and expects an
// error matching ErrUnauthorizedChannel.
func TestSyntheticRuntimeTestServiceRejectsUnauthorizedChannel(t *testing.T) {
	route := testServiceRoute("route-test-service-channel", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	req := testServiceRequest("request-channel", "hello")
	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelFabricControl, req)
	if errors.Is(err, ErrUnauthorizedChannel) {
		return
	}
	t.Fatalf("err = %v, want ErrUnauthorizedChannel", err)
}
// TestSyntheticRuntimeTestServiceDisabledRejects builds a runtime with Enabled
// set to false and expects SendTestService to return an error matching
// ErrMeshRuntimeDisabled.
func TestSyntheticRuntimeTestServiceDisabledRejects(t *testing.T) {
	route := testServiceRoute("route-test-service-disabled", []string{"node-a", "node-b"})

	cfg := SyntheticRuntimeConfig{
		Enabled: false,
		Local:   PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:  []SyntheticRoute{route},
	}
	disabled := NewSyntheticRuntime(cfg)

	req := testServiceRequest("request-disabled", "hello")
	_, err := disabled.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, req)
	if errors.Is(err, ErrMeshRuntimeDisabled) {
		return
	}
	t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
// TestSyntheticRelaySchedulerAcceptsTestServiceMessage enqueues an envelope of
// type SyntheticMessageTestService carrying a marshalled test-service request
// and expects to dequeue an envelope with the same message type.
func TestSyntheticRelaySchedulerAcceptsTestServiceMessage(t *testing.T) {
	scheduler := testRelayScheduler()

	outgoing := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageTestService, 42)
	outgoing.Payload = mustMarshalTestServiceRequest(testServiceRequest("request-relay-scheduler", "hello"))

	_, err := scheduler.Enqueue(outgoing)
	if err != nil {
		t.Fatalf("enqueue test service: %v", err)
	}

	dequeued, err := scheduler.Dequeue()
	if err != nil {
		t.Fatalf("dequeue test service: %v", err)
	}

	if got := dequeued.MessageType; got != SyntheticMessageTestService {
		t.Fatalf("MessageType = %q, want %q", got, SyntheticMessageTestService)
	}
}
// testServiceRoute returns the route produced by testRoute with its
// AllowedChannels replaced by a single entry, SyntheticChannelRouteControl.
func testServiceRoute(routeID string, hops []string) SyntheticRoute {
	serviceRoute := testRoute(routeID, hops)
	routeControlOnly := []string{SyntheticChannelRouteControl}
	serviceRoute.AllowedChannels = routeControlOnly
	return serviceRoute
}
// testServiceRequest builds a SyntheticTestServiceRequest with the given
// request ID and payload, using SyntheticDefaultTestOrganizationID and
// SyntheticTestServiceType for the organization and service type.
func testServiceRequest(requestID string, payload string) SyntheticTestServiceRequest {
	var req SyntheticTestServiceRequest
	req.RequestID = requestID
	req.OrganizationID = SyntheticDefaultTestOrganizationID
	req.ServiceType = SyntheticTestServiceType
	req.Payload = payload
	return req
}
// mustMarshalTestServiceRequest JSON-encodes request and returns the bytes.
// It panics if encoding fails.
func mustMarshalTestServiceRequest(request SyntheticTestServiceRequest) []byte {
	encoded, err := json.Marshal(request)
	if err == nil {
		return encoded
	}
	panic(err)
}
// TestSyntheticRuntimeTestServiceRejectsMissingRequestID sends a request whose
// RequestID is the empty string and expects SendTestService to return an error
// matching ErrSyntheticRequestInvalid.
func TestSyntheticRuntimeTestServiceRejectsMissingRequestID(t *testing.T) {
	route := testServiceRoute("route-test-service-missing-request", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	req := testServiceRequest("", "hello")
	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, req)
	if errors.Is(err, ErrSyntheticRequestInvalid) {
		return
	}
	t.Fatalf("err = %v, want ErrSyntheticRequestInvalid", err)
}
// TestSyntheticRuntimeTestServiceAllowsMaxPayloadBoundary configures both
// nodes with MaxTestPayloadBytes of 8 and sends a payload of exactly eight "a"
// characters, expecting the send to succeed and the payload to be echoed back.
func TestSyntheticRuntimeTestServiceAllowsMaxPayloadBoundary(t *testing.T) {
	route := testServiceRoute("route-test-service-max", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: make(map[string]*SyntheticRuntime)}

	// Both nodes share the same configuration apart from their node ID.
	newNode := func(nodeID string) *SyntheticRuntime {
		return NewSyntheticRuntime(SyntheticRuntimeConfig{
			Enabled:             true,
			Local:               PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID},
			Routes:              []SyntheticRoute{route},
			Transport:           transport,
			MaxTestPayloadBytes: 8,
		})
	}
	sender := newNode("node-a")
	receiver := newNode("node-b")
	transport.nodes["node-a"] = sender
	transport.nodes["node-b"] = receiver

	body := strings.Repeat("a", 8)
	result, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-max", body))
	if err != nil {
		t.Fatalf("send test service: %v", err)
	}
	if echoed := result.Response.EchoPayload; echoed != body {
		t.Fatalf("EchoPayload = %q", echoed)
	}
}