Initial project snapshot

This commit is contained in:
2026-04-28 22:29:50 +03:00
commit 8ba0561f4f
365 changed files with 91832 additions and 0 deletions
@@ -0,0 +1,70 @@
package agent
import (
"runtime"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
// Version is the node-agent version string. It is reported in the enrollment
// capabilities and facts, the heartbeat ReportedVersion and the telemetry
// payload built by this package.
const Version = "0.1.0-c3"
// EnrollmentPayload assembles the enrollment request for this node.
//
// clusterID and joinToken are copied through unchanged; the node name,
// fingerprint and public key come from identity. The capability map is a
// fixed set of flags (only "can_run_rdp_worker" is true), the facts map
// carries the Go runtime OS/architecture plus the agent name and Version,
// and RequestedRoles is always an empty, non-nil slice so the agent never
// asks for a role itself.
func EnrollmentPayload(clusterID, joinToken string, identity state.Identity) client.EnrollRequest {
	capabilities := map[string]any{
		"can_accept_client_ingress":   false,
		"can_accept_node_ingress":     false,
		"can_route_mesh":              false,
		"can_run_rdp_worker":          true,
		"can_run_vnc_worker":          false,
		"can_run_vpn_exit":            false,
		"can_run_vpn_connector":       false,
		"can_run_file_cache":          false,
		"can_run_update_cache":        false,
		"can_run_video_relay":         false,
		"native_node_agent_version":   Version,
		"service_supervision_enabled": false,
	}
	facts := map[string]any{
		"os":        runtime.GOOS,
		"arch":      runtime.GOARCH,
		"agent":     "rap-node-agent",
		"agent_ver": Version,
	}

	var req client.EnrollRequest
	req.ClusterID = clusterID
	req.JoinToken = joinToken
	req.NodeName = identity.NodeName
	req.NodeFingerprint = identity.NodeFingerprint
	req.PublicKey = identity.PublicKey
	req.ReportedCapabilities = capabilities
	req.ReportedFacts = facts
	// Non-nil empty slice: encodes as [] rather than null.
	req.RequestedRoles = []string{}
	return req
}
// HeartbeatPayload returns the fixed heartbeat body for this agent stage:
// health "healthy", the agent Version, a single native_node_agent
// capability, and a workload_supervision state that explicitly says it is
// not implemented.
func HeartbeatPayload() client.HeartbeatRequest {
	var hb client.HeartbeatRequest
	hb.HealthStatus = "healthy"
	hb.ReportedVersion = Version
	hb.Capabilities = map[string]any{"native_node_agent": true}
	hb.ServiceStates = map[string]any{"workload_supervision": "not_implemented_c3"}
	hb.Metadata = map[string]any{"stage": "c3"}
	return hb
}
// MeshSelfObservationPayload builds a mesh link observation in which the
// node reports itself as both source and target, with status "reachable".
// The metadata marks the observation as type "self" and keeps
// traffic_forwarding set to false.
func MeshSelfObservationPayload(identity state.Identity) client.MeshLinkObservationRequest {
	self := identity.NodeID
	metadata := map[string]any{
		"stage":              "c6",
		"traffic_forwarding": false,
		"observation_type":   "self",
	}

	var observation client.MeshLinkObservationRequest
	observation.SourceNodeID = self
	observation.TargetNodeID = self
	observation.LinkStatus = "reachable"
	observation.Metadata = metadata
	return observation
}
@@ -0,0 +1,44 @@
package agent
import (
"testing"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
func TestEnrollmentPayloadDoesNotRequestRolesByDefault(t *testing.T) {
payload := EnrollmentPayload("cluster-1", "join-token", state.Identity{
NodeName: "node-a",
NodeFingerprint: "fp",
PublicKey: "pub",
})
if payload.ClusterID != "cluster-1" || payload.JoinToken != "join-token" {
t.Fatalf("unexpected enrollment payload: %+v", payload)
}
if len(payload.RequestedRoles) != 0 {
t.Fatalf("agent must not self-assign roles: %+v", payload.RequestedRoles)
}
if payload.ReportedCapabilities["can_run_rdp_worker"] != true {
t.Fatalf("expected rdp capability in MVP payload: %+v", payload.ReportedCapabilities)
}
}
// TestHeartbeatPayloadIsStatusOnly checks that the heartbeat reports a healthy
// status and does not claim that workload supervision is running.
func TestHeartbeatPayloadIsStatusOnly(t *testing.T) {
	got := HeartbeatPayload()

	if status := got.HealthStatus; status != "healthy" {
		t.Fatalf("HealthStatus = %q", status)
	}
	supervision := got.ServiceStates["workload_supervision"]
	if supervision == "running" {
		t.Fatal("C3 must not pretend workload supervision is implemented")
	}
}
// TestMeshSelfObservationDoesNotEnableTrafficForwarding checks that the self
// observation uses the node's own ID on both ends and leaves the
// traffic_forwarding metadata flag false.
func TestMeshSelfObservationDoesNotEnableTrafficForwarding(t *testing.T) {
	const nodeID = "node-1"
	got := MeshSelfObservationPayload(state.Identity{NodeID: nodeID})

	if got.SourceNodeID != nodeID || got.TargetNodeID != nodeID {
		t.Fatalf("unexpected mesh self observation payload: %+v", got)
	}
	if forwarding := got.Metadata["traffic_forwarding"]; forwarding != false {
		t.Fatalf("traffic forwarding must stay disabled in C6: %+v", got.Metadata)
	}
}
@@ -0,0 +1,33 @@
package agent
import (
"runtime"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
)
// TelemetryPayload builds a telemetry report from Go runtime statistics.
//
// The memory figures come from runtime.MemStats (Alloc as "used", Sys as
// "total"), so they describe this agent process's Go heap/runtime memory.
// ProcessCount carries the goroutine count. The free-form payload repeats
// the agent name, Version, node name, OS/arch, goroutine count and the
// whole-second uptime measured from startedAt.
//
// The goroutine count and the clock are each sampled exactly once so that
// ProcessCount agrees with the "goroutines" payload entry and ObservedAt
// agrees with the instant used for "uptime_seconds".
func TelemetryPayload(identity state.Identity, startedAt time.Time) client.TelemetryRequest {
	var mem runtime.MemStats
	runtime.ReadMemStats(&mem)

	now := time.Now()
	goroutines := runtime.NumGoroutine()
	used := int64(mem.Alloc)
	total := int64(mem.Sys)

	return client.TelemetryRequest{
		MemoryUsedBytes:  &used,
		MemoryTotalBytes: &total,
		ProcessCount:     &goroutines,
		Payload: map[string]any{
			"agent":            "rap-node-agent",
			"agent_version":    Version,
			"node_name":        identity.NodeName,
			"os":               runtime.GOOS,
			"arch":             runtime.GOARCH,
			"goroutines":       goroutines,
			"uptime_seconds":   int64(now.Sub(startedAt).Seconds()),
			"telemetry_source": "testing_flag",
		},
		ObservedAt: now.UTC(),
	}
}
@@ -0,0 +1,110 @@
package authority
import (
"crypto/ed25519"
"crypto/sha256"
"encoding/base64"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"strings"
)
// Schema and algorithm identifiers for cluster authority data.
// VerifyRaw rejects any signature whose schema_version is not
// SignatureSchemaVersion or whose algorithm is not AlgorithmEd25519.
const (
AuthoritySchemaVersion = "rap.cluster_authority.v1"
SignatureSchemaVersion = "rap.cluster_authority.signature.v1"
AlgorithmEd25519 = "ed25519"
)
// Sentinel errors returned by this package. They are wrapped with %w where
// extra detail is attached, so callers should match them with errors.Is.
var (
// ErrInvalidKey reports a public key that is not valid base64 or has the wrong length.
ErrInvalidKey = errors.New("invalid cluster authority key")
// ErrInvalidSignature reports a malformed, mismatched or failing signature.
ErrInvalidSignature = errors.New("invalid cluster authority signature")
// ErrInvalidPayload reports an empty or non-JSON payload.
ErrInvalidPayload = errors.New("invalid cluster authority payload")
)
// Signature is the detached signature envelope checked by VerifyRaw.
type Signature struct {
// SchemaVersion must equal SignatureSchemaVersion.
SchemaVersion string `json:"schema_version"`
// Algorithm must equal AlgorithmEd25519.
Algorithm string `json:"algorithm"`
// KeyFingerprint must equal Fingerprint of the verifying public key.
KeyFingerprint string `json:"key_fingerprint"`
// Signature is the base64 (padded or unpadded) Ed25519 signature.
Signature string `json:"signature"`
}
// VerifyRaw checks that signature is a valid Ed25519 signature over the
// canonical JSON form of payload, made by the key in publicKeyB64.
//
// Checks run in this order, and the first failure is returned:
//  1. signature.SchemaVersion and signature.Algorithm match the expected
//     constants (ErrInvalidSignature);
//  2. publicKeyB64 decodes to an Ed25519 public key (ErrInvalidKey);
//  3. signature.KeyFingerprint equals Fingerprint of that key
//     (ErrInvalidSignature);
//  4. payload canonicalizes via CanonicalJSON (ErrInvalidPayload);
//  5. signature.Signature is base64 of exactly ed25519.SignatureSize bytes
//     and verifies against the canonical bytes (ErrInvalidSignature).
//
// It returns nil only when every check passes.
func VerifyRaw(publicKeyB64 string, payload json.RawMessage, signature Signature) error {
	switch {
	case signature.SchemaVersion != SignatureSchemaVersion:
		return fmt.Errorf("%w: schema_version must be %s", ErrInvalidSignature, SignatureSchemaVersion)
	case signature.Algorithm != AlgorithmEd25519:
		return fmt.Errorf("%w: algorithm must be %s", ErrInvalidSignature, AlgorithmEd25519)
	}

	key, err := decodePublicKey(publicKeyB64)
	if err != nil {
		return err
	}
	if Fingerprint(key) != signature.KeyFingerprint {
		return fmt.Errorf("%w: key fingerprint mismatch", ErrInvalidSignature)
	}

	message, err := CanonicalJSON(payload)
	if err != nil {
		return err
	}

	sigBytes, decodeErr := decodeBase64(strings.TrimSpace(signature.Signature))
	if decodeErr != nil || len(sigBytes) != ed25519.SignatureSize {
		return fmt.Errorf("%w: signature must be base64 ed25519 signature", ErrInvalidSignature)
	}

	if ed25519.Verify(key, message, sigBytes) {
		return nil
	}
	return ErrInvalidSignature
}
// Fingerprint returns the identifier of publicKey: the fixed prefix
// "rap-ca-ed25519-" followed by the lowercase hex of the first 16 bytes of
// the key's SHA-256 digest.
func Fingerprint(publicKey ed25519.PublicKey) string {
	const prefix = "rap-ca-ed25519-"
	digest := sha256.Sum256(publicKey)
	truncated := digest[:16]
	return prefix + hex.EncodeToString(truncated)
}
// HashRaw returns the lowercase hex SHA-256 digest of the canonical JSON
// form of raw. Any error from CanonicalJSON is returned unchanged together
// with an empty string.
func HashRaw(raw json.RawMessage) (string, error) {
	normalized, err := CanonicalJSON(raw)
	if err != nil {
		return "", err
	}
	digest := sha256.Sum256(normalized)
	encoded := hex.EncodeToString(digest[:])
	return encoded, nil
}
// CanonicalJSON re-encodes raw into the byte form that is signed and hashed
// by this package: the document is decoded into generic Go values and
// marshalled again with encoding/json, which emits object keys in sorted
// order and no insignificant whitespace.
//
// An empty input or invalid JSON yields an error wrapping ErrInvalidPayload.
//
// NOTE(review): decoding into `any` turns every JSON number into float64,
// so integers beyond 2^53 may not round-trip exactly — confirm the signer
// canonicalizes the same way before relying on large numeric fields.
func CanonicalJSON(raw json.RawMessage) ([]byte, error) {
	if len(raw) == 0 {
		return nil, fmt.Errorf("%w: empty payload", ErrInvalidPayload)
	}

	var decoded any
	decodeErr := json.Unmarshal(raw, &decoded)
	if decodeErr != nil {
		return nil, fmt.Errorf("%w: invalid json: %v", ErrInvalidPayload, decodeErr)
	}

	encoded, encodeErr := json.Marshal(decoded)
	if encodeErr != nil {
		return nil, fmt.Errorf("%w: canonical json: %v", ErrInvalidPayload, encodeErr)
	}
	return encoded, nil
}
// decodePublicKey parses a base64 string (surrounding whitespace ignored)
// into an Ed25519 public key. It returns an error wrapping ErrInvalidKey
// when the text is not base64 or does not decode to exactly
// ed25519.PublicKeySize bytes.
func decodePublicKey(value string) (ed25519.PublicKey, error) {
	raw, err := decodeBase64(strings.TrimSpace(value))
	switch {
	case err != nil:
		return nil, fmt.Errorf("%w: public key must be base64 encoded", ErrInvalidKey)
	case len(raw) != ed25519.PublicKeySize:
		return nil, fmt.Errorf("%w: public key must decode to %d bytes", ErrInvalidKey, ed25519.PublicKeySize)
	}
	return ed25519.PublicKey(raw), nil
}
// decodeBase64 decodes value as standard-alphabet base64, accepting both the
// padded and the unpadded form. An empty string is rejected. When neither
// form decodes, the error from the unpadded attempt is returned.
func decodeBase64(value string) ([]byte, error) {
	if len(value) == 0 {
		return nil, errors.New("empty base64 value")
	}
	// Try the padded encoding first; fall back to the raw (unpadded) one.
	if padded, err := base64.StdEncoding.DecodeString(value); err == nil {
		return padded, nil
	}
	return base64.RawStdEncoding.DecodeString(value)
}
@@ -0,0 +1,52 @@
package authority
import (
"crypto/ed25519"
"encoding/base64"
"encoding/json"
"errors"
"testing"
)
// TestVerifyRawAcceptsSignedPayload signs the canonical form of a payload
// with a freshly generated key and expects VerifyRaw to accept it.
func TestVerifyRawAcceptsSignedPayload(t *testing.T) {
	pub, priv, err := ed25519.GenerateKey(nil)
	if err != nil {
		t.Fatalf("GenerateKey: %v", err)
	}

	raw := json.RawMessage(`{"cluster_id":"cluster-1","schema_version":"test.v1"}`)
	message, err := CanonicalJSON(raw)
	if err != nil {
		t.Fatalf("CanonicalJSON: %v", err)
	}

	var sig Signature
	sig.SchemaVersion = SignatureSchemaVersion
	sig.Algorithm = AlgorithmEd25519
	sig.KeyFingerprint = Fingerprint(pub)
	sig.Signature = base64.StdEncoding.EncodeToString(ed25519.Sign(priv, message))

	encodedKey := base64.StdEncoding.EncodeToString(pub)
	if verifyErr := VerifyRaw(encodedKey, raw, sig); verifyErr != nil {
		t.Fatalf("VerifyRaw: %v", verifyErr)
	}
}
// TestVerifyRawRejectsTamperedPayload signs one payload and then verifies a
// different payload with the same signature, expecting ErrInvalidSignature.
func TestVerifyRawRejectsTamperedPayload(t *testing.T) {
	pub, priv, err := ed25519.GenerateKey(nil)
	if err != nil {
		t.Fatalf("GenerateKey: %v", err)
	}

	signedPayload := json.RawMessage(`{"cluster_id":"cluster-1","schema_version":"test.v1"}`)
	message, err := CanonicalJSON(signedPayload)
	if err != nil {
		t.Fatalf("CanonicalJSON: %v", err)
	}

	var sig Signature
	sig.SchemaVersion = SignatureSchemaVersion
	sig.Algorithm = AlgorithmEd25519
	sig.KeyFingerprint = Fingerprint(pub)
	sig.Signature = base64.StdEncoding.EncodeToString(ed25519.Sign(priv, message))

	// Same shape, different cluster_id: the signature no longer matches.
	tampered := json.RawMessage(`{"cluster_id":"cluster-2","schema_version":"test.v1"}`)
	encodedKey := base64.StdEncoding.EncodeToString(pub)
	err = VerifyRaw(encodedKey, tampered, sig)
	if !errors.Is(err, ErrInvalidSignature) {
		t.Fatalf("err = %v, want ErrInvalidSignature", err)
	}
}
@@ -0,0 +1,400 @@
package client
import (
	"bytes"
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
	"time"
)
// Client talks to the backend API over HTTP. Request URLs are formed by
// appending an endpoint path to baseURL.
type Client struct {
baseURL string
httpClient *http.Client
}
// EnrollRequest is the JSON body posted by Client.Enroll to
// /node-agents/enroll.
type EnrollRequest struct {
ClusterID string `json:"cluster_id"`
JoinToken string `json:"join_token"`
NodeName string `json:"node_name"`
NodeFingerprint string `json:"node_fingerprint"`
PublicKey string `json:"public_key"`
ReportedCapabilities map[string]any `json:"reported_capabilities"`
ReportedFacts map[string]any `json:"reported_facts"`
RequestedRoles []string `json:"requested_roles"`
}
// EnrollResponse is the decoded reply to Client.Enroll. JoinRequest is kept
// as raw JSON and is not interpreted by this package.
type EnrollResponse struct {
Status string `json:"status"`
JoinRequest json.RawMessage `json:"join_request"`
}
// EnrollmentBootstrapRequest is the JSON body posted by
// Client.BootstrapEnrollment.
type EnrollmentBootstrapRequest struct {
ClusterID string `json:"cluster_id"`
NodeFingerprint string `json:"node_fingerprint"`
PublicKey string `json:"public_key"`
}
// EnrollmentBootstrapResponse is the decoded reply to
// Client.BootstrapEnrollment. Bootstrap is nil when the response carries no
// "node_bootstrap" object.
type EnrollmentBootstrapResponse struct {
Status string `json:"status"`
JoinRequest json.RawMessage `json:"join_request"`
Bootstrap *NodeBootstrap `json:"node_bootstrap,omitempty"`
}
// NodeBootstrap is the "node_bootstrap" object of an
// EnrollmentBootstrapResponse. The authority descriptor, payload and
// signature are optional; AuthorityPayload is kept as raw JSON.
type NodeBootstrap struct {
NodeID string `json:"node_id"`
ClusterID string `json:"cluster_id"`
IdentityStatus string `json:"identity_status"`
Certificate map[string]any `json:"certificate"`
HeartbeatEndpoint string `json:"heartbeat_endpoint"`
ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"`
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
}
// HeartbeatRequest is the JSON body posted by Client.Heartbeat.
// ReportedVersion is omitted from the JSON when empty.
type HeartbeatRequest struct {
HealthStatus string `json:"health_status"`
ReportedVersion string `json:"reported_version,omitempty"`
Capabilities map[string]any `json:"capabilities"`
ServiceStates map[string]any `json:"service_states"`
Metadata map[string]any `json:"metadata"`
}
// HeartbeatResponse is the decoded reply to Client.Heartbeat. Heartbeat is
// kept as raw JSON; TestingFlags carries the "testing_flags" object.
type HeartbeatResponse struct {
Heartbeat json.RawMessage `json:"heartbeat"`
TestingFlags EffectiveTestingFlags `json:"testing_flags"`
}
// EffectiveTestingFlags is the "testing_flags" object returned with a
// heartbeat response.
type EffectiveTestingFlags struct {
Enabled bool `json:"enabled"`
TelemetryEnabled bool `json:"telemetry_enabled"`
SyntheticLinksEnabled bool `json:"synthetic_links_enabled"`
HistoryRetentionHours int `json:"history_retention_hours"`
AppliedScopes []string `json:"applied_scopes"`
}
// DesiredWorkload is one element of the "desired_workloads" list returned
// by Client.DesiredWorkloads.
type DesiredWorkload struct {
ServiceType string `json:"service_type"`
DesiredState string `json:"desired_state"`
Version string `json:"version,omitempty"`
RuntimeMode string `json:"runtime_mode"`
ArtifactRef string `json:"artifact_ref,omitempty"`
Config map[string]any `json:"config"`
Environment map[string]any `json:"environment"`
}
// WorkloadStatusRequest is the JSON body posted by
// Client.ReportWorkloadStatus for a single service type.
type WorkloadStatusRequest struct {
ReportedState string `json:"reported_state"`
RuntimeMode string `json:"runtime_mode"`
Version string `json:"version,omitempty"`
StatusPayload map[string]any `json:"status_payload"`
}
// MeshLinkObservationRequest is the JSON body posted by
// Client.ReportMeshLink. LatencyMs and QualityScore are optional and are
// omitted from the JSON when nil.
type MeshLinkObservationRequest struct {
SourceNodeID string `json:"source_node_id"`
TargetNodeID string `json:"target_node_id"`
LinkStatus string `json:"link_status"`
LatencyMs *int `json:"latency_ms,omitempty"`
QualityScore *int `json:"quality_score,omitempty"`
Metadata map[string]any `json:"metadata"`
}
// TelemetryRequest is the JSON body posted by Client.ReportTelemetry.
// Every numeric metric is a pointer so that an unmeasured value is omitted
// from the JSON instead of being sent as zero.
type TelemetryRequest struct {
CPUPercent *float64 `json:"cpu_percent,omitempty"`
MemoryUsedBytes *int64 `json:"memory_used_bytes,omitempty"`
MemoryTotalBytes *int64 `json:"memory_total_bytes,omitempty"`
DiskUsedBytes *int64 `json:"disk_used_bytes,omitempty"`
DiskTotalBytes *int64 `json:"disk_total_bytes,omitempty"`
NetworkRxBytes *int64 `json:"network_rx_bytes,omitempty"`
NetworkTxBytes *int64 `json:"network_tx_bytes,omitempty"`
ProcessCount *int `json:"process_count,omitempty"`
Payload map[string]any `json:"payload"`
ObservedAt time.Time `json:"observed_at"`
}
// SyntheticMeshRouteConfig is one element of SyntheticMeshConfig.Routes.
type SyntheticMeshRouteConfig struct {
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
Hops []string `json:"hops"`
AllowedChannels []string `json:"allowed_channels"`
ExpiresAt time.Time `json:"expires_at"`
MaxTTL int `json:"max_ttl"`
MaxHops int `json:"max_hops"`
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticMeshConfig is the "synthetic_mesh_config" object returned by
// Client.SyntheticMeshConfig. The authority descriptor, payload and
// signature are optional; AuthorityPayload is kept as raw JSON.
type SyntheticMeshConfig struct {
Enabled bool `json:"enabled"`
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
LocalNodeID string `json:"local_node_id"`
AuthorityRequired bool `json:"authority_required"`
ClusterAuthority *ClusterAuthorityDescriptor `json:"cluster_authority,omitempty"`
AuthorityPayload json.RawMessage `json:"authority_payload,omitempty"`
AuthoritySignature *ClusterSignature `json:"authority_signature,omitempty"`
ConfigVersion string `json:"config_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerEndpoints map[string]string `json:"peer_endpoints"`
PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates,omitempty"`
PeerDirectory []PeerDirectoryEntry `json:"peer_directory,omitempty"`
RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds,omitempty"`
RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases,omitempty"`
RendezvousRelayPolicy *RendezvousRelayPolicyReport `json:"rendezvous_relay_policy,omitempty"`
RoutePathDecisions *RoutePathDecisionReport `json:"route_path_decisions,omitempty"`
Routes []SyntheticMeshRouteConfig `json:"routes"`
ProductionForwarding bool `json:"production_forwarding"`
}
// ClusterAuthorityDescriptor is the "cluster_authority" object carried by
// NodeBootstrap and SyntheticMeshConfig.
type ClusterAuthorityDescriptor struct {
SchemaVersion string `json:"schema_version"`
ClusterID string `json:"cluster_id"`
AuthorityState string `json:"authority_state"`
KeyAlgorithm string `json:"key_algorithm"`
PublicKey string `json:"public_key"`
PublicKeyFingerprint string `json:"public_key_fingerprint"`
CreatedAt time.Time `json:"created_at"`
UpdatedAt time.Time `json:"updated_at"`
}
// ClusterSignature is the "authority_signature" object carried by
// NodeBootstrap and SyntheticMeshConfig: a signature envelope plus the time
// it was signed.
type ClusterSignature struct {
SchemaVersion string `json:"schema_version"`
Algorithm string `json:"algorithm"`
KeyFingerprint string `json:"key_fingerprint"`
Signature string `json:"signature"`
SignedAt time.Time `json:"signed_at"`
}
// PeerDirectoryEntry is one element of SyntheticMeshConfig.PeerDirectory.
type PeerDirectoryEntry struct {
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
}
// PeerRecoverySeed is one element of SyntheticMeshConfig.RecoverySeeds.
// LastVerifiedAt is optional; Metadata is kept as raw JSON.
type PeerRecoverySeed struct {
NodeID string `json:"node_id"`
Endpoint string `json:"endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
Region string `json:"region,omitempty"`
Priority int `json:"priority"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// PeerRendezvousLease is one element of SyntheticMeshConfig.RendezvousLeases.
// Metadata is kept as raw JSON.
type PeerRendezvousLease struct {
LeaseID string `json:"lease_id"`
PeerNodeID string `json:"peer_node_id"`
RelayNodeID string `json:"relay_node_id"`
RelayEndpoint string `json:"relay_endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
RouteIDs []string `json:"route_ids,omitempty"`
AllowedChannels []string `json:"allowed_channels,omitempty"`
Priority int `json:"priority"`
ControlPlaneOnly bool `json:"control_plane_only"`
IssuedAt time.Time `json:"issued_at"`
ExpiresAt time.Time `json:"expires_at"`
Reason string `json:"reason,omitempty"`
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// RendezvousRelayPolicyDecision is one element of
// RendezvousRelayPolicyReport.Decisions.
type RendezvousRelayPolicyDecision struct {
RouteID string `json:"route_id,omitempty"`
PeerNodeID string `json:"peer_node_id"`
WithdrawnLeaseID string `json:"withdrawn_lease_id,omitempty"`
StaleRelayNodeID string `json:"stale_relay_node_id,omitempty"`
SelectedRelayID string `json:"selected_relay_id,omitempty"`
SelectedEndpoint string `json:"selected_endpoint,omitempty"`
Score int `json:"score,omitempty"`
Reason string `json:"reason"`
ScoreReasons []string `json:"score_reasons,omitempty"`
ReporterNodeID string `json:"reporter_node_id,omitempty"`
}
// RendezvousRelayPolicyReport is the optional "rendezvous_relay_policy"
// object of a SyntheticMeshConfig.
type RendezvousRelayPolicyReport struct {
SchemaVersion string `json:"schema_version"`
ScoringMode string `json:"scoring_mode"`
FeedbackMaxAgeSeconds int `json:"feedback_max_age_seconds"`
StaleRelayCount int `json:"stale_relay_count"`
WithdrawnLeaseCount int `json:"withdrawn_lease_count"`
ReplacementLeaseCount int `json:"replacement_lease_count"`
Decisions []RendezvousRelayPolicyDecision `json:"decisions,omitempty"`
}
// RoutePathDecision is one element of RoutePathDecisionReport.Decisions.
type RoutePathDecision struct {
DecisionID string `json:"decision_id"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
LocalNodeID string `json:"local_node_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
OriginalHops []string `json:"original_hops"`
EffectiveHops []string `json:"effective_hops"`
PreviousHopID string `json:"previous_hop_id,omitempty"`
NextHopID string `json:"next_hop_id,omitempty"`
LocalRole string `json:"local_role"`
SelectedRelayID string `json:"selected_relay_id,omitempty"`
SelectedRelayEndpoint string `json:"selected_relay_endpoint,omitempty"`
StaleRelayNodeID string `json:"stale_relay_node_id,omitempty"`
RendezvousPeerNodeID string `json:"rendezvous_peer_node_id,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RendezvousLeaseReason string `json:"rendezvous_lease_reason,omitempty"`
DecisionSource string `json:"decision_source"`
Generation string `json:"generation"`
PathScore int `json:"path_score,omitempty"`
ScoreReasons []string `json:"score_reasons,omitempty"`
ControlPlaneOnly bool `json:"control_plane_only"`
ProductionForwarding bool `json:"production_forwarding"`
ExpiresAt time.Time `json:"expires_at"`
}
// RoutePathDecisionReport is the optional "route_path_decisions" object of
// a SyntheticMeshConfig.
type RoutePathDecisionReport struct {
SchemaVersion string `json:"schema_version"`
DecisionMode string `json:"decision_mode"`
Generation string `json:"generation"`
DecisionCount int `json:"decision_count"`
ReplacementDecisionCount int `json:"replacement_decision_count"`
ControlPlaneOnly bool `json:"control_plane_only"`
ProductionForwarding bool `json:"production_forwarding"`
Decisions []RoutePathDecision `json:"decisions,omitempty"`
}
// PeerEndpointCandidate is one entry in the per-node lists of
// SyntheticMeshConfig.PeerEndpointCandidates. LastVerifiedAt is optional;
// Metadata is kept as raw JSON.
type PeerEndpointCandidate struct {
EndpointID string `json:"endpoint_id"`
NodeID string `json:"node_id"`
Transport string `json:"transport"`
Address string `json:"address"`
AddressFamily string `json:"address_family,omitempty"`
Reachability string `json:"reachability"`
NATType string `json:"nat_type,omitempty"`
ConnectivityMode string `json:"connectivity_mode"`
Region string `json:"region,omitempty"`
Priority int `json:"priority"`
PolicyTags []string `json:"policy_tags,omitempty"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// New returns a Client that sends requests to baseURL using an HTTP client
// with a 15-second overall timeout. baseURL is used as given; endpoint
// paths (which start with "/") are appended to it verbatim.
func New(baseURL string) *Client {
	const requestTimeout = 15 * time.Second
	transport := &http.Client{Timeout: requestTimeout}
	return &Client{baseURL: baseURL, httpClient: transport}
}
// Enroll posts request to /node-agents/enroll and returns the decoded
// response. On any error the zero EnrollResponse is returned with it.
func (c *Client) Enroll(ctx context.Context, request EnrollRequest) (EnrollResponse, error) {
	var enrolled EnrollResponse
	err := c.postJSON(ctx, "/node-agents/enroll", request, &enrolled)
	if err != nil {
		return EnrollResponse{}, err
	}
	return enrolled, nil
}
// BootstrapEnrollment posts request to
// /node-agents/enrollments/{joinRequestID}/bootstrap and returns the decoded
// response. On any error the zero EnrollmentBootstrapResponse is returned
// with it.
//
// joinRequestID is escaped as a single path segment so that characters such
// as "/", "?" or "#" cannot change which endpoint is addressed.
func (c *Client) BootstrapEnrollment(ctx context.Context, joinRequestID string, request EnrollmentBootstrapRequest) (EnrollmentBootstrapResponse, error) {
	path := fmt.Sprintf("/node-agents/enrollments/%s/bootstrap", url.PathEscape(joinRequestID))

	var response EnrollmentBootstrapResponse
	if err := c.postJSON(ctx, path, request, &response); err != nil {
		return EnrollmentBootstrapResponse{}, err
	}
	return response, nil
}
// Heartbeat posts request to /clusters/{clusterID}/nodes/{nodeID}/heartbeats
// and returns the decoded response. On any error the zero HeartbeatResponse
// is returned with it.
//
// clusterID and nodeID are each escaped as a single path segment so that
// characters such as "/", "?" or "#" cannot change which endpoint is
// addressed.
func (c *Client) Heartbeat(ctx context.Context, clusterID, nodeID string, request HeartbeatRequest) (HeartbeatResponse, error) {
	path := fmt.Sprintf("/clusters/%s/nodes/%s/heartbeats", url.PathEscape(clusterID), url.PathEscape(nodeID))

	var response HeartbeatResponse
	if err := c.postJSON(ctx, path, request, &response); err != nil {
		return HeartbeatResponse{}, err
	}
	return response, nil
}
// DesiredWorkloads fetches
// /clusters/{clusterID}/nodes/{nodeID}/workloads/desired and returns the
// "desired_workloads" list from the response. On any error it returns nil
// and the error.
//
// clusterID and nodeID are each escaped as a single path segment so that
// characters such as "/", "?" or "#" cannot change which endpoint is
// addressed.
func (c *Client) DesiredWorkloads(ctx context.Context, clusterID, nodeID string) ([]DesiredWorkload, error) {
	path := fmt.Sprintf("/clusters/%s/nodes/%s/workloads/desired", url.PathEscape(clusterID), url.PathEscape(nodeID))

	// The list is wrapped in an envelope object on the wire.
	var response struct {
		DesiredWorkloads []DesiredWorkload `json:"desired_workloads"`
	}
	if err := c.getJSON(ctx, path, &response); err != nil {
		return nil, err
	}
	return response.DesiredWorkloads, nil
}
// ReportWorkloadStatus posts request to
// /clusters/{clusterID}/nodes/{nodeID}/workloads/{serviceType}/status. The
// response body is not decoded.
//
// clusterID, nodeID and serviceType are each escaped as a single path
// segment so that characters such as "/", "?" or "#" cannot change which
// endpoint is addressed.
func (c *Client) ReportWorkloadStatus(ctx context.Context, clusterID, nodeID, serviceType string, request WorkloadStatusRequest) error {
	path := fmt.Sprintf(
		"/clusters/%s/nodes/%s/workloads/%s/status",
		url.PathEscape(clusterID), url.PathEscape(nodeID), url.PathEscape(serviceType),
	)
	return c.postJSON(ctx, path, request, nil)
}
// ReportMeshLink posts request to /clusters/{clusterID}/mesh/links. The
// response body is not decoded.
//
// clusterID is escaped as a single path segment so that characters such as
// "/", "?" or "#" cannot change which endpoint is addressed.
func (c *Client) ReportMeshLink(ctx context.Context, clusterID string, request MeshLinkObservationRequest) error {
	path := fmt.Sprintf("/clusters/%s/mesh/links", url.PathEscape(clusterID))
	return c.postJSON(ctx, path, request, nil)
}
// ReportTelemetry posts request to
// /clusters/{clusterID}/nodes/{nodeID}/telemetry. The response body is not
// decoded.
//
// clusterID and nodeID are each escaped as a single path segment so that
// characters such as "/", "?" or "#" cannot change which endpoint is
// addressed.
func (c *Client) ReportTelemetry(ctx context.Context, clusterID, nodeID string, request TelemetryRequest) error {
	path := fmt.Sprintf("/clusters/%s/nodes/%s/telemetry", url.PathEscape(clusterID), url.PathEscape(nodeID))
	return c.postJSON(ctx, path, request, nil)
}
// SyntheticMeshConfig fetches
// /clusters/{clusterID}/nodes/{nodeID}/mesh/synthetic-config and returns the
// "synthetic_mesh_config" object from the response. On any error the zero
// SyntheticMeshConfig is returned with it.
//
// clusterID and nodeID are each escaped as a single path segment so that
// characters such as "/", "?" or "#" cannot change which endpoint is
// addressed.
func (c *Client) SyntheticMeshConfig(ctx context.Context, clusterID, nodeID string) (SyntheticMeshConfig, error) {
	path := fmt.Sprintf("/clusters/%s/nodes/%s/mesh/synthetic-config", url.PathEscape(clusterID), url.PathEscape(nodeID))

	// The config is wrapped in an envelope object on the wire.
	var response struct {
		Config SyntheticMeshConfig `json:"synthetic_mesh_config"`
	}
	if err := c.getJSON(ctx, path, &response); err != nil {
		return SyntheticMeshConfig{}, err
	}
	return response.Config, nil
}
// getJSON issues a GET to baseURL+path and, on a 2xx status, decodes the
// JSON body into response. A non-2xx status yields an error carrying the
// status code. When response is nil the body is not decoded.
func (c *Client) getJSON(ctx context.Context, path string, response any) error {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, c.baseURL+path, nil)
	if err != nil {
		return err
	}

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	switch code := resp.StatusCode; {
	case code < 200 || code >= 300:
		return fmt.Errorf("backend returned status %d", code)
	case response == nil:
		return nil
	default:
		return json.NewDecoder(resp.Body).Decode(response)
	}
}
// postJSON marshals request to JSON, POSTs it to baseURL+path with a
// Content-Type of application/json and, on a 2xx status, decodes the JSON
// body into response. A non-2xx status yields an error carrying the status
// code. When response is nil the body is not decoded.
func (c *Client) postJSON(ctx context.Context, path string, request any, response any) error {
	encoded, err := json.Marshal(request)
	if err != nil {
		return err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+path, bytes.NewReader(encoded))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/json")

	resp, err := c.httpClient.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	switch code := resp.StatusCode; {
	case code < 200 || code >= 300:
		return fmt.Errorf("backend returned status %d", code)
	case response == nil:
		return nil
	default:
		return json.NewDecoder(resp.Body).Decode(response)
	}
}
@@ -0,0 +1,183 @@
package config
import (
"errors"
"flag"
"os"
"path/filepath"
"strconv"
"strings"
"time"
)
// MaxMeshProductionObservationSinkCapacity is the largest value Load accepts
// for Config.MeshProductionObservationSinkCapacity.
const MaxMeshProductionObservationSinkCapacity = 10000
// Config holds the node-agent settings produced by Load. Each field is
// populated from a command-line flag whose default comes from a RAP_*
// environment variable; string fields are whitespace-trimmed by Load.
type Config struct {
// BackendURL is the backend API base URL, without a trailing slash.
BackendURL string
ClusterID string
ClusterAuthorityPublicKey string
ClusterAuthorityFingerprint string
JoinToken string
NodeName string
StateDir string
WorkloadSupervisionEnabled bool
// HeartbeatInterval and EnrollmentPollInterval must be positive;
// EnrollmentPollTimeout must not be negative.
HeartbeatInterval time.Duration
EnrollmentPollInterval time.Duration
EnrollmentPollTimeout time.Duration
MeshSyntheticRuntimeEnabled bool
MeshProductionForwardingEnabled bool
// MeshProductionObservationSinkCapacity is in the range
// [0, MaxMeshProductionObservationSinkCapacity].
MeshProductionObservationSinkCapacity int
MeshListenAddr string
// MeshAdvertiseEndpoint is stored without a trailing slash.
MeshAdvertiseEndpoint string
MeshAdvertiseEndpointsJSON string
MeshAdvertiseTransport string
MeshConnectivityMode string
MeshNATType string
MeshRegion string
MeshSyntheticConfigPath string
MeshPeerEndpointsJSON string
MeshSyntheticRoutesJSON string
}
// Load builds a Config from command-line args and environment variables.
//
// Every setting is a flag whose default is taken from the matching RAP_*
// entry in env (or a built-in default when that entry is unset or blank),
// so an explicit flag always wins over the environment. When env is nil the
// process environment is used. After parsing, string settings are trimmed
// (BackendURL and MeshAdvertiseEndpoint also lose trailing slashes) and the
// result is validated.
//
// It returns the flag parse error unchanged, or a descriptive error when
// BackendURL, NodeName or StateDir is empty, an interval is not positive,
// the enrollment poll timeout is negative, or the observation sink capacity
// is outside [0, MaxMeshProductionObservationSinkCapacity]. On error the
// returned Config is the zero value.
func Load(args []string, env map[string]string) (Config, error) {
	if env == nil {
		env = readEnv()
	}

	var cfg Config
	fs := flag.NewFlagSet("rap-node-agent", flag.ContinueOnError)

	// Small registration helpers: each flag's default is resolved from env.
	stringFlag := func(dst *string, name, envKey, fallback, usage string) {
		fs.StringVar(dst, name, getEnv(env, envKey, fallback), usage)
	}
	boolFlag := func(dst *bool, name, envKey string, fallback bool, usage string) {
		fs.BoolVar(dst, name, getEnvBool(env, envKey, fallback), usage)
	}
	secondsFlag := func(dst *time.Duration, name, envKey string, fallbackSeconds int, usage string) {
		fs.DurationVar(dst, name, time.Duration(getEnvInt(env, envKey, fallbackSeconds))*time.Second, usage)
	}

	defaultStateDir := filepath.Join(".", ".rap-node-agent")

	stringFlag(&cfg.BackendURL, "backend-url", "RAP_BACKEND_URL", "http://127.0.0.1:8080/api/v1", "Backend API base URL.")
	stringFlag(&cfg.ClusterID, "cluster-id", "RAP_CLUSTER_ID", "", "Cluster ID.")
	stringFlag(&cfg.ClusterAuthorityPublicKey, "cluster-authority-public-key", "RAP_CLUSTER_AUTHORITY_PUBLIC_KEY", "", "Pinned cluster authority Ed25519 public key.")
	stringFlag(&cfg.ClusterAuthorityFingerprint, "cluster-authority-fingerprint", "RAP_CLUSTER_AUTHORITY_FINGERPRINT", "", "Pinned cluster authority key fingerprint.")
	stringFlag(&cfg.JoinToken, "join-token", "RAP_JOIN_TOKEN", "", "Short-lived node join token.")
	stringFlag(&cfg.NodeName, "node-name", "RAP_NODE_NAME", hostnameOrDefault(), "Node display name.")
	stringFlag(&cfg.StateDir, "state-dir", "RAP_NODE_STATE_DIR", defaultStateDir, "Local node-agent state directory.")

	boolFlag(&cfg.WorkloadSupervisionEnabled, "workload-supervision-enabled", "RAP_WORKLOAD_SUPERVISION_ENABLED", false, "Enable desired workload polling and status reporting. Disabled by default while service runtime is not implemented.")
	boolFlag(&cfg.MeshSyntheticRuntimeEnabled, "mesh-synthetic-runtime-enabled", "RAP_MESH_SYNTHETIC_RUNTIME_ENABLED", false, "Enable C17A synthetic fabric probe runtime. Disabled by default.")
	boolFlag(&cfg.MeshProductionForwardingEnabled, "mesh-production-forwarding-enabled", "RAP_MESH_PRODUCTION_FORWARDING_ENABLED", false, "Enable production fabric-control direct next-hop forwarding gate. Disabled by default.")

	fs.IntVar(&cfg.MeshProductionObservationSinkCapacity, "mesh-production-observation-sink-capacity", getEnvSignedInt(env, "RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY", 0), "Bounded local metadata-only production envelope observation sink capacity. Disabled when 0.")

	stringFlag(&cfg.MeshListenAddr, "mesh-listen-addr", "RAP_MESH_LISTEN_ADDR", "", "Listen address for disabled-by-default C17E synthetic mesh HTTP endpoint.")
	stringFlag(&cfg.MeshAdvertiseEndpoint, "mesh-advertise-endpoint", "RAP_MESH_ADVERTISE_ENDPOINT", "", "Advertised mesh endpoint reported to the Control Plane. Empty disables endpoint reporting.")
	stringFlag(&cfg.MeshAdvertiseEndpointsJSON, "mesh-advertise-endpoints-json", "RAP_MESH_ADVERTISE_ENDPOINTS_JSON", "", "JSON array of advertised mesh endpoint candidates, including private/corporate endpoints.")
	stringFlag(&cfg.MeshAdvertiseTransport, "mesh-advertise-transport", "RAP_MESH_ADVERTISE_TRANSPORT", "direct_tcp_tls", "Transport label for the advertised mesh endpoint.")
	stringFlag(&cfg.MeshConnectivityMode, "mesh-connectivity-mode", "RAP_MESH_CONNECTIVITY_MODE", "direct", "Connectivity mode reported with the advertised mesh endpoint.")
	stringFlag(&cfg.MeshNATType, "mesh-nat-type", "RAP_MESH_NAT_TYPE", "unknown", "NAT type hint reported with the advertised mesh endpoint.")
	stringFlag(&cfg.MeshRegion, "mesh-region", "RAP_MESH_REGION", "", "Optional region/site hint for the advertised mesh endpoint.")
	stringFlag(&cfg.MeshSyntheticConfigPath, "mesh-synthetic-config", "RAP_MESH_SYNTHETIC_CONFIG", "", "Path to scoped synthetic mesh config snapshot. Preferred over debug JSON env.")
	stringFlag(&cfg.MeshPeerEndpointsJSON, "mesh-peer-endpoints-json", "RAP_MESH_PEER_ENDPOINTS_JSON", "", "JSON object mapping peer node_id to synthetic mesh endpoint URL.")
	stringFlag(&cfg.MeshSyntheticRoutesJSON, "mesh-synthetic-routes-json", "RAP_MESH_SYNTHETIC_ROUTES_JSON", "", "JSON array of synthetic mesh routes for test-only runtime.")

	secondsFlag(&cfg.HeartbeatInterval, "heartbeat-interval", "RAP_HEARTBEAT_INTERVAL_SECONDS", 15, "Heartbeat interval.")
	secondsFlag(&cfg.EnrollmentPollInterval, "enrollment-poll-interval", "RAP_ENROLLMENT_POLL_INTERVAL_SECONDS", 5, "Enrollment approval polling interval.")
	secondsFlag(&cfg.EnrollmentPollTimeout, "enrollment-poll-timeout", "RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS", 600, "Enrollment approval polling timeout.")

	if err := fs.Parse(args); err != nil {
		return Config{}, err
	}

	// Normalize: trim whitespace everywhere, then drop trailing slashes from
	// the two URL-like settings.
	for _, field := range []*string{
		&cfg.BackendURL,
		&cfg.ClusterID,
		&cfg.ClusterAuthorityPublicKey,
		&cfg.ClusterAuthorityFingerprint,
		&cfg.JoinToken,
		&cfg.NodeName,
		&cfg.StateDir,
		&cfg.MeshListenAddr,
		&cfg.MeshAdvertiseEndpoint,
		&cfg.MeshAdvertiseEndpointsJSON,
		&cfg.MeshAdvertiseTransport,
		&cfg.MeshConnectivityMode,
		&cfg.MeshNATType,
		&cfg.MeshRegion,
		&cfg.MeshSyntheticConfigPath,
		&cfg.MeshPeerEndpointsJSON,
		&cfg.MeshSyntheticRoutesJSON,
	} {
		*field = strings.TrimSpace(*field)
	}
	cfg.BackendURL = strings.TrimRight(cfg.BackendURL, "/")
	cfg.MeshAdvertiseEndpoint = strings.TrimRight(cfg.MeshAdvertiseEndpoint, "/")

	// Validate; the first failing rule decides the error.
	switch {
	case cfg.BackendURL == "":
		return Config{}, errors.New("backend URL is required")
	case cfg.NodeName == "":
		return Config{}, errors.New("node name is required")
	case cfg.StateDir == "":
		return Config{}, errors.New("state dir is required")
	case cfg.HeartbeatInterval <= 0:
		return Config{}, errors.New("heartbeat interval must be positive")
	case cfg.EnrollmentPollInterval <= 0:
		return Config{}, errors.New("enrollment poll interval must be positive")
	case cfg.EnrollmentPollTimeout < 0:
		return Config{}, errors.New("enrollment poll timeout must not be negative")
	case cfg.MeshProductionObservationSinkCapacity < 0:
		return Config{}, errors.New("mesh production observation sink capacity must not be negative")
	case cfg.MeshProductionObservationSinkCapacity > MaxMeshProductionObservationSinkCapacity:
		return Config{}, errors.New("mesh production observation sink capacity exceeds maximum")
	}
	return cfg, nil
}
// readEnv returns the process environment as a map. Each "KEY=VALUE" entry
// is split at its first "="; entries without "=" are skipped.
func readEnv() map[string]string {
	environ := os.Environ()
	vars := make(map[string]string, len(environ))
	for i := range environ {
		entry := environ[i]
		sep := strings.IndexByte(entry, '=')
		if sep < 0 {
			continue
		}
		vars[entry[:sep]] = entry[sep+1:]
	}
	return vars
}
// getEnv returns the whitespace-trimmed value of key in env, or fallback
// when the key is missing or its value is blank.
func getEnv(env map[string]string, key, fallback string) string {
	trimmed := strings.TrimSpace(env[key])
	if trimmed == "" {
		return fallback
	}
	return trimmed
}
// getEnvInt returns the value of key in env parsed as a strictly positive
// integer. A missing, blank, non-numeric, zero or negative value yields
// fallback.
func getEnvInt(env map[string]string, key string, fallback int) int {
	// Atoi rejects the empty string, so the blank case needs no special branch.
	if n, err := strconv.Atoi(strings.TrimSpace(env[key])); err == nil && n > 0 {
		return n
	}
	return fallback
}
// getEnvSignedInt parses env[key] as a decimal integer of any sign. It
// returns fallback when the value is missing, blank, or not an integer; zero
// and negative values are returned as parsed.
func getEnvSignedInt(env map[string]string, key string, fallback int) int {
	raw := strings.TrimSpace(env[key])
	// An empty string fails to parse, so the blank case also lands on fallback.
	n, err := strconv.Atoi(raw)
	if err == nil {
		return n
	}
	return fallback
}
// getEnvBool interprets env[key] as a boolean, ignoring case and surrounding
// whitespace. "1", "true", "yes", "y" and "on" mean true; "0", "false", "no",
// "n" and "off" mean false; anything else (including a missing key) yields
// fallback.
func getEnvBool(env map[string]string, key string, fallback bool) bool {
	switch normalized := strings.ToLower(strings.TrimSpace(env[key])); normalized {
	case "1", "true", "yes", "y", "on":
		return true
	case "0", "false", "no", "n", "off":
		return false
	}
	return fallback
}
// hostnameOrDefault returns the operating system's host name, or "rap-node"
// when the lookup fails or yields only whitespace. A usable host name is
// returned exactly as the OS reported it (not trimmed).
func hostnameOrDefault() string {
	const defaultName = "rap-node"
	name, err := os.Hostname()
	if err == nil && strings.TrimSpace(name) != "" {
		return name
	}
	return defaultName
}
@@ -0,0 +1,104 @@
package config
import (
"testing"
"time"
)
// TestLoadConfigFromEnvAndArgs loads a fully populated environment together
// with a -node-name flag and checks every resulting field. It pins that the
// flag overrides RAP_NODE_NAME and that the trailing slash is stripped from
// the backend URL and the advertise endpoint.
func TestLoadConfigFromEnvAndArgs(t *testing.T) {
	args := []string{"-node-name", "node-b"}
	env := map[string]string{
		"RAP_BACKEND_URL": "http://backend/api/v1/",
		"RAP_CLUSTER_ID": "cluster-1",
		"RAP_CLUSTER_AUTHORITY_PUBLIC_KEY": "public-key-b64",
		"RAP_CLUSTER_AUTHORITY_FINGERPRINT": "rap-ca-ed25519-test",
		"RAP_JOIN_TOKEN": "join-token",
		"RAP_NODE_NAME": "node-a",
		"RAP_NODE_STATE_DIR": "/tmp/rap-node",
		"RAP_WORKLOAD_SUPERVISION_ENABLED": "true",
		"RAP_HEARTBEAT_INTERVAL_SECONDS": "7",
		"RAP_ENROLLMENT_POLL_INTERVAL_SECONDS": "3",
		"RAP_ENROLLMENT_POLL_TIMEOUT_SECONDS": "30",
		"RAP_MESH_SYNTHETIC_RUNTIME_ENABLED": "true",
		"RAP_MESH_PRODUCTION_FORWARDING_ENABLED": "true",
		"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "5",
		"RAP_MESH_LISTEN_ADDR": "127.0.0.1:19001",
		"RAP_MESH_ADVERTISE_ENDPOINT": "https://node-a.example.test:443/",
		"RAP_MESH_ADVERTISE_ENDPOINTS_JSON": `[{"endpoint_id":"node-a-lan","address":"10.10.0.20:19001"}]`,
		"RAP_MESH_ADVERTISE_TRANSPORT": "wss",
		"RAP_MESH_CONNECTIVITY_MODE": "outbound_only",
		"RAP_MESH_NAT_TYPE": "symmetric",
		"RAP_MESH_REGION": "eu",
		"RAP_MESH_SYNTHETIC_CONFIG": "/tmp/rap-node/mesh-synthetic.json",
		"RAP_MESH_PEER_ENDPOINTS_JSON": `{"node-b":"http://127.0.0.1:19002"}`,
		"RAP_MESH_SYNTHETIC_ROUTES_JSON": `[{"route_id":"route-1"}]`,
	}

	got, err := Load(args, env)
	if err != nil {
		t.Fatalf("load config: %v", err)
	}

	// Core identity and backend settings.
	if got.BackendURL != "http://backend/api/v1" {
		t.Fatalf("BackendURL = %q", got.BackendURL)
	}
	if got.NodeName != "node-b" {
		t.Fatalf("NodeName = %q", got.NodeName)
	}
	authorityPinned := got.ClusterAuthorityPublicKey == "public-key-b64" &&
		got.ClusterAuthorityFingerprint == "rap-ca-ed25519-test"
	if !authorityPinned {
		t.Fatalf("unexpected cluster authority pin config: %+v", got)
	}

	// Interval settings are given in seconds and exposed as durations.
	if got.HeartbeatInterval != 7*time.Second {
		t.Fatalf("HeartbeatInterval = %s", got.HeartbeatInterval)
	}
	pollingOK := got.EnrollmentPollInterval == 3*time.Second &&
		got.EnrollmentPollTimeout == 30*time.Second
	if !pollingOK {
		t.Fatalf("unexpected enrollment polling config: %+v", got)
	}

	// Feature switches.
	if !got.WorkloadSupervisionEnabled {
		t.Fatal("WorkloadSupervisionEnabled = false, want true")
	}
	if !got.MeshSyntheticRuntimeEnabled {
		t.Fatal("MeshSyntheticRuntimeEnabled = false, want true")
	}
	if !got.MeshProductionForwardingEnabled {
		t.Fatal("MeshProductionForwardingEnabled = false, want true")
	}
	if got.MeshProductionObservationSinkCapacity != 5 {
		t.Fatalf("MeshProductionObservationSinkCapacity = %d, want 5", got.MeshProductionObservationSinkCapacity)
	}

	// Mesh listener, advertisement and synthetic runtime settings.
	if got.MeshListenAddr != "127.0.0.1:19001" {
		t.Fatalf("MeshListenAddr = %q", got.MeshListenAddr)
	}
	advertiseOK := got.MeshAdvertiseEndpoint == "https://node-a.example.test:443" &&
		got.MeshAdvertiseEndpointsJSON != "" &&
		got.MeshAdvertiseTransport == "wss" &&
		got.MeshConnectivityMode == "outbound_only" &&
		got.MeshNATType == "symmetric" &&
		got.MeshRegion == "eu"
	if !advertiseOK {
		t.Fatalf("unexpected mesh advertise config: %+v", got)
	}
	if got.MeshSyntheticConfigPath != "/tmp/rap-node/mesh-synthetic.json" {
		t.Fatalf("MeshSyntheticConfigPath = %q", got.MeshSyntheticConfigPath)
	}
	if got.MeshPeerEndpointsJSON == "" || got.MeshSyntheticRoutesJSON == "" {
		t.Fatalf("mesh live synthetic config was not loaded: %+v", got)
	}
}
// TestLoadConfigRejectsNegativeProductionObservationSinkCapacity verifies that
// Load fails when the production observation sink capacity is negative.
func TestLoadConfigRejectsNegativeProductionObservationSinkCapacity(t *testing.T) {
	env := map[string]string{
		"RAP_BACKEND_URL": "http://backend/api/v1",
		"RAP_NODE_NAME": "node-a",
		"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "-1",
	}
	if _, err := Load(nil, env); err == nil {
		t.Fatal("Load returned nil error for negative sink capacity")
	}
}
// TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity verifies that
// Load fails when the production observation sink capacity is 10001.
func TestLoadConfigRejectsTooLargeProductionObservationSinkCapacity(t *testing.T) {
	env := map[string]string{
		"RAP_BACKEND_URL": "http://backend/api/v1",
		"RAP_NODE_NAME": "node-a",
		"RAP_MESH_PRODUCTION_OBSERVATION_SINK_CAPACITY": "10001",
	}
	if _, err := Load(nil, env); err == nil {
		t.Fatal("Load returned nil error for too-large sink capacity")
	}
}
@@ -0,0 +1,111 @@
package mesh
import (
"bytes"
"context"
"encoding/json"
"fmt"
"net/http"
"time"
)
// Client posts mesh protocol messages as JSON to one peer over HTTP.
type Client struct {
// BaseURL is the peer's base URL. Endpoint paths such as "/mesh/v1/health"
// are appended to it verbatim, so it should not end with a slash.
BaseURL string
// HTTPClient performs the requests. When nil, the Send methods fall back to
// http.DefaultClient.
HTTPClient *http.Client
}
// NewClient returns a Client for baseURL backed by an HTTP client with a
// five-second request timeout. baseURL is stored exactly as given.
func NewClient(baseURL string) Client {
	httpClient := &http.Client{Timeout: 5 * time.Second}
	return Client{
		BaseURL:    baseURL,
		HTTPClient: httpClient,
	}
}
// SendHealth POSTs message as JSON to BaseURL + "/mesh/v1/health" and decodes
// the peer's acknowledgement from the response body.
//
// It returns a zero HealthAck and an error when encoding, request creation,
// the HTTP exchange, or response decoding fails, or when the peer answers
// with a status outside the 2xx range.
func (c Client) SendHealth(ctx context.Context, message HealthMessage) (HealthAck, error) {
	var none HealthAck

	body, err := json.Marshal(message)
	if err != nil {
		return none, err
	}
	endpoint := c.BaseURL + "/mesh/v1/health"
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	// A Client built without NewClient may carry no HTTP client of its own.
	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}
	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh health rejected with status %d", code)
	}
	var ack HealthAck
	if err := json.NewDecoder(response.Body).Decode(&ack); err != nil {
		return none, err
	}
	return ack, nil
}
// SendSynthetic POSTs envelope as JSON to BaseURL + "/mesh/v1/synthetic/probe"
// and decodes the envelope the peer sends back.
//
// It returns a zero SyntheticEnvelope and an error when encoding, request
// creation, the HTTP exchange, or response decoding fails, or when the peer
// answers with a status outside the 2xx range.
func (c Client) SendSynthetic(ctx context.Context, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	var none SyntheticEnvelope

	body, err := json.Marshal(envelope)
	if err != nil {
		return none, err
	}
	endpoint := c.BaseURL + "/mesh/v1/synthetic/probe"
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	// A Client built without NewClient may carry no HTTP client of its own.
	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}
	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh synthetic probe rejected with status %d", code)
	}
	var reply SyntheticEnvelope
	if err := json.NewDecoder(response.Body).Decode(&reply); err != nil {
		return none, err
	}
	return reply, nil
}
// SendProduction POSTs envelope as JSON to BaseURL + "/mesh/v1/forward" and
// decodes the peer's forwarding result from the response body.
//
// It returns a zero ProductionForwardResult and an error when encoding,
// request creation, the HTTP exchange, or response decoding fails, or when
// the peer answers with a status outside the 2xx range.
func (c Client) SendProduction(ctx context.Context, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	var none ProductionForwardResult

	body, err := json.Marshal(envelope)
	if err != nil {
		return none, err
	}
	endpoint := c.BaseURL + "/mesh/v1/forward"
	request, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body))
	if err != nil {
		return none, err
	}
	request.Header.Set("Content-Type", "application/json")

	// A Client built without NewClient may carry no HTTP client of its own.
	doer := http.DefaultClient
	if c.HTTPClient != nil {
		doer = c.HTTPClient
	}
	response, err := doer.Do(request)
	if err != nil {
		return none, err
	}
	defer response.Body.Close()

	if code := response.StatusCode; code < 200 || code >= 300 {
		return none, fmt.Errorf("mesh production forward rejected with status %d", code)
	}
	var outcome ProductionForwardResult
	if err := json.NewDecoder(response.Body).Decode(&outcome); err != nil {
		return none, err
	}
	return outcome, nil
}
@@ -0,0 +1,288 @@
package mesh
import (
"encoding/json"
"errors"
"time"
)
// ProtocolVersion is the mesh control protocol version string.
// NOTE(review): presumably the value placed in the ProtocolVersion field of
// mesh messages — confirm against the senders.
const ProtocolVersion = "mesh-control-v1"
// Sentinel errors for mesh peer validation, production forwarding, and the
// synthetic mesh runtime. They are plain errors.New values, so callers can
// match them with errors.Is.
var (
// Peer validation.
ErrClusterMismatch = errors.New("mesh peer cluster mismatch")
ErrNodeMismatch = errors.New("mesh peer node mismatch")
// Production forwarding.
ErrForwardDisabled = errors.New("production payload forwarding is disabled by mesh production gate")
ErrForwardRuntimeUnavailable = errors.New("production mesh forwarding runtime is unavailable for this route or stage")
ErrForwardPeerUnavailable = errors.New("production mesh next peer is unavailable")
ErrForwardEnvelopeInvalid = errors.New("production mesh envelope is invalid")
ErrForwardObservationFailed = errors.New("production mesh envelope observation failed")
// Synthetic runtime, routes, relay queue, and test service.
ErrMeshRuntimeDisabled = errors.New("mesh synthetic runtime is disabled")
ErrUnsupportedSyntheticMessage = errors.New("unsupported synthetic mesh message")
ErrRouteIDRequired = errors.New("mesh synthetic route id is required")
ErrRouteNotFound = errors.New("mesh synthetic route not found")
ErrInvalidRoutePath = errors.New("mesh synthetic route path is invalid")
ErrRouteExpired = errors.New("mesh synthetic route is expired")
ErrTTLExhausted = errors.New("mesh synthetic route ttl exhausted")
ErrLoopDetected = errors.New("mesh synthetic route loop detected")
ErrUnauthorizedChannel = errors.New("mesh synthetic channel is not authorized")
ErrSyntheticPeerUnavailable = errors.New("mesh synthetic next peer is unavailable")
ErrNoHealthySyntheticRoute = errors.New("mesh synthetic no healthy route available")
ErrSyntheticRelayQueueFull = errors.New("mesh synthetic relay queue is full")
ErrSyntheticRelayQueueEmpty = errors.New("mesh synthetic relay queue is empty")
ErrSyntheticPayloadTooLarge = errors.New("mesh synthetic payload is too large")
ErrSyntheticOrganizationMismatch = errors.New("mesh synthetic organization mismatch")
ErrUnsupportedSyntheticService = errors.New("unsupported synthetic test service")
ErrSyntheticRequestInvalid = errors.New("mesh synthetic request is invalid")
)
// Identifiers and limits shared by the synthetic and production mesh
// protocols.
const (
// Synthetic message type identifiers.
SyntheticMessageProbe = "fabric.probe"
SyntheticMessageProbeAck = "fabric.probe_ack"
SyntheticMessageRouteHealth = "fabric.route_health"
SyntheticMessageRouteHealthAck = "fabric.route_health_ack"
SyntheticMessageTelemetry = "fabric.telemetry"
SyntheticMessageTestService = "fabric.test_service"
SyntheticMessageTestServiceAck = "fabric.test_service_ack"
// Synthetic test service: service type, default organization, and default
// payload size limit in bytes.
SyntheticTestServiceType = "synthetic.echo"
SyntheticDefaultTestOrganizationID = "org-test"
SyntheticDefaultMaxTestPayloadBytes = 4096
// Synthetic channel names.
SyntheticChannelFabricControl = "fabric_control"
SyntheticChannelRouteControl = "route_control"
SyntheticChannelTelemetry = "telemetry"
// Synthetic route health states.
SyntheticRouteStateUnknown = "unknown"
SyntheticRouteStateHealthy = "healthy"
SyntheticRouteStateDegraded = "degraded"
SyntheticRouteStateFailed = "failed"
// Production envelope channel, message type, and limits.
// NOTE(review): MaxProductionEnvelopeFutureSkew presumably bounds how far in
// the future an envelope timestamp may lie — confirm against the validator.
ProductionChannelFabricControl = "fabric_control"
ProductionMessageFabricControl = "fabric.control"
MaxProductionEnvelopePayloadBytes = 4096
MaxProductionEnvelopeFutureSkew = time.Minute
)
// PeerIdentity identifies a mesh peer by the cluster it belongs to and its
// node ID.
type PeerIdentity struct {
ClusterID string `json:"cluster_id"`
NodeID string `json:"node_id"`
}
// SyntheticRoute is the JSON description of one synthetic mesh route: its
// source and destination nodes, hop list, permitted channels, expiry, and
// TTL/hop limits. The three version fields are optional and omitted from JSON
// when empty.
type SyntheticRoute struct {
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
Hops []string `json:"hops"`
AllowedChannels []string `json:"allowed_channels"`
ExpiresAt time.Time `json:"expires_at"`
MaxTTL int `json:"max_ttl"`
MaxHops int `json:"max_hops"`
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticEnvelope is the JSON message exchanged for synthetic mesh traffic.
// It carries addressing, channel and message type, TTL and hop bookkeeping,
// and an optional payload kept as raw JSON so it can be decoded later
// according to MessageType.
type SyntheticEnvelope struct {
ProtocolVersion string `json:"protocol_version"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
From PeerIdentity `json:"from"`
To PeerIdentity `json:"to"`
Channel string `json:"channel"`
MessageType string `json:"message_type"`
TTL int `json:"ttl"`
HopCount int `json:"hop_count"`
Visited []string `json:"visited"`
Sequence uint64 `json:"sequence"`
SentAt time.Time `json:"sent_at"`
Payload json.RawMessage `json:"payload,omitempty"`
}
// SyntheticProbePayload holds a probe's identifier and send time.
// NOTE(review): presumably the payload of a "fabric.probe" envelope — confirm.
type SyntheticProbePayload struct {
ProbeID string `json:"probe_id"`
SentAt time.Time `json:"sent_at"`
}
// SyntheticProbeAckPayload holds the acknowledged probe's identifier, a path
// of node IDs, and the acceptance time.
// NOTE(review): presumably the payload of a "fabric.probe_ack" envelope —
// confirm.
type SyntheticProbeAckPayload struct {
ProbeID string `json:"probe_id"`
Path []string `json:"path"`
AcceptedAt time.Time `json:"accepted_at"`
}
// SyntheticRouteObservation is the recorded health of one synthetic route:
// its state, last success and failure details, success/failure counters,
// last latency in milliseconds, and optional version stamps.
type SyntheticRouteObservation struct {
RouteID string `json:"route_id"`
State string `json:"state"`
LastSuccessAt time.Time `json:"last_success_at,omitempty"`
LastFailureAt time.Time `json:"last_failure_at,omitempty"`
LastFailureReason string `json:"last_failure_reason,omitempty"`
SuccessCount uint64 `json:"success_count"`
FailureCount uint64 `json:"failure_count"`
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticRouteHealthResult reports the outcome of a route health check: the
// route that was requested, the route that was selected, whether a fallback
// was used, the acknowledgement envelope, and the resulting observation.
type SyntheticRouteHealthResult struct {
RequestedRouteID string `json:"requested_route_id"`
SelectedRouteID string `json:"selected_route_id"`
FallbackUsed bool `json:"fallback_used"`
Ack SyntheticEnvelope `json:"ack"`
Observation SyntheticRouteObservation `json:"observation"`
}
// SyntheticTestServiceRequest is a request to the synthetic test service,
// carrying the request and organization IDs, the service type, a string
// payload, and the send time.
type SyntheticTestServiceRequest struct {
RequestID string `json:"request_id"`
OrganizationID string `json:"organization_id"`
ServiceType string `json:"service_type"`
Payload string `json:"payload"`
SentAt time.Time `json:"sent_at"`
}
// SyntheticTestServiceResponse is the synthetic test service's reply. It
// repeats the request, organization and service identifiers and adds the
// echoed payload, a path of node IDs, and the acceptance time.
type SyntheticTestServiceResponse struct {
RequestID string `json:"request_id"`
OrganizationID string `json:"organization_id"`
ServiceType string `json:"service_type"`
EchoPayload string `json:"echo_payload"`
Path []string `json:"path"`
AcceptedAt time.Time `json:"accepted_at"`
}
// SyntheticTestServiceResult reports the outcome of a synthetic test service
// call: requested and selected route, whether a fallback was used, the
// acknowledgement envelope, the decoded response, and the route observation.
type SyntheticTestServiceResult struct {
RequestedRouteID string `json:"requested_route_id"`
SelectedRouteID string `json:"selected_route_id"`
FallbackUsed bool `json:"fallback_used"`
Ack SyntheticEnvelope `json:"ack"`
Response SyntheticTestServiceResponse `json:"response"`
Observation SyntheticRouteObservation `json:"observation"`
}
// SyntheticRouteCacheVersion groups the route, policy and peer directory
// version strings. Each field is omitted from JSON when empty.
type SyntheticRouteCacheVersion struct {
RouteVersion string `json:"route_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
}
// SyntheticRelayQueuePolicy configures the relay queue for one channel: its
// capacity and whether its entries are droppable.
// NOTE(review): the exact drop behaviour is implemented elsewhere — confirm
// before relying on Droppable semantics.
type SyntheticRelayQueuePolicy struct {
Channel string `json:"channel"`
Capacity int `json:"capacity"`
Droppable bool `json:"droppable"`
}
// SyntheticRelayEnqueueResult describes the outcome of enqueueing onto a relay
// queue: the channel, queue depth and capacity, whether an entry was dropped
// (and its sequence number), and the accepted sequence number.
type SyntheticRelayEnqueueResult struct {
Channel string `json:"channel"`
QueueDepth int `json:"queue_depth"`
QueueCapacity int `json:"queue_capacity"`
Dropped bool `json:"dropped"`
DroppedSequence uint64 `json:"dropped_sequence,omitempty"`
AcceptedSequence uint64 `json:"accepted_sequence"`
}
// SyntheticRelayQueueMetrics holds relay queue counters (enqueued, dequeued,
// dropped, rejected), the most recent rejection reason, and a map of queue
// depths.
// NOTE(review): QueueDepths is presumably keyed by channel name — confirm.
type SyntheticRelayQueueMetrics struct {
Enqueued uint64 `json:"enqueued"`
Dequeued uint64 `json:"dequeued"`
Dropped uint64 `json:"dropped"`
Rejected uint64 `json:"rejected"`
LastRejectReason string `json:"last_reject_reason,omitempty"`
QueueDepths map[string]int `json:"queue_depths"`
}
// HealthMessage is a link health report sent from one peer to another.
// LatencyMs and QualityScore are optional: nil pointers are omitted from the
// JSON encoding.
type HealthMessage struct {
ProtocolVersion string `json:"protocol_version"`
From PeerIdentity `json:"from"`
To PeerIdentity `json:"to"`
ObservedAt time.Time `json:"observed_at"`
LinkStatus string `json:"link_status"`
LatencyMs *int `json:"latency_ms,omitempty"`
QualityScore *int `json:"quality_score,omitempty"`
}
// HealthAck is the reply to a HealthMessage: whether the report was accepted
// and the identity of the peer that answered.
type HealthAck struct {
ProtocolVersion string `json:"protocol_version"`
Accepted bool `json:"accepted"`
By PeerIdentity `json:"by"`
}
// ProductionEnvelope is the JSON message used for production mesh forwarding.
// It carries route and hop addressing, channel class and message type, TTL
// and hop bookkeeping, creation and expiry times, and a raw JSON payload
// together with its declared length and hash.
// NOTE(review): the hash algorithm and the unit of PayloadLength are not
// visible here — confirm against the envelope validator.
type ProductionEnvelope struct {
FabricProtocolVersion string `json:"fabric_protocol_version"`
MessageID string `json:"message_id"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
CurrentHopNodeID string `json:"current_hop_node_id"`
NextHopNodeID string `json:"next_hop_node_id"`
RoutePath []string `json:"route_path,omitempty"`
VisitedNodeIDs []string `json:"visited_node_ids,omitempty"`
ChannelClass string `json:"channel_class"`
MessageType string `json:"message_type"`
TTL int `json:"ttl"`
HopCount int `json:"hop_count"`
CreatedAt time.Time `json:"created_at"`
ExpiresAt time.Time `json:"expires_at"`
PayloadLength int `json:"payload_length"`
PayloadHash string `json:"payload_hash"`
Payload json.RawMessage `json:"payload,omitempty"`
}
// ProductionEnvelopeObservation records the metadata of an observed production
// envelope plus the observation time. It mirrors ProductionEnvelope's routing
// fields and payload length/hash but has no payload, protocol version, or
// created/expiry timestamps.
type ProductionEnvelopeObservation struct {
MessageID string `json:"message_id"`
RouteID string `json:"route_id"`
ClusterID string `json:"cluster_id"`
SourceNodeID string `json:"source_node_id"`
DestinationNodeID string `json:"destination_node_id"`
CurrentHopNodeID string `json:"current_hop_node_id"`
NextHopNodeID string `json:"next_hop_node_id"`
RoutePath []string `json:"route_path,omitempty"`
VisitedNodeIDs []string `json:"visited_node_ids,omitempty"`
ChannelClass string `json:"channel_class"`
MessageType string `json:"message_type"`
TTL int `json:"ttl"`
HopCount int `json:"hop_count"`
PayloadLength int `json:"payload_length"`
PayloadHash string `json:"payload_hash"`
ObservedAt time.Time `json:"observed_at"`
}
// ProductionForwardResult is the reply to a production forward request. It
// reports whether the envelope was accepted, delivered and/or forwarded, the
// answering peer, the message and route IDs, and (optionally) the next node.
type ProductionForwardResult struct {
Accepted bool `json:"accepted"`
Delivered bool `json:"delivered"`
Forwarded bool `json:"forwarded"`
By PeerIdentity `json:"by"`
MessageID string `json:"message_id"`
RouteID string `json:"route_id"`
NextNodeID string `json:"next_node_id,omitempty"`
}
// ProductionForwardLogEntry is a structured log record for a production
// forwarding event. Only Event and OccurredAt are always encoded; every other
// field is omitted from JSON when it holds its zero value. The record carries
// lengths and counts for the route path, visited list and payload, not their
// contents.
type ProductionForwardLogEntry struct {
Event string `json:"event"`
RouteID string `json:"route_id,omitempty"`
MessageID string `json:"message_id,omitempty"`
ClusterID string `json:"cluster_id,omitempty"`
LocalNodeID string `json:"local_node_id,omitempty"`
SourceNodeID string `json:"source_node_id,omitempty"`
DestinationNodeID string `json:"destination_node_id,omitempty"`
CurrentHopNodeID string `json:"current_hop_node_id,omitempty"`
NextHopNodeID string `json:"next_hop_node_id,omitempty"`
ChannelClass string `json:"channel_class,omitempty"`
MessageType string `json:"message_type,omitempty"`
Reason string `json:"reason,omitempty"`
StatusCode int `json:"status_code,omitempty"`
TTL int `json:"ttl,omitempty"`
HopCount int `json:"hop_count,omitempty"`
RoutePathLength int `json:"route_path_length,omitempty"`
VisitedCount int `json:"visited_count,omitempty"`
PayloadLength int `json:"payload_length,omitempty"`
OccurredAt time.Time `json:"occurred_at"`
}
// ValidatePeer checks that remote may talk to local. It returns
// ErrClusterMismatch when either cluster ID is empty or the two differ,
// ErrNodeMismatch when remote has no node ID, and nil otherwise. The local
// node ID is not inspected.
func ValidatePeer(local PeerIdentity, remote PeerIdentity) error {
	switch {
	case local.ClusterID == "", remote.ClusterID == "", local.ClusterID != remote.ClusterID:
		return ErrClusterMismatch
	case remote.NodeID == "":
		return ErrNodeMismatch
	default:
		return nil
	}
}
@@ -0,0 +1,258 @@
package mesh
import (
"sort"
"strings"
"time"
)
// EndpointCandidateScoreOptions tunes how peer endpoint candidates are scored.
// The zero value is valid and disables every optional adjustment.
type EndpointCandidateScoreOptions struct {
// ChannelClass enables the control-channel adjustments when it equals
// SyntheticChannelFabricControl or SyntheticChannelRouteControl.
ChannelClass string
// PreferredRegion is compared case-insensitively with a candidate's region;
// the comparison is skipped when either side is empty.
PreferredRegion string
// Now is the reference time for freshness checks. A zero Now disables both
// the verification-age and observation-age checks.
Now time.Time
// MaxVerificationAge is the longest age at which a candidate's last
// verification still counts as fresh; values <= 0 disable the check.
MaxVerificationAge time.Duration
// Observations supplies optional health data, looked up by the candidate's
// EndpointID.
Observations map[string]EndpointCandidateHealthObservation
// MaxObservationAge is the longest age at which an observation is still
// used; values <= 0 disable the staleness check.
MaxObservationAge time.Duration
}
// EndpointCandidateHealthObservation is measured health data for one endpoint:
// last latency in milliseconds, success/failure counters, the last failure
// reason, a reliability score, and the observation time. Every field except
// EndpointID is tagged omitempty.
// NOTE(review): ReliabilityScore looks like a 0-100 scale (the scorer uses
// thresholds of 70 and 90) — confirm with the producer.
type EndpointCandidateHealthObservation struct {
EndpointID string `json:"endpoint_id"`
LastLatencyMs int64 `json:"last_latency_ms,omitempty"`
SuccessCount uint64 `json:"success_count,omitempty"`
FailureCount uint64 `json:"failure_count,omitempty"`
LastFailureReason string `json:"last_failure_reason,omitempty"`
ReliabilityScore int `json:"reliability_score,omitempty"`
ObservedAt time.Time `json:"observed_at,omitempty"`
}
// ScoredPeerEndpointCandidate pairs a candidate with its computed score and
// the list of reason tags that contributed to that score.
type ScoredPeerEndpointCandidate struct {
Candidate PeerEndpointCandidate `json:"candidate"`
Score int `json:"score"`
Reasons []string `json:"reasons,omitempty"`
}
// RankPeerEndpointCandidates scores every candidate with opts and returns the
// results ordered best first. Ties are broken deterministically: higher score,
// then lower Priority, then NodeID, then EndpointID in ascending order. It
// returns nil for an empty input and never drops a candidate.
func RankPeerEndpointCandidates(candidates []PeerEndpointCandidate, opts EndpointCandidateScoreOptions) []ScoredPeerEndpointCandidate {
	if len(candidates) == 0 {
		return nil
	}
	ranked := make([]ScoredPeerEndpointCandidate, len(candidates))
	for i := range candidates {
		ranked[i] = scorePeerEndpointCandidate(candidates[i], opts)
	}
	sort.SliceStable(ranked, func(a, b int) bool {
		left, right := &ranked[a], &ranked[b]
		switch {
		case left.Score != right.Score:
			return left.Score > right.Score
		case left.Candidate.Priority != right.Candidate.Priority:
			return left.Candidate.Priority < right.Candidate.Priority
		case left.Candidate.NodeID != right.Candidate.NodeID:
			return left.Candidate.NodeID < right.Candidate.NodeID
		default:
			return left.Candidate.EndpointID < right.Candidate.EndpointID
		}
	})
	return ranked
}
// scorePeerEndpointCandidate computes the score for one candidate. Starting
// from a base of 100 it applies, in order, adjustments for transport,
// reachability, connectivity mode, NAT type, priority, region, policy tags,
// control-channel preferences, verification freshness, and any health
// observation registered under the candidate's EndpointID. Every adjustment
// appends a reason tag, so Reasons lists the contributions in the order they
// were applied.
func scorePeerEndpointCandidate(candidate PeerEndpointCandidate, opts EndpointCandidateScoreOptions) ScoredPeerEndpointCandidate {
	total := 100
	why := []string{"base"}
	adjust := func(delta int, reason string) {
		total += delta
		why = append(why, reason)
	}

	switch candidate.Transport {
	case "direct_tcp_tls":
		adjust(35, "transport:direct_tcp_tls")
	case "wss":
		adjust(25, "transport:wss")
	case "outbound_reverse":
		adjust(10, "transport:outbound_reverse")
	case "relay":
		adjust(5, "transport:relay")
	default:
		adjust(-100, "transport:unknown")
	}

	switch candidate.Reachability {
	case "public":
		adjust(30, "reachability:public")
	case "private":
		adjust(15, "reachability:private")
	case "relay":
		adjust(5, "reachability:relay")
	case "outbound_only":
		adjust(-5, "reachability:outbound_only")
	default:
		adjust(-15, "reachability:unknown")
	}

	switch candidate.ConnectivityMode {
	case "direct":
		adjust(30, "connectivity:direct")
	case "outbound_only":
		adjust(5, "connectivity:outbound_only")
	case "relay_required":
		adjust(-5, "connectivity:relay_required")
	default:
		adjust(-10, "connectivity:unknown")
	}

	// An empty NAT type is treated the same as "none".
	switch candidate.NATType {
	case "", "none":
		adjust(15, "nat:none")
	case "full_cone":
		adjust(10, "nat:full_cone")
	case "restricted", "port_restricted":
		adjust(3, "nat:restricted")
	case "symmetric":
		adjust(-20, "nat:symmetric")
	case "blocked":
		adjust(-60, "nat:blocked")
	default:
		adjust(-8, "nat:unknown")
	}

	// A positive priority value is subtracted from the score.
	if candidate.Priority > 0 {
		adjust(-candidate.Priority, "priority")
	}

	if opts.PreferredRegion != "" && candidate.Region != "" {
		if strings.EqualFold(candidate.Region, opts.PreferredRegion) {
			adjust(12, "region:preferred")
		} else {
			adjust(-4, "region:remote")
		}
	}

	if hasPolicyTag(candidate.PolicyTags, "fast-path") {
		adjust(10, "policy:fast-path")
	}
	// Any one of the LAN-style tags earns the bonus; it is applied once.
	onLAN := hasPolicyTag(candidate.PolicyTags, "private-lan") ||
		hasPolicyTag(candidate.PolicyTags, "corp-lan") ||
		hasPolicyTag(candidate.PolicyTags, "same-site")
	if onLAN {
		adjust(18, "policy:private-lan")
	}
	if hasPolicyTag(candidate.PolicyTags, "costly") {
		adjust(-10, "policy:costly")
	}

	// Control channels favour direct connectivity and penalise relays.
	switch opts.ChannelClass {
	case SyntheticChannelFabricControl, SyntheticChannelRouteControl:
		if candidate.ConnectivityMode == "direct" {
			adjust(8, "channel:control-direct")
		}
		if candidate.Transport == "relay" {
			adjust(-8, "channel:control-relay-penalty")
		}
	}

	// Verification freshness needs a reference time, a verification timestamp
	// and a positive age limit. Timestamps in the future count as stale.
	if !opts.Now.IsZero() && candidate.LastVerifiedAt != nil && opts.MaxVerificationAge > 0 {
		age := opts.Now.Sub(candidate.LastVerifiedAt.UTC())
		if age < 0 || age > opts.MaxVerificationAge {
			adjust(-12, "verified:stale")
		} else {
			adjust(8, "verified:fresh")
		}
	}

	if observation, found := opts.Observations[candidate.EndpointID]; found {
		delta, extra := scoreEndpointCandidateObservation(observation, opts)
		total += delta
		why = append(why, extra...)
	}

	return ScoredPeerEndpointCandidate{
		Candidate: candidate,
		Score:     total,
		Reasons:   why,
	}
}
// scoreEndpointCandidateObservation converts a health observation into a score
// adjustment and the reason tags that explain it.
//
// When opts.Now, observation.ObservedAt and opts.MaxObservationAge are all
// set, an observation older than the limit (or dated in the future) is
// rejected outright: the result is -12 with the single reason
// "observation:stale" and no other field is considered. Otherwise the
// adjustment is the sum of the freshness, latency, reliability, success and
// failure history, and last-failure contributions.
//
// Zero-valued fields are treated as "no data": a LastLatencyMs that is zero or
// negative contributes nothing, as do a zero ReliabilityScore and zero
// counters.
func scoreEndpointCandidateObservation(observation EndpointCandidateHealthObservation, opts EndpointCandidateScoreOptions) (int, []string) {
	score := 0
	reasons := []string{"observation:present"}

	if !opts.Now.IsZero() && !observation.ObservedAt.IsZero() && opts.MaxObservationAge > 0 {
		age := opts.Now.Sub(observation.ObservedAt.UTC())
		if age < 0 || age > opts.MaxObservationAge {
			return -12, []string{"observation:stale"}
		}
		score += 6
		reasons = append(reasons, "observation:fresh")
	}

	// Only a positive latency is a real measurement. Previously a missing
	// (zero) sample fell into the "<= 150" case and earned the moderate bonus.
	switch latency := observation.LastLatencyMs; {
	case latency <= 0:
		// No latency sample recorded.
	case latency <= 50:
		score += 18
		reasons = append(reasons, "latency:low")
	case latency <= 150:
		score += 8
		reasons = append(reasons, "latency:moderate")
	default:
		score -= 10
		reasons = append(reasons, "latency:high")
	}

	if observation.ReliabilityScore > 0 {
		switch {
		case observation.ReliabilityScore >= 90:
			score += 15
			reasons = append(reasons, "reliability:high")
		case observation.ReliabilityScore >= 70:
			score += 5
			reasons = append(reasons, "reliability:moderate")
		default:
			score -= 12
			reasons = append(reasons, "reliability:low")
		}
	}

	// Counters are clamped while still unsigned so that very large values
	// cannot wrap when converted to int or multiplied.
	if observation.SuccessCount > 0 {
		bonus := 10 // one point per success, capped at 10
		if observation.SuccessCount < 10 {
			bonus = int(observation.SuccessCount)
		}
		score += bonus
		reasons = append(reasons, "history:success")
	}
	if observation.FailureCount > 0 {
		penalty := 30 // six points per failure, capped at 30
		if observation.FailureCount < 5 {
			penalty = int(observation.FailureCount) * 6
		}
		score -= penalty
		reasons = append(reasons, "history:failure")
	}

	if strings.TrimSpace(observation.LastFailureReason) != "" {
		score -= 8
		reasons = append(reasons, "failure:recent")
	}
	return score, reasons
}
// hasPolicyTag reports whether tags contains needle. Each tag is trimmed of
// surrounding whitespace and compared case-insensitively; needle itself is
// used as given.
func hasPolicyTag(tags []string, needle string) bool {
	for i := range tags {
		if !strings.EqualFold(strings.TrimSpace(tags[i]), needle) {
			continue
		}
		return true
	}
	return false
}
// boundedInt clamps value to the range [minValue, maxValue]. The lower bound
// is checked first, so it wins if the bounds are inverted.
func boundedInt(value, minValue, maxValue int) int {
	switch {
	case value < minValue:
		return minValue
	case value > maxValue:
		return maxValue
	default:
		return value
	}
}
@@ -0,0 +1,278 @@
package mesh
import (
"testing"
"time"
)
// TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath ranks a relay
// endpoint, a freshly verified direct public endpoint and a stale private
// endpoint, and expects the direct public endpoint to come out strictly on
// top with the fast-path and fresh-verification reasons recorded.
func TestRankPeerEndpointCandidatesPrefersDirectFreshPublicPath(t *testing.T) {
	now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
	fresh := now.Add(-time.Minute)
	stale := now.Add(-2 * time.Hour)

	relay := PeerEndpointCandidate{
		EndpointID:       "node-b-relay",
		NodeID:           "node-b",
		Transport:        "relay",
		Address:          "relay.example.test/node-b",
		Reachability:     "relay",
		NATType:          "symmetric",
		ConnectivityMode: "relay_required",
		Region:           "us",
		Priority:         1,
		LastVerifiedAt:   &fresh,
	}
	public := PeerEndpointCandidate{
		EndpointID:       "node-b-public",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "203.0.113.20:443",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Region:           "eu",
		Priority:         10,
		PolicyTags:       []string{"fast-path"},
		LastVerifiedAt:   &fresh,
	}
	privateStale := PeerEndpointCandidate{
		EndpointID:       "node-b-private-stale",
		NodeID:           "node-b",
		Transport:        "wss",
		Address:          "10.0.0.5:443",
		Reachability:     "private",
		NATType:          "restricted",
		ConnectivityMode: "direct",
		Region:           "eu",
		Priority:         5,
		LastVerifiedAt:   &stale,
	}
	opts := EndpointCandidateScoreOptions{
		ChannelClass:       SyntheticChannelFabricControl,
		PreferredRegion:    "eu",
		Now:                now,
		MaxVerificationAge: time.Hour,
	}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{relay, public, privateStale}, opts)
	if len(ranked) != 3 {
		t.Fatalf("ranked length = %d, want 3", len(ranked))
	}
	best, second := ranked[0], ranked[1]
	if best.Candidate.EndpointID != "node-b-public" {
		t.Fatalf("top endpoint = %q, want node-b-public: %+v", best.Candidate.EndpointID, ranked)
	}
	if best.Score <= second.Score {
		t.Fatalf("top score = %d, second = %d", best.Score, second.Score)
	}
	hasHints := containsReason(best.Reasons, "policy:fast-path") && containsReason(best.Reasons, "verified:fresh")
	if !hasHints {
		t.Fatalf("top reasons missing expected hints: %+v", best.Reasons)
	}
}
// TestRankPeerEndpointCandidatesUsesDeterministicTieBreak feeds two candidates
// that differ only in EndpointID and Address, in reverse order, and expects
// the one with the lexically smaller EndpointID to rank first.
func TestRankPeerEndpointCandidatesUsesDeterministicTieBreak(t *testing.T) {
	endpointB := PeerEndpointCandidate{
		EndpointID:       "endpoint-b",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "203.0.113.21:443",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Priority:         10,
	}
	endpointA := PeerEndpointCandidate{
		EndpointID:       "endpoint-a",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "203.0.113.20:443",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Priority:         10,
	}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{endpointB, endpointA}, EndpointCandidateScoreOptions{})
	if top := ranked[0].Candidate.EndpointID; top != "endpoint-a" {
		t.Fatalf("tie top endpoint = %q, want endpoint-a", top)
	}
}
// TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint checks that a
// private endpoint tagged "corp-lan"/"same-site" outranks an otherwise
// comparable public endpoint in the same preferred region.
func TestRankPeerEndpointCandidatesPrefersCorporatePrivateEndpoint(t *testing.T) {
	now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)

	public := PeerEndpointCandidate{
		EndpointID:       "node-b-public",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "203.0.113.20:443",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Region:           "corp-eu",
		Priority:         10,
	}
	corpLAN := PeerEndpointCandidate{
		EndpointID:       "node-b-corp-lan",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "10.24.10.20:19001",
		Reachability:     "private",
		NATType:          "none",
		ConnectivityMode: "direct",
		Region:           "corp-eu",
		Priority:         1,
		PolicyTags:       []string{"corp-lan", "same-site"},
	}
	opts := EndpointCandidateScoreOptions{
		ChannelClass:    SyntheticChannelFabricControl,
		PreferredRegion: "corp-eu",
		Now:             now,
	}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{public, corpLAN}, opts)
	best := ranked[0]
	if best.Candidate.EndpointID != "node-b-corp-lan" {
		t.Fatalf("top endpoint = %q, want node-b-corp-lan: %+v", best.Candidate.EndpointID, ranked)
	}
	hasHints := containsReason(best.Reasons, "policy:private-lan") && containsReason(best.Reasons, "region:preferred")
	if !hasHints {
		t.Fatalf("corp LAN reasons missing: %+v", best.Reasons)
	}
}
// TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback verifies that
// low-scoring outbound-only and relay-required candidates are still returned,
// with their endpoint IDs intact, rather than being filtered out.
func TestRankPeerEndpointCandidatesDoesNotDropRelayRequiredFallback(t *testing.T) {
	outbound := PeerEndpointCandidate{
		EndpointID:       "node-b-outbound",
		NodeID:           "node-b",
		Transport:        "outbound_reverse",
		Address:          "node-b.reverse.local",
		Reachability:     "outbound_only",
		NATType:          "symmetric",
		ConnectivityMode: "outbound_only",
		Priority:         20,
	}
	relay := PeerEndpointCandidate{
		EndpointID:       "node-b-relay",
		NodeID:           "node-b",
		Transport:        "relay",
		Address:          "relay.example.test/node-b",
		Reachability:     "relay",
		NATType:          "blocked",
		ConnectivityMode: "relay_required",
		Priority:         30,
	}
	opts := EndpointCandidateScoreOptions{ChannelClass: SyntheticChannelRouteControl}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{outbound, relay}, opts)
	if len(ranked) != 2 {
		t.Fatalf("ranked length = %d, want 2", len(ranked))
	}
	for i := range ranked {
		if ranked[i].Candidate.EndpointID == "" {
			t.Fatalf("ranked candidate lost identity: %+v", ranked[i])
		}
	}
}
// TestRankPeerEndpointCandidatesUsesHealthObservationOverlay checks that fresh
// health observations can overturn the static ranking: a WSS endpoint with low
// latency and high reliability must beat a direct endpoint that has high
// latency, failures and a recorded failure reason.
func TestRankPeerEndpointCandidatesUsesHealthObservationOverlay(t *testing.T) {
	now := time.Date(2026, 4, 28, 13, 0, 0, 0, time.UTC)
	observedAt := now.Add(-time.Minute)

	direct := PeerEndpointCandidate{
		EndpointID:       "node-b-direct",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "203.0.113.20:443",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Priority:         10,
	}
	wss := PeerEndpointCandidate{
		EndpointID:       "node-b-wss",
		NodeID:           "node-b",
		Transport:        "wss",
		Address:          "node-b.example.test",
		Reachability:     "public",
		NATType:          "restricted",
		ConnectivityMode: "direct",
		Priority:         10,
	}
	observations := map[string]EndpointCandidateHealthObservation{
		"node-b-direct": {
			EndpointID:        "node-b-direct",
			LastLatencyMs:     240,
			FailureCount:      3,
			LastFailureReason: "connect_timeout",
			ReliabilityScore:  50,
			ObservedAt:        observedAt,
		},
		"node-b-wss": {
			EndpointID:       "node-b-wss",
			LastLatencyMs:    35,
			SuccessCount:     8,
			ReliabilityScore: 95,
			ObservedAt:       observedAt,
		},
	}
	opts := EndpointCandidateScoreOptions{
		Now:               now,
		MaxObservationAge: 5 * time.Minute,
		Observations:      observations,
	}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{direct, wss}, opts)
	winner := ranked[0]
	if winner.Candidate.EndpointID != "node-b-wss" {
		t.Fatalf("top endpoint = %q, want node-b-wss: %+v", winner.Candidate.EndpointID, ranked)
	}
	healthy := containsReason(winner.Reasons, "latency:low") && containsReason(winner.Reasons, "reliability:high")
	if !healthy {
		t.Fatalf("top reasons missing health hints: %+v", winner.Reasons)
	}
	loser := ranked[1]
	failing := containsReason(loser.Reasons, "history:failure") && containsReason(loser.Reasons, "failure:recent")
	if !failing {
		t.Fatalf("failed endpoint reasons missing failure hints: %+v", loser.Reasons)
	}
}
// TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty checks that an
// observation older than MaxObservationAge is reported as stale and that its
// latency reading is not credited to the candidate.
func TestRankPeerEndpointCandidatesTreatsStaleObservationAsPenalty(t *testing.T) {
	now := time.Date(2026, 4, 28, 13, 0, 0, 0, time.UTC)

	direct := PeerEndpointCandidate{
		EndpointID:       "node-b-direct",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "203.0.113.20:443",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Priority:         10,
	}
	opts := EndpointCandidateScoreOptions{
		Now:               now,
		MaxObservationAge: 5 * time.Minute,
		Observations: map[string]EndpointCandidateHealthObservation{
			"node-b-direct": {
				EndpointID:    "node-b-direct",
				LastLatencyMs: 20,
				ObservedAt:    now.Add(-time.Hour),
			},
		},
	}

	ranked := RankPeerEndpointCandidates([]PeerEndpointCandidate{direct}, opts)
	reasons := ranked[0].Reasons
	if !containsReason(reasons, "observation:stale") {
		t.Fatalf("reasons missing stale observation: %+v", reasons)
	}
	if containsReason(reasons, "latency:low") {
		t.Fatalf("stale observation should not contribute latency: %+v", reasons)
	}
}
// containsReason reports whether reason appears verbatim in reasons.
func containsReason(reasons []string, reason string) bool {
	for idx := 0; idx < len(reasons); idx++ {
		if reasons[idx] == reason {
			return true
		}
	}
	return false
}
@@ -0,0 +1,42 @@
package mesh
import (
"context"
"net/http"
"strings"
)
// HTTPPeerTransport sends synthetic mesh envelopes to explicitly configured
// peer endpoints. It is intentionally narrow: production forwarding remains
// disabled and only SyntheticRuntime messages use this transport.
type HTTPPeerTransport struct {
// PeerURLs maps a peer node ID to the base URL used to reach it.
PeerURLs map[string]string
// HTTPClient, when non-nil, replaces the HTTP client of the per-call
// Client created in SendSynthetic.
HTTPClient *http.Client
}
// NewHTTPPeerTransport builds a transport from a node-ID -> base-URL map.
// Keys and values are whitespace-trimmed, trailing slashes are removed from
// URLs, and pairs with an empty key or URL after normalization are dropped.
func NewHTTPPeerTransport(peerURLs map[string]string) *HTTPPeerTransport {
	transport := &HTTPPeerTransport{PeerURLs: make(map[string]string, len(peerURLs))}
	for rawID, rawURL := range peerURLs {
		id := strings.TrimSpace(rawID)
		if id == "" {
			continue
		}
		base := strings.TrimRight(strings.TrimSpace(rawURL), "/")
		if base == "" {
			continue
		}
		transport.PeerURLs[id] = base
	}
	return transport
}
// SendSynthetic delivers envelope to the peer registered under nextNodeID and
// returns that peer's reply. It returns ErrSyntheticPeerUnavailable when the
// receiver is nil or no non-empty base URL is configured for nextNodeID.
func (t *HTTPPeerTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	var peerURL string
	if t != nil {
		// Normalize again here because PeerURLs is exported and may have been
		// populated without going through NewHTTPPeerTransport.
		peerURL = strings.TrimRight(strings.TrimSpace(t.PeerURLs[nextNodeID]), "/")
	}
	if peerURL == "" {
		return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
	}

	peer := NewClient(peerURL)
	if custom := t.HTTPClient; custom != nil {
		peer.HTTPClient = custom
	}
	return peer.SendSynthetic(ctx, envelope)
}
@@ -0,0 +1,130 @@
package mesh
import (
"context"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestHTTPPeerTransportDirectSyntheticProbe sends a probe over a two-hop route
// between two live httptest-backed nodes and expects an ack whose recorded
// path is node-a -> node-b.
func TestHTTPPeerTransportDirectSyntheticProbe(t *testing.T) {
	source := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
	defer source.Close()
	destination := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
	defer destination.Close()

	route := liveSyntheticRoute("route-direct", []string{"node-a", "node-b"})
	routes := []SyntheticRoute{route}
	// Only the source needs to know where the destination lives.
	source.Runtime = newLiveRuntime(source.Local, routes, map[string]string{"node-b": destination.URL})
	destination.Runtime = newLiveRuntime(destination.Local, routes, map[string]string{})

	ack, err := source.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-direct")
	if err != nil {
		t.Fatalf("send live direct probe: %v", err)
	}
	if ack.MessageType != SyntheticMessageProbeAck {
		t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
	}

	wantPath := []string{"node-a", "node-b"}
	gotPath := decodeAckPayload(t, ack).Path
	if !sameStrings(gotPath, wantPath) {
		t.Fatalf("path = %v, want %v", gotPath, wantPath)
	}
}
// TestHTTPPeerTransportSingleRelaySyntheticProbe sends a probe over a three-hop
// route (node-a -> node-r -> node-b) between live httptest-backed nodes and
// expects an ack whose recorded path lists all three hops in order.
func TestHTTPPeerTransportSingleRelaySyntheticProbe(t *testing.T) {
	source := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
	defer source.Close()
	relay := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"})
	defer relay.Close()
	destination := newLiveSyntheticNode(t, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"})
	defer destination.Close()

	route := liveSyntheticRoute("route-relay", []string{"node-a", "node-r", "node-b"})
	routes := []SyntheticRoute{route}
	// Each node is told only about its next hop.
	source.Runtime = newLiveRuntime(source.Local, routes, map[string]string{"node-r": relay.URL})
	relay.Runtime = newLiveRuntime(relay.Local, routes, map[string]string{"node-b": destination.URL})
	destination.Runtime = newLiveRuntime(destination.Local, routes, map[string]string{})

	ack, err := source.Runtime.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-live-relay")
	if err != nil {
		t.Fatalf("send live relay probe: %v", err)
	}
	if ack.MessageType != SyntheticMessageProbeAck {
		t.Fatalf("MessageType = %q, want %q", ack.MessageType, SyntheticMessageProbeAck)
	}

	wantPath := []string{"node-a", "node-r", "node-b"}
	gotPath := decodeAckPayload(t, ack).Path
	if !sameStrings(gotPath, wantPath) {
		t.Fatalf("path = %v, want %v", gotPath, wantPath)
	}
}
// TestHTTPPeerTransportMissingPeer expects ErrSyntheticPeerUnavailable when the
// target node has no configured URL.
func TestHTTPPeerTransportMissingPeer(t *testing.T) {
	emptyTransport := NewHTTPPeerTransport(map[string]string{})

	if _, err := emptyTransport.SendSynthetic(context.Background(), "node-missing", SyntheticEnvelope{}); !errors.Is(err, ErrSyntheticPeerUnavailable) {
		t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err)
	}
}
// liveSyntheticNode is a test fixture pairing a peer identity with an
// httptest server that serves the mesh handler for that identity.
type liveSyntheticNode struct {
// Local is the identity the node's server presents.
Local PeerIdentity
// Runtime is assigned by the test after construction; the server handler
// reads it on each request.
Runtime *SyntheticRuntime
// URL is the base URL of the node's httptest server.
URL string
// server is the backing httptest server, shut down by Close.
server *httptest.Server
}
// newLiveSyntheticNode starts an httptest server for local and returns the
// fixture wrapping it. The handler builds a Server from the fixture's current
// Local and Runtime on every request, so Runtime may be assigned after this
// function returns.
func newLiveSyntheticNode(t *testing.T, local PeerIdentity) *liveSyntheticNode {
	t.Helper()

	node := new(liveSyntheticNode)
	node.Local = local

	serve := func(w http.ResponseWriter, r *http.Request) {
		srv := Server{Local: node.Local, SyntheticRuntime: node.Runtime}
		srv.Handler().ServeHTTP(w, r)
	}
	node.server = httptest.NewServer(http.HandlerFunc(serve))
	node.URL = node.server.URL
	return node
}
// Close shuts down the node's httptest server, if one was started.
func (n *liveSyntheticNode) Close() {
	if n.server == nil {
		return
	}
	n.server.Close()
}
// newLiveRuntime returns an enabled SyntheticRuntime for local that uses an
// HTTPPeerTransport built from peers (node ID -> base URL) and the given routes.
func newLiveRuntime(local PeerIdentity, routes []SyntheticRoute, peers map[string]string) *SyntheticRuntime {
	var cfg SyntheticRuntimeConfig
	cfg.Enabled = true
	cfg.Local = local
	cfg.Routes = routes
	cfg.Transport = NewHTTPPeerTransport(peers)
	return NewSyntheticRuntime(cfg)
}
// liveSyntheticRoute builds a fabric-control route for cluster-1 across hops.
// The first hop is the source and the last hop the destination; hops must be
// non-empty. The route expires one hour after the call.
func liveSyntheticRoute(routeID string, hops []string) SyntheticRoute {
	first, last := hops[0], hops[len(hops)-1]

	var route SyntheticRoute
	route.RouteID = routeID
	route.ClusterID = "cluster-1"
	route.SourceNodeID = first
	route.DestinationNodeID = last
	route.Hops = hops
	route.AllowedChannels = []string{SyntheticChannelFabricControl}
	route.MaxTTL = 8
	route.MaxHops = 8
	route.ExpiresAt = time.Now().UTC().Add(time.Hour)
	route.RouteVersion = "route-v1"
	route.PolicyVersion = "policy-v1"
	route.PeerDirectoryVersion = "peers-v1"
	return route
}
// sameStrings reports whether left and right have the same length and equal
// elements at every index.
func sameStrings(left, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for idx, want := range right {
		if left[idx] != want {
			return false
		}
	}
	return true
}
@@ -0,0 +1,374 @@
package mesh
import (
"sort"
"strings"
"time"
)
// DefaultWarmPeerLimit is the number of peers NewPeerCache may mark warm when
// PeerCacheConfig.WarmPeerLimit is zero or negative.
const DefaultWarmPeerLimit = 8
// PeerCacheConfig is the input to NewPeerCache. Every source below contributes
// to the per-peer entries; items whose node ID is empty or equals
// Local.NodeID are ignored.
type PeerCacheConfig struct {
// Local identifies the node building the cache.
Local PeerIdentity
// PeerEndpoints maps a peer node ID to a single endpoint string.
PeerEndpoints map[string]string
// PeerEndpointCandidates maps a peer node ID to the candidates that are
// ranked to choose the peer's best endpoint.
PeerEndpointCandidates map[string][]PeerEndpointCandidate
// PeerDirectory supplies route IDs, counts, connectivity modes and the
// recovery-seed flag per peer.
PeerDirectory []PeerDirectoryEntry
// RecoverySeeds marks peers as recovery seeds and provides a fallback
// endpoint when none is known yet.
RecoverySeeds []PeerRecoverySeed
// RendezvousLeases are filtered by leaseUsableForPeerCache before use.
RendezvousLeases []PeerRendezvousLease
// Routes are scanned for paths containing the local node.
Routes []SyntheticRoute
// WarmPeerLimit caps how many peers are marked warm; values <= 0 select
// DefaultWarmPeerLimit.
WarmPeerLimit int
// PreferredRegion is forwarded to candidate ranking.
PreferredRegion string
// Now is the build time; the zero value means time.Now().
Now time.Time
}
// PeerCache holds the snapshot computed by NewPeerCache. Read it through
// Snapshot or WarmPeerIDs.
type PeerCache struct {
snapshot PeerCacheSnapshot
}
// PeerCacheSnapshot is the JSON-serializable result of a peer cache build.
type PeerCacheSnapshot struct {
// ClusterID and LocalNodeID are copied from the build's local identity.
ClusterID string `json:"cluster_id"`
LocalNodeID string `json:"local_node_id"`
// PeerCount is len(Entries).
PeerCount int `json:"peer_count"`
// WarmPeerCount is the number of entries marked Warm.
WarmPeerCount int `json:"warm_peer_count"`
// RecoverySeedCount is the number of entries flagged RecoverySeed.
RecoverySeedCount int `json:"recovery_seed_count"`
// RendezvousLeaseCount is the number of leases that passed the usability
// filter during the build.
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
// BuiltAt is the time used for the build.
BuiltAt time.Time `json:"built_at"`
// Entries holds one record per peer, sorted by NodeID.
Entries []PeerCacheEntry `json:"entries"`
}
// PeerCacheEntry describes one remote peer as assembled by NewPeerCache.
type PeerCacheEntry struct {
// NodeID is the peer's node ID.
NodeID string `json:"node_id"`
// RouteIDs merges directory-supplied route IDs with the IDs of routes whose
// path contains both the local node and this peer.
RouteIDs []string `json:"route_ids,omitempty"`
// Endpoint is the address selected for the peer. NewPeerCache may set it
// from a static endpoint, the best-ranked candidate, a recovery seed, or a
// relay endpoint from a rendezvous lease.
Endpoint string `json:"endpoint,omitempty"`
// EndpointCount and CandidateCount are the largest counts reported by any
// contributing source.
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
// ConnectivityModes is the de-duplicated, sorted set of modes seen.
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
// RecoverySeed is true when the directory or a recovery seed flags the peer.
RecoverySeed bool `json:"recovery_seed"`
// Warm and WarmReason record whether, and why, the peer was selected as warm.
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
// The Best* fields are copied from the top-ranked endpoint candidate; some
// are overwritten when a rendezvous lease endpoint is used instead.
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestCandidateAddr string `json:"best_candidate_addr,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
BestReachability string `json:"best_reachability,omitempty"`
BestConnectivity string `json:"best_connectivity,omitempty"`
BestNATType string `json:"best_nat_type,omitempty"`
BestPolicyTags []string `json:"best_policy_tags,omitempty"`
BestCandidateScore int `json:"best_candidate_score,omitempty"`
// The fields below are filled from a usable rendezvous lease that names this
// peer as PeerNodeID; RelayControl is set to true in that case.
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
}
// peerCacheBuildEntry is the working record used while building the cache. It
// carries ranking inputs that are not exported in the snapshot.
type peerCacheBuildEntry struct {
PeerCacheEntry
// adjacentRoutePeer is true when the peer sits directly before or after the
// local node on at least one route path.
adjacentRoutePeer bool
// bestScore is the score of the top-ranked candidate, raised to at least 500
// when a rendezvous lease endpoint is adopted.
bestScore int
}
// NewPeerCache builds a PeerCache from cfg. It merges the peer directory,
// static endpoints, ranked endpoint candidates, routes, recovery seeds and
// rendezvous leases into one entry per remote node, marks up to the warm-peer
// limit of the highest-priority entries as warm, and stores the result as a
// snapshot whose entries are sorted by node ID.
//
// The order of the merge stages matters: later stages override or
// conditionally fill fields (notably Endpoint) written by earlier ones.
func NewPeerCache(cfg PeerCacheConfig) *PeerCache {
// Resolve the build time and warm limit, applying defaults.
now := cfg.Now.UTC()
if now.IsZero() {
now = time.Now().UTC()
}
limit := cfg.WarmPeerLimit
if limit <= 0 {
limit = DefaultWarmPeerLimit
}
entries := map[string]*peerCacheBuildEntry{}
// Stage 1: peer directory — route IDs, counts, modes and seed flag.
for _, item := range cfg.PeerDirectory {
nodeID := strings.TrimSpace(item.NodeID)
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.RouteIDs = mergeStrings(entry.RouteIDs, item.RouteIDs)
entry.EndpointCount = maxInt(entry.EndpointCount, item.EndpointCount)
entry.CandidateCount = maxInt(entry.CandidateCount, item.CandidateCount)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, item.ConnectivityModes)
entry.RecoverySeed = entry.RecoverySeed || item.RecoverySeed
}
// Stage 2: statically configured endpoints.
for nodeID, endpoint := range cfg.PeerEndpoints {
nodeID = strings.TrimSpace(nodeID)
endpoint = strings.TrimSpace(endpoint)
if nodeID == "" || nodeID == cfg.Local.NodeID || endpoint == "" {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.Endpoint = endpoint
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
}
// Stage 3: endpoint candidates. The first ranked result is taken as the
// best candidate; its address, when non-empty, replaces any endpoint set
// in stage 2.
for nodeID, candidates := range cfg.PeerEndpointCandidates {
nodeID = strings.TrimSpace(nodeID)
if nodeID == "" || nodeID == cfg.Local.NodeID || len(candidates) == 0 {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.CandidateCount = maxInt(entry.CandidateCount, len(candidates))
for _, candidate := range candidates {
if strings.TrimSpace(candidate.ConnectivityMode) != "" {
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{candidate.ConnectivityMode})
}
}
scored := RankPeerEndpointCandidates(candidates, EndpointCandidateScoreOptions{
ChannelClass: SyntheticChannelFabricControl,
PreferredRegion: cfg.PreferredRegion,
Now: now,
MaxVerificationAge: time.Hour,
})
if len(scored) > 0 {
entry.BestCandidateID = scored[0].Candidate.EndpointID
entry.BestCandidateAddr = scored[0].Candidate.Address
entry.BestTransport = scored[0].Candidate.Transport
entry.BestReachability = scored[0].Candidate.Reachability
entry.BestConnectivity = scored[0].Candidate.ConnectivityMode
entry.BestNATType = scored[0].Candidate.NATType
// Copy the tags so the entry does not alias the candidate's slice.
entry.BestPolicyTags = append([]string{}, scored[0].Candidate.PolicyTags...)
entry.BestCandidateScore = scored[0].Score
entry.bestScore = scored[0].Score
if strings.TrimSpace(scored[0].Candidate.Address) != "" {
entry.Endpoint = strings.TrimSpace(scored[0].Candidate.Address)
}
}
}
// Stage 4: routes. Only routes whose path contains the local node are
// considered; every other node on such a path gets the route ID, and the
// immediate neighbours of the local node are flagged as adjacent.
// NOTE(review): node IDs taken from the path are not whitespace-trimmed,
// unlike the IDs in stages 1-3 — confirm routePath normalizes them.
for _, route := range cfg.Routes {
path := routePath(route)
localIndex := indexOf(path, cfg.Local.NodeID)
if localIndex < 0 {
continue
}
for _, nodeID := range path {
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.RouteIDs = mergeStrings(entry.RouteIDs, []string{route.RouteID})
}
for _, adjacentIndex := range []int{localIndex - 1, localIndex + 1} {
if adjacentIndex < 0 || adjacentIndex >= len(path) {
continue
}
nodeID := path[adjacentIndex]
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
peerCacheEntry(entries, nodeID).adjacentRoutePeer = true
}
}
// Stage 5: recovery seeds. The seed endpoint is only a fallback: it is
// used when no endpoint has been chosen by an earlier stage.
for _, seed := range cfg.RecoverySeeds {
nodeID := strings.TrimSpace(seed.NodeID)
if nodeID == "" || nodeID == cfg.Local.NodeID {
continue
}
entry := peerCacheEntry(entries, nodeID)
entry.RecoverySeed = true
if entry.Endpoint == "" {
entry.Endpoint = strings.TrimSpace(seed.Endpoint)
}
if strings.TrimSpace(seed.ConnectivityMode) != "" {
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{seed.ConnectivityMode})
}
}
// Stage 6: rendezvous leases that pass leaseUsableForPeerCache.
// NOTE(review): lease.PeerNodeID and lease.RelayNodeID are used as map keys
// without trimming, although the usability check only trims them to test
// for emptiness — IDs with surrounding whitespace would create separate
// entries.
rendezvousLeases := 0
for _, lease := range cfg.RendezvousLeases {
if !leaseUsableForPeerCache(lease, cfg.Local.NodeID, now) {
continue
}
rendezvousLeases++
// Lease for a remote peer: record relay details and, when the entry has
// no endpoint or its current best path is relay/outbound-only, adopt the
// relay endpoint as the peer's endpoint.
if lease.PeerNodeID != cfg.Local.NodeID {
entry := peerCacheEntry(entries, lease.PeerNodeID)
// Decide before mutating the entry, since the decision reads Best* fields.
useLeaseEndpoint := shouldUseRendezvousEndpoint(*entry)
entry.RendezvousLeaseID = lease.LeaseID
entry.RelayNodeID = lease.RelayNodeID
entry.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
entry.RelayControl = true
entry.CandidateCount = maxInt(entry.CandidateCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{firstNonEmpty(lease.ConnectivityMode, "relay_required"), "relay_control"})
if useLeaseEndpoint {
entry.BestTransport = firstNonEmpty(lease.Transport, "relay_control")
entry.BestReachability = "relay"
entry.BestConnectivity = firstNonEmpty(lease.ConnectivityMode, "relay_required")
entry.Endpoint = entry.RelayEndpoint
entry.BestCandidateID = lease.LeaseID
entry.BestCandidateAddr = entry.RelayEndpoint
// NOTE(review): only the internal bestScore is raised here; the exported
// BestCandidateScore, BestNATType and BestPolicyTags keep their
// stage-3 values.
entry.bestScore = maxInt(entry.bestScore, 500)
}
}
// Lease for the local node via a remote relay: make sure the relay itself
// has an entry with an endpoint.
if lease.PeerNodeID == cfg.Local.NodeID && lease.RelayNodeID != "" && lease.RelayNodeID != cfg.Local.NodeID {
entry := peerCacheEntry(entries, lease.RelayNodeID)
if entry.Endpoint == "" {
entry.Endpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")
}
entry.EndpointCount = maxInt(entry.EndpointCount, 1)
entry.ConnectivityModes = mergeStrings(entry.ConnectivityModes, []string{"relay_control"})
}
}
// Flatten the map, normalizing slice order and counting recovery seeds.
out := make([]peerCacheBuildEntry, 0, len(entries))
recoverySeeds := 0
for _, entry := range entries {
sort.Strings(entry.RouteIDs)
sort.Strings(entry.ConnectivityModes)
if entry.RecoverySeed {
recoverySeeds++
}
out = append(out, *entry)
}
// Order by descending warm priority (ties broken by node ID) so the warm
// slots go to the highest-priority peers deterministically.
sort.SliceStable(out, func(i, j int) bool {
left := warmPeerPriority(out[i])
right := warmPeerPriority(out[j])
if left != right {
return left > right
}
return out[i].NodeID < out[j].NodeID
})
// Mark peers warm until the limit is reached; peers with a non-positive
// priority are never warm.
warm := 0
for i := range out {
if warm >= limit {
break
}
if warmPeerPriority(out[i]) <= 0 {
continue
}
out[i].Warm = true
out[i].WarmReason = warmPeerReason(out[i])
warm++
}
// Re-sort by node ID for a stable snapshot order.
sort.SliceStable(out, func(i, j int) bool {
return out[i].NodeID < out[j].NodeID
})
// Strip the build-only fields.
snapshotEntries := make([]PeerCacheEntry, 0, len(out))
for _, entry := range out {
snapshotEntries = append(snapshotEntries, entry.PeerCacheEntry)
}
return &PeerCache{snapshot: PeerCacheSnapshot{
ClusterID: cfg.Local.ClusterID,
LocalNodeID: cfg.Local.NodeID,
PeerCount: len(snapshotEntries),
WarmPeerCount: warm,
RecoverySeedCount: recoverySeeds,
RendezvousLeaseCount: rendezvousLeases,
BuiltAt: now,
Entries: snapshotEntries,
}}
}
// Snapshot returns a copy of the cache's snapshot with its own Entries slice,
// so callers may reorder or replace elements without affecting the cache.
// Slices inside individual entries are still shared. A nil receiver yields
// the zero snapshot.
func (c *PeerCache) Snapshot() PeerCacheSnapshot {
	if c == nil {
		return PeerCacheSnapshot{}
	}
	copied := c.snapshot
	copied.Entries = make([]PeerCacheEntry, len(c.snapshot.Entries))
	copy(copied.Entries, c.snapshot.Entries)
	return copied
}
// WarmPeerIDs returns the node IDs of all entries marked warm, in snapshot
// order. It is safe to call on a nil receiver.
func (c *PeerCache) WarmPeerIDs() []string {
	snap := c.Snapshot()
	ids := make([]string, 0, snap.WarmPeerCount)
	for i := range snap.Entries {
		if !snap.Entries[i].Warm {
			continue
		}
		ids = append(ids, snap.Entries[i].NodeID)
	}
	return ids
}
// peerCacheEntry returns the build entry stored under nodeID, inserting a new
// entry carrying that node ID when the key is absent.
func peerCacheEntry(entries map[string]*peerCacheBuildEntry, nodeID string) *peerCacheBuildEntry {
	existing, found := entries[nodeID]
	if !found {
		existing = &peerCacheBuildEntry{}
		existing.NodeID = nodeID
		entries[nodeID] = existing
	}
	return existing
}
// warmPeerPriority computes the score used to rank peers for warm selection.
// It is the sum of the candidate count, any positive best-candidate score,
// and fixed bonuses: 1000 for route adjacency, 500 for recovery seeds, 300
// for relay control and 100 for having an endpoint.
func warmPeerPriority(entry peerCacheBuildEntry) int {
	priority := entry.CandidateCount
	if entry.bestScore > 0 {
		priority += entry.bestScore
	}

	bonuses := []struct {
		applies bool
		points  int
	}{
		{entry.adjacentRoutePeer, 1000},
		{entry.RecoverySeed, 500},
		{entry.RelayControl, 300},
		{entry.Endpoint != "", 100},
	}
	for _, bonus := range bonuses {
		if bonus.applies {
			priority += bonus.points
		}
	}
	return priority
}
// warmPeerReason names the strongest reason an entry qualifies as warm. The
// cases are checked in order: route adjacency, recovery seed, rendezvous
// lease, endpoint candidate, plain endpoint, and finally "scoped_peer".
func warmPeerReason(entry peerCacheBuildEntry) string {
	switch {
	case entry.adjacentRoutePeer:
		return "route_adjacent"
	case entry.RecoverySeed:
		return "recovery_seed"
	case entry.RelayControl:
		return "rendezvous_lease"
	case entry.BestCandidateID != "":
		return "endpoint_candidate"
	case entry.Endpoint != "":
		return "peer_endpoint"
	default:
		return "scoped_peer"
	}
}
// leaseUsableForPeerCache reports whether lease may contribute to the peer
// cache. The lease must have non-blank lease, peer and relay IDs and relay
// endpoint, be control-plane-only, expire strictly after now, and must not
// name the local node as both peer and relay.
func leaseUsableForPeerCache(lease PeerRendezvousLease, localNodeID string, now time.Time) bool {
	for _, required := range []string{lease.LeaseID, lease.PeerNodeID, lease.RelayNodeID, lease.RelayEndpoint} {
		if strings.TrimSpace(required) == "" {
			return false
		}
	}
	if !lease.ControlPlaneOnly {
		return false
	}
	if lease.ExpiresAt.IsZero() || !lease.ExpiresAt.After(now) {
		return false
	}
	selfRelayed := lease.PeerNodeID == localNodeID && lease.RelayNodeID == localNodeID
	return !selfRelayed
}
// shouldUseRendezvousEndpoint reports whether a lease's relay endpoint should
// replace the entry's current endpoint. That is the case when the entry has
// no endpoint, or when its best transport, reachability or connectivity
// indicates a relay or outbound-only path.
func shouldUseRendezvousEndpoint(entry peerCacheBuildEntry) bool {
	if strings.TrimSpace(entry.Endpoint) == "" {
		return true
	}

	normalize := func(raw string) string {
		return strings.ToLower(strings.TrimSpace(raw))
	}

	bestTransport := normalize(entry.BestTransport)
	if strings.Contains(bestTransport, "relay") || strings.Contains(bestTransport, "outbound") {
		return true
	}
	switch normalize(entry.BestReachability) {
	case "relay", "outbound_only":
		return true
	}
	switch normalize(entry.BestConnectivity) {
	case "relay_required", "outbound_only":
		return true
	}
	return false
}
// mergeStrings returns a new slice containing the whitespace-trimmed,
// non-empty values of existing followed by those of incoming, keeping only the
// first occurrence of each value. Neither input slice is modified.
func mergeStrings(existing []string, incoming []string) []string {
	seen := make(map[string]struct{}, len(existing)+len(incoming))
	out := make([]string, 0, len(existing)+len(incoming))
	// Walk the inputs separately instead of ranging over
	// append(existing, incoming...): when existing has spare capacity that
	// append writes into the caller's backing array and can clobber elements
	// the caller still holds through a longer slice.
	for _, source := range [][]string{existing, incoming} {
		for _, value := range source {
			value = strings.TrimSpace(value)
			if value == "" {
				continue
			}
			if _, ok := seen[value]; ok {
				continue
			}
			seen[value] = struct{}{}
			out = append(out, value)
		}
	}
	return out
}
// maxInt returns the larger of left and right.
func maxInt(left, right int) int {
	if right >= left {
		return right
	}
	return left
}
@@ -0,0 +1,170 @@
package mesh
import (
"testing"
"time"
)
// TestPeerCacheSelectsAdjacentWarmPeersWithinLimit expects that with a warm
// limit of two, the two route neighbours of the local node are chosen as warm
// ahead of a non-adjacent route peer and a recovery seed.
func TestPeerCacheSelectsAdjacentWarmPeersWithinLimit(t *testing.T) {
	local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	cfg := PeerCacheConfig{
		Local:         local,
		WarmPeerLimit: 2,
		Now:           time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
	}
	cfg.PeerEndpoints = map[string]string{
		"node-a": "http://node-a:19000",
		"node-r": "http://node-r:19000",
		"node-c": "http://node-c:19000",
	}
	cfg.Routes = []SyntheticRoute{
		peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r", "node-c"}),
	}
	cfg.RecoverySeeds = []PeerRecoverySeed{
		{NodeID: "node-seed", Endpoint: "https://seed.example.test", Transport: "direct_tcp_tls", Priority: 10},
	}
	cache := NewPeerCache(cfg)

	warm := cache.WarmPeerIDs()
	adjacentOnly := len(warm) == 2 && warm[0] == "node-a" && warm[1] == "node-r"
	if !adjacentOnly {
		t.Fatalf("warm peers = %+v, want adjacent node-a/node-r", warm)
	}

	snapshot := cache.Snapshot()
	if snapshot.PeerCount != 4 || snapshot.RecoverySeedCount != 1 {
		t.Fatalf("unexpected snapshot counts: %+v", snapshot)
	}
}
// TestPeerCachePromotesRecoverySeedAfterRoutePeers expects that with room for
// three warm peers, both route neighbours and the recovery seed are warm, and
// the seed's warm reason is "recovery_seed".
func TestPeerCachePromotesRecoverySeedAfterRoutePeers(t *testing.T) {
	local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	cfg := PeerCacheConfig{
		Local:         local,
		WarmPeerLimit: 3,
		Now:           time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
	}
	cfg.Routes = []SyntheticRoute{
		peerCacheRoute("route-1", []string{"node-a", local.NodeID, "node-r"}),
	}
	cfg.RecoverySeeds = []PeerRecoverySeed{
		{NodeID: "node-seed", Endpoint: "wss://seed.example.test/mesh", Transport: "wss", ConnectivityMode: "direct", Priority: 1},
	}
	cache := NewPeerCache(cfg)

	warm := cache.WarmPeerIDs()
	asExpected := len(warm) == 3 && warm[0] == "node-a" && warm[1] == "node-r" && warm[2] == "node-seed"
	if !asExpected {
		t.Fatalf("warm peers = %+v, want adjacent peers then seed", warm)
	}

	seed, found := peerCacheEntryByID(cache.Snapshot(), "node-seed")
	if !found || !seed.RecoverySeed || seed.WarmReason != "recovery_seed" {
		t.Fatalf("unexpected seed entry: %+v", seed)
	}
}
// TestPeerCacheUsesBestEndpointCandidate expects the freshly verified public
// direct candidate to be chosen over the relay candidate, and the peer to be
// marked warm.
func TestPeerCacheUsesBestEndpointCandidate(t *testing.T) {
	now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
	local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}

	relayCandidate := PeerEndpointCandidate{
		EndpointID:       "node-b-relay",
		NodeID:           "node-b",
		Transport:        "relay",
		Address:          "relay.example.test",
		Reachability:     "relay",
		ConnectivityMode: "relay_required",
		Priority:         20,
	}
	publicCandidate := PeerEndpointCandidate{
		EndpointID:       "node-b-public",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "203.0.113.20:443",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Priority:         1,
		LastVerifiedAt:   &now,
	}

	cache := NewPeerCache(PeerCacheConfig{
		Local: local,
		PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
			"node-b": {relayCandidate, publicCandidate},
		},
		WarmPeerLimit: 1,
		Now:           now,
	})

	entry, found := peerCacheEntryByID(cache.Snapshot(), "node-b")
	if !found {
		t.Fatal("node-b missing from cache")
	}
	if entry.BestCandidateID != "node-b-public" || !entry.Warm {
		t.Fatalf("unexpected candidate selection: %+v", entry)
	}
}
// TestPeerCacheUsesPreferredCorporateEndpointAddress expects the corp-LAN
// candidate to be chosen over the public one and its address to replace the
// statically configured public endpoint.
func TestPeerCacheUsesPreferredCorporateEndpointAddress(t *testing.T) {
	local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}

	publicCandidate := PeerEndpointCandidate{
		EndpointID:       "node-b-public",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "https://node-b.public.example.test:443",
		Reachability:     "public",
		NATType:          "none",
		ConnectivityMode: "direct",
		Region:           "corp-eu",
		Priority:         10,
	}
	corpLANCandidate := PeerEndpointCandidate{
		EndpointID:       "node-b-corp-lan",
		NodeID:           "node-b",
		Transport:        "direct_tcp_tls",
		Address:          "http://10.24.10.20:19001",
		Reachability:     "private",
		NATType:          "none",
		ConnectivityMode: "direct",
		Region:           "corp-eu",
		Priority:         1,
		PolicyTags:       []string{"corp-lan"},
	}

	cache := NewPeerCache(PeerCacheConfig{
		Local: local,
		PeerEndpoints: map[string]string{
			"node-b": "https://node-b.public.example.test:443",
		},
		PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
			"node-b": {publicCandidate, corpLANCandidate},
		},
		PreferredRegion: "corp-eu",
		WarmPeerLimit:   1,
		Now:             time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
	})

	entry, found := peerCacheEntryByID(cache.Snapshot(), "node-b")
	if !found {
		t.Fatal("node-b missing from peer cache")
	}
	if entry.BestCandidateID != "node-b-corp-lan" || entry.Endpoint != "http://10.24.10.20:19001" {
		t.Fatalf("peer cache did not choose corp LAN endpoint: %+v", entry)
	}
}
// peerCacheRoute builds a fabric-control route for cluster-1 over a private
// copy of hops. The first hop is the source and the last the destination;
// hops must be non-empty. The route expires one hour after the call.
func peerCacheRoute(routeID string, hops []string) SyntheticRoute {
	var route SyntheticRoute
	route.RouteID = routeID
	route.ClusterID = "cluster-1"
	route.SourceNodeID = hops[0]
	route.DestinationNodeID = hops[len(hops)-1]
	route.Hops = make([]string, len(hops))
	copy(route.Hops, hops)
	route.AllowedChannels = []string{SyntheticChannelFabricControl}
	route.ExpiresAt = time.Now().UTC().Add(time.Hour)
	return route
}
// peerCacheEntryByID returns the first snapshot entry whose NodeID equals
// nodeID, and whether such an entry exists.
func peerCacheEntryByID(snapshot PeerCacheSnapshot, nodeID string) (PeerCacheEntry, bool) {
	for idx := range snapshot.Entries {
		if snapshot.Entries[idx].NodeID != nodeID {
			continue
		}
		return snapshot.Entries[idx], true
	}
	return PeerCacheEntry{}, false
}
@@ -0,0 +1,303 @@
package mesh
import (
"net"
"net/netip"
"net/url"
"sort"
"strings"
"time"
)
// Actions a PeerConnectionIntent can carry. connectionIntentAction maps a
// recovery candidate's reason to one of these; PeerConnectionIntentProbe is
// the fallback for reasons it does not recognize.
const (
PeerConnectionIntentMaintain = "maintain"
PeerConnectionIntentProbe = "probe"
PeerConnectionIntentRecover = "recover"
)
// Transport modes assigned to a PeerConnectionIntent. All but
// PeerTransportModeRelayControl are produced by classifyPeerTransport;
// PeerTransportModeRelayControl is set by applyRendezvousLease once a relay
// lease has been applied to the intent.
const (
PeerTransportModeDirect = "direct"
PeerTransportModePrivateLAN = "private_lan"
PeerTransportModeCorporateLAN = "corporate_lan"
PeerTransportModeOutboundOnly = "outbound_only"
PeerTransportModeRelayRequired = "relay_required"
PeerTransportModeRelayControl = "relay_control"
PeerTransportModeUnknown = "unknown"
)
// PeerConnectionIntentPlanConfig is the input to PlanPeerConnectionIntents.
type PeerConnectionIntentPlanConfig struct {
// PeerCache supplies per-peer transport details, looked up by node ID.
PeerCache PeerCacheSnapshot
// RecoveryPlan supplies the candidates; each candidate with a non-blank
// node ID becomes one intent.
RecoveryPlan PeerRecoveryPlan
// RendezvousLeases are consulted for intents classified as requiring
// rendezvous.
RendezvousLeases []PeerRendezvousLease
// Now is passed through normalizedNow and stamped on the plan and intents.
Now time.Time
}
// PeerConnectionIntentPlan is the result of PlanPeerConnectionIntents: the
// ordered intents plus tallies by action and by transport mode.
type PeerConnectionIntentPlan struct {
// Mode is copied from the recovery plan.
Mode string `json:"mode"`
// IntentCount is len(Intents).
IntentCount int `json:"intent_count"`
// Tallies by intent action.
MaintainCount int `json:"maintain_count"`
ProbeCount int `json:"probe_count"`
RecoverCount int `json:"recover_count"`
// Tallies by intent transport mode; intents in the "unknown" mode are not
// counted in any of these.
DirectCount int `json:"direct_count"`
PrivateLANCount int `json:"private_lan_count"`
CorporateLANCount int `json:"corporate_lan_count"`
OutboundOnlyCount int `json:"outbound_only_count"`
RelayRequiredCount int `json:"relay_required_count"`
RelayControlCount int `json:"relay_control_count"`
// RendezvousRequiredCount counts intents still requiring rendezvous;
// RendezvousResolvedCount counts intents resolved through a lease;
// RendezvousLeaseCount counts intents carrying a non-empty lease ID.
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
// GeneratedAt is the normalized planning time.
GeneratedAt time.Time `json:"generated_at"`
// Intents is sorted by descending Priority, then ascending NodeID.
Intents []PeerConnectionIntent `json:"intents,omitempty"`
}
// PeerConnectionIntent describes what to do about one peer and over which
// kind of transport.
type PeerConnectionIntent struct {
// NodeID, Reason, Endpoint, ConnectionState and Priority are copied from the
// recovery candidate; Action is derived from Reason.
NodeID string `json:"node_id"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
ConnectionState string `json:"connection_state"`
// Transport prefers the candidate's best transport and falls back to the
// peer cache entry's.
Transport string `json:"transport,omitempty"`
// TransportMode is one of the PeerTransportMode* constants.
TransportMode string `json:"transport_mode"`
// Reachability, ConnectivityMode, NATType and PolicyTags come from the peer
// cache entry's Best* fields.
Reachability string `json:"reachability,omitempty"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
NATType string `json:"nat_type,omitempty"`
PolicyTags []string `json:"policy_tags,omitempty"`
// RequiresRendezvous is set by classification and cleared when a lease is
// applied, at which point RendezvousResolved becomes true.
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
// DirectCandidate and RelayCandidate indicate which kind of path the intent
// represents.
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
// BestCandidateID prefers the candidate's value over the cache entry's.
BestCandidateID string `json:"best_candidate_id,omitempty"`
// Relay details, taken from the cache entry or from an applied lease.
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
ControlPlaneOnly bool `json:"control_plane_only"`
// RecoverySeed is true if either the candidate or the cache entry is a seed.
RecoverySeed bool `json:"recovery_seed"`
Priority int `json:"priority"`
// GeneratedAt is the normalized planning time.
GeneratedAt time.Time `json:"generated_at"`
}
// PlanPeerConnectionIntents turns every recovery-plan candidate with a
// non-blank node ID into a PeerConnectionIntent, enriching it with the
// matching peer cache entry, classifying its transport mode and, when
// rendezvous is required, applying the best usable relay lease. Intents are
// returned sorted by descending priority and then node ID, together with
// tallies by action and transport mode.
func PlanPeerConnectionIntents(cfg PeerConnectionIntentPlanConfig) PeerConnectionIntentPlan {
	generatedAt := normalizedNow(cfg.Now)

	// Index cache entries by node ID; a later duplicate replaces an earlier one.
	cacheByNode := make(map[string]PeerCacheEntry, len(cfg.PeerCache.Entries))
	for _, cached := range cfg.PeerCache.Entries {
		if strings.TrimSpace(cached.NodeID) != "" {
			cacheByNode[cached.NodeID] = cached
		}
	}

	intents := make([]PeerConnectionIntent, 0, len(cfg.RecoveryPlan.Candidates))
	for _, candidate := range cfg.RecoveryPlan.Candidates {
		if strings.TrimSpace(candidate.NodeID) == "" {
			continue
		}
		// A missing cache entry yields the zero value, leaving cache-derived
		// fields empty.
		cached := cacheByNode[candidate.NodeID]

		var intent PeerConnectionIntent
		intent.NodeID = candidate.NodeID
		intent.Action = connectionIntentAction(candidate)
		intent.Reason = candidate.Reason
		intent.Endpoint = candidate.Endpoint
		intent.ConnectionState = candidate.ConnectionState
		intent.Transport = firstNonEmpty(candidate.BestTransport, cached.BestTransport)
		intent.Reachability = cached.BestReachability
		intent.ConnectivityMode = cached.BestConnectivity
		intent.NATType = cached.BestNATType
		intent.PolicyTags = append([]string{}, cached.BestPolicyTags...)
		intent.BestCandidateID = firstNonEmpty(candidate.BestCandidateID, cached.BestCandidateID)
		intent.RendezvousLeaseID = cached.RendezvousLeaseID
		intent.RelayNodeID = cached.RelayNodeID
		intent.RelayEndpoint = cached.RelayEndpoint
		intent.RelayCandidate = cached.RelayControl
		intent.ControlPlaneOnly = cached.RelayControl
		intent.RecoverySeed = candidate.RecoverySeed || cached.RecoverySeed
		intent.Priority = candidate.Priority
		intent.GeneratedAt = generatedAt

		intent.TransportMode, intent.RequiresRendezvous, intent.DirectCandidate = classifyPeerTransport(intent)
		if intent.RequiresRendezvous {
			if lease, found := rendezvousLeaseForPeer(cfg.RendezvousLeases, intent.NodeID, generatedAt); found {
				applyRendezvousLease(&intent, lease)
			}
		}
		intents = append(intents, intent)
	}

	sort.SliceStable(intents, func(a, b int) bool {
		left, right := &intents[a], &intents[b]
		if left.Priority == right.Priority {
			return left.NodeID < right.NodeID
		}
		return left.Priority > right.Priority
	})

	plan := PeerConnectionIntentPlan{
		Mode:        cfg.RecoveryPlan.Mode,
		IntentCount: len(intents),
		GeneratedAt: generatedAt,
		Intents:     intents,
	}

	// Counters keyed by action / transport mode; values missing from a table
	// are deliberately not counted.
	actionCounters := map[string]*int{
		PeerConnectionIntentMaintain: &plan.MaintainCount,
		PeerConnectionIntentProbe:    &plan.ProbeCount,
		PeerConnectionIntentRecover:  &plan.RecoverCount,
	}
	modeCounters := map[string]*int{
		PeerTransportModeDirect:        &plan.DirectCount,
		PeerTransportModePrivateLAN:    &plan.PrivateLANCount,
		PeerTransportModeCorporateLAN:  &plan.CorporateLANCount,
		PeerTransportModeOutboundOnly:  &plan.OutboundOnlyCount,
		PeerTransportModeRelayRequired: &plan.RelayRequiredCount,
		PeerTransportModeRelayControl:  &plan.RelayControlCount,
	}
	for i := range intents {
		current := &intents[i]
		if counter, ok := actionCounters[current.Action]; ok {
			*counter++
		}
		if counter, ok := modeCounters[current.TransportMode]; ok {
			*counter++
		}
		if current.RequiresRendezvous {
			plan.RendezvousRequiredCount++
		}
		if current.RendezvousResolved {
			plan.RendezvousResolvedCount++
		}
		if current.RendezvousLeaseID != "" {
			plan.RendezvousLeaseCount++
		}
	}
	return plan
}
// connectionIntentAction maps a recovery candidate's reason to an intent
// action: "maintain_ready" maintains, the four "recover_*" reasons recover,
// and anything else probes.
func connectionIntentAction(candidate PeerRecoveryCandidate) string {
	reason := candidate.Reason
	if reason == "maintain_ready" {
		return PeerConnectionIntentMaintain
	}
	for _, recoverReason := range [...]string{"recover_degraded", "recover_seed", "recover_warm", "recover_peer"} {
		if reason == recoverReason {
			return PeerConnectionIntentRecover
		}
	}
	return PeerConnectionIntentProbe
}
// classifyPeerTransport derives the transport mode of intent and returns, in
// order: the mode, whether rendezvous is required, and whether the peer is a
// direct candidate. Rules are evaluated in priority order: relay, outbound
// only, corporate LAN, private LAN, direct, otherwise unknown.
func classifyPeerTransport(intent PeerConnectionIntent) (string, bool, bool) {
	normalize := func(raw string) string {
		return strings.ToLower(strings.TrimSpace(raw))
	}
	transport := normalize(intent.Transport)
	connectivity := normalize(intent.ConnectivityMode)
	reachability := normalize(intent.Reachability)
	tags := lowerStringSet(intent.PolicyTags)

	switch {
	case strings.Contains(transport, "relay") || connectivity == "relay_required" || reachability == "relay":
		return PeerTransportModeRelayRequired, true, false
	case connectivity == "outbound_only" || reachability == "outbound_only":
		return PeerTransportModeOutboundOnly, true, false
	case tags["corp-lan"] || tags["same-site"]:
		return PeerTransportModeCorporateLAN, false, true
	case tags["private-lan"] || reachability == "private" || endpointHasPrivateHost(intent.Endpoint):
		return PeerTransportModePrivateLAN, false, true
	case strings.Contains(transport, "direct") || reachability == "public" || connectivity == "direct":
		return PeerTransportModeDirect, false, true
	}
	return PeerTransportModeUnknown, false, false
}
// rendezvousLeaseForPeer selects the preferred usable lease for peerNodeID.
// A lease is usable when it names the peer, has a relay node and endpoint, is
// control-plane-only and expires strictly after now. Among usable leases the
// lowest priority number wins (non-positive priorities count as 100), then
// the latest expiry, then the smallest lease ID. The boolean is false when no
// lease qualifies.
func rendezvousLeaseForPeer(leases []PeerRendezvousLease, peerNodeID string, now time.Time) (PeerRendezvousLease, bool) {
	now = normalizedNow(now)

	usable := make([]PeerRendezvousLease, 0, len(leases))
	for _, lease := range leases {
		matchesPeer := strings.TrimSpace(lease.PeerNodeID) == peerNodeID
		hasRelay := strings.TrimSpace(lease.RelayEndpoint) != "" && strings.TrimSpace(lease.RelayNodeID) != ""
		unexpired := !lease.ExpiresAt.IsZero() && lease.ExpiresAt.After(now)
		if matchesPeer && hasRelay && lease.ControlPlaneOnly && unexpired {
			usable = append(usable, lease)
		}
	}
	if len(usable) == 0 {
		return PeerRendezvousLease{}, false
	}

	effectivePriority := func(priority int) int {
		if priority <= 0 {
			return 100
		}
		return priority
	}
	sort.SliceStable(usable, func(a, b int) bool {
		left, right := &usable[a], &usable[b]
		if lp, rp := effectivePriority(left.Priority), effectivePriority(right.Priority); lp != rp {
			return lp < rp
		}
		if !left.ExpiresAt.Equal(right.ExpiresAt) {
			return left.ExpiresAt.After(right.ExpiresAt)
		}
		return left.LeaseID < right.LeaseID
	})
	return usable[0], true
}
// applyRendezvousLease rewrites intent to go through the lease's relay: the
// endpoint becomes the relay endpoint (trimmed, without trailing slashes),
// the mode becomes relay-control, rendezvous is marked resolved, and the
// lease identifiers are recorded. The lease's connectivity mode replaces the
// intent's only when it is non-empty.
func applyRendezvousLease(intent *PeerConnectionIntent, lease PeerRendezvousLease) {
	relayEndpoint := strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/")

	intent.Endpoint = relayEndpoint
	intent.RelayEndpoint = relayEndpoint
	intent.RelayNodeID = lease.RelayNodeID
	intent.RendezvousLeaseID = lease.LeaseID

	intent.Transport = firstNonEmpty(lease.Transport, "relay_control")
	intent.TransportMode = PeerTransportModeRelayControl
	if mode := lease.ConnectivityMode; mode != "" {
		intent.ConnectivityMode = mode
	}

	intent.RequiresRendezvous = false
	intent.RendezvousResolved = true
	intent.DirectCandidate = false
	intent.RelayCandidate = true
	intent.ControlPlaneOnly = true
}
// endpointHasPrivateHost reports whether rawEndpoint's host is a literal IP
// address that is private, loopback or link-local unicast. The endpoint may be
// a URL, a host:port pair or a bare address; host names and unparsable values
// yield false.
func endpointHasPrivateHost(rawEndpoint string) bool {
	trimmed := strings.TrimSpace(rawEndpoint)
	if trimmed == "" {
		return false
	}

	// Prefer the URL authority when the endpoint parses as a URL with a host.
	hostPort := trimmed
	if parsed, err := url.Parse(trimmed); err == nil && parsed.Host != "" {
		hostPort = parsed.Host
	}
	// Drop the port if one is present.
	hostOnly := hostPort
	if withoutPort, _, err := net.SplitHostPort(hostPort); err == nil {
		hostOnly = withoutPort
	}

	// Strip brackets left around an IPv6 literal that had no port.
	ip, err := netip.ParseAddr(strings.Trim(hostOnly, "[]"))
	if err != nil {
		return false
	}
	switch {
	case ip.IsPrivate(), ip.IsLoopback(), ip.IsLinkLocalUnicast():
		return true
	}
	return false
}
// lowerStringSet returns the set of trimmed, lower-cased, non-empty values.
// The result is never nil.
func lowerStringSet(values []string) map[string]bool {
	set := map[string]bool{}
	for idx := range values {
		normalized := strings.ToLower(strings.TrimSpace(values[idx]))
		if normalized == "" {
			continue
		}
		set[normalized] = true
	}
	return set
}
// firstNonEmpty returns the first value that is non-empty after trimming
// whitespace, in its trimmed form, or "" when there is none.
func firstNonEmpty(values ...string) string {
	for idx := range values {
		if trimmed := strings.TrimSpace(values[idx]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
@@ -0,0 +1,234 @@
package mesh
import (
"testing"
"time"
)
func TestPeerConnectionIntentsClassifyCorporateDirect(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "http://10.24.10.20:19001",
BestTransport: "direct_tcp_tls",
BestReachability: "private",
BestConnectivity: "direct",
BestPolicyTags: []string{"corp-lan", "same-site"},
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeSteady,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "http://10.24.10.20:19001",
ConnectionState: PeerConnectionReady,
Reason: "maintain_ready",
Priority: 100,
},
},
},
Now: now,
})
if plan.IntentCount != 1 || plan.MaintainCount != 1 || plan.CorporateLANCount != 1 {
t.Fatalf("unexpected plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.Action != PeerConnectionIntentMaintain || intent.TransportMode != PeerTransportModeCorporateLAN || intent.RequiresRendezvous {
t.Fatalf("unexpected corporate intent: %+v", intent)
}
}
// TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired checks
// that, with no rendezvous leases supplied, an outbound-only peer and a
// relay-required peer are both planned as "recover" intents and both counted
// as requiring rendezvous.
func TestPeerConnectionIntentsClassifyOutboundAndRelayAsRendezvousRequired(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
BestTransport: "direct_tcp_tls",
BestReachability: "outbound_only",
BestConnectivity: "outbound_only",
},
{
NodeID: "node-c",
Endpoint: "relay://fabric-relay/node-c",
BestTransport: "relay",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_warm",
Priority: 90,
},
{
NodeID: "node-c",
Endpoint: "relay://fabric-relay/node-c",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_seed",
Priority: 80,
},
},
},
Now: now,
})
// One peer per connectivity class, both unresolved without a lease.
if plan.RecoverCount != 2 || plan.OutboundOnlyCount != 1 || plan.RelayRequiredCount != 1 || plan.RendezvousRequiredCount != 2 {
t.Fatalf("unexpected rendezvous counts: %+v", plan)
}
if plan.Intents[0].Action != PeerConnectionIntentRecover || plan.Intents[1].Action != PeerConnectionIntentRecover {
t.Fatalf("unexpected actions: %+v", plan.Intents)
}
}
// TestPeerConnectionIntentsResolveRendezvousWithRelayLease checks that an
// unexpired rendezvous lease for a relay-required peer resolves the intent:
// the intent is redirected to the lease's relay endpoint, carries the lease
// and relay node IDs, and no longer requires rendezvous.
func TestPeerConnectionIntentsResolveRendezvousWithRelayLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
BestTransport: "relay",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_warm",
Priority: 100,
},
},
},
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
// Lease validity window straddles `now`, so it is active.
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
},
},
Now: now,
})
if plan.IntentCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousResolvedCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.TransportMode != PeerTransportModeRelayControl ||
intent.Endpoint != "http://node-r:19000" ||
intent.RelayNodeID != "node-r" ||
intent.RendezvousLeaseID != "lease-node-b-via-node-r" ||
!intent.RelayCandidate ||
!intent.RendezvousResolved ||
intent.RequiresRendezvous {
t.Fatalf("unexpected resolved rendezvous intent: %+v", intent)
}
}
// TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect checks that a
// lease whose ExpiresAt is before `now` is ignored and the remaining active
// lease for the same peer is selected instead.
func TestPeerConnectionIntentsSkipExpiredRendezvousLeaseAndReselect(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
BestTransport: "relay",
BestReachability: "relay",
BestConnectivity: "relay_required",
},
}},
RecoveryPlan: PeerRecoveryPlan{
Mode: PeerRecoveryModeRecovery,
Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "relay://fabric/node-b",
ConnectionState: PeerConnectionWaiting,
Reason: "recover_warm",
Priority: 100,
},
},
},
RendezvousLeases: []PeerRendezvousLease{
{
// Expired one second before `now`; must not be chosen.
LeaseID: "lease-expired-preferred",
PeerNodeID: "node-b",
RelayNodeID: "node-r-old",
RelayEndpoint: "http://node-r-old:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 1,
ControlPlaneOnly: true,
IssuedAt: now.Add(-10 * time.Minute),
ExpiresAt: now.Add(-time.Second),
},
{
// Still valid; expected to be selected.
LeaseID: "lease-active-reselected",
PeerNodeID: "node-b",
RelayNodeID: "node-r-new",
RelayEndpoint: "http://node-r-new:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 20,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
},
},
Now: now,
})
if plan.RendezvousResolvedCount != 1 || plan.RelayControlCount != 1 || plan.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected reselected plan counts: %+v", plan)
}
intent := plan.Intents[0]
if intent.RendezvousLeaseID != "lease-active-reselected" ||
intent.RelayNodeID != "node-r-new" ||
intent.Endpoint != "http://node-r-new:19000" {
t.Fatalf("expired lease was not skipped: %+v", intent)
}
}
// TestPeerConnectionIntentsClassifyPrivateEndpointWithoutCandidateHints checks
// that a peer cache entry carrying only a node ID and an RFC 1918 endpoint
// (no transport/reachability/connectivity hints) is still classified as a
// private-LAN direct candidate.
func TestPeerConnectionIntentsClassifyPrivateEndpointWithoutCandidateHints(t *testing.T) {
plan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{NodeID: "node-b", Endpoint: "http://192.168.10.20:19001"},
}},
RecoveryPlan: PeerRecoveryPlan{Candidates: []PeerRecoveryCandidate{
{
NodeID: "node-b",
Endpoint: "http://192.168.10.20:19001",
ConnectionState: PeerConnectionDisconnected,
Reason: "recover_peer",
Priority: 10,
},
}},
Now: time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC),
})
if plan.PrivateLANCount != 1 || plan.Intents[0].TransportMode != PeerTransportModePrivateLAN || !plan.Intents[0].DirectCandidate {
t.Fatalf("unexpected private endpoint classification: %+v", plan)
}
}
@@ -0,0 +1,304 @@
package mesh
import (
"context"
"net/http"
"strings"
"sync"
"time"
)
// Link status values reported in PeerConnectionProbeResult.LinkStatus.
const (
// PeerConnectionProbeReachable: the health probe completed without error.
PeerConnectionProbeReachable = "reachable"
// PeerConnectionProbeUnreachable: the health probe was sent and returned an error.
PeerConnectionProbeUnreachable = "unreachable"
// PeerConnectionProbeDeferred: no probe was sent because rendezvous is still
// required or no usable endpoint/candidate is available.
PeerConnectionProbeDeferred = "deferred"
// PeerConnectionProbeSkipped: no probe was sent because the tracker reported
// an active backoff for the peer.
PeerConnectionProbeSkipped = "skipped"
)
const (
// DefaultPeerConnectionProbeTimeout is the per-probe timeout used by
// NewPeerConnectionManager when the configured ProbeTimeout is not positive.
DefaultPeerConnectionProbeTimeout = 2 * time.Second
)
// PeerConnectionManagerConfig carries the dependencies and tunables consumed
// by NewPeerConnectionManager.
type PeerConnectionManagerConfig struct {
// Local identifies this node; it is the sender of health probes.
Local PeerIdentity
// PeerCache supplies the peer snapshot each probe cycle plans from.
PeerCache *PeerCache
// Tracker records per-peer connection state transitions.
Tracker *PeerConnectionTracker
// RendezvousLeases is copied at construction time.
RendezvousLeases []PeerRendezvousLease
// HTTPClient is used for probes; when nil a default client is built.
HTTPClient *http.Client
// ProbeTimeout bounds each probe; non-positive values select
// DefaultPeerConnectionProbeTimeout.
ProbeTimeout time.Duration
// Now is the clock; when nil, time.Now().UTC() is used.
Now func() time.Time
}
// PeerConnectionManager runs probe cycles against planned peer connection
// intents and remembers the most recent cycle.
//
// mu guards peerCache, rendezvousLeases (both replaceable through
// UpdatePeerConfig) and lastCycle.
type PeerConnectionManager struct {
local PeerIdentity
peerCache *PeerCache
tracker *PeerConnectionTracker
rendezvousLeases []PeerRendezvousLease
httpClient *http.Client
probeTimeout time.Duration
now func() time.Time
mu sync.Mutex
lastCycle PeerConnectionManagerCycle
}
// PeerConnectionManagerCycle summarises one ProbeOnce run: the plans it was
// derived from, per-intent results, and aggregate counters.
type PeerConnectionManagerCycle struct {
// Mode is copied from the recovery plan's Mode.
Mode string `json:"mode"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
ProbeTimeoutMs int `json:"probe_timeout_ms"`
IntentCount int `json:"intent_count"`
// Attempted counts results that were reachable or unreachable, i.e. probes
// that were actually sent.
Attempted int `json:"attempted"`
// Succeeded counts reachable results.
Succeeded int `json:"succeeded"`
// Failed counts unreachable results.
Failed int `json:"failed"`
// Deferred counts deferred results.
Deferred int `json:"deferred"`
// Skipped counts skipped results.
Skipped int `json:"skipped"`
// The three counters below are copied from the intent plan.
RendezvousRequiredCount int `json:"rendezvous_required_count"`
RendezvousResolvedCount int `json:"rendezvous_resolved_count"`
RelayControlCount int `json:"relay_control_count"`
RecoveryPlan PeerRecoveryPlan `json:"recovery_plan"`
IntentPlan PeerConnectionIntentPlan `json:"intent_plan"`
// Results holds one entry per processed intent, in intent-plan order.
Results []PeerConnectionProbeResult `json:"results,omitempty"`
}
// PeerConnectionManagerSnapshot is the value returned by
// PeerConnectionManager.Snapshot; it exposes the most recently stored cycle.
type PeerConnectionManagerSnapshot struct {
LastCycle PeerConnectionManagerCycle `json:"last_cycle"`
}
// PeerConnectionProbeResult is the outcome of handling one connection intent
// during a probe cycle. Most descriptive fields are copied from the intent.
type PeerConnectionProbeResult struct {
NodeID string `json:"node_id"`
// LinkStatus is one of the PeerConnectionProbe* constants.
LinkStatus string `json:"link_status"`
Action string `json:"action"`
Reason string `json:"reason"`
Endpoint string `json:"endpoint,omitempty"`
// ConnectionState is the tracker state for the peer after this intent was
// handled.
ConnectionState PeerConnectionState `json:"connection_state"`
TransportMode string `json:"transport_mode"`
RequiresRendezvous bool `json:"requires_rendezvous"`
RendezvousResolved bool `json:"rendezvous_resolved"`
DirectCandidate bool `json:"direct_candidate"`
RelayCandidate bool `json:"relay_candidate"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
// LatencyMs is set only for reachable results.
LatencyMs int `json:"latency_ms,omitempty"`
// FailureReason is set for deferred, skipped and unreachable results.
FailureReason string `json:"failure_reason,omitempty"`
StartedAt time.Time `json:"started_at"`
CompletedAt time.Time `json:"completed_at"`
}
// NewPeerConnectionManager builds a manager from cfg, filling in defaults:
//
//   - a non-positive ProbeTimeout becomes DefaultPeerConnectionProbeTimeout;
//   - a nil HTTPClient becomes a client with a pooled transport and an overall
//     timeout one second longer than the probe timeout;
//   - a nil Now becomes time.Now in UTC.
//
// The rendezvous lease slice is copied so later caller mutations are not
// observed by the manager.
func NewPeerConnectionManager(cfg PeerConnectionManagerConfig) *PeerConnectionManager {
	manager := &PeerConnectionManager{
		local:            cfg.Local,
		peerCache:        cfg.PeerCache,
		tracker:          cfg.Tracker,
		rendezvousLeases: append([]PeerRendezvousLease{}, cfg.RendezvousLeases...),
		httpClient:       cfg.HTTPClient,
		probeTimeout:     cfg.ProbeTimeout,
		now:              cfg.Now,
	}

	if manager.probeTimeout <= 0 {
		manager.probeTimeout = DefaultPeerConnectionProbeTimeout
	}

	if manager.httpClient == nil {
		transport := &http.Transport{
			MaxIdleConns:        64,
			MaxIdleConnsPerHost: 8,
			IdleConnTimeout:     90 * time.Second,
		}
		manager.httpClient = &http.Client{
			Transport: transport,
			// Leave headroom so the per-probe context deadline fires first.
			Timeout: manager.probeTimeout + time.Second,
		}
	}

	if manager.now == nil {
		manager.now = func() time.Time { return time.Now().UTC() }
	}

	return manager
}
// ProbeOnce runs a single probe cycle and returns its summary.
//
// The cycle snapshots the peer cache and tracker, derives a recovery plan and
// an intent plan from them, handles every intent in plan order, and stores the
// resulting cycle so Snapshot can return it. A nil manager, or one without a
// peer cache or tracker, yields a zero cycle.
//
// Once ctx is cancelled no further intents are processed: probing with a dead
// context would fail immediately and record spurious failures against healthy
// peers (pushing them toward backoff). Intents that were not processed are
// absent from Results and from the Attempted/Deferred/Skipped counters, while
// IntentCount still reflects the full plan.
func (m *PeerConnectionManager) ProbeOnce(ctx context.Context) PeerConnectionManagerCycle {
	if m == nil || m.tracker == nil {
		return PeerConnectionManagerCycle{}
	}
	peerCache, rendezvousLeases := m.peerConfigSnapshot()
	if peerCache == nil {
		return PeerConnectionManagerCycle{}
	}

	startedAt := normalizedNow(m.now())
	peerSnapshot := peerCache.Snapshot()
	recoveryPlan := PlanPeerRecovery(PeerRecoveryPlanConfig{
		PeerCache:          peerSnapshot,
		Connections:        m.tracker.Snapshot(),
		TargetReadyPeers:   DefaultStablePeerTarget,
		MaxProbeCandidates: DefaultRecoveryProbeLimit,
		Now:                startedAt,
	})
	intentPlan := PlanPeerConnectionIntents(PeerConnectionIntentPlanConfig{
		PeerCache:        peerSnapshot,
		RecoveryPlan:     recoveryPlan,
		RendezvousLeases: rendezvousLeases,
		Now:              startedAt,
	})

	cycle := PeerConnectionManagerCycle{
		Mode:                    recoveryPlan.Mode,
		StartedAt:               startedAt,
		ProbeTimeoutMs:          int(m.probeTimeout.Milliseconds()),
		IntentCount:             intentPlan.IntentCount,
		RendezvousRequiredCount: intentPlan.RendezvousRequiredCount,
		RendezvousResolvedCount: intentPlan.RendezvousResolvedCount,
		RelayControlCount:       intentPlan.RelayControlCount,
		RecoveryPlan:            recoveryPlan,
		IntentPlan:              intentPlan,
		Results:                 make([]PeerConnectionProbeResult, 0, len(intentPlan.Intents)),
	}

	for _, intent := range intentPlan.Intents {
		// Stop early on cancellation/deadline; the nil check keeps callers
		// that (incorrectly) pass a nil context working as before.
		if ctx != nil && ctx.Err() != nil {
			break
		}
		result := m.probeIntent(ctx, intent)
		cycle.Results = append(cycle.Results, result)
		switch result.LinkStatus {
		case PeerConnectionProbeReachable:
			cycle.Attempted++
			cycle.Succeeded++
		case PeerConnectionProbeUnreachable:
			cycle.Attempted++
			cycle.Failed++
		case PeerConnectionProbeDeferred:
			cycle.Deferred++
		case PeerConnectionProbeSkipped:
			cycle.Skipped++
		}
	}
	cycle.CompletedAt = normalizedNow(m.now())

	m.mu.Lock()
	m.lastCycle = cycle
	m.mu.Unlock()
	return cycle
}
// Snapshot returns the most recently stored probe cycle. It is safe to call
// on a nil manager, in which case the zero snapshot is returned.
func (m *PeerConnectionManager) Snapshot() PeerConnectionManagerSnapshot {
	var snapshot PeerConnectionManagerSnapshot
	if m != nil {
		m.mu.Lock()
		snapshot.LastCycle = m.lastCycle
		m.mu.Unlock()
	}
	return snapshot
}
// UpdatePeerConfig replaces the peer cache and rendezvous leases used by
// subsequent probe cycles. The lease slice is copied, so the caller may keep
// mutating its own slice. Calling it on a nil manager is a no-op.
func (m *PeerConnectionManager) UpdatePeerConfig(peerCache *PeerCache, rendezvousLeases []PeerRendezvousLease) {
	if m == nil {
		return
	}
	leasesCopy := append([]PeerRendezvousLease{}, rendezvousLeases...)

	m.mu.Lock()
	m.peerCache = peerCache
	m.rendezvousLeases = leasesCopy
	m.mu.Unlock()
}
// peerConfigSnapshot returns the current peer cache pointer and a private
// copy of the rendezvous leases, read under the manager lock. A nil manager
// yields (nil, nil).
func (m *PeerConnectionManager) peerConfigSnapshot() (*PeerCache, []PeerRendezvousLease) {
	if m == nil {
		return nil, nil
	}
	m.mu.Lock()
	cache := m.peerCache
	leases := append([]PeerRendezvousLease{}, m.rendezvousLeases...)
	m.mu.Unlock()
	return cache, leases
}
// probeIntent handles a single connection intent and returns its result.
//
// Outcomes, in the order they are checked:
//   - deferred ("rendezvous_required") when the intent still requires rendezvous;
//   - deferred ("direct_candidate_unavailable" / "relay_candidate_unavailable")
//     when the endpoint is blank or the intent is neither a direct nor a relay
//     candidate;
//   - skipped ("backoff_active") when the tracker says the peer must not be
//     probed yet;
//   - otherwise a health message is sent to the endpoint, bounded by the
//     manager's probe timeout, and the result is reachable or unreachable.
//
// Every outcome except "skipped" is also recorded in the tracker.
func (m *PeerConnectionManager) probeIntent(ctx context.Context, intent PeerConnectionIntent) PeerConnectionProbeResult {
startedAt := normalizedNow(m.now())
result := PeerConnectionProbeResult{
NodeID: intent.NodeID,
Action: intent.Action,
Reason: intent.Reason,
Endpoint: intent.Endpoint,
TransportMode: intent.TransportMode,
RequiresRendezvous: intent.RequiresRendezvous,
RendezvousResolved: intent.RendezvousResolved,
DirectCandidate: intent.DirectCandidate,
RelayCandidate: intent.RelayCandidate,
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
StartedAt: startedAt,
}
// Synthetic cache entry handed to the tracker so it can refresh the peer's
// descriptive fields (endpoint, lease, relay info) from this intent.
peer := PeerCacheEntry{
NodeID: intent.NodeID,
Endpoint: intent.Endpoint,
Warm: true,
WarmReason: intent.Reason,
RecoverySeed: intent.RecoverySeed,
BestCandidateID: intent.BestCandidateID,
BestTransport: intent.Transport,
RendezvousLeaseID: intent.RendezvousLeaseID,
RelayNodeID: intent.RelayNodeID,
RelayEndpoint: intent.RelayEndpoint,
RelayControl: intent.RelayCandidate,
}
if intent.RequiresRendezvous {
result.LinkStatus = PeerConnectionProbeDeferred
result.FailureReason = "rendezvous_required"
result.ConnectionState = m.tracker.RecordDeferred(peer, result.FailureReason, startedAt)
result.CompletedAt = normalizedNow(m.now())
return result
}
if strings.TrimSpace(intent.Endpoint) == "" || (!intent.DirectCandidate && !intent.RelayCandidate) {
result.LinkStatus = PeerConnectionProbeDeferred
result.FailureReason = "direct_candidate_unavailable"
// Only reachable here with a blank endpoint, since RelayCandidate is set.
if intent.RelayCandidate {
result.FailureReason = "relay_candidate_unavailable"
}
result.ConnectionState = m.tracker.RecordDeferred(peer, result.FailureReason, startedAt)
result.CompletedAt = normalizedNow(m.now())
return result
}
if !m.tracker.ShouldProbe(intent.NodeID, startedAt) {
result.LinkStatus = PeerConnectionProbeSkipped
result.FailureReason = "backoff_active"
// Report the existing tracker state without modifying it.
result.ConnectionState = m.connectionState(intent.NodeID)
result.CompletedAt = normalizedNow(m.now())
return result
}
m.tracker.BeginProbe(peer, startedAt)
probeCtx, cancel := context.WithTimeout(ctx, m.probeTimeout)
defer cancel()
target := PeerIdentity{
ClusterID: m.local.ClusterID,
NodeID: intent.NodeID,
}
// When going through a relay, the health message is addressed to the relay
// node rather than the final peer.
if intent.RelayCandidate && intent.RelayNodeID != "" {
target.NodeID = intent.RelayNodeID
}
_, err := NewClient(strings.TrimRight(intent.Endpoint, "/")).withHTTPClient(m.httpClient).SendHealth(probeCtx, NewHealthMessage(m.local, target))
completedAt := normalizedNow(m.now())
if err != nil {
result.LinkStatus = PeerConnectionProbeUnreachable
result.FailureReason = err.Error()
result.ConnectionState = m.tracker.RecordFailure(intent.NodeID, err.Error(), completedAt)
result.CompletedAt = completedAt
return result
}
// Latency is measured with the manager clock from the start of this call,
// clamped at zero in case the clock moved backwards.
latency := int(completedAt.Sub(startedAt).Milliseconds())
if latency < 0 {
latency = 0
}
result.LinkStatus = PeerConnectionProbeReachable
result.LatencyMs = latency
if intent.RelayCandidate {
result.ConnectionState = m.tracker.RecordRelayReady(peer, latency, completedAt)
} else {
result.ConnectionState = m.tracker.RecordSuccess(intent.NodeID, latency, completedAt)
}
result.CompletedAt = completedAt
return result
}
// connectionState looks up the tracker's current state for nodeID. When the
// tracker has no entry for that node, a synthetic "disconnected" state
// carrying only the node ID is returned.
func (m *PeerConnectionManager) connectionState(nodeID string) PeerConnectionState {
	tracked := m.tracker.Snapshot().Entries
	for i := range tracked {
		if tracked[i].NodeID == nodeID {
			return tracked[i]
		}
	}
	return PeerConnectionState{
		NodeID: nodeID,
		State:  PeerConnectionDisconnected,
	}
}
// withHTTPClient returns a copy of c that uses httpClient for its requests.
// The receiver is a value, so the original Client is left untouched.
func (c Client) withHTTPClient(httpClient *http.Client) Client {
	configured := c
	configured.HTTPClient = httpClient
	return configured
}
@@ -0,0 +1,190 @@
package mesh
import (
"context"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestPeerConnectionManagerProbesDirectAndDefersRendezvous runs one probe
// cycle against two peers: node-b, reachable directly through a local test
// server, and node-c, which is relay-required with no lease. It expects
// node-b to be probed successfully and node-c to be deferred.
func TestPeerConnectionManagerProbesDirectAndDefersRendezvous(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"},
}.Handler())
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-direct",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: server.URL,
Reachability: "private",
ConnectivityMode: "direct",
PolicyTags: []string{"corp-lan", "same-site"},
Priority: 1,
},
},
"node-c": {
{
EndpointID: "node-c-relay",
NodeID: "node-c",
Transport: "relay",
Address: "relay://fabric/node-c",
Reachability: "relay",
ConnectivityMode: "relay_required",
Priority: 1,
},
},
},
WarmPeerLimit: 2,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
ProbeTimeout: time.Second,
// Fake clock: every read advances time by 10ms, keeping timestamps
// deterministic and strictly increasing.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 1 || cycle.Succeeded != 1 || cycle.Deferred != 1 || cycle.RendezvousRequiredCount != 1 {
t.Fatalf("unexpected cycle: %+v", cycle)
}
snapshot := tracker.Snapshot()
if snapshot.Ready != 1 || snapshot.Waiting != 1 {
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
}
if cycle.Results[0].NodeID != "node-b" || cycle.Results[0].LinkStatus != PeerConnectionProbeReachable {
t.Fatalf("direct peer was not probed first: %+v", cycle.Results)
}
if cycle.Results[1].NodeID != "node-c" || cycle.Results[1].LinkStatus != PeerConnectionProbeDeferred {
t.Fatalf("relay peer was not deferred: %+v", cycle.Results)
}
}
// TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff probes an
// endpoint that is expected to refuse connections three times, checks that
// the tracker then reports the peer in backoff, and checks that a fourth
// cycle neither attempts the peer nor produces any result for it.
func TestPeerConnectionManagerRecordsFailureAndSuppressesActiveBackoff(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpoints: map[string]string{
"node-b": "http://127.0.0.1:1",
},
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
HTTPClient: &http.Client{Timeout: 20 * time.Millisecond},
ProbeTimeout: 20 * time.Millisecond,
// Fake clock advancing 10ms per read, so the backoff window set by the
// third failure is still in the future for the fourth cycle.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
// Three consecutive failures are what moves a peer into backoff.
for i := 0; i < 3; i++ {
manager.ProbeOnce(context.Background())
}
backoff := tracker.Snapshot()
if backoff.Backoff != 1 {
t.Fatalf("expected backoff after repeated failures: %+v", backoff)
}
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 0 || len(cycle.Results) != 0 {
t.Fatalf("active backoff peer should not be attempted: %+v", cycle)
}
}
// TestPeerConnectionManagerProbesRelayControlLease checks that a
// relay-required peer with an active rendezvous lease is probed through the
// lease's relay endpoint (a local test server acting as node-r) and ends up
// in the relay_ready state rather than being deferred.
func TestPeerConnectionManagerProbesRelayControlLease(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
current := now
server := httptest.NewServer(Server{
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
}.Handler())
defer server.Close()
local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}
leases := []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: server.URL,
Transport: "relay_control",
ConnectivityMode: "relay_required",
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: now.Add(-time.Minute),
ExpiresAt: now.Add(time.Minute),
},
}
cache := NewPeerCache(PeerCacheConfig{
Local: local,
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-relay",
NodeID: "node-b",
Transport: "relay",
Address: "relay://fabric/node-b",
Reachability: "relay",
ConnectivityMode: "relay_required",
Priority: 10,
},
},
},
RendezvousLeases: leases,
WarmPeerLimit: 1,
Now: now,
})
tracker := NewPeerConnectionTracker(cache.Snapshot(), now)
manager := NewPeerConnectionManager(PeerConnectionManagerConfig{
Local: local,
PeerCache: cache,
Tracker: tracker,
RendezvousLeases: leases,
ProbeTimeout: time.Second,
// Fake clock advancing 10ms per read.
Now: func() time.Time {
current = current.Add(10 * time.Millisecond)
return current
},
})
cycle := manager.ProbeOnce(context.Background())
if cycle.Attempted != 1 ||
cycle.Succeeded != 1 ||
cycle.Deferred != 0 ||
cycle.RelayControlCount != 1 ||
cycle.RendezvousResolvedCount != 1 ||
cycle.RendezvousRequiredCount != 0 {
t.Fatalf("unexpected relay-control cycle: %+v", cycle)
}
// The result is still keyed by the final peer (node-b) while recording the
// relay that was actually contacted.
if len(cycle.Results) != 1 ||
cycle.Results[0].NodeID != "node-b" ||
cycle.Results[0].RelayNodeID != "node-r" ||
cycle.Results[0].ConnectionState.State != PeerConnectionRelayReady {
t.Fatalf("unexpected relay-control result: %+v", cycle.Results)
}
snapshot := tracker.Snapshot()
if snapshot.RelayReady != 1 || snapshot.Waiting != 0 {
t.Fatalf("unexpected tracker snapshot: %+v", snapshot)
}
}
@@ -0,0 +1,284 @@
package mesh
import (
"sort"
"sync"
"time"
)
// Connection states stored in PeerConnectionState.State.
const (
// PeerConnectionDisconnected is the initial state of a tracked peer.
PeerConnectionDisconnected = "disconnected"
// PeerConnectionConnecting is set by BeginProbe while a probe is in flight.
PeerConnectionConnecting = "connecting"
// PeerConnectionReady is set by RecordSuccess when latency is below 500 ms.
PeerConnectionReady = "ready"
// PeerConnectionRelayReady is set by RecordRelayReady.
PeerConnectionRelayReady = "relay_ready"
// PeerConnectionDegraded is set by RecordSuccess when latency is at least
// 500 ms, and by RecordFailure before the third consecutive failure.
PeerConnectionDegraded = "degraded"
// PeerConnectionBackoff is set by RecordFailure from the third consecutive
// failure onward.
PeerConnectionBackoff = "backoff"
// PeerConnectionWaiting is set by RecordDeferred.
PeerConnectionWaiting = "waiting_rendezvous"
)
// Backoff tuning used by peerConnectionBackoffDuration.
const (
// peerConnectionBackoffBase is the backoff step added per failure, starting
// with the third consecutive failure.
peerConnectionBackoffBase = 5 * time.Second
// peerConnectionBackoffMax caps the computed backoff.
peerConnectionBackoffMax = time.Minute
)
// PeerConnectionTracker holds the per-peer connection state machine, keyed by
// node ID. All exported methods take mu, and they tolerate a nil receiver.
type PeerConnectionTracker struct {
mu sync.Mutex
entries map[string]PeerConnectionState
}
// PeerConnectionState is the tracker's record for one peer.
type PeerConnectionState struct {
NodeID string `json:"node_id"`
// State is one of the PeerConnection* state constants.
State string `json:"state"`
// The descriptive fields below (Warm through RelayControl) are refreshed
// from the PeerCacheEntry passed to BeginProbe, RecordRelayReady and
// RecordDeferred.
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
Endpoint string `json:"endpoint,omitempty"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
RendezvousLeaseID string `json:"rendezvous_lease_id,omitempty"`
RelayNodeID string `json:"relay_node_id,omitempty"`
RelayEndpoint string `json:"relay_endpoint,omitempty"`
RelayControl bool `json:"relay_control"`
// ConsecutiveSuccesses is reset to zero by any recorded failure.
ConsecutiveSuccesses int `json:"consecutive_successes"`
// ConsecutiveFailures is reset to zero by any recorded success.
ConsecutiveFailures int `json:"consecutive_failures"`
LastLatencyMs int `json:"last_latency_ms,omitempty"`
LastFailureReason string `json:"last_failure_reason,omitempty"`
LastTransitionAt time.Time `json:"last_transition_at"`
LastProbeAt time.Time `json:"last_probe_at,omitempty"`
// BackoffUntil is set when a failure puts the peer into backoff and is
// cleared by a success or a deferral.
BackoffUntil time.Time `json:"backoff_until,omitempty"`
}
// PeerConnectionSnapshot is a point-in-time copy of the tracker: per-state
// counters plus every entry sorted by node ID.
type PeerConnectionSnapshot struct {
// Total is the number of tracked peers.
Total int `json:"total"`
Ready int `json:"ready"`
RelayReady int `json:"relay_ready"`
Degraded int `json:"degraded"`
Backoff int `json:"backoff"`
Waiting int `json:"waiting_rendezvous"`
Connecting int `json:"connecting"`
Disconnected int `json:"disconnected"`
// StateCounts maps each state name to its count; all known states are
// present even when their count is zero.
StateCounts map[string]int `json:"state_counts"`
Entries []PeerConnectionState `json:"entries"`
// LastTransitionAt is the latest LastTransitionAt across all entries.
LastTransitionAt time.Time `json:"last_transition_at,omitempty"`
}
// NewPeerConnectionTracker creates a tracker seeded from peerSnapshot. Only
// warm peers with a non-empty node ID are registered; each starts in the
// disconnected state with its transition time set to now (a zero now is
// replaced by the current time, and the value is normalised to UTC).
func NewPeerConnectionTracker(peerSnapshot PeerCacheSnapshot, now time.Time) *PeerConnectionTracker {
	createdAt := normalizedNow(now)
	states := map[string]PeerConnectionState{}
	for _, peer := range peerSnapshot.Entries {
		if peer.Warm && peer.NodeID != "" {
			states[peer.NodeID] = PeerConnectionState{
				NodeID:           peer.NodeID,
				State:            PeerConnectionDisconnected,
				Warm:             peer.Warm,
				WarmReason:       peer.WarmReason,
				Endpoint:         peer.Endpoint,
				BestCandidateID:  peer.BestCandidateID,
				LastTransitionAt: createdAt,
			}
		}
	}
	return &PeerConnectionTracker{entries: states}
}
// ShouldProbe reports whether nodeID may be probed at time now. It returns
// false only when the peer is in the backoff state with a backoff deadline
// that is still strictly in the future. Unknown peers and a nil tracker
// always allow probing.
func (t *PeerConnectionTracker) ShouldProbe(nodeID string, now time.Time) bool {
	if t == nil {
		return true
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	state, tracked := t.entries[nodeID]
	switch {
	case !tracked:
		return true
	case state.State != PeerConnectionBackoff:
		return true
	case state.BackoffUntil.IsZero():
		return true
	}
	return !state.BackoffUntil.After(normalizedNow(now))
}
// BeginProbe records that a probe of peer is starting at now and returns the
// updated state. The peer's descriptive fields are refreshed from peer, and
// LastProbeAt is set to now.
//
// Peers that are currently usable — ready, relay_ready or degraded — keep
// their state while the probe is in flight; every other state moves to
// connecting. relay_ready is preserved alongside ready because it is counted
// as a ready state by recovery planning: demoting it on every probe would make
// concurrent snapshots under-report ready peers and would bump
// LastTransitionAt twice per cycle without any real change.
//
// A nil tracker returns the zero state.
func (t *PeerConnectionTracker) BeginProbe(peer PeerCacheEntry, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	now = normalizedNow(now)
	entry := t.entry(peer, now)
	switch entry.State {
	case PeerConnectionReady, PeerConnectionRelayReady, PeerConnectionDegraded:
		// Already usable: leave the state and transition time alone.
	default:
		entry.State = PeerConnectionConnecting
		entry.LastTransitionAt = now
	}
	entry.LastProbeAt = now
	t.entries[peer.NodeID] = entry
	return entry
}
// RecordSuccess records a successful direct probe of nodeID with the given
// latency. The success streak is incremented, the failure streak, failure
// reason and backoff deadline are cleared, and the peer becomes ready — or
// degraded when latencyMs is 500 or more. LastTransitionAt changes only when
// the state actually changes. A nil tracker returns the zero state.
func (t *PeerConnectionTracker) RecordSuccess(nodeID string, latencyMs int, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}

	// Slow-but-successful probes are reported as degraded rather than ready.
	outcome := PeerConnectionReady
	if latencyMs >= 500 {
		outcome = PeerConnectionDegraded
	}

	t.mu.Lock()
	defer t.mu.Unlock()
	observedAt := normalizedNow(now)

	state := t.entries[nodeID]
	state.NodeID = nodeID
	state.ConsecutiveSuccesses++
	state.ConsecutiveFailures = 0
	state.LastLatencyMs = latencyMs
	state.LastFailureReason = ""
	state.LastProbeAt = observedAt
	state.BackoffUntil = time.Time{}
	if state.State != outcome {
		state.State, state.LastTransitionAt = outcome, observedAt
	}

	t.entries[nodeID] = state
	return state
}
// RecordRelayReady records a successful probe made through a relay. The
// peer's descriptive fields are refreshed from peer, streak counters and
// failure/backoff information are reset as for a direct success, and the
// state becomes relay_ready regardless of latency. LastTransitionAt changes
// only when the state actually changes. A nil tracker returns the zero state.
func (t *PeerConnectionTracker) RecordRelayReady(peer PeerCacheEntry, latencyMs int, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	observedAt := normalizedNow(now)

	state := t.entry(peer, observedAt)
	state.ConsecutiveSuccesses++
	state.ConsecutiveFailures = 0
	state.LastLatencyMs = latencyMs
	state.LastFailureReason = ""
	state.LastProbeAt = observedAt
	state.BackoffUntil = time.Time{}
	if alreadyRelayReady := state.State == PeerConnectionRelayReady; !alreadyRelayReady {
		state.State, state.LastTransitionAt = PeerConnectionRelayReady, observedAt
	}

	t.entries[peer.NodeID] = state
	return state
}
// RecordFailure records a failed probe of nodeID. The failure streak is
// incremented and the success streak cleared. Up to two consecutive failures
// leave the peer degraded; from the third onward it enters backoff with a
// deadline computed by peerConnectionBackoffDuration. LastTransitionAt
// changes only when the state actually changes. A nil tracker returns the
// zero state.
func (t *PeerConnectionTracker) RecordFailure(nodeID string, reason string, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	observedAt := normalizedNow(now)

	state := t.entries[nodeID]
	state.NodeID = nodeID
	state.ConsecutiveFailures++
	state.ConsecutiveSuccesses = 0
	state.LastFailureReason = reason
	state.LastProbeAt = observedAt

	outcome := PeerConnectionDegraded
	if enterBackoff := state.ConsecutiveFailures >= 3; enterBackoff {
		outcome = PeerConnectionBackoff
		wait := peerConnectionBackoffDuration(state.ConsecutiveFailures)
		state.BackoffUntil = observedAt.Add(wait)
	}
	if state.State != outcome {
		state.State, state.LastTransitionAt = outcome, observedAt
	}

	t.entries[nodeID] = state
	return state
}
// RecordDeferred records that a probe of peer was not attempted for the given
// reason (for example because rendezvous is still required). The peer's
// descriptive fields are refreshed from peer, its state becomes
// waiting_rendezvous, the reason is stored as LastFailureReason, and
// LastProbeAt and BackoffUntil are cleared.
//
// LastTransitionAt is updated only when the peer was not already waiting,
// matching the other Record* methods; a peer that stays deferred cycle after
// cycle therefore keeps the time at which it first entered the waiting state.
//
// A nil tracker returns the zero state.
func (t *PeerConnectionTracker) RecordDeferred(peer PeerCacheEntry, reason string, now time.Time) PeerConnectionState {
	if t == nil {
		return PeerConnectionState{}
	}
	t.mu.Lock()
	defer t.mu.Unlock()
	now = normalizedNow(now)
	entry := t.entry(peer, now)
	if entry.State != PeerConnectionWaiting {
		entry.State = PeerConnectionWaiting
		entry.LastTransitionAt = now
	}
	entry.LastFailureReason = reason
	entry.LastProbeAt = time.Time{}
	entry.BackoffUntil = time.Time{}
	t.entries[peer.NodeID] = entry
	return entry
}
// Snapshot returns a copy of every tracked peer, sorted by node ID, together
// with per-state counters and the most recent transition time across all
// peers. StateCounts always contains an entry for each known state. A nil
// tracker yields a snapshot with an empty, non-nil StateCounts map.
func (t *PeerConnectionTracker) Snapshot() PeerConnectionSnapshot {
	if t == nil {
		return PeerConnectionSnapshot{StateCounts: map[string]int{}}
	}
	t.mu.Lock()
	defer t.mu.Unlock()

	// Pre-populate every known state so absent states report zero explicitly.
	tally := map[string]int{
		PeerConnectionDisconnected: 0,
		PeerConnectionConnecting:   0,
		PeerConnectionReady:        0,
		PeerConnectionRelayReady:   0,
		PeerConnectionDegraded:     0,
		PeerConnectionBackoff:      0,
		PeerConnectionWaiting:      0,
	}
	states := make([]PeerConnectionState, 0, len(t.entries))
	var newest time.Time
	for _, state := range t.entries {
		states = append(states, state)
		tally[state.State]++
		if state.LastTransitionAt.After(newest) {
			newest = state.LastTransitionAt
		}
	}
	// Map iteration order is random; sort for deterministic output.
	sort.SliceStable(states, func(a, b int) bool {
		return states[a].NodeID < states[b].NodeID
	})

	snapshot := PeerConnectionSnapshot{
		Total:            len(states),
		StateCounts:      tally,
		Entries:          states,
		LastTransitionAt: newest,
	}
	snapshot.Ready = tally[PeerConnectionReady]
	snapshot.RelayReady = tally[PeerConnectionRelayReady]
	snapshot.Degraded = tally[PeerConnectionDegraded]
	snapshot.Backoff = tally[PeerConnectionBackoff]
	snapshot.Waiting = tally[PeerConnectionWaiting]
	snapshot.Connecting = tally[PeerConnectionConnecting]
	snapshot.Disconnected = tally[PeerConnectionDisconnected]
	return snapshot
}
// entry returns the tracked state for peer.NodeID with its descriptive fields
// overwritten from peer. A peer that is not yet tracked starts from a fresh
// disconnected state whose transition time is now. The result is not stored;
// callers write it back themselves. It takes no lock itself; every caller in
// this file holds t.mu.
func (t *PeerConnectionTracker) entry(peer PeerCacheEntry, now time.Time) PeerConnectionState {
	state, known := t.entries[peer.NodeID]
	if !known {
		state = PeerConnectionState{
			NodeID:           peer.NodeID,
			State:            PeerConnectionDisconnected,
			LastTransitionAt: now,
		}
	}

	state.Warm, state.WarmReason = peer.Warm, peer.WarmReason
	state.Endpoint, state.BestCandidateID = peer.Endpoint, peer.BestCandidateID
	state.RendezvousLeaseID = peer.RendezvousLeaseID
	state.RelayNodeID, state.RelayEndpoint = peer.RelayNodeID, peer.RelayEndpoint
	state.RelayControl = peer.RelayControl
	return state
}
// peerConnectionBackoffDuration returns how long a peer should be left alone
// after the given number of consecutive failures. Fewer than three failures
// mean no backoff; from the third failure the delay grows linearly by
// peerConnectionBackoffBase per failure and is capped at
// peerConnectionBackoffMax.
func peerConnectionBackoffDuration(failures int) time.Duration {
	if failures < 3 {
		return 0
	}
	steps := time.Duration(failures - 2)
	if delay := peerConnectionBackoffBase * steps; delay <= peerConnectionBackoffMax {
		return delay
	}
	return peerConnectionBackoffMax
}
func normalizedNow(now time.Time) time.Time {
if now.IsZero() {
return time.Now().UTC()
}
return now.UTC()
}
@@ -0,0 +1,76 @@
package mesh
import (
"testing"
"time"
)
// TestPeerConnectionTrackerTransitionsReadyAndDegraded walks one peer through
// disconnected -> connecting (BeginProbe) -> ready (fast success) -> degraded
// (success with latency above the 500 ms threshold).
func TestPeerConnectionTrackerTransitionsReadyAndDegraded(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-b", Warm: true, WarmReason: "route_adjacent", Endpoint: "http://node-b:19000"},
},
}, now)
begin := tracker.BeginProbe(PeerCacheEntry{NodeID: "node-b", Warm: true}, now.Add(time.Second))
if begin.State != PeerConnectionConnecting {
t.Fatalf("begin state = %q, want connecting", begin.State)
}
ready := tracker.RecordSuccess("node-b", 42, now.Add(2*time.Second))
if ready.State != PeerConnectionReady || ready.ConsecutiveSuccesses != 1 || ready.ConsecutiveFailures != 0 {
t.Fatalf("ready state unexpected: %+v", ready)
}
// 800 ms is above the degraded threshold even though the probe succeeded.
degraded := tracker.RecordSuccess("node-b", 800, now.Add(3*time.Second))
if degraded.State != PeerConnectionDegraded || degraded.LastLatencyMs != 800 {
t.Fatalf("degraded state unexpected: %+v", degraded)
}
}
// TestPeerConnectionTrackerBackoffAfterRepeatedFailures checks that the first
// failure leaves the peer degraded, the third puts it into backoff with a
// deadline, ShouldProbe honours that deadline, and a later success returns
// the peer to ready with failure and backoff data cleared.
func TestPeerConnectionTrackerBackoffAfterRepeatedFailures(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
Entries: []PeerCacheEntry{{NodeID: "node-b", Warm: true}},
}, now)
first := tracker.RecordFailure("node-b", "timeout", now.Add(time.Second))
if first.State != PeerConnectionDegraded {
t.Fatalf("first failure state = %q, want degraded", first.State)
}
_ = tracker.RecordFailure("node-b", "timeout", now.Add(2*time.Second))
third := tracker.RecordFailure("node-b", "timeout", now.Add(3*time.Second))
if third.State != PeerConnectionBackoff || third.BackoffUntil.IsZero() {
t.Fatalf("third failure did not enter backoff: %+v", third)
}
// One second after the third failure is still inside the backoff window.
if tracker.ShouldProbe("node-b", now.Add(4*time.Second)) {
t.Fatal("ShouldProbe returned true during backoff")
}
if !tracker.ShouldProbe("node-b", third.BackoffUntil.Add(time.Millisecond)) {
t.Fatal("ShouldProbe returned false after backoff")
}
recovered := tracker.RecordSuccess("node-b", 12, third.BackoffUntil.Add(time.Second))
if recovered.State != PeerConnectionReady || recovered.ConsecutiveFailures != 0 || !recovered.BackoffUntil.IsZero() {
t.Fatalf("success did not recover from backoff: %+v", recovered)
}
}
// TestPeerConnectionTrackerSnapshotCountsStates drives three peers into
// ready (node-a), degraded (node-b, one failure) and backoff (node-c, three
// failures) and checks the snapshot counters.
func TestPeerConnectionTrackerSnapshotCountsStates(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
tracker := NewPeerConnectionTracker(PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{NodeID: "node-a", Warm: true},
{NodeID: "node-b", Warm: true},
{NodeID: "node-c", Warm: true},
},
}, now)
tracker.RecordSuccess("node-a", 25, now.Add(time.Second))
tracker.RecordFailure("node-b", "timeout", now.Add(time.Second))
tracker.RecordFailure("node-c", "timeout", now.Add(time.Second))
tracker.RecordFailure("node-c", "timeout", now.Add(2*time.Second))
tracker.RecordFailure("node-c", "timeout", now.Add(3*time.Second))
snapshot := tracker.Snapshot()
if snapshot.Total != 3 || snapshot.Ready != 1 || snapshot.Degraded != 1 || snapshot.Backoff != 1 {
t.Fatalf("unexpected snapshot: %+v", snapshot)
}
}
@@ -0,0 +1,276 @@
package mesh
import (
"sort"
"strings"
"time"
)
// Recovery planning modes computed by PlanPeerRecovery.
const (
// PeerRecoveryModeSteady is chosen when the ready-peer deficit is zero.
PeerRecoveryModeSteady = "steady"
// PeerRecoveryModeRecovery is chosen when fewer peers are ready than the
// target.
PeerRecoveryModeRecovery = "recovery"
)
// Defaults applied by PlanPeerRecovery when the corresponding config value is
// not positive.
const (
// DefaultStablePeerTarget is the default number of ready peers to aim for.
DefaultStablePeerTarget = 3
// DefaultRecoveryProbeLimit is the default cap on probe candidates.
DefaultRecoveryProbeLimit = 6
)
// PeerRecoveryPlanConfig is the input to PlanPeerRecovery.
type PeerRecoveryPlanConfig struct {
// PeerCache is the snapshot of known peers to plan over.
PeerCache PeerCacheSnapshot
// Connections is the tracker snapshot giving each peer's current state.
Connections PeerConnectionSnapshot
// TargetReadyPeers falls back to DefaultStablePeerTarget when not positive.
TargetReadyPeers int
// MaxProbeCandidates falls back to DefaultRecoveryProbeLimit when not
// positive.
MaxProbeCandidates int
// Now is the planning time; a zero value is replaced by the current time.
Now time.Time
}
// PeerRecoveryPlan is the result type of PlanPeerRecovery: aggregate peer
// counters plus the list of candidates selected for probing.
type PeerRecoveryPlan struct {
// Mode is PeerRecoveryModeSteady or PeerRecoveryModeRecovery.
Mode string `json:"mode"`
Healthy bool `json:"healthy"`
TargetReadyPeers int `json:"target_ready_peers"`
ReadyPeerCount int `json:"ready_peer_count"`
DegradedPeerCount int `json:"degraded_peer_count"`
BackoffPeerCount int `json:"backoff_peer_count"`
ConnectablePeerCount int `json:"connectable_peer_count"`
// Deficit is presumably the shortfall of ready peers against the target —
// TODO confirm against the part of PlanPeerRecovery that fills this struct.
Deficit int `json:"deficit"`
ProbeCandidateCount int `json:"probe_candidate_count"`
RecoverySeedCandidateCount int `json:"recovery_seed_candidate_count"`
GeneratedAt time.Time `json:"generated_at"`
Candidates []PeerRecoveryCandidate `json:"candidates,omitempty"`
}
// PeerRecoveryCandidate describes one peer selected by recovery planning,
// combining peer-cache information with the peer's current connection state.
type PeerRecoveryCandidate struct {
NodeID string `json:"node_id"`
Endpoint string `json:"endpoint,omitempty"`
Warm bool `json:"warm"`
WarmReason string `json:"warm_reason,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
BestCandidateID string `json:"best_candidate_id,omitempty"`
BestTransport string `json:"best_transport,omitempty"`
// ConnectionState holds one of the PeerConnection* state constants.
ConnectionState string `json:"connection_state"`
ConsecutiveFailures int `json:"consecutive_failures,omitempty"`
LastLatencyMs int `json:"last_latency_ms,omitempty"`
BackoffUntil time.Time `json:"backoff_until,omitempty"`
// Reason names why the peer was selected (tests use values such as
// "maintain_ready" and "recover_warm").
Reason string `json:"reason"`
// Priority is a ranking value — NOTE(review): ordering direction is not
// visible in this part of the file; confirm against the sort in
// PlanPeerRecovery.
Priority int `json:"priority"`
}
// peerRecoveryCandidateBuild is the working element PlanPeerRecovery sorts and
// truncates before copying the embedded candidate into the returned plan.
type peerRecoveryCandidateBuild struct {
PeerRecoveryCandidate
}
// PlanPeerRecovery builds a bounded, prioritized list of peers to maintain or
// probe, together with summary counters describing mesh health.
//
// The ready-peer target defaults to DefaultStablePeerTarget and is capped at
// the number of connectable cache entries. The probe limit defaults to
// DefaultRecoveryProbeLimit, is never below the target, and collapses to the
// target in steady mode. Peers whose backoff window is still open at the
// planning instant are never selected.
func PlanPeerRecovery(cfg PeerRecoveryPlanConfig) PeerRecoveryPlan {
	generatedAt := normalizedNow(cfg.Now)

	// Resolve the effective target and probe limit.
	connectable := connectablePeerCount(cfg.PeerCache)
	target := cfg.TargetReadyPeers
	if target <= 0 {
		target = DefaultStablePeerTarget
	}
	if connectable < target {
		target = connectable
	}
	probeLimit := cfg.MaxProbeCandidates
	if probeLimit <= 0 {
		probeLimit = DefaultRecoveryProbeLimit
	}
	if probeLimit < target {
		probeLimit = target
	}

	// Index connection states and cache entries by node ID; a later duplicate
	// overwrites an earlier one.
	connByNode := map[string]PeerConnectionState{}
	for _, conn := range cfg.Connections.Entries {
		if strings.TrimSpace(conn.NodeID) != "" {
			connByNode[conn.NodeID] = conn
		}
	}
	peerByNode := map[string]PeerCacheEntry{}
	for _, peer := range cfg.PeerCache.Entries {
		if strings.TrimSpace(peer.NodeID) != "" {
			peerByNode[peer.NodeID] = peer
		}
	}

	// Tally states only for peers that are cached with a usable endpoint.
	var readyCount, degradedCount, backoffCount int
	for nodeID, conn := range connByNode {
		peer, known := peerByNode[nodeID]
		if !known || strings.TrimSpace(peer.Endpoint) == "" {
			continue
		}
		switch conn.State {
		case PeerConnectionReady, PeerConnectionRelayReady:
			readyCount++
		case PeerConnectionDegraded:
			degradedCount++
		case PeerConnectionBackoff:
			backoffCount++
		}
	}

	deficit := target - readyCount
	if deficit < 0 {
		deficit = 0
	}
	mode := PeerRecoveryModeSteady
	if deficit > 0 {
		mode = PeerRecoveryModeRecovery
	} else {
		// Steady mode only maintains as many peers as the target asks for.
		probeLimit = target
	}

	ranked := make([]peerRecoveryCandidateBuild, 0, len(cfg.PeerCache.Entries))
	for _, peer := range cfg.PeerCache.Entries {
		endpoint := strings.TrimSpace(peer.Endpoint)
		if strings.TrimSpace(peer.NodeID) == "" || endpoint == "" {
			continue
		}
		conn := connByNode[peer.NodeID]
		if conn.State == "" {
			// No recorded state: treat the peer as disconnected.
			conn.State = PeerConnectionDisconnected
		}
		if conn.State == PeerConnectionBackoff && conn.BackoffUntil.After(generatedAt) {
			continue
		}
		reason, eligible := peerRecoveryCandidateReason(mode, peer, conn)
		if !eligible {
			continue
		}
		ranked = append(ranked, peerRecoveryCandidateBuild{PeerRecoveryCandidate: PeerRecoveryCandidate{
			NodeID:              peer.NodeID,
			Endpoint:            endpoint,
			Warm:                peer.Warm,
			WarmReason:          peer.WarmReason,
			RecoverySeed:        peer.RecoverySeed,
			BestCandidateID:     peer.BestCandidateID,
			BestTransport:       peer.BestTransport,
			ConnectionState:     conn.State,
			ConsecutiveFailures: conn.ConsecutiveFailures,
			LastLatencyMs:       conn.LastLatencyMs,
			BackoffUntil:        conn.BackoffUntil,
			Reason:              reason,
			Priority:            peerRecoveryCandidatePriority(peer, conn, reason),
		}})
	}

	// Highest priority first; node ID breaks ties so the order is deterministic.
	sort.SliceStable(ranked, func(a, b int) bool {
		left, right := ranked[a], ranked[b]
		if left.Priority == right.Priority {
			return left.NodeID < right.NodeID
		}
		return left.Priority > right.Priority
	})
	if probeLimit < len(ranked) {
		ranked = ranked[:probeLimit]
	}

	selected := make([]PeerRecoveryCandidate, 0, len(ranked))
	seedCount := 0
	for i := range ranked {
		selected = append(selected, ranked[i].PeerRecoveryCandidate)
		if ranked[i].RecoverySeed {
			seedCount++
		}
	}

	return PeerRecoveryPlan{
		Mode:                       mode,
		Healthy:                    deficit == 0,
		TargetReadyPeers:           target,
		ReadyPeerCount:             readyCount,
		DegradedPeerCount:          degradedCount,
		BackoffPeerCount:           backoffCount,
		ConnectablePeerCount:       connectable,
		Deficit:                    deficit,
		ProbeCandidateCount:        len(selected),
		RecoverySeedCandidateCount: seedCount,
		GeneratedAt:                generatedAt,
		Candidates:                 selected,
	}
}
// peerRecoveryCandidateReason labels why a peer belongs in the plan and reports
// whether it is eligible at all.
//
// Ready and relay-ready peers are always kept as "maintain_ready". In steady
// mode nothing else is eligible. In recovery mode every other peer is eligible
// and labelled, in precedence order, as degraded, warm, recovery seed or
// plain peer.
func peerRecoveryCandidateReason(mode string, entry PeerCacheEntry, connection PeerConnectionState) (string, bool) {
	switch connection.State {
	case PeerConnectionReady, PeerConnectionRelayReady:
		return "maintain_ready", true
	}
	if mode == PeerRecoveryModeSteady {
		return "", false
	}
	switch {
	case connection.State == PeerConnectionDegraded:
		return "recover_degraded", true
	case entry.Warm:
		return "recover_warm", true
	case entry.RecoverySeed:
		return "recover_seed", true
	default:
		return "recover_peer", true
	}
}
// peerRecoveryCandidatePriority scores a candidate; higher scores sort first in
// the plan. The score sums bonuses derived from the cache entry, the current
// connection state and the selection reason, subtracts a latency penalty, and
// is floored at zero.
func peerRecoveryCandidatePriority(entry PeerCacheEntry, connection PeerConnectionState, reason string) int {
	// Cache-entry contribution.
	priority := entry.BestCandidateScore / 10
	if entry.Warm {
		priority += 1000
	}
	if entry.RecoverySeed {
		priority += 250
	}
	if entry.BestCandidateID != "" {
		priority += 150
	}
	switch entry.WarmReason {
	case "route_adjacent":
		priority += 500
	case "recovery_seed":
		priority += 350
	case "endpoint_candidate":
		priority += 200
	case "peer_endpoint":
		priority += 100
	}

	// Connection-state contribution.
	switch connection.State {
	case PeerConnectionReady, PeerConnectionRelayReady:
		priority += 600
	case PeerConnectionDegraded:
		priority += 350
	case PeerConnectionConnecting:
		priority += 200
	case PeerConnectionDisconnected:
		priority += 100
	}

	// Selection-reason contribution.
	switch reason {
	case "maintain_ready":
		priority += 500
	case "recover_degraded":
		priority += 300
	case "recover_seed":
		priority += 250
	case "recover_warm":
		priority += 150
	}

	// Slower peers rank lower: one point per 10 ms of last observed latency.
	if latency := connection.LastLatencyMs; latency > 0 {
		priority -= latency / 10
	}
	if priority < 0 {
		return 0
	}
	return priority
}
// connectablePeerCount returns how many cache entries carry both a non-blank
// node ID and a non-blank endpoint.
func connectablePeerCount(snapshot PeerCacheSnapshot) int {
	total := 0
	for i := range snapshot.Entries {
		peer := snapshot.Entries[i]
		if strings.TrimSpace(peer.NodeID) != "" && strings.TrimSpace(peer.Endpoint) != "" {
			total++
		}
	}
	return total
}
@@ -0,0 +1,139 @@
package mesh
import (
"testing"
"time"
)
func TestPeerRecoveryPlanMaintainsBoundedReadyPeers(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
recoveryPlanPeer("node-a", true, false, "route_adjacent"),
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
recoveryPlanPeer("node-c", true, false, "peer_endpoint"),
recoveryPlanPeer("node-d", true, false, "peer_endpoint"),
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-a", State: PeerConnectionReady, LastLatencyMs: 40},
{NodeID: "node-b", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-c", State: PeerConnectionReady, LastLatencyMs: 30},
{NodeID: "node-d", State: PeerConnectionReady, LastLatencyMs: 10},
}},
Now: now,
})
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy {
t.Fatalf("unexpected plan health: %+v", plan)
}
if plan.TargetReadyPeers != DefaultStablePeerTarget || len(plan.Candidates) != DefaultStablePeerTarget {
t.Fatalf("unexpected bounded candidates: %+v", plan)
}
for _, candidate := range plan.Candidates {
if candidate.Reason != "maintain_ready" {
t.Fatalf("unexpected candidate reason: %+v", candidate)
}
}
}
func TestPeerRecoveryPlanAddsRecoverySeedWhenReadyDeficit(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
recoveryPlanPeer("node-a", true, false, "route_adjacent"),
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
recoveryPlanPeer("node-seed", false, true, ""),
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-a", State: PeerConnectionReady, LastLatencyMs: 20},
{NodeID: "node-b", State: PeerConnectionBackoff, BackoffUntil: now.Add(time.Minute)},
}},
Now: now,
})
if plan.Mode != PeerRecoveryModeRecovery || plan.Healthy {
t.Fatalf("unexpected recovery mode: %+v", plan)
}
if plan.Deficit != 2 || plan.RecoverySeedCandidateCount != 1 {
t.Fatalf("unexpected deficit/seed count: %+v", plan)
}
if !recoveryPlanHasCandidate(plan, "node-seed", "recover_seed") {
t.Fatalf("recovery seed was not selected: %+v", plan.Candidates)
}
if recoveryPlanHasCandidate(plan, "node-b", "") {
t.Fatalf("active backoff peer should not be selected: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanMaintainsRelayReadyPeersInSteadyMode(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{
Entries: []PeerCacheEntry{
{
NodeID: "node-c",
Endpoint: "http://relay:19001",
Warm: true,
WarmReason: "rendezvous_lease",
RendezvousLeaseID: "lease-1",
RelayNodeID: "node-r",
RelayEndpoint: "http://relay:19001",
RelayControl: true,
},
},
},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-c", State: PeerConnectionRelayReady, LastLatencyMs: 15},
}},
Now: now,
})
if plan.Mode != PeerRecoveryModeSteady || !plan.Healthy {
t.Fatalf("unexpected steady plan: %+v", plan)
}
if !recoveryPlanHasCandidate(plan, "node-c", "maintain_ready") {
t.Fatalf("relay-ready peer was not maintained: %+v", plan.Candidates)
}
}
func TestPeerRecoveryPlanCapsTargetByConnectablePeers(t *testing.T) {
now := time.Date(2026, 4, 28, 12, 0, 0, 0, time.UTC)
plan := PlanPeerRecovery(PeerRecoveryPlanConfig{
PeerCache: PeerCacheSnapshot{Entries: []PeerCacheEntry{
{NodeID: "node-a", Warm: true, WarmReason: "route_adjacent"},
recoveryPlanPeer("node-b", true, false, "route_adjacent"),
}},
Connections: PeerConnectionSnapshot{Entries: []PeerConnectionState{
{NodeID: "node-b", State: PeerConnectionReady},
}},
Now: now,
})
if plan.TargetReadyPeers != 1 || !plan.Healthy {
t.Fatalf("target should be capped by connectable peers: %+v", plan)
}
}
// recoveryPlanPeer builds a cache entry for tests with an endpoint derived
// from the node ID ("http://<nodeID>:19001").
func recoveryPlanPeer(nodeID string, warm bool, recoverySeed bool, warmReason string) PeerCacheEntry {
	peer := PeerCacheEntry{
		NodeID:       nodeID,
		Warm:         warm,
		WarmReason:   warmReason,
		RecoverySeed: recoverySeed,
	}
	peer.Endpoint = "http://" + nodeID + ":19001"
	return peer
}
// recoveryPlanHasCandidate reports whether the plan selected nodeID. When
// reason is non-empty, the first candidate for that node must also carry that
// reason; later candidates for the same node are not consulted.
func recoveryPlanHasCandidate(plan PeerRecoveryPlan, nodeID string, reason string) bool {
	for i := range plan.Candidates {
		if plan.Candidates[i].NodeID == nodeID {
			if reason == "" {
				return true
			}
			return plan.Candidates[i].Reason == reason
		}
	}
	return false
}
@@ -0,0 +1,149 @@
package mesh
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"time"
)
// ValidateProductionEnvelope checks a production fabric envelope against the
// local node identity and the supplied clock, returning the first violation
// found or nil when the envelope is acceptable.
//
// Checks run in a fixed order: protocol version, message and route IDs,
// cluster match, source/destination presence, current hop equals the local
// node, next hop presence, route-path consistency (only when a path is
// present), channel class and message type, TTL and hop count, timestamps,
// and finally payload length, size limit and SHA-256 hash.
//
// Generic structural failures wrap ErrForwardEnvelopeInvalid; cluster, node,
// channel, TTL and expiry failures return their dedicated sentinel errors.
func ValidateProductionEnvelope(local PeerIdentity, envelope ProductionEnvelope, now time.Time) error {
if envelope.FabricProtocolVersion != ProtocolVersion {
return fmt.Errorf("%w: unsupported fabric_protocol_version", ErrForwardEnvelopeInvalid)
}
if envelope.MessageID == "" {
return fmt.Errorf("%w: message_id is required", ErrForwardEnvelopeInvalid)
}
if envelope.RouteID == "" {
return fmt.Errorf("%w: route_id is required", ErrForwardEnvelopeInvalid)
}
if envelope.ClusterID == "" || envelope.ClusterID != local.ClusterID {
return ErrClusterMismatch
}
if envelope.SourceNodeID == "" || envelope.DestinationNodeID == "" {
return fmt.Errorf("%w: source_node_id and destination_node_id are required", ErrForwardEnvelopeInvalid)
}
// The envelope must be addressed to this node as its current hop.
if envelope.CurrentHopNodeID != local.NodeID {
return ErrNodeMismatch
}
if envelope.NextHopNodeID == "" {
return fmt.Errorf("%w: next_hop_node_id is required", ErrForwardEnvelopeInvalid)
}
// A route path is optional here; when present it must be self-consistent.
if len(envelope.RoutePath) > 0 {
if err := validateProductionRoutePath(local, envelope); err != nil {
return err
}
}
// Only the fabric-control channel and message type are accepted.
if envelope.ChannelClass != ProductionChannelFabricControl {
return ErrUnauthorizedChannel
}
if envelope.MessageType != ProductionMessageFabricControl {
return fmt.Errorf("%w: unsupported message_type", ErrForwardEnvelopeInvalid)
}
if envelope.TTL <= 0 {
return ErrTTLExhausted
}
if envelope.HopCount < 0 {
return fmt.Errorf("%w: hop_count must not be negative", ErrForwardEnvelopeInvalid)
}
if envelope.CreatedAt.IsZero() || envelope.ExpiresAt.IsZero() {
return fmt.Errorf("%w: created_at and expires_at are required", ErrForwardEnvelopeInvalid)
}
// Tolerate creation times up to MaxProductionEnvelopeFutureSkew ahead of now.
if envelope.CreatedAt.After(now.UTC().Add(MaxProductionEnvelopeFutureSkew)) {
return fmt.Errorf("%w: created_at exceeds allowed future skew", ErrForwardEnvelopeInvalid)
}
// An envelope expiring exactly at now is already expired.
if !envelope.ExpiresAt.After(now.UTC()) {
return ErrRouteExpired
}
if envelope.PayloadLength != len(envelope.Payload) {
return fmt.Errorf("%w: payload_length mismatch", ErrForwardEnvelopeInvalid)
}
if envelope.PayloadLength > MaxProductionEnvelopePayloadBytes {
return fmt.Errorf("%w: payload exceeds fabric-control limit", ErrForwardEnvelopeInvalid)
}
if envelope.PayloadHash == "" {
return fmt.Errorf("%w: payload_hash is required", ErrForwardEnvelopeInvalid)
}
// The declared hash must be the hex-encoded SHA-256 of the payload bytes.
sum := sha256.Sum256(envelope.Payload)
if envelope.PayloadHash != hex.EncodeToString(sum[:]) {
return fmt.Errorf("%w: payload_hash mismatch", ErrForwardEnvelopeInvalid)
}
return nil
}
// validateProductionRoutePath checks the envelope's explicit route path from
// the local node's point of view.
//
// The path must have at least two hops, start at the source and end at the
// destination, contain no empty or repeated node IDs, and include the local
// node. The local node must not already appear in the visited list, and every
// visited node must be on the path. At the destination the next hop must be
// the local node itself; otherwise it must be the hop following the local node.
func validateProductionRoutePath(local PeerIdentity, envelope ProductionEnvelope) error {
	path := envelope.RoutePath
	hops := len(path)
	if hops < 2 || path[0] != envelope.SourceNodeID || path[hops-1] != envelope.DestinationNodeID {
		return ErrInvalidRoutePath
	}

	localIndex := -1
	onPath := make(map[string]struct{}, hops)
	for i, hop := range path {
		if hop == "" {
			return ErrInvalidRoutePath
		}
		if _, repeated := onPath[hop]; repeated {
			return ErrLoopDetected
		}
		onPath[hop] = struct{}{}
		if hop == local.NodeID {
			localIndex = i
		}
	}
	if localIndex < 0 || envelope.CurrentHopNodeID != local.NodeID {
		return ErrNodeMismatch
	}

	// Loop detection takes precedence over any malformed visited entry, so the
	// visited list is scanned twice.
	if containsProductionNodeID(envelope.VisitedNodeIDs, local.NodeID) {
		return ErrLoopDetected
	}
	for _, visited := range envelope.VisitedNodeIDs {
		if visited == "" {
			return ErrInvalidRoutePath
		}
		if _, known := onPath[visited]; !known {
			return ErrInvalidRoutePath
		}
	}

	if envelope.DestinationNodeID == local.NodeID {
		if envelope.NextHopNodeID != local.NodeID {
			return ErrInvalidRoutePath
		}
		return nil
	}
	if localIndex >= hops-1 || envelope.NextHopNodeID != path[localIndex+1] {
		return ErrInvalidRoutePath
	}
	return nil
}
// containsProductionNodeID reports whether needle occurs in values.
func containsProductionNodeID(values []string, needle string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == needle {
			return true
		}
	}
	return false
}
// NewProductionEnvelopeObservation captures the routing metadata of an envelope
// at observedAt (converted to UTC). The route path and visited list are copied
// so the observation does not alias the envelope's slices.
func NewProductionEnvelopeObservation(envelope ProductionEnvelope, observedAt time.Time) ProductionEnvelopeObservation {
	routePath := append([]string{}, envelope.RoutePath...)
	visited := append([]string{}, envelope.VisitedNodeIDs...)

	var observation ProductionEnvelopeObservation
	observation.MessageID = envelope.MessageID
	observation.RouteID = envelope.RouteID
	observation.ClusterID = envelope.ClusterID
	observation.SourceNodeID = envelope.SourceNodeID
	observation.DestinationNodeID = envelope.DestinationNodeID
	observation.CurrentHopNodeID = envelope.CurrentHopNodeID
	observation.NextHopNodeID = envelope.NextHopNodeID
	observation.RoutePath = routePath
	observation.VisitedNodeIDs = visited
	observation.ChannelClass = envelope.ChannelClass
	observation.MessageType = envelope.MessageType
	observation.TTL = envelope.TTL
	observation.HopCount = envelope.HopCount
	observation.PayloadLength = envelope.PayloadLength
	observation.PayloadHash = envelope.PayloadHash
	observation.ObservedAt = observedAt.UTC()
	return observation
}
@@ -0,0 +1,81 @@
package mesh
import (
"context"
"sync"
)
// ProductionEnvelopeObservationSink is a bounded in-memory buffer of envelope
// observations. When full, the oldest observation is discarded to make room.
// All methods take mu, so the struct must not be copied after first use.
type ProductionEnvelopeObservationSink struct {
// mu guards every field below.
mu sync.Mutex
// capacity is the maximum number of retained observations.
capacity int
// items holds retained observations, oldest first.
items []ProductionEnvelopeObservation
// accepted counts every Observe call.
accepted uint64
// dropped counts observations evicted because the buffer was full.
dropped uint64
}
// ProductionEnvelopeObservationSinkMetrics is a point-in-time copy of a sink's
// counters, as returned by Metrics.
type ProductionEnvelopeObservationSinkMetrics struct {
// Capacity is the sink's configured maximum depth.
Capacity int `json:"capacity"`
// CurrentDepth is the number of observations currently retained.
CurrentDepth int `json:"current_depth"`
// AcceptedTotal is the number of observations ever passed to Observe.
AcceptedTotal uint64 `json:"accepted_total"`
// DroppedOldest is the number of observations evicted to make room.
DroppedOldest uint64 `json:"dropped_oldest"`
}
// NewProductionEnvelopeObservationSink returns an empty sink that retains at
// most capacity observations. Capacities below one are raised to one.
func NewProductionEnvelopeObservationSink(capacity int) *ProductionEnvelopeObservationSink {
	size := capacity
	if size < 1 {
		size = 1
	}
	sink := new(ProductionEnvelopeObservationSink)
	sink.capacity = size
	sink.items = make([]ProductionEnvelopeObservation, 0, size)
	return sink
}
// Observe records an observation, evicting the oldest retained one when the
// sink is full. The context is ignored and the returned error is always nil.
//
// A sink that was not built with NewProductionEnvelopeObservationSink (zero
// value, capacity 0) is treated as having capacity 1. Previously such a sink
// took the "full" branch with no items and panicked indexing items[-1].
func (s *ProductionEnvelopeObservationSink) Observe(_ context.Context, observation ProductionEnvelopeObservation) error {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Apply the same floor the constructor uses so the zero value is usable.
	if s.capacity < 1 {
		s.capacity = 1
	}

	s.accepted++
	if len(s.items) >= s.capacity {
		// Shift everything left by one, dropping the oldest entry, then place
		// the new observation in the freed last slot.
		copy(s.items, s.items[1:])
		s.items[len(s.items)-1] = observation
		s.dropped++
		return nil
	}
	s.items = append(s.items, observation)
	return nil
}
// Snapshot returns a copy of the retained observations, oldest first. The
// returned slice is never nil and does not alias the sink's storage.
func (s *ProductionEnvelopeObservationSink) Snapshot() []ProductionEnvelopeObservation {
	s.mu.Lock()
	defer s.mu.Unlock()

	retained := make([]ProductionEnvelopeObservation, 0, len(s.items))
	return append(retained, s.items...)
}
// Len returns the number of observations currently retained.
func (s *ProductionEnvelopeObservationSink) Len() int {
	s.mu.Lock()
	depth := len(s.items)
	s.mu.Unlock()
	return depth
}
// Capacity returns the maximum number of observations the sink retains.
func (s *ProductionEnvelopeObservationSink) Capacity() int {
	s.mu.Lock()
	limit := s.capacity
	s.mu.Unlock()
	return limit
}
// Metrics returns a consistent copy of the sink's capacity, depth and counters,
// all read under the same lock acquisition.
func (s *ProductionEnvelopeObservationSink) Metrics() ProductionEnvelopeObservationSinkMetrics {
	var metrics ProductionEnvelopeObservationSinkMetrics

	s.mu.Lock()
	defer s.mu.Unlock()

	metrics.Capacity = s.capacity
	metrics.CurrentDepth = len(s.items)
	metrics.AcceptedTotal = s.accepted
	metrics.DroppedOldest = s.dropped
	return metrics
}
@@ -0,0 +1,80 @@
package mesh
import (
"fmt"
"time"
)
// ValidateProductionEnvelopeRouteConfig cross-checks an envelope against the
// locally configured routes and returns the first mismatch, or nil.
//
// With no configured routes the check is skipped entirely and nil is returned.
// Otherwise the envelope's route must exist, belong to the same cluster as the
// envelope and the local node, match source and destination, be unexpired and
// outlive the envelope, allow the fabric-control channel, and agree with the
// envelope on path, current hop and next hop. Non-zero MaxTTL and MaxHops on
// the route bound the envelope's TTL and hop count.
func ValidateProductionEnvelopeRouteConfig(local PeerIdentity, envelope ProductionEnvelope, routes []SyntheticRoute, now time.Time) error {
// No route configuration means there is nothing to cross-check against.
if len(routes) == 0 {
return nil
}
route, ok := productionRouteByID(routes, envelope.RouteID)
if !ok {
return ErrRouteNotFound
}
if route.ClusterID != envelope.ClusterID || route.ClusterID != local.ClusterID {
return ErrClusterMismatch
}
if route.SourceNodeID != envelope.SourceNodeID || route.DestinationNodeID != envelope.DestinationNodeID {
return ErrInvalidRoutePath
}
// The route must still be live, and the envelope may not outlive it.
if route.ExpiresAt.IsZero() || !route.ExpiresAt.After(now.UTC()) || envelope.ExpiresAt.After(route.ExpiresAt) {
return ErrRouteExpired
}
if !contains(route.AllowedChannels, ProductionChannelFabricControl) {
return ErrUnauthorizedChannel
}
// routePath is defined elsewhere; presumably it returns the route's ordered
// node IDs — confirm against its definition.
path := routePath(route)
if len(path) < 2 || path[0] != route.SourceNodeID || path[len(path)-1] != route.DestinationNodeID {
return ErrInvalidRoutePath
}
// An explicit envelope path must match the configured one exactly.
if len(envelope.RoutePath) > 0 && !sameNodePath(envelope.RoutePath, path) {
return ErrInvalidRoutePath
}
// Routes with intermediate hops require the envelope to carry its path.
if len(path) > 2 && len(envelope.RoutePath) == 0 {
return ErrInvalidRoutePath
}
currentIndex := indexOf(path, local.NodeID)
if currentIndex < 0 || envelope.CurrentHopNodeID != local.NodeID {
return ErrNodeMismatch
}
// At the destination the next hop is the local node itself; otherwise it is
// the node following the local node on the configured path.
expectedNextHop := local.NodeID
if local.NodeID != envelope.DestinationNodeID {
if currentIndex >= len(path)-1 {
return ErrInvalidRoutePath
}
expectedNextHop = path[currentIndex+1]
}
if envelope.NextHopNodeID != expectedNextHop {
return ErrInvalidRoutePath
}
// A zero MaxTTL or MaxHops disables the corresponding bound.
if route.MaxTTL > 0 && envelope.TTL > route.MaxTTL {
return fmt.Errorf("%w: ttl exceeds configured route max_ttl", ErrForwardEnvelopeInvalid)
}
if route.MaxHops > 0 && envelope.HopCount > route.MaxHops {
return fmt.Errorf("%w: hop_count exceeds configured route max_hops", ErrForwardEnvelopeInvalid)
}
return nil
}
// productionRouteByID returns the first route whose RouteID equals routeID.
// The boolean is false, with a zero route, when none matches.
func productionRouteByID(routes []SyntheticRoute, routeID string) (SyntheticRoute, bool) {
	for i := range routes {
		if routes[i].RouteID == routeID {
			return routes[i], true
		}
	}
	return SyntheticRoute{}, false
}
// sameNodePath reports whether a and b hold the same node IDs in the same
// order. Nil and empty slices compare equal.
func sameNodePath(a []string, b []string) bool {
	mismatch := len(a) != len(b)
	for i := 0; !mismatch && i < len(a); i++ {
		mismatch = a[i] != b[i]
	}
	return !mismatch
}
@@ -0,0 +1,43 @@
package mesh
import (
"context"
"net/http"
"strings"
)
// ProductionForwardTransport delivers a production envelope to the node
// identified by nextNodeID and returns that peer's forwarding result.
type ProductionForwardTransport interface {
SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error)
}
// HTTPProductionForwardTransport is a ProductionForwardTransport that resolves
// the next node to a base URL and sends the envelope through a mesh Client.
type HTTPProductionForwardTransport struct {
// PeerURLs maps node IDs to peer base URLs.
PeerURLs map[string]string
// HTTPClient, when non-nil, replaces the HTTPClient of the Client created
// for each send.
HTTPClient *http.Client
}
// NewHTTPProductionForwardTransport builds a transport from a node-ID-to-URL
// map. Node IDs are trimmed of whitespace, URLs are trimmed of whitespace and
// trailing slashes, and pairs left with an empty ID or URL are dropped. The
// input map is not retained.
func NewHTTPProductionForwardTransport(peerURLs map[string]string) *HTTPProductionForwardTransport {
	cleaned := make(map[string]string, len(peerURLs))
	for rawID, rawURL := range peerURLs {
		id := strings.TrimSpace(rawID)
		if id == "" {
			continue
		}
		endpoint := strings.TrimRight(strings.TrimSpace(rawURL), "/")
		if endpoint == "" {
			continue
		}
		cleaned[id] = endpoint
	}
	return &HTTPProductionForwardTransport{PeerURLs: cleaned}
}
// SendProduction forwards the envelope to nextNodeID over HTTP.
//
// It returns ErrForwardPeerUnavailable when the transport is nil or when no
// non-empty URL is registered for nextNodeID. Otherwise it creates a Client for
// the peer's base URL, installs t.HTTPClient when one is set, and delegates to
// the Client's SendProduction.
func (t *HTTPProductionForwardTransport) SendProduction(ctx context.Context, nextNodeID string, envelope ProductionEnvelope) (ProductionForwardResult, error) {
	if t == nil {
		return ProductionForwardResult{}, ErrForwardPeerUnavailable
	}

	// Re-normalize on lookup: PeerURLs is exported and may have been set
	// without going through the constructor.
	target := strings.TrimSpace(t.PeerURLs[nextNodeID])
	target = strings.TrimRight(target, "/")
	if target == "" {
		return ProductionForwardResult{}, ErrForwardPeerUnavailable
	}

	peer := NewClient(target)
	if custom := t.HTTPClient; custom != nil {
		peer.HTTPClient = custom
	}
	return peer.SendProduction(ctx, envelope)
}
@@ -0,0 +1,241 @@
package mesh
import (
"encoding/json"
"fmt"
"os"
"strings"
"time"
)
// ScopedSyntheticConfig is the JSON document loaded by
// LoadScopedSyntheticConfig: the mesh view scoped to one local node.
type ScopedSyntheticConfig struct {
// SchemaVersion is required; Validate rejects an empty value.
SchemaVersion string `json:"schema_version"`
// ClusterID and LocalNodeID must match the local PeerIdentity.
ClusterID string `json:"cluster_id"`
LocalNodeID string `json:"local_node_id"`
ConfigVersion string `json:"config_version,omitempty"`
PeerDirectoryVersion string `json:"peer_directory_version,omitempty"`
PolicyVersion string `json:"policy_version,omitempty"`
// PeerEndpoints maps node IDs to endpoints; neither side may be blank.
PeerEndpoints map[string]string `json:"peer_endpoints"`
// PeerEndpointCandidates maps node IDs to candidates; each candidate's
// NodeID must equal its map key.
PeerEndpointCandidates map[string][]PeerEndpointCandidate `json:"peer_endpoint_candidates,omitempty"`
PeerDirectory []PeerDirectoryEntry `json:"peer_directory,omitempty"`
RecoverySeeds []PeerRecoverySeed `json:"recovery_seeds,omitempty"`
RendezvousLeases []PeerRendezvousLease `json:"rendezvous_leases,omitempty"`
// Routes must belong to ClusterID, include LocalNodeID on their path, and
// be unexpired when validated.
Routes []SyntheticRoute `json:"routes"`
}
// PeerDirectoryEntry summarizes one remote peer in the scoped config.
// Validation requires a non-blank NodeID that differs from the local node, no
// duplicate node IDs, and non-negative counts.
type PeerDirectoryEntry struct {
NodeID string `json:"node_id"`
RouteIDs []string `json:"route_ids,omitempty"`
EndpointCount int `json:"endpoint_count"`
CandidateCount int `json:"candidate_count"`
ConnectivityModes []string `json:"connectivity_modes,omitempty"`
RecoverySeed bool `json:"recovery_seed"`
}
// PeerRecoverySeed is a recovery seed endpoint listed in the scoped config.
// Validation requires non-blank NodeID, Endpoint and Transport, and rejects
// duplicate (NodeID, Endpoint) pairs.
type PeerRecoverySeed struct {
NodeID string `json:"node_id"`
Endpoint string `json:"endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
Region string `json:"region,omitempty"`
Priority int `json:"priority"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
// Metadata is kept as raw JSON and is not interpreted by the code shown here.
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// PeerRendezvousLease describes a lease for reaching PeerNodeID through the
// relay RelayNodeID. Validation requires the identifying fields, relay
// endpoint and transport to be non-blank, peer and relay to differ,
// ControlPlaneOnly to be true, and ExpiresAt to be in the future.
type PeerRendezvousLease struct {
// LeaseID must be unique within the config.
LeaseID string `json:"lease_id"`
PeerNodeID string `json:"peer_node_id"`
RelayNodeID string `json:"relay_node_id"`
RelayEndpoint string `json:"relay_endpoint"`
Transport string `json:"transport"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
// RouteIDs, when non-empty, must all name configured routes, and at least
// one route path must contain the local node, the peer and the relay.
RouteIDs []string `json:"route_ids,omitempty"`
AllowedChannels []string `json:"allowed_channels,omitempty"`
Priority int `json:"priority"`
ControlPlaneOnly bool `json:"control_plane_only"`
IssuedAt time.Time `json:"issued_at"`
ExpiresAt time.Time `json:"expires_at"`
Reason string `json:"reason,omitempty"`
// Metadata, when present, must be valid JSON.
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// PeerEndpointCandidate is one candidate endpoint for a peer. Validation
// requires EndpointID, NodeID, Transport, Address, Reachability and
// ConnectivityMode to be non-blank, and NodeID to equal the key it is listed
// under in PeerEndpointCandidates.
type PeerEndpointCandidate struct {
EndpointID string `json:"endpoint_id"`
NodeID string `json:"node_id"`
Transport string `json:"transport"`
Address string `json:"address"`
AddressFamily string `json:"address_family,omitempty"`
Reachability string `json:"reachability"`
NATType string `json:"nat_type,omitempty"`
ConnectivityMode string `json:"connectivity_mode"`
Region string `json:"region,omitempty"`
Priority int `json:"priority"`
PolicyTags []string `json:"policy_tags,omitempty"`
LastVerifiedAt *time.Time `json:"last_verified_at,omitempty"`
// Metadata is kept as raw JSON and is not interpreted by the code shown here.
Metadata json.RawMessage `json:"metadata,omitempty"`
}
// LoadScopedSyntheticConfig reads the JSON config at path, decodes it and
// validates it against the local identity.
//
// On any failure it returns a zero config together with the error: the read
// error unchanged, the decode error wrapped with context, or the validation
// error unchanged.
func LoadScopedSyntheticConfig(path string, local PeerIdentity) (ScopedSyntheticConfig, error) {
	var empty ScopedSyntheticConfig

	raw, err := os.ReadFile(path)
	if err != nil {
		return empty, err
	}

	var parsed ScopedSyntheticConfig
	if err = json.Unmarshal(raw, &parsed); err != nil {
		return empty, fmt.Errorf("parse scoped synthetic mesh config: %w", err)
	}
	if err = parsed.Validate(local); err != nil {
		return empty, err
	}
	return parsed, nil
}
// Validate checks that the config belongs to the local node and is internally
// consistent, returning the first problem found or nil.
//
// It requires a schema version, a cluster ID and local node ID matching local,
// non-blank peer endpoints, well-formed endpoint candidates whose NodeID equals
// their map key, and valid peer directory, recovery seed and rendezvous lease
// sections. Every route must belong to the config's cluster, have a path of at
// least two nodes that includes the local node, and carry a future expires_at.
//
// ErrClusterMismatch, ErrNodeMismatch, ErrInvalidRoutePath and ErrRouteExpired
// are returned as sentinels; other failures are plain formatted errors.
func (cfg ScopedSyntheticConfig) Validate(local PeerIdentity) error {
	if cfg.SchemaVersion == "" {
		return fmt.Errorf("scoped synthetic mesh config schema_version is required")
	}
	if cfg.ClusterID == "" || cfg.ClusterID != local.ClusterID {
		return ErrClusterMismatch
	}
	if cfg.LocalNodeID == "" || cfg.LocalNodeID != local.NodeID {
		return ErrNodeMismatch
	}
	for nodeID, endpoint := range cfg.PeerEndpoints {
		if strings.TrimSpace(nodeID) == "" || strings.TrimSpace(endpoint) == "" {
			return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint")
		}
	}
	for nodeID, candidates := range cfg.PeerEndpointCandidates {
		if strings.TrimSpace(nodeID) == "" {
			return fmt.Errorf("scoped synthetic mesh config contains empty peer endpoint candidate node")
		}
		for _, candidate := range candidates {
			if strings.TrimSpace(candidate.EndpointID) == "" ||
				strings.TrimSpace(candidate.NodeID) == "" ||
				candidate.NodeID != nodeID ||
				strings.TrimSpace(candidate.Transport) == "" ||
				strings.TrimSpace(candidate.Address) == "" ||
				strings.TrimSpace(candidate.Reachability) == "" ||
				strings.TrimSpace(candidate.ConnectivityMode) == "" {
				return fmt.Errorf("scoped synthetic mesh config contains invalid peer endpoint candidate")
			}
		}
	}
	if err := validatePeerDirectory(cfg.PeerDirectory, cfg.LocalNodeID); err != nil {
		return err
	}
	if err := validateRecoverySeeds(cfg.RecoverySeeds); err != nil {
		return err
	}
	if err := validateRendezvousLeases(cfg.RendezvousLeases, cfg.Routes, cfg.LocalNodeID); err != nil {
		return err
	}

	// Read the clock once so every route is judged against the same instant
	// instead of calling time.Now() on each iteration.
	now := time.Now().UTC()
	for _, route := range cfg.Routes {
		if route.ClusterID != cfg.ClusterID {
			return ErrClusterMismatch
		}
		path := routePath(route)
		if len(path) < 2 {
			return ErrInvalidRoutePath
		}
		// The config is scoped to this node, so every route must involve it.
		if !contains(path, cfg.LocalNodeID) {
			return ErrNodeMismatch
		}
		if route.ExpiresAt.IsZero() {
			return fmt.Errorf("scoped synthetic route %q expires_at is required", route.RouteID)
		}
		if !route.ExpiresAt.After(now) {
			return ErrRouteExpired
		}
	}
	return nil
}
// validatePeerDirectory rejects directory entries with a blank node ID, an
// entry for the local node itself, duplicate node IDs (compared after
// trimming), or negative endpoint/candidate counts.
func validatePeerDirectory(entries []PeerDirectoryEntry, localNodeID string) error {
	known := make(map[string]struct{}, len(entries))
	for i := range entries {
		entry := entries[i]
		id := strings.TrimSpace(entry.NodeID)
		if id == "" || id == localNodeID {
			return fmt.Errorf("scoped synthetic mesh config contains invalid peer directory entry")
		}
		if _, repeated := known[id]; repeated {
			return fmt.Errorf("scoped synthetic mesh config contains duplicate peer directory entry")
		}
		known[id] = struct{}{}
		if entry.EndpointCount < 0 || entry.CandidateCount < 0 {
			return fmt.Errorf("scoped synthetic mesh config contains invalid peer directory count")
		}
	}
	return nil
}
// validateRecoverySeeds accepts at most 20 seeds, each with a non-blank node
// ID, endpoint and transport, and rejects two seeds that share the same
// trimmed (node ID, endpoint) pair.
func validateRecoverySeeds(seeds []PeerRecoverySeed) error {
	if len(seeds) > 20 {
		return fmt.Errorf("scoped synthetic mesh config contains too many recovery seeds")
	}
	known := make(map[string]struct{}, len(seeds))
	for _, seed := range seeds {
		nodeID := strings.TrimSpace(seed.NodeID)
		endpoint := strings.TrimSpace(seed.Endpoint)
		if nodeID == "" || endpoint == "" || strings.TrimSpace(seed.Transport) == "" {
			return fmt.Errorf("scoped synthetic mesh config contains invalid recovery seed")
		}
		// NUL cannot collide with ordinary ID or endpoint text, so it makes a
		// safe separator for the composite key.
		pair := nodeID + "\x00" + endpoint
		if _, repeated := known[pair]; repeated {
			return fmt.Errorf("scoped synthetic mesh config contains duplicate recovery seed")
		}
		known[pair] = struct{}{}
	}
	return nil
}
// validateRendezvousLeases accepts at most 20 leases. Each lease needs
// non-blank lease, peer and relay IDs, relay endpoint and transport; a peer
// distinct from the relay; ControlPlaneOnly set; a future expiry; valid JSON
// metadata when present; and a lease ID unique within the config.
//
// A lease that names routes must reference only configured routes, and at
// least one of them must have a path containing the local node, the peer and
// the relay. A lease without route IDs skips that scope check.
func validateRendezvousLeases(leases []PeerRendezvousLease, routes []SyntheticRoute, localNodeID string) error {
	if len(leases) > 20 {
		return fmt.Errorf("scoped synthetic mesh config contains too many rendezvous leases")
	}

	configured := map[string]SyntheticRoute{}
	for _, candidate := range routes {
		if strings.TrimSpace(candidate.RouteID) != "" {
			configured[candidate.RouteID] = candidate
		}
	}

	leaseIDs := map[string]struct{}{}
	now := time.Now().UTC()
	for _, lease := range leases {
		missingField := strings.TrimSpace(lease.LeaseID) == "" ||
			strings.TrimSpace(lease.PeerNodeID) == "" ||
			strings.TrimSpace(lease.RelayNodeID) == "" ||
			strings.TrimSpace(lease.RelayEndpoint) == "" ||
			strings.TrimSpace(lease.Transport) == ""
		badShape := lease.PeerNodeID == lease.RelayNodeID || !lease.ControlPlaneOnly
		badExpiry := lease.ExpiresAt.IsZero() || !lease.ExpiresAt.After(now)
		badMetadata := len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)
		if missingField || badShape || badExpiry || badMetadata {
			return fmt.Errorf("scoped synthetic mesh config contains invalid rendezvous lease")
		}

		if _, repeated := leaseIDs[lease.LeaseID]; repeated {
			return fmt.Errorf("scoped synthetic mesh config contains duplicate rendezvous lease")
		}
		leaseIDs[lease.LeaseID] = struct{}{}

		if len(lease.RouteIDs) == 0 {
			continue
		}
		// Every referenced route must exist; at least one must put the local
		// node, the peer and the relay on the same path.
		inScope := false
		for _, routeID := range lease.RouteIDs {
			route, found := configured[routeID]
			if !found {
				return fmt.Errorf("scoped synthetic mesh config contains rendezvous lease for unknown route")
			}
			hops := routePath(route)
			if contains(hops, localNodeID) && contains(hops, lease.PeerNodeID) && contains(hops, lease.RelayNodeID) {
				inScope = true
			}
		}
		if !inScope {
			return fmt.Errorf("scoped synthetic mesh config contains out-of-scope rendezvous lease")
		}
	}
	return nil
}
@@ -0,0 +1,235 @@
package mesh
import (
"encoding/json"
"errors"
"os"
"path/filepath"
"testing"
"time"
)
// TestLoadScopedSyntheticConfig writes a fully populated config for node-a
// (endpoints, candidates, directory, recovery seed, rendezvous lease and one
// three-hop route via node-r) and checks that it loads and round-trips the
// fields asserted below.
func TestLoadScopedSyntheticConfig(t *testing.T) {
// Use a live expiry so the route and lease pass the time-based checks.
expiresAt := time.Now().UTC().Add(time.Hour)
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
ConfigVersion: "config-v1",
PeerDirectoryVersion: "peers-v1",
PolicyVersion: "policy-v1",
PeerEndpoints: map[string]string{"node-b": "http://127.0.0.1:19002"},
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-b",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
NATType: "restricted",
ConnectivityMode: "direct",
Priority: 10,
},
},
},
PeerDirectory: []PeerDirectoryEntry{
{
NodeID: "node-b",
RouteIDs: []string{"route-a-b"},
EndpointCount: 1,
CandidateCount: 1,
ConnectivityModes: []string{"direct"},
RecoverySeed: true,
},
},
RecoverySeeds: []PeerRecoverySeed{
{
NodeID: "node-b",
Endpoint: "https://node-b.example.test:443",
Transport: "direct_tcp_tls",
ConnectivityMode: "direct",
Priority: 10,
},
},
// The lease is in scope because route-a-b contains node-a, node-r and node-b.
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
ConnectivityMode: "relay_required",
RouteIDs: []string{"route-a-b"},
AllowedChannels: []string{"fabric_control", "route_control"},
Priority: 10,
ControlPlaneOnly: true,
IssuedAt: expiresAt.Add(-time.Minute),
ExpiresAt: expiresAt,
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
})
cfg, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err != nil {
t.Fatalf("load scoped config: %v", err)
}
if cfg.ConfigVersion != "config-v1" || cfg.PeerEndpoints["node-b"] == "" || len(cfg.Routes) != 1 {
t.Fatalf("unexpected config: %+v", cfg)
}
if got := cfg.PeerEndpointCandidates["node-b"]; len(got) != 1 || got[0].EndpointID != "node-b-public" {
t.Fatalf("unexpected endpoint candidates: %+v", cfg.PeerEndpointCandidates)
}
if len(cfg.PeerDirectory) != 1 || cfg.PeerDirectory[0].NodeID != "node-b" || !cfg.PeerDirectory[0].RecoverySeed {
t.Fatalf("unexpected peer directory: %+v", cfg.PeerDirectory)
}
if len(cfg.RecoverySeeds) != 1 || cfg.RecoverySeeds[0].NodeID != "node-b" {
t.Fatalf("unexpected recovery seeds: %+v", cfg.RecoverySeeds)
}
if len(cfg.RendezvousLeases) != 1 || cfg.RendezvousLeases[0].RelayNodeID != "node-r" {
t.Fatalf("unexpected rendezvous leases: %+v", cfg.RendezvousLeases)
}
}
func TestLoadScopedSyntheticConfigRejectsWrongCluster(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-2",
LocalNodeID: "node-a",
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if !errors.Is(err, ErrClusterMismatch) {
t.Fatalf("err = %v, want ErrClusterMismatch", err)
}
}
func TestLoadScopedSyntheticConfigRejectsWrongNode(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-x",
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if !errors.Is(err, ErrNodeMismatch) {
t.Fatalf("err = %v, want ErrNodeMismatch", err)
}
}
// TestLoadScopedSyntheticConfigRejectsExpiredRoute checks that a route whose
// expiry is already in the past fails with ErrRouteExpired.
func TestLoadScopedSyntheticConfigRejectsExpiredRoute(t *testing.T) {
	expired := liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})
	expired.ExpiresAt = time.Now().UTC().Add(-time.Minute)

	stale := ScopedSyntheticConfig{
		SchemaVersion: "c17f.synthetic.v1",
		ClusterID:     "cluster-1",
		LocalNodeID:   "node-a",
		Routes:        []SyntheticRoute{expired},
	}
	local := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}

	_, err := LoadScopedSyntheticConfig(writeScopedConfig(t, stale), local)
	if !errors.Is(err, ErrRouteExpired) {
		t.Fatalf("err = %v, want ErrRouteExpired", err)
	}
}
func TestLoadScopedSyntheticConfigRejectsInvalidPeerEndpointCandidate(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerEndpointCandidates: map[string][]PeerEndpointCandidate{
"node-b": {
{
EndpointID: "node-b-public",
NodeID: "node-c",
Transport: "direct_tcp_tls",
Address: "203.0.113.20:443",
Reachability: "public",
ConnectivityMode: "direct",
},
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid peer endpoint candidate error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidPeerDirectory(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
PeerDirectory: []PeerDirectoryEntry{
{NodeID: "node-a"},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid peer directory error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidRecoverySeed(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17f.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RecoverySeeds: []PeerRecoverySeed{
{NodeID: "node-b", Endpoint: "", Transport: "direct_tcp_tls"},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid recovery seed error")
}
}
func TestLoadScopedSyntheticConfigRejectsInvalidRendezvousLease(t *testing.T) {
path := writeScopedConfig(t, ScopedSyntheticConfig{
SchemaVersion: "c17z12.synthetic.v1",
ClusterID: "cluster-1",
LocalNodeID: "node-a",
RendezvousLeases: []PeerRendezvousLease{
{
LeaseID: "lease-node-b-via-node-r",
PeerNodeID: "node-b",
RelayNodeID: "node-r",
RelayEndpoint: "http://node-r:19000",
Transport: "relay_control",
RouteIDs: []string{"route-a-b"},
ExpiresAt: time.Now().UTC().Add(time.Hour),
},
},
Routes: []SyntheticRoute{liveSyntheticRoute("route-a-b", []string{"node-a", "node-r", "node-b"})},
})
_, err := LoadScopedSyntheticConfig(path, PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"})
if err == nil {
t.Fatal("expected invalid rendezvous lease error")
}
}
// writeScopedConfig JSON-encodes cfg into "mesh-config.json" inside a fresh
// per-test temporary directory (mode 0600) and returns the file path. Any
// marshal or write failure aborts the calling test.
func writeScopedConfig(t *testing.T, cfg ScopedSyntheticConfig) string {
	t.Helper()

	encoded, marshalErr := json.Marshal(cfg)
	if marshalErr != nil {
		t.Fatalf("marshal config: %v", marshalErr)
	}

	target := filepath.Join(t.TempDir(), "mesh-config.json")
	writeErr := os.WriteFile(target, encoded, 0o600)
	if writeErr != nil {
		t.Fatalf("write config: %v", writeErr)
	}
	return target
}
@@ -0,0 +1,291 @@
package mesh
import (
	"context"
	"encoding/json"
	"errors"
	"net/http"
	"time"
)
// ProductionEnvelopeObserver is a callback that receives a
// ProductionEnvelopeObservation together with the request context. The forward
// handler invokes it only after an envelope has passed validation; a non-nil
// error (or a panic) from the observer causes the request to be rejected.
type ProductionEnvelopeObserver func(context.Context, ProductionEnvelopeObservation) error
// ProductionForwardLogger is a sink for ProductionForwardLogEntry records
// emitted by the forward handler (accepted, rejected, delivered, forwarded).
type ProductionForwardLogger func(ProductionForwardLogEntry)
// Server exposes the mesh HTTP endpoints (health, production forward and
// synthetic probe) for a single local node. Its methods use value receivers,
// so a Server literal can be used directly to build a handler.
type Server struct {
// Local is this node's identity; it is used to validate incoming messages
// and is reported back in acknowledgements and results.
Local PeerIdentity
// SyntheticRuntime handles synthetic probe envelopes. When nil, the probe
// endpoint answers 503 with ErrMeshRuntimeDisabled.
SyntheticRuntime *SyntheticRuntime
// ProductionForwardingEnabled gates the forward endpoint. When false, every
// POST to it is rejected with 501 before the body is read.
ProductionForwardingEnabled bool
// ProductionEnvelopeObserver, when non-nil, is called for each validated
// production envelope before delivery or forwarding.
ProductionEnvelopeObserver ProductionEnvelopeObserver
// ProductionForwardTransport sends envelopes to the next hop. When nil,
// envelopes not addressed to the local node are rejected with 501.
ProductionForwardTransport ProductionForwardTransport
// ProductionForwardLogger, when non-nil, receives a log entry for each
// forward-handler outcome.
ProductionForwardLogger ProductionForwardLogger
// ProductionRoutes is passed to ValidateProductionEnvelopeRouteConfig when
// validating incoming production envelopes.
ProductionRoutes []SyntheticRoute
}
// Handler returns an http.Handler that routes the three mesh endpoints
// (/mesh/v1/health, /mesh/v1/forward and /mesh/v1/synthetic/probe) to the
// corresponding Server methods.
func (s Server) Handler() http.Handler {
	endpoints := []struct {
		pattern string
		handle  func(http.ResponseWriter, *http.Request)
	}{
		{"/mesh/v1/health", s.handleHealth},
		{"/mesh/v1/forward", s.handleForward},
		{"/mesh/v1/synthetic/probe", s.handleSyntheticProbe},
	}

	router := http.NewServeMux()
	for _, endpoint := range endpoints {
		router.HandleFunc(endpoint.pattern, endpoint.handle)
	}
	return router
}
// handleHealth serves POST /mesh/v1/health.
//
// It decodes a HealthMessage, checks the protocol version, validates the
// sender against the local identity with ValidatePeer, and, when the message
// names a target node, requires that target to be the local node. On success
// it replies with a JSON HealthAck identifying the local node.
//
// Status codes: 405 for non-POST methods, 400 for an undecodable body or an
// unsupported protocol version, 403 for a peer or target mismatch.
func (s Server) handleHealth(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		// A 405 response must advertise the methods the resource supports.
		w.Header().Set("Allow", http.MethodPost)
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}
	var message HealthMessage
	if err := json.NewDecoder(r.Body).Decode(&message); err != nil {
		http.Error(w, "invalid health message", http.StatusBadRequest)
		return
	}
	if message.ProtocolVersion != ProtocolVersion {
		http.Error(w, "unsupported mesh protocol version", http.StatusBadRequest)
		return
	}
	if err := ValidatePeer(s.Local, message.From); err != nil {
		http.Error(w, err.Error(), http.StatusForbidden)
		return
	}
	// An empty target is accepted; a named target must be this node.
	if message.To.NodeID != "" && message.To.NodeID != s.Local.NodeID {
		http.Error(w, ErrNodeMismatch.Error(), http.StatusForbidden)
		return
	}
	w.Header().Set("Content-Type", "application/json")
	// The status line has already been committed once encoding starts, so an
	// encode/write failure cannot be reported to the client; it is ignored.
	_ = json.NewEncoder(w).Encode(HealthAck{
		ProtocolVersion: ProtocolVersion,
		Accepted:        true,
		By:              s.Local,
	})
}
// handleForward serves POST /mesh/v1/forward, the production envelope gate.
//
// Processing order (the first failing step ends the request):
//  1. Non-POST methods receive 405 with an Allow header.
//  2. If ProductionForwardingEnabled is false the request is rejected with
//     501 before the body is read.
//  3. The body is decoded as a ProductionEnvelope (400 on failure).
//  4. The envelope is validated against the local identity and then against
//     ProductionRoutes; failures are mapped by forwardStatusCode.
//  5. The optional observer runs; an error or panic yields 500.
//  6. An envelope whose destination is this node is reported as delivered.
//  7. Otherwise the envelope is sent to its next hop through
//     ProductionForwardTransport with updated hop bookkeeping, and the
//     downstream result is relayed to the caller.
//
// Every outcome after the method check is reported via logProductionForward.
func (s Server) handleForward(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		// A 405 response must advertise the methods the resource supports.
		w.Header().Set("Allow", http.MethodPost)
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}
	if !s.ProductionForwardingEnabled {
		s.logProductionForward(ProductionForwardLogEntry{
			Event:       "production_forward_rejected",
			ClusterID:   s.Local.ClusterID,
			LocalNodeID: s.Local.NodeID,
			Reason:      ErrForwardDisabled.Error(),
			StatusCode:  http.StatusNotImplemented,
			OccurredAt:  time.Now().UTC(),
		})
		http.Error(w, ErrForwardDisabled.Error(), http.StatusNotImplemented)
		return
	}
	var envelope ProductionEnvelope
	if err := json.NewDecoder(r.Body).Decode(&envelope); err != nil {
		// No envelope fields are available yet, so the entry carries only the
		// local identity.
		s.logProductionForward(ProductionForwardLogEntry{
			Event:       "production_forward_rejected",
			ClusterID:   s.Local.ClusterID,
			LocalNodeID: s.Local.NodeID,
			Reason:      "invalid production mesh envelope",
			StatusCode:  http.StatusBadRequest,
			OccurredAt:  time.Now().UTC(),
		})
		http.Error(w, "invalid production mesh envelope", http.StatusBadRequest)
		return
	}
	if err := ValidateProductionEnvelope(s.Local, envelope, time.Now().UTC()); err != nil {
		s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
		return
	}
	if err := ValidateProductionEnvelopeRouteConfig(s.Local, envelope, s.ProductionRoutes, time.Now().UTC()); err != nil {
		s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
		return
	}
	s.logProductionForward(productionForwardLogEntry("production_forward_accepted", s.Local, envelope, "", 0))
	if s.ProductionEnvelopeObserver != nil {
		observation := NewProductionEnvelopeObservation(envelope, time.Now().UTC())
		if err := observeProductionEnvelope(r.Context(), s.ProductionEnvelopeObserver, observation); err != nil {
			// Fail closed: the observer's own error text is not exposed; the
			// fixed ErrForwardObservationFailed message is used instead.
			s.logProductionForward(productionForwardLogEntry("production_forward_rejected", s.Local, envelope, ErrForwardObservationFailed.Error(), http.StatusInternalServerError))
			http.Error(w, ErrForwardObservationFailed.Error(), http.StatusInternalServerError)
			return
		}
	}
	if envelope.DestinationNodeID == s.Local.NodeID {
		s.logProductionForward(productionForwardLogEntry("production_forward_delivered", s.Local, envelope, "", http.StatusOK))
		writeProductionForwardResult(w, ProductionForwardResult{
			Accepted:  true,
			Delivered: true,
			By:        s.Local,
			MessageID: envelope.MessageID,
			RouteID:   envelope.RouteID,
		})
		return
	}
	// Not the destination, yet asked to forward to ourselves: that is a loop.
	if envelope.NextHopNodeID == s.Local.NodeID {
		s.rejectProductionForward(w, envelope, ErrLoopDetected, forwardStatusCode(ErrLoopDetected))
		return
	}
	// Without an explicit route path only a direct hop to the destination can
	// be forwarded.
	if len(envelope.RoutePath) == 0 && envelope.NextHopNodeID != envelope.DestinationNodeID {
		s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
		return
	}
	if s.ProductionForwardTransport == nil {
		s.rejectProductionForward(w, envelope, ErrForwardRuntimeUnavailable, http.StatusNotImplemented)
		return
	}
	// Forwarding decrements TTL, so a TTL of 1 or less cannot take another hop.
	if envelope.TTL <= 1 {
		s.rejectProductionForward(w, envelope, ErrTTLExhausted, forwardStatusCode(ErrTTLExhausted))
		return
	}
	// Build the envelope as the next hop should see it: it becomes the current
	// hop, and its own next hop is derived from the route path.
	forwarded := envelope
	forwarded.CurrentHopNodeID = envelope.NextHopNodeID
	forwarded.NextHopNodeID = nextProductionHopAfter(envelope.RoutePath, envelope.NextHopNodeID, envelope.DestinationNodeID)
	forwarded.TTL = envelope.TTL - 1
	forwarded.HopCount = envelope.HopCount + 1
	// Copy before appending so the decoded envelope's slice is not aliased.
	forwarded.VisitedNodeIDs = append(append([]string{}, envelope.VisitedNodeIDs...), s.Local.NodeID)
	result, err := s.ProductionForwardTransport.SendProduction(r.Context(), envelope.NextHopNodeID, forwarded)
	if err != nil {
		s.rejectProductionForward(w, envelope, err, forwardStatusCode(err))
		return
	}
	s.logProductionForward(productionForwardLogEntry("production_forward_forwarded", s.Local, envelope, "", http.StatusOK))
	// Relay the downstream result, overwriting the fields that describe this
	// hop's own part in the exchange.
	result.Accepted = true
	result.Forwarded = true
	result.By = s.Local
	result.MessageID = envelope.MessageID
	result.RouteID = envelope.RouteID
	result.NextNodeID = envelope.NextHopNodeID
	writeProductionForwardResult(w, result)
}
// rejectProductionForward logs a "production_forward_rejected" entry for the
// envelope and writes err's message to the client with the given status code.
// err must be non-nil.
func (s Server) rejectProductionForward(w http.ResponseWriter, envelope ProductionEnvelope, err error, statusCode int) {
	rejection := productionForwardLogEntry("production_forward_rejected", s.Local, envelope, err.Error(), statusCode)
	s.logProductionForward(rejection)
	http.Error(w, err.Error(), statusCode)
}
// logProductionForward hands entry to the configured ProductionForwardLogger.
// It is a no-op when no logger is set. An entry without a timestamp is stamped
// with the current UTC time before being delivered.
func (s Server) logProductionForward(entry ProductionForwardLogEntry) {
	sink := s.ProductionForwardLogger
	if sink == nil {
		return
	}
	if missingTimestamp := entry.OccurredAt.IsZero(); missingTimestamp {
		entry.OccurredAt = time.Now().UTC()
	}
	sink(entry)
}
// productionForwardLogEntry builds a log entry for event from the envelope's
// routing metadata, the local node ID, and the supplied reason and status
// code. Slice fields are recorded by length only, and the entry is stamped
// with the current UTC time.
func productionForwardLogEntry(event string, local PeerIdentity, envelope ProductionEnvelope, reason string, statusCode int) ProductionForwardLogEntry {
	var entry ProductionForwardLogEntry

	// Outcome supplied by the caller.
	entry.Event = event
	entry.Reason = reason
	entry.StatusCode = statusCode
	entry.LocalNodeID = local.NodeID

	// Identification and routing fields copied from the envelope.
	entry.RouteID = envelope.RouteID
	entry.MessageID = envelope.MessageID
	entry.ClusterID = envelope.ClusterID
	entry.SourceNodeID = envelope.SourceNodeID
	entry.DestinationNodeID = envelope.DestinationNodeID
	entry.CurrentHopNodeID = envelope.CurrentHopNodeID
	entry.NextHopNodeID = envelope.NextHopNodeID
	entry.ChannelClass = envelope.ChannelClass
	entry.MessageType = envelope.MessageType

	// Hop bookkeeping and sizes.
	entry.TTL = envelope.TTL
	entry.HopCount = envelope.HopCount
	entry.RoutePathLength = len(envelope.RoutePath)
	entry.VisitedCount = len(envelope.VisitedNodeIDs)
	entry.PayloadLength = envelope.PayloadLength

	entry.OccurredAt = time.Now().UTC()
	return entry
}
// nextProductionHopAfter returns the node that follows currentNodeID in
// routePath, using the first occurrence of currentNodeID.
//
// If currentNodeID is the final element, currentNodeID itself is returned.
// If routePath is empty or does not contain currentNodeID, destinationNodeID
// is returned.
func nextProductionHopAfter(routePath []string, currentNodeID string, destinationNodeID string) string {
	last := len(routePath) - 1
	for position := 0; position <= last; position++ {
		if routePath[position] != currentNodeID {
			continue
		}
		if position == last {
			return currentNodeID
		}
		return routePath[position+1]
	}
	return destinationNodeID
}
// writeProductionForwardResult writes result to w as a JSON body with the
// Content-Type header set to application/json.
func writeProductionForwardResult(w http.ResponseWriter, result ProductionForwardResult) {
	headers := w.Header()
	headers.Set("Content-Type", "application/json")

	encoder := json.NewEncoder(w)
	// Best effort: an encode/write failure cannot be reported to the client
	// at this point, so it is deliberately discarded.
	_ = encoder.Encode(result)
}
// observeProductionEnvelope calls observer with ctx and observation and
// returns its error. A nil observer is a no-op that returns nil. A panic
// raised by the observer is recovered and reported as
// ErrForwardObservationFailed, so a faulty observer cannot crash the handler.
func observeProductionEnvelope(ctx context.Context, observer ProductionEnvelopeObserver, observation ProductionEnvelopeObservation) (err error) {
	if observer != nil {
		// The named result lets the deferred recovery replace the return value.
		defer func() {
			if recovered := recover(); recovered != nil {
				err = ErrForwardObservationFailed
			}
		}()
		err = observer(ctx, observation)
	}
	return err
}
// handleSyntheticProbe serves POST /mesh/v1/synthetic/probe.
//
// It decodes a SyntheticEnvelope and passes it to SyntheticRuntime.Receive,
// replying with the returned acknowledgement as JSON.
//
// Status codes: 405 for non-POST methods, 503 when no SyntheticRuntime is
// configured, 400 for an undecodable body, and the syntheticStatusCode mapping
// for errors returned by Receive.
func (s Server) handleSyntheticProbe(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		// A 405 response must advertise the methods the resource supports.
		w.Header().Set("Allow", http.MethodPost)
		w.WriteHeader(http.StatusMethodNotAllowed)
		return
	}
	if s.SyntheticRuntime == nil {
		http.Error(w, ErrMeshRuntimeDisabled.Error(), http.StatusServiceUnavailable)
		return
	}
	var envelope SyntheticEnvelope
	if err := json.NewDecoder(r.Body).Decode(&envelope); err != nil {
		http.Error(w, "invalid synthetic mesh envelope", http.StatusBadRequest)
		return
	}
	ack, err := s.SyntheticRuntime.Receive(r.Context(), envelope)
	if err != nil {
		http.Error(w, err.Error(), syntheticStatusCode(err))
		return
	}
	w.Header().Set("Content-Type", "application/json")
	// An encode/write failure cannot be reported to the client once the
	// response has started, so it is ignored.
	_ = json.NewEncoder(w).Encode(ack)
}
// NewHealthMessage builds a HealthMessage from one peer to another using the
// current ProtocolVersion, the current UTC time as ObservedAt, and a link
// status of "reachable".
func NewHealthMessage(from, to PeerIdentity) HealthMessage {
	var message HealthMessage
	message.ProtocolVersion = ProtocolVersion
	message.From = from
	message.To = to
	message.ObservedAt = time.Now().UTC()
	message.LinkStatus = "reachable"
	return message
}
// syntheticStatusCode maps an error from the synthetic runtime to the HTTP
// status returned by the probe endpoint.
//
// Errors are matched with errors.Is, so a sentinel wrapped with extra context
// (for example via fmt.Errorf and %w) maps to the same status as the bare
// sentinel. Unrecognised errors, including nil, map to 400.
func syntheticStatusCode(err error) int {
	switch {
	case errors.Is(err, ErrClusterMismatch),
		errors.Is(err, ErrNodeMismatch),
		errors.Is(err, ErrUnauthorizedChannel),
		errors.Is(err, ErrLoopDetected):
		return http.StatusForbidden
	case errors.Is(err, ErrMeshRuntimeDisabled):
		return http.StatusServiceUnavailable
	case errors.Is(err, ErrRouteExpired),
		errors.Is(err, ErrTTLExhausted),
		errors.Is(err, ErrInvalidRoutePath),
		errors.Is(err, ErrUnsupportedSyntheticMessage),
		errors.Is(err, ErrRouteIDRequired):
		return http.StatusBadRequest
	case errors.Is(err, ErrRouteNotFound),
		errors.Is(err, ErrSyntheticPeerUnavailable):
		return http.StatusNotFound
	default:
		return http.StatusBadRequest
	}
}
// forwardStatusCode maps a production forwarding error to the HTTP status
// returned by the forward endpoint.
//
// Errors are matched with errors.Is, so a sentinel wrapped with extra context
// (for example a transport error that wraps ErrForwardPeerUnavailable) maps to
// the same status as the bare sentinel. Unrecognised errors, including nil,
// map to 400.
func forwardStatusCode(err error) int {
	switch {
	case errors.Is(err, ErrClusterMismatch),
		errors.Is(err, ErrNodeMismatch),
		errors.Is(err, ErrUnauthorizedChannel),
		errors.Is(err, ErrLoopDetected):
		return http.StatusForbidden
	case errors.Is(err, ErrRouteExpired),
		errors.Is(err, ErrTTLExhausted),
		errors.Is(err, ErrInvalidRoutePath),
		errors.Is(err, ErrRouteIDRequired):
		return http.StatusBadRequest
	case errors.Is(err, ErrForwardRuntimeUnavailable):
		return http.StatusNotImplemented
	case errors.Is(err, ErrRouteNotFound):
		return http.StatusNotFound
	case errors.Is(err, ErrForwardPeerUnavailable):
		return http.StatusBadGateway
	default:
		return http.StatusBadRequest
	}
}
@@ -0,0 +1,802 @@
package mesh
import (
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"net/http"
"net/http/httptest"
"testing"
"time"
)
// TestMeshHealthAcceptsSameCluster sends a health message from node-a to
// node-b within the same cluster through the mesh client and expects an
// accepted acknowledgement issued by node-b.
func TestMeshHealthAcceptsSameCluster(t *testing.T) {
	receiver := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	sender := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"}

	srv := httptest.NewServer(Server{Local: receiver}.Handler())
	defer srv.Close()

	meshClient := NewClient(srv.URL)
	ack, err := meshClient.SendHealth(context.Background(), NewHealthMessage(sender, receiver))
	if err != nil {
		t.Fatalf("send health: %v", err)
	}
	if acknowledged := ack.Accepted && ack.By.NodeID == "node-b"; !acknowledged {
		t.Fatalf("unexpected ack: %+v", ack)
	}
}
// TestMeshHealthRejectsClusterMismatch posts a health message whose sender
// belongs to a different cluster and expects 403 Forbidden.
func TestMeshHealthRejectsClusterMismatch(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	srv := httptest.NewServer(Server{Local: self}.Handler())
	defer srv.Close()

	outsider := PeerIdentity{ClusterID: "cluster-2", NodeID: "node-a"}
	body, err := json.Marshal(NewHealthMessage(outsider, self))
	if err != nil {
		t.Fatalf("marshal message: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/health", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post health: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusForbidden
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
func TestMeshForwardingDisabled(t *testing.T) {
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
defer server.Close()
resp, err := http.Post(server.URL+"/mesh/v1/forward", "application/octet-stream", bytes.NewReader([]byte("payload")))
if err != nil {
t.Fatalf("post forward: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusNotImplemented {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusNotImplemented)
}
}
// TestMeshForwardingGateEnabledStillHasNoProductionRuntime enables the forward
// gate without configuring a forward transport, posts the envelope produced by
// validProductionEnvelope, and expects 501 Not Implemented.
func TestMeshForwardingGateEnabledStillHasNoProductionRuntime(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	gate := Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
	}
	srv := httptest.NewServer(gate.Handler())
	defer srv.Close()

	body, err := json.Marshal(validProductionEnvelope(self))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusNotImplemented
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateDeliversFabricControlAtDestination posts an envelope
// whose destination, current hop and next hop are all the local node. It
// expects 200 with a result marked accepted and delivered (not forwarded) by
// the local node, and both "accepted" and "delivered" log events.
func TestMeshForwardingGateDeliversFabricControlAtDestination(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}

	var logged []ProductionForwardLogEntry
	record := func(entry ProductionForwardLogEntry) {
		logged = append(logged, entry)
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionForwardLogger:     record,
	}.Handler())
	defer srv.Close()

	inbound := validProductionEnvelope(self)
	inbound.SourceNodeID = "node-a"
	inbound.DestinationNodeID = self.NodeID
	inbound.CurrentHopNodeID = self.NodeID
	inbound.NextHopNodeID = self.NodeID
	body, err := json.Marshal(inbound)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusOK
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}

	var outcome ProductionForwardResult
	if err := json.NewDecoder(res.Body).Decode(&outcome); err != nil {
		t.Fatalf("decode result: %v", err)
	}
	deliveredLocally := outcome.Accepted && outcome.Delivered && !outcome.Forwarded && outcome.By.NodeID == self.NodeID
	if !deliveredLocally {
		t.Fatalf("unexpected result: %+v", outcome)
	}

	for _, event := range []string{"production_forward_accepted", "production_forward_delivered"} {
		if !hasProductionForwardEvent(logged, event) {
			t.Fatalf("missing production forward events: %+v", logged)
		}
	}
}
// TestMeshForwardingGateForwardsDirectFabricControlToNextHop wires two servers
// (node-b -> node-c) and posts to node-b an envelope destined for node-c with
// no explicit route path. It expects node-b to forward the envelope, the
// result to report both forwarded and delivered, and node-c's observer to see
// the envelope with itself as the current hop.
func TestMeshForwardingGateForwardsDirectFabricControlToNextHop(t *testing.T) {
// node-c is the destination; its observer captures what finally arrives.
nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
var deliveredObservation ProductionEnvelopeObservation
serverC := httptest.NewServer(Server{
Local: nodeC,
ProductionForwardingEnabled: true,
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
deliveredObservation = observation
return nil
},
}.Handler())
defer serverC.Close()
// node-b is the entry point and can reach node-c over the HTTP transport.
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
serverB := httptest.NewServer(Server{
Local: nodeB,
ProductionForwardingEnabled: true,
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
nodeC.NodeID: serverC.URL,
}),
}.Handler())
defer serverB.Close()
envelope := validProductionEnvelope(nodeB)
envelope.SourceNodeID = "node-a"
envelope.DestinationNodeID = nodeC.NodeID
envelope.CurrentHopNodeID = nodeB.NodeID
envelope.NextHopNodeID = nodeC.NodeID
payload, err := json.Marshal(envelope)
if err != nil {
t.Fatalf("marshal envelope: %v", err)
}
resp, err := http.Post(serverB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
if err != nil {
t.Fatalf("post forward: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
}
var result ProductionForwardResult
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
t.Fatalf("decode result: %v", err)
}
// The result is issued by node-b and also carries the downstream delivery flag.
if !result.Accepted || !result.Forwarded || !result.Delivered || result.NextNodeID != nodeC.NodeID || result.By.NodeID != nodeB.NodeID {
t.Fatalf("unexpected forward result: %+v", result)
}
if deliveredObservation.CurrentHopNodeID != nodeC.NodeID || deliveredObservation.MessageID != envelope.MessageID {
t.Fatalf("destination did not observe forwarded envelope: %+v", deliveredObservation)
}
}
// TestMeshForwardingGateForwardsMultiHopFabricControlByRoutePath wires three
// servers (node-b -> node-r -> node-c) and posts to node-b an envelope with an
// explicit route path node-a, node-b, node-r, node-c. It expects the envelope
// to reach node-c, the visited list to contain node-b then node-r, and both
// relays to log a "production_forward_forwarded" event.
func TestMeshForwardingGateForwardsMultiHopFabricControlByRoutePath(t *testing.T) {
// node-c is the destination; its observer captures what finally arrives.
nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
var deliveredObservation ProductionEnvelopeObservation
var nodeREvents []ProductionForwardLogEntry
var nodeBEvents []ProductionForwardLogEntry
serverC := httptest.NewServer(Server{
Local: nodeC,
ProductionForwardingEnabled: true,
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
deliveredObservation = observation
return nil
},
}.Handler())
defer serverC.Close()
// node-r is the intermediate relay; it only knows how to reach node-c.
nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
serverR := httptest.NewServer(Server{
Local: nodeR,
ProductionForwardingEnabled: true,
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
nodeC.NodeID: serverC.URL,
}),
ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
nodeREvents = append(nodeREvents, entry)
},
}.Handler())
defer serverR.Close()
// node-b is the entry point; it only knows how to reach node-r.
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
serverB := httptest.NewServer(Server{
Local: nodeB,
ProductionForwardingEnabled: true,
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
nodeR.NodeID: serverR.URL,
}),
ProductionForwardLogger: func(entry ProductionForwardLogEntry) {
nodeBEvents = append(nodeBEvents, entry)
},
}.Handler())
defer serverB.Close()
envelope := validProductionEnvelope(nodeB)
envelope.SourceNodeID = "node-a"
envelope.DestinationNodeID = nodeC.NodeID
envelope.CurrentHopNodeID = nodeB.NodeID
envelope.NextHopNodeID = nodeR.NodeID
envelope.RoutePath = []string{"node-a", nodeB.NodeID, nodeR.NodeID, nodeC.NodeID}
payload, err := json.Marshal(envelope)
if err != nil {
t.Fatalf("marshal envelope: %v", err)
}
resp, err := http.Post(serverB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
if err != nil {
t.Fatalf("post forward: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
}
var result ProductionForwardResult
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
t.Fatalf("decode result: %v", err)
}
// The caller sees node-b's view: next node is node-r, with delivery relayed back.
if !result.Accepted || !result.Forwarded || !result.Delivered || result.NextNodeID != nodeR.NodeID || result.By.NodeID != nodeB.NodeID {
t.Fatalf("unexpected multi-hop result: %+v", result)
}
if deliveredObservation.CurrentHopNodeID != nodeC.NodeID || deliveredObservation.NextHopNodeID != nodeC.NodeID {
t.Fatalf("destination did not observe final hop: %+v", deliveredObservation)
}
// Each relay appends itself to the visited list before forwarding.
if len(deliveredObservation.VisitedNodeIDs) != 2 || deliveredObservation.VisitedNodeIDs[0] != nodeB.NodeID || deliveredObservation.VisitedNodeIDs[1] != nodeR.NodeID {
t.Fatalf("visited path not propagated: %+v", deliveredObservation.VisitedNodeIDs)
}
if !hasProductionForwardEvent(nodeBEvents, "production_forward_forwarded") || !hasProductionForwardEvent(nodeREvents, "production_forward_forwarded") {
t.Fatalf("missing relay forward events: nodeB=%+v nodeR=%+v", nodeBEvents, nodeREvents)
}
}
// TestMeshForwardingGateForwardsConfiguredProductionRoute repeats the
// multi-hop scenario (node-b -> node-r -> node-c) with the same configured
// production route installed on all three servers, and expects the envelope to
// be delivered at node-c carrying that route's ID.
func TestMeshForwardingGateForwardsConfiguredProductionRoute(t *testing.T) {
nodeC := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-c"}
// The same route definition is shared by every hop's ProductionRoutes.
route := configuredProductionRoute("route-1", []string{"node-a", "node-b", "node-r", nodeC.NodeID})
var deliveredObservation ProductionEnvelopeObservation
serverC := httptest.NewServer(Server{
Local: nodeC,
ProductionForwardingEnabled: true,
ProductionRoutes: []SyntheticRoute{route},
ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
deliveredObservation = observation
return nil
},
}.Handler())
defer serverC.Close()
nodeR := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"}
serverR := httptest.NewServer(Server{
Local: nodeR,
ProductionForwardingEnabled: true,
ProductionRoutes: []SyntheticRoute{route},
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
nodeC.NodeID: serverC.URL,
}),
}.Handler())
defer serverR.Close()
nodeB := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
serverB := httptest.NewServer(Server{
Local: nodeB,
ProductionForwardingEnabled: true,
ProductionRoutes: []SyntheticRoute{route},
ProductionForwardTransport: NewHTTPProductionForwardTransport(map[string]string{
nodeR.NodeID: serverR.URL,
}),
}.Handler())
defer serverB.Close()
envelope := validProductionEnvelope(nodeB)
envelope.SourceNodeID = "node-a"
envelope.DestinationNodeID = nodeC.NodeID
envelope.CurrentHopNodeID = nodeB.NodeID
envelope.NextHopNodeID = nodeR.NodeID
// The envelope's path is taken from the configured route's hops.
envelope.RoutePath = route.Hops
payload, err := json.Marshal(envelope)
if err != nil {
t.Fatalf("marshal envelope: %v", err)
}
resp, err := http.Post(serverB.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(payload))
if err != nil {
t.Fatalf("post forward: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusOK)
}
if deliveredObservation.RouteID != route.RouteID || deliveredObservation.CurrentHopNodeID != nodeC.NodeID {
t.Fatalf("configured route was not delivered: %+v", deliveredObservation)
}
}
// TestMeshForwardingGateRejectsUnknownConfiguredProductionRoute configures the
// server with only "route-other", posts the envelope produced by
// validProductionEnvelope, and expects 404 Not Found.
func TestMeshForwardingGateRejectsUnknownConfiguredProductionRoute(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	unrelated := configuredProductionRoute("route-other", []string{"node-a", self.NodeID, "node-c"})
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            []SyntheticRoute{unrelated},
	}.Handler())
	defer srv.Close()

	body, err := json.Marshal(validProductionEnvelope(self))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusNotFound
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsConfiguredProductionRouteWrongNextHop posts an
// envelope that follows a configured route's path but names node-c as the next
// hop, skipping node-r, and expects 400 Bad Request.
func TestMeshForwardingGateRejectsConfiguredProductionRouteWrongNextHop(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	configured := configuredProductionRoute("route-1", []string{"node-a", self.NodeID, "node-r", "node-c"})
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionRoutes:            []SyntheticRoute{configured},
	}.Handler())
	defer srv.Close()

	inbound := validProductionEnvelope(self)
	inbound.SourceNodeID = "node-a"
	inbound.DestinationNodeID = "node-c"
	inbound.CurrentHopNodeID = self.NodeID
	inbound.NextHopNodeID = "node-c"
	inbound.RoutePath = configured.Hops
	body, err := json.Marshal(inbound)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusBadRequest
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsRoutePathWrongNextHop posts an envelope whose
// next hop ("node-x") does not appear in its own route path and expects 400
// Bad Request together with a "production_forward_rejected" log event.
func TestMeshForwardingGateRejectsRoutePathWrongNextHop(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	var logged []ProductionForwardLogEntry
	record := func(entry ProductionForwardLogEntry) {
		logged = append(logged, entry)
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionForwardLogger:     record,
	}.Handler())
	defer srv.Close()

	inbound := validProductionEnvelope(self)
	inbound.SourceNodeID = "node-a"
	inbound.DestinationNodeID = "node-c"
	inbound.CurrentHopNodeID = self.NodeID
	inbound.NextHopNodeID = "node-x"
	inbound.RoutePath = []string{"node-a", self.NodeID, "node-r", "node-c"}
	body, err := json.Marshal(inbound)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusBadRequest
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if rejected := hasProductionForwardEvent(logged, "production_forward_rejected"); !rejected {
		t.Fatalf("missing reject event: %+v", logged)
	}
}
// TestMeshForwardingGateRejectsRoutePathLoop posts an envelope whose route
// path lists the local node twice and expects 403 Forbidden.
func TestMeshForwardingGateRejectsRoutePathLoop(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	gate := Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
	}
	srv := httptest.NewServer(gate.Handler())
	defer srv.Close()

	inbound := validProductionEnvelope(self)
	inbound.SourceNodeID = "node-a"
	inbound.DestinationNodeID = "node-c"
	inbound.CurrentHopNodeID = self.NodeID
	inbound.NextHopNodeID = "node-r"
	inbound.RoutePath = []string{"node-a", self.NodeID, "node-r", self.NodeID, "node-c"}
	body, err := json.Marshal(inbound)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusForbidden
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsInvalidProductionEnvelope replaces the
// envelope's payload hash with "bad-hash" and expects 400 Bad Request.
func TestMeshForwardingGateRejectsInvalidProductionEnvelope(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	gate := Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
	}
	srv := httptest.NewServer(gate.Handler())
	defer srv.Close()

	tampered := validProductionEnvelope(self)
	tampered.PayloadHash = "bad-hash"
	body, err := json.Marshal(tampered)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusBadRequest
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateRejectsOversizedProductionEnvelopePayload builds a JSON
// string payload containing MaxProductionEnvelopePayloadBytes+1 characters,
// with a matching length and SHA-256 hash so that only the size is at fault.
// It expects 400 Bad Request and that the observer is never called.
func TestMeshForwardingGateRejectsOversizedProductionEnvelopePayload(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	observerCalled := false
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error {
			observerCalled = true
			return nil
		},
	}.Handler())
	defer srv.Close()

	// Assemble `"aaaa…a"`: the filler wrapped in JSON string quotes.
	filler := bytes.Repeat([]byte("a"), MaxProductionEnvelopePayloadBytes+1)
	quoted := make([]byte, 0, len(filler)+2)
	quoted = append(quoted, '"')
	quoted = append(quoted, filler...)
	quoted = append(quoted, '"')

	oversized := validProductionEnvelope(self)
	oversized.Payload = json.RawMessage(quoted)
	digest := sha256.Sum256(oversized.Payload)
	oversized.PayloadLength = len(oversized.Payload)
	oversized.PayloadHash = hex.EncodeToString(digest[:])
	body, err := json.Marshal(oversized)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusBadRequest
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if observerCalled {
		t.Fatal("observer called for oversized envelope")
	}
}
// TestMeshForwardingGateRejectsFutureCreatedAt sets the envelope's CreatedAt
// one second beyond MaxProductionEnvelopeFutureSkew from now (with ExpiresAt a
// minute later) and expects 400 Bad Request without the observer being called.
func TestMeshForwardingGateRejectsFutureCreatedAt(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	observerCalled := false
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(context.Context, ProductionEnvelopeObservation) error {
			observerCalled = true
			return nil
		},
	}.Handler())
	defer srv.Close()

	postdated := validProductionEnvelope(self)
	postdated.CreatedAt = time.Now().UTC().Add(MaxProductionEnvelopeFutureSkew + time.Second)
	postdated.ExpiresAt = postdated.CreatedAt.Add(time.Minute)
	body, err := json.Marshal(postdated)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusBadRequest
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if observerCalled {
		t.Fatal("observer called for future-created envelope")
	}
}
// TestMeshForwardingGateObservesValidEnvelopeWithoutPayload posts a valid
// envelope to a server that has an observer but no forward transport. The
// request itself ends in 501 Not Implemented, but the observer must already
// have received the envelope's message ID, route ID, payload hash and payload
// length.
func TestMeshForwardingGateObservesValidEnvelopeWithoutPayload(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}

	var seen ProductionEnvelopeObservation
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver: func(_ context.Context, observation ProductionEnvelopeObservation) error {
			seen = observation
			return nil
		},
	}.Handler())
	defer srv.Close()

	sent := validProductionEnvelope(self)
	body, err := json.Marshal(sent)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}

	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	const want = http.StatusNotImplemented
	if got := res.StatusCode; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}

	sameIdentity := seen.MessageID == sent.MessageID && seen.RouteID == sent.RouteID
	if !sameIdentity {
		t.Fatalf("unexpected observation: %+v", seen)
	}
	samePayloadMetadata := seen.PayloadHash == sent.PayloadHash && seen.PayloadLength == sent.PayloadLength
	if !samePayloadMetadata {
		t.Fatalf("payload metadata missing from observation: %+v", seen)
	}
}
// TestMeshForwardingGateDoesNotObserveRejectedEnvelope sends an envelope for
// the wrong cluster and checks that it is refused with 403 without the
// observer ever being invoked.
func TestMeshForwardingGateDoesNotObserveRejectedEnvelope(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	called := false
	markCalled := func(context.Context, ProductionEnvelopeObservation) error {
		called = true
		return nil
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  markCalled,
	}.Handler())
	defer srv.Close()

	foreign := validProductionEnvelope(self)
	foreign.ClusterID = "wrong-cluster"
	body, err := json.Marshal(foreign)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusForbidden; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
	if called {
		t.Fatal("observer called for rejected envelope")
	}
}
// TestMeshForwardingGateFailsClosedWhenObservationFails checks that an
// observer returning an error turns an otherwise valid forward request into a
// 500 Internal Server Error.
func TestMeshForwardingGateFailsClosedWhenObservationFails(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	failing := func(context.Context, ProductionEnvelopeObservation) error {
		return errors.New("observer down")
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  failing,
	}.Handler())
	defer srv.Close()

	body, err := json.Marshal(validProductionEnvelope(self))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusInternalServerError; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestMeshForwardingGateFailsClosedWhenObservationPanics checks that a
// panicking observer still results in a 500 Internal Server Error response
// for an otherwise valid forward request.
func TestMeshForwardingGateFailsClosedWhenObservationPanics(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	panicking := func(context.Context, ProductionEnvelopeObservation) error {
		panic("observer panic")
	}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  panicking,
	}.Handler())
	defer srv.Close()

	body, err := json.Marshal(validProductionEnvelope(self))
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusInternalServerError; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
// TestObserveProductionEnvelopeAllowsNilObserver checks that calling
// observeProductionEnvelope with a nil observer reports no error.
func TestObserveProductionEnvelopeAllowsNilObserver(t *testing.T) {
	err := observeProductionEnvelope(context.Background(), nil, ProductionEnvelopeObservation{})
	if err != nil {
		t.Fatalf("observeProductionEnvelope nil observer err = %v", err)
	}
}
// TestProductionEnvelopeObservationSinkKeepsBoundedMetadata forwards three
// envelopes through a sink of capacity two and checks that only the two most
// recent observations remain, that they carry payload metadata, and that the
// sink metrics report three accepted and one dropped.
func TestProductionEnvelopeObservationSinkKeepsBoundedMetadata(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	sink := NewProductionEnvelopeObservationSink(2)
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
		ProductionEnvelopeObserver:  sink.Observe,
	}.Handler())
	defer srv.Close()

	for n := 1; n <= 3; n++ {
		env := validProductionEnvelope(self)
		// n stays in 1..3, so a single ASCII digit is enough for the suffix.
		env.MessageID = "message-" + string(rune('0'+n))
		body, err := json.Marshal(env)
		if err != nil {
			t.Fatalf("marshal envelope: %v", err)
		}
		res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
		if err != nil {
			t.Fatalf("post forward: %v", err)
		}
		res.Body.Close()
		if got, want := res.StatusCode, http.StatusNotImplemented; got != want {
			t.Fatalf("status = %d, want %d", got, want)
		}
	}

	kept := sink.Snapshot()
	if len(kept) != 2 {
		t.Fatalf("observation count = %d, want 2", len(kept))
	}
	inOrder := kept[0].MessageID == "message-2" && kept[1].MessageID == "message-3"
	if !inOrder {
		t.Fatalf("unexpected bounded observations: %+v", kept)
	}
	if kept[0].PayloadHash == "" || kept[0].PayloadLength == 0 {
		t.Fatalf("payload metadata missing from bounded observation: %+v", kept[0])
	}

	m := sink.Metrics()
	metricsOK := m.Capacity == 2 && m.CurrentDepth == 2 && m.AcceptedTotal == 3 && m.DroppedOldest == 1
	if !metricsOK {
		t.Fatalf("unexpected sink metrics: %+v", m)
	}
}
// TestProductionEnvelopeObservationSinkMetricsStartEmpty checks that a fresh
// sink reports its configured capacity and zero for every counter.
func TestProductionEnvelopeObservationSinkMetricsStartEmpty(t *testing.T) {
	m := NewProductionEnvelopeObservationSink(3).Metrics()
	pristine := m.Capacity == 3 && m.CurrentDepth == 0 && m.AcceptedTotal == 0 && m.DroppedOldest == 0
	if !pristine {
		t.Fatalf("unexpected empty metrics: %+v", m)
	}
}
// TestMeshForwardingGateRejectsServiceChannel checks that an envelope whose
// channel class is "render" is refused with 403 Forbidden.
func TestMeshForwardingGateRejectsServiceChannel(t *testing.T) {
	self := PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}
	srv := httptest.NewServer(Server{
		Local:                       self,
		ProductionForwardingEnabled: true,
	}.Handler())
	defer srv.Close()

	env := validProductionEnvelope(self)
	env.ChannelClass = "render"
	body, err := json.Marshal(env)
	if err != nil {
		t.Fatalf("marshal envelope: %v", err)
	}
	res, err := http.Post(srv.URL+"/mesh/v1/forward", "application/json", bytes.NewReader(body))
	if err != nil {
		t.Fatalf("post forward: %v", err)
	}
	defer res.Body.Close()

	if got, want := res.StatusCode, http.StatusForbidden; got != want {
		t.Fatalf("status = %d, want %d", got, want)
	}
}
func TestMeshForwardingRequiresPost(t *testing.T) {
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
defer server.Close()
resp, err := http.Get(server.URL + "/mesh/v1/forward")
if err != nil {
t.Fatalf("get forward: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusMethodNotAllowed {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusMethodNotAllowed)
}
}
// validProductionEnvelope returns a fabric-control envelope addressed to local
// as the current hop. The route runs node-a -> local -> node-c, the envelope
// is created now and expires one minute later, and PayloadLength/PayloadHash
// are derived from the small JSON payload it carries.
func validProductionEnvelope(local PeerIdentity) ProductionEnvelope {
	body := json.RawMessage(`{"kind":"control"}`)
	digest := sha256.Sum256(body)
	createdAt := time.Now().UTC()

	envelope := ProductionEnvelope{
		FabricProtocolVersion: ProtocolVersion,
		MessageID:             "message-1",
		RouteID:               "route-1",
		ClusterID:             local.ClusterID,
		SourceNodeID:          "node-a",
		DestinationNodeID:     "node-c",
		CurrentHopNodeID:      local.NodeID,
		NextHopNodeID:         "node-c",
		ChannelClass:          ProductionChannelFabricControl,
		MessageType:           ProductionMessageFabricControl,
		TTL:                   4,
		HopCount:              1,
	}
	envelope.CreatedAt = createdAt
	envelope.ExpiresAt = createdAt.Add(time.Minute)
	envelope.PayloadLength = len(body)
	envelope.PayloadHash = hex.EncodeToString(digest[:])
	envelope.Payload = body
	return envelope
}
// configuredProductionRoute builds a cluster-1 route over hops that allows
// only the production fabric-control channel, expires in one hour, and caps
// TTL and hop count at 8. hops must be non-empty: its first and last entries
// become the source and destination. The hop list is copied so the caller's
// slice is not shared with the route.
func configuredProductionRoute(routeID string, hops []string) SyntheticRoute {
	source := hops[0]
	destination := hops[len(hops)-1]
	path := append([]string{}, hops...)

	route := SyntheticRoute{
		RouteID:           routeID,
		ClusterID:         "cluster-1",
		SourceNodeID:      source,
		DestinationNodeID: destination,
		Hops:              path,
		AllowedChannels:   []string{ProductionChannelFabricControl},
		ExpiresAt:         time.Now().UTC().Add(time.Hour),
		MaxTTL:            8,
		MaxHops:           8,
	}
	return route
}
// hasProductionForwardEvent reports whether any entry in events has its Event
// field equal to event.
func hasProductionForwardEvent(events []ProductionForwardLogEntry, event string) bool {
	for i := range events {
		if events[i].Event == event {
			return true
		}
	}
	return false
}
func TestSyntheticEndpointDisabledByDefault(t *testing.T) {
server := httptest.NewServer(Server{Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-b"}}.Handler())
defer server.Close()
resp, err := http.Post(server.URL+"/mesh/v1/synthetic/probe", "application/json", bytes.NewReader([]byte(`{}`)))
if err != nil {
t.Fatalf("post synthetic probe: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusServiceUnavailable {
t.Fatalf("status = %d, want %d", resp.StatusCode, http.StatusServiceUnavailable)
}
}
@@ -0,0 +1,280 @@
package mesh
import (
"sync"
"time"
)
// SyntheticRelaySchedulerConfig carries the settings that
// NewSyntheticRelayScheduler uses to build a SyntheticRelayScheduler.
type SyntheticRelaySchedulerConfig struct {
// Enabled turns the scheduler on. When false, Enqueue and Dequeue fail with
// ErrMeshRuntimeDisabled.
Enabled bool
// Local is this node's identity. Envelopes must belong to its cluster and
// be addressed to its node ID to be accepted.
Local PeerIdentity
// QueuePolicies lists the per-channel queue policies. Slice order is the
// dequeue priority order. When empty, built-in defaults are used.
QueuePolicies []SyntheticRelayQueuePolicy
// AllowedChannels adds channel names to the allowed set. A channel still
// needs a queue policy for its envelopes to pass validation.
AllowedChannels []string
// AllowedMessageTypes lists the accepted message types. When empty, the
// built-in synthetic message types are allowed.
AllowedMessageTypes []string
// Now supplies timestamps for log entries. Nil defaults to time.Now in UTC.
Now func() time.Time
// Logger receives scheduler log entries. Nil disables logging.
Logger func(SyntheticLogEntry)
}
// SyntheticRelayScheduler holds one FIFO queue of synthetic envelopes per
// channel and hands them out in channel priority order. Build it with
// NewSyntheticRelayScheduler.
type SyntheticRelayScheduler struct {
// enabled gates Enqueue and Dequeue; both fail with ErrMeshRuntimeDisabled
// when it is false.
enabled bool
// local is the identity envelopes are validated against.
local PeerIdentity
// policies maps a channel name to its queue policy (capacity, droppable).
policies map[string]SyntheticRelayQueuePolicy
// allowedChannels is the set of channel names accepted by validation.
allowedChannels map[string]struct{}
// allowedMessageTypes is the set of message types accepted by validation.
allowedMessageTypes map[string]struct{}
// priorityOrder lists channels in the order Dequeue scans them.
priorityOrder []string
// now supplies the OccurredAt timestamp for log entries.
now func() time.Time
// logger receives log entries; it may be nil.
logger func(SyntheticLogEntry)
// mu guards queues and metrics.
mu sync.Mutex
// queues holds the pending envelopes per channel, oldest first.
queues map[string][]SyntheticEnvelope
// metrics accumulates counters and per-channel depths.
metrics SyntheticRelayQueueMetrics
}
// NewSyntheticRelayScheduler builds a scheduler from cfg.
//
// Defaults applied when cfg leaves a field empty:
//   - QueuePolicies: fabric_control and route_control (capacity 64, reliable)
//     followed by telemetry (capacity 16, droppable).
//   - AllowedMessageTypes: the built-in synthetic probe, route-health,
//     telemetry and test-service message types.
//   - Now: time.Now in UTC.
//
// Policies with an empty channel name are ignored and a non-positive capacity
// is raised to 1. The order of the policies is the dequeue priority order. If
// a channel appears more than once, the later policy replaces the earlier one
// but the channel keeps a single priority slot (its first position), so
// Dequeue never scans the same channel twice.
func NewSyntheticRelayScheduler(cfg SyntheticRelaySchedulerConfig) *SyntheticRelayScheduler {
	policies := cfg.QueuePolicies
	if len(policies) == 0 {
		policies = []SyntheticRelayQueuePolicy{
			{Channel: SyntheticChannelFabricControl, Capacity: 64, Droppable: false},
			{Channel: SyntheticChannelRouteControl, Capacity: 64, Droppable: false},
			{Channel: SyntheticChannelTelemetry, Capacity: 16, Droppable: true},
		}
	}

	policyMap := make(map[string]SyntheticRelayQueuePolicy, len(policies))
	allowedChannels := make(map[string]struct{}, len(policies)+len(cfg.AllowedChannels))
	priorityOrder := make([]string, 0, len(policies))
	for _, policy := range policies {
		if policy.Channel == "" {
			continue
		}
		if policy.Capacity <= 0 {
			policy.Capacity = 1
		}
		// Record the priority slot only the first time a channel is seen.
		if _, seen := policyMap[policy.Channel]; !seen {
			priorityOrder = append(priorityOrder, policy.Channel)
		}
		policyMap[policy.Channel] = policy
		allowedChannels[policy.Channel] = struct{}{}
	}
	for _, channel := range cfg.AllowedChannels {
		if channel != "" {
			allowedChannels[channel] = struct{}{}
		}
	}

	messageTypes := cfg.AllowedMessageTypes
	if len(messageTypes) == 0 {
		messageTypes = []string{
			SyntheticMessageProbe,
			SyntheticMessageProbeAck,
			SyntheticMessageRouteHealth,
			SyntheticMessageRouteHealthAck,
			SyntheticMessageTelemetry,
			SyntheticMessageTestService,
			SyntheticMessageTestServiceAck,
		}
	}
	allowedMessageTypes := make(map[string]struct{}, len(messageTypes))
	for _, messageType := range messageTypes {
		if messageType != "" {
			allowedMessageTypes[messageType] = struct{}{}
		}
	}

	now := cfg.Now
	if now == nil {
		now = func() time.Time { return time.Now().UTC() }
	}

	return &SyntheticRelayScheduler{
		enabled:             cfg.Enabled,
		local:               cfg.Local,
		policies:            policyMap,
		allowedChannels:     allowedChannels,
		allowedMessageTypes: allowedMessageTypes,
		priorityOrder:       priorityOrder,
		now:                 now,
		logger:              cfg.Logger,
		queues:              map[string][]SyntheticEnvelope{},
		metrics: SyntheticRelayQueueMetrics{
			QueueDepths: map[string]int{},
		},
	}
}
// Enqueue validates envelope and appends it to the queue of its channel.
//
// An envelope that fails validation is counted as rejected and the validation
// error is returned. When the channel's queue is already at capacity, a
// non-droppable channel refuses the new envelope with
// ErrSyntheticRelayQueueFull, while a droppable channel evicts its oldest
// entry to make room; the eviction is reported through Dropped and
// DroppedSequence in the result.
//
// Log entries are emitted after the mutex is released, so the logger never
// runs while the scheduler lock is held.
func (s *SyntheticRelayScheduler) Enqueue(envelope SyntheticEnvelope) (SyntheticRelayEnqueueResult, error) {
	if err := s.validateEnvelope(envelope); err != nil {
		s.reject(envelope, err)
		return SyntheticRelayEnqueueResult{}, err
	}
	policy := s.policies[envelope.Channel]
	result := SyntheticRelayEnqueueResult{
		Channel:          envelope.Channel,
		QueueCapacity:    policy.Capacity,
		AcceptedSequence: envelope.Sequence,
	}

	s.mu.Lock()
	queue := s.queues[envelope.Channel]
	if len(queue) >= policy.Capacity {
		if !policy.Droppable {
			s.metrics.Rejected++
			s.metrics.LastRejectReason = ErrSyntheticRelayQueueFull.Error()
			s.mu.Unlock()
			s.log(SyntheticLogEntry{
				Event:         "fabric_relay_rejected",
				RouteID:       envelope.RouteID,
				ClusterID:     envelope.ClusterID,
				LocalNodeID:   s.local.NodeID,
				Channel:       envelope.Channel,
				MessageType:   envelope.MessageType,
				Reason:        ErrSyntheticRelayQueueFull.Error(),
				QueueDepth:    len(queue),
				QueueCapacity: policy.Capacity,
				OccurredAt:    s.now(),
			})
			return SyntheticRelayEnqueueResult{}, ErrSyntheticRelayQueueFull
		}
		result.Dropped = true
		result.DroppedSequence = queue[0].Sequence
		// Clear the evicted slot before reslicing: the backing array outlives
		// the reslice and would otherwise keep the dropped envelope (and
		// anything it references) reachable until the array is reallocated.
		queue[0] = SyntheticEnvelope{}
		queue = queue[1:]
		s.metrics.Dropped++
	}
	queue = append(queue, envelope)
	s.queues[envelope.Channel] = queue
	result.QueueDepth = len(queue)
	s.metrics.Enqueued++
	s.metrics.QueueDepths[envelope.Channel] = len(queue)
	s.mu.Unlock()

	s.log(SyntheticLogEntry{
		Event:           "fabric_relay_enqueued",
		RouteID:         envelope.RouteID,
		ClusterID:       envelope.ClusterID,
		LocalNodeID:     s.local.NodeID,
		Channel:         envelope.Channel,
		MessageType:     envelope.MessageType,
		QueueDepth:      result.QueueDepth,
		QueueCapacity:   result.QueueCapacity,
		Dropped:         result.Dropped,
		DroppedSequence: result.DroppedSequence,
		OccurredAt:      s.now(),
	})
	return result, nil
}
// Dequeue removes and returns the oldest envelope from the highest-priority
// non-empty channel, scanning channels in the configured priority order.
//
// It returns ErrMeshRuntimeDisabled when the scheduler is nil or disabled
// (matching how Enqueue treats a nil scheduler) and
// ErrSyntheticRelayQueueEmpty when every queue is empty. The log entry is
// emitted after the mutex is released.
func (s *SyntheticRelayScheduler) Dequeue() (SyntheticEnvelope, error) {
	if s == nil || !s.enabled {
		return SyntheticEnvelope{}, ErrMeshRuntimeDisabled
	}
	s.mu.Lock()
	for _, channel := range s.priorityOrder {
		queue := s.queues[channel]
		if len(queue) == 0 {
			continue
		}
		envelope := queue[0]
		// Clear the vacated slot so the backing array does not keep the
		// returned envelope's referenced data alive after the caller is done.
		queue[0] = SyntheticEnvelope{}
		queue = queue[1:]
		s.queues[channel] = queue
		s.metrics.Dequeued++
		s.metrics.QueueDepths[channel] = len(queue)
		s.mu.Unlock()
		s.log(SyntheticLogEntry{
			Event:         "fabric_relay_dequeued",
			RouteID:       envelope.RouteID,
			ClusterID:     envelope.ClusterID,
			LocalNodeID:   s.local.NodeID,
			Channel:       envelope.Channel,
			MessageType:   envelope.MessageType,
			QueueDepth:    len(queue),
			QueueCapacity: s.policies[channel].Capacity,
			OccurredAt:    s.now(),
		})
		return envelope, nil
	}
	s.mu.Unlock()
	return SyntheticEnvelope{}, ErrSyntheticRelayQueueEmpty
}
// SnapshotQueueMetrics returns a copy of the scheduler counters taken under
// the lock. QueueDepths is a fresh map: it starts from the recorded depths
// and is then overwritten with the live length of every existing queue, so
// the caller can keep or modify it freely.
func (s *SyntheticRelayScheduler) SnapshotQueueMetrics() SyntheticRelayQueueMetrics {
	s.mu.Lock()
	defer s.mu.Unlock()

	snapshot := SyntheticRelayQueueMetrics{
		Enqueued:         s.metrics.Enqueued,
		Dequeued:         s.metrics.Dequeued,
		Dropped:          s.metrics.Dropped,
		Rejected:         s.metrics.Rejected,
		LastRejectReason: s.metrics.LastRejectReason,
		QueueDepths:      make(map[string]int, len(s.metrics.QueueDepths)),
	}
	for name, recorded := range s.metrics.QueueDepths {
		snapshot.QueueDepths[name] = recorded
	}
	for name, pending := range s.queues {
		snapshot.QueueDepths[name] = len(pending)
	}
	return snapshot
}
// validateEnvelope decides whether envelope may be queued on this scheduler
// and returns the first failing check, in this order: scheduler nil/disabled,
// protocol version, route ID, cluster, sender, recipient, TTL, hop count,
// loop (local node already in Visited), channel allowed, channel has a queue
// policy, message type allowed. A nil result means the envelope is accepted.
func (s *SyntheticRelayScheduler) validateEnvelope(envelope SyntheticEnvelope) error {
	if s == nil || !s.enabled {
		return ErrMeshRuntimeDisabled
	}
	local := s.local
	switch {
	case envelope.ProtocolVersion != ProtocolVersion:
		return ErrUnsupportedSyntheticMessage
	case envelope.RouteID == "":
		return ErrRouteIDRequired
	case envelope.ClusterID == "" || envelope.ClusterID != local.ClusterID:
		return ErrClusterMismatch
	case envelope.From.ClusterID != local.ClusterID || envelope.From.NodeID == "":
		return ErrNodeMismatch
	case envelope.To.ClusterID != local.ClusterID || envelope.To.NodeID != local.NodeID:
		return ErrNodeMismatch
	case envelope.TTL <= 0:
		return ErrTTLExhausted
	case envelope.HopCount <= 0:
		return ErrInvalidRoutePath
	case contains(envelope.Visited, local.NodeID):
		return ErrLoopDetected
	}
	// A channel must be both allowed and backed by a queue policy.
	if _, allowed := s.allowedChannels[envelope.Channel]; !allowed {
		return ErrUnauthorizedChannel
	}
	if _, hasPolicy := s.policies[envelope.Channel]; !hasPolicy {
		return ErrUnauthorizedChannel
	}
	if _, supported := s.allowedMessageTypes[envelope.MessageType]; !supported {
		return ErrUnsupportedSyntheticMessage
	}
	return nil
}
// reject records a rejected envelope: it bumps the Rejected counter, stores
// err's message as the last reject reason, and emits a
// "fabric_relay_rejected" log entry. It is a no-op on a nil scheduler, and a
// nil err yields an empty reason.
func (s *SyntheticRelayScheduler) reject(envelope SyntheticEnvelope, err error) {
	var reason string
	if err != nil {
		reason = err.Error()
	}
	if s == nil {
		return
	}

	s.mu.Lock()
	s.metrics.Rejected++
	s.metrics.LastRejectReason = reason
	s.mu.Unlock()

	// Log outside the lock so the logger never runs under the mutex.
	entry := SyntheticLogEntry{
		Event:       "fabric_relay_rejected",
		RouteID:     envelope.RouteID,
		ClusterID:   envelope.ClusterID,
		LocalNodeID: s.local.NodeID,
		Channel:     envelope.Channel,
		MessageType: envelope.MessageType,
		Reason:      reason,
		OccurredAt:  s.now(),
	}
	s.log(entry)
}
// log forwards entry to the configured logger; it does nothing when no logger
// was configured.
func (s *SyntheticRelayScheduler) log(entry SyntheticLogEntry) {
	if s.logger == nil {
		return
	}
	s.logger(entry)
}
@@ -0,0 +1,213 @@
package mesh
import (
"errors"
"testing"
)
// TestSyntheticRelaySchedulerDequeuesByQoSPriority enqueues telemetry, route
// control and fabric control envelopes in that order and checks that they are
// dequeued in policy order: fabric control, route control, telemetry.
func TestSyntheticRelaySchedulerDequeuesByQoSPriority(t *testing.T) {
	sched := testRelayScheduler()
	lowPrio := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 1)
	midPrio := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageRouteHealth, 2)
	highPrio := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 3)

	_, err := sched.Enqueue(lowPrio)
	if err != nil {
		t.Fatalf("enqueue telemetry: %v", err)
	}
	_, err = sched.Enqueue(midPrio)
	if err != nil {
		t.Fatalf("enqueue route control: %v", err)
	}
	_, err = sched.Enqueue(highPrio)
	if err != nil {
		t.Fatalf("enqueue fabric control: %v", err)
	}

	got1, err := sched.Dequeue()
	if err != nil {
		t.Fatalf("dequeue first: %v", err)
	}
	got2, err := sched.Dequeue()
	if err != nil {
		t.Fatalf("dequeue second: %v", err)
	}
	got3, err := sched.Dequeue()
	if err != nil {
		t.Fatalf("dequeue third: %v", err)
	}

	if ch := got1.Channel; ch != SyntheticChannelFabricControl {
		t.Fatalf("first channel = %q, want fabric_control", ch)
	}
	if ch := got2.Channel; ch != SyntheticChannelRouteControl {
		t.Fatalf("second channel = %q, want route_control", ch)
	}
	if ch := got3.Channel; ch != SyntheticChannelTelemetry {
		t.Fatalf("third channel = %q, want telemetry", ch)
	}
}
// TestSyntheticRelaySchedulerDropsOldestTelemetryOnly fills the capacity-one
// telemetry queue and checks that a second enqueue evicts sequence 1, leaves
// sequence 2 to be dequeued, and records one drop and two enqueues.
func TestSyntheticRelaySchedulerDropsOldestTelemetryOnly(t *testing.T) {
	sched := testRelayScheduler()
	older := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 1)
	newer := testRelayEnvelope(SyntheticChannelTelemetry, SyntheticMessageTelemetry, 2)

	if result, err := sched.Enqueue(older); err != nil || result.Dropped {
		t.Fatalf("enqueue first result=%+v err=%v", result, err)
	}
	result, err := sched.Enqueue(newer)
	if err != nil {
		t.Fatalf("enqueue second: %v", err)
	}
	evictedFirst := result.Dropped && result.DroppedSequence == 1
	if !evictedFirst {
		t.Fatalf("result = %+v, want dropped sequence 1", result)
	}

	survivor, err := sched.Dequeue()
	if err != nil {
		t.Fatalf("dequeue: %v", err)
	}
	if survivor.Sequence != 2 {
		t.Fatalf("dequeued sequence = %d, want 2", survivor.Sequence)
	}

	m := sched.SnapshotQueueMetrics()
	if m.Dropped != 1 || m.Enqueued != 2 {
		t.Fatalf("metrics = %+v, want one drop and two enqueues", m)
	}
}
// TestSyntheticRelaySchedulerRejectsFullReliableQueue fills the capacity-one
// fabric control queue and checks that a second enqueue fails with
// ErrSyntheticRelayQueueFull, the first envelope is still dequeued, and the
// metrics show one rejection and no drops.
func TestSyntheticRelaySchedulerRejectsFullReliableQueue(t *testing.T) {
	sched := testRelayScheduler()
	accepted := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
	overflow := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 2)

	if _, err := sched.Enqueue(accepted); err != nil {
		t.Fatalf("enqueue first: %v", err)
	}
	if _, err := sched.Enqueue(overflow); !errors.Is(err, ErrSyntheticRelayQueueFull) {
		t.Fatalf("err = %v, want ErrSyntheticRelayQueueFull", err)
	}

	head, err := sched.Dequeue()
	if err != nil {
		t.Fatalf("dequeue: %v", err)
	}
	if head.Sequence != 1 {
		t.Fatalf("dequeued sequence = %d, want 1", head.Sequence)
	}

	m := sched.SnapshotQueueMetrics()
	if m.Dropped != 0 || m.Rejected != 1 {
		t.Fatalf("metrics = %+v, want no drop and one rejection", m)
	}
}
// TestSyntheticRelaySchedulerRejectsInvalidEnvelopes mutates a valid envelope
// one field at a time and checks that Enqueue fails with the matching
// sentinel error for each mutation.
func TestSyntheticRelaySchedulerRejectsInvalidEnvelopes(t *testing.T) {
	type rejectCase struct {
		name   string
		mutate func(*SyntheticEnvelope)
		want   error
	}
	cases := []rejectCase{
		{
			name:   "wrong cluster",
			mutate: func(e *SyntheticEnvelope) { e.ClusterID = "cluster-2" },
			want:   ErrClusterMismatch,
		},
		{
			name:   "wrong node",
			mutate: func(e *SyntheticEnvelope) { e.To.NodeID = "node-x" },
			want:   ErrNodeMismatch,
		},
		{
			name:   "unauthorized channel",
			mutate: func(e *SyntheticEnvelope) { e.Channel = "rdp_render" },
			want:   ErrUnauthorizedChannel,
		},
		{
			name:   "unsupported message",
			mutate: func(e *SyntheticEnvelope) { e.MessageType = "rdp.input" },
			want:   ErrUnsupportedSyntheticMessage,
		},
		{
			name:   "ttl exhausted",
			mutate: func(e *SyntheticEnvelope) { e.TTL = 0 },
			want:   ErrTTLExhausted,
		},
		{
			name:   "loop detected",
			mutate: func(e *SyntheticEnvelope) { e.Visited = append(e.Visited, "node-r") },
			want:   ErrLoopDetected,
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			sched := testRelayScheduler()
			env := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
			tc.mutate(&env)
			if _, err := sched.Enqueue(env); !errors.Is(err, tc.want) {
				t.Fatalf("err = %v, want %v", err, tc.want)
			}
		})
	}
}
func TestSyntheticRelaySchedulerDisabledRejects(t *testing.T) {
scheduler := NewSyntheticRelayScheduler(SyntheticRelaySchedulerConfig{
Enabled: false,
Local: PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
})
_, err := scheduler.Enqueue(testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1))
if !errors.Is(err, ErrMeshRuntimeDisabled) {
t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
if _, err := scheduler.Dequeue(); !errors.Is(err, ErrMeshRuntimeDisabled) {
t.Fatalf("dequeue err = %v, want ErrMeshRuntimeDisabled", err)
}
}
// TestSyntheticRelaySchedulerQueueDepthSnapshot enqueues one envelope on each
// of the fabric control and route control channels and checks that the
// metrics snapshot reports a depth of one for both.
func TestSyntheticRelaySchedulerQueueDepthSnapshot(t *testing.T) {
	sched := testRelayScheduler()

	fabric := testRelayEnvelope(SyntheticChannelFabricControl, SyntheticMessageProbe, 1)
	if _, err := sched.Enqueue(fabric); err != nil {
		t.Fatalf("enqueue fabric control: %v", err)
	}
	routeCtl := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageRouteHealth, 2)
	if _, err := sched.Enqueue(routeCtl); err != nil {
		t.Fatalf("enqueue route control: %v", err)
	}

	depths := sched.SnapshotQueueMetrics().QueueDepths
	if got := depths[SyntheticChannelFabricControl]; got != 1 {
		t.Fatalf("fabric_control depth = %d, want 1", got)
	}
	if got := depths[SyntheticChannelRouteControl]; got != 1 {
		t.Fatalf("route_control depth = %d, want 1", got)
	}
}
// testRelayScheduler returns an enabled scheduler for node-r in cluster-1
// whose three queues (fabric control, route control, telemetry) each hold a
// single envelope; only telemetry is droppable.
func testRelayScheduler() *SyntheticRelayScheduler {
	singleSlot := []SyntheticRelayQueuePolicy{
		{Channel: SyntheticChannelFabricControl, Capacity: 1, Droppable: false},
		{Channel: SyntheticChannelRouteControl, Capacity: 1, Droppable: false},
		{Channel: SyntheticChannelTelemetry, Capacity: 1, Droppable: true},
	}
	cfg := SyntheticRelaySchedulerConfig{
		Enabled:       true,
		Local:         PeerIdentity{ClusterID: "cluster-1", NodeID: "node-r"},
		QueuePolicies: singleSlot,
	}
	return NewSyntheticRelayScheduler(cfg)
}
// testRelayEnvelope builds an envelope from node-a to node-r on the
// node-a -> node-r -> node-b test route and stamps it with the given channel,
// message type and sequence number.
func testRelayEnvelope(channel string, messageType string, sequence uint64) SyntheticEnvelope {
	hops := []string{"node-a", "node-r", "node-b"}
	env := testEnvelope(testRoute("route-relay-scheduler", hops), "node-a", "node-r")
	env.Channel, env.MessageType, env.Sequence = channel, messageType, sequence
	return env
}
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,432 @@
package mesh
import (
"context"
"encoding/json"
"errors"
"testing"
"time"
)
// syntheticTestTransport is an in-memory transport for tests: it delivers
// envelopes by calling Receive on the target runtime directly.
type syntheticTestTransport struct {
// nodes maps a node ID to the runtime that receives envelopes for it.
nodes map[string]*SyntheticRuntime
}
// SendSynthetic hands envelope to the runtime registered for nextNodeID and
// returns whatever its Receive returns. An unknown node ID (or a nil runtime)
// yields ErrSyntheticPeerUnavailable.
func (t syntheticTestTransport) SendSynthetic(ctx context.Context, nextNodeID string, envelope SyntheticEnvelope) (SyntheticEnvelope, error) {
	if target := t.nodes[nextNodeID]; target != nil {
		return target.Receive(ctx, envelope)
	}
	return SyntheticEnvelope{}, ErrSyntheticPeerUnavailable
}
// TestSyntheticRuntimeDirectProbe sends a probe over a two-node route and
// checks the ack's type, its peers, the path it reports, and that node-b
// counted one created ack.
func TestSyntheticRuntimeDirectProbe(t *testing.T) {
	route := testRoute("route-direct", []string{"node-a", "node-b"})
	wire := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", wire, route)
	receiver := testRuntime("node-b", wire, route)
	wire.nodes["node-a"] = sender
	wire.nodes["node-b"] = receiver

	ack, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-direct")
	if err != nil {
		t.Fatalf("send probe: %v", err)
	}
	if got := ack.MessageType; got != SyntheticMessageProbeAck {
		t.Fatalf("MessageType = %q, want %q", got, SyntheticMessageProbeAck)
	}
	peersOK := ack.From.NodeID == "node-b" && ack.To.NodeID == "node-a"
	if !peersOK {
		t.Fatalf("unexpected ack peers: from=%+v to=%+v", ack.From, ack.To)
	}

	path := decodeAckPayload(t, ack).Path
	if len(path) != 2 || path[0] != "node-a" || path[1] != "node-b" {
		t.Fatalf("Path = %#v, want node-a -> node-b", path)
	}
	if created := receiver.SnapshotMetrics().ProbeAcksCreated; created != 1 {
		t.Fatalf("ProbeAcksCreated = %d, want 1", created)
	}
}
// TestSyntheticRuntimeSingleRelayProbe sends a probe over a three-node route
// and checks that the ack reports the full path and that the relay node
// counted one forwarded probe.
func TestSyntheticRuntimeSingleRelayProbe(t *testing.T) {
	route := testRoute("route-relay", []string{"node-a", "node-r", "node-b"})
	wire := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", wire, route)
	relay := testRuntime("node-r", wire, route)
	receiver := testRuntime("node-b", wire, route)
	wire.nodes["node-a"] = sender
	wire.nodes["node-r"] = relay
	wire.nodes["node-b"] = receiver

	ack, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-relay")
	if err != nil {
		t.Fatalf("send probe: %v", err)
	}

	path := decodeAckPayload(t, ack).Path
	if len(path) != 3 || path[0] != "node-a" || path[1] != "node-r" || path[2] != "node-b" {
		t.Fatalf("Path = %#v, want node-a -> node-r -> node-b", path)
	}
	if forwarded := relay.SnapshotMetrics().ProbesForwarded; forwarded != 1 {
		t.Fatalf("ProbesForwarded = %d, want 1", forwarded)
	}
}
// TestSyntheticRuntimeDisabledRejectsProbe checks that a runtime built with
// Enabled=false fails SendProbe with ErrMeshRuntimeDisabled.
func TestSyntheticRuntimeDisabledRejectsProbe(t *testing.T) {
	route := testRoute("route-disabled", []string{"node-a", "node-b"})
	cfg := SyntheticRuntimeConfig{
		Enabled: false,
		Local:   PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:  []SyntheticRoute{route},
	}
	disabled := NewSyntheticRuntime(cfg)

	if _, err := disabled.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-disabled"); !errors.Is(err, ErrMeshRuntimeDisabled) {
		t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
	}
}
// TestSyntheticRuntimeRejectsWrongCluster checks that Receive refuses an
// envelope stamped with a different cluster ID with ErrClusterMismatch.
func TestSyntheticRuntimeRejectsWrongCluster(t *testing.T) {
	route := testRoute("route-wrong-cluster", []string{"node-a", "node-b"})
	receiver := testRuntime("node-b", syntheticTestTransport{}, route)

	env := testEnvelope(route, "node-a", "node-b")
	env.ClusterID = "cluster-2"

	if _, err := receiver.Receive(context.Background(), env); !errors.Is(err, ErrClusterMismatch) {
		t.Fatalf("err = %v, want ErrClusterMismatch", err)
	}
}
// TestSyntheticRuntimeRejectsWrongNode checks that node-b refuses an envelope
// addressed to node-c with ErrNodeMismatch.
func TestSyntheticRuntimeRejectsWrongNode(t *testing.T) {
	route := testRoute("route-wrong-node", []string{"node-a", "node-b"})
	receiver := testRuntime("node-b", syntheticTestTransport{}, route)
	misaddressed := testEnvelope(route, "node-a", "node-c")

	if _, err := receiver.Receive(context.Background(), misaddressed); !errors.Is(err, ErrNodeMismatch) {
		t.Fatalf("err = %v, want ErrNodeMismatch", err)
	}
}
// TestSyntheticRuntimeRejectsUnauthorizedChannel checks that SendProbe on the
// "rdp_render" channel fails with ErrUnauthorizedChannel.
func TestSyntheticRuntimeRejectsUnauthorizedChannel(t *testing.T) {
	route := testRoute("route-unauthorized", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	if _, err := sender.SendProbe(context.Background(), route.RouteID, "rdp_render", "probe-unauthorized"); !errors.Is(err, ErrUnauthorizedChannel) {
		t.Fatalf("err = %v, want ErrUnauthorizedChannel", err)
	}
}
// TestSyntheticRuntimeRejectsExpiredRoute checks that SendProbe over a route
// that expired a minute ago fails with ErrRouteExpired.
func TestSyntheticRuntimeRejectsExpiredRoute(t *testing.T) {
	route := testRoute("route-expired", []string{"node-a", "node-b"})
	route.ExpiresAt = time.Now().UTC().Add(-time.Minute)
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	if _, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-expired"); !errors.Is(err, ErrRouteExpired) {
		t.Fatalf("err = %v, want ErrRouteExpired", err)
	}
}
// TestSyntheticRuntimeRejectsTTLExhaustion sends a probe over a three-node
// route whose MaxTTL is 1 and checks that it fails with ErrTTLExhausted.
func TestSyntheticRuntimeRejectsTTLExhaustion(t *testing.T) {
	route := testRoute("route-ttl", []string{"node-a", "node-r", "node-b"})
	route.MaxTTL = 1

	wire := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", wire, route)
	relay := testRuntime("node-r", wire, route)
	wire.nodes["node-a"] = sender
	wire.nodes["node-r"] = relay

	if _, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-ttl"); !errors.Is(err, ErrTTLExhausted) {
		t.Fatalf("err = %v, want ErrTTLExhausted", err)
	}
}
// TestSyntheticRuntimeRejectsLoop checks that node-b refuses an envelope whose
// Visited list already contains node-b with ErrLoopDetected.
func TestSyntheticRuntimeRejectsLoop(t *testing.T) {
	route := testRoute("route-loop", []string{"node-a", "node-b"})
	receiver := testRuntime("node-b", syntheticTestTransport{}, route)

	looped := testEnvelope(route, "node-a", "node-b")
	looped.Visited = []string{"node-a", "node-b"}

	if _, err := receiver.Receive(context.Background(), looped); !errors.Is(err, ErrLoopDetected) {
		t.Fatalf("err = %v, want ErrLoopDetected", err)
	}
}
// TestSyntheticRuntimeRejectsUnavailablePeer checks that SendProbe fails with
// ErrSyntheticPeerUnavailable when the transport has no runtime registered
// for the next hop.
func TestSyntheticRuntimeRejectsUnavailablePeer(t *testing.T) {
	route := testRoute("route-missing-peer", []string{"node-a", "node-b"})
	emptyWire := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", emptyWire, route)

	if _, err := sender.SendProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-missing-peer"); !errors.Is(err, ErrSyntheticPeerUnavailable) {
		t.Fatalf("err = %v, want ErrSyntheticPeerUnavailable", err)
	}
}
// TestSyntheticRuntimeRouteHealthProbeRecordsSuccess sends a route health
// probe over a two-node route and checks the ack type, that no fallback was
// used, that node-a recorded a healthy observation with one success, and that
// its metrics count one health probe sent and one successful delivery.
func TestSyntheticRuntimeRouteHealthProbeRecordsSuccess(t *testing.T) {
route := testRoute("route-health", []string{"node-a", "node-b"})
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
nodeA := testRuntime("node-a", transport, route)
nodeB := testRuntime("node-b", transport, route)
// The transport is passed by value, but its nodes map is shared, so
// registering the runtimes afterwards is visible to both of them.
transport.nodes["node-a"] = nodeA
transport.nodes["node-b"] = nodeB
result, err := nodeA.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-health")
if err != nil {
t.Fatalf("send route health probe: %v", err)
}
if result.Ack.MessageType != SyntheticMessageRouteHealthAck {
t.Fatalf("MessageType = %q, want %q", result.Ack.MessageType, SyntheticMessageRouteHealthAck)
}
if result.FallbackUsed {
t.Fatal("FallbackUsed = true, want false")
}
observation, ok := nodeA.SnapshotRouteObservation(route.RouteID)
if !ok {
t.Fatal("route observation missing")
}
if observation.State != SyntheticRouteStateHealthy || observation.SuccessCount != 1 {
t.Fatalf("observation = %+v, want healthy success", observation)
}
// NOTE(review): the expected version strings are presumably set by the
// testRuntime/testRoute helpers, which are not visible here; confirm.
if observation.PolicyVersion != "policy-v1" || observation.PeerDirectoryVersion != "peers-v1" || observation.RouteVersion != "route-v1" {
t.Fatalf("observation versions = %+v", observation)
}
metrics := nodeA.SnapshotMetrics()
if metrics.RouteHealthProbesSent != 1 || metrics.RouteDeliveriesSucceeded != 1 {
t.Fatalf("metrics = %+v, want health probe success", metrics)
}
}
// TestSyntheticRuntimeRouteHealthUsesDedicatedRouteConfig configures every
// node with two routes sharing one route ID: a base route via node-old and a
// route-health route via node-new. It checks that a route health probe
// travels via node-new while a regular probe still travels via node-old.
func TestSyntheticRuntimeRouteHealthUsesDedicatedRouteConfig(t *testing.T) {
base := testRoute("route-effective-health", []string{"node-a", "node-old", "node-b"})
effective := testRoute("route-effective-health", []string{"node-a", "node-new", "node-b"})
effective.RouteVersion = "decision-v1"
transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
nodeA := testRuntimeWithRouteHealth("node-a", transport, []SyntheticRoute{base}, []SyntheticRoute{effective})
nodeOld := testRuntimeWithRouteHealth("node-old", transport, []SyntheticRoute{base}, []SyntheticRoute{effective})
nodeNew := testRuntimeWithRouteHealth("node-new", transport, []SyntheticRoute{base}, []SyntheticRoute{effective})
nodeB := testRuntimeWithRouteHealth("node-b", transport, []SyntheticRoute{base}, []SyntheticRoute{effective})
transport.nodes["node-a"] = nodeA
transport.nodes["node-old"] = nodeOld
transport.nodes["node-new"] = nodeNew
transport.nodes["node-b"] = nodeB
// Phase 1: the health probe must follow the route-health configuration.
health, err := nodeA.SendRouteHealthProbe(context.Background(), base.RouteID, SyntheticChannelFabricControl, "probe-health-effective")
if err != nil {
t.Fatalf("send route health probe: %v", err)
}
healthPayload := decodeAckPayload(t, health.Ack)
if got, want := healthPayload.Path, []string{"node-a", "node-new", "node-b"}; !sameStrings(got, want) {
t.Fatalf("route health path = %v, want %v", got, want)
}
if nodeNew.SnapshotMetrics().ProbesForwarded != 1 {
t.Fatalf("node-new forwarded = %d, want 1", nodeNew.SnapshotMetrics().ProbesForwarded)
}
// node-old must stay untouched until the regular probe below is sent.
if nodeOld.SnapshotMetrics().ProbesForwarded != 0 {
t.Fatalf("node-old forwarded = %d, want 0 before regular probe", nodeOld.SnapshotMetrics().ProbesForwarded)
}
observation, ok := nodeA.SnapshotRouteObservation(base.RouteID)
if !ok || observation.RouteVersion != "decision-v1" {
t.Fatalf("route health observation = %+v, want decision route version", observation)
}
// Phase 2: a regular probe on the same route ID must use the base route.
probe, err := nodeA.SendProbe(context.Background(), base.RouteID, SyntheticChannelFabricControl, "probe-regular")
if err != nil {
t.Fatalf("send regular probe: %v", err)
}
probePayload := decodeAckPayload(t, probe)
if got, want := probePayload.Path, []string{"node-a", "node-old", "node-b"}; !sameStrings(got, want) {
t.Fatalf("regular probe path = %v, want %v", got, want)
}
if nodeOld.SnapshotMetrics().ProbesForwarded != 1 {
t.Fatalf("node-old forwarded = %d, want 1 after regular probe", nodeOld.SnapshotMetrics().ProbesForwarded)
}
}
// TestSyntheticRuntimeRouteHealthUsesFallbackWhenPreferredUnavailable probes a
// preferred route whose relay hop (node-r) is never registered with the test
// transport and expects the health probe to be delivered over the direct
// fallback route instead. It then checks the per-route observations and the
// sender's fallback metrics.
func TestSyntheticRuntimeRouteHealthUsesFallbackWhenPreferredUnavailable(t *testing.T) {
	preferred := testRoute("route-preferred", []string{"node-a", "node-r", "node-b"})
	fallback := testRoute("route-fallback", []string{"node-a", "node-b"})

	// Only the two endpoints join the transport; node-r is deliberately absent.
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", transport, preferred, fallback)
	receiver := testRuntime("node-b", transport, preferred, fallback)
	transport.nodes["node-a"], transport.nodes["node-b"] = sender, receiver

	ctx := context.Background()
	fallbackIDs := []string{fallback.RouteID}
	outcome, err := sender.SendRouteHealthProbeWithFallback(ctx, preferred.RouteID, fallbackIDs, SyntheticChannelFabricControl, "probe-fallback")
	if err != nil {
		t.Fatalf("send route health probe with fallback: %v", err)
	}
	if !outcome.FallbackUsed {
		t.Fatal("FallbackUsed = false, want true")
	}
	if got, want := outcome.SelectedRouteID, fallback.RouteID; got != want {
		t.Fatalf("SelectedRouteID = %q, want %q", got, want)
	}

	// The preferred route must be recorded as failed exactly once.
	preferredSeen, found := sender.SnapshotRouteObservation(preferred.RouteID)
	if !found {
		t.Fatal("preferred route observation missing")
	}
	if failedOnce := preferredSeen.State == SyntheticRouteStateFailed && preferredSeen.FailureCount == 1; !failedOnce {
		t.Fatalf("preferred observation = %+v, want failed", preferredSeen)
	}

	// The fallback route must be recorded as healthy exactly once.
	fallbackSeen, found := sender.SnapshotRouteObservation(fallback.RouteID)
	if !found {
		t.Fatal("fallback route observation missing")
	}
	if healthyOnce := fallbackSeen.State == SyntheticRouteStateHealthy && fallbackSeen.SuccessCount == 1; !healthyOnce {
		t.Fatalf("fallback observation = %+v, want healthy", fallbackSeen)
	}

	counters := sender.SnapshotMetrics()
	asExpected := counters.FallbackRoutesUsed == 1 &&
		counters.WarmRoutesPromoted == 1 &&
		counters.RouteDeliveriesFailed == 1
	if !asExpected {
		t.Fatalf("metrics = %+v, want fallback promotion and one failed delivery", counters)
	}
}
// TestSyntheticRuntimeRouteCacheInvalidatesOnVersionChange records a route
// observation through a health probe, then invalidates the route cache with a
// version that carries only PolicyVersion "policy-v2". The test expects exactly
// one entry to be invalidated, the observation to disappear, and the
// invalidation counter to read 1.
func TestSyntheticRuntimeRouteCacheInvalidatesOnVersionChange(t *testing.T) {
	route := testRoute("route-cache", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", transport, route)
	receiver := testRuntime("node-b", transport, route)
	transport.nodes["node-a"], transport.nodes["node-b"] = sender, receiver

	_, err := sender.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-cache")
	if err != nil {
		t.Fatalf("send route health probe: %v", err)
	}
	_, cached := sender.SnapshotRouteObservation(route.RouteID)
	if !cached {
		t.Fatal("route observation missing before invalidation")
	}

	changed := SyntheticRouteCacheVersion{PolicyVersion: "policy-v2"}
	if dropped := sender.InvalidateRouteCache("policy_changed", changed); dropped != 1 {
		t.Fatalf("invalidated = %d, want 1", dropped)
	}
	if _, cached = sender.SnapshotRouteObservation(route.RouteID); cached {
		t.Fatal("route observation still present after invalidation")
	}
	if count := sender.SnapshotMetrics().RouteCacheInvalidations; count != 1 {
		t.Fatalf("RouteCacheInvalidations = %d, want 1", count)
	}
}
// TestSyntheticRuntimeRouteCacheKeepsCurrentVersion records a route observation
// and then requests an invalidation using the same route, policy, and peer
// directory versions that testRoute assigns. The test expects nothing to be
// invalidated and the observation to remain available.
func TestSyntheticRuntimeRouteCacheKeepsCurrentVersion(t *testing.T) {
	route := testRoute("route-cache-current", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", transport, route)
	receiver := testRuntime("node-b", transport, route)
	transport.nodes["node-a"], transport.nodes["node-b"] = sender, receiver

	_, err := sender.SendRouteHealthProbe(context.Background(), route.RouteID, SyntheticChannelFabricControl, "probe-cache-current")
	if err != nil {
		t.Fatalf("send route health probe: %v", err)
	}

	// Versions identical to the ones the cached route was built with.
	current := SyntheticRouteCacheVersion{
		RouteVersion:         "route-v1",
		PolicyVersion:        "policy-v1",
		PeerDirectoryVersion: "peers-v1",
	}
	if dropped := sender.InvalidateRouteCache("same_versions", current); dropped != 0 {
		t.Fatalf("invalidated = %d, want 0", dropped)
	}
	_, stillCached := sender.SnapshotRouteObservation(route.RouteID)
	if !stillCached {
		t.Fatal("route observation missing after same-version invalidation")
	}
}
// TestSyntheticRuntimeRouteHealthDisabledRejects builds a runtime with
// Enabled set to false and expects a health probe with fallback to be rejected
// with ErrMeshRuntimeDisabled.
func TestSyntheticRuntimeRouteHealthDisabledRejects(t *testing.T) {
	route := testRoute("route-health-disabled", []string{"node-a", "node-b"})
	disabledCfg := SyntheticRuntimeConfig{
		Enabled: false,
		Local:   PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:  []SyntheticRoute{route},
	}
	runtime := NewSyntheticRuntime(disabledCfg)

	fallbackIDs := []string{"route-fallback"}
	_, err := runtime.SendRouteHealthProbeWithFallback(context.Background(), route.RouteID, fallbackIDs, SyntheticChannelFabricControl, "probe-disabled-health")
	if errors.Is(err, ErrMeshRuntimeDisabled) {
		return
	}
	t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
// testRuntime returns an enabled SyntheticRuntime for nodeID in "cluster-1"
// that knows the given routes, sends through transport, and caps both TTL and
// hop count at 8.
func testRuntime(nodeID string, transport SyntheticTransport, routes ...SyntheticRoute) *SyntheticRuntime {
	local := PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID}
	cfg := SyntheticRuntimeConfig{
		Enabled:   true,
		Local:     local,
		Routes:    routes,
		Transport: transport,
		MaxTTL:    8,
		MaxHops:   8,
	}
	return NewSyntheticRuntime(cfg)
}
// testRuntimeWithRouteHealth is like testRuntime but additionally supplies a
// separate RouteHealthRoutes list in the runtime configuration.
func testRuntimeWithRouteHealth(nodeID string, transport SyntheticTransport, routes []SyntheticRoute, routeHealthRoutes []SyntheticRoute) *SyntheticRuntime {
	local := PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID}
	cfg := SyntheticRuntimeConfig{
		Enabled:           true,
		Local:             local,
		Routes:            routes,
		RouteHealthRoutes: routeHealthRoutes,
		Transport:         transport,
		MaxTTL:            8,
		MaxHops:           8,
	}
	return NewSyntheticRuntime(cfg)
}
// testRoute builds a SyntheticRoute in "cluster-1" that follows hops, using the
// first hop as source and the last hop as destination. The route allows only
// the fabric-control channel, expires one hour from now, and carries the fixed
// versions "route-v1", "policy-v1", and "peers-v1". hops must be non-empty;
// indexing an empty slice panics.
func testRoute(routeID string, hops []string) SyntheticRoute {
	source, destination := hops[0], hops[len(hops)-1]
	expiry := time.Now().UTC().Add(time.Hour)
	return SyntheticRoute{
		RouteID:              routeID,
		ClusterID:            "cluster-1",
		SourceNodeID:         source,
		DestinationNodeID:    destination,
		Hops:                 hops,
		AllowedChannels:      []string{SyntheticChannelFabricControl},
		ExpiresAt:            expiry,
		MaxTTL:               8,
		MaxHops:              8,
		RouteVersion:         "route-v1",
		PolicyVersion:        "policy-v1",
		PeerDirectoryVersion: "peers-v1",
	}
}
// testEnvelope builds a probe envelope on the fabric-control channel for route,
// addressed from fromNodeID to toNodeID. The envelope starts with TTL 8, hop
// count 1, sequence 1, and a visited list containing only the sender; its
// payload is a JSON-encoded SyntheticProbePayload with ProbeID "probe-test".
//
// It panics if the payload cannot be marshalled, matching
// mustMarshalTestServiceRequest, instead of silently producing an envelope
// with an empty payload.
func testEnvelope(route SyntheticRoute, fromNodeID string, toNodeID string) SyntheticEnvelope {
	payload, err := json.Marshal(SyntheticProbePayload{
		ProbeID: "probe-test",
		SentAt:  time.Now().UTC(),
	})
	if err != nil {
		// A helper without *testing.T cannot fail the test; a broken fixture
		// is a programmer error, so stop immediately.
		panic(err)
	}
	return SyntheticEnvelope{
		ProtocolVersion: ProtocolVersion,
		RouteID:         route.RouteID,
		ClusterID:       route.ClusterID,
		From:            PeerIdentity{ClusterID: route.ClusterID, NodeID: fromNodeID},
		To:              PeerIdentity{ClusterID: route.ClusterID, NodeID: toNodeID},
		Channel:         SyntheticChannelFabricControl,
		MessageType:     SyntheticMessageProbe,
		TTL:             8,
		HopCount:        1,
		Visited:         []string{fromNodeID},
		Sequence:        1,
		SentAt:          time.Now().UTC(),
		Payload:         payload,
	}
}
// decodeAckPayload unmarshals the JSON payload of envelope into a
// SyntheticProbeAckPayload and fails the calling test if decoding fails.
func decodeAckPayload(t *testing.T, envelope SyntheticEnvelope) SyntheticProbeAckPayload {
	t.Helper()
	decoded := SyntheticProbeAckPayload{}
	err := json.Unmarshal(envelope.Payload, &decoded)
	if err != nil {
		t.Fatalf("decode ack payload: %v", err)
	}
	return decoded
}
@@ -0,0 +1,235 @@
package mesh
import (
"context"
"encoding/json"
"errors"
"strings"
"testing"
)
// TestSyntheticRuntimeTestServiceDirectRoute sends a test-service request over
// a two-hop route and expects a test-service ack that echoes the payload,
// reports the path node-a -> node-b, and bumps the sender's sent and succeeded
// counters to 1.
func TestSyntheticRuntimeTestServiceDirectRoute(t *testing.T) {
	route := testServiceRoute("route-test-service-direct", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", transport, route)
	receiver := testRuntime("node-b", transport, route)
	transport.nodes["node-a"], transport.nodes["node-b"] = sender, receiver

	request := testServiceRequest("request-direct", "hello")
	outcome, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
	if err != nil {
		t.Fatalf("send test service: %v", err)
	}
	if got := outcome.Ack.MessageType; got != SyntheticMessageTestServiceAck {
		t.Fatalf("MessageType = %q, want %q", got, SyntheticMessageTestServiceAck)
	}
	if echoed := outcome.Response.EchoPayload; echoed != "hello" {
		t.Fatalf("EchoPayload = %q, want hello", echoed)
	}

	path := outcome.Response.Path
	if len(path) != 2 || path[0] != "node-a" || path[1] != "node-b" {
		t.Fatalf("Path = %#v, want node-a -> node-b", path)
	}

	counters := sender.SnapshotMetrics()
	if oneSuccess := counters.TestServiceRequestsSent == 1 && counters.TestServiceDeliveriesSucceeded == 1; !oneSuccess {
		t.Fatalf("metrics = %+v, want one test service success", counters)
	}
}
// TestSyntheticRuntimeTestServiceSingleRelayRoute sends a test-service request
// over a route with one relay (node-r) and expects the reported path to be
// node-a -> node-r -> node-b and the relay's forwarded counter to be 1.
func TestSyntheticRuntimeTestServiceSingleRelayRoute(t *testing.T) {
	route := testServiceRoute("route-test-service-relay", []string{"node-a", "node-r", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", transport, route)
	relayNode := testRuntime("node-r", transport, route)
	receiver := testRuntime("node-b", transport, route)
	transport.nodes["node-a"] = sender
	transport.nodes["node-r"] = relayNode
	transport.nodes["node-b"] = receiver

	request := testServiceRequest("request-relay", "relay")
	outcome, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
	if err != nil {
		t.Fatalf("send test service: %v", err)
	}

	path := outcome.Response.Path
	if len(path) != 3 || path[0] != "node-a" || path[1] != "node-r" || path[2] != "node-b" {
		t.Fatalf("Path = %#v, want node-a -> node-r -> node-b", path)
	}
	if forwarded := relayNode.SnapshotMetrics().ProbesForwarded; forwarded != 1 {
		t.Fatalf("ProbesForwarded = %d, want 1", forwarded)
	}
}
// TestSyntheticRuntimeTestServiceUsesForcedFallback sends a test-service
// request with a preferred route whose relay (node-r) is never registered with
// the transport, and expects delivery over the direct fallback route. The
// sender's metrics must show one fallback, one failed and one succeeded
// delivery.
func TestSyntheticRuntimeTestServiceUsesForcedFallback(t *testing.T) {
	preferred := testServiceRoute("route-test-service-preferred", []string{"node-a", "node-r", "node-b"})
	fallback := testServiceRoute("route-test-service-fallback", []string{"node-a", "node-b"})

	// node-r is left out of the transport on purpose.
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}
	sender := testRuntime("node-a", transport, preferred, fallback)
	receiver := testRuntime("node-b", transport, preferred, fallback)
	transport.nodes["node-a"], transport.nodes["node-b"] = sender, receiver

	fallbackIDs := []string{fallback.RouteID}
	request := testServiceRequest("request-fallback", "fallback")
	outcome, err := sender.SendTestServiceWithFallback(context.Background(), preferred.RouteID, fallbackIDs, SyntheticChannelRouteControl, request)
	if err != nil {
		t.Fatalf("send test service with fallback: %v", err)
	}
	if !outcome.FallbackUsed {
		t.Fatal("FallbackUsed = false, want true")
	}
	if got, want := outcome.SelectedRouteID, fallback.RouteID; got != want {
		t.Fatalf("SelectedRouteID = %q, want %q", got, want)
	}
	if echoed := outcome.Response.EchoPayload; echoed != "fallback" {
		t.Fatalf("EchoPayload = %q, want fallback", echoed)
	}

	counters := sender.SnapshotMetrics()
	asExpected := counters.TestServiceFallbacksUsed == 1 &&
		counters.TestServiceDeliveriesFailed == 1 &&
		counters.TestServiceDeliveriesSucceeded == 1
	if !asExpected {
		t.Fatalf("metrics = %+v, want fallback success with one preferred failure", counters)
	}
}
// TestSyntheticRuntimeTestServiceRejectsWrongOrganization overrides the
// request's organization ID with "org-other" and expects SendTestService to
// fail with ErrSyntheticOrganizationMismatch.
func TestSyntheticRuntimeTestServiceRejectsWrongOrganization(t *testing.T) {
	route := testServiceRoute("route-test-service-wrong-org", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	foreign := testServiceRequest("request-wrong-org", "hello")
	foreign.OrganizationID = "org-other"

	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, foreign)
	if errors.Is(err, ErrSyntheticOrganizationMismatch) {
		return
	}
	t.Fatalf("err = %v, want ErrSyntheticOrganizationMismatch", err)
}
// TestSyntheticRuntimeTestServiceRejectsUnsupportedService sets the request's
// service type to "rdp" and expects SendTestService to fail with
// ErrUnsupportedSyntheticService.
func TestSyntheticRuntimeTestServiceRejectsUnsupportedService(t *testing.T) {
	route := testServiceRoute("route-test-service-unsupported", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	unsupported := testServiceRequest("request-unsupported", "hello")
	unsupported.ServiceType = "rdp"

	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, unsupported)
	if errors.Is(err, ErrUnsupportedSyntheticService) {
		return
	}
	t.Fatalf("err = %v, want ErrUnsupportedSyntheticService", err)
}
// TestSyntheticRuntimeTestServiceRejectsOversizedPayload configures a runtime
// with MaxTestPayloadBytes of 4 and sends a 5-byte payload, expecting
// ErrSyntheticPayloadTooLarge.
func TestSyntheticRuntimeTestServiceRejectsOversizedPayload(t *testing.T) {
	route := testServiceRoute("route-test-service-oversized", []string{"node-a", "node-b"})
	cfg := SyntheticRuntimeConfig{
		Enabled:             true,
		Local:               PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:              []SyntheticRoute{route},
		MaxTestPayloadBytes: 4,
	}
	sender := NewSyntheticRuntime(cfg)

	// "12345" is one byte over the configured limit.
	oversized := testServiceRequest("request-oversized", "12345")
	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, oversized)
	if errors.Is(err, ErrSyntheticPayloadTooLarge) {
		return
	}
	t.Fatalf("err = %v, want ErrSyntheticPayloadTooLarge", err)
}
// TestSyntheticRuntimeTestServiceRejectsUnauthorizedChannel sends a
// test-service request on the fabric-control channel over a route that
// testServiceRoute restricts to the route-control channel, and expects
// ErrUnauthorizedChannel.
func TestSyntheticRuntimeTestServiceRejectsUnauthorizedChannel(t *testing.T) {
	route := testServiceRoute("route-test-service-channel", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	request := testServiceRequest("request-channel", "hello")
	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelFabricControl, request)
	if errors.Is(err, ErrUnauthorizedChannel) {
		return
	}
	t.Fatalf("err = %v, want ErrUnauthorizedChannel", err)
}
// TestSyntheticRuntimeTestServiceDisabledRejects builds a runtime with Enabled
// set to false and expects SendTestService to fail with
// ErrMeshRuntimeDisabled.
func TestSyntheticRuntimeTestServiceDisabledRejects(t *testing.T) {
	route := testServiceRoute("route-test-service-disabled", []string{"node-a", "node-b"})
	disabledCfg := SyntheticRuntimeConfig{
		Enabled: false,
		Local:   PeerIdentity{ClusterID: "cluster-1", NodeID: "node-a"},
		Routes:  []SyntheticRoute{route},
	}
	runtime := NewSyntheticRuntime(disabledCfg)

	request := testServiceRequest("request-disabled", "hello")
	_, err := runtime.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, request)
	if errors.Is(err, ErrMeshRuntimeDisabled) {
		return
	}
	t.Fatalf("err = %v, want ErrMeshRuntimeDisabled", err)
}
// TestSyntheticRelaySchedulerAcceptsTestServiceMessage enqueues a test-service
// envelope on the route-control channel and expects the relay scheduler to
// hand the same message type back on dequeue.
func TestSyntheticRelaySchedulerAcceptsTestServiceMessage(t *testing.T) {
	queue := testRelayScheduler()
	message := testRelayEnvelope(SyntheticChannelRouteControl, SyntheticMessageTestService, 42)
	request := testServiceRequest("request-relay-scheduler", "hello")
	message.Payload = mustMarshalTestServiceRequest(request)

	_, enqueueErr := queue.Enqueue(message)
	if enqueueErr != nil {
		t.Fatalf("enqueue test service: %v", enqueueErr)
	}

	out, dequeueErr := queue.Dequeue()
	if dequeueErr != nil {
		t.Fatalf("dequeue test service: %v", dequeueErr)
	}
	if got := out.MessageType; got != SyntheticMessageTestService {
		t.Fatalf("MessageType = %q, want %q", got, SyntheticMessageTestService)
	}
}
// testServiceRoute returns the route built by testRoute with its allowed
// channels replaced by the route-control channel only.
func testServiceRoute(routeID string, hops []string) SyntheticRoute {
	serviceRoute := testRoute(routeID, hops)
	routeControlOnly := []string{SyntheticChannelRouteControl}
	serviceRoute.AllowedChannels = routeControlOnly
	return serviceRoute
}
// testServiceRequest builds a test-service request with the given ID and
// payload, using the default test organization and the synthetic test service
// type.
func testServiceRequest(requestID string, payload string) SyntheticTestServiceRequest {
	var request SyntheticTestServiceRequest
	request.RequestID = requestID
	request.OrganizationID = SyntheticDefaultTestOrganizationID
	request.ServiceType = SyntheticTestServiceType
	request.Payload = payload
	return request
}
// mustMarshalTestServiceRequest JSON-encodes request and panics if encoding
// fails; it is meant for building test fixtures only.
func mustMarshalTestServiceRequest(request SyntheticTestServiceRequest) []byte {
	encoded, err := json.Marshal(request)
	if err == nil {
		return encoded
	}
	panic(err)
}
// TestSyntheticRuntimeTestServiceRejectsMissingRequestID sends a request whose
// RequestID is empty and expects ErrSyntheticRequestInvalid.
func TestSyntheticRuntimeTestServiceRejectsMissingRequestID(t *testing.T) {
	route := testServiceRoute("route-test-service-missing-request", []string{"node-a", "node-b"})
	sender := testRuntime("node-a", syntheticTestTransport{}, route)

	anonymous := testServiceRequest("", "hello")
	_, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, anonymous)
	if errors.Is(err, ErrSyntheticRequestInvalid) {
		return
	}
	t.Fatalf("err = %v, want ErrSyntheticRequestInvalid", err)
}
// TestSyntheticRuntimeTestServiceAllowsMaxPayloadBoundary configures both
// endpoints with MaxTestPayloadBytes of 8 and sends a payload of exactly 8
// bytes, expecting it to be accepted and echoed back unchanged.
func TestSyntheticRuntimeTestServiceAllowsMaxPayloadBoundary(t *testing.T) {
	route := testServiceRoute("route-test-service-max", []string{"node-a", "node-b"})
	transport := syntheticTestTransport{nodes: map[string]*SyntheticRuntime{}}

	// Both endpoints share the same configuration apart from their node ID.
	newEndpoint := func(nodeID string) *SyntheticRuntime {
		return NewSyntheticRuntime(SyntheticRuntimeConfig{
			Enabled:             true,
			Local:               PeerIdentity{ClusterID: "cluster-1", NodeID: nodeID},
			Routes:              []SyntheticRoute{route},
			Transport:           transport,
			MaxTestPayloadBytes: 8,
		})
	}
	sender := newEndpoint("node-a")
	receiver := newEndpoint("node-b")
	transport.nodes["node-a"], transport.nodes["node-b"] = sender, receiver

	// Exactly at the configured limit.
	boundary := strings.Repeat("a", 8)
	outcome, err := sender.SendTestService(context.Background(), route.RouteID, SyntheticChannelRouteControl, testServiceRequest("request-max", boundary))
	if err != nil {
		t.Fatalf("send test service: %v", err)
	}
	if echoed := outcome.Response.EchoPayload; echoed != boundary {
		t.Fatalf("EchoPayload = %q", echoed)
	}
}
@@ -0,0 +1,18 @@
package relay
import "errors"
// ErrProductionForwardingDisabled is returned by
// Skeleton.ForwardProductionPayload for every payload.
var ErrProductionForwardingDisabled = errors.New("relay skeleton does not forward production payloads before an approved production mesh stage")

// Skeleton is a placeholder relay identified by its cluster and node IDs. It
// can report whether it has enough identity to accept a control connection
// but refuses to forward production payloads.
type Skeleton struct {
	ClusterID string
	NodeID    string
}

// AcceptControlConnection reports whether both ClusterID and NodeID are set.
func (s Skeleton) AcceptControlConnection() bool {
	if s.ClusterID == "" {
		return false
	}
	return s.NodeID != ""
}

// ForwardProductionPayload ignores the payload and always returns
// ErrProductionForwardingDisabled.
func (s Skeleton) ForwardProductionPayload(_ []byte) error {
	return ErrProductionForwardingDisabled
}
@@ -0,0 +1,16 @@
package relay
import (
"errors"
"testing"
)
// TestRelaySkeletonAcceptsControlButNotPayloadForwarding checks that a
// skeleton with both IDs set accepts control connections while payload
// forwarding still fails with ErrProductionForwardingDisabled.
func TestRelaySkeletonAcceptsControlButNotPayloadForwarding(t *testing.T) {
	skeleton := Skeleton{ClusterID: "cluster-1", NodeID: "node-relay"}
	if accepted := skeleton.AcceptControlConnection(); !accepted {
		t.Fatal("relay skeleton should accept control connection metadata")
	}

	err := skeleton.ForwardProductionPayload([]byte("rdp"))
	if errors.Is(err, ErrProductionForwardingDisabled) {
		return
	}
	t.Fatalf("err = %v, want ErrProductionForwardingDisabled", err)
}
@@ -0,0 +1,129 @@
package state
import (
"crypto/rand"
"encoding/base64"
"encoding/json"
"errors"
"os"
"path/filepath"
"time"
)
// FileName is the name of the identity file; the functions in this package
// join it onto the state directory they are given.
const FileName = "identity.json"
// Identity is the node identity record that Save writes as JSON and Load
// reads back. Field names on disk are given by the struct tags.
type Identity struct {
// NodeID is left empty by LoadOrCreate and set by MarkApprovedWithAuthority.
NodeID string `json:"node_id"`
// ClusterID is set at creation and overwritten on enrollment and approval.
ClusterID string `json:"cluster_id"`
// NodeName is the name passed to LoadOrCreate.
NodeName string `json:"node_name"`
// NodeFingerprint is a random token generated by LoadOrCreate.
NodeFingerprint string `json:"node_fingerprint"`
// PublicKey is a random token generated by LoadOrCreate.
// NOTE(review): it is not derived from a key pair in this file — confirm intent.
PublicKey string `json:"public_key"`
// IdentityStatus is "new" at creation, "pending_approval" after enrollment is
// submitted, and the caller-supplied status after approval.
IdentityStatus string `json:"identity_status"`
// PendingJoinRequestID is set by MarkEnrollmentSubmitted and cleared on approval.
PendingJoinRequestID string `json:"pending_join_request_id,omitempty"`
// ClusterAuthorityPublicKey and ClusterAuthorityFingerprint are written by
// MarkApprovedWithAuthority; both are omitted from the JSON when empty.
ClusterAuthorityPublicKey string `json:"cluster_authority_public_key,omitempty"`
ClusterAuthorityFingerprint string `json:"cluster_authority_fingerprint,omitempty"`
// CreatedAt is the UTC time at which LoadOrCreate built the identity.
CreatedAt time.Time `json:"created_at"`
// UpdatedAt is refreshed to the current UTC time by every Save.
UpdatedAt time.Time `json:"updated_at"`
}
// LoadOrCreate returns the identity stored in dir, or creates, persists, and
// returns a new one when the identity file does not exist.
//
// An existing identity is returned as-is; clusterID and nodeName are only used
// when a new identity is created. A new identity gets a freshly generated
// fingerprint and public-key token and the status "new". Any load error other
// than a missing file is returned unchanged.
func LoadOrCreate(dir, clusterID, nodeName string) (Identity, error) {
	path := filepath.Join(dir, FileName)
	switch existing, err := Load(path); {
	case err == nil:
		return existing, nil
	case !errors.Is(err, os.ErrNotExist):
		// The file exists but could not be read or decoded; do not overwrite it.
		return Identity{}, err
	}

	createdAt := time.Now().UTC()
	fingerprint, err := randomToken("rap-node-fp")
	if err != nil {
		return Identity{}, err
	}
	publicKey, err := randomToken("rap-node-pub")
	if err != nil {
		return Identity{}, err
	}

	fresh := Identity{
		ClusterID:       clusterID,
		NodeName:        nodeName,
		NodeFingerprint: fingerprint,
		PublicKey:       publicKey,
		IdentityStatus:  "new",
		CreatedAt:       createdAt,
		UpdatedAt:       createdAt,
	}
	if saveErr := Save(path, fresh); saveErr != nil {
		return Identity{}, saveErr
	}
	return fresh, nil
}
// Load reads the file at path and decodes it as a JSON Identity. Read and
// decode errors are returned unchanged together with a zero Identity.
func Load(path string) (Identity, error) {
	raw, readErr := os.ReadFile(path)
	if readErr != nil {
		return Identity{}, readErr
	}

	decoded := Identity{}
	if decodeErr := json.Unmarshal(raw, &decoded); decodeErr != nil {
		return Identity{}, decodeErr
	}
	return decoded, nil
}
// Save writes identity to path as indented JSON, creating the parent directory
// with mode 0700 when it is missing.
//
// UpdatedAt is set to the current UTC time on the copy that is written;
// identity is passed by value, so the caller's struct is not modified.
//
// The file is replaced atomically: the JSON is written and synced to a
// temporary file in the same directory and then renamed over path. An
// interrupted write therefore leaves either the previous identity or the new
// one, never a truncated file that Load would fail to decode. The temporary
// file is created with mode 0600 and removed again if any step fails.
func Save(path string, identity Identity) error {
	dir := filepath.Dir(path)
	if err := os.MkdirAll(dir, 0o700); err != nil {
		return err
	}
	identity.UpdatedAt = time.Now().UTC()
	payload, err := json.MarshalIndent(identity, "", " ")
	if err != nil {
		return err
	}

	// The temp file lives next to the target so the rename stays on one
	// filesystem.
	tmp, err := os.CreateTemp(dir, filepath.Base(path)+".tmp-*")
	if err != nil {
		return err
	}
	tmpPath := tmp.Name()
	committed := false
	defer func() {
		if !committed {
			// Best-effort cleanup; the original error is what the caller needs.
			_ = os.Remove(tmpPath)
		}
	}()

	if _, err := tmp.Write(payload); err != nil {
		_ = tmp.Close()
		return err
	}
	if err := tmp.Sync(); err != nil {
		_ = tmp.Close()
		return err
	}
	if err := tmp.Close(); err != nil {
		return err
	}
	if err := os.Rename(tmpPath, path); err != nil {
		return err
	}
	committed = true
	return nil
}
// MarkEnrollmentSubmitted loads the identity stored in dir, records clusterID
// and joinRequestID on it, sets its status to "pending_approval", and saves
// it. It returns the updated identity, or a zero Identity and the error when
// loading or saving fails.
func MarkEnrollmentSubmitted(dir, clusterID, joinRequestID string) (Identity, error) {
	path := filepath.Join(dir, FileName)
	pending, err := Load(path)
	if err != nil {
		return Identity{}, err
	}

	pending.ClusterID = clusterID
	pending.PendingJoinRequestID = joinRequestID
	pending.IdentityStatus = "pending_approval"

	if saveErr := Save(path, pending); saveErr != nil {
		return Identity{}, saveErr
	}
	return pending, nil
}
// MarkApproved is MarkApprovedWithAuthority with an empty authority public key
// and fingerprint.
//
// NOTE(review): MarkApprovedWithAuthority assigns both authority fields
// unconditionally, so calling MarkApproved on an identity that already has a
// pinned cluster authority clears that pin — confirm this is intended.
func MarkApproved(dir string, nodeID, clusterID, status string) (Identity, error) {
	const noAuthority = ""
	return MarkApprovedWithAuthority(dir, nodeID, clusterID, status, noAuthority, noAuthority)
}
// MarkApprovedWithAuthority loads the identity stored in dir, records the
// assigned nodeID, clusterID, and status, clears the pending join request ID,
// stores the given cluster authority public key and fingerprint, and saves the
// result. It returns the updated identity, or a zero Identity and the error
// when loading or saving fails.
func MarkApprovedWithAuthority(dir string, nodeID, clusterID, status, authorityPublicKey, authorityFingerprint string) (Identity, error) {
	path := filepath.Join(dir, FileName)
	approved, err := Load(path)
	if err != nil {
		return Identity{}, err
	}

	approved.NodeID, approved.ClusterID = nodeID, clusterID
	approved.IdentityStatus = status
	approved.PendingJoinRequestID = ""
	// Both authority fields are overwritten even when the arguments are empty.
	approved.ClusterAuthorityPublicKey, approved.ClusterAuthorityFingerprint = authorityPublicKey, authorityFingerprint

	if err = Save(path, approved); err != nil {
		return Identity{}, err
	}
	return approved, nil
}
// randomToken returns prefix, an underscore, and 32 bytes from crypto/rand
// encoded as unpadded URL-safe base64. It returns an empty string and the
// error if the random source fails.
func randomToken(prefix string) (string, error) {
	entropy := make([]byte, 32)
	if _, err := rand.Read(entropy); err != nil {
		return "", err
	}
	encoded := base64.RawURLEncoding.EncodeToString(entropy)
	return prefix + "_" + encoded, nil
}
@@ -0,0 +1,55 @@
package state
import (
"path/filepath"
"testing"
)
// TestLoadOrCreatePersistsIdentity creates an identity in a temporary
// directory, checks that the generated fields are populated, and verifies that
// loading the file again yields the same fingerprint.
func TestLoadOrCreatePersistsIdentity(t *testing.T) {
	stateDir := t.TempDir()

	created, err := LoadOrCreate(stateDir, "cluster-1", "node-a")
	if err != nil {
		t.Fatalf("load or create: %v", err)
	}
	if created.NodeFingerprint == "" || created.PublicKey == "" {
		t.Fatalf("identity missing generated fields: %+v", created)
	}

	identityPath := filepath.Join(stateDir, FileName)
	reloaded, err := Load(identityPath)
	if err != nil {
		t.Fatalf("load identity: %v", err)
	}
	if reloaded.NodeFingerprint != created.NodeFingerprint {
		t.Fatal("identity fingerprint was not persisted")
	}
}
// TestMarkApprovedUpdatesIdentity walks an identity through creation,
// enrollment submission, and approval, then checks that the approved identity
// carries the node ID and status and no longer has a pending join request.
func TestMarkApprovedUpdatesIdentity(t *testing.T) {
	stateDir := t.TempDir()

	_, err := LoadOrCreate(stateDir, "cluster-1", "node-a")
	if err != nil {
		t.Fatalf("load or create: %v", err)
	}
	_, err = MarkEnrollmentSubmitted(stateDir, "cluster-1", "join-request-1")
	if err != nil {
		t.Fatalf("mark enrollment submitted: %v", err)
	}

	approved, err := MarkApproved(stateDir, "node-1", "cluster-1", "active")
	if err != nil {
		t.Fatalf("mark approved: %v", err)
	}
	asExpected := approved.NodeID == "node-1" &&
		approved.IdentityStatus == "active" &&
		approved.PendingJoinRequestID == ""
	if !asExpected {
		t.Fatalf("unexpected approved identity: %+v", approved)
	}
}
// TestMarkApprovedWithAuthorityPinsClusterAuthority approves a freshly created
// identity with an authority public key and fingerprint and checks that both
// values appear on the returned identity.
func TestMarkApprovedWithAuthorityPinsClusterAuthority(t *testing.T) {
	stateDir := t.TempDir()

	_, err := LoadOrCreate(stateDir, "cluster-1", "node-a")
	if err != nil {
		t.Fatalf("load or create: %v", err)
	}

	approved, err := MarkApprovedWithAuthority(stateDir, "node-1", "cluster-1", "active", "public-key-b64", "rap-ca-ed25519-test")
	if err != nil {
		t.Fatalf("mark approved with authority: %v", err)
	}
	pinned := approved.ClusterAuthorityPublicKey == "public-key-b64" &&
		approved.ClusterAuthorityFingerprint == "rap-ca-ed25519-test"
	if !pinned {
		t.Fatalf("authority pin was not persisted: %+v", approved)
	}
}
@@ -0,0 +1,40 @@
package supervisor
import (
"context"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
)
// Supervisor is implemented by anything that can be handed a list of desired
// workloads and report workload statuses back.
type Supervisor interface {
// Apply takes the desired workloads and returns workload status requests or
// an error. The only implementation in this file returns one status per
// desired workload; presumably that is the expected contract — TODO confirm.
Apply(ctx context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error)
}
// StubSupervisor is a Supervisor that starts nothing and only reports
// placeholder statuses. Version is reported for workloads that do not specify
// a version of their own.
type StubSupervisor struct {
	Version string
}

// Apply returns one status per desired workload, in the same order. A workload
// whose desired state is "disabled" is reported as "stopped"; every other
// desired state is reported as "degraded". The context is ignored and the
// returned error is always nil.
func (s StubSupervisor) Apply(_ context.Context, desired []client.DesiredWorkload) ([]client.WorkloadStatusRequest, error) {
	reports := make([]client.WorkloadStatusRequest, len(desired))
	for i := range desired {
		want := desired[i]

		reported := "degraded"
		if want.DesiredState == "disabled" {
			reported = "stopped"
		}

		// Fall back to the supervisor's version when the workload names none.
		reportedVersion := s.Version
		if want.Version != "" {
			reportedVersion = want.Version
		}

		reports[i] = client.WorkloadStatusRequest{
			ReportedState: reported,
			RuntimeMode:   want.RuntimeMode,
			Version:       reportedVersion,
			StatusPayload: map[string]any{
				"supervisor":    "stub",
				"desired_state": want.DesiredState,
				"service_type":  want.ServiceType,
			},
		}
	}
	return reports, nil
}
@@ -0,0 +1,35 @@
package supervisor
import (
"context"
"testing"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
)
// TestStubSupervisorReportsDegradedForEnabledWorkload applies a single enabled
// workload to the stub supervisor and expects exactly one status with the
// reported state "degraded".
func TestStubSupervisorReportsDegradedForEnabledWorkload(t *testing.T) {
	stub := StubSupervisor{Version: "test"}
	desired := []client.DesiredWorkload{
		{ServiceType: "rdp-worker", DesiredState: "enabled", RuntimeMode: "container"},
	}

	reports, err := stub.Apply(context.Background(), desired)
	if err != nil {
		t.Fatalf("apply desired workload: %v", err)
	}
	if count := len(reports); count != 1 {
		t.Fatalf("statuses length = %d", count)
	}
	if state := reports[0].ReportedState; state != "degraded" {
		t.Fatalf("ReportedState = %q", state)
	}
}
// TestStubSupervisorReportsStoppedForDisabledWorkload applies a single
// disabled workload to the stub supervisor and expects exactly one status with
// the reported state "stopped".
func TestStubSupervisorReportsStoppedForDisabledWorkload(t *testing.T) {
	statuses, err := (StubSupervisor{Version: "test"}).Apply(context.Background(), []client.DesiredWorkload{
		{ServiceType: "relay-node", DesiredState: "disabled", RuntimeMode: "container"},
	})
	if err != nil {
		t.Fatalf("apply desired workload: %v", err)
	}
	// Guard the index below so an empty result fails the test instead of
	// panicking, as the enabled-workload test already does.
	if len(statuses) != 1 {
		t.Fatalf("statuses length = %d", len(statuses))
	}
	if statuses[0].ReportedState != "stopped" {
		t.Fatalf("ReportedState = %q", statuses[0].ReportedState)
	}
}