Files
rdp-proxy/agents/rap-node-agent/cmd/rap-node-agent/main.go
T
2026-05-14 23:26:19 +03:00

4350 lines
172 KiB
Go

package main
import (
"context"
"encoding/json"
"fmt"
"log"
"net"
"net/http"
"os"
"os/exec"
"os/signal"
"path/filepath"
"runtime"
"sort"
"strings"
"sync/atomic"
"syscall"
"time"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/agent"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/authority"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/client"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/config"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/hostagent"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/mesh"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/state"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/supervisor"
"github.com/example/remote-access-platform/agents/rap-node-agent/internal/vpnruntime"
)
// Tunables and wire identifiers used by the mesh rendezvous-lease and
// route-health reporting paths. The code that consumes most of them lies
// outside this part of the file.
const (
// Durations. Judging by the names these pace lease renewal, lease refresh
// retries, synthetic config refresh and route-health feedback refresh —
// NOTE(review): confirm exact semantics at the use sites.
meshRendezvousLeaseRenewalWindow = time.Minute
meshRendezvousLeaseRefreshBackoff = 30 * time.Second
meshSyntheticConfigRefreshInterval = 20 * time.Second
meshRouteHealthFeedbackRefreshBackoff = 5 * time.Second
// Numeric cap; presumably limits entries in a lease report — confirm at the use site.
maxMeshRendezvousLeaseReportEntries = 20
// Schema-version strings ("…Schema") and capability names ("…Capability").
// These are wire values: changing a single character changes what the agent emits.
meshRendezvousLeaseReportSchema = "c17z18.mesh_rendezvous_lease_report.v1"
meshRendezvousLeaseTelemetryCapability = "mesh_rendezvous_lease_telemetry"
meshRendezvousLeaseRefreshCapability = "mesh_rendezvous_lease_refresh_contract"
meshRendezvousRelayReplacementCapability = "mesh_rendezvous_relay_replacement_contract"
meshRoutePathDecisionReportSchema = "c17z18.mesh_route_path_decision_report.v1"
meshRoutePathDecisionCapability = "mesh_route_path_decision_contract"
meshRouteGenerationReportSchema = "c17z18.mesh_route_generation_report.v1"
meshRouteGenerationTrackerCapability = "mesh_route_generation_tracker"
meshRouteHealthConfigReportSchema = "c17z20.mesh_route_health_config_report.v1"
meshRouteHealthConfigCapability = "mesh_route_health_config_from_path_decisions"
meshRouteHealthFeedbackRefreshSchema = "c17z20.mesh_route_health_feedback_refresh_report.v1"
meshRouteHealthFeedbackRefreshCapability = "mesh_route_health_feedback_refresh_contract"
)
// main loads the agent configuration, makes sure the node has an approved
// identity (enrolling and polling if necessary), starts the optional synthetic
// mesh endpoint and then runs the periodic heartbeat/report loop until the
// context is cancelled by SIGINT or SIGTERM.
//
// Failures inside the loop are logged and never abort the process; only
// startup failures (config, identity state, enrollment, mesh endpoint) are fatal.
func main() {
cfg, err := config.Load(os.Args[1:], nil)
if err != nil {
log.Fatalf("load config: %v", err)
}
// ctx is cancelled either by a termination signal or by the local cancel func,
// which is also handed to startSyntheticMeshEndpoint below.
signalCtx, stopSignals := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer stopSignals()
ctx, cancel := context.WithCancel(signalCtx)
defer cancel()
identity, err := state.LoadOrCreate(cfg.StateDir, cfg.ClusterID, cfg.NodeName)
if err != nil {
log.Fatalf("load identity state: %v", err)
}
api := client.New(cfg.BackendURL)
// An empty NodeID means the node has not been approved yet.
if identity.NodeID == "" {
identity, err = ensureApprovedIdentity(ctx, cfg, identity, api)
if err != nil {
log.Fatalf("bootstrap node identity: %v", err)
}
// ensureApprovedIdentity returns a still-pending identity (nil error) when its
// poll timeout elapses; in that case exit cleanly and let a later run resume.
if identity.NodeID == "" {
log.Printf("enrollment still pending: join_request_id=%s identity_file=%s", identity.PendingJoinRequestID, filepath.Join(cfg.StateDir, state.FileName))
return
}
}
log.Printf("node-agent started: node_id=%s cluster_id=%s backend=%s", identity.NodeID, identity.ClusterID, cfg.BackendURL)
vpnGateway := &vpnruntime.Gateway{API: api}
// meshState is nil when the synthetic mesh runtime is disabled; every use below
// either nil-checks it or passes it to a helper.
meshState, stopMeshEndpoint, err := startSyntheticMeshEndpoint(ctx, cancel, cfg, identity, api, vpnGateway)
if err != nil {
log.Fatalf("start synthetic mesh endpoint: %v", err)
}
defer stopMeshEndpoint()
// NOTE: this local shadows the imported supervisor package for the rest of main.
supervisor := supervisor.StubSupervisor{
Version: agent.Version,
RemoteWorkspaceRealAdapter: supervisor.RemoteWorkspaceRealAdapterConfig{
EnabledRequested: cfg.RemoteWorkspaceRealAdapterEnabled,
Command: cfg.RemoteWorkspaceRealAdapterCommand,
ArgsJSON: cfg.RemoteWorkspaceRealAdapterArgsJSON,
WorkDir: cfg.RemoteWorkspaceRealAdapterWorkDir,
},
}
startedAt := time.Now().UTC()
// NOTE(review): time.NewTicker panics for a non-positive interval; presumably
// config.Load guarantees HeartbeatInterval > 0 — confirm.
ticker := time.NewTicker(cfg.HeartbeatInterval)
defer ticker.Stop()
// The first iteration runs immediately; subsequent ones are paced by the ticker.
for {
// On a heartbeat error the loop still continues with whatever flags were
// returned alongside the error.
flags, err := sendHeartbeat(ctx, api, cfg, identity, meshState)
if err != nil {
log.Printf("heartbeat failed: %v", err)
}
if flags.Enabled && flags.TelemetryEnabled {
telemetry := agent.TelemetryPayload(identity, startedAt)
if telemetry.Payload == nil {
telemetry.Payload = map[string]any{}
}
// Attach the optional mesh-derived reports when the corresponding collectors exist.
if meshState != nil && meshState.ServiceChannelAccessStats != nil {
telemetry.Payload["fabric_service_channel_access_report"] = meshState.ServiceChannelAccessStats.Report(time.Now().UTC())
}
if meshState != nil && meshState.RemoteWorkspaceFrameSink != nil {
telemetry.Payload["remote_workspace_adapter_sink_report"] = meshState.RemoteWorkspaceFrameSink.Report(time.Now().UTC())
}
if err := api.ReportTelemetry(ctx, identity.ClusterID, identity.NodeID, telemetry); err != nil {
log.Printf("telemetry failed: %v", err)
} else {
log.Printf("telemetry sent: node_id=%s cluster_id=%s scopes=%v", identity.NodeID, identity.ClusterID, flags.AppliedScopes)
}
}
if cfg.WorkloadSupervisionEnabled {
if err := reportWorkloadStatus(ctx, api, supervisor, identity, meshState); err != nil {
log.Printf("workload status failed: %v", err)
}
}
// The VPN gateway runtime and assignment status are handled on every tick,
// independent of the heartbeat flags.
if err := ensureVPNGatewayRuntime(ctx, api, identity, vpnGateway, meshState); err != nil {
log.Printf("vpn gateway runtime failed: %v", err)
}
if err := reportVPNAssignmentStatus(ctx, api, identity, vpnGateway); err != nil {
log.Printf("vpn assignment status failed: %v", err)
}
logProductionObservationSinkMetrics(meshState)
// Mesh reporting and refresh steps run only when the heartbeat response enables synthetic links.
if flags.Enabled && flags.SyntheticLinksEnabled {
if err := api.ReportMeshLink(ctx, identity.ClusterID, agent.MeshSelfObservationPayload(identity)); err != nil {
log.Printf("mesh self-observation failed: %v", err)
} else {
log.Printf("mesh self-observation sent: node_id=%s cluster_id=%s scopes=%v", identity.NodeID, identity.ClusterID, flags.AppliedScopes)
}
if err := refreshRendezvousLeasesIfNeeded(ctx, cfg, identity, api, meshState, time.Now().UTC()); err != nil {
log.Printf("mesh rendezvous lease refresh failed: %v", err)
}
if err := refreshSyntheticMeshConfigIfDue(ctx, cfg, identity, api, meshState, time.Now().UTC()); err != nil {
log.Printf("mesh synthetic config refresh failed: %v", err)
}
if err := reportSyntheticRouteHealth(ctx, cfg, api, identity, meshState); err != nil {
log.Printf("mesh synthetic route health failed: %v", err)
}
if err := probeWarmPeerHealth(ctx, api, identity, meshState); err != nil {
log.Printf("mesh warm peer health failed: %v", err)
}
}
// Wait for the next tick or for shutdown.
select {
case <-ctx.Done():
log.Print("node-agent stopped")
return
case <-ticker.C:
}
}
}
// joinRequestEnvelope is the minimal shape decoded from the raw join-request
// JSON of an enrollment response; only the "id" field is read
// (see parseJoinRequestID).
type joinRequestEnvelope struct {
ID string `json:"id"`
}
// nodeApprovalAuthorityPayload is the decoded form of the signed authority
// payload carried in a node bootstrap response. verifyEnrollmentBootstrap
// decodes it after the signature check and compares its fields against the
// bootstrap envelope and the local identity.
type nodeApprovalAuthorityPayload struct {
SchemaVersion string `json:"schema_version"` // must equal "rap.cluster.node_approval.v1"
ClusterID string `json:"cluster_id"`
JoinRequestID string `json:"join_request_id"`
NodeID string `json:"node_id"`
NodeFingerprint string `json:"node_fingerprint"`
IdentityStatus string `json:"identity_status"`
HeartbeatEndpoint string `json:"heartbeat_endpoint"`
ApprovedByUserID string `json:"approved_by_user_id"` // not checked by verifyEnrollmentBootstrap
IssuedAt time.Time `json:"issued_at"` // not checked by verifyEnrollmentBootstrap
ControlPlaneOnly bool `json:"control_plane_only"` // verification requires true
ProductionForwarding bool `json:"production_forwarding"` // verification requires false
}
// ensureApprovedIdentity drives node enrollment to completion.
//
// If no join request is pending it submits one (which requires cfg.JoinToken)
// and persists the pending request ID. It then polls the bootstrap endpoint
// every cfg.EnrollmentPollInterval until one of the following happens:
//   - the request is approved: the bootstrap contract is verified, the approved
//     identity is persisted and returned;
//   - the request is rejected or cancelled: an error is returned;
//   - ctx is cancelled: ctx.Err() is returned;
//   - cfg.EnrollmentPollTimeout (if > 0) elapses: the still-pending identity is
//     returned with a nil error, so callers must check NodeID.
//
// Poll errors are logged and retried, not returned.
func ensureApprovedIdentity(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client) (state.Identity, error) {
// The cluster ID stored in the identity takes precedence over the configured one.
clusterID := firstNonEmpty(identity.ClusterID, cfg.ClusterID)
if clusterID == "" {
return state.Identity{}, fmt.Errorf("cluster ID is required for enrollment")
}
identity.ClusterID = clusterID
// First enrollment: submit the join request and persist its ID before polling.
if identity.PendingJoinRequestID == "" {
if cfg.JoinToken == "" {
return state.Identity{}, fmt.Errorf("join token is required for first enrollment")
}
response, err := api.Enroll(ctx, agent.EnrollmentPayload(clusterID, cfg.JoinToken, identity))
if err != nil {
return state.Identity{}, fmt.Errorf("enroll node: %w", err)
}
joinRequestID, err := parseJoinRequestID(response.JoinRequest)
if err != nil {
return state.Identity{}, err
}
// identity is replaced by the value returned from the state package here.
identity, err = state.MarkEnrollmentSubmitted(cfg.StateDir, clusterID, joinRequestID)
if err != nil {
return state.Identity{}, fmt.Errorf("persist pending enrollment: %w", err)
}
log.Printf("enrollment submitted: status=%s join_request_id=%s identity_file=%s", response.Status, joinRequestID, filepath.Join(cfg.StateDir, state.FileName))
}
// A zero deadline means "poll without a time limit".
deadline := time.Time{}
if cfg.EnrollmentPollTimeout > 0 {
deadline = time.Now().UTC().Add(cfg.EnrollmentPollTimeout)
}
for {
response, err := api.BootstrapEnrollment(ctx, identity.PendingJoinRequestID, client.EnrollmentBootstrapRequest{
ClusterID: clusterID,
NodeFingerprint: identity.NodeFingerprint,
PublicKey: identity.PublicKey,
})
if err == nil {
switch response.Status {
case "approved":
if response.Bootstrap == nil {
return state.Identity{}, fmt.Errorf("approved enrollment missing bootstrap contract")
}
// Nothing is persisted unless the bootstrap contract passes verification.
if err := verifyEnrollmentBootstrap(*response.Bootstrap, identity, cfg); err != nil {
return state.Identity{}, err
}
// ClusterAuthority is non-nil here: verifyEnrollmentBootstrap rejects a nil one.
approved, err := state.MarkApprovedWithAuthority(
cfg.StateDir,
response.Bootstrap.NodeID,
response.Bootstrap.ClusterID,
response.Bootstrap.IdentityStatus,
response.Bootstrap.ClusterAuthority.PublicKey,
response.Bootstrap.ClusterAuthority.PublicKeyFingerprint,
)
if err != nil {
return state.Identity{}, fmt.Errorf("persist approved identity: %w", err)
}
log.Printf("enrollment approved: node_id=%s cluster_id=%s authority=%s", approved.NodeID, approved.ClusterID, approved.ClusterAuthorityFingerprint)
return approved, nil
case "rejected", "cancelled":
return state.Identity{}, fmt.Errorf("enrollment %s", response.Status)
default:
// Any other status is treated as "still waiting".
log.Printf("enrollment waiting for approval: status=%s join_request_id=%s", response.Status, identity.PendingJoinRequestID)
}
} else {
log.Printf("enrollment bootstrap poll failed: %v", err)
}
// Timeout: hand back the pending identity without an error.
if cfg.EnrollmentPollTimeout > 0 && !deadline.IsZero() && !time.Now().UTC().Before(deadline) {
return identity, nil
}
// NOTE(review): with a non-positive EnrollmentPollInterval time.After fires
// immediately and this loop polls without pause; presumably config validation
// prevents that — confirm.
select {
case <-ctx.Done():
return state.Identity{}, ctx.Err()
case <-time.After(cfg.EnrollmentPollInterval):
}
}
}
// parseJoinRequestID decodes the raw join-request JSON from an enrollment
// response and returns its "id" with surrounding whitespace removed.
//
// It returns an error when the JSON cannot be decoded or when the id is empty
// or consists only of whitespace.
func parseJoinRequestID(raw json.RawMessage) (string, error) {
	decoded := joinRequestEnvelope{}
	err := json.Unmarshal(raw, &decoded)
	if err != nil {
		return "", fmt.Errorf("decode join request: %w", err)
	}

	id := strings.TrimSpace(decoded.ID)
	if id == "" {
		return "", fmt.Errorf("join request id missing from enrollment response")
	}
	return id, nil
}
// verifyEnrollmentBootstrap validates a node bootstrap contract before the
// approved identity is persisted. It returns nil only when every check passes:
//
//  1. cluster authority, authority signature and authority payload are present;
//  2. the bootstrap names this node's cluster and has a node ID and status;
//  3. the authority descriptor has the expected schema, cluster and algorithm;
//  4. the signature's key fingerprint matches the descriptor's fingerprint;
//  5. any pinned fingerprint / public key (identity first, then config) matches;
//  6. the signature over the raw payload verifies against the descriptor's key;
//  7. the decoded payload agrees with the bootstrap envelope and local identity,
//     is control-plane-only and does not enable production forwarding;
//  8. the payload's join request ID matches the pending one, when one is pending.
//
// NOTE(review): when neither a fingerprint nor a public key is pinned, the
// authority key presented by the backend is accepted as-is. Nothing in this
// function recomputes the fingerprint from the public key; presumably
// authority.VerifyRaw does — confirm.
func verifyEnrollmentBootstrap(bootstrap client.NodeBootstrap, identity state.Identity, cfg config.Config) error {
if bootstrap.ClusterAuthority == nil {
return fmt.Errorf("node bootstrap missing cluster authority")
}
if bootstrap.AuthoritySignature == nil || rawMessageEmpty(bootstrap.AuthorityPayload) {
return fmt.Errorf("node bootstrap missing authority payload or signature")
}
if bootstrap.ClusterID != identity.ClusterID || bootstrap.NodeID == "" || bootstrap.IdentityStatus == "" {
return fmt.Errorf("node bootstrap identity mismatch")
}
if bootstrap.ClusterAuthority.SchemaVersion != authority.AuthoritySchemaVersion ||
bootstrap.ClusterAuthority.ClusterID != bootstrap.ClusterID ||
bootstrap.ClusterAuthority.KeyAlgorithm != authority.AlgorithmEd25519 {
return fmt.Errorf("node bootstrap cluster authority descriptor mismatch")
}
if bootstrap.AuthoritySignature.KeyFingerprint != bootstrap.ClusterAuthority.PublicKeyFingerprint {
return fmt.Errorf("node bootstrap authority fingerprint mismatch")
}
// Pinning: a value stored in the identity wins over the configured one; an
// empty pin disables the respective check.
if pinned := firstNonEmpty(identity.ClusterAuthorityFingerprint, cfg.ClusterAuthorityFingerprint); pinned != "" && pinned != bootstrap.ClusterAuthority.PublicKeyFingerprint {
return fmt.Errorf("node bootstrap pinned authority fingerprint mismatch")
}
if pinned := firstNonEmpty(identity.ClusterAuthorityPublicKey, cfg.ClusterAuthorityPublicKey); pinned != "" && pinned != bootstrap.ClusterAuthority.PublicKey {
return fmt.Errorf("node bootstrap pinned authority public key mismatch")
}
// Convert the client-side signature into the authority package's type.
signature := authority.Signature{
SchemaVersion: bootstrap.AuthoritySignature.SchemaVersion,
Algorithm: bootstrap.AuthoritySignature.Algorithm,
KeyFingerprint: bootstrap.AuthoritySignature.KeyFingerprint,
Signature: bootstrap.AuthoritySignature.Signature,
}
// The signature is checked over the raw payload bytes, before they are decoded.
if err := authority.VerifyRaw(bootstrap.ClusterAuthority.PublicKey, bootstrap.AuthorityPayload, signature); err != nil {
return fmt.Errorf("verify node bootstrap authority signature: %w", err)
}
var payload nodeApprovalAuthorityPayload
if err := json.Unmarshal(bootstrap.AuthorityPayload, &payload); err != nil {
return fmt.Errorf("decode node bootstrap authority payload: %w", err)
}
// The signed payload must agree with the unsigned envelope fields, so the
// envelope cannot claim anything the authority did not sign.
if payload.SchemaVersion != "rap.cluster.node_approval.v1" ||
payload.ClusterID != bootstrap.ClusterID ||
payload.NodeID != bootstrap.NodeID ||
payload.NodeFingerprint != identity.NodeFingerprint ||
payload.IdentityStatus != bootstrap.IdentityStatus ||
payload.HeartbeatEndpoint != bootstrap.HeartbeatEndpoint ||
!payload.ControlPlaneOnly ||
payload.ProductionForwarding {
return fmt.Errorf("node bootstrap authority payload mismatch")
}
if identity.PendingJoinRequestID != "" && payload.JoinRequestID != identity.PendingJoinRequestID {
return fmt.Errorf("node bootstrap authority payload join request mismatch")
}
return nil
}
// syntheticMeshState bundles everything the agent keeps about the synthetic
// mesh endpoint between heartbeat ticks. It is created by
// startSyntheticMeshEndpoint (nil when the mesh runtime is disabled) and passed
// to the per-tick helpers called from main.
//
// NOTE(review): the struct has no mutex; whether it is ever touched from more
// than one goroutine is not visible in this part of the file.
type syntheticMeshState struct {
// Runtime, routes and control-plane data taken from the loaded mesh config.
Runtime *mesh.SyntheticRuntime
Routes []mesh.SyntheticRoute
RouteHealthRoutes []mesh.SyntheticRoute
Source string
PeerCache *mesh.PeerCache
RendezvousLeases []mesh.PeerRendezvousLease
RoutePathDecisions *client.RoutePathDecisionReport
ServiceChannelFeedback *client.FabricServiceChannelFeedbackReport
ServiceChannelAdaptivePolicy *client.FabricServiceChannelAdaptivePolicy
ServiceChannelRemediationCommands []client.FabricServiceChannelRemediationCommand
RouteGenerationTracker *meshRouteGenerationTracker
ConfigVersion string
PeerDirectoryVersion string
PolicyVersion string
// Peer connection tracking and the most recent planning results.
PeerConnections *mesh.PeerConnectionTracker
PeerConnectionManager *mesh.PeerConnectionManager
LastPeerRecoveryPlan *mesh.PeerRecoveryPlan
LastPeerConnectionIntent *mesh.PeerConnectionIntentPlan
LastConfigRefreshAt time.Time
// Lease-refresh and route-health-refresh bookkeeping; not set by
// startSyntheticMeshEndpoint, so maintained by code outside this excerpt.
LastLeaseRefresh *meshRendezvousLeaseRefreshState
LeaseRefreshAttempts int
LeaseRefreshSuccesses int
LeaseRefreshFailures int
LastRouteHealthRefresh *meshRouteHealthFeedbackRefreshState
RouteHealthRefreshAttempts int
RouteHealthRefreshSuccesses int
RouteHealthRefreshFailures int
RouteHealthRefreshSuppressed int
// Production forwarding and VPN fabric plumbing.
ProductionObservationSink *mesh.ProductionEnvelopeObservationSink
ProductionForwardTransport mesh.ProductionForwardTransport
ProductionForwardingEnabled bool
VPNFabricInbox *vpnruntime.FabricPacketInbox
VPNFabricIngress *vpnruntime.FabricClientPacketIngress
VPNGateway *vpnruntime.Gateway
// Collectors whose reports main attaches to the telemetry payload.
ServiceChannelAccessStats *fabricServiceChannelAccessStats
RemoteWorkspaceFrameSink *mesh.RemoteWorkspaceFrameProbeSink
LastProductionSinkMetrics *mesh.ProductionEnvelopeObservationSinkMetrics
// HTTP listener state; ListenerHandler allows the served handler to be swapped
// without restarting the listener.
ListenerReport meshListenerReport
ListenerConfigKey string
ListenerRuntimeConfig config.Config
ListenerHandler *dynamicHTTPHandler
StopListener func()
// ConfigLoadError is the text of the initial config load error, if any.
ConfigLoadError string
}
// fabricServiceChannelAccessStats accumulates counters and "last seen" values
// for fabric service channel access log entries. Every field is an atomic, so
// the zero value is ready to use.
type fabricServiceChannelAccessStats struct {
	// Counters, incremented by Observe.
	Total                  atomic.Int64
	Signed                 atomic.Int64
	Introspection          atomic.Int64
	LegacyUnsigned         atomic.Int64
	BackendFallback        atomic.Int64
	BackendFallbackBlocked atomic.Int64
	FabricRouteSendFailure atomic.Int64
	DataPlaneContract      atomic.Int64

	// LastAcceptedUnixSec is the Unix time (seconds) of the latest observed entry.
	LastAcceptedUnixSec atomic.Int64

	// Last-seen string values; each atomic.Value holds a string once set.
	LastDataPlaneMode   atomic.Value
	LastWorkingData     atomic.Value
	LastSteadyState     atomic.Value
	LastBackendRelay    atomic.Value
	LastLogicalFlowMode atomic.Value
	LastViolationStatus atomic.Value
	LastViolationReason atomic.Value
}

// newFabricServiceChannelAccessStats returns an empty stats collector.
func newFabricServiceChannelAccessStats() *fabricServiceChannelAccessStats {
	return new(fabricServiceChannelAccessStats)
}
// Observe folds one access log entry into the counters. It is a no-op on a nil
// receiver.
//
// Every entry bumps Total and updates LastAcceptedUnixSec (the entry's
// OccurredAt, or the current time when that is zero). The remaining counters
// and last-seen values depend on the whitespace-trimmed entry fields.
func (s *fabricServiceChannelAccessStats) Observe(entry mesh.FabricServiceChannelAccessLogEntry) {
	if s == nil {
		return
	}
	s.Total.Add(1)

	// Which acceptance path admitted the request.
	acceptedBy := strings.TrimSpace(entry.AcceptedBy)
	switch acceptedBy {
	case "signed":
		s.Signed.Add(1)
	case "introspection":
		s.Introspection.Add(1)
	case "legacy_unsigned":
		s.LegacyUnsigned.Add(1)
	}

	// A forced backend fallback only counts when the relay policy is not "disabled".
	relayPolicy := strings.TrimSpace(entry.BackendRelayPolicy)
	if entry.ForceBackendFallback && relayPolicy != "disabled" {
		s.BackendFallback.Add(1)
	}

	violation := strings.TrimSpace(entry.ViolationStatus)
	switch violation {
	case "backend_fallback_blocked_by_policy", "fabric_route_send_failed_backend_fallback_blocked":
		s.BackendFallbackBlocked.Add(1)
		// The send-failure variant is additionally counted as a route send failure.
		if violation == "fabric_route_send_failed_backend_fallback_blocked" {
			s.FabricRouteSendFailure.Add(1)
		}
	}
	if violation != "" {
		s.LastViolationStatus.Store(violation)
		s.LastViolationReason.Store(strings.TrimSpace(entry.ViolationReason))
	}

	if entry.DataPlaneValid {
		s.DataPlaneContract.Add(1)
		s.LastDataPlaneMode.Store(strings.TrimSpace(entry.DataPlaneMode))
		s.LastWorkingData.Store(strings.TrimSpace(entry.WorkingDataTransport))
		s.LastSteadyState.Store(strings.TrimSpace(entry.SteadyStateTransport))
		s.LastBackendRelay.Store(relayPolicy)
		s.LastLogicalFlowMode.Store(strings.TrimSpace(entry.LogicalFlowMode))
	}

	stamp := entry.OccurredAt
	if stamp.IsZero() {
		stamp = time.Now().UTC()
	}
	s.LastAcceptedUnixSec.Store(stamp.Unix())
}
// Report renders the current counters as a telemetry map. It returns nil on a
// nil receiver.
//
// observedAt is written (in UTC, RFC 3339 with nanoseconds) as "observed_at";
// a zero value is replaced by the current time. The acceptance counters appear
// twice, under their plain names and with an "accepted_by_" prefix. Last-seen
// string values and "last_accepted_at" are included only once they are set
// and non-empty.
func (s *fabricServiceChannelAccessStats) Report(observedAt time.Time) map[string]any {
	if s == nil {
		return nil
	}
	stamp := observedAt
	if stamp.IsZero() {
		stamp = time.Now().UTC()
	}

	out := map[string]any{
		"schema_version":              "c18z52.fabric_service_channel_access_report.v1",
		"observed_at":                 stamp.UTC().Format(time.RFC3339Nano),
		"total":                       s.Total.Load(),
		"signed":                      s.Signed.Load(),
		"introspection":               s.Introspection.Load(),
		"legacy_unsigned":             s.LegacyUnsigned.Load(),
		"backend_fallback":            s.BackendFallback.Load(),
		"backend_fallback_blocked":    s.BackendFallbackBlocked.Load(),
		"fabric_route_send_failure":   s.FabricRouteSendFailure.Load(),
		"data_plane_contract":         s.DataPlaneContract.Load(),
		"accepted_by_signed":          s.Signed.Load(),
		"accepted_by_introspection":   s.Introspection.Load(),
		"accepted_by_legacy_unsigned": s.LegacyUnsigned.Load(),
	}

	// Optional string fields, emitted in this order when set and non-empty.
	optional := []struct {
		key  string
		load func() any
	}{
		{"last_data_plane_mode", s.LastDataPlaneMode.Load},
		{"last_working_data_transport", s.LastWorkingData.Load},
		{"last_steady_state_transport", s.LastSteadyState.Load},
		{"last_backend_relay_policy", s.LastBackendRelay.Load},
		{"last_logical_flow_mode", s.LastLogicalFlowMode.Load},
		{"last_data_plane_violation_status", s.LastViolationStatus.Load},
		{"last_data_plane_violation_reason", s.LastViolationReason.Load},
	}
	for _, field := range optional {
		text, isString := field.load().(string)
		if isString && text != "" {
			out[field.key] = text
		}
	}

	if sec := s.LastAcceptedUnixSec.Load(); sec > 0 {
		out["last_accepted_at"] = time.Unix(sec, 0).UTC().Format(time.RFC3339Nano)
	}
	return out
}
// dynamicHTTPHandler is an http.Handler whose delegate can be replaced while
// requests are being served. The zero value and a nil pointer both answer
// every request with 404 Not Found.
type dynamicHTTPHandler struct {
	// current always holds a dynamicHTTPHandlerTarget (or nothing at all).
	current atomic.Value
}

// dynamicHTTPHandlerTarget wraps the delegate so that every value stored in
// dynamicHTTPHandler.current has the same concrete type. atomic.Value panics
// when successive Store calls use different concrete types, which storing the
// http.Handler directly would do as soon as the delegate changed from, say,
// a *http.ServeMux to an http.HandlerFunc.
type dynamicHTTPHandlerTarget struct {
	handler http.Handler
}

// newDynamicHTTPHandler returns a dynamicHTTPHandler that initially delegates
// to handler. A nil handler is replaced by http.NotFoundHandler().
func newDynamicHTTPHandler(handler http.Handler) *dynamicHTTPHandler {
	out := &dynamicHTTPHandler{}
	out.Update(handler)
	return out
}

// Update atomically replaces the delegate. A nil handler is replaced by
// http.NotFoundHandler(). Calling Update on a nil receiver is a no-op, matching
// the nil tolerance of ServeHTTP.
func (h *dynamicHTTPHandler) Update(handler http.Handler) {
	if h == nil {
		return
	}
	if handler == nil {
		handler = http.NotFoundHandler()
	}
	h.current.Store(dynamicHTTPHandlerTarget{handler: handler})
}

// ServeHTTP forwards the request to the current delegate, or responds with
// 404 Not Found when the receiver is nil or no delegate has been stored.
func (h *dynamicHTTPHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	if h == nil {
		http.NotFound(w, r)
		return
	}
	target, _ := h.current.Load().(dynamicHTTPHandlerTarget)
	if target.handler == nil {
		http.NotFound(w, r)
		return
	}
	target.handler.ServeHTTP(w, r)
}
// meshListenerReport is the JSON-serialisable status of the mesh HTTP listener.
// In this part of the file it is produced by startSyntheticMeshHTTPServer and
// stored in syntheticMeshState.ListenerReport; the meaning of the individual
// status strings is defined where the report is built (outside this excerpt).
type meshListenerReport struct {
SchemaVersion string `json:"schema_version"`
ConfiguredListenAddr string `json:"configured_listen_addr,omitempty"`
EffectiveListenAddr string `json:"effective_listen_addr,omitempty"`
ListenPortMode string `json:"listen_port_mode"`
Status string `json:"status"`
InboundReachability string `json:"inbound_reachability"`
ControlPlaneReachable bool `json:"control_plane_reachable"`
OneWayConnectivity bool `json:"one_way_connectivity"`
FailureReason string `json:"failure_reason,omitempty"`
FailureError string `json:"failure_error,omitempty"`
PortConflict bool `json:"port_conflict,omitempty"`
AutoPortSelected bool `json:"auto_port_selected,omitempty"`
ObservedAt string `json:"observed_at"` // timestamp as a string; format set by the producer
}
// meshOutboundSessionReport is a JSON-serialisable summary of the node's
// outbound session together with listener status, peer connection counts and
// forwarding flags. It is built outside this part of the file.
type meshOutboundSessionReport struct {
SchemaVersion string `json:"schema_version"`
Status string `json:"status"`
Direction string `json:"direction"`
Transport string `json:"transport"`
ControlPlaneURL string `json:"control_plane_url,omitempty"`
ConnectivityMode string `json:"connectivity_mode,omitempty"`
InboundListenerRequired bool `json:"inbound_listener_required"`
UsableForInboundControl bool `json:"usable_for_inbound_control"`
// Listener* fields presumably mirror meshListenerReport — confirm at the producer.
ListenerStatus string `json:"listener_status,omitempty"`
ListenerFailureReason string `json:"listener_failure_reason,omitempty"`
ListenerPortConflict bool `json:"listener_port_conflict,omitempty"`
ConfigLoadError string `json:"config_load_error,omitempty"`
PeerConnectionReady int `json:"peer_connection_ready"`
PeerConnectionRelayReady int `json:"peer_connection_relay_ready"`
PeerConnectionWaiting int `json:"peer_connection_waiting"`
RendezvousLeaseCount int `json:"rendezvous_lease_count"`
ProductionForwarding bool `json:"production_forwarding"`
ServiceWorkloadTraffic bool `json:"service_workload_traffic"`
ObservedAt string `json:"observed_at"` // timestamp as a string; format set by the producer
}
// meshRendezvousLeaseRefreshState records the outcome of one rendezvous lease
// refresh attempt. It is referenced from syntheticMeshState.LastLeaseRefresh;
// the code that fills it in is outside this part of the file.
type meshRendezvousLeaseRefreshState struct {
Status string
Reason string
Error string
AttemptedAt time.Time
CompletedAt time.Time
// Lease counts before and after the refresh, plus per-category counts.
PreviousLeaseCount int
RefreshedLeaseCount int
RefreshNeededCount int
RenewalNeededCount int
ExpiredCount int
StaleRelayCount int
ConfigVersion string
}
// meshRouteHealthFeedbackTrigger describes an event that may prompt a
// route-health feedback refresh. Its fields reappear in
// meshRouteHealthFeedbackRefreshState; producers and consumers are outside this
// part of the file.
type meshRouteHealthFeedbackTrigger struct {
Reason string
RouteID string
PeerNodeID string
SelectedRelayID string
LinkStatus string
FailureReason string
DriftDetected bool
ObservedAt time.Time
}
// meshRouteHealthFeedbackRefreshState records the outcome of one route-health
// feedback refresh attempt. It is referenced from
// syntheticMeshState.LastRouteHealthRefresh; the code that fills it in is
// outside this part of the file.
type meshRouteHealthFeedbackRefreshState struct {
Status string
Reason string
Error string
AttemptedAt time.Time
CompletedAt time.Time
// The following fields have the same names as those of meshRouteHealthFeedbackTrigger.
RouteID string
PeerNodeID string
SelectedRelayID string
LinkStatus string
FailureReason string
DriftDetected bool
// Config version and route-health route count before and after the refresh.
PreviousConfigVersion string
RefreshedConfigVersion string
PreviousRouteHealthRouteCount int
RefreshedRouteHealthRouteCount int
}
// loadedSyntheticMeshConfig is the result of loadSyntheticMeshConfig: the peer,
// route and policy data from which startSyntheticMeshEndpoint builds the mesh
// runtime. When loading fails, startSyntheticMeshEndpoint substitutes an empty
// value whose Source is "config_load_failed".
type loadedSyntheticMeshConfig struct {
PeerEndpoints map[string]string
PeerEndpointCandidates map[string][]mesh.PeerEndpointCandidate
PeerDirectory []mesh.PeerDirectoryEntry
RecoverySeeds []mesh.PeerRecoverySeed
RendezvousLeases []mesh.PeerRendezvousLease
RoutePathDecisions *client.RoutePathDecisionReport
ServiceChannelFeedback *client.FabricServiceChannelFeedbackReport
ServiceChannelRemediationCommands []client.FabricServiceChannelRemediationCommand
ServiceChannelAdaptivePolicy *client.FabricServiceChannelAdaptivePolicy
MeshListener *client.MeshListenerConfig
Routes []mesh.SyntheticRoute
Source string
ConfigVersion string
PeerDirectoryVersion string
PolicyVersion string
// ProductionForwarding is OR-ed with cfg.MeshProductionForwardingEnabled by
// startSyntheticMeshEndpoint to decide whether production forwarding is on.
ProductionForwarding bool
}
// startSyntheticMeshEndpoint builds the synthetic mesh runtime for this node and
// starts its HTTP listener.
//
// It returns the assembled mesh state, a function that stops the listener and
// an error. In the visible code the error is always nil:
//   - when cfg.MeshSyntheticRuntimeEnabled is false it returns (nil, no-op, nil);
//   - when loading the mesh config fails, the failure is logged, an empty config
//     with Source "config_load_failed" is used instead, and the error text is
//     kept in the returned state's ConfigLoadError.
//
// The context.CancelFunc parameter is accepted but unused.
func startSyntheticMeshEndpoint(ctx context.Context, _ context.CancelFunc, cfg config.Config, identity state.Identity, api *client.Client, vpnGateway *vpnruntime.Gateway) (*syntheticMeshState, func(), error) {
noop := func() {}
if !cfg.MeshSyntheticRuntimeEnabled {
return nil, noop, nil
}
local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
// err is deliberately kept after this block: it is recorded as ConfigLoadError
// in the returned state.
loadedConfig, err := loadSyntheticMeshConfig(ctx, cfg, identity, api)
if err != nil {
log.Printf("synthetic mesh config load failed; starting diagnostics-only mesh state: %v", err)
loadedConfig = loadedSyntheticMeshConfig{
PeerEndpoints: map[string]string{},
PeerEndpointCandidates: map[string][]mesh.PeerEndpointCandidate{},
PeerDirectory: []mesh.PeerDirectoryEntry{},
RecoverySeeds: []mesh.PeerRecoverySeed{},
RendezvousLeases: []mesh.PeerRendezvousLease{},
Routes: []mesh.SyntheticRoute{},
Source: "config_load_failed",
}
}
peerEndpoints := loadedConfig.PeerEndpoints
routes := loadedConfig.Routes
// Production forwarding is on when either the local config or the loaded config enables it.
productionForwardingEnabled := cfg.MeshProductionForwardingEnabled || loadedConfig.ProductionForwarding
routeHealthRoutes := routeHealthRoutesFromPathDecisions(routes, loadedConfig.RoutePathDecisions)
// Peer cache, connection tracker, recovery plan and connection intents are all
// derived from the loaded config at startup.
peerCache := mesh.NewPeerCache(mesh.PeerCacheConfig{
Local: local,
PeerEndpoints: loadedConfig.PeerEndpoints,
PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates,
PeerDirectory: loadedConfig.PeerDirectory,
RecoverySeeds: loadedConfig.RecoverySeeds,
RendezvousLeases: loadedConfig.RendezvousLeases,
Routes: loadedConfig.Routes,
WarmPeerLimit: mesh.DefaultWarmPeerLimit,
PreferredRegion: cfg.MeshRegion,
Now: time.Now().UTC(),
})
peerCacheSnapshot := peerCache.Snapshot()
peerConnections := mesh.NewPeerConnectionTracker(peerCacheSnapshot, time.Now().UTC())
peerConnectionSnapshot := peerConnections.Snapshot()
peerRecoveryPlan := mesh.PlanPeerRecovery(mesh.PeerRecoveryPlanConfig{
PeerCache: peerCacheSnapshot,
Connections: peerConnectionSnapshot,
TargetReadyPeers: mesh.DefaultStablePeerTarget,
MaxProbeCandidates: mesh.DefaultRecoveryProbeLimit,
Now: time.Now().UTC(),
})
peerConnectionIntentPlan := mesh.PlanPeerConnectionIntents(mesh.PeerConnectionIntentPlanConfig{
PeerCache: peerCacheSnapshot,
RecoveryPlan: peerRecoveryPlan,
RendezvousLeases: loadedConfig.RendezvousLeases,
Now: time.Now().UTC(),
})
peerConnectionManager := mesh.NewPeerConnectionManager(mesh.PeerConnectionManagerConfig{
Local: local,
PeerCache: peerCache,
Tracker: peerConnections,
RendezvousLeases: loadedConfig.RendezvousLeases,
})
routeGenerationTracker := newMeshRouteGenerationTracker(loadedConfig.RoutePathDecisions, time.Now().UTC())
gateEnabled, runtimeEnabled := productionForwardingLogState(cfg, loadedConfig.ProductionForwarding)
log.Printf(
"synthetic mesh config loaded: source=%s node_id=%s cluster_id=%s peers=%d routes=%d peer_cache_peers=%d warm_peers=%d recovery_seeds=%d rendezvous_leases=%d peer_connection_states=%d peer_recovery_mode=%s peer_recovery_target_ready_peers=%d peer_connection_intents=%d rendezvous_required=%d rendezvous_resolved=%d production_forwarding_gate_enabled=%t production_forwarding_runtime_enabled=%t",
loadedConfig.Source,
identity.NodeID,
identity.ClusterID,
len(peerEndpoints),
len(routes),
peerCacheSnapshot.PeerCount,
peerCacheSnapshot.WarmPeerCount,
peerCacheSnapshot.RecoverySeedCount,
peerCacheSnapshot.RendezvousLeaseCount,
peerConnectionSnapshot.Total,
peerRecoveryPlan.Mode,
peerRecoveryPlan.TargetReadyPeers,
peerConnectionIntentPlan.IntentCount,
peerConnectionIntentPlan.RendezvousRequiredCount,
peerConnectionIntentPlan.RendezvousResolvedCount,
gateEnabled,
runtimeEnabled,
)
// NOTE: from here on the local variable runtime shadows the imported runtime package.
runtime := mesh.NewSyntheticRuntime(mesh.SyntheticRuntimeConfig{
Enabled: true,
Local: local,
Routes: routes,
RouteHealthRoutes: routeHealthRoutes,
Transport: mesh.NewHTTPPeerTransport(peerEndpoints),
// Synthetic events are logged as a single JSON line each.
Logger: func(entry mesh.SyntheticLogEntry) {
payload, err := json.Marshal(entry)
if err != nil {
log.Printf("mesh synthetic event marshal failed: %v", err)
return
}
log.Printf("mesh_synthetic_event=%s", string(payload))
},
})
productionObservationSink := productionEnvelopeObservationSinkFromConfig(cfg)
// Only install the observer when a sink exists, so the server field stays nil otherwise.
var productionEnvelopeObserver mesh.ProductionEnvelopeObserver
if productionObservationSink != nil {
productionEnvelopeObserver = productionObservationSink.Observe
}
// The forward transport stays nil unless production forwarding is enabled.
var productionForwardTransport mesh.ProductionForwardTransport
if productionForwardingEnabled {
productionForwardTransport = mesh.NewHTTPProductionForwardTransport(peerEndpoints)
}
vpnFabricInbox := vpnruntime.NewFabricPacketInbox(4096)
serviceChannelAccessStats := newFabricServiceChannelAccessStats()
remoteWorkspaceFrameSink := mesh.NewRemoteWorkspaceFrameProbeSink()
vpnFabricIngress := &vpnruntime.FabricClientPacketIngress{
ForwardTransport: productionForwardTransport,
Inbox: vpnFabricInbox,
FlowScheduler: vpnruntime.NewFabricFlowScheduler(0, 0),
MaxParallelFlowSends: 4,
ClusterID: identity.ClusterID,
LocalNodeID: identity.NodeID,
LocalGateway: func(vpnConnectionID string) bool {
return vpnGateway != nil && vpnGateway.IsReadyForConnection(vpnConnectionID)
},
// The closure returns the routes slice captured at startup.
Routes: func() []mesh.SyntheticRoute {
return routes
},
}
// Both ingress updates use the same timestamp.
initialRouteManagerAt := time.Now().UTC()
vpnFabricIngress.UpdateRouteManager(routeManagerDecisionsFromControlPlane(loadedConfig.RoutePathDecisions, loadedConfig.ServiceChannelRemediationCommands), loadedConfig.ConfigVersion, initialRouteManagerAt)
vpnFabricIngress.UpdateRouteQualityPreferences(routeQualityPreferencesFromServiceChannelFeedback(loadedConfig.ServiceChannelFeedback, initialRouteManagerAt), initialRouteManagerAt)
serverHandler := mesh.Server{
Local: local,
SyntheticRuntime: runtime,
ProductionForwardingEnabled: productionForwardingEnabled,
ProductionEnvelopeObserver: productionEnvelopeObserver,
ProductionEnvelopeDelivery: vpnFabricInbox.DeliverProductionEnvelope,
ProductionForwardTransport: productionForwardTransport,
ProductionForwardLogger: func(entry mesh.ProductionForwardLogEntry) {
payload, err := json.Marshal(entry)
if err != nil {
log.Printf("mesh production forward event marshal failed: %v", err)
return
}
log.Printf("mesh_production_forward_event=%s", string(payload))
},
// Access entries are counted first, so the stats are updated even if marshalling fails.
FabricServiceChannelLogger: func(entry mesh.FabricServiceChannelAccessLogEntry) {
serviceChannelAccessStats.Observe(entry)
payload, err := json.Marshal(entry)
if err != nil {
log.Printf("fabric service channel access event marshal failed: %v", err)
return
}
log.Printf("fabric_service_channel_access_event=%s", string(payload))
},
RemoteWorkspaceFrameSink: remoteWorkspaceFrameSink,
ProductionRoutes: routes,
VPNPacketIngress: vpnFabricIngress,
BackendProxyBaseURL: cfg.BackendURL,
ClusterAuthorityPublicKey: firstNonEmpty(identity.ClusterAuthorityPublicKey, cfg.ClusterAuthorityPublicKey),
}.Handler()
// The listener serves through a swappable wrapper, so the handler can be
// replaced later without restarting the listener.
dynamicListenerHandler := newDynamicHTTPHandler(serverHandler)
listenerCfg := meshListenerRuntimeConfig(cfg, loadedConfig.MeshListener)
listenerReport, stopListener := startSyntheticMeshHTTPServer(ctx, listenerCfg, identity, dynamicListenerHandler, len(peerEndpoints), len(routes), gateEnabled, runtimeEnabled)
// NOTE(review): loadedConfig.ServiceChannelAdaptivePolicy is not copied into the
// returned state although syntheticMeshState has a matching field — confirm
// whether this is intentional (e.g. applied on the first config refresh).
return &syntheticMeshState{
Runtime: runtime,
Routes: routes,
RouteHealthRoutes: routeHealthRoutes,
Source: loadedConfig.Source,
PeerCache: peerCache,
RendezvousLeases: loadedConfig.RendezvousLeases,
RoutePathDecisions: loadedConfig.RoutePathDecisions,
ServiceChannelFeedback: loadedConfig.ServiceChannelFeedback,
// Copied so the state does not share a backing array with the loaded config.
ServiceChannelRemediationCommands: append([]client.FabricServiceChannelRemediationCommand{}, loadedConfig.ServiceChannelRemediationCommands...),
RouteGenerationTracker: routeGenerationTracker,
ConfigVersion: loadedConfig.ConfigVersion,
PeerDirectoryVersion: loadedConfig.PeerDirectoryVersion,
PolicyVersion: loadedConfig.PolicyVersion,
LastConfigRefreshAt: time.Now().UTC(),
PeerConnections: peerConnections,
PeerConnectionManager: peerConnectionManager,
LastPeerRecoveryPlan: &peerRecoveryPlan,
LastPeerConnectionIntent: &peerConnectionIntentPlan,
ProductionObservationSink: productionObservationSink,
ProductionForwardTransport: productionForwardTransport,
ProductionForwardingEnabled: productionForwardingEnabled,
VPNFabricInbox: vpnFabricInbox,
VPNFabricIngress: vpnFabricIngress,
VPNGateway: vpnGateway,
ServiceChannelAccessStats: serviceChannelAccessStats,
RemoteWorkspaceFrameSink: remoteWorkspaceFrameSink,
ListenerReport: listenerReport,
ListenerConfigKey: meshListenerConfigKey(listenerCfg),
ListenerRuntimeConfig: listenerCfg,
ListenerHandler: dynamicListenerHandler,
StopListener: stopListener,
// err is the loadSyntheticMeshConfig error from above (nil on success).
ConfigLoadError: errorString(err),
}, stopListener, nil
}
// productionForwardingLogState reports the two production-forwarding flags that
// are written to the startup log. Both results are always equal: each is true
// when forwarding is enabled in the local config or by the control plane.
func productionForwardingLogState(cfg config.Config, signedControlPlaneEnabled bool) (gateEnabled bool, runtimeEnabled bool) {
	gateEnabled = cfg.MeshProductionForwardingEnabled || signedControlPlaneEnabled
	runtimeEnabled = gateEnabled
	return gateEnabled, runtimeEnabled
}
// newVPNFabricIngress returns the fabric client packet ingress for meshState,
// reusing meshState.VPNFabricIngress when one exists and allocating a fresh one
// otherwise. The ingress is then refreshed with the current forward transport,
// inbox, identity, routes, route-manager decisions and route-quality
// preferences. It returns nil when meshState is nil or has no VPN fabric inbox.
func newVPNFabricIngress(meshState *syntheticMeshState, identity state.Identity, routes []mesh.SyntheticRoute, decisions *client.RoutePathDecisionReport, remediationCommands []client.FabricServiceChannelRemediationCommand, serviceChannelFeedback *client.FabricServiceChannelFeedbackReport, adaptivePolicy *client.FabricServiceChannelAdaptivePolicy, configVersion string, vpnGateway *vpnruntime.Gateway) *vpnruntime.FabricClientPacketIngress {
	if meshState == nil {
		return nil
	}
	if meshState.VPNFabricInbox == nil {
		return nil
	}

	target := meshState.VPNFabricIngress
	if target == nil {
		target = new(vpnruntime.FabricClientPacketIngress)
	}
	target.PreventLastRouteWithdrawal = true

	// A missing gateway is reported as "not ready" for every connection.
	gatewayReady := func(vpnConnectionID string) bool {
		if vpnGateway == nil {
			return false
		}
		return vpnGateway.IsReadyForConnection(vpnConnectionID)
	}
	// The route provider hands back the slice captured for this refresh.
	currentRoutes := func() []mesh.SyntheticRoute {
		return routes
	}
	target.UpdateRuntime(
		meshState.ProductionForwardTransport,
		meshState.VPNFabricInbox,
		identity.ClusterID,
		identity.NodeID,
		gatewayReady,
		currentRoutes,
		serviceChannelRecoveryPolicyFingerprint(serviceChannelFeedback),
		vpnruntimeAdaptivePolicy(adaptivePolicy),
	)

	// One timestamp is shared by both updates below.
	stamp := time.Now().UTC()
	managerDecisions := routeManagerDecisionsFromControlPlane(decisions, remediationCommands)
	target.UpdateRouteManager(managerDecisions, configVersion, stamp)
	qualityPreferences := routeQualityPreferencesFromServiceChannelFeedback(serviceChannelFeedback, stamp)
	target.UpdateRouteQualityPreferences(qualityPreferences, stamp)
	return target
}
// vpnruntimeAdaptivePolicy copies a control-plane adaptive policy into its
// vpnruntime counterpart field by field. A nil policy yields the zero value.
func vpnruntimeAdaptivePolicy(policy *client.FabricServiceChannelAdaptivePolicy) vpnruntime.FabricServiceChannelAdaptivePolicy {
	var converted vpnruntime.FabricServiceChannelAdaptivePolicy
	if policy != nil {
		converted.SchemaVersion = policy.SchemaVersion
		converted.Fingerprint = policy.Fingerprint
		converted.MaxParallelWindow = policy.MaxParallelWindow
		converted.BulkPressureChannelThreshold = policy.BulkPressureChannelThreshold
		converted.QueuePressureHighWatermark = policy.QueuePressureHighWatermark
		converted.QueuePressureMaxInFlight = policy.QueuePressureMaxInFlight
		converted.ClassWindows = policy.ClassWindows
	}
	return converted
}
// serviceChannelRecoveryPolicyFingerprint returns the whitespace-trimmed
// fingerprint of the report's recovery policy, or "" when the report or its
// recovery policy is nil.
func serviceChannelRecoveryPolicyFingerprint(report *client.FabricServiceChannelFeedbackReport) string {
	if report != nil && report.RecoveryPolicy != nil {
		return strings.TrimSpace(report.RecoveryPolicy.Fingerprint)
	}
	return ""
}
// routeQualityPreferencesFromServiceChannelFeedback converts the healthy,
// positively scored and unexpired observations of a feedback report into route
// quality preferences.
//
// observedAt is the reference instant for expiry checks; a zero value falls
// back to the current time. A nil report yields nil. An observation is skipped
// when its route ID is blank, its status is not "healthy", its effective score
// is not positive, or it has a non-zero ExpiresAt that is not after the
// reference instant. The effective score is EffectiveScoreAdjustment, or
// ScoreAdjustment when the former is not positive. ObservedAt and ExpiresAt are
// formatted as RFC 3339 (nanosecond) UTC strings whether or not they are zero.
func routeQualityPreferencesFromServiceChannelFeedback(report *client.FabricServiceChannelFeedbackReport, observedAt time.Time) []vpnruntime.FabricServiceChannelRouteQualityPreference {
	if report == nil {
		return nil
	}
	cutoff := observedAt.UTC()
	if cutoff.IsZero() {
		cutoff = time.Now().UTC()
	}
	preferences := make([]vpnruntime.FabricServiceChannelRouteQualityPreference, 0, len(report.Observations))
	for _, obs := range report.Observations {
		score := obs.EffectiveScoreAdjustment
		if score <= 0 {
			score = obs.ScoreAdjustment
		}
		switch {
		case strings.TrimSpace(obs.RouteID) == "":
			continue
		case strings.TrimSpace(obs.FeedbackStatus) != "healthy":
			continue
		case score <= 0:
			continue
		case !obs.ExpiresAt.IsZero() && !obs.ExpiresAt.After(cutoff):
			continue
		}
		preference := vpnruntime.FabricServiceChannelRouteQualityPreference{
			RouteID:            obs.RouteID,
			FeedbackStatus:     obs.FeedbackStatus,
			ScoreAdjustment:    score,
			RawScoreAdjustment: obs.ScoreAdjustment,
			Reasons:            append([]string{}, obs.Reasons...),
			LastSendDurationMs: obs.LastSendDurationMs,
			ObservedAt:         obs.ObservedAt.UTC().Format(time.RFC3339Nano),
			ExpiresAt:          obs.ExpiresAt.UTC().Format(time.RFC3339Nano),
		}
		preferences = append(preferences, preference)
	}
	return preferences
}
// routeManagerDecisionsFromPathDecisions maps every path decision that carries
// a non-blank rebuild status onto a route-manager decision. A nil report
// yields nil; EffectiveHops is copied so the result does not alias the report.
func routeManagerDecisionsFromPathDecisions(report *client.RoutePathDecisionReport) []vpnruntime.FabricServiceChannelRouteManagerDecision {
	if report == nil {
		return nil
	}
	converted := make([]vpnruntime.FabricServiceChannelRouteManagerDecision, 0, len(report.Decisions))
	for _, source := range report.Decisions {
		if strings.TrimSpace(source.RebuildStatus) != "" {
			converted = append(converted, vpnruntime.FabricServiceChannelRouteManagerDecision{
				RouteID:            source.RouteID,
				ReplacementRouteID: source.ReplacementRouteID,
				RebuildRequestID:   source.RebuildRequestID,
				RebuildStatus:      source.RebuildStatus,
				RebuildReason:      source.RebuildReason,
				RebuildAttempt:     source.RebuildAttempt,
				DecisionSource:     source.DecisionSource,
				Generation:         source.Generation,
				EffectiveHops:      append([]string{}, source.EffectiveHops...),
			})
		}
	}
	return converted
}
// routeManagerDecisionsFromControlPlane merges path-decision based
// route-manager decisions with decisions synthesised from service-channel
// remediation commands.
//
// A command contributes a decision only when all of the following hold:
//   - its action is "prefer_alternate_route" or "rebuild_route";
//   - its guard status is blank or "allowed";
//   - it names a primary route;
//   - it has no expiry or the expiry is still in the future;
//   - its command ID does not match the rebuild request ID of a path decision.
//
// "prefer_alternate_route" additionally needs a replacement route different
// from the primary and is emitted with status "applied"; "rebuild_route" is
// emitted with status "pending_degraded_fallback".
func routeManagerDecisionsFromControlPlane(report *client.RoutePathDecisionReport, commands []client.FabricServiceChannelRemediationCommand) []vpnruntime.FabricServiceChannelRouteManagerDecision {
	decisions := routeManagerDecisionsFromPathDecisions(report)
	if len(commands) == 0 {
		return decisions
	}

	// Request IDs already covered by path decisions; commands appended below
	// are intentionally not added to this set.
	coveredRequestIDs := make(map[string]struct{}, len(decisions))
	for i := range decisions {
		id := strings.TrimSpace(decisions[i].RebuildRequestID)
		if id == "" {
			continue
		}
		coveredRequestIDs[id] = struct{}{}
	}

	evaluatedAt := time.Now().UTC()
	for _, command := range commands {
		action := strings.TrimSpace(command.Action)
		preferAlternate := action == "prefer_alternate_route"
		if !preferAlternate && action != "rebuild_route" {
			continue
		}
		if guard := strings.TrimSpace(command.GuardStatus); guard != "" && guard != "allowed" {
			continue
		}
		primary := strings.TrimSpace(command.PrimaryRouteID)
		if primary == "" {
			continue
		}
		if !command.ExpiresAt.IsZero() && !command.ExpiresAt.After(evaluatedAt) {
			continue
		}
		commandID := strings.TrimSpace(command.CommandID)
		if commandID != "" {
			if _, covered := coveredRequestIDs[commandID]; covered {
				continue
			}
		}
		replacement := strings.TrimSpace(command.ReplacementRouteID)
		status := "pending_degraded_fallback"
		if preferAlternate {
			if replacement == "" || primary == replacement {
				continue
			}
			status = "applied"
		}
		decisions = append(decisions, vpnruntime.FabricServiceChannelRouteManagerDecision{
			RouteID:            primary,
			ReplacementRouteID: replacement,
			RebuildRequestID:   commandID,
			RebuildStatus:      status,
			RebuildReason:      firstNonEmpty(command.Reason, "service_channel_remediation_"+action),
			DecisionSource:     "service_channel_remediation_command",
			Generation:         commandID,
		})
	}
	return decisions
}
// errorString returns err's message, or "" when err is nil.
func errorString(err error) string {
	if err != nil {
		return err.Error()
	}
	return ""
}
// startSyntheticMeshHTTPServer binds the synthetic mesh listener described by
// cfg and serves handler on it in a background goroutine.
//
// It returns a listener report and a stop function. The report is "disabled"
// when the listen port mode is "disabled" or the listen address is blank,
// "listen_failed" when binding fails, and "listening" or "auto_rebound"
// otherwise. The stop function shuts the server down with a five-second grace
// period; it is a no-op when no server was started. The server is also shut
// down when ctx is cancelled.
//
// peerCount, routeCount, gateEnabled and runtimeEnabled are only used in the
// startup log line.
func startSyntheticMeshHTTPServer(ctx context.Context, cfg config.Config, identity state.Identity, handler http.Handler, peerCount int, routeCount int, gateEnabled bool, runtimeEnabled bool) (meshListenerReport, func()) {
	now := time.Now().UTC()
	mode := defaultString(cfg.MeshListenPortMode, "manual")
	baseReport := meshListenerReport{
		SchemaVersion:         "c17z21.mesh_listener_report.v1",
		ConfiguredListenAddr:  cfg.MeshListenAddr,
		ListenPortMode:        mode,
		Status:                "disabled",
		InboundReachability:   "unavailable",
		ControlPlaneReachable: true,
		OneWayConnectivity:    true,
		ObservedAt:            now.Format(time.RFC3339Nano),
	}
	if mode == "disabled" || strings.TrimSpace(cfg.MeshListenAddr) == "" {
		if strings.TrimSpace(cfg.MeshListenAddr) == "" {
			baseReport.FailureReason = "listen_addr_empty"
			log.Print("synthetic mesh runtime enabled, but RAP_MESH_LISTEN_ADDR is empty; inbound endpoint disabled")
		} else {
			baseReport.FailureReason = "listen_disabled"
			log.Printf("synthetic mesh endpoint disabled by listen port mode: node_id=%s cluster_id=%s", identity.NodeID, identity.ClusterID)
		}
		return baseReport, func() {}
	}
	listener, effectiveAddr, autoSelected, bindErr := bindSyntheticMeshListener(cfg)
	if bindErr != nil {
		baseReport.Status = "listen_failed"
		baseReport.FailureReason = "bind_failed"
		baseReport.FailureError = bindErr.Error()
		baseReport.PortConflict = isAddressInUse(bindErr)
		log.Printf("synthetic mesh endpoint unavailable: listen_addr=%s mode=%s node_id=%s cluster_id=%s err=%v", cfg.MeshListenAddr, mode, identity.NodeID, identity.ClusterID, bindErr)
		return baseReport, func() {}
	}
	report := baseReport
	report.Status = "listening"
	if autoSelected {
		report.Status = "auto_rebound"
	}
	report.EffectiveListenAddr = effectiveAddr
	report.InboundReachability = reachabilityFromConnectivityMode(cfg.MeshConnectivityMode)
	report.OneWayConnectivity = cfg.MeshConnectivityMode == "outbound_only"
	report.AutoPortSelected = autoSelected
	server := &http.Server{
		Addr:              effectiveAddr,
		Handler:           handler,
		ReadHeaderTimeout: 5 * time.Second,
	}
	go func() {
		log.Printf(
			"synthetic mesh endpoint starting: listen_addr=%s effective_listen_addr=%s mode=%s node_id=%s cluster_id=%s peers=%d routes=%d production_forwarding_gate_enabled=%t production_forwarding_runtime_enabled=%t",
			cfg.MeshListenAddr,
			effectiveAddr,
			mode,
			identity.NodeID,
			identity.ClusterID,
			peerCount,
			routeCount,
			gateEnabled,
			runtimeEnabled,
		)
		if err := server.Serve(listener); err != nil && err != http.ErrServerClosed {
			log.Printf("synthetic mesh endpoint stopped unexpectedly: %v", err)
		}
	}()

	// stopped is cancelled by the returned stop function so the watcher below
	// exits as soon as the listener is replaced. Without it, every listener
	// restart would leave a goroutine (and its server reference) parked on
	// ctx.Done() until the whole process context ends.
	stopped, markStopped := context.WithCancel(context.Background())
	go func() {
		defer markStopped()
		select {
		case <-stopped.Done():
			// The stop function performs the shutdown itself.
			return
		case <-ctx.Done():
		}
		shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer shutdownCancel()
		if err := server.Shutdown(shutdownCtx); err != nil {
			log.Printf("synthetic mesh endpoint shutdown failed: %v", err)
		}
	}()
	return report, func() {
		markStopped()
		shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer shutdownCancel()
		// Best-effort: the stop hook has no error return and callers replace
		// the listener regardless of the outcome.
		_ = server.Shutdown(shutdownCtx)
	}
}
// meshListenerRuntimeConfig overlays the control-plane supplied listener
// settings in desired onto a copy of base and returns the result. base itself
// is not modified, and a nil desired returns base unchanged.
//
// String overrides are normalised (trimmed; the port mode and desired state
// are also lower-cased) before they are tested, so a blank or whitespace-only
// value never replaces the base setting. Any non-blank desired state other
// than "enabled" forces the listen port mode to "disabled". Auto-port bounds
// are applied only when positive. Production forwarding is enabled when
// either base or desired enables it.
func meshListenerRuntimeConfig(base config.Config, desired *client.MeshListenerConfig) config.Config {
	out := base
	if desired == nil {
		return out
	}
	if addr := strings.TrimSpace(desired.ListenAddr); addr != "" {
		out.MeshListenAddr = addr
	}
	if mode := strings.ToLower(strings.TrimSpace(desired.ListenPortMode)); mode != "" {
		out.MeshListenPortMode = mode
	}
	// Normalised like ListenPortMode so that "Enabled" or " enabled " is not
	// mistaken for a request to disable the listener.
	if desiredState := strings.ToLower(strings.TrimSpace(string(desired.DesiredState))); desiredState != "" && desiredState != "enabled" {
		out.MeshListenPortMode = "disabled"
	}
	if desired.AutoPortStart > 0 {
		out.MeshListenAutoPortStart = desired.AutoPortStart
	}
	if desired.AutoPortEnd > 0 {
		out.MeshListenAutoPortEnd = desired.AutoPortEnd
	}
	if endpoint := strings.TrimRight(strings.TrimSpace(desired.AdvertiseEndpoint), "/"); endpoint != "" {
		out.MeshAdvertiseEndpoint = endpoint
	}
	if transport := strings.TrimSpace(desired.AdvertiseTransport); transport != "" {
		out.MeshAdvertiseTransport = transport
	}
	if connectivity := strings.TrimSpace(desired.ConnectivityMode); connectivity != "" {
		out.MeshConnectivityMode = connectivity
	}
	if natType := strings.TrimSpace(desired.NATType); natType != "" {
		out.MeshNATType = natType
	}
	if region := strings.TrimSpace(desired.Region); region != "" {
		out.MeshRegion = region
	}
	out.MeshProductionForwardingEnabled = base.MeshProductionForwardingEnabled || desired.ProductionForwarding
	return out
}
// meshListenerConfigKey builds a "|"-joined fingerprint of the listener-related
// settings in cfg. String fields are trimmed (the port mode is also
// lower-cased and the advertise endpoint loses trailing slashes), so configs
// that differ only in such formatting share a key.
func meshListenerConfigKey(cfg config.Config) string {
	var (
		listenAddr = strings.TrimSpace(cfg.MeshListenAddr)
		portMode   = strings.ToLower(strings.TrimSpace(cfg.MeshListenPortMode))
		portStart  = fmt.Sprintf("%d", cfg.MeshListenAutoPortStart)
		portEnd    = fmt.Sprintf("%d", cfg.MeshListenAutoPortEnd)
		endpoint   = strings.TrimRight(strings.TrimSpace(cfg.MeshAdvertiseEndpoint), "/")
		transport  = strings.TrimSpace(cfg.MeshAdvertiseTransport)
		connMode   = strings.TrimSpace(cfg.MeshConnectivityMode)
		natType    = strings.TrimSpace(cfg.MeshNATType)
		region     = strings.TrimSpace(cfg.MeshRegion)
		forwarding = fmt.Sprintf("%t", cfg.MeshProductionForwardingEnabled)
	)
	parts := []string{listenAddr, portMode, portStart, portEnd, endpoint, transport, connMode, natType, region, forwarding}
	return strings.Join(parts, "|")
}
// bindSyntheticMeshListener opens the TCP listener for the synthetic mesh
// endpoint.
//
// It first tries cfg.MeshListenAddr. When that fails and the port mode is
// "auto", it walks the inclusive range MeshListenAutoPortStart..
// MeshListenAutoPortEnd on the same host and takes the first port that binds.
// It returns the listener, its effective address, whether an automatic port
// was chosen, and — when nothing could be bound — the error from the initial
// attempt.
func bindSyntheticMeshListener(cfg config.Config) (net.Listener, string, bool, error) {
	primary, primaryErr := net.Listen("tcp", cfg.MeshListenAddr)
	switch {
	case primaryErr == nil:
		return primary, primary.Addr().String(), false, nil
	case cfg.MeshListenPortMode != "auto":
		return nil, "", false, primaryErr
	}

	// An unparsable configured address falls back to binding on all interfaces.
	host, _, splitErr := net.SplitHostPort(cfg.MeshListenAddr)
	if splitErr != nil {
		host = ""
	}
	for candidate := cfg.MeshListenAutoPortStart; candidate <= cfg.MeshListenAutoPortEnd; candidate++ {
		fallback, listenErr := net.Listen("tcp", net.JoinHostPort(host, fmt.Sprintf("%d", candidate)))
		if listenErr != nil {
			continue
		}
		return fallback, fallback.Addr().String(), true, nil
	}
	return nil, "", false, primaryErr
}
// isAddressInUse reports whether err's message, compared case-insensitively,
// contains one of the two known "address already in use" phrasings. A nil
// error is never a conflict.
func isAddressInUse(err error) bool {
	if err == nil {
		return false
	}
	message := strings.ToLower(err.Error())
	for _, marker := range []string{
		"address already in use",
		"only one usage of each socket address",
	} {
		if strings.Contains(message, marker) {
			return true
		}
	}
	return false
}
// productionEnvelopeObservationSinkFromConfig creates the production envelope
// observation sink with the configured capacity and logs that it is enabled.
// It returns nil when the configured capacity is zero or negative.
func productionEnvelopeObservationSinkFromConfig(cfg config.Config) *mesh.ProductionEnvelopeObservationSink {
	capacity := cfg.MeshProductionObservationSinkCapacity
	if capacity > 0 {
		created := mesh.NewProductionEnvelopeObservationSink(capacity)
		log.Printf("production envelope observation sink enabled: capacity=%d payload_storage=false", created.Capacity())
		return created
	}
	return nil
}
// logProductionObservationSinkMetrics logs the observation sink's metrics when
// they differ from the snapshot logged last time, and stores the new snapshot
// on meshState. It does nothing when meshState or its sink is nil.
func logProductionObservationSinkMetrics(meshState *syntheticMeshState) {
	if meshState == nil {
		return
	}
	if meshState.ProductionObservationSink == nil {
		return
	}
	current := meshState.ProductionObservationSink.Metrics()
	if previous := meshState.LastProductionSinkMetrics; previous != nil {
		if productionObservationSinkMetricsEqual(*previous, current) {
			return
		}
	}
	meshState.LastProductionSinkMetrics = &current
	log.Printf(
		"production envelope observation sink metrics: capacity=%d current_depth=%d accepted_total=%d dropped_oldest=%d payload_storage=false",
		current.Capacity,
		current.CurrentDepth,
		current.AcceptedTotal,
		current.DroppedOldest,
	)
}
// productionObservationSinkMetricsEqual reports whether a and b agree on
// capacity, current depth, accepted total and dropped-oldest count.
func productionObservationSinkMetricsEqual(a, b mesh.ProductionEnvelopeObservationSinkMetrics) bool {
	switch {
	case a.Capacity != b.Capacity,
		a.CurrentDepth != b.CurrentDepth,
		a.AcceptedTotal != b.AcceptedTotal,
		a.DroppedOldest != b.DroppedOldest:
		return false
	}
	return true
}
// loadSyntheticMeshConfig resolves the synthetic mesh configuration for the
// local node from the first applicable source:
//
//  1. the scoped config file at cfg.MeshSyntheticConfigPath, when set
//     (Source "scoped_config"; production forwarding is left disabled);
//  2. the control plane via api, when api is non-nil, the fetch succeeds, the
//     response passes authority verification and reports Enabled
//     (Source "control_plane");
//  3. the debug JSON settings in cfg (Source "debug_json").
//
// A scoped-file load error or an authority verification failure is returned
// to the caller. A control-plane fetch error is only logged before falling
// back to debug JSON; a verified but disabled control-plane config falls back
// silently.
func loadSyntheticMeshConfig(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client) (loadedSyntheticMeshConfig, error) {
	self := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}

	if path := cfg.MeshSyntheticConfigPath; path != "" {
		scoped, err := mesh.LoadScopedSyntheticConfig(path, self)
		if err != nil {
			return loadedSyntheticMeshConfig{}, err
		}
		fromFile := loadedSyntheticMeshConfig{
			PeerEndpoints:          scoped.PeerEndpoints,
			PeerEndpointCandidates: scoped.PeerEndpointCandidates,
			PeerDirectory:          scoped.PeerDirectory,
			RecoverySeeds:          scoped.RecoverySeeds,
			RendezvousLeases:       scoped.RendezvousLeases,
			RoutePathDecisions:     nil,
			Routes:                 scoped.Routes,
			Source:                 "scoped_config",
			ConfigVersion:          scoped.ConfigVersion,
			PeerDirectoryVersion:   scoped.PeerDirectoryVersion,
			PolicyVersion:          scoped.PolicyVersion,
			ProductionForwarding:   false,
		}
		return fromFile, nil
	}

	if api != nil {
		remote, fetchErr := api.SyntheticMeshConfig(ctx, self.ClusterID, self.NodeID)
		if fetchErr != nil {
			log.Printf("control-plane synthetic mesh config unavailable, falling back to debug JSON: %v", fetchErr)
		} else {
			// Verification runs for every fetched config, enabled or not.
			if verifyErr := verifyControlPlaneSyntheticMeshConfig(remote, identity, cfg); verifyErr != nil {
				return loadedSyntheticMeshConfig{}, verifyErr
			}
			if remote.Enabled {
				fromControlPlane := loadedSyntheticMeshConfig{
					PeerEndpoints:                     remote.PeerEndpoints,
					PeerEndpointCandidates:            peerEndpointCandidatesFromControlPlane(remote.PeerEndpointCandidates),
					PeerDirectory:                     peerDirectoryFromControlPlane(remote.PeerDirectory),
					RecoverySeeds:                     recoverySeedsFromControlPlane(remote.RecoverySeeds),
					RendezvousLeases:                  rendezvousLeasesFromControlPlane(remote.RendezvousLeases),
					RoutePathDecisions:                remote.RoutePathDecisions,
					ServiceChannelFeedback:            remote.ServiceChannelFeedback,
					ServiceChannelAdaptivePolicy:      remote.ServiceChannelAdaptivePolicy,
					ServiceChannelRemediationCommands: append([]client.FabricServiceChannelRemediationCommand{}, remote.ServiceChannelRemediationCommands...),
					MeshListener:                      remote.MeshListener,
					Routes:                            syntheticRoutesFromControlPlane(remote.Routes),
					Source:                            "control_plane",
					ConfigVersion:                     remote.ConfigVersion,
					PeerDirectoryVersion:              remote.PeerDirectoryVersion,
					PolicyVersion:                     remote.PolicyVersion,
					ProductionForwarding:              remote.ProductionForwarding,
				}
				return fromControlPlane, nil
			}
		}
	}

	debugEndpoints, err := parseMeshPeerEndpoints(cfg.MeshPeerEndpointsJSON)
	if err != nil {
		return loadedSyntheticMeshConfig{}, err
	}
	debugRoutes, err := parseMeshSyntheticRoutes(cfg.MeshSyntheticRoutesJSON)
	if err != nil {
		return loadedSyntheticMeshConfig{}, err
	}
	return loadedSyntheticMeshConfig{
		PeerEndpoints: debugEndpoints,
		Routes:        debugRoutes,
		Source:        "debug_json",
	}, nil
}
// controlPlaneMeshConfigAuthorityPayload is the JSON shape of the signed
// authority payload attached to a control-plane synthetic mesh config.
// verifyControlPlaneSyntheticMeshConfig decodes the payload into this type
// after the signature check and compares each field against the local identity
// and the received config.
type controlPlaneMeshConfigAuthorityPayload struct {
// SchemaVersion must equal "rap.cluster.mesh_config_snapshot.v1".
SchemaVersion string `json:"schema_version"`
// ClusterID must match both the local identity and the received config.
ClusterID string `json:"cluster_id"`
// LocalNodeID must match both the local identity and the received config.
LocalNodeID string `json:"local_node_id"`
// ConfigVersion must match the received config's version.
ConfigVersion string `json:"config_version"`
// ConfigSHA256 must match the hash computed over the received config.
ConfigSHA256 string `json:"config_sha256"`
// IssuedAt is decoded but not checked by the verifier.
IssuedAt time.Time `json:"issued_at"`
// ExpiresAt, when non-zero, must be in the future.
ExpiresAt time.Time `json:"expires_at"`
// ControlPlaneOnly and ProductionForwarding must differ from each other.
ControlPlaneOnly bool `json:"control_plane_only"`
// ProductionForwarding must match the received config's flag.
ProductionForwarding bool `json:"production_forwarding"`
}
// verifyControlPlaneSyntheticMeshConfig validates the cluster-authority
// signature and signed payload of a control-plane synthetic mesh config.
//
// It returns nil without further checks when the config neither requires
// authority nor carries any authority material. Otherwise every check below
// must pass; the first failing check determines the returned error.
func verifyControlPlaneSyntheticMeshConfig(remote client.SyntheticMeshConfig, identity state.Identity, cfg config.Config) error {
// Any one of the three authority fields being present opts the config into
// full verification, even when AuthorityRequired is false.
signaturePresent := remote.ClusterAuthority != nil || len(remote.AuthorityPayload) > 0 || remote.AuthoritySignature != nil
if !remote.AuthorityRequired && !signaturePresent {
return nil
}
// Structural checks: authority, payload and signature must all be present.
if remote.ClusterAuthority == nil {
return fmt.Errorf("control-plane synthetic mesh config requires cluster authority")
}
if remote.AuthoritySignature == nil || rawMessageEmpty(remote.AuthorityPayload) {
return fmt.Errorf("control-plane synthetic mesh config requires authority payload and signature")
}
// Authority descriptor checks: schema, cluster binding and key algorithm.
if remote.ClusterAuthority.SchemaVersion != authority.AuthoritySchemaVersion {
return fmt.Errorf("control-plane synthetic mesh config authority schema mismatch")
}
if remote.ClusterAuthority.ClusterID != identity.ClusterID || remote.ClusterAuthority.ClusterID != remote.ClusterID {
return fmt.Errorf("control-plane synthetic mesh config authority cluster mismatch")
}
if remote.ClusterAuthority.KeyAlgorithm != authority.AlgorithmEd25519 {
return fmt.Errorf("control-plane synthetic mesh config authority algorithm mismatch")
}
// The signature must name the same key the authority descriptor presents.
if remote.AuthoritySignature.KeyFingerprint != remote.ClusterAuthority.PublicKeyFingerprint {
return fmt.Errorf("control-plane synthetic mesh config signature fingerprint mismatch")
}
// Pinning: when the identity or, failing that, the config carries a pinned
// fingerprint or public key, the presented authority must match it. With no
// pinned value the corresponding check is skipped.
if pinned := firstNonEmpty(identity.ClusterAuthorityFingerprint, cfg.ClusterAuthorityFingerprint); pinned != "" && pinned != remote.ClusterAuthority.PublicKeyFingerprint {
return fmt.Errorf("control-plane synthetic mesh config authority fingerprint mismatch")
}
if pinned := firstNonEmpty(identity.ClusterAuthorityPublicKey, cfg.ClusterAuthorityPublicKey); pinned != "" && pinned != remote.ClusterAuthority.PublicKey {
return fmt.Errorf("control-plane synthetic mesh config authority public key mismatch")
}
// Verify the signature over the raw payload bytes before decoding them.
signature := authority.Signature{
SchemaVersion: remote.AuthoritySignature.SchemaVersion,
Algorithm: remote.AuthoritySignature.Algorithm,
KeyFingerprint: remote.AuthoritySignature.KeyFingerprint,
Signature: remote.AuthoritySignature.Signature,
}
if err := authority.VerifyRaw(remote.ClusterAuthority.PublicKey, remote.AuthorityPayload, signature); err != nil {
return fmt.Errorf("verify control-plane synthetic mesh config authority signature: %w", err)
}
// The payload is authentic from here on; check that it describes this
// cluster, this node and this exact config.
var payload controlPlaneMeshConfigAuthorityPayload
if err := json.Unmarshal(remote.AuthorityPayload, &payload); err != nil {
return fmt.Errorf("decode control-plane synthetic mesh config authority payload: %w", err)
}
if payload.SchemaVersion != "rap.cluster.mesh_config_snapshot.v1" {
return fmt.Errorf("control-plane synthetic mesh config authority payload schema mismatch")
}
if payload.ClusterID != identity.ClusterID || payload.ClusterID != remote.ClusterID {
return fmt.Errorf("control-plane synthetic mesh config authority payload cluster mismatch")
}
if payload.LocalNodeID != identity.NodeID || payload.LocalNodeID != remote.LocalNodeID {
return fmt.Errorf("control-plane synthetic mesh config authority payload node mismatch")
}
if payload.ConfigVersion != remote.ConfigVersion {
return fmt.Errorf("control-plane synthetic mesh config authority payload version mismatch")
}
// Exactly one of ControlPlaneOnly and ProductionForwarding must be set.
if payload.ControlPlaneOnly == payload.ProductionForwarding {
return fmt.Errorf("synthetic mesh config authority payload control-plane/production forwarding flags mismatch")
}
if payload.ProductionForwarding != remote.ProductionForwarding {
return fmt.Errorf("synthetic mesh config authority payload production forwarding mismatch")
}
// A zero ExpiresAt is accepted as "no expiry".
if !payload.ExpiresAt.IsZero() && !payload.ExpiresAt.After(time.Now().UTC()) {
return fmt.Errorf("control-plane synthetic mesh config authority payload expired")
}
// Finally bind the signed payload to the config content via its hash.
configHash, err := syntheticMeshConfigAuthorityHash(remote)
if err != nil {
return err
}
if payload.ConfigSHA256 != configHash {
return fmt.Errorf("control-plane synthetic mesh config authority payload hash mismatch")
}
return nil
}
// syntheticMeshConfigAuthorityHash computes the authority hash of a
// control-plane synthetic mesh config with its authority payload and signature
// removed.
//
// When the raw response body is available, it is decoded into a field map, the
// "authority_payload" and "authority_signature" keys are dropped, and the
// re-encoded map is hashed. Otherwise a copy of the decoded struct with Raw,
// AuthorityPayload and AuthoritySignature cleared is encoded and hashed.
func syntheticMeshConfigAuthorityHash(remote client.SyntheticMeshConfig) (string, error) {
	if rawMessageEmpty(remote.Raw) {
		stripped := remote
		stripped.Raw = nil
		stripped.AuthorityPayload = nil
		stripped.AuthoritySignature = nil
		encoded, err := json.Marshal(stripped)
		if err != nil {
			return "", fmt.Errorf("marshal control-plane synthetic mesh config for authority hash: %w", err)
		}
		digest, err := authority.HashRaw(encoded)
		if err != nil {
			return "", fmt.Errorf("hash control-plane synthetic mesh config authority payload: %w", err)
		}
		return digest, nil
	}

	var fields map[string]json.RawMessage
	if err := json.Unmarshal(remote.Raw, &fields); err != nil {
		return "", fmt.Errorf("decode raw control-plane synthetic mesh config for authority hash: %w", err)
	}
	for _, signedKey := range []string{"authority_payload", "authority_signature"} {
		delete(fields, signedKey)
	}
	encoded, err := json.Marshal(fields)
	if err != nil {
		return "", fmt.Errorf("marshal raw control-plane synthetic mesh config for authority hash: %w", err)
	}
	digest, err := authority.HashRaw(encoded)
	if err != nil {
		return "", fmt.Errorf("hash raw control-plane synthetic mesh config authority payload: %w", err)
	}
	return digest, nil
}
// rawMessageEmpty reports whether raw, ignoring surrounding whitespace, is
// empty, the empty object "{}" or the JSON literal "null".
func rawMessageEmpty(raw json.RawMessage) bool {
	switch strings.TrimSpace(string(raw)) {
	case "", "{}", "null":
		return true
	default:
		return false
	}
}
// firstNonEmpty returns the first argument that is non-blank after trimming
// whitespace, in its trimmed form. It returns "" when every argument is blank
// or none are given.
func firstNonEmpty(values ...string) string {
	for i := range values {
		if trimmed := strings.TrimSpace(values[i]); trimmed != "" {
			return trimmed
		}
	}
	return ""
}
// meshRendezvousLeasePosture summarises the state of the rendezvous leases
// held in the mesh state at one instant. It is produced by
// meshRendezvousLeasePostureForState and consumed by
// refreshRendezvousLeasesIfNeeded to decide whether a lease refresh is due.
type meshRendezvousLeasePosture struct {
// RefreshNeeded gates the refresh: no refresh is attempted when it is false.
RefreshNeeded bool
// Reason is copied into the recorded refresh attempt.
Reason string
// RefreshNeededCount is copied into the recorded refresh attempt.
// NOTE(review): presumably the number of leases needing any kind of
// refresh — confirm against meshRendezvousLeasePostureForState.
RefreshNeededCount int
// RenewalNeededCount counts usable leases classified as needing renewal.
RenewalNeededCount int
// ExpiredCount counts valid leases whose expiry is not after the observation time.
ExpiredCount int
// InvalidCount counts leases that fail the base validity check.
InvalidCount int
// StaleRelayCount counts usable leases classified as having a stale relay.
StaleRelayCount int
}
// refreshRendezvousLeasesIfNeeded reloads the synthetic mesh config when the
// current rendezvous lease posture asks for a refresh.
//
// It is a no-op when meshState or its peer cache is nil, when the posture
// needs no refresh, or when the previous attempt is younger than
// meshRendezvousLeaseRefreshBackoff. Every attempt is recorded in
// meshState.LastLeaseRefresh and the attempt/success/failure counters. A
// refresh is marked "unsupported" (and nil is returned) unless the state came
// from the control plane, api is non-nil and no scoped config file is set. A
// load error is recorded and returned.
func refreshRendezvousLeasesIfNeeded(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client, meshState *syntheticMeshState, observedAt time.Time) error {
	if meshState == nil || meshState.PeerCache == nil {
		return nil
	}
	observedAt = observedAt.UTC()
	posture := meshRendezvousLeasePostureForState(meshState, identity, observedAt)
	if !posture.RefreshNeeded {
		return nil
	}
	if last := meshState.LastLeaseRefresh; last != nil {
		if last.AttemptedAt.Add(meshRendezvousLeaseRefreshBackoff).After(observedAt) {
			return nil
		}
	}

	attempt := &meshRendezvousLeaseRefreshState{
		Status:             "attempted",
		Reason:             posture.Reason,
		AttemptedAt:        observedAt,
		PreviousLeaseCount: len(meshState.RendezvousLeases),
		RefreshNeededCount: posture.RefreshNeededCount,
		RenewalNeededCount: posture.RenewalNeededCount,
		ExpiredCount:       posture.ExpiredCount,
		StaleRelayCount:    posture.StaleRelayCount,
		ConfigVersion:      meshState.ConfigVersion,
	}
	meshState.LastLeaseRefresh = attempt
	meshState.LeaseRefreshAttempts++

	controlPlaneBacked := api != nil && meshState.Source == "control_plane" && cfg.MeshSyntheticConfigPath == ""
	if !controlPlaneBacked {
		attempt.Status = "unsupported"
		attempt.Error = "control_plane_synthetic_config_required"
		attempt.CompletedAt = observedAt
		meshState.LeaseRefreshFailures++
		return nil
	}

	reloaded, loadErr := loadSyntheticMeshConfig(ctx, cfg, identity, api)
	finishedAt := time.Now().UTC()
	attempt.CompletedAt = finishedAt
	if loadErr != nil {
		attempt.Status = "failed"
		attempt.Error = loadErr.Error()
		meshState.LeaseRefreshFailures++
		return loadErr
	}

	self := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
	applyRefreshedSyntheticMeshConfig(ctx, cfg, identity, meshState, reloaded, self, cfg.MeshRegion, finishedAt)
	attempt.Status = "succeeded"
	attempt.RefreshedLeaseCount = len(reloaded.RendezvousLeases)
	attempt.ConfigVersion = reloaded.ConfigVersion
	meshState.LeaseRefreshSuccesses++
	log.Printf(
		"mesh rendezvous lease refresh succeeded: reason=%s previous_leases=%d refreshed_leases=%d config_version=%s",
		attempt.Reason,
		attempt.PreviousLeaseCount,
		attempt.RefreshedLeaseCount,
		attempt.ConfigVersion,
	)
	return nil
}
// refreshSyntheticMeshConfigIfDue reloads and applies the synthetic mesh
// config once meshSyntheticConfigRefreshInterval has elapsed since the last
// refresh.
//
// It is a no-op when meshState or its peer cache is nil, or when the interval
// has not elapsed. Without an api client, or with a scoped config file set, it
// only advances LastConfigRefreshAt. With an incomplete identity it returns
// without touching the state. A load error advances LastConfigRefreshAt and is
// returned.
//
// NOTE(review): loadSyntheticMeshConfig logs control-plane fetch errors and
// falls back to debug JSON instead of returning them, so a failed fetch can be
// applied here as a debug_json config — confirm this is intended for periodic
// refreshes.
func refreshSyntheticMeshConfigIfDue(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client, meshState *syntheticMeshState, observedAt time.Time) error {
	if meshState == nil || meshState.PeerCache == nil {
		return nil
	}
	observedAt = observedAt.UTC()
	if last := meshState.LastConfigRefreshAt; !last.IsZero() {
		if last.Add(meshSyntheticConfigRefreshInterval).After(observedAt) {
			return nil
		}
	}
	if api == nil || cfg.MeshSyntheticConfigPath != "" {
		meshState.LastConfigRefreshAt = observedAt
		return nil
	}
	if identity.NodeID == "" || identity.ClusterID == "" {
		return nil
	}

	versionBefore := meshState.ConfigVersion
	reloaded, loadErr := loadSyntheticMeshConfig(ctx, cfg, identity, api)
	finishedAt := time.Now().UTC()
	if loadErr != nil {
		meshState.LastConfigRefreshAt = observedAt
		return loadErr
	}

	self := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
	applyRefreshedSyntheticMeshConfig(ctx, cfg, identity, meshState, reloaded, self, cfg.MeshRegion, finishedAt)
	log.Printf(
		"mesh synthetic config refreshed: previous_config_version=%s refreshed_config_version=%s route_health_routes=%d",
		versionBefore,
		reloaded.ConfigVersion,
		len(meshState.RouteHealthRoutes),
	)
	return nil
}
// refreshSyntheticMeshConfigForRouteHealthFeedback reloads and applies the
// synthetic mesh config in response to a route-health feedback trigger.
//
// It is a no-op when meshState or its peer cache is nil or the trigger names
// no route. Triggers arriving within meshRouteHealthFeedbackRefreshBackoff of
// the previous attempt are counted as suppressed and dropped. Every other
// trigger is recorded in meshState.LastRouteHealthRefresh and the
// attempt/success/failure counters. The attempt is marked "unsupported" (and
// nil is returned) when the state is not control-plane backed or the identity
// is incomplete. A load error is recorded and returned.
func refreshSyntheticMeshConfigForRouteHealthFeedback(ctx context.Context, cfg config.Config, identity state.Identity, api *client.Client, meshState *syntheticMeshState, trigger meshRouteHealthFeedbackTrigger, observedAt time.Time) error {
	if meshState == nil || meshState.PeerCache == nil {
		return nil
	}
	if trigger.RouteID == "" {
		return nil
	}
	observedAt = observedAt.UTC()
	if last := meshState.LastRouteHealthRefresh; last != nil {
		if last.AttemptedAt.Add(meshRouteHealthFeedbackRefreshBackoff).After(observedAt) {
			meshState.RouteHealthRefreshSuppressed++
			return nil
		}
	}

	attempt := &meshRouteHealthFeedbackRefreshState{
		Status:                        "attempted",
		Reason:                        trigger.Reason,
		AttemptedAt:                   observedAt,
		RouteID:                       trigger.RouteID,
		PeerNodeID:                    trigger.PeerNodeID,
		SelectedRelayID:               trigger.SelectedRelayID,
		LinkStatus:                    trigger.LinkStatus,
		FailureReason:                 trigger.FailureReason,
		DriftDetected:                 trigger.DriftDetected,
		PreviousConfigVersion:         meshState.ConfigVersion,
		PreviousRouteHealthRouteCount: len(meshState.RouteHealthRoutes),
	}
	meshState.LastRouteHealthRefresh = attempt
	meshState.RouteHealthRefreshAttempts++

	// markUnsupported closes the attempt without contacting the control plane.
	markUnsupported := func(reason string) {
		attempt.Status = "unsupported"
		attempt.Error = reason
		attempt.CompletedAt = observedAt
		meshState.RouteHealthRefreshFailures++
	}
	if api == nil || meshState.Source != "control_plane" || cfg.MeshSyntheticConfigPath != "" {
		markUnsupported("control_plane_synthetic_config_required")
		return nil
	}
	if identity.NodeID == "" || identity.ClusterID == "" {
		markUnsupported("approved_identity_required")
		return nil
	}

	reloaded, loadErr := loadSyntheticMeshConfig(ctx, cfg, identity, api)
	finishedAt := time.Now().UTC()
	attempt.CompletedAt = finishedAt
	if loadErr != nil {
		attempt.Status = "failed"
		attempt.Error = loadErr.Error()
		meshState.RouteHealthRefreshFailures++
		return loadErr
	}

	self := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
	applyRefreshedSyntheticMeshConfig(ctx, cfg, identity, meshState, reloaded, self, cfg.MeshRegion, finishedAt)
	attempt.Status = "succeeded"
	attempt.RefreshedConfigVersion = reloaded.ConfigVersion
	attempt.RefreshedRouteHealthRouteCount = len(meshState.RouteHealthRoutes)
	meshState.RouteHealthRefreshSuccesses++
	log.Printf(
		"mesh route-health feedback refresh succeeded: reason=%s route_id=%s previous_config_version=%s refreshed_config_version=%s route_health_routes=%d",
		attempt.Reason,
		attempt.RouteID,
		attempt.PreviousConfigVersion,
		attempt.RefreshedConfigVersion,
		attempt.RefreshedRouteHealthRouteCount,
	)
	return nil
}
// applyRefreshedSyntheticMeshConfig installs a freshly loaded synthetic mesh
// config into meshState in place.
//
// It rebuilds the peer cache and recovery/connection-intent plans, updates (or
// lazily creates) the connection tracker, connection manager, route generation
// tracker, VPN fabric ingress and listener handler, restarts the listener if
// its config changed, and finally overwrites the bookkeeping fields on
// meshState. meshState must be non-nil; observedAt is used as the timestamp
// for all derived plans and as the new LastConfigRefreshAt.
func applyRefreshedSyntheticMeshConfig(ctx context.Context, cfg config.Config, identity state.Identity, meshState *syntheticMeshState, loadedConfig loadedSyntheticMeshConfig, local mesh.PeerIdentity, preferredRegion string, observedAt time.Time) {
routeHealthRoutes := routeHealthRoutesFromPathDecisions(loadedConfig.Routes, loadedConfig.RoutePathDecisions)
// A new peer cache is built from scratch for every refresh.
peerCache := mesh.NewPeerCache(mesh.PeerCacheConfig{
Local: local,
PeerEndpoints: loadedConfig.PeerEndpoints,
PeerEndpointCandidates: loadedConfig.PeerEndpointCandidates,
PeerDirectory: loadedConfig.PeerDirectory,
RecoverySeeds: loadedConfig.RecoverySeeds,
RendezvousLeases: loadedConfig.RendezvousLeases,
Routes: loadedConfig.Routes,
WarmPeerLimit: mesh.DefaultWarmPeerLimit,
PreferredRegion: preferredRegion,
Now: observedAt,
})
// The connection tracker is kept across refreshes and only created when absent.
if meshState.PeerConnections == nil {
meshState.PeerConnections = mesh.NewPeerConnectionTracker(peerCache.Snapshot(), observedAt)
}
peerConnectionSnapshot := meshState.PeerConnections.Snapshot()
peerRecoveryPlan := mesh.PlanPeerRecovery(mesh.PeerRecoveryPlanConfig{
PeerCache: peerCache.Snapshot(),
Connections: peerConnectionSnapshot,
TargetReadyPeers: mesh.DefaultStablePeerTarget,
MaxProbeCandidates: mesh.DefaultRecoveryProbeLimit,
Now: observedAt,
})
peerConnectionIntentPlan := mesh.PlanPeerConnectionIntents(mesh.PeerConnectionIntentPlanConfig{
PeerCache: peerCache.Snapshot(),
RecoveryPlan: peerRecoveryPlan,
RendezvousLeases: loadedConfig.RendezvousLeases,
Now: observedAt,
})
// An existing connection manager is re-pointed at the new cache and leases
// rather than replaced.
if meshState.PeerConnectionManager == nil {
meshState.PeerConnectionManager = mesh.NewPeerConnectionManager(mesh.PeerConnectionManagerConfig{
Local: local,
PeerCache: peerCache,
Tracker: meshState.PeerConnections,
RendezvousLeases: loadedConfig.RendezvousLeases,
})
} else {
meshState.PeerConnectionManager.UpdatePeerConfig(peerCache, loadedConfig.RendezvousLeases)
}
if meshState.Runtime != nil {
meshState.Runtime.UpdateConfig(loadedConfig.Routes, mesh.NewHTTPPeerTransport(loadedConfig.PeerEndpoints))
meshState.Runtime.UpdateRouteHealthConfig(routeHealthRoutes)
}
if meshState.RouteGenerationTracker == nil {
meshState.RouteGenerationTracker = newMeshRouteGenerationTracker(loadedConfig.RoutePathDecisions, observedAt)
} else {
meshState.RouteGenerationTracker.Apply(loadedConfig.RoutePathDecisions, observedAt)
}
// Production forwarding is on when either the local config or the loaded
// config enables it; the forward transport is dropped when it is off.
productionForwardingEnabled := cfg.MeshProductionForwardingEnabled || loadedConfig.ProductionForwarding
meshState.ProductionForwardingEnabled = productionForwardingEnabled
if productionForwardingEnabled {
meshState.ProductionForwardTransport = mesh.NewHTTPProductionForwardTransport(loadedConfig.PeerEndpoints)
} else {
meshState.ProductionForwardTransport = nil
}
// The ingress is built after the forward transport is set because
// newVPNFabricIngress reads meshState.ProductionForwardTransport.
vpnFabricIngress := newVPNFabricIngress(meshState, identity, loadedConfig.Routes, loadedConfig.RoutePathDecisions, loadedConfig.ServiceChannelRemediationCommands, loadedConfig.ServiceChannelFeedback, loadedConfig.ServiceChannelAdaptivePolicy, loadedConfig.ConfigVersion, meshState.VPNGateway)
meshState.VPNFabricIngress = vpnFabricIngress
// Ensured non-nil here because the service channel logger below calls
// Observe on it unconditionally.
if meshState.ServiceChannelAccessStats == nil {
meshState.ServiceChannelAccessStats = newFabricServiceChannelAccessStats()
}
if meshState.RemoteWorkspaceFrameSink == nil {
meshState.RemoteWorkspaceFrameSink = mesh.NewRemoteWorkspaceFrameProbeSink()
}
// Build the HTTP handler that reflects the refreshed configuration.
nextListenerHandler := mesh.Server{
Local: local,
SyntheticRuntime: meshState.Runtime,
ProductionForwardingEnabled: productionForwardingEnabled,
// Delivery is left nil when there is no VPN fabric inbox to deliver to.
ProductionEnvelopeDelivery: func() mesh.ProductionEnvelopeDelivery {
if meshState.VPNFabricInbox == nil {
return nil
}
return meshState.VPNFabricInbox.DeliverProductionEnvelope
}(),
ProductionForwardTransport: meshState.ProductionForwardTransport,
// Forward events are emitted as single-line JSON log records.
ProductionForwardLogger: func(entry mesh.ProductionForwardLogEntry) {
payload, err := json.Marshal(entry)
if err != nil {
log.Printf("mesh production forward event marshal failed: %v", err)
return
}
log.Printf("mesh_production_forward_event=%s", string(payload))
},
// Access events are recorded in the stats before being logged as JSON.
FabricServiceChannelLogger: func(entry mesh.FabricServiceChannelAccessLogEntry) {
meshState.ServiceChannelAccessStats.Observe(entry)
payload, err := json.Marshal(entry)
if err != nil {
log.Printf("fabric service channel access event marshal failed: %v", err)
return
}
log.Printf("fabric_service_channel_access_event=%s", string(payload))
},
RemoteWorkspaceFrameSink: meshState.RemoteWorkspaceFrameSink,
ProductionRoutes: loadedConfig.Routes,
// NOTE(review): vpnFabricIngress is a nil pointer when there is no VPN
// fabric inbox; if VPNPacketIngress is an interface field this stores a
// typed nil — confirm the server handles that safely.
VPNPacketIngress: vpnFabricIngress,
BackendProxyBaseURL: cfg.BackendURL,
ClusterAuthorityPublicKey: firstNonEmpty(identity.ClusterAuthorityPublicKey, cfg.ClusterAuthorityPublicKey),
}.Handler()
// Swap the handler behind the existing dynamic wrapper when there is one.
if meshState.ListenerHandler == nil {
meshState.ListenerHandler = newDynamicHTTPHandler(nextListenerHandler)
} else {
meshState.ListenerHandler.Update(nextListenerHandler)
}
// Must run before the bookkeeping below: it compares the new listener key
// against meshState.ListenerConfigKey and restarts the listener on change.
applyMeshListenerConfigIfChanged(ctx, cfg, identity, meshState, loadedConfig, observedAt)
// Publish the refreshed configuration on the shared state.
meshState.Routes = loadedConfig.Routes
meshState.RouteHealthRoutes = routeHealthRoutes
meshState.Source = loadedConfig.Source
meshState.PeerCache = peerCache
meshState.RendezvousLeases = loadedConfig.RendezvousLeases
meshState.RoutePathDecisions = loadedConfig.RoutePathDecisions
meshState.ServiceChannelFeedback = loadedConfig.ServiceChannelFeedback
// Copied so later changes to loadedConfig's slice do not alias the state.
meshState.ServiceChannelRemediationCommands = append([]client.FabricServiceChannelRemediationCommand{}, loadedConfig.ServiceChannelRemediationCommands...)
meshState.ConfigVersion = loadedConfig.ConfigVersion
meshState.PeerDirectoryVersion = loadedConfig.PeerDirectoryVersion
meshState.PolicyVersion = loadedConfig.PolicyVersion
// A successful apply clears any previously recorded load error.
meshState.ConfigLoadError = ""
meshState.LastConfigRefreshAt = observedAt
meshState.LastPeerRecoveryPlan = &peerRecoveryPlan
meshState.LastPeerConnectionIntent = &peerConnectionIntentPlan
}
// applyMeshListenerConfigIfChanged restarts the synthetic mesh HTTP listener
// when the listener configuration derived from loadedConfig differs from the
// one recorded on meshState.
//
// It does nothing when meshState is nil, when meshState.ListenerHandler has
// not been installed, or when the derived configuration key equals
// meshState.ListenerConfigKey. Otherwise the current listener (if any) is
// stopped through meshState.StopListener, a new server is started, and the
// resulting report, key, runtime config and stop function are stored on
// meshState. The applied configuration is logged together with observedAt.
func applyMeshListenerConfigIfChanged(ctx context.Context, base config.Config, identity state.Identity, meshState *syntheticMeshState, loadedConfig loadedSyntheticMeshConfig, observedAt time.Time) {
	if meshState == nil {
		return
	}
	if meshState.ListenerHandler == nil {
		return
	}

	runtimeCfg := meshListenerRuntimeConfig(base, loadedConfig.MeshListener)
	configKey := meshListenerConfigKey(runtimeCfg)
	if meshState.ListenerConfigKey == configKey {
		return
	}

	// The previous listener is stopped before its replacement is started.
	if stopPrevious := meshState.StopListener; stopPrevious != nil {
		stopPrevious()
	}

	gateEnabled, runtimeEnabled := productionForwardingLogState(runtimeCfg, loadedConfig.ProductionForwarding)
	peerEndpointCount := len(loadedConfig.PeerEndpoints)
	routeCount := len(loadedConfig.Routes)
	report, stop := startSyntheticMeshHTTPServer(ctx, runtimeCfg, identity, meshState.ListenerHandler, peerEndpointCount, routeCount, gateEnabled, runtimeEnabled)

	meshState.ListenerReport = report
	meshState.ListenerConfigKey = configKey
	meshState.ListenerRuntimeConfig = runtimeCfg
	meshState.StopListener = stop

	log.Printf(
		"mesh listener config applied: mode=%s listen_addr=%s status=%s config_version=%s observed_at=%s",
		runtimeCfg.MeshListenPortMode,
		runtimeCfg.MeshListenAddr,
		report.Status,
		loadedConfig.ConfigVersion,
		observedAt.Format(time.RFC3339Nano),
	)
}
// meshRendezvousLeasePostureForState summarizes the rendezvous leases held in
// meshState as of observedAt.
//
// Every lease is classified into at most one counter, checked in this order:
// invalid, expired, renewal needed, stale relay. Any lease that lands in one
// of those classes also increments RefreshNeededCount, and the first such
// lease determines posture.Reason. RefreshNeeded is only set when at least
// one lease needs a refresh and identity.NodeID is non-empty. A nil meshState
// yields the zero posture (Reason left empty); otherwise Reason defaults to
// "none".
//
// NOTE(review): a lease that both needs renewal and has a stale relay is
// counted under RenewalNeededCount here, while
// meshRendezvousLeaseRefreshReason reports "stale_relay" for it — confirm the
// differing precedence is intended.
func meshRendezvousLeasePostureForState(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) meshRendezvousLeasePosture {
	var posture meshRendezvousLeasePosture
	if meshState == nil {
		return posture
	}

	connections := meshRendezvousConnectionsByPeer(meshState)
	for _, lease := range meshState.RendezvousLeases {
		valid := meshRendezvousLeaseBaseValid(lease)
		expired := valid && !lease.ExpiresAt.After(observedAt)
		usable := valid && !expired
		renewalNeeded := meshRendezvousLeaseRenewalNeeded(lease, observedAt, usable)
		staleRelay := usable && meshRendezvousLeaseStaleRelay(lease, connections[lease.PeerNodeID])

		// A lease with no findings contributes to no counter.
		if valid && !expired && !renewalNeeded && !staleRelay {
			continue
		}

		if !valid {
			posture.InvalidCount++
		} else if expired {
			posture.ExpiredCount++
		} else if renewalNeeded {
			posture.RenewalNeededCount++
		} else {
			// Only the stale-relay finding is left at this point.
			posture.StaleRelayCount++
		}

		posture.RefreshNeededCount++
		if posture.Reason == "" {
			posture.Reason = meshRendezvousLeaseRefreshReason(valid, expired, renewalNeeded, staleRelay)
		}
	}

	posture.RefreshNeeded = identity.NodeID != "" && posture.RefreshNeededCount > 0
	if posture.Reason == "" {
		posture.Reason = "none"
	}
	return posture
}
// meshRendezvousLeaseRefreshReason maps lease findings to a single refresh
// reason string. The first matching finding wins, in this order: an invalid
// lease, an expired lease, a stale relay, a needed renewal. When none of the
// findings apply the result is "none".
func meshRendezvousLeaseRefreshReason(valid bool, expired bool, renewalNeeded bool, staleRelay bool) string {
	if !valid {
		return "invalid_lease"
	}
	if expired {
		return "expired_lease"
	}
	if staleRelay {
		return "stale_relay"
	}
	if renewalNeeded {
		return "renewal_needed"
	}
	return "none"
}
// syntheticRoutesFromControlPlane converts control-plane route configs into
// mesh.SyntheticRoute values, copying each field one-to-one. The result has
// the same length and order as routes and is never nil. Slice-typed fields
// such as Hops and AllowedChannels are assigned directly, not cloned.
func syntheticRoutesFromControlPlane(routes []client.SyntheticMeshRouteConfig) []mesh.SyntheticRoute {
	converted := make([]mesh.SyntheticRoute, len(routes))
	for i := range routes {
		src := &routes[i]
		converted[i] = mesh.SyntheticRoute{
			RouteID:              src.RouteID,
			ClusterID:            src.ClusterID,
			SourceNodeID:         src.SourceNodeID,
			DestinationNodeID:    src.DestinationNodeID,
			Hops:                 src.Hops,
			AllowedChannels:      src.AllowedChannels,
			ExpiresAt:            src.ExpiresAt,
			MaxTTL:               src.MaxTTL,
			MaxHops:              src.MaxHops,
			RouteVersion:         src.RouteVersion,
			PolicyVersion:        src.PolicyVersion,
			PeerDirectoryVersion: src.PeerDirectoryVersion,
		}
	}
	return converted
}
// routeHealthRoutesFromPathDecisions builds the route set used for synthetic
// route-health probing.
//
// It starts from clones of every route in routes that has a non-empty
// RouteID, then overlays the decisions in report. A decision is applied only
// when it has a non-blank RouteID, is marked ControlPlaneOnly, is not marked
// ProductionForwarding, and its cleaned EffectiveHops contain at least two
// nodes. If no route with the decision's RouteID exists yet, a new one is
// appended that allows only the fabric-control and route-control synthetic
// channels. The applied decision replaces the hops, source, destination and
// cluster of the route, optionally its expiry and route version, and raises
// MaxTTL / MaxHops so the new path fits.
//
// NOTE(review): defaultString is defined elsewhere; presumably it returns its
// first argument unless that is empty — confirm.
func routeHealthRoutesFromPathDecisions(routes []mesh.SyntheticRoute, report *client.RoutePathDecisionReport) []mesh.SyntheticRoute {
	result := make([]mesh.SyntheticRoute, 0, len(routes))
	positionByRouteID := map[string]int{}
	for _, base := range routes {
		if base.RouteID == "" {
			continue
		}
		positionByRouteID[base.RouteID] = len(result)
		result = append(result, cloneSyntheticRoute(base))
	}
	if report == nil {
		return result
	}

	for _, decision := range report.Decisions {
		if strings.TrimSpace(decision.RouteID) == "" {
			continue
		}
		if decision.ProductionForwarding || !decision.ControlPlaneOnly {
			continue
		}
		hops := cleanNodePath(decision.EffectiveHops)
		if len(hops) < 2 {
			continue
		}

		// Look up the route to overlay, inserting a fresh one when unknown.
		position, known := positionByRouteID[decision.RouteID]
		if !known {
			position = len(result)
			positionByRouteID[decision.RouteID] = position
			result = append(result, mesh.SyntheticRoute{
				RouteID:         decision.RouteID,
				ClusterID:       decision.ClusterID,
				AllowedChannels: []string{mesh.SyntheticChannelFabricControl, mesh.SyntheticChannelRouteControl},
			})
		}

		updated := result[position]
		updated.Hops = hops
		updated.SourceNodeID = defaultString(decision.SourceNodeID, hops[0])
		updated.DestinationNodeID = defaultString(decision.DestinationNodeID, hops[len(hops)-1])
		updated.ClusterID = defaultString(decision.ClusterID, updated.ClusterID)
		if !decision.ExpiresAt.IsZero() {
			updated.ExpiresAt = decision.ExpiresAt
		}
		if generation := strings.TrimSpace(decision.Generation); generation != "" {
			updated.RouteVersion = generation
		}
		// Limits are only ever raised, never lowered, to fit the new path.
		if updated.MaxTTL < len(hops) {
			updated.MaxTTL = len(hops)
		}
		if updated.MaxHops < len(hops)-1 {
			updated.MaxHops = len(hops) - 1
		}
		result[position] = updated
	}
	return result
}
// cloneSyntheticRoute returns a copy of route whose Hops and AllowedChannels
// slices are freshly allocated, so changing them on the copy does not affect
// the original. All other fields are copied by plain struct assignment.
func cloneSyntheticRoute(route mesh.SyntheticRoute) mesh.SyntheticRoute {
	clone := route
	clone.Hops = append([]string{}, route.Hops...)
	clone.AllowedChannels = append([]string{}, route.AllowedChannels...)
	return clone
}
// cleanNodePath returns the entries of items with surrounding whitespace
// trimmed, dropping entries that are empty after trimming. Order is kept and
// the result is always a non-nil slice, even for nil input.
func cleanNodePath(items []string) []string {
	cleaned := make([]string, 0, len(items))
	for i := range items {
		trimmed := strings.TrimSpace(items[i])
		if trimmed == "" {
			continue
		}
		cleaned = append(cleaned, trimmed)
	}
	return cleaned
}
// peerEndpointCandidatesFromControlPlane converts the control-plane endpoint
// candidates, keyed by node ID, into their mesh equivalents with a one-to-one
// field copy. Nodes whose candidate list is empty get no key in the result.
// The returned map is never nil.
func peerEndpointCandidatesFromControlPlane(candidates map[string][]client.PeerEndpointCandidate) map[string][]mesh.PeerEndpointCandidate {
	converted := make(map[string][]mesh.PeerEndpointCandidate, len(candidates))
	for nodeID, items := range candidates {
		if len(items) == 0 {
			// No entry is created for a node without candidates.
			continue
		}
		list := make([]mesh.PeerEndpointCandidate, 0, len(items))
		for i := range items {
			src := &items[i]
			list = append(list, mesh.PeerEndpointCandidate{
				EndpointID:       src.EndpointID,
				NodeID:           src.NodeID,
				Transport:        src.Transport,
				Address:          src.Address,
				AddressFamily:    src.AddressFamily,
				Reachability:     src.Reachability,
				NATType:          src.NATType,
				ConnectivityMode: src.ConnectivityMode,
				Region:           src.Region,
				Priority:         src.Priority,
				PolicyTags:       src.PolicyTags,
				LastVerifiedAt:   src.LastVerifiedAt,
				Metadata:         src.Metadata,
			})
		}
		converted[nodeID] = list
	}
	return converted
}
// peerDirectoryFromControlPlane converts control-plane peer directory entries
// into mesh.PeerDirectoryEntry values field by field. Length and order match
// the input and the result is never nil.
func peerDirectoryFromControlPlane(entries []client.PeerDirectoryEntry) []mesh.PeerDirectoryEntry {
	directory := make([]mesh.PeerDirectoryEntry, len(entries))
	for i := range entries {
		src := &entries[i]
		directory[i] = mesh.PeerDirectoryEntry{
			NodeID:            src.NodeID,
			RouteIDs:          src.RouteIDs,
			EndpointCount:     src.EndpointCount,
			CandidateCount:    src.CandidateCount,
			ConnectivityModes: src.ConnectivityModes,
			RecoverySeed:      src.RecoverySeed,
		}
	}
	return directory
}
// recoverySeedsFromControlPlane converts control-plane peer recovery seeds
// into mesh.PeerRecoverySeed values field by field. Length and order match
// the input and the result is never nil.
func recoverySeedsFromControlPlane(seeds []client.PeerRecoverySeed) []mesh.PeerRecoverySeed {
	converted := make([]mesh.PeerRecoverySeed, len(seeds))
	for i := range seeds {
		src := &seeds[i]
		converted[i] = mesh.PeerRecoverySeed{
			NodeID:           src.NodeID,
			Endpoint:         src.Endpoint,
			Transport:        src.Transport,
			ConnectivityMode: src.ConnectivityMode,
			Region:           src.Region,
			Priority:         src.Priority,
			LastVerifiedAt:   src.LastVerifiedAt,
			Metadata:         src.Metadata,
		}
	}
	return converted
}
// rendezvousLeasesFromControlPlane converts control-plane rendezvous leases
// into mesh.PeerRendezvousLease values field by field. Length and order match
// the input and the result is never nil.
func rendezvousLeasesFromControlPlane(leases []client.PeerRendezvousLease) []mesh.PeerRendezvousLease {
	converted := make([]mesh.PeerRendezvousLease, len(leases))
	for i := range leases {
		src := &leases[i]
		converted[i] = mesh.PeerRendezvousLease{
			LeaseID:          src.LeaseID,
			PeerNodeID:       src.PeerNodeID,
			RelayNodeID:      src.RelayNodeID,
			RelayEndpoint:    src.RelayEndpoint,
			Transport:        src.Transport,
			ConnectivityMode: src.ConnectivityMode,
			RouteIDs:         src.RouteIDs,
			AllowedChannels:  src.AllowedChannels,
			Priority:         src.Priority,
			ControlPlaneOnly: src.ControlPlaneOnly,
			IssuedAt:         src.IssuedAt,
			ExpiresAt:        src.ExpiresAt,
			Reason:           src.Reason,
			Metadata:         src.Metadata,
		}
	}
	return converted
}
// parseMeshPeerEndpoints decodes raw as a JSON object mapping a peer key to
// its endpoint string.
//
// Empty or whitespace-only input is treated as "not configured" and yields an
// empty map. On success the returned map is always non-nil, including when
// raw is the JSON literal null (which json.Unmarshal accepts without error
// while leaving the map nil), so callers can write to it safely. Malformed
// JSON returns a nil map and a wrapped error.
func parseMeshPeerEndpoints(raw string) (map[string]string, error) {
	if strings.TrimSpace(raw) == "" {
		return map[string]string{}, nil
	}
	var peerEndpoints map[string]string
	if err := json.Unmarshal([]byte(raw), &peerEndpoints); err != nil {
		return nil, fmt.Errorf("parse synthetic mesh peer endpoints: %w", err)
	}
	if peerEndpoints == nil {
		// JSON null decodes cleanly but leaves the map nil; keep the same
		// non-nil contract as the empty-input path.
		peerEndpoints = map[string]string{}
	}
	return peerEndpoints, nil
}
// parseMeshSyntheticRoutes decodes raw as a JSON array of
// mesh.SyntheticRoute. An empty string yields (nil, nil); malformed JSON
// yields a nil slice and a wrapped error.
func parseMeshSyntheticRoutes(raw string) ([]mesh.SyntheticRoute, error) {
	if len(raw) == 0 {
		return nil, nil
	}
	routes := []mesh.SyntheticRoute(nil)
	decodeErr := json.Unmarshal([]byte(raw), &routes)
	if decodeErr != nil {
		return nil, fmt.Errorf("parse synthetic mesh routes: %w", decodeErr)
	}
	return routes, nil
}
// reportSyntheticRouteHealth probes every route that originates at this node
// and reports the outcome of each probe to the control plane as a mesh link
// observation.
//
// It returns nil without doing anything when meshState, meshState.Runtime or
// api is nil. The probed set is meshState.RouteHealthRoutes, falling back to
// meshState.Routes when that is empty; routes whose SourceNodeID differs from
// identity.NodeID are skipped. Each probe is sent on the fabric-control
// synthetic channel with a 5 second timeout.
//
// A failed probe is reported as "unreachable" with the error text stored in
// metadata["failure_reason"]; a successful probe is reported as "reachable"
// with latency and a quality score. The first error returned by
// api.ReportMeshLink aborts the loop and is returned. If any observation
// produced a feedback trigger, only the first one is kept and, after all
// routes were processed, a mesh config refresh is requested with it; the
// result of that refresh is returned.
func reportSyntheticRouteHealth(ctx context.Context, cfg config.Config, api *client.Client, identity state.Identity, meshState *syntheticMeshState) error {
if meshState == nil || meshState.Runtime == nil || api == nil {
return nil
}
routes := meshState.RouteHealthRoutes
if len(routes) == 0 {
routes = meshState.Routes
}
decisionsByRoute := routePathDecisionsByRoute(meshState.RoutePathDecisions)
// Holds the first feedback trigger seen during this pass, if any.
var refreshTrigger *meshRouteHealthFeedbackTrigger
for _, route := range routes {
if route.SourceNodeID != identity.NodeID {
continue
}
// decisionApplied is false when no path decision exists for this route.
decision, decisionApplied := decisionsByRoute[route.RouteID]
probeCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
result, err := meshState.Runtime.SendRouteHealthProbe(probeCtx, route.RouteID, mesh.SyntheticChannelFabricControl, "route-health-"+route.RouteID)
// Release the probe timeout right away instead of deferring inside the loop.
cancel()
if err != nil {
// No ack was received, so no observed ack path is passed.
metadata := routeHealthObservationMetadata(meshState, route, decision, decisionApplied, nil)
metadata["failure_reason"] = err.Error()
if reportErr := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{
SourceNodeID: identity.NodeID,
TargetNodeID: route.DestinationNodeID,
LinkStatus: "unreachable",
Metadata: metadata,
}); reportErr != nil {
return reportErr
}
if trigger, ok := routeHealthFeedbackTriggerFromObservation(route, decision, decisionApplied, "unreachable", metadata, time.Now().UTC()); ok && refreshTrigger == nil {
refreshTrigger = &trigger
}
continue
}
latency := int(result.Observation.LastLatencyMs)
qualityScore := syntheticQualityScore(latency)
ackPath := routeHealthAckPath(result.Ack)
metadata := routeHealthObservationMetadata(meshState, route, decision, decisionApplied, ackPath)
metadata["selected_route_id"] = result.SelectedRouteID
metadata["fallback_used"] = result.FallbackUsed
metadata["route_version"] = result.Observation.RouteVersion
metadata["policy_version"] = result.Observation.PolicyVersion
metadata["peer_directory_version"] = result.Observation.PeerDirectoryVersion
metadata["synthetic_message_type"] = result.Ack.MessageType
if err := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{
SourceNodeID: identity.NodeID,
TargetNodeID: route.DestinationNodeID,
LinkStatus: "reachable",
LatencyMs: &latency,
QualityScore: &qualityScore,
Metadata: metadata,
}); err != nil {
return err
}
// A reachable route can still raise a trigger, e.g. when the metadata
// records path drift.
if trigger, ok := routeHealthFeedbackTriggerFromObservation(route, decision, decisionApplied, "reachable", metadata, time.Now().UTC()); ok && refreshTrigger == nil {
refreshTrigger = &trigger
}
}
if refreshTrigger != nil {
return refreshSyntheticMeshConfigForRouteHealthFeedback(ctx, cfg, identity, api, meshState, *refreshTrigger, time.Now().UTC())
}
return nil
}
// routePathDecisionsByRoute indexes the decisions in report by RouteID.
//
// Decisions with a blank RouteID are ignored. When several decisions share a
// RouteID the first one wins, except that a decision whose DecisionSource is
// "stale_relay_replacement" replaces a stored decision with any other source.
// The returned map is never nil, also for a nil report.
func routePathDecisionsByRoute(report *client.RoutePathDecisionReport) map[string]client.RoutePathDecision {
	const replacementSource = "stale_relay_replacement"

	byRoute := map[string]client.RoutePathDecision{}
	if report == nil {
		return byRoute
	}
	for _, candidate := range report.Decisions {
		if strings.TrimSpace(candidate.RouteID) == "" {
			continue
		}
		current, seen := byRoute[candidate.RouteID]
		switch {
		case !seen:
			byRoute[candidate.RouteID] = candidate
		case current.DecisionSource != replacementSource && candidate.DecisionSource == replacementSource:
			// A relay replacement supersedes an ordinary decision.
			byRoute[candidate.RouteID] = candidate
		}
	}
	return byRoute
}
// routeHealthObservationMetadata builds the metadata map attached to a
// synthetic route-health link observation.
//
// The map always carries the fixed stage / forwarding flags, the route ID,
// meshState.Source, copies of the expected hops (route.Hops) and of ackPath,
// and a drift flag. Drift is reported only when ackPath is non-empty and
// differs from route.Hops according to sameStringSlice. When decisionApplied
// is true the identifying fields of decision are added under
// "route_path_decision_*" keys. meshState must not be nil.
func routeHealthObservationMetadata(meshState *syntheticMeshState, route mesh.SyntheticRoute, decision client.RoutePathDecision, decisionApplied bool, ackPath []string) map[string]any {
	// Without an observed ack path there is nothing to compare against.
	driftDetected := len(ackPath) > 0 && !sameStringSlice(ackPath, route.Hops)
	expectedHops := append([]string{}, route.Hops...)
	observedPath := append([]string{}, ackPath...)

	metadata := map[string]any{
		"stage":                              "c17z20",
		"observation_type":                   "synthetic_route_health",
		"route_id":                           route.RouteID,
		"config_source":                      meshState.Source,
		"route_health_route_config_contract": "control_plane_route_path_decisions_to_synthetic_route_health",

		"traffic_forwarding":                         false,
		"production_forwarding":                      false,
		"production_payload_forwarding":              false,
		"service_workload_traffic":                   false,
		"production_route_path_forwarding_runtime":   false,
		"route_health_service_payload_forwarding":    false,
		"route_health_production_payload_forwarding": false,

		"route_health_only":                         true,
		"synthetic_route_health_route_path_runtime": true,
		"control_plane_only":                        true,

		"route_path_decision_applied": decisionApplied,
		"expected_effective_hops":     expectedHops,
		"observed_ack_path":           observedPath,
		"route_path_drift_detected":   driftDetected,
	}
	if !decisionApplied {
		return metadata
	}

	metadata["route_path_decision_id"] = decision.DecisionID
	metadata["route_path_decision_generation"] = decision.Generation
	metadata["route_path_decision_source"] = decision.DecisionSource
	metadata["route_path_decision_next_hop_id"] = decision.NextHopID
	metadata["route_path_decision_selected_relay_id"] = decision.SelectedRelayID
	metadata["route_path_decision_stale_relay_node_id"] = decision.StaleRelayNodeID
	metadata["route_path_decision_rendezvous_peer_node_id"] = decision.RendezvousPeerNodeID
	metadata["route_path_decision_rendezvous_lease_id"] = decision.RendezvousLeaseID
	metadata["route_path_decision_rendezvous_lease_reason"] = decision.RendezvousLeaseReason
	metadata["route_path_decision_effective_hops"] = append([]string{}, decision.EffectiveHops...)
	metadata["route_path_decision_original_hops"] = append([]string{}, decision.OriginalHops...)
	return metadata
}
// routeHealthFeedbackTriggerFromObservation decides whether a route-health
// observation should trigger a mesh config refresh and, if so, describes why.
//
// No trigger is produced for a route with a blank RouteID. Otherwise the
// reason is chosen in this order: a non-blank metadata["failure_reason"]
// string, a non-empty link status other than "reachable", or a true
// metadata["route_path_drift_detected"]. If none applies the second result is
// false. The trigger targets route.DestinationNodeID unless an applied
// decision names a RendezvousPeerNodeID; an applied decision also supplies
// SelectedRelayID. ObservedAt is stored in UTC.
func routeHealthFeedbackTriggerFromObservation(route mesh.SyntheticRoute, decision client.RoutePathDecision, decisionApplied bool, linkStatus string, metadata map[string]any, observedAt time.Time) (meshRouteHealthFeedbackTrigger, bool) {
	if strings.TrimSpace(route.RouteID) == "" {
		return meshRouteHealthFeedbackTrigger{}, false
	}

	status := strings.TrimSpace(linkStatus)
	// Missing keys or values of another type read as "" / false.
	failureReason, _ := metadata["failure_reason"].(string)
	driftDetected, _ := metadata["route_path_drift_detected"].(bool)

	var reason string
	if strings.TrimSpace(failureReason) != "" {
		reason = "synthetic_route_health_failure"
	} else if status != "" && status != "reachable" {
		reason = "synthetic_route_health_unreachable"
	} else if driftDetected {
		reason = "synthetic_route_health_drift"
	} else {
		return meshRouteHealthFeedbackTrigger{}, false
	}

	trigger := meshRouteHealthFeedbackTrigger{
		Reason:        reason,
		RouteID:       route.RouteID,
		PeerNodeID:    route.DestinationNodeID,
		LinkStatus:    status,
		FailureReason: failureReason,
		DriftDetected: driftDetected,
		ObservedAt:    observedAt.UTC(),
	}
	if !decisionApplied {
		return trigger, true
	}
	if decision.RendezvousPeerNodeID != "" {
		trigger.PeerNodeID = decision.RendezvousPeerNodeID
	}
	trigger.SelectedRelayID = decision.SelectedRelayID
	return trigger, true
}
// routeHealthAckPath extracts the node path recorded in a probe ack.
//
// It returns nil when the ack has no payload or the payload is not valid JSON
// for mesh.SyntheticProbeAckPayload. Otherwise it returns a fresh copy of the
// decoded Path, which is non-nil even when the path is empty.
func routeHealthAckPath(ack mesh.SyntheticEnvelope) []string {
	if len(ack.Payload) == 0 {
		return nil
	}
	decoded := mesh.SyntheticProbeAckPayload{}
	if json.Unmarshal(ack.Payload, &decoded) != nil {
		// An undecodable ack is treated the same as a missing one.
		return nil
	}
	return append([]string{}, decoded.Path...)
}
// probeWarmPeerHealth probes peer connectivity and reports one mesh link
// observation per probed peer to the control plane.
//
// It returns nil immediately when meshState or meshState.PeerCache is nil.
//
// Two paths exist:
//   - When meshState.PeerConnectionManager is set, a single ProbeOnce cycle is
//     run, its recovery and intent plans are stored on meshState, and every
//     result of the cycle is reported (stage "c17z20").
//   - Otherwise a recovery plan and a connection intent plan are computed
//     here, and every plan candidate with a non-blank endpoint is probed
//     directly with a health message under a 2 second timeout (stage
//     "c17z10"). Candidates are skipped when meshState.PeerConnections says
//     they should not be probed yet.
//
// The first error returned by api.ReportMeshLink aborts the run and is
// returned; probe failures themselves are reported as "unreachable" and do
// not abort.
//
// NOTE(review): api is used without a nil check here, unlike
// reportSyntheticRouteHealth — confirm callers never pass a nil client.
func probeWarmPeerHealth(ctx context.Context, api *client.Client, identity state.Identity, meshState *syntheticMeshState) error {
if meshState == nil || meshState.PeerCache == nil {
return nil
}
if meshState.PeerConnectionManager != nil {
// Manager path: the manager performs the probes; this function only
// records its plans and reports the results.
cycle := meshState.PeerConnectionManager.ProbeOnce(ctx)
meshState.LastPeerRecoveryPlan = &cycle.RecoveryPlan
meshState.LastPeerConnectionIntent = &cycle.IntentPlan
for _, result := range cycle.Results {
metadata := map[string]any{
"stage": "c17z20",
"traffic_forwarding": false,
"observation_type": "peer_connection_manager",
"config_source": meshState.Source,
"manager_probe_status": result.LinkStatus,
"manager_mode": cycle.Mode,
"manager_attempted": cycle.Attempted,
"manager_succeeded": cycle.Succeeded,
"manager_failed": cycle.Failed,
"manager_deferred": cycle.Deferred,
"manager_rendezvous_required": cycle.RendezvousRequiredCount,
"manager_rendezvous_resolved": cycle.RendezvousResolvedCount,
"manager_relay_control": cycle.RelayControlCount,
"connection_intent_action": result.Action,
"connection_intent_reason": result.Reason,
"transport_mode": result.TransportMode,
"requires_rendezvous": result.RequiresRendezvous,
"rendezvous_resolved": result.RendezvousResolved,
"direct_candidate": result.DirectCandidate,
"relay_candidate": result.RelayCandidate,
"rendezvous_lease_id": result.RendezvousLeaseID,
"relay_node_id": result.RelayNodeID,
"relay_endpoint": result.RelayEndpoint,
"connection_state": result.ConnectionState.State,
"consecutive_successes": result.ConnectionState.ConsecutiveSuccesses,
"consecutive_failures": result.ConnectionState.ConsecutiveFailures,
"backoff_until": result.ConnectionState.BackoffUntil,
"service_workload_traffic": false,
"persistent_connection_manager": true,
"persistent_connection_kind": "http_keepalive_control_health_or_relay_control_health",
}
if result.FailureReason != "" {
metadata["failure_reason"] = result.FailureReason
}
// Latency and quality score are only reported for reachable peers;
// they stay nil otherwise.
var latency *int
var qualityScore *int
if result.LinkStatus == mesh.PeerConnectionProbeReachable {
latency = &result.LatencyMs
score := syntheticQualityScore(result.LatencyMs)
qualityScore = &score
}
if err := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{
SourceNodeID: identity.NodeID,
TargetNodeID: result.NodeID,
LinkStatus: meshLinkStatusFromPeerProbe(result.LinkStatus),
LatencyMs: latency,
QualityScore: qualityScore,
Metadata: metadata,
}); err != nil {
return err
}
}
return nil
}
// Direct path: no connection manager is configured, so plan and probe here.
local := mesh.PeerIdentity{ClusterID: identity.ClusterID, NodeID: identity.NodeID}
plan := peerRecoveryPlan(meshState, time.Now().UTC())
meshState.LastPeerRecoveryPlan = &plan
intentPlan := peerConnectionIntentPlan(meshState, plan, time.Now().UTC())
meshState.LastPeerConnectionIntent = &intentPlan
intentsByNode := peerConnectionIntentsByNode(intentPlan)
for _, candidate := range plan.Candidates {
if strings.TrimSpace(candidate.Endpoint) == "" {
continue
}
// Zero-value intent when the plan holds none for this node.
intent := intentsByNode[candidate.NodeID]
now := time.Now().UTC()
if meshState.PeerConnections != nil && !meshState.PeerConnections.ShouldProbe(candidate.NodeID, now) {
continue
}
entry := mesh.PeerCacheEntry{
NodeID: candidate.NodeID,
Endpoint: candidate.Endpoint,
Warm: candidate.Warm,
WarmReason: candidate.WarmReason,
RecoverySeed: candidate.RecoverySeed,
BestCandidateID: candidate.BestCandidateID,
BestTransport: candidate.BestTransport,
}
if meshState.PeerConnections != nil {
meshState.PeerConnections.BeginProbe(entry, now)
}
// startedAt is the reference point for the latency measured on success.
startedAt := time.Now()
probeCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
_, err := mesh.NewClient(strings.TrimRight(entry.Endpoint, "/")).SendHealth(probeCtx, mesh.NewHealthMessage(local, mesh.PeerIdentity{
ClusterID: identity.ClusterID,
NodeID: candidate.NodeID,
}))
cancel()
if err != nil {
// Without a connection tracker the reported state stays the zero value.
connectionState := mesh.PeerConnectionState{}
if meshState.PeerConnections != nil {
connectionState = meshState.PeerConnections.RecordFailure(candidate.NodeID, err.Error(), time.Now().UTC())
}
if reportErr := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{
SourceNodeID: identity.NodeID,
TargetNodeID: candidate.NodeID,
LinkStatus: "unreachable",
Metadata: map[string]any{
"stage": "c17z10",
"traffic_forwarding": false,
"observation_type": "warm_peer_health",
"config_source": meshState.Source,
"warm_reason": entry.WarmReason,
"best_candidate_id": entry.BestCandidateID,
"best_transport": entry.BestTransport,
"recovery_seed": entry.RecoverySeed,
"recovery_plan_mode": plan.Mode,
"recovery_probe_reason": candidate.Reason,
"recovery_target_ready": plan.TargetReadyPeers,
"recovery_ready_peers": plan.ReadyPeerCount,
"recovery_deficit": plan.Deficit,
"connection_intent_action": intent.Action,
"transport_mode": intent.TransportMode,
"requires_rendezvous": intent.RequiresRendezvous,
"direct_candidate": intent.DirectCandidate,
"connection_state": connectionState.State,
"consecutive_failures": connectionState.ConsecutiveFailures,
"backoff_until": connectionState.BackoffUntil,
"failure_reason": err.Error(),
"service_workload_traffic": false,
},
}); reportErr != nil {
return reportErr
}
continue
}
latency := int(time.Since(startedAt).Milliseconds())
qualityScore := syntheticQualityScore(latency)
connectionState := mesh.PeerConnectionState{}
if meshState.PeerConnections != nil {
connectionState = meshState.PeerConnections.RecordSuccess(candidate.NodeID, latency, time.Now().UTC())
}
if err := api.ReportMeshLink(ctx, identity.ClusterID, client.MeshLinkObservationRequest{
SourceNodeID: identity.NodeID,
TargetNodeID: candidate.NodeID,
LinkStatus: "reachable",
LatencyMs: &latency,
QualityScore: &qualityScore,
Metadata: map[string]any{
"stage": "c17z10",
"traffic_forwarding": false,
"observation_type": "warm_peer_health",
"config_source": meshState.Source,
"warm_reason": entry.WarmReason,
"best_candidate_id": entry.BestCandidateID,
"best_transport": entry.BestTransport,
"recovery_seed": entry.RecoverySeed,
"recovery_plan_mode": plan.Mode,
"recovery_probe_reason": candidate.Reason,
"recovery_target_ready": plan.TargetReadyPeers,
"recovery_ready_peers": plan.ReadyPeerCount,
"recovery_deficit": plan.Deficit,
"connection_intent_action": intent.Action,
"transport_mode": intent.TransportMode,
"requires_rendezvous": intent.RequiresRendezvous,
"direct_candidate": intent.DirectCandidate,
"connection_state": connectionState.State,
"consecutive_successes": connectionState.ConsecutiveSuccesses,
"service_workload_traffic": false,
},
}); err != nil {
return err
}
}
return nil
}
// meshLinkStatusFromPeerProbe translates a peer connection probe status into
// the link status string reported to the control plane: reachable and
// unreachable map to themselves, a deferred probe maps to "degraded", and a
// skipped probe or any unrecognized status maps to "unknown".
func meshLinkStatusFromPeerProbe(status string) string {
	if status == mesh.PeerConnectionProbeReachable {
		return "reachable"
	}
	if status == mesh.PeerConnectionProbeUnreachable {
		return "unreachable"
	}
	if status == mesh.PeerConnectionProbeDeferred {
		return "degraded"
	}
	// mesh.PeerConnectionProbeSkipped and every other value end up here.
	return "unknown"
}
// peerRecoveryPlan computes a peer recovery plan from the current peer cache
// and connection snapshots, using the default ready-peer target and probe
// limit from the mesh package. It returns the zero plan when meshState or its
// PeerCache is nil. When no connection tracker is present the plan is
// computed with a zero-value connection snapshot.
func peerRecoveryPlan(meshState *syntheticMeshState, now time.Time) mesh.PeerRecoveryPlan {
	if meshState == nil || meshState.PeerCache == nil {
		return mesh.PeerRecoveryPlan{}
	}

	planConfig := mesh.PeerRecoveryPlanConfig{
		TargetReadyPeers:   mesh.DefaultStablePeerTarget,
		MaxProbeCandidates: mesh.DefaultRecoveryProbeLimit,
		Now:                now,
	}
	if meshState.PeerConnections != nil {
		planConfig.Connections = meshState.PeerConnections.Snapshot()
	}
	planConfig.PeerCache = meshState.PeerCache.Snapshot()
	return mesh.PlanPeerRecovery(planConfig)
}
// peerConnectionIntentPlan derives connection intents from the peer cache
// snapshot, the given recovery plan and the rendezvous leases stored on
// meshState. It returns the zero plan when meshState or its PeerCache is nil.
func peerConnectionIntentPlan(meshState *syntheticMeshState, recoveryPlan mesh.PeerRecoveryPlan, now time.Time) mesh.PeerConnectionIntentPlan {
	if meshState == nil {
		return mesh.PeerConnectionIntentPlan{}
	}
	if meshState.PeerCache == nil {
		return mesh.PeerConnectionIntentPlan{}
	}

	planConfig := mesh.PeerConnectionIntentPlanConfig{
		PeerCache:        meshState.PeerCache.Snapshot(),
		RecoveryPlan:     recoveryPlan,
		RendezvousLeases: meshState.RendezvousLeases,
		Now:              now,
	}
	return mesh.PlanPeerConnectionIntents(planConfig)
}
// peerConnectionIntentsByNode indexes the intents of plan by NodeID. Intents
// with a blank NodeID are left out; when several intents share a NodeID the
// last one wins. The returned map is never nil.
func peerConnectionIntentsByNode(plan mesh.PeerConnectionIntentPlan) map[string]mesh.PeerConnectionIntent {
	byNode := map[string]mesh.PeerConnectionIntent{}
	for i := range plan.Intents {
		nodeID := plan.Intents[i].NodeID
		if strings.TrimSpace(nodeID) == "" {
			continue
		}
		byNode[nodeID] = plan.Intents[i]
	}
	return byNode
}
// syntheticQualityScore converts a latency in milliseconds into a quality
// score between 1 and 100. Latencies of 10 ms or less (including negative
// values) score 100, latencies of 1000 ms or more score 1, and everything in
// between loses one point per full 10 ms.
func syntheticQualityScore(latencyMs int) int {
	if latencyMs <= 10 {
		return 100
	}
	if latencyMs >= 1000 {
		return 1
	}
	// For 11..999 ms the integer division yields 1..99, so the score stays
	// within 1..99; the lower clamp is kept as a safety net.
	if score := 100 - latencyMs/10; score >= 1 {
		return score
	}
	return 1
}
// sendHeartbeat sends one heartbeat for the approved node identity and
// returns the testing flags from the response together with the heartbeat
// error, if any.
//
// An identity without NodeID or ClusterID is rejected before any request is
// made. After a successful heartbeat the update hint from the response is
// persisted; a failure to persist it is only logged and does not fail the
// heartbeat.
func sendHeartbeat(ctx context.Context, api *client.Client, cfg config.Config, identity state.Identity, meshState *syntheticMeshState) (client.EffectiveTestingFlags, error) {
	if identity.NodeID == "" || identity.ClusterID == "" {
		return client.EffectiveTestingFlags{}, fmt.Errorf("node identity is not approved")
	}

	request := heartbeatPayload(cfg, identity, meshState, time.Now().UTC())
	response, err := api.Heartbeat(ctx, identity.ClusterID, identity.NodeID, request)
	if err != nil {
		// The flags from the response are passed through as-is on failure.
		return response.TestingFlags, err
	}

	log.Printf("heartbeat sent: node_id=%s cluster_id=%s", identity.NodeID, identity.ClusterID)
	if hintErr := persistUpdateHintTrigger(cfg.StateDir, response.UpdateHint); hintErr != nil {
		log.Printf("update hint trigger failed: %v", hintErr)
	}
	return response.TestingFlags, nil
}
// persistUpdateHintTrigger stores an update trigger in stateDir when the
// control plane hints that an update check should run now.
//
// Nothing is written (and nil is returned) when hint is nil, CheckNow is
// false, the trimmed Generation is empty, or the trimmed Generation equals
// the generation currently recorded for the node agent. The update service
// node ID and status are only filled in when hint.UpdateService is set.
func persistUpdateHintTrigger(stateDir string, hint *client.NodeUpdateHint) error {
	if hint == nil || !hint.CheckNow {
		return nil
	}
	generation := strings.TrimSpace(hint.Generation)
	if generation == "" {
		return nil
	}
	// Skip the write when this generation has already been recorded.
	if hostagent.CurrentUpdateTriggerGenerationForNodeAgent(stateDir) == generation {
		return nil
	}

	trigger := hostagent.UpdateTrigger{
		SchemaVersion:       "rap.node_update_trigger.v1",
		Generation:          generation,
		Products:            hint.Products,
		Reason:              hint.Reason,
		DeliveryMode:        hint.DeliveryMode,
		SubscriptionStatus:  hint.SubscriptionStatus,
		FallbackPollSeconds: hint.FallbackPollSeconds,
		ObservedAt:          time.Now().UTC(),
	}
	if service := hint.UpdateService; service != nil {
		trigger.UpdateServiceNodeID = service.NodeID
		trigger.UpdateServiceStatus = service.Status
	}
	return hostagent.SaveUpdateTrigger(stateDir, trigger)
}
// heartbeatPayload builds the heartbeat request sent to the control plane.
//
// It starts from agent.HeartbeatPayload() and, when mesh information is
// available, adds mesh reports to Metadata and matching flags to
// Capabilities. The plain base payload is returned unchanged when the
// advertised endpoint candidates cannot be computed (the error is logged), or
// when there are no candidates and meshState carries neither a peer cache nor
// a listener report.
//
// HealthStatus is set to "warning" when the listener report says
// "listen_failed" while the connectivity mode is not "outbound_only", and
// when the synthetic runtime is enabled and meshState records a config load
// error.
func heartbeatPayload(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time) client.HeartbeatRequest {
// Use the listener runtime config stored on meshState instead of the passed
// cfg once that stored config has a BackendURL.
if meshState != nil && meshState.ListenerRuntimeConfig.BackendURL != "" {
cfg = meshState.ListenerRuntimeConfig
}
payload := agent.HeartbeatPayload()
candidates, err := advertisedEndpointCandidates(cfg, identity, meshState, observedAt)
if err != nil {
log.Printf("mesh endpoint report skipped: %v", err)
return payload
}
// Nothing mesh-related to report: no candidates, no peer cache, no listener report.
if len(candidates) == 0 && (meshState == nil || (meshState.PeerCache == nil && meshState.ListenerReport.SchemaVersion == "")) {
return payload
}
if payload.Metadata == nil {
payload.Metadata = map[string]any{}
}
if payload.Capabilities == nil {
payload.Capabilities = map[string]any{}
}
payload.Metadata["stage"] = "c17z20"
// Listener diagnostics, present once a listener report has been recorded.
if meshState != nil && meshState.ListenerReport.SchemaVersion != "" {
report := meshState.ListenerReport
report.ObservedAt = observedAt.UTC().Format(time.RFC3339Nano)
payload.Metadata["mesh_listener_report"] = report
payload.Capabilities["mesh_listener_diagnostics"] = true
if report.OneWayConnectivity {
payload.Capabilities["mesh_one_way_connectivity"] = true
}
if report.Status == "listen_failed" && cfg.MeshConnectivityMode != "outbound_only" {
payload.HealthStatus = "warning"
}
}
// Outbound session and service channel telemetry, only with the synthetic runtime enabled.
if cfg.MeshSyntheticRuntimeEnabled {
payload.Metadata["mesh_outbound_session_report"] = meshOutboundSessionReportFromState(cfg, meshState, observedAt)
payload.Capabilities["mesh_outbound_control_session"] = true
payload.Capabilities["mesh_reverse_control_channel_contract"] = true
if meshState != nil && meshState.ServiceChannelAccessStats != nil {
payload.Metadata["fabric_service_channel_access_report"] = meshState.ServiceChannelAccessStats.Report(observedAt)
payload.Capabilities["fabric_service_channel_access_telemetry"] = true
}
if cfg.MeshProductionForwardingEnabled || (meshState != nil && meshState.ProductionForwardingEnabled) {
payload.Capabilities["mesh_production_forwarding"] = true
}
if meshState != nil && meshState.ConfigLoadError != "" {
payload.HealthStatus = "warning"
}
}
if len(candidates) > 0 {
payload.Metadata["mesh_endpoint_report"] = meshEndpointReport(cfg, identity, meshState, observedAt, candidates)
payload.Capabilities["mesh_dynamic_endpoint_reporting"] = true
}
// Peer recovery, rendezvous and route reports all require a peer cache.
if meshState != nil && meshState.PeerCache != nil {
payload.Metadata["mesh_peer_recovery_report"] = meshPeerRecoveryReport(meshState, observedAt)
payload.Metadata["mesh_peer_connection_intent_report"] = meshPeerConnectionIntentReport(meshState, observedAt)
payload.Metadata["mesh_peer_connection_manager_report"] = meshPeerConnectionManagerReport(meshState, observedAt)
payload.Metadata["mesh_rendezvous_lease_report"] = meshRendezvousLeaseReport(meshState, identity, observedAt)
payload.Metadata["mesh_route_path_decision_report"] = meshRoutePathDecisionReport(meshState, identity, observedAt)
payload.Metadata["mesh_route_generation_report"] = meshRouteGenerationReport(meshState, identity, observedAt)
payload.Metadata["mesh_route_health_config_report"] = meshRouteHealthConfigReport(meshState, identity, observedAt)
payload.Metadata["mesh_route_health_feedback_refresh_report"] = meshRouteHealthFeedbackRefreshReport(meshState, identity, observedAt)
payload.Capabilities["mesh_peer_recovery_planning"] = true
payload.Capabilities["mesh_peer_connection_intent_planning"] = true
payload.Capabilities["mesh_peer_connection_manager"] = true
payload.Capabilities["mesh_per_peer_endpoint_probe_fallback"] = true
payload.Capabilities["mesh_rendezvous_relay_control_contract"] = true
payload.Capabilities[meshRendezvousLeaseTelemetryCapability] = true
payload.Capabilities[meshRendezvousLeaseRefreshCapability] = true
payload.Capabilities[meshRendezvousRelayReplacementCapability] = true
payload.Capabilities[meshRoutePathDecisionCapability] = true
payload.Capabilities[meshRouteGenerationTrackerCapability] = true
payload.Capabilities[meshRouteHealthConfigCapability] = true
payload.Capabilities[meshRouteHealthFeedbackRefreshCapability] = true
}
// The fabric service channel runtime report is added when either VPN fabric component exists.
if meshState != nil && (meshState.VPNFabricIngress != nil || meshState.VPNFabricInbox != nil) {
payload.Metadata["fabric_service_channel_runtime_report"] = fabricServiceChannelRuntimeReport(meshState, identity, observedAt)
payload.Capabilities["fabric_service_channel_runtime"] = true
payload.Capabilities["fabric_service_channel_route_manager"] = true
}
return payload
}
// fabricServiceChannelRuntimeReport builds the runtime report for the VPN
// packet fabric service channel.
//
// The report always contains the schema version, identity, fixed channel
// description and the observation time in UTC. With a nil meshState it is
// marked disabled and returned as is. Otherwise it also records whether VPN
// ingress is present, the production forwarding flag, the number of usable
// VPN packet routes from this node, the config source and version, and
// snapshots of the ingress and inbox when those exist.
func fabricServiceChannelRuntimeReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
	report := map[string]any{
		"schema_version": "c18l.fabric_service_channel_runtime_report.v1",
		"cluster_id":     identity.ClusterID,
		"node_id":        identity.NodeID,
		"observed_at":    observedAt.UTC().Format(time.RFC3339Nano),

		"service_class":                   "vpn_packets",
		"channel_class":                   mesh.ProductionChannelVPNPacket,
		"route_manager":                   "primary_sticky_with_alternate_route_failover",
		"route_authority":                 "fabric_farm",
		"backend_relay_fallback":          false,
		"backend_relay_fallback_position": "disabled_farm_owned_dataplane",
		"application_protocol_agnostic":   true,
	}
	if meshState == nil {
		report["enabled"] = false
		return report
	}

	ingressPresent := meshState.VPNFabricIngress != nil
	report["enabled"] = ingressPresent
	report["production_payload_forwarding"] = meshState.ProductionForwardingEnabled
	report["route_candidate_total"] = countVPNPacketRoutes(meshState.Routes, identity.ClusterID, identity.NodeID)
	report["config_source"] = meshState.Source
	report["config_version"] = meshState.ConfigVersion
	if ingressPresent {
		report["ingress"] = meshState.VPNFabricIngress.Snapshot(identity.ClusterID)
	}
	if meshState.VPNFabricInbox != nil {
		report["inbox"] = meshState.VPNFabricInbox.Snapshot()
	}
	return report
}
// countVPNPacketRoutes counts the routes that can carry VPN packets from the
// local node right now.
//
// A route is counted when it belongs to clusterID, starts at localNodeID,
// allows the VPN packet production channel, has not expired (a zero
// ExpiresAt never expires), and has a next hop that is neither empty nor the
// local node itself. Expiry is evaluated against the current UTC time.
func countVPNPacketRoutes(routes []mesh.SyntheticRoute, clusterID string, localNodeID string) int {
	total := 0
	now := time.Now().UTC()
	for i := range routes {
		route := &routes[i]
		if route.ClusterID != clusterID {
			continue
		}
		if route.SourceNodeID != localNodeID {
			continue
		}
		if !containsString(route.AllowedChannels, mesh.ProductionChannelVPNPacket) {
			continue
		}
		if expired := !route.ExpiresAt.IsZero() && !route.ExpiresAt.After(now); expired {
			continue
		}
		nextHop := serviceChannelNextHopAfter(route.Hops, localNodeID, route.DestinationNodeID)
		if nextHop != "" && nextHop != localNodeID {
			total++
		}
	}
	return total
}
// serviceChannelNextHopAfter returns the node that follows localNodeID in
// path.
//
// The first occurrence of localNodeID is used. If the local node is the last
// entry of the path, localNodeID itself is returned. If the path is empty or
// does not contain the local node, destinationNodeID is returned.
func serviceChannelNextHopAfter(path []string, localNodeID string, destinationNodeID string) string {
	last := len(path) - 1
	for i := range path {
		if path[i] != localNodeID {
			continue
		}
		if i == last {
			// The local node terminates the path; there is no further hop.
			return localNodeID
		}
		return path[i+1]
	}
	return destinationNodeID
}
// meshOutboundSessionReportFromState describes the node-to-control-plane
// heartbeat session as a "c17z22.mesh_outbound_session_report.v1" report.
//
// With a nil meshState only the static fields are populated, and the session
// counts as usable for inbound control only when the configured connectivity
// mode is "outbound_only". Otherwise the listener status, peer-connection
// counters and rendezvous lease count are copied from meshState, and a
// non-empty ConfigLoadError downgrades Status to "degraded".
func meshOutboundSessionReportFromState(cfg config.Config, meshState *syntheticMeshState, observedAt time.Time) meshOutboundSessionReport {
	report := meshOutboundSessionReport{
		SchemaVersion:           "c17z22.mesh_outbound_session_report.v1",
		Status:                  "ready",
		Direction:               "node_to_control_plane",
		Transport:               "heartbeat_keepalive",
		ControlPlaneURL:         cfg.BackendURL,
		ConnectivityMode:        defaultString(cfg.MeshConnectivityMode, "direct"),
		InboundListenerRequired: false,
		ProductionForwarding:    false,
		ServiceWorkloadTraffic:  false,
		ObservedAt:              observedAt.UTC().Format(time.RFC3339Nano),
	}
	if meshState == nil {
		report.UsableForInboundControl = cfg.MeshConnectivityMode == "outbound_only"
		return report
	}

	listenerReport := meshState.ListenerReport
	report.ListenerStatus = listenerReport.Status
	report.ListenerFailureReason = listenerReport.FailureReason
	report.ListenerPortConflict = listenerReport.PortConflict
	report.ConfigLoadError = meshState.ConfigLoadError

	// Any of these listener outcomes marks the session usable; otherwise it
	// depends on one-way connectivity or the outbound-only mode.
	switch listenerReport.Status {
	case "listening", "auto_rebound", "listen_failed":
		report.UsableForInboundControl = true
	default:
		report.UsableForInboundControl = listenerReport.OneWayConnectivity ||
			cfg.MeshConnectivityMode == "outbound_only"
	}

	if meshState.PeerConnections != nil {
		counts := meshState.PeerConnections.Snapshot()
		report.PeerConnectionReady = counts.Ready
		report.PeerConnectionRelayReady = counts.RelayReady
		report.PeerConnectionWaiting = counts.Waiting
	}
	report.RendezvousLeaseCount = len(meshState.RendezvousLeases)

	if meshState.ConfigLoadError != "" {
		report.Status = "degraded"
		// Keep a listener-supplied reason if there is one.
		report.ListenerFailureReason = firstNonEmpty(report.ListenerFailureReason, "mesh_config_load_failed")
	}
	return report
}
// meshEndpointReport builds the "c17z6.mesh_endpoint_report.v1" payload that
// advertises this node's mesh endpoint and summarises peer cache, peer
// connection and peer recovery state.
//
// Empty cfg.MeshAdvertiseTransport, cfg.MeshConnectivityMode and
// cfg.MeshNATType fall back to "direct_tcp_tls", "direct" and "unknown".
// "peer_endpoint" is the address of the first candidate; when candidates is
// empty it is reported as "" instead of panicking on an out-of-range index.
//
// Side effect: when meshState has a PeerCache, the freshly computed recovery
// plan and connection intent plan are stored in
// meshState.LastPeerRecoveryPlan and meshState.LastPeerConnectionIntent.
func meshEndpointReport(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time, candidates []mesh.PeerEndpointCandidate) map[string]any {
	transport := cfg.MeshAdvertiseTransport
	if transport == "" {
		transport = "direct_tcp_tls"
	}
	connectivityMode := cfg.MeshConnectivityMode
	if connectivityMode == "" {
		connectivityMode = "direct"
	}
	natType := cfg.MeshNATType
	if natType == "" {
		natType = "unknown"
	}
	report := map[string]any{
		"schema_version":      "c17z6.mesh_endpoint_report.v1",
		"cluster_id":          identity.ClusterID,
		"node_id":             identity.NodeID,
		"peer_endpoint":       "",
		"transport":           transport,
		"connectivity_mode":   connectivityMode,
		"nat_type":            natType,
		"observed_at":         observedAt.UTC().Format(time.RFC3339Nano),
		"endpoint_candidates": candidates,
	}
	// Only index into candidates when there is something to advertise.
	if len(candidates) > 0 {
		report["peer_endpoint"] = candidates[0].Address
	}
	if meshState != nil && meshState.PeerCache != nil {
		snapshot := meshState.PeerCache.Snapshot()
		report["peer_cache_peers"] = snapshot.PeerCount
		report["warm_peers"] = snapshot.WarmPeerCount
		report["recovery_seeds"] = snapshot.RecoverySeedCount
		report["rendezvous_leases"] = snapshot.RendezvousLeaseCount
	}
	if meshState != nil && meshState.PeerConnections != nil {
		snapshot := meshState.PeerConnections.Snapshot()
		report["peer_connection_total"] = snapshot.Total
		report["peer_connection_ready"] = snapshot.Ready
		report["peer_connection_relay_ready"] = snapshot.RelayReady
		report["peer_connection_degraded"] = snapshot.Degraded
		report["peer_connection_backoff"] = snapshot.Backoff
		report["peer_connection_waiting_rendezvous"] = snapshot.Waiting
		report["peer_connection_connecting"] = snapshot.Connecting
		report["peer_connection_disconnected"] = snapshot.Disconnected
	}
	if meshState != nil && meshState.PeerCache != nil {
		// The recovery plan is cached before the intent plan is derived, in
		// the same order as the other report builders in this file.
		plan := peerRecoveryPlan(meshState, observedAt)
		meshState.LastPeerRecoveryPlan = &plan
		intentPlan := peerConnectionIntentPlan(meshState, plan, observedAt)
		meshState.LastPeerConnectionIntent = &intentPlan
		report["peer_recovery_mode"] = plan.Mode
		report["peer_recovery_healthy"] = plan.Healthy
		report["peer_recovery_target_ready"] = plan.TargetReadyPeers
		report["peer_recovery_ready"] = plan.ReadyPeerCount
		report["peer_recovery_deficit"] = plan.Deficit
		report["peer_recovery_probe_candidates"] = plan.ProbeCandidateCount
		report["peer_recovery_seed_candidates"] = plan.RecoverySeedCandidateCount
		report["peer_connection_intents"] = intentPlan.IntentCount
		report["peer_connection_intent_direct"] = intentPlan.DirectCount
		report["peer_connection_intent_private_lan"] = intentPlan.PrivateLANCount
		report["peer_connection_intent_corp_lan"] = intentPlan.CorporateLANCount
		report["peer_connection_intent_outbound_only"] = intentPlan.OutboundOnlyCount
		report["peer_connection_intent_relay_required"] = intentPlan.RelayRequiredCount
		report["peer_connection_intent_relay_control"] = intentPlan.RelayControlCount
		report["peer_connection_intent_rendezvous_required"] = intentPlan.RendezvousRequiredCount
		report["peer_connection_intent_rendezvous_resolved"] = intentPlan.RendezvousResolvedCount
		report["rendezvous_lease_count"] = intentPlan.RendezvousLeaseCount
	}
	if cfg.MeshRegion != "" {
		report["region"] = cfg.MeshRegion
	}
	return report
}
// meshPeerRecoveryReport builds the "c17z9.mesh_peer_recovery_report.v1"
// payload from a freshly computed peer recovery plan and the connection
// intent plan derived from it.
//
// Side effect: when meshState is non-nil, both plans are stored in
// meshState.LastPeerRecoveryPlan and meshState.LastPeerConnectionIntent. The
// writes are skipped for a nil meshState, matching the nil check this
// function already applies before reading peer connections.
// NOTE(review): whether peerRecoveryPlan and peerConnectionIntentPlan accept a
// nil meshState is not visible from here; confirm before relying on it.
func meshPeerRecoveryReport(meshState *syntheticMeshState, observedAt time.Time) map[string]any {
	plan := peerRecoveryPlan(meshState, observedAt)
	if meshState != nil {
		// Cached before the intent plan is derived, as in the original order.
		meshState.LastPeerRecoveryPlan = &plan
	}
	intentPlan := peerConnectionIntentPlan(meshState, plan, observedAt)
	if meshState != nil {
		meshState.LastPeerConnectionIntent = &intentPlan
	}
	report := map[string]any{
		"schema_version":                  "c17z9.mesh_peer_recovery_report.v1",
		"mode":                            plan.Mode,
		"healthy":                         plan.Healthy,
		"target_ready_peers":              plan.TargetReadyPeers,
		"ready_peer_count":                plan.ReadyPeerCount,
		"degraded_peer_count":             plan.DegradedPeerCount,
		"backoff_peer_count":              plan.BackoffPeerCount,
		"connectable_peer_count":          plan.ConnectablePeerCount,
		"deficit":                         plan.Deficit,
		"probe_candidate_count":           plan.ProbeCandidateCount,
		"recovery_seed_candidate_count":   plan.RecoverySeedCandidateCount,
		"service_workload_traffic":        false,
		"production_payload_forwarding":   false,
		"persistent_connection_transport": false,
		"observed_at":                     observedAt.UTC().Format(time.RFC3339Nano),
		"connection_intent_count":         intentPlan.IntentCount,
		"rendezvous_required_count":       intentPlan.RendezvousRequiredCount,
		"rendezvous_resolved_count":       intentPlan.RendezvousResolvedCount,
		"rendezvous_lease_count":          intentPlan.RendezvousLeaseCount,
		"relay_control_count":             intentPlan.RelayControlCount,
	}
	if meshState != nil && meshState.PeerConnections != nil {
		snapshot := meshState.PeerConnections.Snapshot()
		report["peer_connection_total"] = snapshot.Total
		report["peer_connection_ready"] = snapshot.Ready
		report["peer_connection_relay_ready"] = snapshot.RelayReady
		report["peer_connection_degraded"] = snapshot.Degraded
		report["peer_connection_backoff"] = snapshot.Backoff
		report["peer_connection_waiting_rendezvous"] = snapshot.Waiting
		report["peer_connection_connecting"] = snapshot.Connecting
		report["peer_connection_disconnected"] = snapshot.Disconnected
	}
	return report
}
// meshPeerConnectionIntentReport builds the
// "c17z12.mesh_peer_connection_intent_report.v1" payload from a connection
// intent plan computed for observedAt.
//
// Side effect: the recovery plan and the intent plan derived from it are
// stored in meshState.LastPeerRecoveryPlan and
// meshState.LastPeerConnectionIntent, so meshState must be non-nil.
func meshPeerConnectionIntentReport(meshState *syntheticMeshState, observedAt time.Time) map[string]any {
	recovery := peerRecoveryPlan(meshState, observedAt)
	meshState.LastPeerRecoveryPlan = &recovery
	intents := peerConnectionIntentPlan(meshState, recovery, observedAt)
	meshState.LastPeerConnectionIntent = &intents

	observedStamp := observedAt.UTC().Format(time.RFC3339Nano)
	report := map[string]any{
		"schema_version":                  "c17z12.mesh_peer_connection_intent_report.v1",
		"mode":                            intents.Mode,
		"intent_count":                    intents.IntentCount,
		"maintain_count":                  intents.MaintainCount,
		"probe_count":                     intents.ProbeCount,
		"recover_count":                   intents.RecoverCount,
		"direct_count":                    intents.DirectCount,
		"private_lan_count":               intents.PrivateLANCount,
		"corporate_lan_count":             intents.CorporateLANCount,
		"outbound_only_count":             intents.OutboundOnlyCount,
		"relay_required_count":            intents.RelayRequiredCount,
		"relay_control_count":             intents.RelayControlCount,
		"rendezvous_required_count":       intents.RendezvousRequiredCount,
		"rendezvous_resolved_count":       intents.RendezvousResolvedCount,
		"rendezvous_lease_count":          intents.RendezvousLeaseCount,
		"service_workload_traffic":        false,
		"production_payload_forwarding":   false,
		"persistent_connection_transport": false,
		"observed_at":                     observedStamp,
	}
	return report
}
// meshPeerConnectionManagerReport builds the
// "c17z25.mesh_peer_connection_manager_report.v1" payload.
//
// When meshState or its PeerConnectionManager is nil the report carries only
// the static fields plus "enabled": false. Otherwise it adds the counters and
// probe results of the manager's last cycle and, if available, a subset of the
// peer connection snapshot counters.
func meshPeerConnectionManagerReport(meshState *syntheticMeshState, observedAt time.Time) map[string]any {
	report := map[string]any{
		"schema_version":                  "c17z25.mesh_peer_connection_manager_report.v1",
		"service_workload_traffic":        false,
		"production_payload_forwarding":   false,
		"persistent_connection_transport": true,
		"persistent_connection_kind":      "http_keepalive_control_health_or_relay_control_health",
		"observed_at":                     observedAt.UTC().Format(time.RFC3339Nano),
	}

	managerAvailable := meshState != nil && meshState.PeerConnectionManager != nil
	report["enabled"] = managerAvailable
	if !managerAvailable {
		return report
	}

	lastCycle := meshState.PeerConnectionManager.Snapshot().LastCycle
	report["mode"] = lastCycle.Mode
	report["intent_count"] = lastCycle.IntentCount
	report["attempted"] = lastCycle.Attempted
	report["succeeded"] = lastCycle.Succeeded
	report["failed"] = lastCycle.Failed
	report["deferred"] = lastCycle.Deferred
	report["skipped"] = lastCycle.Skipped
	report["rendezvous_required_count"] = lastCycle.RendezvousRequiredCount
	report["rendezvous_resolved_count"] = lastCycle.RendezvousResolvedCount
	report["relay_control_count"] = lastCycle.RelayControlCount
	report["last_started_at"] = lastCycle.StartedAt
	report["last_completed_at"] = lastCycle.CompletedAt
	report["probe_results"] = lastCycle.Results

	if meshState.PeerConnections == nil {
		return report
	}
	peers := meshState.PeerConnections.Snapshot()
	report["peer_connection_ready"] = peers.Ready
	report["peer_connection_relay_ready"] = peers.RelayReady
	report["peer_connection_degraded"] = peers.Degraded
	report["peer_connection_backoff"] = peers.Backoff
	report["peer_connection_waiting_rendezvous"] = peers.Waiting
	return report
}
// meshRendezvousLeaseReport builds the rendezvous lease telemetry report
// (schema meshRendezvousLeaseReportSchema) for the given node identity.
//
// For a nil meshState the report contains the static contract fields,
// zero/empty defaults for "lease_count", "config_source", "config_version"
// and "refresh_supported", and an empty "leases" list. The nil check runs
// before any field of meshState is read; previously the function dereferenced
// meshState while building the base report and could never reach its own nil
// branch.
//
// For a non-nil meshState the report adds peer connection counters, per-lease
// classification counters, refresh posture and refresh history, plus details
// for at most maxMeshRendezvousLeaseReportEntries leases. "truncated" tells
// whether more leases exist than were detailed. The counters cover every
// lease, not only the detailed ones.
func meshRendezvousLeaseReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
	observedAt = observedAt.UTC()
	report := map[string]any{
		"schema_version":                  meshRendezvousLeaseReportSchema,
		"cluster_id":                      identity.ClusterID,
		"node_id":                         identity.NodeID,
		"lease_count":                     0,
		"config_source":                   "",
		"config_version":                  "",
		"renewal_window_ms":               int64(meshRendezvousLeaseRenewalWindow / time.Millisecond),
		"refresh_backoff_ms":              int64(meshRendezvousLeaseRefreshBackoff / time.Millisecond),
		"refresh_contract":                "node_scoped_synthetic_config_get",
		"refresh_supported":               false,
		"control_plane_only":              true,
		"relay_payload_forwarding":        false,
		"service_workload_traffic":        false,
		"production_payload_forwarding":   false,
		"persistent_connection_transport": true,
		"observed_at":                     observedAt.Format(time.RFC3339Nano),
	}
	// Without mesh state there is nothing to inspect, so return the
	// defaults-only report before touching any meshState field.
	if meshState == nil {
		report["leases"] = []map[string]any{}
		return report
	}
	posture := meshRendezvousLeasePostureForState(meshState, identity, observedAt)
	report["lease_count"] = len(meshState.RendezvousLeases)
	report["config_source"] = meshState.Source
	report["config_version"] = meshState.ConfigVersion
	report["peer_directory_version"] = meshState.PeerDirectoryVersion
	report["policy_version"] = meshState.PolicyVersion
	report["refresh_supported"] = meshState.Source == "control_plane"

	// Index connection state by peer so each lease can be matched to it.
	connectionByPeer := map[string]mesh.PeerConnectionState{}
	if meshState.PeerConnections != nil {
		snapshot := meshState.PeerConnections.Snapshot()
		report["peer_connection_total"] = snapshot.Total
		report["peer_connection_ready"] = snapshot.Ready
		report["peer_connection_relay_ready"] = snapshot.RelayReady
		report["peer_connection_degraded"] = snapshot.Degraded
		report["peer_connection_backoff"] = snapshot.Backoff
		report["peer_connection_waiting_rendezvous"] = snapshot.Waiting
		report["peer_connection_connecting"] = snapshot.Connecting
		report["peer_connection_disconnected"] = snapshot.Disconnected
		for _, entry := range snapshot.Entries {
			connectionByPeer[entry.NodeID] = entry
		}
	}

	leaseDetails := make([]map[string]any, 0, minInt(len(meshState.RendezvousLeases), maxMeshRendezvousLeaseReportEntries))
	activeCount := 0
	usableCount := 0
	controlPlaneOnlyCount := 0
	invalidCount := 0
	expiredCount := 0
	expiringSoonCount := 0
	renewalNeededCount := 0
	admittedAsRelayCount := 0
	admittedAsPeerCount := 0
	entryObserverCount := 0
	relayControlReadyCount := 0
	staleRelayCount := 0
	withdrawalNeededCount := 0
	reselectionNeededCount := 0
	for index, lease := range meshState.RendezvousLeases {
		role := meshRendezvousLeaseRole(lease, identity.NodeID)
		valid := meshRendezvousLeaseBaseValid(lease)
		// Only a valid lease can be "expired"; an invalid one is counted
		// separately.
		expired := valid && !lease.ExpiresAt.After(observedAt)
		usable := valid && !expired
		if lease.ControlPlaneOnly {
			controlPlaneOnlyCount++
		}
		if !valid {
			invalidCount++
		}
		if expired {
			expiredCount++
		}
		if usable {
			activeCount++
			usableCount++
			switch role {
			case "relay":
				admittedAsRelayCount++
			case "peer":
				admittedAsPeerCount++
			default:
				entryObserverCount++
			}
		}
		ttlRemaining := lease.ExpiresAt.Sub(observedAt)
		renewalAfter := meshRendezvousLeaseRenewalAfter(lease)
		expiringSoon := usable && ttlRemaining <= meshRendezvousLeaseRenewalWindow
		renewalNeeded := meshRendezvousLeaseRenewalNeeded(lease, observedAt, usable)
		if expiringSoon {
			expiringSoonCount++
		}
		if renewalNeeded {
			renewalNeededCount++
		}
		// A peer with no connection entry yields the zero-value state.
		connectionState := connectionByPeer[lease.PeerNodeID]
		staleRelay := usable && meshRendezvousLeaseStaleRelay(lease, connectionState)
		// A stale relay lease is withdrawn when this node holds the relay
		// role and reselected otherwise.
		withdrawalNeeded := staleRelay && role == "relay"
		reselectionNeeded := staleRelay && role != "relay"
		if staleRelay {
			staleRelayCount++
		}
		if withdrawalNeeded {
			withdrawalNeededCount++
		}
		if reselectionNeeded {
			reselectionNeededCount++
		}
		relayReady := usable && connectionState.State == mesh.PeerConnectionRelayReady
		if relayReady {
			relayControlReadyCount++
		}
		// Counters above cover every lease; details are capped.
		if index < maxMeshRendezvousLeaseReportEntries {
			leaseDetails = append(leaseDetails, map[string]any{
				"lease_id":           lease.LeaseID,
				"peer_node_id":       lease.PeerNodeID,
				"relay_node_id":      lease.RelayNodeID,
				"relay_endpoint":     strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/"),
				"transport":          defaultString(lease.Transport, "relay_control"),
				"connectivity_mode":  defaultString(lease.ConnectivityMode, "relay_required"),
				"route_ids":          append([]string{}, lease.RouteIDs...),
				"allowed_channels":   append([]string{}, lease.AllowedChannels...),
				"priority":           lease.Priority,
				"role":               role,
				"status":             meshRendezvousLeaseStatus(valid, expired, renewalNeeded, role),
				"usable":             usable,
				"admitted":           usable && role == "relay",
				"renewal_needed":     renewalNeeded,
				"expiring_soon":      expiringSoon,
				"stale_relay":        staleRelay,
				"withdrawal_needed":  withdrawalNeeded,
				"reselection_needed": reselectionNeeded,
				"relay_ready":        relayReady,
				"connection_state":   connectionState.State,
				"ttl_remaining_ms":   int64(ttlRemaining / time.Millisecond),
				"issued_at":          formatOptionalTime(lease.IssuedAt),
				"expires_at":         formatOptionalTime(lease.ExpiresAt),
				"renewal_after":      formatOptionalTime(renewalAfter),
				"reason":             lease.Reason,
			})
		}
	}
	report["active_count"] = activeCount
	report["usable_count"] = usableCount
	report["control_plane_only_count"] = controlPlaneOnlyCount
	report["invalid_count"] = invalidCount
	report["expired_count"] = expiredCount
	report["expiring_soon_count"] = expiringSoonCount
	report["renewal_needed_count"] = renewalNeededCount
	report["admitted_as_relay_count"] = admittedAsRelayCount
	report["admitted_as_peer_count"] = admittedAsPeerCount
	report["entry_observer_count"] = entryObserverCount
	report["relay_control_ready_count"] = relayControlReadyCount
	report["stale_relay_count"] = staleRelayCount
	report["withdrawal_needed_count"] = withdrawalNeededCount
	report["reselection_needed_count"] = reselectionNeededCount
	report["refresh_needed"] = posture.RefreshNeeded
	report["refresh_reason"] = posture.Reason
	report["refresh_needed_count"] = posture.RefreshNeededCount
	report["refresh_attempt_count"] = meshState.LeaseRefreshAttempts
	report["refresh_success_count"] = meshState.LeaseRefreshSuccesses
	report["refresh_failure_count"] = meshState.LeaseRefreshFailures
	if meshState.LastLeaseRefresh != nil {
		report["last_refresh_status"] = meshState.LastLeaseRefresh.Status
		report["last_refresh_reason"] = meshState.LastLeaseRefresh.Reason
		report["last_refresh_error"] = meshState.LastLeaseRefresh.Error
		report["last_refresh_attempted_at"] = formatOptionalTime(meshState.LastLeaseRefresh.AttemptedAt)
		report["last_refresh_completed_at"] = formatOptionalTime(meshState.LastLeaseRefresh.CompletedAt)
		report["last_refresh_previous_lease_count"] = meshState.LastLeaseRefresh.PreviousLeaseCount
		report["last_refresh_refreshed_lease_count"] = meshState.LastLeaseRefresh.RefreshedLeaseCount
		report["last_refresh_config_version"] = meshState.LastLeaseRefresh.ConfigVersion
	}
	report["truncated"] = len(meshState.RendezvousLeases) > maxMeshRendezvousLeaseReportEntries
	report["leases"] = leaseDetails
	return report
}
// meshRouteGenerationDecisionState is the tracker-side record of one route
// path decision. Most fields are copied from a client.RoutePathDecision; the
// Status, ApplyStatus, WithdrawStatus, AppliedAt and WithdrawnAt fields are
// maintained by meshRouteGenerationTracker.Apply and its helpers.
type meshRouteGenerationDecisionState struct {
	DecisionID           string
	RouteID              string
	Generation           string
	DecisionSource       string
	LocalRole            string
	PreviousHopID        string
	NextHopID            string
	SelectedRelayID      string
	StaleRelayNodeID     string
	RendezvousLeaseID    string
	EffectiveHops        []string
	OriginalHops         []string
	PathScore            int
	Status               string // "active" or "withdrawn"
	ApplyStatus          string // "applied", "unchanged" or "not_active"
	WithdrawStatus       string // "not_withdrawn", "withdrawn" or "withdrawn_by_replacement"
	AppliedAt            time.Time
	WithdrawnAt          time.Time
	ControlPlaneOnly     bool
	ProductionForwarding bool
}
// meshRouteGenerationTracker records which route path decisions are currently
// active and which were recently withdrawn, across successive Apply calls.
//
// The Last* counters describe only the most recent Apply call, while the
// Total* counters accumulate over the tracker's lifetime. Active is keyed by
// the value of routeGenerationDecisionKey. Withdrawn is kept newest-first and
// is capped by Apply at maxMeshRendezvousLeaseReportEntries entries.
type meshRouteGenerationTracker struct {
	Generation          string
	PreviousGeneration  string
	LastAppliedAt       time.Time
	LastChangedAt       time.Time
	LastAppliedCount    int
	LastWithdrawnCount  int
	LastUnchangedCount  int
	TotalAppliedCount   int
	TotalWithdrawnCount int
	Active              map[string]meshRouteGenerationDecisionState
	Withdrawn           []meshRouteGenerationDecisionState
}
// newMeshRouteGenerationTracker creates a tracker with an empty active set
// and immediately applies report to it, stamped with observedAt.
func newMeshRouteGenerationTracker(report *client.RoutePathDecisionReport, observedAt time.Time) *meshRouteGenerationTracker {
	tracker := new(meshRouteGenerationTracker)
	tracker.Active = make(map[string]meshRouteGenerationDecisionState)
	tracker.Apply(report, observedAt)
	return tracker
}
// Apply reconciles the tracker with the decisions carried by report.
//
// A nil receiver is a no-op. A nil report is treated as an empty generation
// with no decisions, which withdraws everything currently active. A zero
// observedAt is replaced by the current time; otherwise it is converted to
// UTC.
//
// For each decision with a non-empty key:
//   - if an equal decision is already active it stays active with ApplyStatus
//     "unchanged" and keeps its original AppliedAt;
//   - otherwise it becomes active with ApplyStatus "applied".
//
// Previously active decisions missing from report are moved to Withdrawn. A
// newly applied "stale_relay_replacement" decision additionally records a
// synthetic withdrawal for the relay it replaces, unless that route/relay
// pair is already present in the withdrawn history.
//
// Withdrawn is kept newest-first and capped at
// maxMeshRendezvousLeaseReportEntries. Decisions withdrawn in the same call
// are ordered by their active key, so the history (and what survives the cap)
// does not depend on map iteration order.
func (t *meshRouteGenerationTracker) Apply(report *client.RoutePathDecisionReport, observedAt time.Time) {
	if t == nil {
		return
	}
	if observedAt.IsZero() {
		observedAt = time.Now().UTC()
	} else {
		observedAt = observedAt.UTC()
	}
	nextGeneration := ""
	decisions := []client.RoutePathDecision{}
	if report != nil {
		nextGeneration = strings.TrimSpace(report.Generation)
		decisions = append(decisions, report.Decisions...)
	}
	t.PreviousGeneration = t.Generation
	t.Generation = nextGeneration
	t.LastAppliedAt = observedAt
	t.LastAppliedCount = 0
	t.LastWithdrawnCount = 0
	t.LastUnchangedCount = 0
	nextActive := map[string]meshRouteGenerationDecisionState{}
	seen := map[string]struct{}{}
	// Route/relay pairs already in the withdrawn history, so a replacement
	// does not record the same withdrawal twice.
	withdrawnRelayKeys := map[string]struct{}{}
	for _, previous := range t.Withdrawn {
		if key := routeGenerationWithdrawnRelayKey(previous); key != "" {
			withdrawnRelayKeys[key] = struct{}{}
		}
	}
	appliedReplacementDecisions := []client.RoutePathDecision{}
	for _, decision := range decisions {
		current := routeGenerationDecisionState(decision, observedAt)
		key := routeGenerationDecisionKey(decision)
		if key == "" {
			continue
		}
		seen[key] = struct{}{}
		if previous, ok := t.Active[key]; ok && routeGenerationDecisionSame(previous, decision) {
			current.Status = "active"
			current.ApplyStatus = "unchanged"
			current.AppliedAt = previous.AppliedAt
			t.LastUnchangedCount++
		} else {
			current.Status = "active"
			current.ApplyStatus = "applied"
			current.AppliedAt = observedAt
			t.LastAppliedCount++
			t.TotalAppliedCount++
			if decision.DecisionSource == "stale_relay_replacement" && strings.TrimSpace(decision.StaleRelayNodeID) != "" {
				appliedReplacementDecisions = append(appliedReplacementDecisions, decision)
			}
		}
		nextActive[key] = current
	}

	// Collect the keys that dropped out of the report and sort them, because
	// ranging over the map directly would withdraw them in random order.
	droppedKeys := make([]string, 0, len(t.Active))
	for key := range t.Active {
		if _, ok := seen[key]; !ok {
			droppedKeys = append(droppedKeys, key)
		}
	}
	sort.Strings(droppedKeys)
	if len(droppedKeys) > 0 {
		newlyWithdrawn := make([]meshRouteGenerationDecisionState, 0, len(droppedKeys)+len(t.Withdrawn))
		for _, key := range droppedKeys {
			previous := t.Active[key]
			previous.Status = "withdrawn"
			previous.ApplyStatus = "not_active"
			previous.WithdrawStatus = "withdrawn"
			previous.WithdrawnAt = observedAt
			newlyWithdrawn = append(newlyWithdrawn, previous)
			if relayKey := routeGenerationWithdrawnRelayKey(previous); relayKey != "" {
				withdrawnRelayKeys[relayKey] = struct{}{}
			}
			t.LastWithdrawnCount++
			t.TotalWithdrawnCount++
		}
		// Newest entries go in front of the existing history.
		t.Withdrawn = append(newlyWithdrawn, t.Withdrawn...)
	}

	for _, decision := range appliedReplacementDecisions {
		relayKey := routeGenerationRouteRelayKey(decision.RouteID, decision.StaleRelayNodeID)
		if relayKey == "" {
			continue
		}
		if _, alreadyWithdrawn := withdrawnRelayKeys[relayKey]; alreadyWithdrawn {
			continue
		}
		withdrawn := routeGenerationReplacementWithdrawnDecisionState(decision, observedAt)
		t.Withdrawn = append([]meshRouteGenerationDecisionState{withdrawn}, t.Withdrawn...)
		withdrawnRelayKeys[relayKey] = struct{}{}
		t.LastWithdrawnCount++
		t.TotalWithdrawnCount++
	}
	if len(t.Withdrawn) > maxMeshRendezvousLeaseReportEntries {
		t.Withdrawn = t.Withdrawn[:maxMeshRendezvousLeaseReportEntries]
	}
	if t.LastAppliedCount > 0 || t.LastWithdrawnCount > 0 || t.PreviousGeneration != t.Generation {
		t.LastChangedAt = observedAt
	}
	t.Active = nextActive
}
// routeGenerationDecisionState converts decision into a tracker record that
// is marked active, applied at observedAt and not withdrawn. The hop slices
// are copied, so the record does not alias the decision's backing arrays.
func routeGenerationDecisionState(decision client.RoutePathDecision, observedAt time.Time) meshRouteGenerationDecisionState {
	var record meshRouteGenerationDecisionState

	// Identity and routing data taken from the decision.
	record.DecisionID = decision.DecisionID
	record.RouteID = decision.RouteID
	record.Generation = decision.Generation
	record.DecisionSource = decision.DecisionSource
	record.LocalRole = decision.LocalRole
	record.PreviousHopID = decision.PreviousHopID
	record.NextHopID = decision.NextHopID
	record.SelectedRelayID = decision.SelectedRelayID
	record.StaleRelayNodeID = decision.StaleRelayNodeID
	record.RendezvousLeaseID = decision.RendezvousLeaseID
	record.EffectiveHops = append([]string{}, decision.EffectiveHops...)
	record.OriginalHops = append([]string{}, decision.OriginalHops...)
	record.PathScore = decision.PathScore
	record.ControlPlaneOnly = decision.ControlPlaneOnly
	record.ProductionForwarding = decision.ProductionForwarding

	// Lifecycle defaults for a freshly applied decision.
	record.Status = "active"
	record.ApplyStatus = "applied"
	record.WithdrawStatus = "not_withdrawn"
	record.AppliedAt = observedAt
	return record
}
// routeGenerationReplacementWithdrawnDecisionState synthesises the withdrawn
// record for the relay that a replacement decision supersedes.
//
// The record gets a derived decision ID, points NextHopID at the stale relay
// and reports the decision's original hops as its effective hops, falling
// back to the decision's effective hops when no original hops are present.
func routeGenerationReplacementWithdrawnDecisionState(decision client.RoutePathDecision, observedAt time.Time) meshRouteGenerationDecisionState {
	routeID := strings.TrimSpace(decision.RouteID)
	staleRelayID := strings.TrimSpace(decision.StaleRelayNodeID)

	// Prefer the path that was in use before the replacement.
	hopSource := decision.OriginalHops
	if len(hopSource) == 0 {
		hopSource = decision.EffectiveHops
	}
	withdrawnHops := append([]string{}, hopSource...)

	return meshRouteGenerationDecisionState{
		DecisionID:           routeID + "-path-withdrawn-stale-relay-" + staleRelayID,
		RouteID:              decision.RouteID,
		Generation:           decision.Generation,
		DecisionSource:       "stale_relay_withdrawn",
		LocalRole:            decision.LocalRole,
		PreviousHopID:        decision.PreviousHopID,
		NextHopID:            decision.StaleRelayNodeID,
		SelectedRelayID:      decision.SelectedRelayID,
		StaleRelayNodeID:     decision.StaleRelayNodeID,
		RendezvousLeaseID:    decision.RendezvousLeaseID,
		EffectiveHops:        withdrawnHops,
		OriginalHops:         append([]string{}, decision.OriginalHops...),
		PathScore:            decision.PathScore,
		Status:               "withdrawn",
		ApplyStatus:          "not_active",
		WithdrawStatus:       "withdrawn_by_replacement",
		WithdrawnAt:          observedAt,
		ControlPlaneOnly:     decision.ControlPlaneOnly,
		ProductionForwarding: decision.ProductionForwarding,
	}
}
// routeGenerationDecisionKey returns the key under which decision is tracked:
// its trimmed DecisionID when present, otherwise the trimmed RouteID and
// LocalNodeID joined by a NUL byte. It returns "" when both DecisionID and
// RouteID are blank.
func routeGenerationDecisionKey(decision client.RoutePathDecision) string {
	if decisionID := strings.TrimSpace(decision.DecisionID); decisionID != "" {
		return decisionID
	}
	routeID := strings.TrimSpace(decision.RouteID)
	if routeID == "" {
		return ""
	}
	return routeID + "\x00" + strings.TrimSpace(decision.LocalNodeID)
}
// routeGenerationRouteRelayKey combines a route ID and a relay node ID into a
// single lookup key, joined by a NUL byte after trimming surrounding
// whitespace. It returns "" when either part is blank.
func routeGenerationRouteRelayKey(routeID string, relayNodeID string) string {
	trimmedRoute, trimmedRelay := strings.TrimSpace(routeID), strings.TrimSpace(relayNodeID)
	switch {
	case trimmedRoute == "", trimmedRelay == "":
		return ""
	default:
		return strings.Join([]string{trimmedRoute, trimmedRelay}, "\x00")
	}
}
// routeGenerationWithdrawnRelayKey returns the route/relay key for a
// withdrawn record. The relay is the record's StaleRelayNodeID, or its
// NextHopID when no stale relay is set.
func routeGenerationWithdrawnRelayKey(state meshRouteGenerationDecisionState) string {
	if staleRelayID := strings.TrimSpace(state.StaleRelayNodeID); staleRelayID != "" {
		return routeGenerationRouteRelayKey(state.RouteID, staleRelayID)
	}
	return routeGenerationRouteRelayKey(state.RouteID, strings.TrimSpace(state.NextHopID))
}
// routeGenerationDecisionSame reports whether decision matches the tracked
// record previous. It compares generation, route, decision source, next hop,
// selected relay and the NUL-joined effective hop list.
func routeGenerationDecisionSame(previous meshRouteGenerationDecisionState, decision client.RoutePathDecision) bool {
	switch {
	case previous.Generation != decision.Generation,
		previous.RouteID != decision.RouteID,
		previous.DecisionSource != decision.DecisionSource,
		previous.NextHopID != decision.NextHopID,
		previous.SelectedRelayID != decision.SelectedRelayID:
		return false
	}
	trackedHops := strings.Join(previous.EffectiveHops, "\x00")
	incomingHops := strings.Join(decision.EffectiveHops, "\x00")
	return trackedHops == incomingHops
}
// meshRouteGenerationReport builds the route generation tracker report
// (schema meshRouteGenerationReportSchema) for the given node identity.
//
// When meshState or its RouteGenerationTracker is nil the report contains
// only zero/empty defaults. Otherwise it reflects the tracker's generation,
// the counters of the last Apply call, the lifetime totals, and up to
// maxMeshRendezvousLeaseReportEntries active and withdrawn decisions each.
func meshRouteGenerationReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
	observedAt = observedAt.UTC()
	report := map[string]any{
		"schema_version":                 meshRouteGenerationReportSchema,
		"cluster_id":                     identity.ClusterID,
		"node_id":                        identity.NodeID,
		"config_source":                  "",
		"config_version":                 "",
		"generation":                     "",
		"previous_generation":            "",
		"tracker_contract":               "node_side_route_generation_apply_withdraw",
		"control_plane_only":             true,
		"production_payload_forwarding":  false,
		"service_workload_traffic":       false,
		"route_path_forwarding_runtime":  false,
		"observed_at":                    observedAt.Format(time.RFC3339Nano),
		"active_decision_count":          0,
		"applied_decision_count":         0,
		"unchanged_decision_count":       0,
		"withdrawn_decision_count":       0,
		"total_applied_decision_count":   0,
		"total_withdrawn_decision_count": 0,
		"generation_changed":             false,
		"active_decisions":               []map[string]any{},
		"withdrawn_decisions":            []map[string]any{},
	}
	if meshState == nil {
		return report
	}
	if meshState.RouteGenerationTracker == nil {
		return report
	}

	tracker := meshState.RouteGenerationTracker
	activeTotal := len(tracker.Active)
	withdrawnTotal := len(tracker.Withdrawn)

	// Configuration provenance.
	report["config_source"] = meshState.Source
	report["config_version"] = meshState.ConfigVersion

	// Generation bookkeeping.
	report["generation"] = tracker.Generation
	report["previous_generation"] = tracker.PreviousGeneration
	report["generation_changed"] = tracker.PreviousGeneration != tracker.Generation
	report["last_applied_at"] = formatOptionalTime(tracker.LastAppliedAt)
	report["last_changed_at"] = formatOptionalTime(tracker.LastChangedAt)

	// Counters: last Apply call first, then lifetime totals.
	report["active_decision_count"] = activeTotal
	report["applied_decision_count"] = tracker.LastAppliedCount
	report["unchanged_decision_count"] = tracker.LastUnchangedCount
	report["withdrawn_decision_count"] = tracker.LastWithdrawnCount
	report["total_applied_decision_count"] = tracker.TotalAppliedCount
	report["total_withdrawn_decision_count"] = tracker.TotalWithdrawnCount

	// Decision details, each list capped at the report entry limit.
	report["active_decisions"] = routeGenerationDecisionDetails(tracker.activeList(), maxMeshRendezvousLeaseReportEntries)
	report["withdrawn_decisions"] = routeGenerationDecisionDetails(tracker.Withdrawn, maxMeshRendezvousLeaseReportEntries)
	report["truncated"] = activeTotal > maxMeshRendezvousLeaseReportEntries || withdrawnTotal > maxMeshRendezvousLeaseReportEntries
	return report
}
// activeList returns the active decisions as a slice ordered by RouteID and
// then by DecisionID. A nil receiver yields nil.
func (t *meshRouteGenerationTracker) activeList() []meshRouteGenerationDecisionState {
	if t == nil {
		return nil
	}
	ordered := make([]meshRouteGenerationDecisionState, 0, len(t.Active))
	for _, entry := range t.Active {
		ordered = append(ordered, entry)
	}
	sort.SliceStable(ordered, func(a, b int) bool {
		left, right := ordered[a], ordered[b]
		if left.RouteID == right.RouteID {
			return left.DecisionID < right.DecisionID
		}
		return left.RouteID < right.RouteID
	})
	return ordered
}
// routeGenerationDecisionDetails renders at most limit entries of states as
// report maps, preserving their order. Hop slices are copied into each map.
func routeGenerationDecisionDetails(states []meshRouteGenerationDecisionState, limit int) []map[string]any {
	details := make([]map[string]any, 0, minInt(len(states), limit))
	for position := 0; position < len(states) && position < limit; position++ {
		entry := states[position]
		rendered := map[string]any{
			"decision_id":           entry.DecisionID,
			"route_id":              entry.RouteID,
			"generation":            entry.Generation,
			"decision_source":       entry.DecisionSource,
			"local_role":            entry.LocalRole,
			"previous_hop_id":       entry.PreviousHopID,
			"next_hop_id":           entry.NextHopID,
			"selected_relay_id":     entry.SelectedRelayID,
			"stale_relay_node_id":   entry.StaleRelayNodeID,
			"rendezvous_lease_id":   entry.RendezvousLeaseID,
			"effective_hops":        append([]string{}, entry.EffectiveHops...),
			"original_hops":         append([]string{}, entry.OriginalHops...),
			"path_score":            entry.PathScore,
			"status":                entry.Status,
			"apply_status":          entry.ApplyStatus,
			"withdraw_status":       entry.WithdrawStatus,
			"applied_at":            formatOptionalTime(entry.AppliedAt),
			"withdrawn_at":          formatOptionalTime(entry.WithdrawnAt),
			"control_plane_only":    entry.ControlPlaneOnly,
			"production_forwarding": entry.ProductionForwarding,
		}
		details = append(details, rendered)
	}
	return details
}
// meshRouteHealthConfigReport builds the route health configuration report
// (schema meshRouteHealthConfigReportSchema) for the given node identity.
//
// A nil meshState yields a report of zero/empty defaults. Otherwise the
// report covers meshState.RouteHealthRoutes, or meshState.Routes when no
// route health routes are present. It counts how many of those routes have a
// matching route path decision, how many of those decisions are stale-relay
// replacements, and how many routes have hops that differ from the
// decision's effective hops. Details are emitted for at most
// maxMeshRendezvousLeaseReportEntries routes; the counters cover all of them.
func meshRouteHealthConfigReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
	observedAt = observedAt.UTC()
	report := map[string]any{
		"schema_version":                              meshRouteHealthConfigReportSchema,
		"cluster_id":                                  identity.ClusterID,
		"node_id":                                     identity.NodeID,
		"config_source":                               "",
		"config_version":                              "",
		"route_health_config_contract":                "control_plane_route_path_decisions_to_synthetic_route_health",
		"control_plane_only":                          true,
		"route_health_only":                           true,
		"synthetic_route_health_route_path_runtime":   true,
		"production_route_path_forwarding_runtime":    false,
		"production_payload_forwarding":               false,
		"service_workload_traffic":                    false,
		"test_service_route_config_changed":           false,
		"observed_at":                                 observedAt.Format(time.RFC3339Nano),
		"config_refresh_interval_ms":                  int64(meshSyntheticConfigRefreshInterval / time.Millisecond),
		"feedback_refresh_backoff_ms":                 int64(meshRouteHealthFeedbackRefreshBackoff / time.Millisecond),
		"base_route_count":                            0,
		"route_health_route_count":                    0,
		"route_path_decision_applied_count":           0,
		"replacement_route_health_route_count":        0,
		"route_health_decision_drift_candidate_count": 0,
		"routes": []map[string]any{},
	}
	if meshState == nil {
		return report
	}

	report["config_source"] = meshState.Source
	report["config_version"] = meshState.ConfigVersion
	if refreshedAt := meshState.LastConfigRefreshAt; !refreshedAt.IsZero() {
		report["last_config_refresh_at"] = refreshedAt.UTC().Format(time.RFC3339Nano)
	}
	report["base_route_count"] = len(meshState.Routes)

	// Use the dedicated route health routes when there are any; otherwise
	// report on the base routes.
	routes := meshState.Routes
	if len(meshState.RouteHealthRoutes) > 0 {
		routes = meshState.RouteHealthRoutes
	}
	report["route_health_route_count"] = len(routes)

	decisionsByRoute := routePathDecisionsByRoute(meshState.RoutePathDecisions)
	appliedTotal, replacementTotal, driftTotal := 0, 0, 0
	details := make([]map[string]any, 0, minInt(len(routes), maxMeshRendezvousLeaseReportEntries))
	for index, route := range routes {
		decision, matched := decisionsByRoute[route.RouteID]
		if matched {
			appliedTotal++
			if decision.DecisionSource == "stale_relay_replacement" {
				replacementTotal++
			}
			if !sameStringSlice(route.Hops, decision.EffectiveHops) {
				driftTotal++
			}
		}
		// Counters include every route; only the first entries get details.
		if index < maxMeshRendezvousLeaseReportEntries {
			item := map[string]any{
				"route_id":                    route.RouteID,
				"source_node_id":              route.SourceNodeID,
				"destination_node_id":         route.DestinationNodeID,
				"effective_hops":              append([]string{}, route.Hops...),
				"route_version":               route.RouteVersion,
				"policy_version":              route.PolicyVersion,
				"peer_directory_version":      route.PeerDirectoryVersion,
				"route_path_decision_applied": matched,
			}
			if matched {
				item["route_path_decision_id"] = decision.DecisionID
				item["route_path_decision_generation"] = decision.Generation
				item["route_path_decision_source"] = decision.DecisionSource
				item["selected_relay_id"] = decision.SelectedRelayID
				item["stale_relay_node_id"] = decision.StaleRelayNodeID
				item["next_hop_id"] = decision.NextHopID
				item["original_hops"] = append([]string{}, decision.OriginalHops...)
			}
			details = append(details, item)
		}
	}

	report["route_path_decision_applied_count"] = appliedTotal
	report["replacement_route_health_route_count"] = replacementTotal
	report["route_health_decision_drift_candidate_count"] = driftTotal
	report["feedback_refresh_attempt_count"] = meshState.RouteHealthRefreshAttempts
	report["feedback_refresh_success_count"] = meshState.RouteHealthRefreshSuccesses
	report["feedback_refresh_failure_count"] = meshState.RouteHealthRefreshFailures
	report["feedback_refresh_suppressed_count"] = meshState.RouteHealthRefreshSuppressed
	report["routes"] = details
	report["truncated"] = len(routes) > maxMeshRendezvousLeaseReportEntries
	return report
}
// meshRouteHealthFeedbackRefreshReport assembles the report describing
// route-health feedback refresh activity for the local node.
//
// The result always contains the schema version, the node identity, the fixed
// contract flags, zeroed counters and empty "last refresh" strings. When
// meshState is non-nil the config source/version and the live counters replace
// the defaults, and when a last-refresh record exists its fields are copied in
// under the last_feedback_refresh_* keys. observedAt is rendered in UTC.
func meshRouteHealthFeedbackRefreshReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
	timestamp := observedAt.UTC().Format(time.RFC3339Nano)
	backoffMillis := int64(meshRouteHealthFeedbackRefreshBackoff / time.Millisecond)

	// Baseline document: emitted unchanged when no mesh state is available.
	out := map[string]any{
		"schema_version":                       meshRouteHealthFeedbackRefreshSchema,
		"cluster_id":                           identity.ClusterID,
		"node_id":                              identity.NodeID,
		"refresh_contract":                     "route_health_feedback_to_node_scoped_synthetic_config_get",
		"control_plane_only":                   true,
		"route_health_only":                    true,
		"production_payload_forwarding":        false,
		"service_workload_traffic":             false,
		"feedback_refresh_backoff_ms":          backoffMillis,
		"feedback_refresh_supported":           false,
		"feedback_refresh_attempt_count":       0,
		"feedback_refresh_success_count":       0,
		"feedback_refresh_failure_count":       0,
		"feedback_refresh_suppressed_count":    0,
		"last_feedback_refresh_status":         "",
		"last_feedback_refresh_reason":         "",
		"last_feedback_refresh_error":          "",
		"last_feedback_refresh_route_id":       "",
		"last_feedback_refresh_peer_node_id":   "",
		"last_feedback_refresh_selected_relay": "",
		"observed_at":                          timestamp,
	}
	if meshState == nil {
		return out
	}

	// Live configuration and counters taken from the mesh state.
	out["config_source"] = meshState.Source
	out["config_version"] = meshState.ConfigVersion
	out["feedback_refresh_supported"] = meshState.Source == "control_plane"
	out["feedback_refresh_attempt_count"] = meshState.RouteHealthRefreshAttempts
	out["feedback_refresh_success_count"] = meshState.RouteHealthRefreshSuccesses
	out["feedback_refresh_failure_count"] = meshState.RouteHealthRefreshFailures
	out["feedback_refresh_suppressed_count"] = meshState.RouteHealthRefreshSuppressed

	refresh := meshState.LastRouteHealthRefresh
	if refresh == nil {
		return out
	}

	// Details of the most recent refresh attempt overwrite the empty defaults.
	lastRefreshFields := map[string]any{
		"last_feedback_refresh_status":                              refresh.Status,
		"last_feedback_refresh_reason":                              refresh.Reason,
		"last_feedback_refresh_error":                               refresh.Error,
		"last_feedback_refresh_route_id":                            refresh.RouteID,
		"last_feedback_refresh_peer_node_id":                        refresh.PeerNodeID,
		"last_feedback_refresh_selected_relay":                      refresh.SelectedRelayID,
		"last_feedback_refresh_link_status":                         refresh.LinkStatus,
		"last_feedback_refresh_failure_reason":                      refresh.FailureReason,
		"last_feedback_refresh_drift_detected":                      refresh.DriftDetected,
		"last_feedback_refresh_attempted_at":                        formatOptionalTime(refresh.AttemptedAt),
		"last_feedback_refresh_completed_at":                        formatOptionalTime(refresh.CompletedAt),
		"last_feedback_refresh_previous_config_version":             refresh.PreviousConfigVersion,
		"last_feedback_refresh_refreshed_config_version":            refresh.RefreshedConfigVersion,
		"last_feedback_refresh_previous_route_health_route_count":   refresh.PreviousRouteHealthRouteCount,
		"last_feedback_refresh_refreshed_route_health_route_count":  refresh.RefreshedRouteHealthRouteCount,
	}
	for key, value := range lastRefreshFields {
		out[key] = value
	}
	return out
}
// meshRoutePathDecisionReport builds the report summarising the route path
// decisions currently held in meshState.
//
// A baseline document (identity, fixed contract flags, zeroed counters and an
// empty decision list) is always returned. When meshState and its
// RoutePathDecisions are present, the report-level counters are copied over,
// local counters are computed across every decision, and at most
// maxMeshRendezvousLeaseReportEntries decisions are listed in detail; the
// "truncated" flag records whether any were left out.
func meshRoutePathDecisionReport(meshState *syntheticMeshState, identity state.Identity, observedAt time.Time) map[string]any {
	out := map[string]any{
		"schema_version":                meshRoutePathDecisionReportSchema,
		"cluster_id":                    identity.ClusterID,
		"node_id":                       identity.NodeID,
		"config_source":                 "",
		"config_version":                "",
		"decision_contract":             "control_plane_route_path_decisions",
		"control_plane_only":            true,
		"production_payload_forwarding": false,
		"service_workload_traffic":      false,
		"route_path_forwarding_runtime": false,
		"observed_at":                   observedAt.UTC().Format(time.RFC3339Nano),
		"decision_count":                0,
		"replacement_decision_count":    0,
		"local_effective_path_count":    0,
		"withdrawn_local_relay_count":   0,
		"selected_local_relay_count":    0,
		"next_hop_available_count":      0,
		"decisions":                     []map[string]any{},
	}
	if meshState == nil {
		return out
	}
	out["config_source"] = meshState.Source
	out["config_version"] = meshState.ConfigVersion

	source := meshState.RoutePathDecisions
	if source == nil {
		return out
	}

	// Counters reported by the control plane are passed through as-is.
	out["control_plane_schema_version"] = source.SchemaVersion
	out["decision_mode"] = source.DecisionMode
	out["generation"] = source.Generation
	out["decision_count"] = source.DecisionCount
	out["replacement_decision_count"] = source.ReplacementDecisionCount
	out["degraded_decision_count"] = source.DegradedDecisionCount
	out["rebuild_request_count"] = source.RebuildRequestCount
	out["rebuild_applied_count"] = source.RebuildAppliedCount
	out["control_plane_report_only"] = source.ControlPlaneOnly
	out["control_plane_report_production_forwarding"] = source.ProductionForwarding

	var (
		localEffective      int
		withdrawnLocalRelay int
		selectedLocalRelay  int
		nextHopAvailable    int
	)
	entries := make([]map[string]any, 0, minInt(len(source.Decisions), maxMeshRendezvousLeaseReportEntries))
	for position, item := range source.Decisions {
		// Counters cover every decision, including those beyond the detail cap.
		if containsString(item.EffectiveHops, identity.NodeID) {
			localEffective++
		}
		switch item.LocalRole {
		case "withdrawn_relay":
			withdrawnLocalRelay++
		case "selected_relay":
			selectedLocalRelay++
		}
		if strings.TrimSpace(item.NextHopID) != "" {
			nextHopAvailable++
		}

		if position < maxMeshRendezvousLeaseReportEntries {
			entries = append(entries, map[string]any{
				"decision_id":             item.DecisionID,
				"route_id":                item.RouteID,
				"replacement_route_id":    item.ReplacementRouteID,
				"rebuild_request_id":      item.RebuildRequestID,
				"rebuild_status":          item.RebuildStatus,
				"rebuild_reason":          item.RebuildReason,
				"rebuild_attempt":         item.RebuildAttempt,
				"source_node_id":          item.SourceNodeID,
				"destination_node_id":     item.DestinationNodeID,
				"original_hops":           append([]string{}, item.OriginalHops...),
				"effective_hops":          append([]string{}, item.EffectiveHops...),
				"previous_hop_id":         item.PreviousHopID,
				"next_hop_id":             item.NextHopID,
				"local_role":              item.LocalRole,
				"selected_relay_id":       item.SelectedRelayID,
				"selected_relay_endpoint": item.SelectedRelayEndpoint,
				"stale_relay_node_id":     item.StaleRelayNodeID,
				"rendezvous_peer_node_id": item.RendezvousPeerNodeID,
				"rendezvous_lease_id":     item.RendezvousLeaseID,
				"rendezvous_lease_reason": item.RendezvousLeaseReason,
				"decision_source":         item.DecisionSource,
				"generation":              item.Generation,
				"path_score":              item.PathScore,
				"score_reasons":           append([]string{}, item.ScoreReasons...),
				"control_plane_only":      item.ControlPlaneOnly,
				"production_forwarding":   item.ProductionForwarding,
				"expires_at":              formatOptionalTime(item.ExpiresAt),
			})
		}
	}

	out["local_effective_path_count"] = localEffective
	out["withdrawn_local_relay_count"] = withdrawnLocalRelay
	out["selected_local_relay_count"] = selectedLocalRelay
	out["next_hop_available_count"] = nextHopAvailable
	out["truncated"] = len(source.Decisions) > maxMeshRendezvousLeaseReportEntries
	out["decisions"] = entries
	return out
}
// meshRendezvousLeaseBaseValid reports whether a lease carries the minimum
// data required to be considered: non-blank lease, peer and relay identifiers,
// a non-blank relay endpoint, a non-zero expiry, and the ControlPlaneOnly flag.
func meshRendezvousLeaseBaseValid(lease mesh.PeerRendezvousLease) bool {
	required := []string{lease.LeaseID, lease.PeerNodeID, lease.RelayNodeID, lease.RelayEndpoint}
	for _, field := range required {
		if strings.TrimSpace(field) == "" {
			return false
		}
	}
	if lease.ExpiresAt.IsZero() {
		return false
	}
	return lease.ControlPlaneOnly
}
// meshRendezvousConnectionsByPeer indexes the current peer connection snapshot
// by node ID. Entries whose node ID is blank are skipped. An empty (non-nil)
// map is returned when meshState or its PeerConnections is nil.
func meshRendezvousConnectionsByPeer(meshState *syntheticMeshState) map[string]mesh.PeerConnectionState {
	byPeer := make(map[string]mesh.PeerConnectionState)
	if meshState == nil || meshState.PeerConnections == nil {
		return byPeer
	}
	snapshot := meshState.PeerConnections.Snapshot()
	for _, connection := range snapshot.Entries {
		if strings.TrimSpace(connection.NodeID) == "" {
			continue
		}
		// The untrimmed node ID is used as the key, as stored in the entry.
		byPeer[connection.NodeID] = connection
	}
	return byPeer
}
// meshRendezvousLeaseRenewalNeeded reports whether a usable lease should be
// renewed at observedAt. Unusable leases never need renewal. A usable lease
// needs renewal when its remaining lifetime is within
// meshRendezvousLeaseRenewalWindow, or when the renewal point computed by
// meshRendezvousLeaseRenewalAfter is non-zero and not after observedAt.
func meshRendezvousLeaseRenewalNeeded(lease mesh.PeerRendezvousLease, observedAt time.Time, usable bool) bool {
	if !usable {
		return false
	}
	if remaining := lease.ExpiresAt.Sub(observedAt); remaining <= meshRendezvousLeaseRenewalWindow {
		return true
	}
	threshold := meshRendezvousLeaseRenewalAfter(lease)
	if threshold.IsZero() {
		return false
	}
	return !threshold.After(observedAt)
}
// meshRendezvousLeaseStaleRelay reports whether the connection state indicates
// that the relay named by the lease has gone stale.
//
// It returns false when the lease ID or connection node ID is blank, or when
// the connection does not match the lease. Otherwise the result depends on the
// connection state: backoff is always stale, degraded is stale once at least
// one consecutive failure is recorded, and waiting is stale only when the
// connection is pinned to this lease and has a recorded failure reason.
func meshRendezvousLeaseStaleRelay(lease mesh.PeerRendezvousLease, connection mesh.PeerConnectionState) bool {
	identified := strings.TrimSpace(lease.LeaseID) != "" && strings.TrimSpace(connection.NodeID) != ""
	if !identified || !meshRendezvousLeaseMatchesConnection(lease, connection) {
		return false
	}
	if connection.State == mesh.PeerConnectionBackoff {
		return true
	}
	if connection.State == mesh.PeerConnectionDegraded {
		return connection.ConsecutiveFailures > 0
	}
	if connection.State == mesh.PeerConnectionWaiting {
		pinnedToLease := connection.RendezvousLeaseID == lease.LeaseID
		return pinnedToLease && connection.LastFailureReason != ""
	}
	return false
}
// meshRendezvousLeaseMatchesConnection reports whether the connection is
// compatible with the lease. A connection field that is empty is treated as a
// wildcard; a non-empty lease ID or relay node ID must equal the lease's value.
func meshRendezvousLeaseMatchesConnection(lease mesh.PeerRendezvousLease, connection mesh.PeerConnectionState) bool {
	leaseConflict := connection.RendezvousLeaseID != "" && connection.RendezvousLeaseID != lease.LeaseID
	relayConflict := connection.RelayNodeID != "" && connection.RelayNodeID != lease.RelayNodeID
	return !leaseConflict && !relayConflict
}
// meshRendezvousLeaseRole names the local node's role with respect to a lease:
// "self" when it is both peer and relay, "relay" or "peer" when it is exactly
// one of them, and "entry_or_observer" when it is neither or when localNodeID
// is blank. localNodeID is trimmed before comparison; lease fields are not.
func meshRendezvousLeaseRole(lease mesh.PeerRendezvousLease, localNodeID string) string {
	const observer = "entry_or_observer"

	self := strings.TrimSpace(localNodeID)
	if self == "" {
		return observer
	}
	isPeer := lease.PeerNodeID == self
	isRelay := lease.RelayNodeID == self
	switch {
	case isPeer && isRelay:
		return "self"
	case isRelay:
		return "relay"
	case isPeer:
		return "peer"
	}
	return observer
}
// meshRendezvousLeaseStatus maps lease flags to a status label. The checks are
// ordered by precedence: an invalid lease is "invalid", then "expired", then
// "renewal_needed"; otherwise a relay role yields "admitted" and every other
// role yields "active".
func meshRendezvousLeaseStatus(valid bool, expired bool, renewalNeeded bool, role string) string {
	if !valid {
		return "invalid"
	}
	if expired {
		return "expired"
	}
	if renewalNeeded {
		return "renewal_needed"
	}
	if role == "relay" {
		return "admitted"
	}
	return "active"
}
// meshRendezvousLeaseRenewalAfter returns the instant (in UTC) after which a
// lease should be renewed. A lease without an expiry yields the zero time.
// When the issue time is missing or not before the expiry, the renewal point
// is meshRendezvousLeaseRenewalWindow before expiry; otherwise it is two
// thirds of the way through the lease lifetime.
func meshRendezvousLeaseRenewalAfter(lease mesh.PeerRendezvousLease) time.Time {
	issued, expires := lease.IssuedAt, lease.ExpiresAt
	switch {
	case expires.IsZero():
		return time.Time{}
	case issued.IsZero() || !expires.After(issued):
		// No usable lifetime to take a fraction of: fall back to a fixed window.
		return expires.Add(-meshRendezvousLeaseRenewalWindow).UTC()
	}
	lifetime := expires.Sub(issued)
	return issued.Add(lifetime * 2 / 3).UTC()
}
// formatOptionalTime renders value as an RFC 3339 timestamp with nanosecond
// precision in UTC, or returns the empty string for the zero time.
func formatOptionalTime(value time.Time) string {
	formatted := ""
	if !value.IsZero() {
		formatted = value.UTC().Format(time.RFC3339Nano)
	}
	return formatted
}
// advertisedEndpointCandidates assembles the endpoint candidates this node
// advertises, from three sources in order: the JSON list in
// cfg.MeshAdvertiseEndpointsJSON, the single cfg.MeshAdvertiseEndpoint, and the
// candidates discovered by interfaceEndpointCandidates.
//
// Each candidate then has its missing fields defaulted (endpoint ID, node ID,
// transport, connectivity mode, reachability, NAT type, region, priority and
// metadata), its address trimmed of surrounding whitespace and trailing
// slashes, and LastVerifiedAt set to observedAt. A candidate belonging to
// another node or with a blank address makes the whole call fail. The result
// is sorted by priority, then endpoint ID.
func advertisedEndpointCandidates(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time) ([]mesh.PeerEndpointCandidate, error) {
	var list []mesh.PeerEndpointCandidate
	if raw := cfg.MeshAdvertiseEndpointsJSON; raw != "" {
		if err := json.Unmarshal([]byte(raw), &list); err != nil {
			return nil, fmt.Errorf("parse RAP_MESH_ADVERTISE_ENDPOINTS_JSON: %w", err)
		}
	}
	if cfg.MeshAdvertiseEndpoint != "" {
		explicit := mesh.PeerEndpointCandidate{
			EndpointID:       identity.NodeID + "-advertised",
			NodeID:           identity.NodeID,
			Transport:        cfg.MeshAdvertiseTransport,
			Address:          cfg.MeshAdvertiseEndpoint,
			Reachability:     reachabilityFromConnectivityMode(cfg.MeshConnectivityMode),
			NATType:          cfg.MeshNATType,
			ConnectivityMode: cfg.MeshConnectivityMode,
			Region:           cfg.MeshRegion,
			Priority:         10,
		}
		list = append(list, explicit)
	}
	list = append(list, interfaceEndpointCandidates(cfg, identity, meshState, observedAt)...)

	for position := range list {
		entry := &list[position]
		if entry.EndpointID == "" {
			entry.EndpointID = fmt.Sprintf("%s-advertised-%d", identity.NodeID, position+1)
		}
		if entry.NodeID == "" {
			entry.NodeID = identity.NodeID
		}
		// Reject candidates for other nodes or without an address.
		foreign := entry.NodeID != identity.NodeID
		if foreign || strings.TrimSpace(entry.Address) == "" {
			return nil, fmt.Errorf("invalid advertised mesh endpoint candidate")
		}
		entry.Address = strings.TrimRight(strings.TrimSpace(entry.Address), "/")
		if entry.Transport == "" {
			entry.Transport = defaultString(cfg.MeshAdvertiseTransport, "direct_tcp_tls")
		}
		if entry.ConnectivityMode == "" {
			entry.ConnectivityMode = defaultString(cfg.MeshConnectivityMode, "direct")
		}
		if entry.Reachability == "" {
			// Derived from the candidate's own (possibly just defaulted) mode.
			entry.Reachability = reachabilityFromConnectivityMode(entry.ConnectivityMode)
		}
		if entry.NATType == "" {
			entry.NATType = defaultString(cfg.MeshNATType, "unknown")
		}
		if entry.Region == "" {
			entry.Region = cfg.MeshRegion
		}
		if entry.Priority <= 0 {
			entry.Priority = 10 + position
		}
		entry.LastVerifiedAt = &observedAt
		if entry.Metadata != nil {
			continue
		}
		encoded, err := json.Marshal(map[string]any{
			"source":                "node-agent-heartbeat",
			"runtime":               "c17z7",
			"synthetic_runtime":     cfg.MeshSyntheticRuntimeEnabled,
			"production_forwarding": cfg.MeshProductionForwardingEnabled,
		})
		if err != nil {
			return nil, err
		}
		entry.Metadata = encoded
	}

	sort.SliceStable(list, func(i, j int) bool {
		first, second := &list[i], &list[j]
		if first.Priority != second.Priority {
			return first.Priority < second.Priority
		}
		return first.EndpointID < second.EndpointID
	})
	return list, nil
}
// interfaceEndpointCandidates discovers endpoint candidates from the host's
// network interfaces.
//
// Discovery is skipped (nil result) when meshState is nil, when the listener
// status is neither "listening" nor "auto_rebound", when the connectivity mode
// is "outbound_only", when no listener port can be determined, or when the
// interfaces cannot be enumerated. Interfaces that are down, loopback, or
// classified as "container" are ignored, as are loopback, unspecified,
// multicast and link-local addresses. Every remaining address becomes one
// candidate on the listener port.
func interfaceEndpointCandidates(cfg config.Config, identity state.Identity, meshState *syntheticMeshState, observedAt time.Time) []mesh.PeerEndpointCandidate {
	if meshState == nil {
		return nil
	}
	listener := meshState.ListenerReport
	switch listener.Status {
	case "listening", "auto_rebound":
		// Listener is up; continue with discovery.
	default:
		return nil
	}
	if cfg.MeshConnectivityMode == "outbound_only" {
		return nil
	}
	port := listenerPort(listener.EffectiveListenAddr, listener.ConfiguredListenAddr, cfg.MeshListenAddr)
	if port == "" {
		return nil
	}
	ifaces, err := net.Interfaces()
	if err != nil {
		log.Printf("mesh interface discovery skipped: %v", err)
		return nil
	}

	// Values that do not vary per address.
	transport := defaultString(cfg.MeshAdvertiseTransport, "direct_http")
	natType := defaultString(cfg.MeshNATType, "unknown")
	baseMode := defaultString(cfg.MeshConnectivityMode, "direct")
	observedText := observedAt.UTC().Format(time.RFC3339Nano)

	var found []mesh.PeerEndpointCandidate
	for _, iface := range ifaces {
		isUp := iface.Flags&net.FlagUp != 0
		isLoopback := iface.Flags&net.FlagLoopback != 0
		if !isUp || isLoopback {
			continue
		}
		kind := classifyNetworkInterface(iface.Name)
		if kind == "container" {
			continue
		}
		addrs, err := iface.Addrs()
		if err != nil {
			continue
		}
		for _, addr := range addrs {
			ip := ipFromAddr(addr)
			if ip == nil || ip.IsLoopback() || ip.IsUnspecified() || ip.IsMulticast() || ip.IsLinkLocalMulticast() || ip.IsLinkLocalUnicast() {
				continue
			}
			family := "ipv6"
			if ip.To4() != nil {
				family = "ipv4"
			}
			reach, mode := "public", baseMode
			if ip.IsPrivate() || ip.IsLinkLocalUnicast() {
				reach = "private"
				// A private address cannot be a plain "direct" endpoint.
				if mode == "direct" {
					mode = "private_lan"
				}
			}
			// The marshal error is deliberately ignored, as in the original code.
			metadata, _ := json.Marshal(map[string]any{
				"source":                    "node-agent-interface-discovery",
				"runtime":                   "c17z24",
				"interface_name":            iface.Name,
				"interface_index":           iface.Index,
				"interface_type":            kind,
				"listen_effective_addr":     listener.EffectiveListenAddr,
				"listen_configured_addr":    listener.ConfiguredListenAddr,
				"loopback_filtered":         true,
				"link_local_filtered":       true,
				"container_iface_filtered":  true,
				"operator_override_allowed": true,
				"observed_at":               observedText,
			})
			endpointID := fmt.Sprintf("%s-if-%s-%s-%s", identity.NodeID, safeEndpointIDPart(iface.Name), safeEndpointIDPart(ip.String()), family)
			// The priority offset is the number of candidates found so far.
			priority := endpointPriority(reach, family, kind, len(found))
			found = append(found, mesh.PeerEndpointCandidate{
				EndpointID:       endpointID,
				NodeID:           identity.NodeID,
				Transport:        transport,
				Address:          endpointAddress(transport, ip, port),
				AddressFamily:    family,
				Reachability:     reach,
				NATType:          natType,
				ConnectivityMode: mode,
				Region:           cfg.MeshRegion,
				Priority:         priority,
				PolicyTags:       []string{"auto_discovered", "non_loopback", kind},
				LastVerifiedAt:   &observedAt,
				Metadata:         metadata,
			})
		}
	}
	return found
}
// classifyNetworkInterface buckets an interface name into "container", "vpn",
// "physical" or "unknown" using case-insensitive name matching. Categories are
// tested in that order, so the first matching category wins.
func classifyNetworkInterface(name string) string {
	lowered := strings.ToLower(strings.TrimSpace(name))
	startsWithAny := func(prefixes ...string) bool {
		for _, prefix := range prefixes {
			if strings.HasPrefix(lowered, prefix) {
				return true
			}
		}
		return false
	}

	if startsWithAny("docker", "br-", "veth", "virbr", "cni", "flannel", "calico", "kube") {
		return "container"
	}
	// tailscale and zerotier are matched anywhere in the name, not just as a prefix.
	if startsWithAny("tun", "tap", "wg", "zt") ||
		strings.Contains(lowered, "tailscale") ||
		strings.Contains(lowered, "zerotier") {
		return "vpn"
	}
	if startsWithAny("eth", "ens", "eno", "enp", "wlan", "wl", "bond") {
		return "physical"
	}
	return "unknown"
}
// listenerPort returns the port of the first address in addrs that parses as a
// host:port pair with a non-empty port (":8080", "0.0.0.0:8080",
// "[::1]:8080"). Blank and unparsable addresses are skipped so that a later
// candidate can supply the port. It returns "" when no address yields a port.
//
// The previous implementation had a fallback that, after net.SplitHostPort had
// already rejected an address beginning with ":", returned everything after
// the first colon. That fallback could only run for malformed input (a bare
// IPv6 address such as "::" or "::1", or ":80:90") and produced a bogus port
// (":", ":1", "80:90") instead of moving on to the next address. It is removed.
func listenerPort(addrs ...string) string {
	for _, addr := range addrs {
		// SplitHostPort fails on the empty string, which skips blank candidates too.
		_, port, err := net.SplitHostPort(strings.TrimSpace(addr))
		if err == nil && port != "" {
			return port
		}
	}
	return ""
}
// ipFromAddr extracts the IP address from a net.Addr. *net.IPNet and
// *net.IPAddr yield their IP field directly. Any other address type is parsed
// from its string form, dropping a port if one is present; nil is returned
// when that string is not an IP address.
func ipFromAddr(addr net.Addr) net.IP {
	if network, ok := addr.(*net.IPNet); ok {
		return network.IP
	}
	if plain, ok := addr.(*net.IPAddr); ok {
		return plain.IP
	}
	text := addr.String()
	if host, _, err := net.SplitHostPort(text); err == nil {
		text = host
	}
	return net.ParseIP(text)
}
// endpointAddress formats scheme://host:port for an advertised endpoint. The
// scheme is "wss" for the wss transport, "https" for https/direct_https and
// "http" for everything else (transport is matched case-insensitively after
// trimming). Addresses that are not IPv4 are wrapped in square brackets.
func endpointAddress(transport string, ip net.IP, port string) string {
	schemes := map[string]string{
		"wss":          "wss",
		"https":        "https",
		"direct_https": "https",
	}
	scheme, known := schemes[strings.ToLower(strings.TrimSpace(transport))]
	if !known {
		scheme = "http"
	}

	var out strings.Builder
	out.WriteString(scheme)
	out.WriteString("://")
	if ip.To4() != nil {
		out.WriteString(ip.String())
	} else {
		out.WriteString("[")
		out.WriteString(ip.String())
		out.WriteString("]")
	}
	out.WriteString(":")
	out.WriteString(port)
	return out.String()
}
// endpointPriority computes a sort priority for a discovered endpoint; lower
// values sort first. The score is the sum of a reachability base (public 20,
// private 30, anything else 40), an interface-type penalty (vpn 0, physical 5,
// anything else 10), an IPv6 penalty of 20, and the caller-supplied offset.
func endpointPriority(reachability string, addressFamily string, interfaceType string, offset int) int {
	var score int
	switch reachability {
	case "public":
		score = 20
	case "private":
		score = 30
	default:
		score = 40
	}

	switch interfaceType {
	case "vpn":
		// No penalty.
	case "physical":
		score += 5
	default:
		score += 10
	}

	if addressFamily == "ipv6" {
		score += 20
	}
	return score + offset
}
// safeEndpointIDPart turns an arbitrary string into a fragment made only of
// lowercase ASCII letters, digits and single dashes. The input is trimmed and
// lowercased, every run of other characters collapses to one dash, and leading
// and trailing dashes are removed. "iface" is returned when nothing remains.
func safeEndpointIDPart(value string) string {
	lowered := strings.ToLower(strings.TrimSpace(value))

	// Replace every disallowed rune with a dash...
	dashed := strings.Map(func(r rune) rune {
		isLetter := r >= 'a' && r <= 'z'
		isDigit := r >= '0' && r <= '9'
		if isLetter || isDigit {
			return r
		}
		return '-'
	}, lowered)

	// ...then splitting on dashes drops empty pieces, which both collapses
	// repeated dashes and trims them from the ends.
	pieces := strings.FieldsFunc(dashed, func(r rune) bool { return r == '-' })
	if len(pieces) == 0 {
		return "iface"
	}
	return strings.Join(pieces, "-")
}
// defaultString returns fallback when value is empty or whitespace-only, and
// otherwise returns value unchanged (it is not trimmed).
func defaultString(value string, fallback string) string {
	if blank := strings.TrimSpace(value) == ""; blank {
		return fallback
	}
	return value
}
// containsString reports whether items holds value, comparing both sides with
// surrounding whitespace removed.
func containsString(items []string, value string) bool {
	needle := strings.TrimSpace(value)
	for i := 0; i < len(items); i++ {
		if strings.TrimSpace(items[i]) == needle {
			return true
		}
	}
	return false
}
// sameStringSlice reports whether two slices have the same length and equal
// elements at every position, ignoring surrounding whitespace on each element.
func sameStringSlice(left []string, right []string) bool {
	if len(left) != len(right) {
		return false
	}
	for position, item := range left {
		other := right[position]
		if strings.TrimSpace(item) != strings.TrimSpace(other) {
			return false
		}
	}
	return true
}
// minInt returns the smaller of left and right.
func minInt(left, right int) int {
	smallest := right
	if left < right {
		smallest = left
	}
	return smallest
}
// reachabilityFromConnectivityMode translates a connectivity mode into the
// reachability label advertised for an endpoint. The match is exact; any mode
// not listed maps to "unknown".
func reachabilityFromConnectivityMode(connectivityMode string) string {
	labels := map[string]string{
		"outbound_only":  "outbound_only",
		"relay_required": "relay",
		"private_lan":    "private",
		"direct":         "public",
	}
	if label, known := labels[connectivityMode]; known {
		return label
	}
	return "unknown"
}
// reportWorkloadStatus fetches the desired workloads for this node, applies
// them through the supervisor, enriches the resulting statuses and reports
// each one back to the control plane.
//
// Statuses are paired with desired workloads by index; a status with no
// desired workload at the same index is not reported. The first error from
// any step aborts the call and is returned.
func reportWorkloadStatus(ctx context.Context, api *client.Client, supervisor supervisor.Supervisor, identity state.Identity, meshState *syntheticMeshState) error {
	desired, err := api.DesiredWorkloads(ctx, identity.ClusterID, identity.NodeID)
	if err != nil {
		return err
	}
	statuses, err := supervisor.Apply(ctx, desired)
	if err != nil {
		return err
	}
	enrichWorkloadStatuses(statuses, desired, meshState)

	// Only indexes present in both slices can be reported.
	reportable := len(statuses)
	if len(desired) < reportable {
		reportable = len(desired)
	}
	for i := 0; i < reportable; i++ {
		serviceType := desired[i].ServiceType
		if err := api.ReportWorkloadStatus(ctx, identity.ClusterID, identity.NodeID, serviceType, statuses[i]); err != nil {
			return err
		}
	}

	if total := len(statuses); total > 0 {
		log.Printf("workload status reported: count=%d", total)
	}
	return nil
}
// enrichWorkloadStatuses attaches the remote-workspace frame sink report to
// the status payload of every "rdp-worker" workload, mutating statuses in
// place. Statuses are matched to desired workloads by index. Nothing happens
// when meshState or its RemoteWorkspaceFrameSink is nil.
func enrichWorkloadStatuses(statuses []client.WorkloadStatusRequest, desired []client.DesiredWorkload, meshState *syntheticMeshState) {
	if meshState == nil || meshState.RemoteWorkspaceFrameSink == nil {
		return
	}
	// One report is taken up front and shared by all matching statuses.
	sinkReport := meshState.RemoteWorkspaceFrameSink.Report(time.Now().UTC())

	paired := len(statuses)
	if len(desired) < paired {
		paired = len(desired)
	}
	for i := 0; i < paired; i++ {
		if strings.TrimSpace(desired[i].ServiceType) != "rdp-worker" {
			continue
		}
		target := &statuses[i]
		if target.StatusPayload == nil {
			target.StatusPayload = map[string]any{}
		}
		target.StatusPayload["remote_workspace_adapter_sink"] = sinkReport
	}
}
// reportVPNAssignmentStatus reports the observed status of every VPN
// assignment of this node to the control plane.
//
// For an assignment whose active lease is owned by the local node, the gateway
// runtime is consulted: a running gateway yields "assigned", otherwise
// "blocked". Assignments without a locally owned lease report
// "lease_required". Any assignment whose desired state is not "enabled" is
// reported as "blocked" regardless of the above. The first reporting error
// aborts the call.
func reportVPNAssignmentStatus(ctx context.Context, api *client.Client, identity state.Identity, gateway *vpnruntime.Gateway) error {
	assignments, err := api.NodeVPNAssignments(ctx, identity.ClusterID, identity.NodeID)
	if err != nil {
		return err
	}
	for _, assignment := range assignments {
		lease := assignment.ActiveLease
		ownsLease := lease != nil && lease.OwnerNodeID == identity.NodeID

		observedStatus := "lease_required"
		reason := "eligible_candidate_waiting_for_active_lease"
		var (
			running      bool
			runtimeError string
		)
		if ownsLease {
			running, runtimeError = gateway.Status()
			switch {
			case running:
				observedStatus = "assigned"
				reason = "active_lease_owned_by_local_node"
			default:
				observedStatus = "blocked"
				reason = "vpn_gateway_runtime_unavailable"
				if runtimeError == "" {
					runtimeError = "vpn gateway runtime is not running"
				}
			}
		}
		// A disabled connection overrides whatever was decided above.
		if assignment.DesiredState != "enabled" {
			observedStatus = "blocked"
			reason = "vpn_connection_disabled"
		}

		payload := map[string]any{
			"schema_version":          "rap.node_vpn_assignment_status.v1",
			"assignment_reason":       assignment.AssignmentReason,
			"protocol_family":         assignment.ProtocolFamily,
			"runtime_available":       running,
			"packet_forwarding":       running,
			"reason":                  reason,
			"native_vpn_runtime_note": "experimental packet tunnel runtime is enabled for active linux gateway leases",
			"gateway_interface":       "rapvpn0",
			"gateway_vpn_cidr":        "10.77.0.0/24",
			"relay_transport":         "not_active_owner",
		}
		if dnsServers := vpnAssignmentDNSServers(assignment); len(dnsServers) > 0 {
			payload["exit_dns_servers"] = dnsServers
		}
		if runtimeError != "" {
			payload["runtime_error"] = runtimeError
		}
		if ownsLease {
			snapshot := gateway.Snapshot()
			payload["gateway_runtime"] = snapshot
			// Replace the placeholder transport with the one the runtime reports.
			transport, isString := snapshot["transport"].(string)
			if isString && strings.TrimSpace(transport) != "" {
				payload["relay_transport"] = transport
			}
		}
		if lease != nil {
			payload["active_lease_id"] = lease.LeaseID
			payload["lease_generation"] = lease.LeaseGeneration
			payload["lease_expires_at"] = lease.ExpiresAt
		}

		request := client.NodeVPNAssignmentStatusRequest{
			ObservedStatus: observedStatus,
			StatusPayload:  payload,
			ObservedAt:     time.Now().UTC(),
		}
		if err := api.ReportNodeVPNAssignmentStatus(ctx, identity.ClusterID, identity.NodeID, assignment.VPNConnectionID, request); err != nil {
			return err
		}
	}
	if reported := len(assignments); reported > 0 {
		log.Printf("vpn assignment status reported: count=%d", reported)
	}
	return nil
}
// exitDNSServers returns the DNS servers to hand out for VPN exit traffic.
//
// Sources are tried in order and the first non-empty result wins: the
// RAP_VPN_EXIT_DNS_SERVERS environment variable, the RAP_EXIT_DNS_SERVERS
// environment variable, then the platform resolver. On Windows that is
// windowsExitDNSServers; elsewhere the "nameserver" lines of the first
// resolv.conf file that yields an acceptable address. Loopback, unspecified
// and link-local addresses are discarded, as are duplicates.
func exitDNSServers() []string {
	for _, envName := range []string{"RAP_VPN_EXIT_DNS_SERVERS", "RAP_EXIT_DNS_SERVERS"} {
		if fromEnv := parseDNSServerList(os.Getenv(envName)); len(fromEnv) > 0 {
			return fromEnv
		}
	}
	if runtime.GOOS == "windows" {
		return windowsExitDNSServers()
	}

	resolvConfPaths := []string{
		"/run/systemd/resolve/resolv.conf",
		"/etc/resolv.conf",
		"/run/systemd/resolve/stub-resolv.conf",
	}
	known := map[string]bool{}
	var servers []string
	for _, path := range resolvConfPaths {
		content, err := os.ReadFile(path)
		if err != nil {
			continue
		}
		for _, line := range strings.Split(string(content), "\n") {
			tokens := strings.Fields(line)
			if len(tokens) < 2 || tokens[0] != "nameserver" {
				continue
			}
			candidate := strings.TrimSpace(tokens[1])
			parsed := net.ParseIP(candidate)
			if parsed == nil {
				continue
			}
			if parsed.IsLoopback() || parsed.IsUnspecified() || parsed.IsLinkLocalUnicast() {
				continue
			}
			if known[candidate] {
				continue
			}
			known[candidate] = true
			servers = append(servers, candidate)
		}
		// Stop at the first file that produced at least one server.
		if len(servers) > 0 {
			return servers
		}
	}
	return servers
}
// vpnAssignmentDNSServers picks the DNS servers to report for an assignment.
// Host-level servers from exitDNSServers take precedence; otherwise the
// assignment's route policy and then its target endpoint are searched for a
// DNS server list. It returns nil when none of them provides any server.
func vpnAssignmentDNSServers(assignment client.NodeVPNAssignment) []string {
	hostServers := exitDNSServers()
	if len(hostServers) > 0 {
		return hostServers
	}
	policies := []json.RawMessage{assignment.RoutePolicy, assignment.TargetEndpoint}
	for i := range policies {
		found := dnsServersFromRawPolicy(policies[i])
		if len(found) > 0 {
			return found
		}
	}
	return nil
}
// windowsExitDNSServers runs "netsh interface ip show dnsservers" with a
// three-second timeout and extracts every acceptable IP address from its
// combined output via parseDNSServerList. It returns nil when the command
// fails or prints nothing.
func windowsExitDNSServers() []string {
	ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second)
	defer cancel()

	command := exec.CommandContext(ctx, "netsh", "interface", "ip", "show", "dnsservers")
	raw, err := command.CombinedOutput()
	if err != nil {
		return nil
	}
	if len(raw) == 0 {
		return nil
	}
	return parseDNSServerList(string(raw))
}
// dnsServersFromRawPolicy looks for a DNS server list inside a raw JSON
// object. The "dns_servers" key is checked first, then "exit_dns_servers";
// the first key whose value decodes to a string array with at least one
// acceptable address wins. It returns nil when raw is empty, is not a JSON
// object, or contains no usable list.
func dnsServersFromRawPolicy(raw json.RawMessage) []string {
	if len(raw) == 0 {
		return nil
	}
	var fields map[string]json.RawMessage
	if err := json.Unmarshal(raw, &fields); err != nil {
		return nil
	}
	for _, key := range []string{"dns_servers", "exit_dns_servers"} {
		encoded, present := fields[key]
		if !present {
			continue
		}
		var listed []string
		if err := json.Unmarshal(encoded, &listed); err != nil {
			continue
		}
		if servers := normalizeDNSServers(listed); len(servers) > 0 {
			return servers
		}
	}
	return nil
}
// normalizeDNSServers filters a list of DNS server strings. Each value is
// trimmed; values that are not IP addresses, that are loopback, unspecified or
// link-local unicast addresses, or that repeat an earlier entry are dropped.
// Order is preserved and the result is never nil.
func normalizeDNSServers(values []string) []string {
	accepted := make([]string, 0, len(values))
	known := make(map[string]bool)
	for i := range values {
		candidate := strings.TrimSpace(values[i])
		parsed := net.ParseIP(candidate)
		if parsed == nil {
			continue
		}
		if parsed.IsLoopback() || parsed.IsUnspecified() || parsed.IsLinkLocalUnicast() {
			continue
		}
		if known[candidate] {
			continue
		}
		known[candidate] = true
		accepted = append(accepted, candidate)
	}
	return accepted
}
// parseDNSServerList extracts DNS server addresses from free-form text. The
// text is split on commas, semicolons, spaces, tabs and line breaks; tokens
// that are not IP addresses, that are loopback, unspecified or link-local
// unicast addresses, or that repeat an earlier token are dropped. Order is
// preserved; the result is nil when nothing is accepted.
func parseDNSServerList(value string) []string {
	const separators = ",; \t\n\r"
	isSeparator := func(r rune) bool {
		return strings.ContainsRune(separators, r)
	}

	var accepted []string
	known := make(map[string]bool)
	for _, token := range strings.FieldsFunc(value, isSeparator) {
		candidate := strings.TrimSpace(token)
		parsed := net.ParseIP(candidate)
		if parsed == nil {
			continue
		}
		if parsed.IsLoopback() || parsed.IsUnspecified() || parsed.IsLinkLocalUnicast() {
			continue
		}
		if known[candidate] {
			continue
		}
		known[candidate] = true
		accepted = append(accepted, candidate)
	}
	return accepted
}
// ensureVPNGatewayRuntime reconciles the local VPN gateway with this node's
// VPN assignments.
//
// It fetches the assignments, tries to acquire a lease for every enabled
// "eligible_candidate" assignment, and then configures and starts the gateway
// for the first assignment whose active lease is owned by the local node. After
// handling that assignment the function returns, so later assignments are not
// examined. When no assignment is owned by this node the gateway is stopped.
//
// An error is returned when the assignments cannot be fetched, when the
// gateway fails to start, or when the owned lease cannot be renewed. A failed
// lease acquisition is only logged.
func ensureVPNGatewayRuntime(ctx context.Context, api *client.Client, identity state.Identity, gateway *vpnruntime.Gateway, meshState *syntheticMeshState) error {
assignments, err := api.NodeVPNAssignments(ctx, identity.ClusterID, identity.NodeID)
if err != nil {
return err
}
activeOwner := false
for _, assignment := range assignments {
// Opportunistically take the lease for assignments this node is eligible for.
if assignment.AssignmentReason == "eligible_candidate" && assignment.DesiredState == "enabled" {
lease, err := api.AcquireNodeVPNAssignmentLease(ctx, identity.ClusterID, identity.NodeID, assignment.VPNConnectionID, client.NodeVPNAssignmentLeaseAcquireRequest{
TTLSeconds: 300,
Metadata: map[string]any{
"reason": "node_agent_auto_acquire",
"node_id": identity.NodeID,
"agent": "rap-node-agent",
"acquired_at": time.Now().UTC().Format(time.RFC3339Nano),
},
})
if err != nil {
log.Printf("vpn assignment lease auto-acquire skipped: vpn_connection_id=%s error=%v", assignment.VPNConnectionID, err)
} else if lease != nil {
// Promote the loop-local assignment so the ownership checks below see the new lease.
assignment.AssignmentReason = "active_owner"
assignment.ActiveLease = lease
log.Printf("vpn assignment lease auto-acquired: vpn_connection_id=%s lease_id=%s", assignment.VPNConnectionID, lease.LeaseID)
}
}
// Only assignments actively owned by this node drive the gateway.
if assignment.AssignmentReason != "active_owner" {
continue
}
if assignment.ActiveLease == nil || assignment.ActiveLease.OwnerNodeID != identity.NodeID {
continue
}
activeOwner = true
// Gateway parameters are fixed here: interface rapvpn0 on 10.77.0.0/24.
gateway.ClusterID = identity.ClusterID
gateway.VPNConnectionID = assignment.VPNConnectionID
gateway.InterfaceName = "rapvpn0"
gateway.AddressCIDR = "10.77.0.1/24"
gateway.RouteCIDR = "10.77.0.0/24"
gateway.PollTimeout = 25 * time.Second
// Transport selection: the fabric transport is preferred, then the local
// transport. When the gateway currently holds a BackendPacketTransport it is
// stopped before the new transport is installed.
if transport := fabricGatewayTransportForAssignment(identity, assignment, meshState, api); transport != nil {
if _, ok := gateway.Transport.(vpnruntime.BackendPacketTransport); ok {
gateway.Stop()
}
gateway.Transport = transport
} else if transport := localGatewayTransportForAssignment(identity, assignment, meshState, api); transport != nil {
if _, ok := gateway.Transport.(vpnruntime.BackendPacketTransport); ok {
gateway.Stop()
}
gateway.Transport = transport
// No transport could be built. A previously installed fabric or adaptive
// transport is cleared and execution continues to EnsureStarted below.
// NOTE(review): EnsureStarted then runs with a nil Transport; presumably the
// gateway handles that itself — confirm against vpnruntime.Gateway.
} else if _, ok := gateway.Transport.(*vpnruntime.FabricPacketTransport); ok {
gateway.Stop()
gateway.Transport = nil
} else if _, ok := gateway.Transport.(*vpnruntime.AdaptivePacketTransport); ok {
gateway.Stop()
gateway.Transport = nil
} else {
// Any other transport state: stop, clear, log, and return without starting
// the gateway or renewing the lease.
gateway.Stop()
gateway.Transport = nil
log.Printf("vpn gateway runtime skipped: vpn_connection_id=%s reason=fabric_packet_transport_unavailable", assignment.VPNConnectionID)
return nil
}
if err := gateway.EnsureStarted(ctx); err != nil {
return err
}
// The lease is renewed only after the gateway has started successfully.
if err := renewOwnedVPNLease(ctx, api, identity, assignment); err != nil {
return err
}
log.Printf("vpn gateway runtime ensured: vpn_connection_id=%s interface=%s", assignment.VPNConnectionID, gateway.InterfaceName)
return nil
}
// Reached only when the loop found no assignment owned by this node.
if !activeOwner {
gateway.Stop()
}
return nil
}
// localGatewayTransportForAssignment builds a LocalPacketTransport bound to
// the mesh state's VPN fabric inbox and the assignment's VPN connection ID.
// It returns nil when meshState is nil, the inbox is nil, or the connection ID
// is empty. The identity and client arguments are not used.
func localGatewayTransportForAssignment(identity state.Identity, assignment client.NodeVPNAssignment, meshState *syntheticMeshState, _ *client.Client) vpnruntime.PacketTransport {
	if meshState == nil {
		return nil
	}
	if meshState.VPNFabricInbox == nil {
		return nil
	}
	connectionID := assignment.VPNConnectionID
	if connectionID == "" {
		return nil
	}
	return &vpnruntime.LocalPacketTransport{
		Inbox:           meshState.VPNFabricInbox,
		VPNConnectionID: connectionID,
	}
}
// fabricGatewayTransportForAssignment builds a FabricPacketTransport that
// carries the assignment's VPN packets over a mesh route starting at the local
// node. It returns nil when meshState, its production forward transport or its
// VPN fabric inbox is nil, or when selectVPNPacketRoute finds no usable route.
// The transport sends gateway-to-client and receives client-to-gateway.
func fabricGatewayTransportForAssignment(identity state.Identity, assignment client.NodeVPNAssignment, meshState *syntheticMeshState, _ *client.Client) vpnruntime.PacketTransport {
	if meshState == nil {
		return nil
	}
	if meshState.ProductionForwardTransport == nil || meshState.VPNFabricInbox == nil {
		return nil
	}
	selected, nextHop, found := selectVPNPacketRoute(meshState.Routes, identity.ClusterID, identity.NodeID)
	if !found {
		return nil
	}
	transport := &vpnruntime.FabricPacketTransport{
		ForwardTransport: meshState.ProductionForwardTransport,
		Inbox:            meshState.VPNFabricInbox,
		ClusterID:        identity.ClusterID,
		VPNConnectionID:  assignment.VPNConnectionID,
		RouteID:          selected.RouteID,
		LocalNodeID:      identity.NodeID,
		RemoteNodeID:     selected.DestinationNodeID,
		NextHopNodeID:    nextHop,
		RoutePath:        selected.Hops,
		SendDirection:    vpnruntime.FabricDirectionGatewayToClient,
		ReceiveDirection: vpnruntime.FabricDirectionClientToGateway,
	}
	return transport
}
// selectVPNPacketRoute returns the first route that can carry VPN packets from
// the local node, together with its next hop. A route qualifies when it belongs
// to clusterID, originates at localNodeID, allows the VPN packet channel, has
// not expired (a zero expiry never expires), and has a next hop that is
// non-empty and not the local node itself. The boolean is false when no route
// qualifies.
func selectVPNPacketRoute(routes []mesh.SyntheticRoute, clusterID string, localNodeID string) (mesh.SyntheticRoute, string, bool) {
	now := time.Now().UTC()
	for i := range routes {
		candidate := routes[i]
		eligible := candidate.ClusterID == clusterID &&
			candidate.SourceNodeID == localNodeID &&
			containsString(candidate.AllowedChannels, mesh.ProductionChannelVPNPacket)
		if !eligible {
			continue
		}
		expired := !candidate.ExpiresAt.IsZero() && !candidate.ExpiresAt.After(now)
		if expired {
			continue
		}
		hop := nextRouteHop(candidate.Hops, localNodeID, candidate.DestinationNodeID)
		if hop != "" && hop != localNodeID {
			return candidate, hop, true
		}
	}
	return mesh.SyntheticRoute{}, "", false
}
// nextRouteHop returns the node that follows localNodeID in path. When path is
// empty or does not contain localNodeID, the destination is the next hop. When
// localNodeID is the last element of path, localNodeID itself is returned.
// Only the first occurrence of localNodeID is considered.
func nextRouteHop(path []string, localNodeID string, destinationNodeID string) string {
	position := -1
	for i := range path {
		if path[i] == localNodeID {
			position = i
			break
		}
	}
	switch {
	case position < 0:
		// Empty path or local node not on it.
		return destinationNodeID
	case position == len(path)-1:
		return localNodeID
	default:
		return path[position+1]
	}
}
// renewOwnedVPNLease renews the assignment's active lease for another 300
// seconds when that lease is owned by the local node. It does nothing, and
// returns nil, when there is no active lease or it belongs to another node.
// A renewal failure is returned to the caller; success is logged.
func renewOwnedVPNLease(ctx context.Context, api *client.Client, identity state.Identity, assignment client.NodeVPNAssignment) error {
	lease := assignment.ActiveLease
	if lease == nil || lease.OwnerNodeID != identity.NodeID {
		return nil
	}
	request := client.NodeVPNAssignmentLeaseRenewRequest{
		TTLSeconds: 300,
	}
	err := api.RenewNodeVPNAssignmentLease(ctx, identity.ClusterID, identity.NodeID, assignment.VPNConnectionID, lease.LeaseID, request)
	if err != nil {
		return err
	}
	log.Printf("vpn lease renewed: vpn_connection_id=%s lease_id=%s ttl_seconds=300", assignment.VPNConnectionID, lease.LeaseID)
	return nil
}