package cluster

import (
	"bytes"
	"context"
	"crypto/rand"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"errors"
	"fmt"
	"net"
	"net/url"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/jackc/pgx/v5"
	"github.com/jackc/pgx/v5/pgconn"

	"github.com/example/remote-access-platform/backend/internal/platform/clusterauth"
)

// Sentinel errors exposed by the cluster package. Compare with errors.Is.
var (
	ErrAccessDenied              = errors.New("platform admin role is required")
	ErrInvalidPayload            = errors.New("invalid cluster payload")
	ErrInvalidJoinToken          = errors.New("invalid or expired join token")
	ErrInvalidNodeRole           = errors.New("invalid node role")
	ErrInvalidCluster            = errors.New("cluster not found")
	ErrInvalidJoinRequest        = errors.New("join request not found")
	ErrClusterReadOnly           = errors.New("cluster is not authoritative for policy mutation")
	ErrLegacyRemovalBlocked      = errors.New("legacy compatibility removal is blocked while stale recovery-risk nodes remain")
	ErrInvalidVPNConnection      = errors.New("vpn connection not found")
	ErrInvalidVPNLease           = errors.New("vpn connection lease not found")
	ErrVPNLeaseAlreadyActive     = errors.New("vpn connection already has an active lease")
	ErrVPNLeaseOwnerNotAllowed   = errors.New("vpn lease owner is not allowed")
	ErrVPNLeaseOwnerRoleRequired = errors.New("vpn lease owner requires active vpn-exit or vpn-connector role")
)

// LegacyRemovalBlockedError carries the blocked operation name and a stale-node
// risk report. It reports the same message as ErrLegacyRemovalBlocked and
// matches it under errors.Is.
type LegacyRemovalBlockedError struct {
	BlockedOperation string
	Report           StaleNodeRiskReport
}

// Error returns the message of ErrLegacyRemovalBlocked.
func (e *LegacyRemovalBlockedError) Error() string { return ErrLegacyRemovalBlocked.Error() }

// Is reports whether target is ErrLegacyRemovalBlocked.
func (e *LegacyRemovalBlockedError) Is(target error) bool { return target == ErrLegacyRemovalBlocked }

// Service implements cluster administration operations on top of a Repository.
type Service struct {
	store Repository
	now   func() time.Time

	// NOTE(review): the mutex presumably guards the lease cache; confirm
	// against the code that uses these fields.
	fabricServiceChannelLeaseMu    sync.Mutex
	fabricServiceChannelLeaseCache map[string]FabricServiceChannelLease
}

const (
	fabricServiceChannelFeedbackMaxAge         = 2 * time.Minute
	fabricServiceChannelOperatorExpireCooldown = 2 * time.Minute
)

// NewService returns a Service backed by store. Its clock reports the current
// time in UTC and its lease cache starts empty.
func NewService(store Repository) *Service {
	return &Service{
		store:                          store,
		now:                            func() time.Time { return time.Now().UTC() },
		fabricServiceChannelLeaseCache: map[string]FabricServiceChannelLease{},
	}
}

// Schema identifiers for the authority payloads declared below.
const (
	clusterJoinTokenAuthoritySchema    = "rap.cluster.join_token.v1"
	clusterNodeApprovalAuthoritySchema = "rap.cluster.node_approval.v1"
	clusterMeshConfigAuthoritySchema   = "rap.cluster.mesh_config_snapshot.v1"
)

// clusterJoinTokenAuthorityPayload is the JSON shape of a join-token authority record.
type clusterJoinTokenAuthorityPayload struct {
	SchemaVersion        string          `json:"schema_version"`
	ClusterID            string          `json:"cluster_id"`
	TokenID              string          `json:"token_id"`
	Scope                json.RawMessage `json:"scope"`
	ExpiresAt            time.Time       `json:"expires_at"`
	MaxUses              int             `json:"max_uses"`
	CreatedByUserID      *string         `json:"created_by_user_id,omitempty"`
	IssuedAt             time.Time       `json:"issued_at"`
	ControlPlaneOnly     bool            `json:"control_plane_only"`
	ProductionForwarding bool            `json:"production_forwarding"`
}

// clusterNodeApprovalAuthorityPayload is the JSON shape of a node-approval authority record.
type clusterNodeApprovalAuthorityPayload struct {
	SchemaVersion                string    `json:"schema_version"`
	ClusterID                    string    `json:"cluster_id"`
	JoinRequestID                string    `json:"join_request_id"`
	NodeID                       string    `json:"node_id"`
	NodeFingerprint              string    `json:"node_fingerprint"`
	IdentityStatus               string    `json:"identity_status"`
	HeartbeatEndpoint            string    `json:"heartbeat_endpoint"`
	ApprovedByUserID             string    `json:"approved_by_user_id"`
	ClusterAuthorityQuorumSHA256 string    `json:"cluster_authority_quorum_sha256,omitempty"`
	IssuedAt                     time.Time `json:"issued_at"`
	ControlPlaneOnly             bool      `json:"control_plane_only"`
	ProductionForwarding         bool      `json:"production_forwarding"`
}

// clusterMeshConfigAuthorityPayload is the JSON shape of a mesh-config snapshot authority record.
type clusterMeshConfigAuthorityPayload struct {
	SchemaVersion        string    `json:"schema_version"`
	ClusterID            string    `json:"cluster_id"`
	LocalNodeID          string    `json:"local_node_id"`
	ConfigVersion        string    `json:"config_version"`
	ConfigSHA256         string    `json:"config_sha256"`
	IssuedAt             time.Time `json:"issued_at"`
	ExpiresAt            time.Time `json:"expires_at"`
	ControlPlaneOnly     bool      `json:"control_plane_only"`
	ProductionForwarding bool      `json:"production_forwarding"`
}

// ListClusters returns the clusters reported by the store once the actor has
// passed the platform-admin check.
func (s *Service) ListClusters(ctx context.Context, actorUserID string) ([]Cluster, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return nil, err
	}
	return s.store.ListClusters(ctx)
}

// GetCluster returns the cluster with the given ID once the actor has passed
// the platform-admin check. A missing row is reported as ErrInvalidCluster.
func (s *Service) GetCluster(ctx context.Context, actorUserID, clusterID string) (Cluster, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return Cluster{}, err
	}
	item, err := s.store.GetCluster(ctx, clusterID)
	if errors.Is(err, pgx.ErrNoRows) {
		return Cluster{}, ErrInvalidCluster
	}
	return item, err
}

// fabricServiceChannelPolicyCluster loads a cluster for reading one of the
// fabric service-channel policies stored in its metadata. It runs the
// platform-admin check, trims clusterID, and maps a missing row to
// ErrInvalidCluster.
func (s *Service) fabricServiceChannelPolicyCluster(ctx context.Context, actorUserID, clusterID string) (Cluster, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return Cluster{}, err
	}
	cluster, err := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
	if errors.Is(err, pgx.ErrNoRows) {
		return Cluster{}, ErrInvalidCluster
	}
	if err != nil {
		return Cluster{}, err
	}
	return cluster, nil
}

// mutableFabricServiceChannelPolicyCluster loads a cluster for updating one of
// its fabric service-channel policies. On top of the read-path checks it
// rejects an empty cluster ID with ErrInvalidCluster and runs
// ensureClusterMutable before touching the store.
func (s *Service) mutableFabricServiceChannelPolicyCluster(ctx context.Context, actorUserID, clusterID string) (Cluster, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return Cluster{}, err
	}
	clusterID = strings.TrimSpace(clusterID)
	if clusterID == "" {
		return Cluster{}, ErrInvalidCluster
	}
	if err := s.ensureClusterMutable(ctx, actorUserID, clusterID); err != nil {
		return Cluster{}, err
	}
	cluster, err := s.store.GetCluster(ctx, clusterID)
	if errors.Is(err, pgx.ErrNoRows) {
		return Cluster{}, ErrInvalidCluster
	}
	if err != nil {
		return Cluster{}, err
	}
	return cluster, nil
}

// persistFabricServiceChannelPolicyMetadata writes metadata back to the cluster,
// keeping its name, status and region, and then records an audit event of the
// given type with the same metadata as payload. It returns the updated cluster.
func (s *Service) persistFabricServiceChannelPolicyMetadata(ctx context.Context, actorUserID string, cluster Cluster, metadata json.RawMessage, eventType string, now time.Time) (Cluster, error) {
	updated, err := s.store.UpdateCluster(ctx, UpdateClusterInput{
		ActorUserID: actorUserID,
		ClusterID:   cluster.ID,
		Name:        cluster.Name,
		Status:      cluster.Status,
		Region:      cluster.Region,
		Metadata:    metadata,
	})
	if err != nil {
		return Cluster{}, err
	}
	// Auditing is best-effort: the policy is already persisted, so a failure to
	// record the event must not turn a successful update into an error.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &cluster.ID,
		ActorUserID: &actorUserID,
		EventType:   eventType,
		TargetType:  "cluster",
		TargetID:    &cluster.ID,
		Payload:     metadata,
		CreatedAt:   now,
	})
	return updated, nil
}

// GetFabricServiceChannelRecoveryPolicy returns the recovery policy stored in
// the cluster metadata, or the defaults when none is stored.
func (s *Service) GetFabricServiceChannelRecoveryPolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelRecoveryPolicy, error) {
	cluster, err := s.fabricServiceChannelPolicyCluster(ctx, actorUserID, clusterID)
	if err != nil {
		return FabricServiceChannelRecoveryPolicy{}, err
	}
	return fabricServiceChannelRecoveryPolicyFromCluster(cluster), nil
}

// UpdateFabricServiceChannelRecoveryPolicy merges input into the cluster's
// current recovery policy and persists it in the cluster metadata.
//
// Numeric inputs that are zero or negative and nil boolean pointers leave the
// current value unchanged. Positive numbers are clamped to the supported range.
// The policy is normalized before it is stored so that the persisted and
// audited fingerprint matches the persisted values.
func (s *Service) UpdateFabricServiceChannelRecoveryPolicy(ctx context.Context, input UpdateFabricServiceChannelRecoveryPolicyInput) (FabricServiceChannelRecoveryPolicy, error) {
	cluster, err := s.mutableFabricServiceChannelPolicyCluster(ctx, input.ActorUserID, input.ClusterID)
	if err != nil {
		return FabricServiceChannelRecoveryPolicy{}, err
	}
	policy := fabricServiceChannelRecoveryPolicyFromCluster(cluster)
	if input.HysteresisPenalty > 0 {
		policy.HysteresisPenalty = clampInt(input.HysteresisPenalty, 0, 10000)
	}
	if input.PromotionMinSamples > 0 {
		policy.PromotionMinSamples = clampInt(input.PromotionMinSamples, 1, 100000)
	}
	if input.DemotionFailureThreshold > 0 {
		policy.DemotionFailureThreshold = clampInt(input.DemotionFailureThreshold, 1, 100000)
	}
	if input.DemotionDropThreshold > 0 {
		policy.DemotionDropThreshold = clampInt(input.DemotionDropThreshold, 1, 100000)
	}
	if input.DemotionSlowThreshold > 0 {
		policy.DemotionSlowThreshold = clampInt(input.DemotionSlowThreshold, 1, 100000)
	}
	if input.DemotionRebuildEnabled != nil {
		policy.DemotionRebuildEnabled = *input.DemotionRebuildEnabled
	}
	if input.DemotionFencedEnabled != nil {
		policy.DemotionFencedEnabled = *input.DemotionFencedEnabled
	}
	now := s.now().UTC()
	policy.SchemaVersion = "rap.fabric_service_channel_recovery_policy.v1"
	policy.Source = "cluster_metadata"
	policy.UpdatedByUserID = &input.ActorUserID
	policy.UpdatedAt = now
	policy.ControlPlaneOnly = true
	policy.ProductionForwarding = false
	// Recompute the fingerprint over the merged values; without this the stored
	// fingerprint would still describe the policy as it was before the update.
	policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
	metadata, err := upsertFabricServiceChannelRecoveryPolicyMetadata(cluster.Metadata, policy)
	if err != nil {
		return FabricServiceChannelRecoveryPolicy{}, err
	}
	updated, err := s.persistFabricServiceChannelPolicyMetadata(ctx, input.ActorUserID, cluster, metadata, "fabric.service_channel.recovery_policy.updated", now)
	if err != nil {
		return FabricServiceChannelRecoveryPolicy{}, err
	}
	return fabricServiceChannelRecoveryPolicyFromCluster(updated), nil
}

// GetFabricServiceChannelAdaptivePolicy returns the adaptive policy stored in
// the cluster metadata, or the defaults when none is stored.
func (s *Service) GetFabricServiceChannelAdaptivePolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelAdaptivePolicy, error) {
	cluster, err := s.fabricServiceChannelPolicyCluster(ctx, actorUserID, clusterID)
	if err != nil {
		return FabricServiceChannelAdaptivePolicy{}, err
	}
	return fabricServiceChannelAdaptivePolicyFromCluster(cluster), nil
}

// UpdateFabricServiceChannelAdaptivePolicy merges input into the cluster's
// current adaptive policy and persists it in the cluster metadata.
//
// Numeric inputs that are zero or negative and an empty ClassWindows map leave
// the current value unchanged. The policy is normalized before it is stored,
// so the persisted class windows are already clamped to MaxParallelWindow and
// the persisted fingerprint matches the persisted values.
func (s *Service) UpdateFabricServiceChannelAdaptivePolicy(ctx context.Context, input UpdateFabricServiceChannelAdaptivePolicyInput) (FabricServiceChannelAdaptivePolicy, error) {
	cluster, err := s.mutableFabricServiceChannelPolicyCluster(ctx, input.ActorUserID, input.ClusterID)
	if err != nil {
		return FabricServiceChannelAdaptivePolicy{}, err
	}
	policy := fabricServiceChannelAdaptivePolicyFromCluster(cluster)
	if input.MaxParallelWindow > 0 {
		policy.MaxParallelWindow = clampInt(input.MaxParallelWindow, 1, 64)
	}
	if input.BulkPressureChannelThreshold > 0 {
		policy.BulkPressureChannelThreshold = clampInt(input.BulkPressureChannelThreshold, 1, 100000)
	}
	if input.QueuePressureHighWatermark > 0 {
		policy.QueuePressureHighWatermark = clampInt(input.QueuePressureHighWatermark, 1, 100000)
	}
	if input.QueuePressureMaxInFlight > 0 {
		policy.QueuePressureMaxInFlight = clampInt(input.QueuePressureMaxInFlight, 1, 100000)
	}
	if len(input.ClassWindows) > 0 {
		policy.ClassWindows = normalizeFabricServiceChannelAdaptiveClassWindows(input.ClassWindows, policy.MaxParallelWindow)
	}
	now := s.now().UTC()
	policy.SchemaVersion = "rap.fabric_service_channel_adaptive_policy.v1"
	policy.Source = "cluster_metadata"
	policy.UpdatedByUserID = &input.ActorUserID
	policy.UpdatedAt = now
	policy.ControlPlaneOnly = true
	policy.ProductionForwarding = false
	// Re-clamp class windows to the (possibly lowered) maximum and recompute the
	// fingerprint so the stored document is self-consistent.
	policy = normalizeFabricServiceChannelAdaptivePolicy(policy, defaultFabricServiceChannelAdaptivePolicy())
	metadata, err := upsertFabricServiceChannelAdaptivePolicyMetadata(cluster.Metadata, policy)
	if err != nil {
		return FabricServiceChannelAdaptivePolicy{}, err
	}
	updated, err := s.persistFabricServiceChannelPolicyMetadata(ctx, input.ActorUserID, cluster, metadata, "fabric.service_channel.adaptive_policy.updated", now)
	if err != nil {
		return FabricServiceChannelAdaptivePolicy{}, err
	}
	return fabricServiceChannelAdaptivePolicyFromCluster(updated), nil
}

// GetFabricServiceChannelPoolPolicy returns the pool policy stored in the
// cluster metadata, or the defaults when none is stored.
func (s *Service) GetFabricServiceChannelPoolPolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelPoolPolicy, error) {
	cluster, err := s.fabricServiceChannelPolicyCluster(ctx, actorUserID, clusterID)
	if err != nil {
		return FabricServiceChannelPoolPolicy{}, err
	}
	return fabricServiceChannelPoolPolicyFromCluster(cluster), nil
}

// UpdateFabricServiceChannelPoolPolicy replaces the pool membership and
// preferred nodes with the values in input, merges the optional mode and flag
// fields into the current policy, and persists the result.
//
// Mode strings that are empty or contain only whitespace, and nil boolean
// pointers, leave the current value unchanged.
func (s *Service) UpdateFabricServiceChannelPoolPolicy(ctx context.Context, input UpdateFabricServiceChannelPoolPolicyInput) (FabricServiceChannelPoolPolicy, error) {
	cluster, err := s.mutableFabricServiceChannelPolicyCluster(ctx, input.ActorUserID, input.ClusterID)
	if err != nil {
		return FabricServiceChannelPoolPolicy{}, err
	}
	policy := fabricServiceChannelPoolPolicyFromCluster(cluster)
	policy.EntryPoolNodeIDs = dedupeStrings(input.EntryPoolNodeIDs)
	policy.ExitPoolNodeIDs = dedupeStrings(input.ExitPoolNodeIDs)
	policy.PreferredEntryNodeID = strings.TrimSpace(input.PreferredEntryNodeID)
	policy.PreferredExitNodeID = strings.TrimSpace(input.PreferredExitNodeID)
	// Trim before testing for emptiness: a whitespace-only value must behave
	// like an omitted one instead of resetting the mode to its default.
	if mode := strings.TrimSpace(input.SelectionStrategy); mode != "" {
		policy.SelectionStrategy = mode
	}
	if mode := strings.TrimSpace(input.RouteRebuild); mode != "" {
		policy.RouteRebuild = mode
	}
	if mode := strings.TrimSpace(input.EntryFailover); mode != "" {
		policy.EntryFailover = mode
	}
	if mode := strings.TrimSpace(input.ExitFailover); mode != "" {
		policy.ExitFailover = mode
	}
	if input.BackendFallbackAllowed != nil {
		policy.BackendFallbackAllowed = *input.BackendFallbackAllowed
	}
	if input.StickySession != nil {
		policy.StickySession = *input.StickySession
	}
	now := s.now().UTC()
	policy.SchemaVersion = "rap.fabric_service_channel_pool_policy.v1"
	policy.Source = "cluster_metadata"
	policy.UpdatedByUserID = &input.ActorUserID
	policy.UpdatedAt = now
	policy.ControlPlaneOnly = true
	policy.ProductionForwarding = false
	policy = normalizeFabricServiceChannelPoolPolicy(policy, defaultFabricServiceChannelPoolPolicy())
	metadata, err := upsertFabricServiceChannelPoolPolicyMetadata(cluster.Metadata, policy)
	if err != nil {
		return FabricServiceChannelPoolPolicy{}, err
	}
	updated, err := s.persistFabricServiceChannelPolicyMetadata(ctx, input.ActorUserID, cluster, metadata, "fabric.service_channel.pool_policy.updated", now)
	if err != nil {
		return FabricServiceChannelPoolPolicy{}, err
	}
	return fabricServiceChannelPoolPolicyFromCluster(updated), nil
}

// GetFabricServiceChannelBreadcrumbWindowPolicy returns the breadcrumb window
// policy stored in the cluster metadata, or the defaults when none is stored.
func (s *Service) GetFabricServiceChannelBreadcrumbWindowPolicy(ctx context.Context, actorUserID, clusterID string) (FabricServiceChannelBreadcrumbWindowPolicy, error) {
	cluster, err := s.fabricServiceChannelPolicyCluster(ctx, actorUserID, clusterID)
	if err != nil {
		return FabricServiceChannelBreadcrumbWindowPolicy{}, err
	}
	return fabricServiceChannelBreadcrumbWindowPolicyFromCluster(cluster), nil
}

// UpdateFabricServiceChannelBreadcrumbWindowPolicy merges input into the
// cluster's current breadcrumb window policy and persists it. Window lengths
// that are zero or negative leave the current value unchanged.
func (s *Service) UpdateFabricServiceChannelBreadcrumbWindowPolicy(ctx context.Context, input UpdateFabricServiceChannelBreadcrumbWindowPolicyInput) (FabricServiceChannelBreadcrumbWindowPolicy, error) {
	cluster, err := s.mutableFabricServiceChannelPolicyCluster(ctx, input.ActorUserID, input.ClusterID)
	if err != nil {
		return FabricServiceChannelBreadcrumbWindowPolicy{}, err
	}
	policy := fabricServiceChannelBreadcrumbWindowPolicyFromCluster(cluster)
	if input.CurrentWindowSeconds > 0 {
		policy.CurrentWindowSeconds = input.CurrentWindowSeconds
	}
	if input.HistoryWindowSeconds > 0 {
		policy.HistoryWindowSeconds = input.HistoryWindowSeconds
	}
	now := s.now().UTC()
	policy.SchemaVersion = "rap.fabric_service_channel_breadcrumb_window_policy.v1"
	policy.Source = "cluster_metadata"
	policy.UpdatedByUserID = &input.ActorUserID
	policy.UpdatedAt = now
	policy.ControlPlaneOnly = true
	policy.ProductionForwarding = false
	policy = normalizeFabricServiceChannelBreadcrumbWindowPolicy(policy, defaultFabricServiceChannelBreadcrumbWindowPolicy())
	metadata, err := upsertFabricServiceChannelBreadcrumbWindowPolicyMetadata(cluster.Metadata, policy)
	if err != nil {
		return FabricServiceChannelBreadcrumbWindowPolicy{}, err
	}
	updated, err := s.persistFabricServiceChannelPolicyMetadata(ctx, input.ActorUserID, cluster, metadata, "fabric.service_channel.breadcrumb_window_policy.updated", now)
	if err != nil {
		return FabricServiceChannelBreadcrumbWindowPolicy{}, err
	}
	return fabricServiceChannelBreadcrumbWindowPolicyFromCluster(updated), nil
}

// CreateCluster validates input, stores a new cluster and records a
// "cluster.created" audit event.
//
// Slug and Name are trimmed and must be non-empty (ErrInvalidPayload
// otherwise). Empty metadata defaults to `{}` and must be valid JSON.
func (s *Service) CreateCluster(ctx context.Context, input CreateClusterInput) (Cluster, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return Cluster{}, err
	}
	input.Slug = strings.TrimSpace(input.Slug)
	input.Name = strings.TrimSpace(input.Name)
	if input.Slug == "" || input.Name == "" {
		return Cluster{}, ErrInvalidPayload
	}
	input.Metadata = defaultJSON(input.Metadata, `{}`)
	if !json.Valid(input.Metadata) {
		return Cluster{}, errors.New("metadata must be valid json")
	}
	item, err := s.store.CreateCluster(ctx, input)
	if err != nil {
		return Cluster{}, err
	}
	// The authority key is only used to enrich the audit payload. If it cannot
	// be obtained or encoded, the cluster is still returned and the payload
	// stays `{}`.
	auditPayload := json.RawMessage(`{}`)
	if authorityKey, err := s.ensureClusterAuthority(ctx, item.ID, &input.ActorUserID); err == nil {
		encoded, marshalErr := json.Marshal(map[string]any{
			"cluster_authority": map[string]any{
				"key_algorithm":          authorityKey.KeyAlgorithm,
				"public_key_fingerprint": authorityKey.PublicKeyFingerprint,
			},
		})
		if marshalErr == nil {
			auditPayload = encoded
		}
	}
	// Auditing is best-effort; the cluster has already been created.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &item.ID,
		ActorUserID: &input.ActorUserID,
		EventType:   "cluster.created",
		TargetType:  "cluster",
		TargetID:    &item.ID,
		Payload:     auditPayload,
		CreatedAt:   s.now(),
	})
	return item, nil
}

// ensureClusterAuthority returns the cluster's authority key, asking the store
// to create one when no row exists yet.
func (s *Service) ensureClusterAuthority(ctx context.Context, clusterID string, actorUserID *string) (ClusterAuthorityKey, error) {
	authorityKey, err := s.store.GetClusterAuthority(ctx, clusterID)
	if errors.Is(err, pgx.ErrNoRows) {
		return s.store.EnsureClusterAuthority(ctx, clusterID, actorUserID)
	}
	return authorityKey, err
}

// authorityDescriptor returns a copy of the key's descriptor, filling in
// clusterauth.AuthoritySchemaVersion when the schema version is empty.
func authorityDescriptor(authorityKey ClusterAuthorityKey) *ClusterAuthorityDescriptor {
	descriptor := authorityKey.ClusterAuthorityDescriptor
	if descriptor.SchemaVersion == "" {
		descriptor.SchemaVersion = clusterauth.AuthoritySchemaVersion
	}
	return &descriptor
}

// upsertFabricServiceChannelPolicyMetadata stores policy under key in the
// top-level object of metadata and returns the re-encoded document.
//
// Metadata that is empty or not valid JSON is treated as an empty object.
// Valid JSON that is neither an object nor null yields the decode error.
func upsertFabricServiceChannelPolicyMetadata(metadata json.RawMessage, key string, policy any) (json.RawMessage, error) {
	raw := map[string]any{}
	if len(metadata) > 0 && json.Valid(metadata) {
		if err := json.Unmarshal(metadata, &raw); err != nil {
			return nil, err
		}
	}
	if raw == nil {
		// A JSON null resets the map to nil during decoding; writing into it
		// would panic, so start again from an empty object.
		raw = map[string]any{}
	}
	raw[key] = policy
	out, err := json.Marshal(raw)
	if err != nil {
		return nil, err
	}
	return json.RawMessage(out), nil
}

// defaultFabricServiceChannelRecoveryPolicy returns the recovery policy used
// when the cluster metadata does not carry one.
func defaultFabricServiceChannelRecoveryPolicy() FabricServiceChannelRecoveryPolicy {
	return FabricServiceChannelRecoveryPolicy{
		SchemaVersion:            "rap.fabric_service_channel_recovery_policy.v1",
		HysteresisPenalty:        fabricServiceChannelRecoveryHysteresisPenalty,
		PromotionMinSamples:      fabricServiceChannelRecoveryPromotionMinSamples,
		DemotionFailureThreshold: 1,
		DemotionDropThreshold:    1,
		DemotionSlowThreshold:    1,
		DemotionRebuildEnabled:   true,
		DemotionFencedEnabled:    true,
		Source:                   "defaults",
		ControlPlaneOnly:         true,
		ProductionForwarding:     false,
	}
}

// fabricServiceChannelRecoveryPolicyFromCluster decodes the recovery policy
// from the cluster metadata and normalizes it against the defaults. Missing,
// invalid or policy-less metadata yields the defaults.
func fabricServiceChannelRecoveryPolicyFromCluster(cluster Cluster) FabricServiceChannelRecoveryPolicy {
	policy := defaultFabricServiceChannelRecoveryPolicy()
	if len(cluster.Metadata) == 0 || !json.Valid(cluster.Metadata) {
		return policy
	}
	var raw struct {
		Policy *FabricServiceChannelRecoveryPolicy `json:"fabric_service_channel_recovery_policy"`
	}
	if err := json.Unmarshal(cluster.Metadata, &raw); err != nil || raw.Policy == nil {
		return policy
	}
	policy = normalizeFabricServiceChannelRecoveryPolicy(*raw.Policy, policy)
	policy.Source = "cluster_metadata"
	return policy
}

// normalizeFabricServiceChannelRecoveryPolicy replaces non-positive numeric
// fields and an empty source with the values from fallback, forces the
// control-plane flags, and recomputes the fingerprint.
func normalizeFabricServiceChannelRecoveryPolicy(input FabricServiceChannelRecoveryPolicy, fallback FabricServiceChannelRecoveryPolicy) FabricServiceChannelRecoveryPolicy {
	if input.SchemaVersion == "" {
		input.SchemaVersion = "rap.fabric_service_channel_recovery_policy.v1"
	}
	if input.HysteresisPenalty <= 0 {
		input.HysteresisPenalty = fallback.HysteresisPenalty
	}
	if input.PromotionMinSamples <= 0 {
		input.PromotionMinSamples = fallback.PromotionMinSamples
	}
	if input.DemotionFailureThreshold <= 0 {
		input.DemotionFailureThreshold = fallback.DemotionFailureThreshold
	}
	if input.DemotionDropThreshold <= 0 {
		input.DemotionDropThreshold = fallback.DemotionDropThreshold
	}
	if input.DemotionSlowThreshold <= 0 {
		input.DemotionSlowThreshold = fallback.DemotionSlowThreshold
	}
	if input.Source == "" {
		input.Source = fallback.Source
	}
	input.ControlPlaneOnly = true
	input.ProductionForwarding = false
	input.Fingerprint = fabricServiceChannelRecoveryPolicyFingerprint(input)
	return input
}

// upsertFabricServiceChannelRecoveryPolicyMetadata stores policy in metadata
// under the "fabric_service_channel_recovery_policy" key.
func upsertFabricServiceChannelRecoveryPolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelRecoveryPolicy) (json.RawMessage, error) {
	return upsertFabricServiceChannelPolicyMetadata(metadata, "fabric_service_channel_recovery_policy", policy)
}

// fabricServiceChannelRecoveryPolicyRef returns a pointer to a copy of policy
// normalized against the defaults.
func fabricServiceChannelRecoveryPolicyRef(policy FabricServiceChannelRecoveryPolicy) *FabricServiceChannelRecoveryPolicy {
	normalized := normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
	return &normalized
}

// fabricServiceChannelRecoveryPolicyFingerprint returns the hex SHA-256 of the
// policy's JSON-encoded settings. Source, update metadata and any existing
// fingerprint are not part of the hash. It returns "" if encoding fails.
func fabricServiceChannelRecoveryPolicyFingerprint(policy FabricServiceChannelRecoveryPolicy) string {
	raw, err := json.Marshal(struct {
		SchemaVersion            string `json:"schema_version"`
		HysteresisPenalty        int    `json:"hysteresis_penalty"`
		PromotionMinSamples      int    `json:"promotion_min_samples"`
		DemotionFailureThreshold int    `json:"demotion_failure_threshold"`
		DemotionDropThreshold    int    `json:"demotion_drop_threshold"`
		DemotionSlowThreshold    int    `json:"demotion_slow_threshold"`
		DemotionRebuildEnabled   bool   `json:"demotion_rebuild_enabled"`
		DemotionFencedEnabled    bool   `json:"demotion_fenced_enabled"`
		ControlPlaneOnly         bool   `json:"control_plane_only"`
		ProductionForwarding     bool   `json:"production_forwarding"`
	}{
		SchemaVersion:            policy.SchemaVersion,
		HysteresisPenalty:        policy.HysteresisPenalty,
		PromotionMinSamples:      policy.PromotionMinSamples,
		DemotionFailureThreshold: policy.DemotionFailureThreshold,
		DemotionDropThreshold:    policy.DemotionDropThreshold,
		DemotionSlowThreshold:    policy.DemotionSlowThreshold,
		DemotionRebuildEnabled:   policy.DemotionRebuildEnabled,
		DemotionFencedEnabled:    policy.DemotionFencedEnabled,
		ControlPlaneOnly:         true,
		ProductionForwarding:     false,
	})
	if err != nil {
		return ""
	}
	sum := sha256.Sum256(raw)
	return hex.EncodeToString(sum[:])
}

// defaultFabricServiceChannelAdaptivePolicy returns the adaptive policy used
// when the cluster metadata does not carry one.
func defaultFabricServiceChannelAdaptivePolicy() FabricServiceChannelAdaptivePolicy {
	return normalizeFabricServiceChannelAdaptivePolicy(FabricServiceChannelAdaptivePolicy{
		SchemaVersion:                "rap.fabric_service_channel_adaptive_policy.v1",
		MaxParallelWindow:            4,
		BulkPressureChannelThreshold: 16,
		QueuePressureHighWatermark:   16,
		QueuePressureMaxInFlight:     16,
		ClassWindows: map[string]int{
			"control":     4,
			"interactive": 4,
			"reliable":    3,
			"bulk":        1,
			"droppable":   1,
		},
		Source:               "defaults",
		ControlPlaneOnly:     true,
		ProductionForwarding: false,
	}, FabricServiceChannelAdaptivePolicy{})
}

// fabricServiceChannelAdaptivePolicyFromCluster decodes the adaptive policy
// from the cluster metadata and normalizes it against the defaults. Missing,
// invalid or policy-less metadata yields the defaults.
func fabricServiceChannelAdaptivePolicyFromCluster(cluster Cluster) FabricServiceChannelAdaptivePolicy {
	fallback := defaultFabricServiceChannelAdaptivePolicy()
	if len(cluster.Metadata) == 0 || !json.Valid(cluster.Metadata) {
		return fallback
	}
	var raw struct {
		Policy *FabricServiceChannelAdaptivePolicy `json:"fabric_service_channel_adaptive_policy"`
	}
	if err := json.Unmarshal(cluster.Metadata, &raw); err != nil || raw.Policy == nil {
		return fallback
	}
	policy := normalizeFabricServiceChannelAdaptivePolicy(*raw.Policy, fallback)
	policy.Source = "cluster_metadata"
	return policy
}

// normalizeFabricServiceChannelAdaptivePolicy fills non-positive numeric fields
// from fallback (or built-in constants when fallback is unset too), clamps
// MaxParallelWindow to 1..64, rebuilds the class windows within that maximum,
// forces the control-plane flags, and recomputes the fingerprint.
func normalizeFabricServiceChannelAdaptivePolicy(input FabricServiceChannelAdaptivePolicy, fallback FabricServiceChannelAdaptivePolicy) FabricServiceChannelAdaptivePolicy {
	if input.SchemaVersion == "" {
		input.SchemaVersion = "rap.fabric_service_channel_adaptive_policy.v1"
	}
	if fallback.MaxParallelWindow <= 0 {
		fallback.MaxParallelWindow = 4
	}
	if input.MaxParallelWindow <= 0 {
		input.MaxParallelWindow = fallback.MaxParallelWindow
	}
	input.MaxParallelWindow = clampInt(input.MaxParallelWindow, 1, 64)
	if input.BulkPressureChannelThreshold <= 0 {
		input.BulkPressureChannelThreshold = firstPositive(fallback.BulkPressureChannelThreshold, 16)
	}
	if input.QueuePressureHighWatermark <= 0 {
		input.QueuePressureHighWatermark = firstPositive(fallback.QueuePressureHighWatermark, 16)
	}
	if input.QueuePressureMaxInFlight <= 0 {
		input.QueuePressureMaxInFlight = firstPositive(fallback.QueuePressureMaxInFlight, 16)
	}
	input.ClassWindows = normalizeFabricServiceChannelAdaptiveClassWindows(
		firstNonNilStringIntMap(input.ClassWindows, fallback.ClassWindows),
		input.MaxParallelWindow,
	)
	if input.Source == "" {
		input.Source = fallback.Source
	}
	if input.Source == "" {
		input.Source = "defaults"
	}
	input.ControlPlaneOnly = true
	input.ProductionForwarding = false
	input.Fingerprint = fabricServiceChannelAdaptivePolicyFingerprint(input)
	return input
}

// normalizeFabricServiceChannelAdaptiveClassWindows returns a fresh map holding
// exactly the five known traffic classes. Each window comes from values when
// positive, otherwise from a per-class default, and is clamped to
// 1..maxWindow. Unknown keys in values are dropped. A non-positive maxWindow
// is treated as 4.
func normalizeFabricServiceChannelAdaptiveClassWindows(values map[string]int, maxWindow int) map[string]int {
	if maxWindow <= 0 {
		maxWindow = 4
	}
	defaults := map[string]int{
		"control":     maxWindow,
		"interactive": maxWindow,
		"reliable":    boundedMinInt(maxWindow, 3),
		"bulk":        1,
		"droppable":   1,
	}
	out := make(map[string]int, len(defaults))
	for key, fallback := range defaults {
		value := values[key]
		if value <= 0 {
			value = fallback
		}
		out[key] = clampInt(value, 1, maxWindow)
	}
	return out
}

// upsertFabricServiceChannelAdaptivePolicyMetadata stores policy in metadata
// under the "fabric_service_channel_adaptive_policy" key.
func upsertFabricServiceChannelAdaptivePolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelAdaptivePolicy) (json.RawMessage, error) {
	return upsertFabricServiceChannelPolicyMetadata(metadata, "fabric_service_channel_adaptive_policy", policy)
}

// fabricServiceChannelAdaptivePolicyFingerprint returns the hex SHA-256 of the
// policy's JSON-encoded settings, or "" if encoding fails.
func fabricServiceChannelAdaptivePolicyFingerprint(policy FabricServiceChannelAdaptivePolicy) string {
	raw, err := json.Marshal(struct {
		SchemaVersion                string         `json:"schema_version"`
		MaxParallelWindow            int            `json:"max_parallel_window"`
		BulkPressureChannelThreshold int            `json:"bulk_pressure_channel_threshold"`
		QueuePressureHighWatermark   int            `json:"queue_pressure_high_watermark"`
		QueuePressureMaxInFlight     int            `json:"queue_pressure_max_in_flight"`
		ClassWindows                 map[string]int `json:"class_windows"`
		ControlPlaneOnly             bool           `json:"control_plane_only"`
		ProductionForwarding         bool           `json:"production_forwarding"`
	}{
		SchemaVersion:                policy.SchemaVersion,
		MaxParallelWindow:            policy.MaxParallelWindow,
		BulkPressureChannelThreshold: policy.BulkPressureChannelThreshold,
		QueuePressureHighWatermark:   policy.QueuePressureHighWatermark,
		QueuePressureMaxInFlight:     policy.QueuePressureMaxInFlight,
		ClassWindows:                 policy.ClassWindows,
		ControlPlaneOnly:             true,
		ProductionForwarding:         false,
	})
	if err != nil {
		return ""
	}
	sum := sha256.Sum256(raw)
	return hex.EncodeToString(sum[:])
}

// defaultFabricServiceChannelPoolPolicy returns the pool policy used when the
// cluster metadata does not carry one.
func defaultFabricServiceChannelPoolPolicy() FabricServiceChannelPoolPolicy {
	return normalizeFabricServiceChannelPoolPolicy(FabricServiceChannelPoolPolicy{
		SchemaVersion:          "rap.fabric_service_channel_pool_policy.v1",
		SelectionStrategy:      "fastest_healthy",
		RouteRebuild:           "automatic",
		EntryFailover:          "automatic",
		ExitFailover:           "automatic",
		BackendFallbackAllowed: true,
		StickySession:          true,
		Source:                 "defaults",
		ControlPlaneOnly:       true,
		ProductionForwarding:   false,
	}, FabricServiceChannelPoolPolicy{})
}

// fabricServiceChannelPoolPolicyFromCluster decodes the pool policy from the
// cluster metadata and normalizes it against the defaults. Missing, invalid or
// policy-less metadata yields the defaults.
func fabricServiceChannelPoolPolicyFromCluster(cluster Cluster) FabricServiceChannelPoolPolicy {
	fallback := defaultFabricServiceChannelPoolPolicy()
	if len(cluster.Metadata) == 0 || !json.Valid(cluster.Metadata) {
		return fallback
	}
	var raw struct {
		Policy *FabricServiceChannelPoolPolicy `json:"fabric_service_channel_pool_policy"`
	}
	if err := json.Unmarshal(cluster.Metadata, &raw); err != nil || raw.Policy == nil {
		return fallback
	}
	policy := normalizeFabricServiceChannelPoolPolicy(*raw.Policy, fallback)
	policy.Source = "cluster_metadata"
	return policy
}

// normalizeFabricServiceChannelPoolPolicy fills empty fields from fallback,
// de-duplicates the pool node lists, restricts each mode to its allowed
// values, forces the control-plane flags, and recomputes the fingerprint.
func normalizeFabricServiceChannelPoolPolicy(input FabricServiceChannelPoolPolicy, fallback FabricServiceChannelPoolPolicy) FabricServiceChannelPoolPolicy {
	failoverModes := []string{"automatic", "manual", "disabled"}
	if input.SchemaVersion == "" {
		input.SchemaVersion = firstNonEmptyString(fallback.SchemaVersion, "rap.fabric_service_channel_pool_policy.v1")
	}
	input.EntryPoolNodeIDs = dedupeStrings(firstNonEmptyStringSlice(input.EntryPoolNodeIDs, fallback.EntryPoolNodeIDs))
	input.ExitPoolNodeIDs = dedupeStrings(firstNonEmptyStringSlice(input.ExitPoolNodeIDs, fallback.ExitPoolNodeIDs))
	input.PreferredEntryNodeID = strings.TrimSpace(firstNonEmptyString(input.PreferredEntryNodeID, fallback.PreferredEntryNodeID))
	input.PreferredExitNodeID = strings.TrimSpace(firstNonEmptyString(input.PreferredExitNodeID, fallback.PreferredExitNodeID))
	input.SelectionStrategy = normalizeFabricServiceChannelPoolPolicyMode(
		firstNonEmptyString(input.SelectionStrategy, fallback.SelectionStrategy),
		[]string{"fastest_healthy", "preferred_first", "stable_first"},
		"fastest_healthy",
	)
	input.RouteRebuild = normalizeFabricServiceChannelPoolPolicyMode(firstNonEmptyString(input.RouteRebuild, fallback.RouteRebuild), failoverModes, "automatic")
	input.EntryFailover = normalizeFabricServiceChannelPoolPolicyMode(firstNonEmptyString(input.EntryFailover, fallback.EntryFailover), failoverModes, "automatic")
	input.ExitFailover = normalizeFabricServiceChannelPoolPolicyMode(firstNonEmptyString(input.ExitFailover, fallback.ExitFailover), failoverModes, "automatic")
	if input.Source == "" {
		input.Source = firstNonEmptyString(fallback.Source, "defaults")
	}
	input.ControlPlaneOnly = true
	input.ProductionForwarding = false
	input.Fingerprint = fabricServiceChannelPoolPolicyFingerprint(input)
	return input
}

// normalizeFabricServiceChannelPoolPolicyMode lower-cases and trims value and
// returns it when it is one of allowed; otherwise it returns fallback.
func normalizeFabricServiceChannelPoolPolicyMode(value string, allowed []string, fallback string) string {
	value = strings.TrimSpace(strings.ToLower(value))
	for _, item := range allowed {
		if value == item {
			return value
		}
	}
	return fallback
}

// upsertFabricServiceChannelPoolPolicyMetadata stores policy in metadata under
// the "fabric_service_channel_pool_policy" key.
func upsertFabricServiceChannelPoolPolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelPoolPolicy) (json.RawMessage, error) {
	return upsertFabricServiceChannelPolicyMetadata(metadata, "fabric_service_channel_pool_policy", policy)
}

// fabricServiceChannelPoolPolicyRef returns a pointer to a copy of policy
// normalized against the defaults.
func fabricServiceChannelPoolPolicyRef(policy FabricServiceChannelPoolPolicy) *FabricServiceChannelPoolPolicy {
	normalized := normalizeFabricServiceChannelPoolPolicy(policy, defaultFabricServiceChannelPoolPolicy())
	return &normalized
}

// fabricServiceChannelPoolPolicyFingerprint returns the hex SHA-256 of the
// policy's JSON-encoded settings, or "" if encoding fails.
func fabricServiceChannelPoolPolicyFingerprint(policy FabricServiceChannelPoolPolicy) string {
	raw, err := json.Marshal(struct {
		SchemaVersion          string   `json:"schema_version"`
		EntryPoolNodeIDs       []string `json:"entry_pool_node_ids,omitempty"`
		ExitPoolNodeIDs        []string `json:"exit_pool_node_ids,omitempty"`
		PreferredEntryNodeID   string   `json:"preferred_entry_node_id,omitempty"`
		PreferredExitNodeID    string   `json:"preferred_exit_node_id,omitempty"`
		SelectionStrategy      string   `json:"selection_strategy"`
		RouteRebuild           string   `json:"route_rebuild"`
		EntryFailover          string   `json:"entry_failover"`
		ExitFailover           string   `json:"exit_failover"`
		BackendFallbackAllowed bool     `json:"backend_fallback_allowed"`
		StickySession          bool     `json:"sticky_session"`
		ControlPlaneOnly       bool     `json:"control_plane_only"`
		ProductionForwarding   bool     `json:"production_forwarding"`
	}{
		SchemaVersion:          policy.SchemaVersion,
		EntryPoolNodeIDs:       policy.EntryPoolNodeIDs,
		ExitPoolNodeIDs:        policy.ExitPoolNodeIDs,
		PreferredEntryNodeID:   policy.PreferredEntryNodeID,
		PreferredExitNodeID:    policy.PreferredExitNodeID,
		SelectionStrategy:      policy.SelectionStrategy,
		RouteRebuild:           policy.RouteRebuild,
		EntryFailover:          policy.EntryFailover,
		ExitFailover:           policy.ExitFailover,
		BackendFallbackAllowed: policy.BackendFallbackAllowed,
		StickySession:          policy.StickySession,
		ControlPlaneOnly:       true,
		ProductionForwarding:   false,
	})
	if err != nil {
		return ""
	}
	sum := sha256.Sum256(raw)
	return hex.EncodeToString(sum[:])
}

// defaultFabricServiceChannelBreadcrumbWindowPolicy returns the breadcrumb
// window policy used when the cluster metadata does not carry one: a
// 30-minute current window and a 24-hour history window.
func defaultFabricServiceChannelBreadcrumbWindowPolicy() FabricServiceChannelBreadcrumbWindowPolicy {
	return normalizeFabricServiceChannelBreadcrumbWindowPolicy(FabricServiceChannelBreadcrumbWindowPolicy{
		SchemaVersion:        "rap.fabric_service_channel_breadcrumb_window_policy.v1",
		CurrentWindowSeconds: int64((30 * time.Minute).Seconds()),
		HistoryWindowSeconds: int64((24 * time.Hour).Seconds()),
		Source:               "defaults",
		ControlPlaneOnly:     true,
		ProductionForwarding: false,
	}, FabricServiceChannelBreadcrumbWindowPolicy{})
}

// fabricServiceChannelBreadcrumbWindowPolicyFromCluster decodes the breadcrumb
// window policy from the cluster metadata and normalizes it against the
// defaults. Missing, invalid or policy-less metadata yields the defaults.
func fabricServiceChannelBreadcrumbWindowPolicyFromCluster(cluster Cluster) FabricServiceChannelBreadcrumbWindowPolicy {
	fallback := defaultFabricServiceChannelBreadcrumbWindowPolicy()
	if len(cluster.Metadata) == 0 || !json.Valid(cluster.Metadata) {
		return fallback
	}
	var raw struct {
		Policy *FabricServiceChannelBreadcrumbWindowPolicy `json:"fabric_service_channel_breadcrumb_window_policy"`
	}
	if err := json.Unmarshal(cluster.Metadata, &raw); err != nil || raw.Policy == nil {
		return fallback
	}
	policy := normalizeFabricServiceChannelBreadcrumbWindowPolicy(*raw.Policy, fallback)
	policy.Source = "cluster_metadata"
	return policy
}

// normalizeFabricServiceChannelBreadcrumbWindowPolicy fills non-positive
// windows from fallback, clamps the current window to 60 seconds..7 days and
// the history window to current..30 days, forces the control-plane flags, and
// recomputes the fingerprint.
func normalizeFabricServiceChannelBreadcrumbWindowPolicy(input FabricServiceChannelBreadcrumbWindowPolicy, fallback FabricServiceChannelBreadcrumbWindowPolicy) FabricServiceChannelBreadcrumbWindowPolicy {
	if input.SchemaVersion == "" {
		input.SchemaVersion = firstNonEmptyString(fallback.SchemaVersion, "rap.fabric_service_channel_breadcrumb_window_policy.v1")
	}
	if input.CurrentWindowSeconds <= 0 {
		input.CurrentWindowSeconds = firstPositiveInt64(fallback.CurrentWindowSeconds, int64((30 * time.Minute).Seconds()))
	}
	if input.HistoryWindowSeconds <= 0 {
		input.HistoryWindowSeconds = firstPositiveInt64(fallback.HistoryWindowSeconds, int64((24 * time.Hour).Seconds()))
	}
	input.CurrentWindowSeconds = clampInt64(input.CurrentWindowSeconds, 60, int64((7 * 24 * time.Hour).Seconds()))
	// The history window may never be shorter than the current window.
	input.HistoryWindowSeconds = clampInt64(input.HistoryWindowSeconds, input.CurrentWindowSeconds, int64((30 * 24 * time.Hour).Seconds()))
	if input.Source == "" {
		input.Source = firstNonEmptyString(fallback.Source, "defaults")
	}
	input.ControlPlaneOnly = true
	input.ProductionForwarding = false
	input.Fingerprint = fabricServiceChannelBreadcrumbWindowPolicyFingerprint(input)
	return input
}

// upsertFabricServiceChannelBreadcrumbWindowPolicyMetadata stores policy in
// metadata under the "fabric_service_channel_breadcrumb_window_policy" key.
func upsertFabricServiceChannelBreadcrumbWindowPolicyMetadata(metadata json.RawMessage, policy FabricServiceChannelBreadcrumbWindowPolicy) (json.RawMessage, error) {
	return upsertFabricServiceChannelPolicyMetadata(metadata, "fabric_service_channel_breadcrumb_window_policy", policy)
}

// fabricServiceChannelBreadcrumbWindowPolicyFingerprint returns the hex SHA-256
// of the policy's JSON-encoded settings, or "" if encoding fails.
func fabricServiceChannelBreadcrumbWindowPolicyFingerprint(policy FabricServiceChannelBreadcrumbWindowPolicy) string {
	raw, err := json.Marshal(struct {
		SchemaVersion        string `json:"schema_version"`
		CurrentWindowSeconds int64  `json:"current_window_seconds"`
		HistoryWindowSeconds int64  `json:"history_window_seconds"`
		ControlPlaneOnly     bool   `json:"control_plane_only"`
		ProductionForwarding bool   `json:"production_forwarding"`
	}{
		SchemaVersion:        policy.SchemaVersion,
		CurrentWindowSeconds: policy.CurrentWindowSeconds,
		HistoryWindowSeconds: policy.HistoryWindowSeconds,
		ControlPlaneOnly:     true,
		ProductionForwarding: false,
	})
	if err != nil {
		return ""
	}
	sum := sha256.Sum256(raw)
	return hex.EncodeToString(sum[:])
}

// firstNonEmptyStringSlice returns the first argument with at least one
// element, or nil when all are empty.
func firstNonEmptyStringSlice(values ...[]string) []string {
	for _, value := range values {
		if len(value) > 0 {
			return value
		}
	}
	return nil
}

// firstPositive returns the first argument greater than zero, or 0.
func firstPositive(values ...int) int {
	for _, value := range values {
		if value > 0 {
			return value
		}
	}
	return 0
}

// firstPositiveInt64 returns the first argument greater than zero, or 0.
func firstPositiveInt64(values ...int64) int64 {
	for _, value := range values {
		if value > 0 {
			return value
		}
	}
	return 0
}

// firstNonNilStringIntMap returns the first argument with at least one entry,
// or nil. Despite the name, an empty non-nil map is skipped as well.
func firstNonNilStringIntMap(values ...map[string]int) map[string]int {
	for _, value := range values {
		if len(value) > 0 {
			return value
		}
	}
	return nil
}

// boundedMinInt returns the smaller of a and b.
func boundedMinInt(a, b int) int {
	if a < b {
		return a
	}
	return b
}

// clampInt limits value to the inclusive range minValue..maxValue.
func clampInt(value, minValue, maxValue int) int {
	if value < minValue {
		return minValue
	}
	if value > maxValue {
		return maxValue
	}
	return value
}

// clampInt64 limits value to the inclusive range minValue..maxValue.
func clampInt64(value, minValue, maxValue int64) int64 {
	if value < minValue {
		return minValue
	}
	if value > maxValue {
		return maxValue
	}
	return value
}

func (s *Service) UpdateCluster(ctx context.Context, input UpdateClusterInput) (Cluster, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return Cluster{}, err } if input.ClusterID == "" { return Cluster{}, ErrInvalidCluster } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return Cluster{}, err } input.Name = strings.TrimSpace(input.Name) input.Status = strings.TrimSpace(input.Status) if input.Name == "" { return Cluster{}, ErrInvalidPayload } if input.Status == "" { input.Status = ClusterStatusActive } if input.Status != ClusterStatusActive && input.Status != ClusterStatusDisabled { return 
Cluster{}, ErrInvalidPayload } input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.Metadata) { return Cluster{}, errors.New("metadata must be valid json") } item, err := s.store.UpdateCluster(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return Cluster{}, ErrInvalidCluster } if err != nil { return Cluster{}, err } payload, _ := json.Marshal(map[string]any{ "name": item.Name, "status": item.Status, "region": item.Region, }) _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &item.ID, ActorUserID: &input.ActorUserID, EventType: "cluster.updated", TargetType: "cluster", TargetID: &item.ID, Payload: payload, CreatedAt: s.now(), }) return item, nil }

// ListClusterNodes returns the store's node list for clusterID once the actor
// has passed ensurePlatformAdmin.
func (s *Service) ListClusterNodes(ctx context.Context, actorUserID, clusterID string) ([]ClusterNode, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	return s.store.ListClusterNodes(ctx, clusterID)
}

// ListNodeGroups returns the store's node group list for clusterID once the
// actor has passed ensurePlatformAdmin.
func (s *Service) ListNodeGroups(ctx context.Context, actorUserID, clusterID string) ([]ClusterNodeGroup, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	return s.store.ListNodeGroups(ctx, clusterID)
}

// CreateNodeGroup validates the input and creates a node group through the
// store. The actor must pass ensurePlatformAdmin and the cluster must pass
// ensureClusterMutable. The name is trimmed and, like ClusterID, must be
// non-empty (ErrInvalidPayload); a description, when given, is trimmed;
// metadata defaults to `{}` and must be valid JSON. A store pgx.ErrNoRows is
// reported as ErrInvalidPayload.
func (s *Service) CreateNodeGroup(ctx context.Context, input CreateNodeGroupInput) (ClusterNodeGroup, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return ClusterNodeGroup{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return ClusterNodeGroup{}, err } input.Name = strings.TrimSpace(input.Name) if input.ClusterID == "" || input.Name == "" { return ClusterNodeGroup{}, ErrInvalidPayload } if input.Description != nil { trimmed := strings.TrimSpace(*input.Description) input.Description = &trimmed } input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.Metadata) { return ClusterNodeGroup{}, errors.New("node group metadata must be valid json") } item, err := s.store.CreateNodeGroup(ctx, input) if errors.Is(err,
pgx.ErrNoRows) { return ClusterNodeGroup{}, ErrInvalidPayload } return item, err } func (s *Service) CreateJoinToken(ctx context.Context, input CreateJoinTokenInput) (CreatedJoinToken, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return CreatedJoinToken{}, err } if input.ClusterID == "" { return CreatedJoinToken{}, ErrInvalidCluster } input.Scope = defaultJSON(input.Scope, `{}`) if !json.Valid(input.Scope) { return CreatedJoinToken{}, errors.New("scope must be valid json") } if input.ExpiresAt.IsZero() { input.ExpiresAt = defaultJoinTokenExpiry(s.now()) } if input.ExpiresAt.Before(s.now()) { return CreatedJoinToken{}, errors.New("expires_at must be in the future") } if input.MaxUses <= 0 { input.MaxUses = 1 } rawToken, err := generateJoinToken() if err != nil { return CreatedJoinToken{}, err } tokenHash, err := hashJoinToken(rawToken) if err != nil { return CreatedJoinToken{}, err } item, err := s.store.CreateJoinToken(ctx, input, tokenHash) if err != nil { return CreatedJoinToken{}, err } item, err = s.signJoinToken(ctx, input, item) if err != nil { return CreatedJoinToken{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "node_join_token.created", TargetType: "node_join_token", TargetID: &item.ID, Payload: json.RawMessage(`{"raw_token_returned_once":true}`), CreatedAt: s.now(), }) return CreatedJoinToken{NodeJoinToken: item, Token: rawToken}, nil } func (s *Service) ListJoinTokens(ctx context.Context, actorUserID, clusterID string) ([]NodeJoinToken, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } if err := s.store.ExpireJoinTokens(ctx, clusterID); err != nil { return nil, err } return s.store.ListJoinTokens(ctx, clusterID) } func (s *Service) GetDockerInstallProfile(ctx context.Context, input DockerInstallProfileRequest) (DockerInstallProfile, error) { input.ClusterID = strings.TrimSpace(input.ClusterID) 
input.InstallToken = strings.TrimSpace(input.InstallToken) if input.ClusterID == "" || input.InstallToken == "" { return DockerInstallProfile{}, ErrInvalidPayload } if err := s.store.ExpireJoinTokens(ctx, input.ClusterID); err != nil { return DockerInstallProfile{}, err } tokenHash, err := hashJoinToken(input.InstallToken) if err != nil { return DockerInstallProfile{}, ErrInvalidJoinToken } token, err := s.store.GetValidJoinTokenByHash(ctx, input.ClusterID, tokenHash) if err != nil { if errors.Is(err, pgx.ErrNoRows) { return DockerInstallProfile{}, ErrInvalidJoinToken } return DockerInstallProfile{}, err } profile, err := dockerInstallProfileFromScope(input, token.Scope) if err != nil { return DockerInstallProfile{}, err } profile.ClusterID = input.ClusterID profile.JoinToken = input.InstallToken return profile, nil } func (s *Service) GetWindowsInstallProfile(ctx context.Context, input DockerInstallProfileRequest) (WindowsInstallProfile, error) { input.ClusterID = strings.TrimSpace(input.ClusterID) input.InstallToken = strings.TrimSpace(input.InstallToken) if input.ClusterID == "" || input.InstallToken == "" { return WindowsInstallProfile{}, ErrInvalidPayload } if err := s.store.ExpireJoinTokens(ctx, input.ClusterID); err != nil { return WindowsInstallProfile{}, err } tokenHash, err := hashJoinToken(input.InstallToken) if err != nil { return WindowsInstallProfile{}, ErrInvalidJoinToken } token, err := s.store.GetValidJoinTokenByHash(ctx, input.ClusterID, tokenHash) if err != nil { if errors.Is(err, pgx.ErrNoRows) { return WindowsInstallProfile{}, ErrInvalidJoinToken } return WindowsInstallProfile{}, err } profile, err := windowsInstallProfileFromScope(input, token.Scope) if err != nil { return WindowsInstallProfile{}, err } profile.ClusterID = input.ClusterID profile.JoinToken = input.InstallToken return profile, nil } func (s *Service) GetLinuxInstallProfile(ctx context.Context, input DockerInstallProfileRequest) (LinuxInstallProfile, error) { input.ClusterID = 
strings.TrimSpace(input.ClusterID) input.InstallToken = strings.TrimSpace(input.InstallToken) if input.ClusterID == "" || input.InstallToken == "" { return LinuxInstallProfile{}, ErrInvalidPayload } if err := s.store.ExpireJoinTokens(ctx, input.ClusterID); err != nil { return LinuxInstallProfile{}, err } tokenHash, err := hashJoinToken(input.InstallToken) if err != nil { return LinuxInstallProfile{}, ErrInvalidJoinToken } token, err := s.store.GetValidJoinTokenByHash(ctx, input.ClusterID, tokenHash) if err != nil { if errors.Is(err, pgx.ErrNoRows) { return LinuxInstallProfile{}, ErrInvalidJoinToken } return LinuxInstallProfile{}, err } profile, err := linuxInstallProfileFromScope(input, token.Scope) if err != nil { return LinuxInstallProfile{}, err } profile.ClusterID = input.ClusterID profile.JoinToken = input.InstallToken return profile, nil } func (s *Service) signJoinToken(ctx context.Context, input CreateJoinTokenInput, item NodeJoinToken) (NodeJoinToken, error) { authorityKey, err := s.ensureClusterAuthority(ctx, input.ClusterID, &input.ActorUserID) if err != nil { return NodeJoinToken{}, err } payload := clusterJoinTokenAuthorityPayload{ SchemaVersion: clusterJoinTokenAuthoritySchema, ClusterID: input.ClusterID, TokenID: item.ID, Scope: item.Scope, ExpiresAt: item.ExpiresAt, MaxUses: item.MaxUses, CreatedByUserID: item.CreatedByUserID, IssuedAt: item.CreatedAt, ControlPlaneOnly: true, ProductionForwarding: false, } rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now()) if err != nil { return NodeJoinToken{}, err } return s.store.SetJoinTokenAuthority(ctx, input.ClusterID, item.ID, rawPayload, signature) } func (s *Service) CreateJoinRequest(ctx context.Context, input CreateJoinRequestInput) (NodeJoinRequest, error) { if input.ClusterID == "" { return NodeJoinRequest{}, ErrInvalidCluster } if err := s.store.ExpireJoinTokens(ctx, input.ClusterID); err != nil { return NodeJoinRequest{}, err } input.NodeName = 
strings.TrimSpace(input.NodeName) input.NodeFingerprint = strings.TrimSpace(input.NodeFingerprint) input.PublicKey = strings.TrimSpace(input.PublicKey) if input.NodeName == "" || input.NodeFingerprint == "" || input.PublicKey == "" { return NodeJoinRequest{}, ErrInvalidPayload } input.ReportedCapabilities = defaultJSON(input.ReportedCapabilities, `{}`) input.ReportedFacts = defaultJSON(input.ReportedFacts, `{}`) input.RequestedRoles = defaultJSON(input.RequestedRoles, `[]`) if !json.Valid(input.ReportedCapabilities) || !json.Valid(input.ReportedFacts) || !json.Valid(input.RequestedRoles) { return NodeJoinRequest{}, errors.New("reported_capabilities, reported_facts, and requested_roles must be valid json") } tokenHash, err := hashJoinToken(input.JoinToken) if err != nil { return NodeJoinRequest{}, ErrInvalidJoinToken } token, err := s.store.GetValidJoinTokenByHash(ctx, input.ClusterID, tokenHash) if err != nil { if errors.Is(err, pgx.ErrNoRows) { return NodeJoinRequest{}, ErrInvalidJoinToken } return NodeJoinRequest{}, err } item, err := s.store.CreateJoinRequest(ctx, input, token.ID) if err != nil { if errors.Is(err, pgx.ErrNoRows) { return NodeJoinRequest{}, ErrInvalidJoinToken } return NodeJoinRequest{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, EventType: "node_join_request.created", TargetType: "node_join_request", TargetID: &item.ID, Payload: json.RawMessage(`{"source":"node_agent"}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) ListJoinRequests(ctx context.Context, actorUserID, clusterID string) ([]NodeJoinRequest, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListJoinRequests(ctx, clusterID) } func (s *Service) GetJoinRequestBootstrap(ctx context.Context, input GetJoinRequestBootstrapInput) (JoinRequestBootstrapResult, error) { input.ClusterID = strings.TrimSpace(input.ClusterID) input.JoinRequestID = strings.TrimSpace(input.JoinRequestID) 
input.NodeFingerprint = strings.TrimSpace(input.NodeFingerprint) input.PublicKey = strings.TrimSpace(input.PublicKey) if input.ClusterID == "" || input.JoinRequestID == "" || input.NodeFingerprint == "" || input.PublicKey == "" { return JoinRequestBootstrapResult{}, ErrInvalidJoinRequest } item, err := s.store.GetJoinRequestForBootstrap(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return JoinRequestBootstrapResult{}, ErrInvalidJoinRequest } if err != nil { return JoinRequestBootstrapResult{}, err } result := JoinRequestBootstrapResult{Status: item.Status, JoinRequest: item} if item.Status != JoinRequestStatusApproved { return result, nil } bootstrap, updated, err := s.bootstrapForApprovedJoinRequest(ctx, item) if err != nil { return JoinRequestBootstrapResult{}, err } result.JoinRequest = updated result.Bootstrap = &bootstrap return result, nil } func (s *Service) RevokeJoinToken(ctx context.Context, input RevokeJoinTokenInput) (NodeJoinToken, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return NodeJoinToken{}, err } item, err := s.store.RevokeJoinToken(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return NodeJoinToken{}, ErrInvalidJoinToken } if err != nil { return NodeJoinToken{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "node_join_token.revoked", TargetType: "node_join_token", TargetID: &input.TokenID, Payload: json.RawMessage(`{}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) ApproveJoinRequest(ctx context.Context, input ApproveJoinRequestInput) (ApprovedJoinRequest, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return ApprovedJoinRequest{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return ApprovedJoinRequest{}, err } if input.ClusterID == "" || input.JoinRequestID == "" { return ApprovedJoinRequest{}, ErrInvalidJoinRequest } item, err := 
s.store.ApproveJoinRequest(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return ApprovedJoinRequest{}, ErrInvalidJoinRequest } if err != nil { return ApprovedJoinRequest{}, err } item, err = s.signApprovedJoinRequest(ctx, input, item) if err != nil { return ApprovedJoinRequest{}, err } return item, nil }

// signApprovedJoinRequest signs the approval of a join request with the
// cluster authority key, stores the signed payload and signature on the join
// request, and returns item with the updated request and the authority
// material copied into its bootstrap. A missing heartbeat endpoint in the
// bootstrap is filled in from nodeHeartbeatEndpoint before signing.
func (s *Service) signApprovedJoinRequest(ctx context.Context, input ApproveJoinRequestInput, item ApprovedJoinRequest) (ApprovedJoinRequest, error) {
	var none ApprovedJoinRequest
	authorityKey, err := s.ensureClusterAuthority(ctx, input.ClusterID, &input.ActorUserID)
	if err != nil {
		return none, err
	}

	bootstrap := &item.Bootstrap
	if bootstrap.HeartbeatEndpoint == "" {
		bootstrap.HeartbeatEndpoint = nodeHeartbeatEndpoint(input.ClusterID, bootstrap.NodeID)
	}

	payload := clusterNodeApprovalAuthorityPayload{
		SchemaVersion:        clusterNodeApprovalAuthoritySchema,
		ClusterID:            input.ClusterID,
		JoinRequestID:        item.JoinRequest.ID,
		NodeID:               bootstrap.NodeID,
		NodeFingerprint:      item.JoinRequest.NodeFingerprint,
		IdentityStatus:       bootstrap.IdentityStatus,
		HeartbeatEndpoint:    bootstrap.HeartbeatEndpoint,
		ApprovedByUserID:     input.ActorUserID,
		IssuedAt:             s.now(),
		ControlPlaneOnly:     true,
		ProductionForwarding: false,
	}
	quorumHash, err := clusterAuthorityQuorumDescriptorHash(authorityKey)
	payload.ClusterAuthorityQuorumSHA256 = quorumHash
	if err != nil {
		return none, err
	}

	rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
	if err != nil {
		return none, err
	}
	updated, err := s.store.SetJoinRequestApprovalAuthority(ctx, input.ClusterID, item.JoinRequest.ID, rawPayload, signature)
	if err != nil {
		return none, err
	}

	item.JoinRequest = updated
	bootstrap.ClusterAuthority = authorityDescriptor(authorityKey)
	bootstrap.ClusterAuthorityQuorum = authorityKey.QuorumDescriptor
	bootstrap.AuthorityPayload = rawPayload
	bootstrap.AuthoritySignature = &signature
	return item, nil
}

// bootstrapForApprovedJoinRequest assembles the bootstrap material for an
// approved join request and returns it with the (possibly updated) request.
//
// The request must be approved and carry a non-blank ApprovedNodeID
// (ErrInvalidJoinRequest otherwise). When the stored approval payload or
// signature is empty, a fresh approval payload is signed with the cluster
// authority key and persisted ("system" is recorded as approver when no
// reviewer is set). Otherwise the stored payload is verified against the
// authority public key using the stored signature.
func (s *Service) bootstrapForApprovedJoinRequest(ctx context.Context, item NodeJoinRequest) (NodeBootstrap, NodeJoinRequest, error) {
	fail := func(err error) (NodeBootstrap, NodeJoinRequest, error) {
		return NodeBootstrap{}, NodeJoinRequest{}, err
	}
	if item.Status != JoinRequestStatusApproved || item.ApprovedNodeID == nil || strings.TrimSpace(*item.ApprovedNodeID) == "" {
		return fail(ErrInvalidJoinRequest)
	}
	authorityKey, err := s.ensureClusterAuthority(ctx, item.ClusterID, item.ReviewedByUserID)
	if err != nil {
		return fail(err)
	}
	heartbeatEndpoint := nodeHeartbeatEndpoint(item.ClusterID, *item.ApprovedNodeID)
	identityStatus := NodeRegistrationActive

	unsigned := rawMessageEmpty(item.ApprovalPayload) || rawMessageEmpty(item.ApprovalSignature)
	if unsigned {
		approvedBy := "system"
		if reviewer := item.ReviewedByUserID; reviewer != nil && strings.TrimSpace(*reviewer) != "" {
			approvedBy = strings.TrimSpace(*reviewer)
		}
		payload := clusterNodeApprovalAuthorityPayload{
			SchemaVersion:        clusterNodeApprovalAuthoritySchema,
			ClusterID:            item.ClusterID,
			JoinRequestID:        item.ID,
			NodeID:               *item.ApprovedNodeID,
			NodeFingerprint:      item.NodeFingerprint,
			IdentityStatus:       identityStatus,
			HeartbeatEndpoint:    heartbeatEndpoint,
			ApprovedByUserID:     approvedBy,
			IssuedAt:             s.now(),
			ControlPlaneOnly:     true,
			ProductionForwarding: false,
		}
		payload.ClusterAuthorityQuorumSHA256, err = clusterAuthorityQuorumDescriptorHash(authorityKey)
		if err != nil {
			return fail(err)
		}
		rawPayload, fresh, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
		if err != nil {
			return fail(err)
		}
		item, err = s.store.SetJoinRequestApprovalAuthority(ctx, item.ClusterID, item.ID, rawPayload, fresh)
		if err != nil {
			return fail(err)
		}
	}

	// The signature handed to the node is always the stored one, decoded from
	// the (possibly just updated) request.
	var signature ClusterSignature
	if err := json.Unmarshal(item.ApprovalSignature, &signature); err != nil {
		return fail(err)
	}
	if !unsigned {
		if err := clusterauth.VerifyRaw(authorityKey.PublicKey, item.ApprovalPayload, signature); err != nil {
			return fail(err)
		}
	}

	bootstrap := NodeBootstrap{
		NodeID:         *item.ApprovedNodeID,
		ClusterID:      item.ClusterID,
		IdentityStatus: identityStatus,
		Certificate: map[string]any{
			"status": "pending_issuer_integration",
		},
		HeartbeatEndpoint:      heartbeatEndpoint,
		ClusterAuthority:       authorityDescriptor(authorityKey),
		ClusterAuthorityQuorum: authorityKey.QuorumDescriptor,
		AuthorityPayload:       item.ApprovalPayload,
		AuthoritySignature:     &signature,
	}
	return bootstrap, item, nil
}

// clusterAuthorityQuorumDescriptorHash returns the hash of the authority key's
// quorum descriptor as computed by clusterauth.QuorumDescriptorHash, or ""
// when the key has no quorum descriptor. The hash is taken over a copy in
// which an empty schema version and a blank cluster ID are filled in from
// clusterauth.QuorumSchemaVersion and the key's cluster ID.
func clusterAuthorityQuorumDescriptorHash(authorityKey ClusterAuthorityKey) (string, error) {
	source := authorityKey.QuorumDescriptor
	if source == nil {
		return "", nil
	}
	// Work on a copy so the key's own descriptor is left untouched.
	descriptor := *source
	if descriptor.SchemaVersion == "" {
		descriptor.SchemaVersion = clusterauth.QuorumSchemaVersion
	}
	if strings.TrimSpace(descriptor.ClusterID) == "" {
		descriptor.ClusterID = authorityKey.ClusterID
	}
	return clusterauth.QuorumDescriptorHash(descriptor)
}

// nodeHeartbeatEndpoint returns the API path
// /api/v1/clusters/{clusterID}/nodes/{nodeID}/heartbeats. The identifiers are
// inserted as given, without escaping.
func nodeHeartbeatEndpoint(clusterID, nodeID string) string {
	segments := []string{"/api/v1/clusters", clusterID, "nodes", nodeID, "heartbeats"}
	return strings.Join(segments, "/")
}

// rawMessageEmpty reports whether raw, ignoring surrounding whitespace, is
// empty, the empty object `{}`, or the literal `null`.
func rawMessageEmpty(raw json.RawMessage) bool {
	switch strings.TrimSpace(string(raw)) {
	case "", "{}", "null":
		return true
	}
	return false
}

// RejectJoinRequest rejects a join request through the store. The actor must
// pass ensurePlatformAdmin. The reason is trimmed and defaults to "Rejected by
// platform administrator." when blank. A store pgx.ErrNoRows yields
// ErrInvalidJoinRequest.
func (s *Service) RejectJoinRequest(ctx context.Context, input RejectJoinRequestInput) (NodeJoinRequest, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return NodeJoinRequest{}, err } input.Reason = strings.TrimSpace(input.Reason) if input.Reason == "" { input.Reason = "Rejected by platform administrator."
} item, err := s.store.RejectJoinRequest(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return NodeJoinRequest{}, ErrInvalidJoinRequest } return item, err } func (s *Service) AssignNodeRole(ctx context.Context, input AssignNodeRoleInput) (NodeRoleAssignment, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return NodeRoleAssignment{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return NodeRoleAssignment{}, err } if !isAllowedNodeRole(input.Role) { return NodeRoleAssignment{}, ErrInvalidNodeRole } if input.Status == "" { input.Status = "active" } if input.Status != "active" && input.Status != "disabled" && input.Status != "revoked" { return NodeRoleAssignment{}, ErrInvalidPayload } input.Policy = defaultJSON(input.Policy, `{}`) if !json.Valid(input.Policy) { return NodeRoleAssignment{}, errors.New("policy must be valid json") } item, err := s.store.AssignNodeRole(ctx, input) if err != nil { return NodeRoleAssignment{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "node_role." 
+ input.Status, TargetType: "node", TargetID: &input.NodeID, Payload: json.RawMessage(`{"capability_is_not_permission":true}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) ListNodeRoleAssignments(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeRoleAssignment, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListNodeRoleAssignments(ctx, clusterID, nodeID) } func (s *Service) AttachExistingNodeToCluster(ctx context.Context, input AttachExistingNodeInput) (ClusterNode, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return ClusterNode{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return ClusterNode{}, err } if input.ClusterID == "" || input.NodeID == "" { return ClusterNode{}, ErrInvalidPayload } for _, role := range input.Roles { if !isAllowedNodeRole(role) { return ClusterNode{}, ErrInvalidNodeRole } } item, err := s.store.AttachExistingNodeToCluster(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return ClusterNode{}, ErrInvalidPayload } return item, err } func (s *Service) AssignNodeToGroup(ctx context.Context, input AssignNodeGroupInput) (ClusterNode, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return ClusterNode{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return ClusterNode{}, err } if input.ClusterID == "" || input.NodeID == "" { return ClusterNode{}, ErrInvalidPayload } if input.GroupID != nil { trimmed := strings.TrimSpace(*input.GroupID) if trimmed == "" { input.GroupID = nil } else { input.GroupID = &trimmed } } item, err := s.store.AssignNodeToGroup(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return ClusterNode{}, ErrInvalidPayload } return item, err } func (s *Service) RevokeNodeIdentity(ctx context.Context, input RevokeNodeIdentityInput) error { if err := s.ensurePlatformAdmin(ctx, 
input.ActorUserID); err != nil { return err } input.Reason = strings.TrimSpace(input.Reason) if input.Reason == "" { input.Reason = "revoked by platform administrator" } if err := s.store.RevokeNodeIdentity(ctx, input); err != nil { if errors.Is(err, pgx.ErrNoRows) { return ErrInvalidPayload } return err } return nil }

// DisableClusterMembership disables a cluster membership through the store.
// The actor must pass ensurePlatformAdmin. The reason is trimmed and defaults
// to "disabled by platform administrator" when blank. A store pgx.ErrNoRows is
// reported as ErrInvalidPayload.
func (s *Service) DisableClusterMembership(ctx context.Context, input DisableMembershipInput) error {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return err
	}
	if input.Reason = strings.TrimSpace(input.Reason); input.Reason == "" {
		input.Reason = "disabled by platform administrator"
	}
	err := s.store.DisableClusterMembership(ctx, input)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, pgx.ErrNoRows):
		return ErrInvalidPayload
	default:
		return err
	}
}

// DeleteClusterNode deletes a node from a cluster through the store. The actor
// must pass ensurePlatformAdmin and the cluster must pass
// ensureClusterMutable. ClusterID and NodeID are required (ErrInvalidPayload).
// The reason is trimmed and defaults to "deleted by platform administrator"
// when blank. A store pgx.ErrNoRows is reported as ErrInvalidPayload.
func (s *Service) DeleteClusterNode(ctx context.Context, input DeleteClusterNodeInput) error {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return err
	}
	input.Reason = strings.TrimSpace(input.Reason)
	if input.ClusterID == "" || input.NodeID == "" {
		return ErrInvalidPayload
	}
	if input.Reason == "" {
		input.Reason = "deleted by platform administrator"
	}
	err := s.store.DeleteClusterNode(ctx, input)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, pgx.ErrNoRows):
		return ErrInvalidPayload
	default:
		return err
	}
}

// RecordHeartbeat persists a node heartbeat. It performs no actor check.
// ClusterID and NodeID are required (ErrInvalidPayload); an empty health
// status becomes "unknown" and the capabilities, service states and metadata
// documents default to `{}`. After the heartbeat is stored, route feedback
// recording and route-rebuild snapshot warming are attempted; their errors are
// ignored.
func (s *Service) RecordHeartbeat(ctx context.Context, input RecordHeartbeatInput) (NodeHeartbeat, error) { if input.ClusterID == "" || input.NodeID == "" { return NodeHeartbeat{}, ErrInvalidPayload } if input.HealthStatus == "" { input.HealthStatus = "unknown" } input.Capabilities = defaultJSON(input.Capabilities, `{}`) input.ServiceStates = defaultJSON(input.ServiceStates, `{}`) input.Metadata = defaultJSON(input.Metadata, `{}`) heartbeat, err := s.store.RecordHeartbeat(ctx, input) if err != nil { return NodeHeartbeat{}, err } _ =
s.recordFabricServiceChannelRouteFeedback(ctx, heartbeat) _ = s.autoWarmFabricServiceChannelRouteRebuildSnapshotsAfterHeartbeat(ctx, heartbeat) return heartbeat, nil }

// ListNodeHeartbeats returns up to limit heartbeats for the given cluster and
// node from the store once the actor has passed ensurePlatformAdmin. The limit
// is passed to the store unchanged.
func (s *Service) ListNodeHeartbeats(ctx context.Context, actorUserID, clusterID, nodeID string, limit int) ([]NodeHeartbeat, error) {
	authErr := s.ensurePlatformAdmin(ctx, actorUserID)
	if authErr != nil {
		return nil, authErr
	}
	return s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, limit)
}

// ListFabricServiceChannelRouteFeedback returns the route feedback
// observations for a cluster after they have been run through the feedback
// report builder together with the cluster's recovery policy and the
// provenance derived from its route intents.
//
// The actor must pass ensurePlatformAdmin. The string filters are trimmed and
// ClusterID is required (ErrInvalidPayload). A zero input.Now is replaced by
// the service clock. A nil report yields a nil slice and no error.
func (s *Service) ListFabricServiceChannelRouteFeedback(ctx context.Context, actorUserID string, input ListFabricServiceChannelRouteFeedbackInput) ([]FabricServiceChannelRouteFeedbackObservation, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return nil, err
	}
	for _, field := range []*string{
		&input.ClusterID,
		&input.ReporterNodeID,
		&input.RouteID,
		&input.ServiceClass,
		&input.FeedbackStatus,
	} {
		*field = strings.TrimSpace(*field)
	}
	if input.ClusterID == "" {
		return nil, ErrInvalidPayload
	}
	if input.Now.IsZero() {
		input.Now = s.now()
	}

	observations, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, input)
	if err != nil {
		return nil, err
	}
	policy := s.fabricServiceChannelRecoveryPolicy(ctx, input.ClusterID)
	intents, err := s.store.ListRouteIntents(ctx, input.ClusterID)
	if err != nil {
		return nil, err
	}
	provenance := fabricServiceChannelRouteProvenanceFromIntents(intents)
	report := serviceChannelRouteFeedbackReportWithPolicyAndProvenance(observations, input.Now, policy, provenance)
	if report == nil {
		return nil, nil
	}
	return report.Observations, nil
}

// ListFabricServiceChannelRouteRebuildAttempts lists route rebuild attempts
// for a cluster.
//
// The actor must pass ensurePlatformAdmin. The string filters are trimmed and
// ClusterID is required (ErrInvalidPayload). A negative Offset is reset to 0
// and an empty EnrichmentMode becomes "summary". Only the "deep" mode returns
// attempts passed through enrichFabricServiceChannelRouteRebuildAttempts;
// every other mode returns them passed through
// stripFabricServiceChannelRouteRebuildCorrelation.
func (s *Service) ListFabricServiceChannelRouteRebuildAttempts(ctx context.Context, actorUserID string, input ListFabricServiceChannelRouteRebuildAttemptsInput) ([]FabricServiceChannelRouteRebuildAttempt, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return nil, err
	}
	for _, field := range []*string{
		&input.ClusterID,
		&input.ReporterNodeID,
		&input.RouteID,
		&input.ReplacementRouteID,
		&input.ServiceClass,
		&input.RebuildStatus,
		&input.RebuildRequestID,
		&input.Generation,
		&input.FeedbackSource,
		&input.FeedbackChannelID,
		&input.FeedbackViolationStatus,
		&input.EnrichmentMode,
	} {
		*field = strings.TrimSpace(*field)
	}
	if input.ClusterID == "" {
		return nil, ErrInvalidPayload
	}
	if input.Offset < 0 {
		input.Offset = 0
	}
	if input.EnrichmentMode == "" {
		input.EnrichmentMode = "summary"
	}

	attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, input)
	if err != nil {
		return nil, err
	}
	if input.EnrichmentMode == "deep" {
		return s.enrichFabricServiceChannelRouteRebuildAttempts(ctx, input.ClusterID, attempts, s.now()), nil
	}
	return stripFabricServiceChannelRouteRebuildCorrelation(attempts), nil
}

func (s *Service) GetFabricServiceChannelRouteRebuildHealthSummary(ctx context.Context, actorUserID string, input GetFabricServiceChannelRouteRebuildHealthSummaryInput) (FabricServiceChannelRouteRebuildHealthSummary, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return FabricServiceChannelRouteRebuildHealthSummary{}, err } input.ClusterID = strings.TrimSpace(input.ClusterID) if input.ClusterID == "" { return FabricServiceChannelRouteRebuildHealthSummary{}, ErrInvalidPayload } if input.Limit <= 0 || input.Limit > 500 { input.Limit = 200 } now := s.now() if now.IsZero() { now = time.Now().UTC() } items, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{ ClusterID: input.ClusterID, Limit: input.Limit,
UseCachedSnapshot: true, }) if err != nil { return FabricServiceChannelRouteRebuildHealthSummary{}, err } items = s.enrichFabricServiceChannelRouteRebuildAttempts(ctx, input.ClusterID, items, now) silences, err := s.store.ListFabricServiceChannelRouteRebuildAlertSilences(ctx, input.ClusterID, now) if err != nil { return FabricServiceChannelRouteRebuildHealthSummary{}, err } items = applyFabricServiceChannelRouteRebuildAlertSilences(items, silences) summary := FabricServiceChannelRouteRebuildHealthSummary{ ClusterID: input.ClusterID, ObservedAt: now.UTC(), WindowLimit: input.Limit, TotalAttempts: len(items), CountsByGuardStatus: map[string]int{}, CountsByGuardSeverity: map[string]int{}, } affectedNodes := map[string]struct{}{} affectedRoutes := map[string]struct{}{} feedbackBreakdowns := map[string]*fabricServiceChannelRebuildFeedbackBreakdownAccumulator{} for _, item := range items { severity := firstNonEmptyString(item.GuardSeverity, "unknown") status := firstNonEmptyString(item.GuardStatus, "unknown") summary.CountsByGuardSeverity[severity]++ summary.CountsByGuardStatus[status]++ switch severity { case "good": summary.GoodCount++ case "warn": summary.WarnCount++ if !item.AlertSilenced { summary.ActiveWarnCount++ } case "bad": summary.BadCount++ if !item.AlertSilenced { summary.ActiveBadCount++ } default: summary.UnknownCount++ } if item.AlertSilenced { summary.SilencedCount++ } if item.AlertResurfaced { summary.ResurfacedCount++ } if item.RebuildStatus == "applied" { summary.AppliedCount++ } else if item.RebuildStatus != "" { summary.PendingCount++ } if (severity == "bad" || severity == "warn") && !item.AlertSilenced { if item.ReporterNodeID != "" { affectedNodes[item.ReporterNodeID] = struct{}{} } if item.RouteID != "" { affectedRoutes[item.RouteID] = struct{}{} } } if severity == "bad" && !item.AlertSilenced && len(summary.MostRecentBadAttempts) < 10 { summary.MostRecentBadAttempts = append(summary.MostRecentBadAttempts, item) } if item.AlertResurfaced && 
len(summary.ResurfacedAttempts) < 10 { summary.ResurfacedAttempts = append(summary.ResurfacedAttempts, item) } addFabricServiceChannelRebuildFeedbackBreakdown(feedbackBreakdowns, item, severity) } if accessTelemetry, err := s.GetFabricServiceChannelAccessTelemetry(ctx, actorUserID, GetFabricServiceChannelAccessTelemetryInput{ ClusterID: input.ClusterID, Limit: input.Limit, Now: now, }); err == nil { summary.AccessRouteDecisionCount = accessTelemetry.RouteDecisionChannelCount summary.AccessReplacementCount = accessTelemetry.ReplacementDecisionCount summary.AccessAppliedCount = accessTelemetry.AppliedRebuildDecisionCount summary.AccessRecoveryCount = accessTelemetry.RecoveryDecisionCount summary.AccessNoSafeCount = accessTelemetry.NoSafeRecoveryDecisionCount accessIncidents := append( fabricServiceChannelAccessDecisionIncidents(input.ClusterID, accessTelemetry), fabricServiceChannelDataPlaneContractIncidents(input.ClusterID, accessTelemetry)..., ) for _, incident := range applyFabricServiceChannelAccessDecisionIncidentSilences(accessIncidents, silences) { summary.CountsByGuardStatus[incident.GuardStatus]++ summary.CountsByGuardSeverity[incident.GuardSeverity]++ if incident.AlertSilenced { summary.SilencedCount++ } if incident.AlertResurfaced { summary.ResurfacedCount++ } switch incident.GuardSeverity { case "good": summary.GoodCount++ case "warn": summary.WarnCount++ if !incident.AlertSilenced { summary.ActiveWarnCount++ } case "bad": summary.BadCount++ if !incident.AlertSilenced { summary.ActiveBadCount++ } default: summary.UnknownCount++ } if (incident.GuardSeverity == "bad" || incident.GuardSeverity == "warn") && !incident.AlertSilenced { if incident.ReporterNodeID != "" { affectedNodes[incident.ReporterNodeID] = struct{}{} } if incident.RouteID != "" { affectedRoutes[incident.RouteID] = struct{}{} } } } } summary.AffectedReporterNodeIDs = sortedStringSetKeys(affectedNodes) summary.AffectedRouteIDs = sortedStringSetKeys(affectedRoutes) summary.FeedbackBreakdowns = 
sortedFabricServiceChannelRebuildFeedbackBreakdowns(feedbackBreakdowns) summary.RecommendedOperatorAction = fabricServiceChannelRebuildRecommendedAction(summary) return summary, nil } type fabricServiceChannelRebuildFeedbackBreakdownAccumulator struct { item FabricServiceChannelRouteRebuildFeedbackHealthBreakdown nodes map[string]struct{} routes map[string]struct{} } func addFabricServiceChannelRebuildFeedbackBreakdown(out map[string]*fabricServiceChannelRebuildFeedbackBreakdownAccumulator, attempt FabricServiceChannelRouteRebuildAttempt, severity string) { payload := jsonObject(attempt.Payload) source := firstNonEmptyString(attempt.FeedbackSource, jsonString(payload, "feedback_source")) channelID := firstNonEmptyString(attempt.FeedbackChannelID, jsonString(payload, "feedback_channel_id")) violationStatus := firstNonEmptyString(attempt.FeedbackViolationStatus, jsonString(payload, "feedback_violation_status")) if source == "" && channelID == "" && violationStatus == "" { return } key := source + "\x00" + channelID + "\x00" + violationStatus acc := out[key] if acc == nil { acc = &fabricServiceChannelRebuildFeedbackBreakdownAccumulator{ item: FabricServiceChannelRouteRebuildFeedbackHealthBreakdown{ FeedbackSource: source, FeedbackChannelID: channelID, FeedbackViolationStatus: violationStatus, }, nodes: map[string]struct{}{}, routes: map[string]struct{}{}, } out[key] = acc } acc.item.TotalCount++ switch severity { case "good": acc.item.GoodCount++ case "warn": acc.item.WarnCount++ if !attempt.AlertSilenced { acc.item.ActiveWarnCount++ } case "bad": acc.item.BadCount++ if !attempt.AlertSilenced { acc.item.ActiveBadCount++ } default: acc.item.UnknownCount++ } if attempt.AlertSilenced { acc.item.SilencedCount++ } observedAt := time.Time{} if attempt.FeedbackObservedAt != nil { observedAt = attempt.FeedbackObservedAt.UTC() } else if value := strings.TrimSpace(jsonString(payload, "feedback_observed_at")); value != "" { if parsed, err := time.Parse(time.RFC3339Nano, value); 
err == nil {
			observedAt = parsed.UTC()
		}
	}
	// Fall back to the attempt's own update time when neither the typed field
	// nor the payload produced a usable feedback timestamp.
	if observedAt.IsZero() {
		observedAt = attempt.UpdatedAt.UTC()
	}
	if observedAt.After(acc.item.LatestObservedAt) {
		acc.item.LatestObservedAt = observedAt
	}
	if attempt.ReporterNodeID != "" {
		acc.nodes[attempt.ReporterNodeID] = struct{}{}
	}
	if attempt.RouteID != "" {
		acc.routes[attempt.RouteID] = struct{}{}
	}
}

// sortedFabricServiceChannelRebuildFeedbackBreakdowns flattens the
// accumulators into a slice, attaches the sorted affected node and route IDs,
// and returns at most the first 100 breakdowns.
//
// Ordering, most significant first:
//  1. weighted activity score (ActiveBadCount*100000 + ActiveWarnCount*1000 +
//     TotalCount), descending;
//  2. LatestObservedAt, most recent first;
//  3. FeedbackSource, FeedbackChannelID, FeedbackViolationStatus, ascending,
//     compared field by field.
//
// The last step compares the fields individually instead of their plain
// concatenation: concatenating without a separator makes distinct keys such as
// ("a", "bc") and ("ab", "c") compare equal, and because the input is a map
// (unspecified iteration order) such ties would make both the order and the
// 100-entry cut vary from call to call.
func sortedFabricServiceChannelRebuildFeedbackBreakdowns(input map[string]*fabricServiceChannelRebuildFeedbackBreakdownAccumulator) []FabricServiceChannelRouteRebuildFeedbackHealthBreakdown {
	out := make([]FabricServiceChannelRouteRebuildFeedbackHealthBreakdown, 0, len(input))
	for _, acc := range input {
		item := acc.item
		item.AffectedReporterNodeIDs = sortedStringSetKeys(acc.nodes)
		item.AffectedRouteIDs = sortedStringSetKeys(acc.routes)
		out = append(out, item)
	}
	sort.SliceStable(out, func(i, j int) bool {
		leftActive := out[i].ActiveBadCount*100000 + out[i].ActiveWarnCount*1000 + out[i].TotalCount
		rightActive := out[j].ActiveBadCount*100000 + out[j].ActiveWarnCount*1000 + out[j].TotalCount
		if leftActive != rightActive {
			return leftActive > rightActive
		}
		if !out[i].LatestObservedAt.Equal(out[j].LatestObservedAt) {
			return out[i].LatestObservedAt.After(out[j].LatestObservedAt)
		}
		// Deterministic final tie-break on the identifying fields.
		if out[i].FeedbackSource != out[j].FeedbackSource {
			return out[i].FeedbackSource < out[j].FeedbackSource
		}
		if out[i].FeedbackChannelID != out[j].FeedbackChannelID {
			return out[i].FeedbackChannelID < out[j].FeedbackChannelID
		}
		return out[i].FeedbackViolationStatus < out[j].FeedbackViolationStatus
	})
	if len(out) > 100 {
		out = out[:100]
	}
	return out
}

// GetFabricServiceChannelReadiness derives a readiness verdict from the
// rebuild health summary of the cluster. Limit is reset to 5 when it is
// non-positive or greater than 5. Authorization and ClusterID validation are
// performed by GetFabricServiceChannelRouteRebuildHealthSummary, whose error
// is returned unchanged.
func (s *Service) GetFabricServiceChannelReadiness(ctx context.Context, actorUserID string, input GetFabricServiceChannelReadinessInput) (FabricServiceChannelReadiness, error) {
	if input.Limit <= 0 || input.Limit > 5 {
		input.Limit = 5
	}
	summary, err := s.GetFabricServiceChannelRouteRebuildHealthSummary(ctx, actorUserID, GetFabricServiceChannelRouteRebuildHealthSummaryInput{
		ClusterID: input.ClusterID,
		Limit:     input.Limit,
	})
	if err != nil {
		return FabricServiceChannelReadiness{}, err
	}
	return
fabricServiceChannelReadinessFromRebuildHealth(summary), nil
}

// GetFabricServiceChannelSchemaStatus returns the store's schema status for a
// cluster. The caller must pass the platform-admin check; ClusterID is trimmed
// and an empty value yields ErrInvalidPayload.
func (s *Service) GetFabricServiceChannelSchemaStatus(ctx context.Context, actorUserID string, input GetFabricServiceChannelSchemaStatusInput) (FabricServiceChannelSchemaStatus, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return FabricServiceChannelSchemaStatus{}, err
	}
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	if input.ClusterID == "" {
		return FabricServiceChannelSchemaStatus{}, ErrInvalidPayload
	}
	return s.store.GetFabricServiceChannelSchemaStatus(ctx, input)
}

// GetFabricServiceChannelRebuildSnapshotMaintenanceHealth reports how well
// correlation snapshots are being produced for recent rebuild attempts.
//
// Input handling: ClusterID is trimmed (empty yields ErrInvalidPayload);
// Limit defaults to 50 and is capped at 100; MinAgeSeconds defaults to 60 and
// is capped at 3600; HeartbeatThreshold defaults to 2 and is capped at 10.
//
// An attempt without a correlation snapshot is counted as overdue when it is
// at least MinAgeSeconds old, names a reporter node, and at least
// HeartbeatThreshold of that node's listed heartbeats were observed at or
// after the attempt's UpdatedAt. Auto-warmup audit events (up to 100) are
// tallied per cluster and per node. Status becomes "degraded" when auto-warmup
// errors were seen or overdue attempts exist; the overdue reason takes
// precedence because it is applied last.
func (s *Service) GetFabricServiceChannelRebuildSnapshotMaintenanceHealth(ctx context.Context, actorUserID string, input GetFabricServiceChannelRebuildSnapshotMaintenanceHealthInput) (FabricServiceChannelRebuildSnapshotMaintenanceHealth, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, err
	}
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	if input.ClusterID == "" {
		return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, ErrInvalidPayload
	}
	if input.Limit <= 0 {
		input.Limit = 50
	}
	if input.Limit > 100 {
		input.Limit = 100
	}
	if input.MinAgeSeconds <= 0 {
		input.MinAgeSeconds = 60
	}
	if input.MinAgeSeconds > 3600 {
		input.MinAgeSeconds = 3600
	}
	if input.HeartbeatThreshold <= 0 {
		input.HeartbeatThreshold = 2
	}
	if input.HeartbeatThreshold > 10 {
		input.HeartbeatThreshold = 10
	}
	now := s.now()
	if now.IsZero() {
		now = time.Now().UTC()
	}
	out := FabricServiceChannelRebuildSnapshotMaintenanceHealth{
		ClusterID:          input.ClusterID,
		ObservedAt:         now.UTC(),
		Status:             "ready",
		Reason:             "snapshot_maintenance_ready",
		WindowLimit:        input.Limit,
		MinAgeSeconds:      input.MinAgeSeconds,
		HeartbeatThreshold: input.HeartbeatThreshold,
	}
	attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
		ClusterID: input.ClusterID,
		Limit:     input.Limit,
	})
	if err != nil {
		return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, err
	}
	// Heartbeats are fetched at most once per reporter node.
	heartbeatsByNode := map[string][]NodeHeartbeat{}
	nodes := map[string]*FabricServiceChannelRebuildSnapshotNodeHealth{}
	// nodeHealth returns the per-node record, creating it on first use.
	// Blank node IDs are grouped under "unknown".
	nodeHealth := func(nodeID string) *FabricServiceChannelRebuildSnapshotNodeHealth {
		nodeID = strings.TrimSpace(nodeID)
		if nodeID == "" {
			nodeID = "unknown"
		}
		if item, ok := nodes[nodeID]; ok {
			return item
		}
		item := &FabricServiceChannelRebuildSnapshotNodeHealth{NodeID: nodeID}
		nodes[nodeID] = item
		return item
	}
	for _, attempt := range attempts {
		out.RecentAttemptCount++
		node := nodeHealth(attempt.ReporterNodeID)
		node.RecentAttemptCount++
		if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(attempt) {
			out.ValidSnapshotCount++
			node.ValidSnapshotCount++
			continue
		}
		out.MissingSnapshotCount++
		node.MissingSnapshotCount++
		// Attempts younger than MinAgeSeconds are not yet checked for being overdue.
		ageSeconds := int64(now.Sub(attempt.UpdatedAt).Seconds())
		if ageSeconds < input.MinAgeSeconds {
			continue
		}
		reporterNodeID := strings.TrimSpace(attempt.ReporterNodeID)
		if reporterNodeID == "" {
			continue
		}
		heartbeats, ok := heartbeatsByNode[reporterNodeID]
		if !ok {
			heartbeats, err = s.store.ListNodeHeartbeats(ctx, input.ClusterID, reporterNodeID, input.HeartbeatThreshold+5)
			// A failed lookup is treated as "no heartbeats" and cached, so
			// the node is not queried again during this call.
			if err != nil {
				heartbeats = nil
			}
			heartbeatsByNode[reporterNodeID] = heartbeats
		}
		heartbeatAfterAttemptCount := 0
		for _, heartbeat := range heartbeats {
			observedAt := heartbeat.ObservedAt
			if node.LastHeartbeatAt == nil || observedAt.After(*node.LastHeartbeatAt) {
				value := observedAt
				node.LastHeartbeatAt = &value
			}
			if observedAt.After(attempt.UpdatedAt) || observedAt.Equal(attempt.UpdatedAt) {
				heartbeatAfterAttemptCount++
			}
		}
		// The node keeps the highest count seen across its attempts.
		if heartbeatAfterAttemptCount > node.HeartbeatAfterAttemptCount {
			node.HeartbeatAfterAttemptCount = heartbeatAfterAttemptCount
		}
		if heartbeatAfterAttemptCount >= input.HeartbeatThreshold {
			out.OverdueMissingSnapshotCount++
			node.OverdueMissingSnapshotCount++
			if len(out.OverdueMissingSnapshotAttempts) < 10 {
				out.OverdueMissingSnapshotAttempts = append(out.OverdueMissingSnapshotAttempts, attempt)
			}
		}
	}
	events, err := s.store.ListAuditEvents(ctx, ListAuditEventsInput{
		ClusterID:  input.ClusterID,
		EventTypes: []string{"fabric.service_channel_rebuild_snapshot.auto_warmup"},
		Limit:      100,
	})
	if err != nil {
		return FabricServiceChannelRebuildSnapshotMaintenanceHealth{}, err
	}
	for _, event := range events {
		// Re-check the type in case the store returns other events.
		if event.EventType != "fabric.service_channel_rebuild_snapshot.auto_warmup" {
			continue
		}
		payload := jsonObject(event.Payload)
		nodeID := jsonString(payload, "reporter_node_id")
		node := nodeHealth(nodeID)
		out.AutoWarmupEventCount++
		out.AutoWarmupWarmedCount += jsonInt(payload, "warmed_count")
		out.AutoWarmupAlreadyFreshCount += jsonInt(payload, "already_fresh_count")
		out.AutoWarmupErrorCount += jsonInt(payload, "error_count")
		node.AutoWarmupEventCount++
		node.AutoWarmupWarmedCount += jsonInt(payload, "warmed_count")
		node.AutoWarmupErrorCount += jsonInt(payload, "error_count")
		createdAt := event.CreatedAt
		if out.LatestAutoWarmupAt == nil || createdAt.After(*out.LatestAutoWarmupAt) {
			value := createdAt
			out.LatestAutoWarmupAt = &value
		}
		if node.LatestAutoWarmupAt == nil || createdAt.After(*node.LatestAutoWarmupAt) {
			value := createdAt
			node.LatestAutoWarmupAt = &value
		}
	}
	out.Nodes = make([]FabricServiceChannelRebuildSnapshotNodeHealth, 0, len(nodes))
	for _, item := range nodes {
		out.Nodes = append(out.Nodes, *item)
	}
	// Worst nodes first: most overdue, then most missing, then by node ID.
	sort.Slice(out.Nodes, func(i, j int) bool {
		if out.Nodes[i].OverdueMissingSnapshotCount != out.Nodes[j].OverdueMissingSnapshotCount {
			return out.Nodes[i].OverdueMissingSnapshotCount > out.Nodes[j].OverdueMissingSnapshotCount
		}
		if out.Nodes[i].MissingSnapshotCount != out.Nodes[j].MissingSnapshotCount {
			return out.Nodes[i].MissingSnapshotCount > out.Nodes[j].MissingSnapshotCount
		}
		return out.Nodes[i].NodeID < out.Nodes[j].NodeID
	})
	if out.AutoWarmupErrorCount > 0 {
		out.Status = "degraded"
		out.Reason = "auto_warmup_errors_seen"
		out.RecommendedOperatorAction = "Check backend logs and heartbeat metadata for nodes with auto-warmup errors."
	}
	// Applied after the auto-warmup check, so overdue snapshots override its
	// reason and recommended action.
	if out.OverdueMissingSnapshotCount > 0 {
		out.Status = "degraded"
		out.Reason = "snapshot_warmup_overdue"
		out.RecommendedOperatorAction = "Run warm snapshots or inspect reporter nodes whose heartbeat evidence is not producing rebuild snapshots."
	}
	if out.MissingSnapshotCount > 0 && out.OverdueMissingSnapshotCount == 0 && out.RecommendedOperatorAction == "" {
		out.RecommendedOperatorAction = "Recent attempts are still waiting for runtime heartbeat evidence."
	}
	return out, nil
}

// WarmupFabricServiceChannelRebuildSnapshots computes and stores correlation
// snapshots for recent rebuild attempts that do not have one yet.
//
// Input handling: the actor must pass the platform-admin check; ClusterID is
// trimmed (empty yields ErrInvalidPayload); Limit is reset to 10 when it is
// non-positive or greater than 50; StaleAfterSeconds is reset to 60 when it is
// non-positive or longer than 24 hours. The observation time is input.Now,
// then s.now(), then the wall clock.
//
// Attempts with a stale snapshot are only counted (StaleSnapshotCount and
// DeferredStaleCount) and attempts with a fresh snapshot are skipped. The
// result is "degraded" when any error was counted.
func (s *Service) WarmupFabricServiceChannelRebuildSnapshots(ctx context.Context, input WarmupFabricServiceChannelRebuildSnapshotsInput) (FabricServiceChannelRebuildSnapshotWarmup, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return FabricServiceChannelRebuildSnapshotWarmup{}, err
	}
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	if input.ClusterID == "" {
		return FabricServiceChannelRebuildSnapshotWarmup{}, ErrInvalidPayload
	}
	if input.Limit <= 0 || input.Limit > 50 {
		input.Limit = 10
	}
	if input.StaleAfterSeconds <= 0 || input.StaleAfterSeconds > int64((24*time.Hour).Seconds()) {
		input.StaleAfterSeconds = 60
	}
	now := input.Now
	if now.IsZero() {
		now = s.now()
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}
	result := FabricServiceChannelRebuildSnapshotWarmup{
		ClusterID:         input.ClusterID,
		ObservedAt:        now.UTC(),
		WindowLimit:       input.Limit,
		StaleAfterSeconds: input.StaleAfterSeconds,
		Status:            "ready",
		Reason:            "snapshots_warmed",
	}
	items, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
		ClusterID: input.ClusterID,
		Limit:     input.Limit,
	})
	if err != nil {
		return FabricServiceChannelRebuildSnapshotWarmup{}, err
	}
	result.ScannedCount = len(items)
	heartbeatsByNode := map[string][]NodeHeartbeat{}
	staleAfter := time.Duration(input.StaleAfterSeconds) * time.Second
	for _, item := range items {
		// Only attempts without a snapshot fall through to the warm-up below.
		if !fabricServiceChannelRouteRebuildHasCorrelationSnapshot(item) {
			result.MissingSnapshotCount++
		} else if fabricServiceChannelRouteRebuildSnapshotIsStale(item, now, staleAfter) {
			result.StaleSnapshotCount++
			result.DeferredStaleCount++
			continue
		} else {
			result.AlreadyFreshCount++
			continue
		}
		nodeID := strings.TrimSpace(item.ReporterNodeID)
		if nodeID == "" {
			result.ErrorCount++
			continue
		}
		if _, ok := heartbeatsByNode[nodeID]; !ok {
			heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, nodeID, 120)
			// A failed lookup is counted once per node; the nil result is
			// cached and the attempt is still enriched without heartbeats.
			if err != nil {
				result.ErrorCount++
				heartbeats = nil
			}
			heartbeatsByNode[nodeID] = heartbeats
		}
		item = enrichFabricServiceChannelRouteRebuildAttempt(item, heartbeatsByNode[nodeID], now)
		item.CorrelationSnapshotAt = &now
		if err := s.store.UpdateFabricServiceChannelRouteRebuildCorrelationSnapshot(ctx, fabricServiceChannelRouteRebuildCorrelationSnapshotInput(item, now)); err != nil {
			result.ErrorCount++
			continue
		}
		result.WarmedCount++
	}
	if result.ErrorCount > 0 {
		result.Status = "degraded"
		result.Reason = "snapshot_warmup_partial"
		result.RecommendedOperatorAction = "Check node heartbeat history and backend logs for rebuild snapshot warmup failures."
	} else if result.DeferredStaleCount > 0 {
		result.Status = "ready"
		result.Reason = "missing_snapshots_warmed_stale_deferred"
		result.RecommendedOperatorAction = "Stale snapshots were detected and left cached; age-sensitive guard state is recomputed on read."
	}
	return result, nil
}

// ListFabricServiceChannelRouteRebuildIncidents returns incidents derived from
// recent rebuild attempts and, when access telemetry can be loaded, from
// access-decision and data-plane-contract telemetry.
//
// The caller must pass the platform-admin check; ClusterID is trimmed (empty
// yields ErrInvalidPayload); Limit is reset to 5 when it is non-positive or
// greater than 5. Alert silences active at the observation time are applied to
// both kinds of incident, and the result is truncated to Limit entries.
func (s *Service) ListFabricServiceChannelRouteRebuildIncidents(ctx context.Context, actorUserID string, input ListFabricServiceChannelRouteRebuildIncidentsInput) ([]FabricServiceChannelRouteRebuildIncident, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return nil, err
	}
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	if input.ClusterID == "" {
		return nil, ErrInvalidPayload
	}
	if input.Limit <= 0 || input.Limit > 5 {
		input.Limit = 5
	}
	now := s.now()
	if now.IsZero() {
		now = time.Now().UTC()
	}
	items, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
		ClusterID:         input.ClusterID,
		Limit:             input.Limit,
		UseCachedSnapshot: true,
	})
	if err != nil {
		return nil, err
	}
	items = s.enrichFabricServiceChannelRouteRebuildAttempts(ctx, input.ClusterID, items, now)
	silences, err := s.store.ListFabricServiceChannelRouteRebuildAlertSilences(ctx, input.ClusterID, now)
	if err != nil {
		return nil, err
	}
	items = applyFabricServiceChannelRouteRebuildAlertSilences(items, silences)
	incidents := fabricServiceChannelRouteRebuildIncidentsFromAttempts(input.ClusterID, items)
	// Access telemetry is best-effort; on error only attempt-derived
	// incidents are returned.
	if accessTelemetry, err := s.GetFabricServiceChannelAccessTelemetry(ctx, actorUserID, GetFabricServiceChannelAccessTelemetryInput{
		ClusterID: input.ClusterID,
		Limit:     input.Limit,
		Now:       now,
	}); err == nil {
		accessIncidents := append(
			fabricServiceChannelAccessDecisionIncidents(input.ClusterID, accessTelemetry),
			fabricServiceChannelDataPlaneContractIncidents(input.ClusterID, accessTelemetry)...,
		)
		incidents = append(incidents, applyFabricServiceChannelAccessDecisionIncidentSilences(accessIncidents, silences)...)
fabricServiceChannelSortRouteRebuildIncidents(incidents) } if len(incidents) > input.Limit { incidents = incidents[:input.Limit] } return incidents, nil } func (s *Service) RecordFabricServiceChannelRouteRebuildInvestigation(ctx context.Context, input RecordFabricServiceChannelRouteRebuildInvestigationInput) error { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return err } input.ClusterID = strings.TrimSpace(input.ClusterID) input.ReporterNodeID = strings.TrimSpace(input.ReporterNodeID) input.RouteID = strings.TrimSpace(input.RouteID) input.ServiceClass = strings.TrimSpace(input.ServiceClass) input.Generation = strings.TrimSpace(input.Generation) input.GuardStatus = strings.TrimSpace(input.GuardStatus) input.IncidentID = strings.TrimSpace(input.IncidentID) input.FeedbackSource = strings.TrimSpace(input.FeedbackSource) input.FeedbackChannelID = strings.TrimSpace(input.FeedbackChannelID) input.FeedbackViolationStatus = strings.TrimSpace(input.FeedbackViolationStatus) input.DrilldownSource = strings.TrimSpace(input.DrilldownSource) input.Reason = strings.TrimSpace(input.Reason) if input.ClusterID == "" || (input.ReporterNodeID == "" && input.RouteID == "" && input.FeedbackSource == "" && input.FeedbackChannelID == "" && input.FeedbackViolationStatus == "") { return ErrInvalidPayload } now := input.Now if now.IsZero() { now = s.now() } if now.IsZero() { now = time.Now().UTC() } eventType := "fabric.service_channel_rebuild_incident.investigation_opened" targetType := "fabric_service_channel_route_rebuild_incident" targetIDValue := firstNonEmptyString(input.RouteID, input.FeedbackChannelID, input.FeedbackViolationStatus, input.FeedbackSource, input.ReporterNodeID) if input.DrilldownSource == "rebuild_health_feedback_breakdown" || input.FeedbackSource != "" || input.FeedbackChannelID != "" || input.FeedbackViolationStatus != "" { eventType = "fabric.service_channel_rebuild_feedback_breakdown.investigation_opened" targetType = 
"fabric_service_channel_rebuild_feedback_breakdown" } return s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: eventType, TargetType: targetType, TargetID: &targetIDValue, Payload: mustJSONRaw(map[string]any{ "incident_id": input.IncidentID, "reporter_node_id": input.ReporterNodeID, "route_id": input.RouteID, "service_class": input.ServiceClass, "generation": input.Generation, "guard_status": input.GuardStatus, "feedback_source": input.FeedbackSource, "feedback_channel_id": input.FeedbackChannelID, "feedback_violation_status": input.FeedbackViolationStatus, "drilldown_source": input.DrilldownSource, "reason": input.Reason, }), CreatedAt: now.UTC(), }) } func (s *Service) SilenceFabricServiceChannelRouteRebuildAlert(ctx context.Context, input SilenceFabricServiceChannelRouteRebuildAlertInput) (FabricServiceChannelRouteRebuildAlertSilence, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return FabricServiceChannelRouteRebuildAlertSilence{}, err } input.ClusterID = strings.TrimSpace(input.ClusterID) input.ReporterNodeID = strings.TrimSpace(input.ReporterNodeID) input.RouteID = strings.TrimSpace(input.RouteID) input.GuardStatus = strings.TrimSpace(input.GuardStatus) input.Generation = strings.TrimSpace(input.Generation) input.Reason = strings.TrimSpace(input.Reason) input.IncidentSource = strings.TrimSpace(input.IncidentSource) input.ChannelID = strings.TrimSpace(input.ChannelID) if input.ClusterID == "" || input.ReporterNodeID == "" || input.RouteID == "" || input.GuardStatus == "" { return FabricServiceChannelRouteRebuildAlertSilence{}, ErrInvalidPayload } requestedRouteID := input.RouteID if input.IncidentSource == "access_decision" || input.IncidentSource == "data_plane_contract" { if input.ChannelID == "" { return FabricServiceChannelRouteRebuildAlertSilence{}, ErrInvalidPayload } input.RouteID = fabricServiceChannelAccessDecisionSilenceRouteID(input.ChannelID, 
input.RouteID)
	}
	if input.TTL <= 0 || input.TTL > 7*24*time.Hour {
		input.TTL = 6 * time.Hour
	}
	now := input.Now
	if now.IsZero() {
		now = s.now()
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}
	expiresAt := now.UTC().Add(input.TTL)
	silence, err := s.store.UpsertFabricServiceChannelRouteRebuildAlertSilence(ctx, input, expiresAt)
	if err != nil {
		return FabricServiceChannelRouteRebuildAlertSilence{}, err
	}
	// Audit recording is best-effort: its error is discarded and the stored
	// silence is returned regardless.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.service_channel_rebuild_alert.silenced",
		TargetType:  "fabric_service_channel_route_rebuild_alert",
		TargetID:    &input.RouteID,
		Payload: mustJSONRaw(map[string]any{
			"reporter_node_id": input.ReporterNodeID,
			"route_id":         requestedRouteID,
			"stored_route_id":  input.RouteID,
			"incident_source":  input.IncidentSource,
			"channel_id":       input.ChannelID,
			"guard_status":     input.GuardStatus,
			"generation":       input.Generation,
			"reason":           input.Reason,
			"expires_at":       expiresAt.UTC().Format(time.RFC3339Nano),
		}),
		CreatedAt: now.UTC(),
	})
	return silence, nil
}

// ListFabricServiceChannelRouteRebuildAlertSilences returns the store's alert
// silences for a cluster as of the given time.
//
// The caller must pass the platform-admin check. clusterID is trimmed and an
// empty value yields ErrInvalidPayload. A zero now is replaced by s.now(), and
// by the wall clock if that is zero as well.
func (s *Service) ListFabricServiceChannelRouteRebuildAlertSilences(ctx context.Context, actorUserID string, clusterID string, now time.Time) ([]FabricServiceChannelRouteRebuildAlertSilence, error) {
	err := s.ensurePlatformAdmin(ctx, actorUserID)
	if err != nil {
		return nil, err
	}

	trimmedClusterID := strings.TrimSpace(clusterID)
	if len(trimmedClusterID) == 0 {
		return nil, ErrInvalidPayload
	}

	asOf := now
	if asOf.IsZero() {
		asOf = s.now()
		if asOf.IsZero() {
			asOf = time.Now().UTC()
		}
	}
	return s.store.ListFabricServiceChannelRouteRebuildAlertSilences(ctx, trimmedClusterID, asOf)
}

// UnsilenceFabricServiceChannelRouteRebuildAlert deletes an alert silence and
// records an audit event describing the removed silence.
//
// The actor must pass the platform-admin check; ClusterID and SilenceID are
// required after trimming.
func (s *Service) UnsilenceFabricServiceChannelRouteRebuildAlert(ctx context.Context, input UnsilenceFabricServiceChannelRouteRebuildAlertInput) (FabricServiceChannelRouteRebuildAlertSilence, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return FabricServiceChannelRouteRebuildAlertSilence{}, err
	}
	input.ClusterID = strings.TrimSpace(input.ClusterID)
input.SilenceID = strings.TrimSpace(input.SilenceID)
	input.Reason = strings.TrimSpace(input.Reason)
	if input.ClusterID == "" || input.SilenceID == "" {
		return FabricServiceChannelRouteRebuildAlertSilence{}, ErrInvalidPayload
	}
	now := input.Now
	if now.IsZero() {
		now = s.now()
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}
	silence, err := s.store.DeleteFabricServiceChannelRouteRebuildAlertSilence(ctx, input)
	if err != nil {
		return FabricServiceChannelRouteRebuildAlertSilence{}, err
	}
	// Audit recording is best-effort: its error is discarded and the deleted
	// silence is returned regardless.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "fabric.service_channel_rebuild_alert.unsilenced",
		TargetType:  "fabric_service_channel_route_rebuild_alert_silence",
		TargetID:    &input.SilenceID,
		Payload: mustJSONRaw(map[string]any{
			"reporter_node_id": silence.ReporterNodeID,
			"route_id":         silence.DisplayRouteID,
			"stored_route_id":  silence.RouteID,
			"incident_source":  silence.IncidentSource,
			"channel_id":       silence.ChannelID,
			"guard_status":     silence.GuardStatus,
			"generation":       silence.Generation,
			"reason":           input.Reason,
			"unsilenced_at":    now.UTC().Format(time.RFC3339Nano),
		}),
		CreatedAt: now.UTC(),
	})
	return silence, nil
}

// enrichFabricServiceChannelRouteRebuildAttempts fills in correlation and
// guard data for every attempt, modifying the slice in place and returning it.
//
// Attempts that already carry a correlation snapshot only have their guard
// recomputed for now. The others are correlated against up to 120 heartbeats
// of their reporter node (fetched once per node; a failed fetch counts as no
// heartbeats). When that produces runtime evidence, the snapshot is stamped
// and written back to the store on a best-effort basis. Attempts without a
// reporter node and without a snapshot are left untouched.
func (s *Service) enrichFabricServiceChannelRouteRebuildAttempts(ctx context.Context, clusterID string, items []FabricServiceChannelRouteRebuildAttempt, now time.Time) []FabricServiceChannelRouteRebuildAttempt {
	if len(items) == 0 {
		return items
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}

	cache := make(map[string][]NodeHeartbeat)
	for i := range items {
		attempt := &items[i]
		if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(*attempt) {
			*attempt = applyFabricServiceChannelRouteRebuildGuard(*attempt, now)
			continue
		}

		reporter := strings.TrimSpace(attempt.ReporterNodeID)
		if reporter == "" {
			continue
		}

		history, cached := cache[reporter]
		if !cached {
			fetched, err := s.store.ListNodeHeartbeats(ctx, clusterID, reporter, 120)
			if err != nil {
				fetched = nil
			}
			history = fetched
			cache[reporter] = history
		}

		*attempt = enrichFabricServiceChannelRouteRebuildAttempt(*attempt, history, now)
		if !fabricServiceChannelRouteRebuildHasRuntimeEvidence(*attempt) {
			continue
		}
		attempt.CorrelationSnapshotAt = &now
		// Persisting the snapshot is best-effort; a failure only means it is
		// recomputed on a later read.
		_ = s.store.UpdateFabricServiceChannelRouteRebuildCorrelationSnapshot(ctx, fabricServiceChannelRouteRebuildCorrelationSnapshotInput(*attempt, now))
	}
	return items
}

// fabricServiceChannelRouteRebuildHasCorrelationSnapshot reports whether the
// attempt has a snapshot timestamp and that snapshot contains runtime
// evidence.
func fabricServiceChannelRouteRebuildHasCorrelationSnapshot(item FabricServiceChannelRouteRebuildAttempt) bool {
	if item.CorrelationSnapshotAt == nil {
		return false
	}
	return fabricServiceChannelRouteRebuildHasRuntimeEvidence(item)
}

// fabricServiceChannelRouteRebuildHasRuntimeEvidence reports whether any
// node-side signal is present on the attempt: a matched transition, a matched
// route generation, a selected route, or non-zero send counters.
func fabricServiceChannelRouteRebuildHasRuntimeEvidence(item FabricServiceChannelRouteRebuildAttempt) bool {
	switch {
	case item.NodeTransitionMatched, item.NodeRouteGenerationMatched:
		return true
	case item.PostRebuildSelectedRouteID != "":
		return true
	case item.PostRebuildSendPackets > 0, item.PostRebuildSendFlowPackets > 0:
		return true
	}
	return false
}

// fabricServiceChannelRouteRebuildSnapshotIsStale reports whether the
// attempt's correlation snapshot is older than staleAfter. A missing or zero
// snapshot time is always stale; a non-positive staleAfter disables the age
// check. A zero now is replaced by the wall clock.
func fabricServiceChannelRouteRebuildSnapshotIsStale(item FabricServiceChannelRouteRebuildAttempt, now time.Time, staleAfter time.Duration) bool {
	taken := item.CorrelationSnapshotAt
	switch {
	case taken == nil:
		return true
	case staleAfter <= 0:
		return false
	case taken.UTC().IsZero():
		return true
	}

	reference := now
	if reference.IsZero() {
		reference = time.Now().UTC()
	}
	age := reference.UTC().Sub(taken.UTC())
	return age > staleAfter
}

// stripFabricServiceChannelRouteRebuildCorrelation clears every correlation,
// guard and timeline field on the given attempts in place and returns the same
// slice.
func stripFabricServiceChannelRouteRebuildCorrelation(items []FabricServiceChannelRouteRebuildAttempt) []FabricServiceChannelRouteRebuildAttempt {
	for idx := range items {
		items[idx].NodeTransitionStatus = ""
		items[idx].NodeTransitionGeneration = ""
		items[idx].NodeTransitionObservedAt = ""
		items[idx].NodeTransitionMatched = false
		items[idx].NodeRouteGenerationStatus = ""
		items[idx].NodeRouteGenerationAppliedAt = ""
		items[idx].NodeRouteGenerationWithdrawnAt = ""
		items[idx].NodeRouteGenerationMatched = false
		items[idx].PostRebuildSelectedRouteID = ""
		items[idx].PostRebuildSendPackets = 0
		items[idx].PostRebuildSendFailures = 0
		items[idx].PostRebuildSendFlowPackets = 0
items[idx].PostRebuildSendFlowDropped = 0 items[idx].GuardStatus = "" items[idx].GuardSeverity = "" items[idx].GuardReason = "" items[idx].GuardAgeSeconds = 0 items[idx].GuardTransitionDeadlineSeconds = 0 items[idx].GuardTrafficDeadlineSeconds = 0 items[idx].Timeline = nil items[idx].CorrelationSnapshotAt = nil } return items } func fabricServiceChannelRouteRebuildCorrelationSnapshotInput(item FabricServiceChannelRouteRebuildAttempt, now time.Time) UpdateFabricServiceChannelRouteRebuildCorrelationSnapshotInput { if now.IsZero() { now = time.Now().UTC() } return UpdateFabricServiceChannelRouteRebuildCorrelationSnapshotInput{ ID: item.ID, NodeTransitionStatus: item.NodeTransitionStatus, NodeTransitionGeneration: item.NodeTransitionGeneration, NodeTransitionObservedAt: item.NodeTransitionObservedAt, NodeTransitionMatched: item.NodeTransitionMatched, NodeRouteGenerationStatus: item.NodeRouteGenerationStatus, NodeRouteGenerationAppliedAt: item.NodeRouteGenerationAppliedAt, NodeRouteGenerationWithdrawnAt: item.NodeRouteGenerationWithdrawnAt, NodeRouteGenerationMatched: item.NodeRouteGenerationMatched, PostRebuildSelectedRouteID: item.PostRebuildSelectedRouteID, PostRebuildSendPackets: item.PostRebuildSendPackets, PostRebuildSendFailures: item.PostRebuildSendFailures, PostRebuildSendFlowPackets: item.PostRebuildSendFlowPackets, PostRebuildSendFlowDropped: item.PostRebuildSendFlowDropped, GuardStatus: item.GuardStatus, GuardSeverity: item.GuardSeverity, GuardReason: item.GuardReason, GuardTransitionDeadlineSeconds: item.GuardTransitionDeadlineSeconds, GuardTrafficDeadlineSeconds: item.GuardTrafficDeadlineSeconds, Timeline: item.Timeline, CorrelationSnapshotAt: now.UTC(), } } func enrichFabricServiceChannelRouteRebuildAttempt(item FabricServiceChannelRouteRebuildAttempt, heartbeats []NodeHeartbeat, now time.Time) FabricServiceChannelRouteRebuildAttempt { item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{ Stage: "backend_decision", Status: 
firstNonEmptyString(item.RebuildStatus, "unknown"), At: item.UpdatedAt.UTC().Format(time.RFC3339Nano), RouteID: item.RouteID, Generation: item.Generation, Payload: mustJSONRaw(map[string]any{ "rebuild_request_id": item.RebuildRequestID, "decision_source": item.DecisionSource, "outcome": item.Outcome, "replacement_route_id": item.ReplacementRouteID, "rebuild_reason": item.RebuildReason, }), }) for _, heartbeat := range heartbeats { metadata := jsonObject(heartbeat.Metadata) runtime := jsonMapPath(metadata, "fabric_service_channel_runtime_report") ingress := jsonMapPath(runtime, "ingress") transition := jsonMapPath(ingress, "route_manager_transition") if !item.NodeTransitionMatched && transitionMatchesRebuildAttempt(transition, item) { item.NodeTransitionMatched = true item.NodeTransitionStatus = jsonString(transition, "status") item.NodeTransitionGeneration = jsonString(transition, "generation") item.NodeTransitionObservedAt = firstNonEmptyString(jsonString(transition, "observed_at"), heartbeat.ObservedAt.UTC().Format(time.RFC3339Nano)) item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{ Stage: "node_route_manager_transition", Status: item.NodeTransitionStatus, At: item.NodeTransitionObservedAt, RouteID: item.RouteID, Generation: item.NodeTransitionGeneration, Payload: mustJSONRaw(transition), }) } routeGeneration := jsonMapPath(metadata, "mesh_route_generation_report") if !item.NodeRouteGenerationMatched { if decision, ok := routeGenerationDecisionForAttempt(routeGeneration, item); ok { item.NodeRouteGenerationMatched = true item.NodeRouteGenerationStatus = firstNonEmptyString(jsonString(decision, "status"), jsonString(decision, "apply_status"), jsonString(decision, "withdraw_status")) item.NodeRouteGenerationAppliedAt = jsonString(decision, "applied_at") item.NodeRouteGenerationWithdrawnAt = jsonString(decision, "withdrawn_at") item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{ Stage: 
"node_route_generation_apply", Status: item.NodeRouteGenerationStatus, At: firstNonEmptyString(item.NodeRouteGenerationAppliedAt, item.NodeRouteGenerationWithdrawnAt, heartbeat.ObservedAt.UTC().Format(time.RFC3339Nano)), RouteID: item.RouteID, Generation: jsonString(decision, "generation"), Payload: mustJSONRaw(decision), }) } } if item.PostRebuildSelectedRouteID == "" && !heartbeat.ObservedAt.Before(item.UpdatedAt) { selectedRouteID := jsonString(ingress, "last_selected_route_id") if selectedRouteID == item.ReplacementRouteID || selectedRouteID == item.RouteID || selectedRouteID != "" { item.PostRebuildSelectedRouteID = selectedRouteID item.PostRebuildSendPackets = jsonUint64(ingress, "send_packets") item.PostRebuildSendFailures = jsonUint64(ingress, "send_route_failures") item.PostRebuildSendFlowPackets = jsonUint64(ingress, "send_flow_packets") item.PostRebuildSendFlowDropped = jsonUint64(ingress, "send_flow_dropped") item.Timeline = append(item.Timeline, FabricServiceChannelRouteRebuildTimelineEvent{ Stage: "post_rebuild_traffic", Status: "observed", At: heartbeat.ObservedAt.UTC().Format(time.RFC3339Nano), RouteID: selectedRouteID, Generation: jsonString(runtime, "config_version"), Payload: mustJSONRaw(map[string]any{ "last_selected_route_id": selectedRouteID, "send_packets": item.PostRebuildSendPackets, "send_route_failures": item.PostRebuildSendFailures, "send_flow_packets": item.PostRebuildSendFlowPackets, "send_flow_dropped": item.PostRebuildSendFlowDropped, "recommended_parallel": jsonUint64(ingress, "recommended_parallel_flow_sends"), }), }) } } if item.NodeTransitionMatched && item.NodeRouteGenerationMatched && item.PostRebuildSelectedRouteID != "" { break } } sort.SliceStable(item.Timeline, func(i, j int) bool { left, leftErr := time.Parse(time.RFC3339Nano, item.Timeline[i].At) right, rightErr := time.Parse(time.RFC3339Nano, item.Timeline[j].At) if leftErr == nil && rightErr == nil && !left.Equal(right) { return left.Before(right) } return 
// Deadlines used by applyFabricServiceChannelRouteRebuildGuard. Both are
// compared against the time elapsed since a rebuild attempt's UpdatedAt.
const (
	// fabricServiceChannelRebuildTransitionDeadline is how long the guard keeps
	// reporting a "pending" status while the node transition or the route
	// generation has not been correlated; past it the status becomes
	// "missing_node_transition" / "missing_route_generation".
	fabricServiceChannelRebuildTransitionDeadline = 90 * time.Second
	// fabricServiceChannelRebuildTrafficDeadline is how long the guard waits
	// for post-rebuild traffic before reporting "missing_post_rebuild_traffic".
	fabricServiceChannelRebuildTrafficDeadline = 180 * time.Second
)
// sortedStringSetKeys returns the members of the set in ascending order.
// A nil or empty set yields a nil slice.
func sortedStringSetKeys(values map[string]struct{}) []string {
	count := len(values)
	if count == 0 {
		return nil
	}
	keys := make([]string, count)
	next := 0
	for key := range values {
		keys[next] = key
		next++
	}
	sort.Strings(keys)
	return keys
}
// fabricServiceChannelRebuildAlertSilenceKey builds the exact-match lookup key
// for an alert silence: the four whitespace-trimmed components joined by "|".
func fabricServiceChannelRebuildAlertSilenceKey(reporterNodeID, routeID, guardStatus, generation string) string {
	parts := []string{reporterNodeID, routeID, guardStatus, generation}
	for i := range parts {
		parts[i] = strings.TrimSpace(parts[i])
	}
	return strings.Join(parts, "|")
}

// fabricServiceChannelRebuildAlertResurfaceKey builds the generation-agnostic
// lookup key used for resurfacing: the three whitespace-trimmed components
// joined by "|".
func fabricServiceChannelRebuildAlertResurfaceKey(reporterNodeID, routeID, guardStatus string) string {
	parts := []string{reporterNodeID, routeID, guardStatus}
	for i := range parts {
		parts[i] = strings.TrimSpace(parts[i])
	}
	return strings.Join(parts, "|")
}
summary.ActiveWarnCount > 0 { readiness.DegradedReasons = append(readiness.DegradedReasons, "active_warn_rebuild_alert") } if summary.PendingCount > 0 { readiness.DegradedReasons = append(readiness.DegradedReasons, "pending_rebuild_attempt") } if summary.SilencedCount > 0 { readiness.DegradedReasons = append(readiness.DegradedReasons, "silenced_alert_under_observation") } if len(readiness.BlockingReasons) > 0 { readiness.Status = "blocked" readiness.Reason = readiness.BlockingReasons[0] return readiness } if len(readiness.DegradedReasons) > 0 { readiness.Status = "degraded" readiness.Reason = readiness.DegradedReasons[0] } return readiness } func fabricServiceChannelRebuildRecommendedAction(summary FabricServiceChannelRouteRebuildHealthSummary) string { if summary.AccessNoSafeCount > 0 { return "inspect_access_no_safe_recovery_route_pool_and_signed_policy" } if summary.ActiveBadCount > 0 { if summary.ResurfacedCount > 0 { return "resurfaced_rebuild_alerts_need_reinspection_new_generation_or_route_changed" } return "inspect_bad_rebuild_attempts_check_reporter_node_heartbeats_route_generation_and_post_rebuild_traffic" } if summary.ActiveWarnCount > 0 { return "watch_pending_rebuild_attempts_until_node_transition_and_post_rebuild_traffic_arrive" } if summary.SilencedCount > 0 { return "no_active_rebuild_alerts_silenced_alerts_remain_under_observation" } if summary.TotalAttempts == 0 { return "no_rebuild_attempts_observed" } return "no_operator_action_required" } func fabricServiceChannelRouteRebuildIncidentsFromAttempts(clusterID string, items []FabricServiceChannelRouteRebuildAttempt) []FabricServiceChannelRouteRebuildIncident { byKey := map[string]*FabricServiceChannelRouteRebuildIncident{} for _, item := range items { guardStatus := firstNonEmptyString(item.GuardStatus, "unknown") guardSeverity := firstNonEmptyString(item.GuardSeverity, "unknown") key := strings.Join([]string{item.ReporterNodeID, item.RouteID, item.ServiceClass, item.Generation, guardStatus}, "|") 
incident, ok := byKey[key] if !ok { fingerprint := hashStringHex(key) incident = &FabricServiceChannelRouteRebuildIncident{ Fingerprint: fingerprint, ClusterID: clusterID, ReporterNodeID: item.ReporterNodeID, RouteID: item.RouteID, ServiceClass: item.ServiceClass, Generation: item.Generation, GuardStatus: guardStatus, GuardSeverity: guardSeverity, GuardReason: item.GuardReason, FirstSeenAt: item.CreatedAt, LastSeenAt: item.UpdatedAt, LatestReplacementRouteID: item.ReplacementRouteID, LatestRebuildStatus: item.RebuildStatus, LatestOutcome: item.Outcome, AlertSilenced: item.AlertSilenced, AlertResurfaced: item.AlertResurfaced, } byKey[key] = incident } incident.AttemptCount++ if item.CreatedAt.Before(incident.FirstSeenAt) { incident.FirstSeenAt = item.CreatedAt } if item.UpdatedAt.After(incident.LastSeenAt) { incident.LastSeenAt = item.UpdatedAt incident.GuardSeverity = guardSeverity incident.GuardReason = item.GuardReason incident.LatestReplacementRouteID = item.ReplacementRouteID incident.LatestRebuildStatus = item.RebuildStatus incident.LatestOutcome = item.Outcome } incident.AlertSilenced = incident.AlertSilenced || item.AlertSilenced if item.AlertResurfaced { incident.AlertResurfaced = true incident.AlertResurfacedFromSilenceID = item.AlertResurfacedFromSilenceID incident.AlertResurfacedCause = item.AlertResurfacedCause incident.AlertResurfacedPreviousRouteID = item.AlertResurfacedPreviousRouteID incident.AlertResurfacedPreviousChannelID = item.AlertResurfacedPreviousChannelID incident.AlertResurfacedPreviousGeneration = item.AlertResurfacedPreviousGeneration incident.AlertResurfacedPreviousUntil = item.AlertResurfacedPreviousUntil } } out := make([]FabricServiceChannelRouteRebuildIncident, 0, len(byKey)) for _, incident := range byKey { incident.RecommendedOperatorAction = fabricServiceChannelRebuildIncidentRecommendedAction(*incident) out = append(out, *incident) } for idx := range out { out[idx].RecommendedOperatorAction = 
fabricServiceChannelRebuildIncidentRecommendedAction(out[idx]) } fabricServiceChannelSortRouteRebuildIncidents(out) return out } func fabricServiceChannelSortRouteRebuildIncidents(out []FabricServiceChannelRouteRebuildIncident) { sort.SliceStable(out, func(i, j int) bool { leftRank := fabricServiceChannelRebuildIncidentSeverityRank(out[i]) rightRank := fabricServiceChannelRebuildIncidentSeverityRank(out[j]) if leftRank != rightRank { return leftRank > rightRank } return out[i].LastSeenAt.After(out[j].LastSeenAt) }) } func fabricServiceChannelAccessDecisionIncidents(clusterID string, telemetry FabricServiceChannelAccessTelemetry) []FabricServiceChannelRouteRebuildIncident { out := []FabricServiceChannelRouteRebuildIncident{} for _, channel := range telemetry.ActiveChannels { if channel.RouteDecisionSource == "" { continue } status, severity, reason := fabricServiceChannelAccessDecisionIncidentState(channel) if status == "" { continue } key := strings.Join([]string{"access_decision", channel.ChannelID, channel.RouteDecisionRouteID, status, channel.RouteDecisionGeneration}, "|") out = append(out, FabricServiceChannelRouteRebuildIncident{ Fingerprint: hashStringHex(key), ClusterID: clusterID, ReporterNodeID: channel.SelectedEntryNodeID, RouteID: firstNonEmptyString(channel.RouteDecisionRouteID, channel.PrimaryRouteID), ServiceClass: channel.ServiceClass, Generation: channel.RouteDecisionGeneration, IncidentSource: "access_decision", ChannelID: channel.ChannelID, GuardStatus: status, GuardSeverity: severity, GuardReason: reason, AttemptCount: 1, FirstSeenAt: telemetry.ObservedAt, LastSeenAt: telemetry.ObservedAt, LatestReplacementRouteID: channel.RouteDecisionReplacementRouteID, LatestRebuildStatus: channel.RouteDecisionRebuildStatus, LatestOutcome: channel.RouteDecisionSource, }) } for idx := range out { out[idx].RecommendedOperatorAction = fabricServiceChannelRebuildIncidentRecommendedAction(out[idx]) } fabricServiceChannelSortRouteRebuildIncidents(out) return out } 
func fabricServiceChannelDataPlaneContractIncidents(clusterID string, telemetry FabricServiceChannelAccessTelemetry) []FabricServiceChannelRouteRebuildIncident { out := []FabricServiceChannelRouteRebuildIncident{} for _, channel := range telemetry.ActiveChannels { status, severity, reason := fabricServiceChannelDataPlaneContractIncidentState(channel) if status == "" { continue } routeID := firstNonEmptyString(channel.RouteDecisionRouteID, channel.PrimaryRouteID, "data_plane") generation := firstNonEmptyString(channel.RouteDecisionGeneration, channel.PrimaryRouteID, channel.DataPlane.BackendRelayPolicy, channel.ChannelID) key := strings.Join([]string{"data_plane_contract", channel.ChannelID, routeID, status, generation}, "|") out = append(out, FabricServiceChannelRouteRebuildIncident{ Fingerprint: hashStringHex(key), ClusterID: clusterID, ReporterNodeID: channel.SelectedEntryNodeID, RouteID: routeID, ServiceClass: channel.ServiceClass, Generation: generation, IncidentSource: "data_plane_contract", ChannelID: channel.ChannelID, GuardStatus: status, GuardSeverity: severity, GuardReason: reason, AttemptCount: 1, FirstSeenAt: telemetry.ObservedAt, LastSeenAt: telemetry.ObservedAt, LatestOutcome: firstNonEmptyString(channel.EntryNodeLastWorkingDataTransport, channel.DataPlane.WorkingDataTransport, "unknown"), LatestRebuildStatus: firstNonEmptyString( channel.EntryNodeLastBackendRelayPolicy, channel.DataPlane.BackendRelayPolicy, ), }) } for idx := range out { out[idx].RecommendedOperatorAction = fabricServiceChannelRebuildIncidentRecommendedAction(out[idx]) } fabricServiceChannelSortRouteRebuildIncidents(out) return out } func applyFabricServiceChannelAccessDecisionIncidentSilences(items []FabricServiceChannelRouteRebuildIncident, silences []FabricServiceChannelRouteRebuildAlertSilence) []FabricServiceChannelRouteRebuildIncident { if len(items) == 0 || len(silences) == 0 { return items } byKey := map[string]FabricServiceChannelRouteRebuildAlertSilence{} byResurfaceKey := 
map[string]FabricServiceChannelRouteRebuildAlertSilence{} byGeneralResurfaceKey := map[string]FabricServiceChannelRouteRebuildAlertSilence{} byAccessReporterGuard := map[string]FabricServiceChannelRouteRebuildAlertSilence{} for _, silence := range silences { byKey[fabricServiceChannelRebuildAlertSilenceKey(silence.ReporterNodeID, silence.RouteID, silence.GuardStatus, silence.Generation)] = silence resurfaceKey := fabricServiceChannelRebuildAlertResurfaceKey(silence.ReporterNodeID, silence.RouteID, silence.GuardStatus) current, ok := byResurfaceKey[resurfaceKey] if !ok || silence.CreatedAt.After(current.CreatedAt) { byResurfaceKey[resurfaceKey] = silence } if channelID, routeID, ok := fabricServiceChannelParseAccessDecisionSilenceRouteID(silence.RouteID); ok { _ = channelID generalKey := fabricServiceChannelRebuildAlertResurfaceKey(silence.ReporterNodeID, routeID, silence.GuardStatus) current, ok := byGeneralResurfaceKey[generalKey] if !ok || silence.CreatedAt.After(current.CreatedAt) { byGeneralResurfaceKey[generalKey] = silence } accessKey := fabricServiceChannelRebuildAlertResurfaceKey(silence.ReporterNodeID, "access_decision", silence.GuardStatus) current, ok = byAccessReporterGuard[accessKey] if !ok || silence.CreatedAt.After(current.CreatedAt) { byAccessReporterGuard[accessKey] = silence } } } for idx := range items { item := &items[idx] silenceRouteID := fabricServiceChannelAccessDecisionSilenceRouteID(item.ChannelID, item.RouteID) silence, ok := byKey[fabricServiceChannelRebuildAlertSilenceKey(item.ReporterNodeID, silenceRouteID, item.GuardStatus, item.Generation)] if ok { item.AlertSilenced = true continue } if item.GuardSeverity != "bad" && item.GuardSeverity != "warn" { continue } silence, ok = byResurfaceKey[fabricServiceChannelRebuildAlertResurfaceKey(item.ReporterNodeID, silenceRouteID, item.GuardStatus)] if !ok || strings.TrimSpace(silence.Generation) == strings.TrimSpace(item.Generation) { generalSilence, generalOK := 
byGeneralResurfaceKey[fabricServiceChannelRebuildAlertResurfaceKey(item.ReporterNodeID, item.RouteID, item.GuardStatus)] if !generalOK || strings.TrimSpace(generalSilence.Generation) == strings.TrimSpace(item.Generation) { accessSilence, accessOK := byAccessReporterGuard[fabricServiceChannelRebuildAlertResurfaceKey(item.ReporterNodeID, "access_decision", item.GuardStatus)] if !accessOK || !fabricServiceChannelAccessDecisionSilenceDiffers(*item, accessSilence) { continue } generalSilence = accessSilence } silence = generalSilence } item.AlertResurfaced = true item.AlertResurfacedFromSilenceID = silence.ID item.AlertResurfacedCause = fabricServiceChannelAccessDecisionResurfaceCause(*item, silence) item.AlertResurfacedPreviousRouteID = silence.DisplayRouteID item.AlertResurfacedPreviousChannelID = silence.ChannelID item.AlertResurfacedPreviousGeneration = silence.Generation item.AlertResurfacedPreviousUntil = &silence.ExpiresAt } return items } func fabricServiceChannelAccessDecisionSilenceDiffers(item FabricServiceChannelRouteRebuildIncident, silence FabricServiceChannelRouteRebuildAlertSilence) bool { return strings.TrimSpace(silence.ChannelID) != strings.TrimSpace(item.ChannelID) || strings.TrimSpace(silence.DisplayRouteID) != strings.TrimSpace(item.RouteID) || strings.TrimSpace(silence.Generation) != strings.TrimSpace(item.Generation) } func fabricServiceChannelAccessDecisionResurfaceCause(item FabricServiceChannelRouteRebuildIncident, silence FabricServiceChannelRouteRebuildAlertSilence) string { if strings.TrimSpace(silence.ChannelID) != "" && strings.TrimSpace(silence.ChannelID) != strings.TrimSpace(item.ChannelID) { return "channel_changed" } if strings.TrimSpace(silence.DisplayRouteID) != "" && strings.TrimSpace(silence.DisplayRouteID) != strings.TrimSpace(item.RouteID) { return "route_changed" } if strings.TrimSpace(silence.Generation) != strings.TrimSpace(item.Generation) { return "generation_changed" } return "resurfaced" } func 
// fabricServiceChannelParseAccessDecisionSilenceRouteID splits a silence route
// ID of the form "access:<channel>:<route>" into its channel and route parts.
// The split happens at the first ":" after the prefix, so the route part may
// itself contain colons. Both parts are whitespace-trimmed; ok is false when
// the prefix is missing or either part is empty.
func fabricServiceChannelParseAccessDecisionSilenceRouteID(value string) (string, string, bool) {
	const prefix = "access:"

	trimmed := strings.TrimSpace(value)
	if !strings.HasPrefix(trimmed, prefix) {
		return "", "", false
	}
	channelPart, routePart, found := strings.Cut(trimmed[len(prefix):], ":")
	if !found {
		return "", "", false
	}
	channelID := strings.TrimSpace(channelPart)
	routeID := strings.TrimSpace(routePart)
	if channelID == "" || routeID == "" {
		return "", "", false
	}
	return channelID, routeID, true
}
// hashStringHex returns the lowercase hexadecimal SHA-256 digest of value.
func hashStringHex(value string) string {
	hasher := sha256.New()
	// hash.Hash documents that Write never returns an error.
	hasher.Write([]byte(value))
	return hex.EncodeToString(hasher.Sum(nil))
}
"data_plane_backend_fallback_blocked": return "restore_fabric_route_or_change_signed_backend_relay_policy_before_retry" case "data_plane_fabric_route_send_failed": return "inspect_entry_route_runtime_and_restore_fabric_route_delivery" } } switch item.GuardStatus { case "missing_node_transition": return "open_deep_ledger_check_reporter_heartbeats_and_route_manager_transition" case "missing_route_generation": return "open_deep_ledger_check_route_generation_apply_or_withdraw" case "missing_post_rebuild_traffic": return "open_deep_ledger_check_post_rebuild_traffic_and_selected_route" case "unexpected_post_rebuild_route": return "open_deep_ledger_check_selected_route_vs_replacement" case "post_rebuild_degraded": return "inspect_post_rebuild_drops_failures_and_route_quality" case "ok": return "no_operator_action_required" default: if item.GuardSeverity == "bad" || item.GuardSeverity == "warn" { return "open_deep_ledger_for_rebuild_incident" } return "no_operator_action_required" } } func transitionMatchesRebuildAttempt(transition map[string]any, item FabricServiceChannelRouteRebuildAttempt) bool { if len(transition) == 0 { return false } generation := jsonString(transition, "generation") if item.Generation != "" { return generation != "" && generation == item.Generation } status := jsonString(transition, "status") return (status == "applied_rebuild" && item.RebuildStatus == "applied") || (status == "pending_degraded_fallback" && item.RebuildStatus == "pending_degraded_fallback") } func routeGenerationDecisionForAttempt(report map[string]any, item FabricServiceChannelRouteRebuildAttempt) (map[string]any, bool) { for _, key := range []string{"active_decisions", "withdrawn_decisions"} { for _, raw := range jsonArray(report, key) { decision, ok := raw.(map[string]any) if !ok { continue } if jsonString(decision, "route_id") != item.RouteID { continue } generation := jsonString(decision, "generation") if item.Generation == "" || generation == "" || generation == 
// jsonObject decodes raw into a generic JSON object. It never returns nil:
// empty input, malformed JSON, a document that is not an object, and a JSON
// null all yield an empty map.
func jsonObject(raw json.RawMessage) map[string]any {
	if len(raw) == 0 {
		return map[string]any{}
	}
	var out map[string]any
	// Unmarshal validates the document itself, so a separate json.Valid pass
	// would only parse the input twice. A JSON null decodes without error but
	// leaves out nil, hence the explicit nil check.
	if err := json.Unmarshal(raw, &out); err != nil || out == nil {
		return map[string]any{}
	}
	return out
}

// jsonMapPath walks nested objects along path and returns the object found at
// the end. If any step is missing or is not an object, an empty map is
// returned. With an empty path, raw itself is returned.
func jsonMapPath(raw map[string]any, path ...string) map[string]any {
	current := raw
	for _, key := range path {
		next, ok := current[key].(map[string]any)
		if !ok {
			return map[string]any{}
		}
		current = next
	}
	return current
}

// jsonArray returns raw[key] when it is a JSON array, and nil otherwise.
func jsonArray(raw map[string]any, key string) []any {
	if raw == nil {
		return nil
	}
	items, _ := raw[key].([]any)
	return items
}

// jsonString returns raw[key] with surrounding whitespace removed when it is a
// string, and "" otherwise.
func jsonString(raw map[string]any, key string) string {
	if raw == nil {
		return ""
	}
	value, _ := raw[key].(string)
	return strings.TrimSpace(value)
}

// jsonStringArray returns the trimmed, non-empty string elements of the array
// at raw[key]. Non-string elements are skipped. A missing or empty array
// yields nil.
func jsonStringArray(raw map[string]any, key string) []string {
	items := jsonArray(raw, key)
	if len(items) == 0 {
		return nil
	}
	out := make([]string, 0, len(items))
	for _, item := range items {
		value, ok := item.(string)
		if !ok {
			continue
		}
		if value = strings.TrimSpace(value); value != "" {
			out = append(out, value)
		}
	}
	return out
}

// jsonInt returns raw[key] as an int. float64 values are truncated, and
// json.Number values that do not parse as an integer yield 0, as does any
// other type or a missing key.
func jsonInt(raw map[string]any, key string) int {
	if raw == nil {
		return 0
	}
	switch value := raw[key].(type) {
	case float64:
		return int(value)
	case int:
		return value
	case int64:
		return int(value)
	case json.Number:
		// A parse failure leaves parsed at 0, which is the documented fallback.
		parsed, _ := value.Int64()
		return int(parsed)
	default:
		return 0
	}
}

// jsonBool returns raw[key] when it is a boolean, and false otherwise.
func jsonBool(raw map[string]any, key string) bool {
	if raw == nil {
		return false
	}
	value, _ := raw[key].(bool)
	return value
}
// copyStringIntMap returns an independent copy of values, or nil when values
// is nil or empty.
func copyStringIntMap(values map[string]int) map[string]int {
	if len(values) == 0 {
		return nil
	}
	out := make(map[string]int, len(values))
	for key, value := range values {
		out[key] = value
	}
	return out
}

// mergeStringIntMap adds every count in source to the matching entry of
// target. It does nothing when target is nil or source is empty.
func mergeStringIntMap(target map[string]int, source map[string]int) {
	if target == nil || len(source) == 0 {
		return
	}
	for key, value := range source {
		target[key] += value
	}
}

// mergeMinStringIntMap lowers each entry of target to the corresponding value
// in source when that value is smaller, and inserts entries target does not
// have yet. Blank keys and non-positive values in source are ignored. It does
// nothing when target is nil or source is empty.
func mergeMinStringIntMap(target map[string]int, source map[string]int) {
	if target == nil || len(source) == 0 {
		return
	}
	for key, value := range source {
		if strings.TrimSpace(key) == "" || value <= 0 {
			continue
		}
		if current, ok := target[key]; !ok || value < current {
			target[key] = value
		}
	}
}

// jsonUint64 returns raw[key] as a uint64. Numeric values that are zero,
// negative or NaN yield 0, as do missing keys and non-numeric types. Floats at
// or above 2^64 are clamped to the maximum uint64 instead of going through an
// out-of-range conversion. json.Number is accepted so the helper agrees with
// jsonInt on decoders that use it.
func jsonUint64(raw map[string]any, key string) uint64 {
	if raw == nil {
		return 0
	}

	const uint64Limit = float64(1 << 64) // first float64 that no longer fits
	fromFloat := func(f float64) uint64 {
		switch {
		case !(f > 0): // zero, negatives and NaN
			return 0
		case f >= uint64Limit:
			return ^uint64(0)
		default:
			return uint64(f)
		}
	}

	switch value := raw[key].(type) {
	case float64:
		return fromFloat(value)
	case int:
		if value > 0 {
			return uint64(value)
		}
	case int64:
		if value > 0 {
			return uint64(value)
		}
	case uint64:
		return value
	case json.Number:
		if parsed, err := value.Int64(); err == nil {
			if parsed > 0 {
				return uint64(parsed)
			}
			return 0
		}
		// Not an int64: either fractional or larger than MaxInt64.
		if parsed, err := value.Float64(); err == nil {
			return fromFloat(parsed)
		}
	}
	return 0
}
// CreateReleaseVersion validates, stores and signs a new release version.
//
// The actor must pass ensurePlatformAdmin and the cluster must pass
// ensureClusterMutable; their errors are returned unchanged. Product, Channel
// and Status are normalized with normalizeUpdateToken (Channel falls back to
// "dev" and Status to "active" when not supplied) and Version is trimmed.
//
// ErrInvalidPayload is returned when ClusterID, Product or Version is empty,
// when no artifacts are supplied, when Status is not one of "active", "draft"
// or "revoked", when Compatibility is not valid JSON, or when an artifact is
// missing OS, Arch, InstallType, Kind, URL or SHA256 or has invalid Metadata.
//
// When the compatibility document requests legacy removal and the stale-node
// risk report does not allow it, a blocked-operation audit entry is recorded
// and a *LegacyRemovalBlockedError (matching ErrLegacyRemovalBlocked) is
// returned.
func (s *Service) CreateReleaseVersion(ctx context.Context, input CreateReleaseVersionInput) (ReleaseVersion, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return ReleaseVersion{}, err
	}
	if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil {
		return ReleaseVersion{}, err
	}
	input.Product = normalizeUpdateToken(input.Product)
	input.Version = strings.TrimSpace(input.Version)
	input.Channel = normalizeUpdateToken(firstNonEmptyString(input.Channel, "dev"))
	input.Status = normalizeUpdateToken(firstNonEmptyString(input.Status, "active"))
	// NOTE(review): ClusterID is checked for emptiness here but is not trimmed,
	// and this check runs after ensureClusterMutable already used it.
	if input.ClusterID == "" || input.Product == "" || input.Version == "" || len(input.Artifacts) == 0 {
		return ReleaseVersion{}, ErrInvalidPayload
	}
	if input.Status != "active" && input.Status != "draft" && input.Status != "revoked" {
		return ReleaseVersion{}, ErrInvalidPayload
	}
	input.Compatibility = defaultJSON(input.Compatibility, `{}`)
	if !json.Valid(input.Compatibility) {
		return ReleaseVersion{}, ErrInvalidPayload
	}
	// Breaking releases are gated on the stale-node risk report.
	if releaseRequestsLegacyRemoval(input.Compatibility) {
		report, err := s.GetStaleNodeRiskReport(ctx, GetStaleNodeRiskReportInput{
			ActorUserID: input.ActorUserID,
			ClusterID:   input.ClusterID,
		})
		if err != nil {
			return ReleaseVersion{}, err
		}
		if !report.LegacyRemovalAllowed {
			s.recordLegacyRemovalBlockedAudit(ctx, input.ClusterID, input.ActorUserID, "release_version", input.Product+":"+input.Version, "create_breaking_release", report)
			return ReleaseVersion{}, &LegacyRemovalBlockedError{
				BlockedOperation: "create_breaking_release",
				Report:           report,
			}
		}
	}
	// Artifacts are normalized in place: the writes go through the slice, so
	// the caller's backing array sees the normalized values as well.
	for i := range input.Artifacts {
		input.Artifacts[i].OS = normalizeUpdateToken(input.Artifacts[i].OS)
		input.Artifacts[i].Arch = normalizeUpdateToken(input.Artifacts[i].Arch)
		input.Artifacts[i].InstallType = normalizeUpdateToken(input.Artifacts[i].InstallType)
		input.Artifacts[i].Kind = normalizeUpdateToken(input.Artifacts[i].Kind)
		input.Artifacts[i].URL = strings.TrimSpace(input.Artifacts[i].URL)
		input.Artifacts[i].SHA256 = strings.TrimSpace(input.Artifacts[i].SHA256)
		input.Artifacts[i].Metadata = defaultJSON(input.Artifacts[i].Metadata, `{}`)
		if input.Artifacts[i].OS == "" || input.Artifacts[i].Arch == "" || input.Artifacts[i].InstallType == "" || input.Artifacts[i].Kind == "" || input.Artifacts[i].URL == "" || input.Artifacts[i].SHA256 == "" || !json.Valid(input.Artifacts[i].Metadata) {
			return ReleaseVersion{}, ErrInvalidPayload
		}
	}
	item, err := s.store.CreateReleaseVersion(ctx, input)
	if err != nil {
		return ReleaseVersion{}, err
	}
	// Signing happens after the row is stored; a signing failure is returned
	// to the caller without undoing the store call here.
	item, err = s.signReleaseVersion(ctx, item, &input.ActorUserID)
	if err != nil {
		return ReleaseVersion{}, err
	}
	// The audit write is best-effort: its error is deliberately discarded so a
	// failed audit does not fail the already-created release.
	_ = s.store.RecordAudit(ctx, ClusterAuditEvent{
		ClusterID:   &input.ClusterID,
		ActorUserID: &input.ActorUserID,
		EventType:   "release_version.created",
		TargetType:  "release_version",
		TargetID:    &item.ID,
		Payload:     json.RawMessage(`{"production_forwarding":false}`),
		CreatedAt:   s.now(),
	})
	return item, nil
}
normalizeUpdateToken(channel)) } func (s *Service) UpsertNodeUpdatePolicy(ctx context.Context, input UpsertNodeUpdatePolicyInput) (NodeUpdatePolicy, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return NodeUpdatePolicy{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return NodeUpdatePolicy{}, err } input.Product = normalizeUpdateToken(input.Product) input.Channel = normalizeUpdateToken(firstNonEmptyString(input.Channel, "dev")) input.Strategy = normalizeUpdateToken(firstNonEmptyString(input.Strategy, "manual")) if input.ClusterID == "" || input.NodeID == "" || input.Product == "" { return NodeUpdatePolicy{}, ErrInvalidPayload } switch input.Strategy { case "manual", "canary", "rolling", "pinned": default: return NodeUpdatePolicy{}, ErrInvalidPayload } if input.HealthWindowSec <= 0 { input.HealthWindowSec = 180 } if input.TargetVersion != nil { trimmed := strings.TrimSpace(*input.TargetVersion) input.TargetVersion = &trimmed } if input.TargetVersion != nil && *input.TargetVersion != "" { releases, err := s.store.ListReleaseVersions(ctx, input.ClusterID, input.Product, input.Channel) if err != nil { return NodeUpdatePolicy{}, err } if !hasTargetedReleaseVersion(releases, *input.TargetVersion) { return NodeUpdatePolicy{}, ErrInvalidPayload } if targetedReleaseRequestsLegacyRemoval(releases, *input.TargetVersion) { report, err := s.GetStaleNodeRiskReport(ctx, GetStaleNodeRiskReportInput{ ActorUserID: input.ActorUserID, ClusterID: input.ClusterID, }) if err != nil { return NodeUpdatePolicy{}, err } if !report.LegacyRemovalAllowed { s.recordLegacyRemovalBlockedAudit(ctx, input.ClusterID, input.ActorUserID, "node_update_policy", input.NodeID, "target_breaking_update_policy", report) return NodeUpdatePolicy{}, &LegacyRemovalBlockedError{ BlockedOperation: "target_breaking_update_policy", Report: report, } } } } item, err := s.store.UpsertNodeUpdatePolicy(ctx, input) if err != nil { return 
NodeUpdatePolicy{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "node_update_policy.updated", TargetType: "node", TargetID: &input.NodeID, Payload: json.RawMessage(`{"production_forwarding":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) GetNodeUpdatePlan(ctx context.Context, input GetNodeUpdatePlanInput) (NodeUpdatePlan, error) { input.Product = normalizeUpdateToken(firstNonEmptyString(input.Product, "rap-node-agent")) input.Channel = normalizeUpdateToken(input.Channel) input.OS = normalizeUpdateToken(input.OS) input.Arch = normalizeUpdateToken(input.Arch) input.InstallType = normalizeUpdateToken(input.InstallType) input.CurrentVersion = strings.TrimSpace(input.CurrentVersion) input.ArtifactOrigin = normalizeArtifactOrigin(input.ArtifactOrigin) if input.ClusterID == "" || input.NodeID == "" || input.Product == "" || input.OS == "" || input.Arch == "" || input.InstallType == "" { return NodeUpdatePlan{}, ErrInvalidPayload } if heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, input.NodeID, 1); err == nil && len(heartbeats) > 0 { input.ArtifactOrigin = preferredNodeArtifactOrigin(input.ArtifactOrigin, artifactOriginFromHeartbeat(heartbeats[0])) } policy, err := s.store.GetNodeUpdatePolicy(ctx, input.ClusterID, input.NodeID, input.Product) if errors.Is(err, pgx.ErrNoRows) { return s.signNodeUpdatePlan(ctx, NodeUpdatePlan{ SchemaVersion: "rap.node_update_plan.v1", ClusterID: input.ClusterID, NodeID: input.NodeID, Product: input.Product, CurrentVersion: input.CurrentVersion, Action: "none", Reason: "no_update_policy", ProductionForwarding: false, }) } if err != nil { return NodeUpdatePlan{}, err } if input.Channel == "" { input.Channel = policy.Channel } base := NodeUpdatePlan{ SchemaVersion: "rap.node_update_plan.v1", ClusterID: input.ClusterID, NodeID: input.NodeID, Product: input.Product, CurrentVersion: input.CurrentVersion, Channel: 
input.Channel, Strategy: policy.Strategy, RollbackAllowed: policy.RollbackAllowed, HealthWindowSec: policy.HealthWindowSec, ProductionForwarding: false, } if !policy.Enabled { base.Action = "none" base.Reason = "policy_disabled" return s.signNodeUpdatePlan(ctx, base) } if mismatch, err := s.hostAgentPlatformMismatch(ctx, input); err != nil { return NodeUpdatePlan{}, err } else if mismatch { base.Action = "none" base.Reason = "host_agent_artifact_platform_mismatch" return s.signNodeUpdatePlan(ctx, base) } releases, err := s.store.ListReleaseVersions(ctx, input.ClusterID, input.Product, input.Channel) if err != nil { return NodeUpdatePlan{}, err } release, artifact, ok := selectReleaseArtifact(releases, input, policy) if !ok { base.Action = "none" base.Reason = "no_matching_artifact" return s.signNodeUpdatePlan(ctx, base) } base.TargetVersion = release.Version artifact = absolutizeReleaseArtifact(artifact, input.ArtifactOrigin) base.Artifact = &artifact if strings.TrimSpace(input.CurrentVersion) == release.Version { base.Action = "none" base.Reason = "already_current" return s.signNodeUpdatePlan(ctx, base) } base.Action = "update" base.Reason = "matching_release_available" return s.signNodeUpdatePlan(ctx, base) } func (s *Service) ReportNodeUpdateStatus(ctx context.Context, input ReportNodeUpdateStatusInput) (NodeUpdateStatus, error) { input.Product = normalizeUpdateToken(firstNonEmptyString(input.Product, "rap-node-agent")) input.Phase = normalizeUpdateToken(input.Phase) input.Status = normalizeUpdateToken(input.Status) if input.ClusterID == "" || input.NodeID == "" || input.Product == "" || input.Phase == "" || input.Status == "" { return NodeUpdateStatus{}, ErrInvalidPayload } input.Payload = defaultJSON(input.Payload, `{}`) if !json.Valid(input.Payload) { return NodeUpdateStatus{}, ErrInvalidPayload } if input.ObservedAt.IsZero() { input.ObservedAt = s.now() } return s.store.ReportNodeUpdateStatus(ctx, input) } func (s *Service) ListNodeUpdateStatuses(ctx 
// GetStaleNodeRiskReport evaluates every node of a cluster with
// evaluateStaleNodeRisk and aggregates the results into a report that states
// whether legacy compatibility removal is currently allowed.
//
// The actor must pass ensurePlatformAdmin; a blank ClusterID yields
// ErrInvalidPayload. A node counts as blocked when it carries at least one
// risk, and any blocked node disallows legacy removal. Nodes are returned
// sorted with stale heartbeats first, then by descending risk count, then by
// name.
func (s *Service) GetStaleNodeRiskReport(ctx context.Context, input GetStaleNodeRiskReportInput) (StaleNodeRiskReport, error) {
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return StaleNodeRiskReport{}, err
	}
	// Only the emptiness check uses the trimmed value; the ID is passed on as given.
	if strings.TrimSpace(input.ClusterID) == "" {
		return StaleNodeRiskReport{}, ErrInvalidPayload
	}
	now := s.now().UTC()
	nodes, err := s.store.ListClusterNodes(ctx, input.ClusterID)
	if err != nil {
		return StaleNodeRiskReport{}, err
	}
	report := StaleNodeRiskReport{
		ClusterID:                  input.ClusterID,
		GeneratedAt:                now,
		HeartbeatStaleAfterSeconds: int(staleNodeRiskHeartbeatThreshold / time.Second),
		LegacyRemovalAllowed:       true,
		// Initialized as empty, non-nil slices.
		BridgeHoldNodeIDs: []string{},
		BridgeHoldReasons: []string{},
		Nodes:             make([]StaleNodeRiskNode, 0, len(nodes)),
	}
	// Shared across all node evaluations of this report.
	releaseCache := map[string][]ReleaseVersion{}
	for _, node := range nodes {
		item, err := s.evaluateStaleNodeRisk(ctx, input.ClusterID, node, now, releaseCache)
		if err != nil {
			return StaleNodeRiskReport{}, err
		}
		item.Blocked = len(item.Risks) > 0
		report.Nodes = append(report.Nodes, item)
		report.Summary.TotalNodes++
		if item.HeartbeatStale {
			report.Summary.StaleNodes++
		}
		if item.Blocked {
			report.Summary.BlockedNodes++
			report.LegacyRemovalAllowed = false
		}
		if item.DirectPeerAlert {
			report.Summary.DirectPeerAlertNodes++
		}
		// The category counters below are first-match: each node increments at
		// most one of them, in this order. Bridge-hold bookkeeping therefore
		// only runs for nodes that reach the legacy-recovery-contract branch.
		if containsAnyRiskWithPrefix(item.Risks, "stale_node_no_compatible_") {
			report.Summary.ArtifactGapNodes++
			continue
		}
		if containsAnyRiskWithPrefix(item.Risks, "stale_node_unknown_profile_") {
			report.Summary.UnknownProfileNodes++
			continue
		}
		if containsAnyRiskWithPrefix(item.Risks, "stale_node_no_") && containsAnyRiskWithSuffix(item.Risks, "_update_status") {
			report.Summary.WaitingUpdateStatusNodes++
			continue
		}
		if containsAnyRiskWithPrefix(item.Risks, "stale_node_unknown_") && containsAnyRiskWithSuffix(item.Risks, "_version") {
			report.Summary.UnknownVersionNodes++
			continue
		}
		if containsAnyRiskWithPrefix(item.Risks, "stale_node_legacy_recovery_contract_") {
			report.Summary.LegacyRecoveryContractNodes++
			if item.RecoveryBridgeRequired {
				report.BridgeHoldRequired = true
				report.BridgeHoldNodeIDs = append(report.BridgeHoldNodeIDs, item.NodeID)
				report.Summary.RecoveryBridgeRequiredNodes++
			}
			if item.RecoveryBridgeReplayReady {
				report.Summary.RecoveryBridgeReplayReadyNodes++
			}
			continue
		}
		// No category matched: a stale node is counted as waiting for a
		// recovery heartbeat.
		if item.HeartbeatStale {
			report.Summary.WaitingRecoveryHeartbeatNodes++
		}
	}
	// Stale nodes first, then the most risks, then alphabetical by name.
	sort.Slice(report.Nodes, func(i, j int) bool {
		if report.Nodes[i].HeartbeatStale != report.Nodes[j].HeartbeatStale {
			return report.Nodes[i].HeartbeatStale
		}
		if len(report.Nodes[i].Risks) != len(report.Nodes[j].Risks) {
			return len(report.Nodes[i].Risks) > len(report.Nodes[j].Risks)
		}
		return report.Nodes[i].Name < report.Nodes[j].Name
	})
	if !report.LegacyRemovalAllowed {
		report.BlockedOperations = []string{
			"create_breaking_release",
			"target_breaking_update_policy",
		}
	}
	// A bridge hold also blocks removal of the recovery bridge overlap; the
	// operations are appended without duplicating those already listed.
	if report.BridgeHoldRequired {
		report.BridgeHoldReasons = append(report.BridgeHoldReasons, "legacy_contract_overlap")
		report.LegacyRemovalAllowed = false
		for _, operation := range []string{
			"create_breaking_release",
			"target_breaking_update_policy",
			"remove_recovery_bridge_overlap",
		} {
			if !containsString(report.BlockedOperations, operation) {
				report.BlockedOperations = append(report.BlockedOperations, operation)
			}
		}
	}
	return report, nil
}
input.NodeID == "" { return NodeBridgeReplayPlan{}, ErrInvalidPayload } nodes, err := s.store.ListClusterNodes(ctx, input.ClusterID) if err != nil { return NodeBridgeReplayPlan{}, err } var node *ClusterNode for i := range nodes { if strings.TrimSpace(nodes[i].ID) == input.NodeID { node = &nodes[i] break } } if node == nil { return NodeBridgeReplayPlan{}, ErrInvalidPayload } report, err := s.GetStaleNodeRiskReport(ctx, GetStaleNodeRiskReportInput{ ActorUserID: input.ActorUserID, ClusterID: input.ClusterID, }) if err != nil { return NodeBridgeReplayPlan{}, err } var riskNode *StaleNodeRiskNode for i := range report.Nodes { if strings.TrimSpace(report.Nodes[i].NodeID) == input.NodeID { riskNode = &report.Nodes[i] break } } if riskNode == nil { return NodeBridgeReplayPlan{}, ErrInvalidPayload } artifactOrigin := "" if heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, input.NodeID, 1); err == nil && len(heartbeats) > 0 { artifactOrigin = artifactOriginFromHeartbeat(heartbeats[0]) } plan := NodeBridgeReplayPlan{ SchemaVersion: "rap.node_bridge_replay_plan.v1", ClusterID: input.ClusterID, NodeID: input.NodeID, NodeName: riskNode.Name, HealthStatus: riskNode.HealthStatus, HeartbeatStale: riskNode.HeartbeatStale, BridgeHoldRequired: riskNode.RecoveryBridgeRequired, RecoveryBridgeReplayReady: riskNode.RecoveryBridgeReplayReady, BridgeHoldReasons: append([]string{}, report.BridgeHoldReasons...), BridgeActions: append([]string{}, riskNode.RecoveryBridgeActions...), Products: []NodeBridgeReplayProductPlan{}, } for _, product := range riskNode.Products { if !product.RecoveryBridgeReplayReady { continue } updatePlan, err := s.GetNodeUpdatePlan(ctx, GetNodeUpdatePlanInput{ ClusterID: input.ClusterID, NodeID: input.NodeID, Product: product.Product, CurrentVersion: product.CurrentVersion, OS: product.DetectedOS, Arch: product.DetectedArch, InstallType: product.DetectedInstallType, Channel: product.Channel, ArtifactOrigin: artifactOrigin, }) if err != nil { return 
NodeBridgeReplayPlan{}, err } plan.Products = append(plan.Products, NodeBridgeReplayProductPlan{ Product: product.Product, RecoveryBridgeMode: product.RecoveryBridgeMode, RecoveryBridgeReplayReady: product.RecoveryBridgeReplayReady, LastStatusReason: product.LastStatusReason, UpdatePlan: updatePlan, }) } return plan, nil } func (s *Service) GetNodeUpdateHint(ctx context.Context, clusterID, nodeID string) NodeUpdateHint { products := []string{"rap-node-agent", "rap-host-agent"} parts := make([]string, 0, len(products)) activeProducts := make([]string, 0, len(products)) updateService := s.selectNodeUpdateService(ctx, clusterID, nodeID) for _, product := range products { policy, err := s.store.GetNodeUpdatePolicy(ctx, clusterID, nodeID, product) if err != nil || !policy.Enabled { continue } targetVersion := strings.TrimSpace(updateHintTargetVersion(ctx, s, clusterID, product, policy)) if targetVersion == "" { continue } activeProducts = append(activeProducts, product) parts = append(parts, product+":"+targetVersion+":"+policy.UpdatedAt.UTC().Format(time.RFC3339Nano)) } if len(parts) == 0 { return NodeUpdateHint{ SchemaVersion: "rap.node_update_hint.v1", CheckNow: false, Reason: "no_enabled_update_policy", DeliveryMode: "update_service_subscription", SubscriptionStatus: "subscribed", UpdateService: updateService, FallbackPollSeconds: 21600, } } sort.Strings(parts) sort.Strings(activeProducts) sum := sha256.Sum256([]byte(strings.Join(parts, "|"))) return NodeUpdateHint{ SchemaVersion: "rap.node_update_hint.v1", Generation: hex.EncodeToString(sum[:])[:16], CheckNow: true, Products: activeProducts, Reason: "enabled_update_policy", DeliveryMode: "update_service_subscription", SubscriptionStatus: "subscribed", UpdateService: updateService, FallbackPollSeconds: 21600, } } func (s *Service) selectNodeUpdateService(ctx context.Context, clusterID, nodeID string) *NodeUpdateServiceAssignment { now := s.now() assignment := &NodeUpdateServiceAssignment{ SchemaVersion: 
"rap.node_update_service_assignment.v1", Status: "control_plane_fallback", Reason: "no_healthy_update_cache_service", AssignedAt: now, ExpiresAt: now.Add(2 * time.Minute), } candidates, err := s.store.ListNodeUpdateServiceCandidates(ctx, clusterID) if err != nil || len(candidates) == 0 { return assignment } selected := candidates[0] for _, candidate := range candidates { if candidate.NodeID == nodeID { selected = candidate break } } assignment.NodeID = selected.NodeID assignment.NodeName = selected.NodeName assignment.Endpoint = selected.Endpoint assignment.Region = selected.Region assignment.Status = "assigned" assignment.Reason = "healthy_update_cache_service" assignment.ExpiresAt = now.Add(5 * time.Minute) return assignment } func updateHintTargetVersion(ctx context.Context, s *Service, clusterID, product string, policy NodeUpdatePolicy) string { if policy.TargetVersion != nil { return strings.TrimSpace(*policy.TargetVersion) } releases, err := s.store.ListReleaseVersions(ctx, clusterID, product, policy.Channel) if err != nil { return "" } for _, release := range releases { if release.Status == "active" && strings.TrimSpace(release.Version) != "" { return strings.TrimSpace(release.Version) } } return "" } func (s *Service) signReleaseVersion(ctx context.Context, item ReleaseVersion, actorUserID *string) (ReleaseVersion, error) { authorityKey, err := s.ensureClusterAuthority(ctx, item.ClusterID, actorUserID) if err != nil { return ReleaseVersion{}, err } payload := map[string]any{ "schema_version": "rap.release_version_authority.v1", "cluster_id": item.ClusterID, "release_id": item.ID, "product": item.Product, "version": item.Version, "channel": item.Channel, "artifact_count": len(item.Artifacts), "control_plane_only": true, "production_forwarding": false, } rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now()) if err != nil { return ReleaseVersion{}, err } item.AuthorityPayload = rawPayload item.AuthoritySignature = 
&signature return item, nil } func (s *Service) signNodeUpdatePlan(ctx context.Context, plan NodeUpdatePlan) (NodeUpdatePlan, error) { authorityKey, err := s.ensureClusterAuthority(ctx, plan.ClusterID, nil) if err != nil { return NodeUpdatePlan{}, err } payload := map[string]any{ "schema_version": "rap.node_update_plan_authority.v1", "cluster_id": plan.ClusterID, "node_id": plan.NodeID, "product": plan.Product, "current_version": plan.CurrentVersion, "action": plan.Action, "target_version": plan.TargetVersion, "artifact_sha256": "", "control_plane_only": true, "production_forwarding": false, } if plan.Artifact != nil { payload["artifact_sha256"] = plan.Artifact.SHA256 payload["artifact_url"] = plan.Artifact.URL } rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now()) if err != nil { return NodeUpdatePlan{}, err } plan.AuthorityPayload = rawPayload plan.AuthoritySignature = &signature quorumEnvelope, err := nodeUpdatePlanQuorumEnvelope(authorityKey, rawPayload, s.now()) if err != nil { return NodeUpdatePlan{}, err } plan.AuthorityQuorum = quorumEnvelope return plan, nil } func nodeUpdatePlanQuorumEnvelope(authorityKey ClusterAuthorityKey, payload json.RawMessage, signedAt time.Time) (*QuorumEnvelope, error) { if authorityKey.QuorumDescriptor == nil { return nil, nil } descriptor := *authorityKey.QuorumDescriptor if descriptor.SchemaVersion == "" { descriptor.SchemaVersion = clusterauth.QuorumSchemaVersion } if strings.TrimSpace(descriptor.ClusterID) == "" { descriptor.ClusterID = authorityKey.ClusterID } signature, err := clusterauth.SignRaw(authorityKey.PrivateKey, payload, signedAt) if err != nil { return nil, err } payloadHash, err := clusterauth.HashRaw(payload) if err != nil { return nil, err } descriptorHash, err := clusterauth.QuorumDescriptorHash(descriptor) if err != nil { return nil, err } envelope := QuorumEnvelope{ SchemaVersion: clusterauth.QuorumEnvelopeVersion, ClusterID: descriptor.ClusterID, Epoch: 
descriptor.Epoch, Threshold: descriptor.Threshold, PayloadSHA256: payloadHash, QuorumSHA256: descriptorHash, Signatures: []ClusterSignature{signature}, AllowedScopes: []string{"update-authority"}, DecisionReason: "node_update_plan", } if err := clusterauth.VerifyQuorumRaw(descriptor, payload, envelope, "update-authority"); err != nil { return nil, err } return &envelope, nil } func (s *Service) UpsertFabricTestingFlag(ctx context.Context, input UpsertFabricTestingFlagInput) (FabricTestingFlag, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return FabricTestingFlag{}, err } input.ScopeType = strings.TrimSpace(input.ScopeType) if input.ScopeType == "" { return FabricTestingFlag{}, ErrInvalidPayload } switch input.ScopeType { case "platform": input.ScopeID = nil case "organization", "node": if input.ScopeID == nil || strings.TrimSpace(*input.ScopeID) == "" { return FabricTestingFlag{}, ErrInvalidPayload } default: return FabricTestingFlag{}, ErrInvalidPayload } if input.HistoryRetentionHours <= 0 { input.HistoryRetentionHours = 24 } input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.Metadata) { return FabricTestingFlag{}, errors.New("testing flag metadata must be valid json") } item, err := s.store.UpsertFabricTestingFlag(ctx, input) if err != nil { return FabricTestingFlag{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "fabric.testing_flag.updated", TargetType: input.ScopeType, TargetID: input.ScopeID, Payload: json.RawMessage(`{"runtime_mesh_enabled":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) ListFabricTestingFlags(ctx context.Context, actorUserID string) ([]FabricTestingFlag, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListFabricTestingFlags(ctx) } func (s *Service) GetEffectiveNodeTestingFlags(ctx context.Context, clusterID, nodeID string) 
(EffectiveNodeTestingFlags, error) { if clusterID == "" || nodeID == "" { return EffectiveNodeTestingFlags{}, ErrInvalidPayload } return s.store.GetEffectiveNodeTestingFlags(ctx, clusterID, nodeID) } func (s *Service) IssueFabricServiceChannelLease(ctx context.Context, input IssueFabricServiceChannelLeaseInput) (FabricServiceChannelLease, error) { input.ClusterID = strings.TrimSpace(input.ClusterID) input.OrganizationID = strings.TrimSpace(input.OrganizationID) input.UserID = strings.TrimSpace(input.UserID) input.ResourceID = strings.TrimSpace(input.ResourceID) input.ServiceClass = normalizeFabricServiceClass(input.ServiceClass) input.EntryNodeIDs = dedupeStrings(input.EntryNodeIDs) input.ExitNodeIDs = dedupeStrings(input.ExitNodeIDs) input.PreferredEntryNodeID = strings.TrimSpace(input.PreferredEntryNodeID) input.PreferredExitNodeID = strings.TrimSpace(input.PreferredExitNodeID) if input.ClusterID == "" || input.OrganizationID == "" || input.UserID == "" || input.ServiceClass == "" || len(input.EntryNodeIDs) == 0 || len(input.ExitNodeIDs) == 0 { return FabricServiceChannelLease{}, ErrInvalidPayload } if !isAllowedFabricServiceClass(input.ServiceClass) { return FabricServiceChannelLease{}, ErrInvalidPayload } ttl := input.TTL if ttl <= 0 { ttl = time.Minute } if ttl > 6*time.Hour { ttl = 6 * time.Hour } now := s.now().UTC() expiresAt := now.Add(ttl) routeGeneration := "fsc-" + now.Format("20060102T150405.000000000Z") allowedChannels := normalizeFabricServiceChannels(input.AllowedChannels, input.ServiceClass) requiredRoles := normalizeFabricRequiredRoles(input.RequiredRoles, input.ServiceClass) cluster, err := s.store.GetCluster(ctx, input.ClusterID) if errors.Is(err, pgx.ErrNoRows) { return FabricServiceChannelLease{}, ErrInvalidCluster } if err != nil { return FabricServiceChannelLease{}, err } poolPolicy := fabricServiceChannelPoolPolicyFromCluster(cluster) if input.BackendFallbackAllowed != nil { poolPolicy.BackendFallbackAllowed = *input.BackendFallbackAllowed 
} entryNodeIDs := fabricServiceChannelEffectivePool(input.EntryNodeIDs, poolPolicy.EntryPoolNodeIDs) exitNodeIDs := fabricServiceChannelEffectivePool(input.ExitNodeIDs, poolPolicy.ExitPoolNodeIDs) if len(entryNodeIDs) == 0 || len(exitNodeIDs) == 0 { return FabricServiceChannelLease{}, ErrInvalidPayload } selectedEntry := selectFabricServiceChannelPreferredNode(entryNodeIDs, firstNonEmptyString(poolPolicy.PreferredEntryNodeID, input.PreferredEntryNodeID)) selectedExit := selectFabricServiceChannelPreferredNode(exitNodeIDs, firstNonEmptyString(poolPolicy.PreferredExitNodeID, input.PreferredExitNodeID)) if selectedEntry == "" || selectedExit == "" { return FabricServiceChannelLease{}, ErrInvalidPayload } intents, err := s.store.ListRouteIntents(ctx, input.ClusterID) if err != nil { return FabricServiceChannelLease{}, err } recoveryPolicy := s.fabricServiceChannelRecoveryPolicy(ctx, input.ClusterID) routeProvenance := fabricServiceChannelRouteProvenanceFromIntents(intents) feedback, err := s.fabricServiceChannelRouteFeedback(ctx, input.ClusterID, entryNodeIDs, now, recoveryPolicy, routeProvenance) if err != nil { return FabricServiceChannelLease{}, err } routes := fabricServiceChannelRoutesFromIntents(intents, input.ServiceClass, entryNodeIDs, exitNodeIDs, allowedChannels, routeGeneration, now, expiresAt, feedback, recoveryPolicy) primary, alternates := selectFabricServicePrimaryRoute(routes, selectedEntry, selectedExit) if primary.RouteID != "" && containsString(entryNodeIDs, primary.SourceNodeID) { selectedEntry = primary.SourceNodeID } if primary.RouteID != "" && containsString(exitNodeIDs, primary.DestinationNodeID) { selectedExit = primary.DestinationNodeID } fallback := FabricServiceChannelFallback{ Allowed: true, Transport: "backend_relay", BackendRelay: true, Compatibility: true, Reason: "compatibility_fallback_available", } fallback.Allowed = poolPolicy.BackendFallbackAllowed fallback.BackendRelay = poolPolicy.BackendFallbackAllowed status := 
FabricServiceChannelStatusReady if primary.RouteID == "" { if poolPolicy.BackendFallbackAllowed { status = FabricServiceChannelStatusDegradedFallback fallback.Active = true fallback.Degraded = true fallback.Reason = "no_authorized_fabric_route_for_selected_entry_exit" } else { status = "blocked_no_fabric_route" fallback.Active = false fallback.Degraded = true fallback.Reason = "backend_fallback_disabled_by_pool_policy" } if fabricServiceRoutesFencedForSelectedPair(routes, selectedEntry, selectedExit) { fallback.Reason = "fabric_route_rebuild_pending_backend_relay" } else if fabricServiceRoutesFencedForPool(routes) { fallback.Reason = "fabric_entry_exit_pool_rebuild_pending_backend_relay" } primary = FabricServiceChannelRoute{ ClusterID: input.ClusterID, ServiceClass: input.ServiceClass, SourceNodeID: selectedEntry, DestinationNodeID: selectedExit, Hops: []string{selectedEntry, selectedExit}, AllowedChannels: allowedChannels, Generation: routeGeneration, Status: "missing_route_intent", RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(recoveryPolicy), PathScore: 1, ScoreReasons: []string{"fallback_until_fabric_route_exists"}, ExpiresAt: expiresAt, } } else { fallback.Active = false fallback.Degraded = false } channelID := uuidLikeRandom() if channelID == "" { channelID = "fabric-channel-" + now.Format("20060102T150405.000000000Z") } token := uuidLikeRandom() if token == "" { token = channelID } lease := FabricServiceChannelLease{ SchemaVersion: "rap.fabric_service_channel_lease.v1", ChannelID: channelID, ClusterID: input.ClusterID, OrganizationID: input.OrganizationID, UserID: input.UserID, ResourceID: input.ResourceID, ServiceClass: input.ServiceClass, Status: status, SelectedEntryNodeID: selectedEntry, SelectedExitNodeID: selectedExit, EntryPool: fabricServiceChannelNodePool(entryNodeIDs, "entry", selectedEntry), ExitPool: fabricServiceChannelNodePool(exitNodeIDs, "exit", selectedExit), RequiredRoles: requiredRoles, AllowedChannels: allowedChannels, 
PrimaryRoute: primary, AlternateRoutes: alternates, RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(recoveryPolicy), PoolPolicy: fabricServiceChannelPoolPolicyRef(poolPolicy), DataPlane: fabricServiceChannelDataPlaneContract(input.ServiceClass, poolPolicy, fallback), QoS: defaultJSON(input.QoS, defaultFabricServiceQoS(input.ServiceClass)), Failover: defaultJSON(input.Failover, fabricServiceFailoverFromPoolPolicy(poolPolicy)), Fallback: fallback, Token: FabricServiceChannelToken{ Type: "control_plane_issued_bearer", Token: "rap_fsc_" + strings.ReplaceAll(token, "-", ""), TTLSeconds: int(ttl.Seconds()), IntrospectionPath: "/api/v1/clusters/{cluster_id}/fabric/service-channels/{channel_id}/introspect", }, EntryHTTP: fabricServiceChannelHTTPIngress(input.ServiceClass), RouteGeneration: routeGeneration, FencingEpoch: now.UnixNano(), IssuedAt: now, ExpiresAt: expiresAt, Metadata: defaultJSON(input.Metadata, `{}`), } if signed, err := s.signFabricServiceChannelLease(ctx, lease); err == nil { lease = signed } s.rememberFabricServiceChannelLease(lease) if _, err := s.store.StoreFabricServiceChannelLease(ctx, StoreFabricServiceChannelLeaseInput{ Lease: lease, TokenHash: fabricServiceChannelTokenHash(lease.Token.Token), }); err != nil { return FabricServiceChannelLease{}, err } return lease, nil } func (s *Service) rememberFabricServiceChannelLease(lease FabricServiceChannelLease) { if strings.TrimSpace(lease.ClusterID) == "" || strings.TrimSpace(lease.ChannelID) == "" || strings.TrimSpace(lease.Token.Token) == "" { return } now := s.now() if now.IsZero() { now = time.Now().UTC() } s.fabricServiceChannelLeaseMu.Lock() defer s.fabricServiceChannelLeaseMu.Unlock() if s.fabricServiceChannelLeaseCache == nil { s.fabricServiceChannelLeaseCache = map[string]FabricServiceChannelLease{} } for key, item := range s.fabricServiceChannelLeaseCache { if !item.ExpiresAt.IsZero() && !item.ExpiresAt.After(now) { delete(s.fabricServiceChannelLeaseCache, key) } } 
s.fabricServiceChannelLeaseCache[fabricServiceChannelLeaseCacheKey(lease.ClusterID, lease.ChannelID)] = lease
}

// IntrospectFabricServiceChannelLease evaluates a presented service-channel
// token against the lease known for (ClusterID, ChannelID).
//
// The lease is taken from the in-memory cache when a non-expired entry exists;
// otherwise it is loaded from the store and, when still valid, remembered in
// the cache. A missing or expired lease and every mismatch are reported as a
// "denied" introspection with a machine-readable Reason and a nil error. A
// non-nil error is returned only for an incomplete payload (ErrInvalidPayload)
// or for a store failure other than pgx.ErrNoRows.
func (s *Service) IntrospectFabricServiceChannelLease(ctx context.Context, input IntrospectFabricServiceChannelLeaseInput) (FabricServiceChannelLeaseIntrospection, error) {
	// Normalize every caller-supplied field before validating or comparing it.
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	input.ChannelID = strings.TrimSpace(input.ChannelID)
	input.ResourceID = strings.TrimSpace(input.ResourceID)
	input.ServiceClass = normalizeFabricServiceClass(input.ServiceClass)
	input.ChannelClass = strings.TrimSpace(strings.ToLower(input.ChannelClass))
	input.Token = strings.TrimSpace(input.Token)
	input.EntryNodeID = strings.TrimSpace(input.EntryNodeID)
	if input.ClusterID == "" || input.ChannelID == "" || input.Token == "" {
		return FabricServiceChannelLeaseIntrospection{}, ErrInvalidPayload
	}

	now := s.now()
	if now.IsZero() {
		now = time.Now().UTC()
	}
	cacheKey := fabricServiceChannelLeaseCacheKey(input.ClusterID, input.ChannelID)

	// Cache lookup: an expired entry is evicted and treated as a miss.
	var tokenHash string
	s.fabricServiceChannelLeaseMu.Lock()
	lease, found := s.fabricServiceChannelLeaseCache[cacheKey]
	switch {
	case !found:
		// Nothing cached for this channel.
	case !lease.ExpiresAt.IsZero() && !lease.ExpiresAt.After(now):
		delete(s.fabricServiceChannelLeaseCache, cacheKey)
		found = false
	default:
		tokenHash = fabricServiceChannelTokenHash(lease.Token.Token)
	}
	s.fabricServiceChannelLeaseMu.Unlock()

	// Store lookup on a cache miss. The lock is released first because
	// rememberFabricServiceChannelLease is called from this branch.
	if !found {
		record, err := s.store.GetFabricServiceChannelLease(ctx, input.ClusterID, input.ChannelID)
		switch {
		case err == nil:
			lease = record.Lease
			tokenHash = strings.TrimSpace(record.TokenHash)
			found = lease.ExpiresAt.IsZero() || lease.ExpiresAt.After(now)
			if found {
				s.rememberFabricServiceChannelLease(lease)
			}
		case !errors.Is(err, pgx.ErrNoRows):
			return FabricServiceChannelLeaseIntrospection{}, err
		}
	}

	// Start from a denial that echoes the request; lease data overrides it below.
	verdict := FabricServiceChannelLeaseIntrospection{
		SchemaVersion: "rap.fabric_service_channel_introspection.v1",
		ClusterID:     input.ClusterID,
		ChannelID:     input.ChannelID,
		ResourceID:    input.ResourceID,
		ServiceClass:  input.ServiceClass,
		AcceptedBy:    "introspection",
		Status:        "denied",
		Reason:        "lease_not_found",
	}
	if !found {
		return verdict, nil
	}

	verdict.ResourceID = lease.ResourceID
	verdict.ServiceClass = lease.ServiceClass
	verdict.SelectedEntryNodeID = lease.SelectedEntryNodeID
	verdict.SelectedExitNodeID = lease.SelectedExitNodeID
	verdict.AllowedChannels = append([]string{}, lease.AllowedChannels...)
	verdict.LeaseStatus = lease.Status
	verdict.PrimaryRoute = lease.PrimaryRoute
	verdict.DataPlane = lease.DataPlane
	verdict.RouteGeneration = lease.RouteGeneration
	verdict.FencingEpoch = lease.FencingEpoch
	verdict.ExpiresAt = lease.ExpiresAt

	// The first failing check determines the denial reason.
	denial := ""
	switch {
	case lease.ClusterID != input.ClusterID || lease.ChannelID != input.ChannelID || tokenHash == "" || tokenHash != fabricServiceChannelTokenHash(input.Token):
		denial = "lease_token_mismatch"
	case lease.ResourceID != "" && input.ResourceID != "" && lease.ResourceID != input.ResourceID:
		denial = "resource_mismatch"
	case input.ServiceClass != "" && lease.ServiceClass != input.ServiceClass:
		denial = "service_class_mismatch"
	case input.ChannelClass != "" && !containsString(lease.AllowedChannels, input.ChannelClass):
		denial = "channel_class_not_allowed"
	case input.EntryNodeID != "" && lease.SelectedEntryNodeID != "" && lease.SelectedEntryNodeID != input.EntryNodeID:
		denial = "entry_node_mismatch"
	}
	if denial != "" {
		verdict.Reason = denial
		return verdict, nil
	}

	verdict.Allowed = true
	verdict.Status = "allowed"
	verdict.Reason = "lease_introspection_allowed"
	needsFallback := lease.Status == FabricServiceChannelStatusDegradedFallback || lease.PrimaryRoute.Status == "missing_route_intent"
	if needsFallback {
		verdict.ForceBackendFallback = true
	} else {
		verdict.PreferredRouteID = strings.TrimSpace(lease.PrimaryRoute.RouteID)
	}
	return verdict, nil
}

// ListFabricServiceChannelLeases returns the lease maintenance view of a
// cluster: one summary per lease record returned by the store, plus
// active/expired counters.
//
// The call is rejected when ensurePlatformAdmin fails for actorUserID, and with
// ErrInvalidPayload when ClusterID is blank. A Limit outside 1..500 is replaced
// by 100. The report is "degraded" whenever at least one listed lease is
// expired.
func (s *Service) ListFabricServiceChannelLeases(ctx context.Context, actorUserID string, input ListFabricServiceChannelLeasesInput) (FabricServiceChannelLeaseMaintenance, error) {
	var none FabricServiceChannelLeaseMaintenance
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return none, err
	}

	input.ClusterID = strings.TrimSpace(input.ClusterID)
	input.ServiceClass = normalizeFabricServiceClass(input.ServiceClass)
	input.EntryNodeID = strings.TrimSpace(input.EntryNodeID)
	input.ResourceID = strings.TrimSpace(input.ResourceID)
	if input.ClusterID == "" {
		return none, ErrInvalidPayload
	}
	if input.Limit <= 0 || input.Limit > 500 {
		input.Limit = 100
	}

	// Reference time: explicit input, then the service clock, then wall clock.
	observed := input.Now
	if observed.IsZero() {
		observed = s.now()
	}
	if observed.IsZero() {
		observed = time.Now().UTC()
	}

	records, err := s.store.ListFabricServiceChannelLeases(ctx, input)
	if err != nil {
		return none, err
	}

	report := FabricServiceChannelLeaseMaintenance{
		SchemaVersion: "rap.fabric_service_channel_lease_maintenance.v1",
		ClusterID:     input.ClusterID,
		Status:        "ready",
		Reason:        "lease_maintenance_ready",
		ObservedAt:    observed.UTC(),
		WindowLimit:   input.Limit,
	}
	for _, record := range records {
		summary := fabricServiceChannelLeaseSummaryFromRecord(record, observed)
		if !summary.Expired {
			report.ActiveCount++
		} else {
			report.ExpiredCount++
		}
		report.Leases = append(report.Leases, summary)
	}
	report.ScannedCount = len(report.Leases)

	if report.ExpiredCount == 0 {
		return report, nil
	}
	report.Status = "degraded"
	report.Reason = "expired_leases_pending_cleanup"
	report.RecommendedOperatorAction = "Run service-channel lease cleanup to remove expired compatibility lease records."
	return report, nil
}

// CleanupFabricServiceChannelLeases deletes up to Limit expired leases of a
// cluster through the store and then returns a fresh maintenance listing
// (expired leases included) annotated with the number of deleted records.
//
// The call is rejected when ensurePlatformAdmin fails for input.ActorUserID,
// and with ErrInvalidPayload when ClusterID is blank. A Limit outside 1..1000
// is replaced by 100. The result is "degraded" when expired leases are still
// listed after the cleanup.
func (s *Service) CleanupFabricServiceChannelLeases(ctx context.Context, input CleanupFabricServiceChannelLeasesInput) (FabricServiceChannelLeaseMaintenance, error) {
	var none FabricServiceChannelLeaseMaintenance
	if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil {
		return none, err
	}

	input.ClusterID = strings.TrimSpace(input.ClusterID)
	if input.ClusterID == "" {
		return none, ErrInvalidPayload
	}
	if input.Limit <= 0 || input.Limit > 1000 {
		input.Limit = 100
	}

	cutoff := input.Now
	if cutoff.IsZero() {
		cutoff = s.now()
	}
	if cutoff.IsZero() {
		cutoff = time.Now().UTC()
	}

	deleted, err := s.store.CleanupExpiredFabricServiceChannelLeases(ctx, input.ClusterID, cutoff.UTC(), input.Limit)
	if err != nil {
		return none, err
	}

	// Re-list with the same reference time so the counters reflect the cleanup.
	listing := ListFabricServiceChannelLeasesInput{
		ClusterID:      input.ClusterID,
		IncludeExpired: true,
		Limit:          input.Limit,
		Now:            cutoff,
	}
	out, err := s.ListFabricServiceChannelLeases(ctx, input.ActorUserID, listing)
	if err != nil {
		return none, err
	}

	out.DeletedExpiredCount = deleted
	if out.ExpiredCount > 0 {
		out.Status = "degraded"
		out.Reason = "expired_leases_remaining"
		out.RecommendedOperatorAction = "Run cleanup again; expired leases remain beyond the bounded cleanup window."
	} else {
		out.Status = "ready"
		out.Reason = "expired_leases_cleaned"
		out.RecommendedOperatorAction = ""
	}
	return out, nil
}

// GetFabricServiceChannelAccessTelemetry assembles the cluster-wide
// service-channel access telemetry report: per-node access reports,
// flow-scheduler figures from the latest heartbeat, active leases correlated
// with route feedback and route-manager decisions, remediation hints, and an
// overall status / flow-health verdict.
//
// The call is rejected when ensurePlatformAdmin fails for actorUserID, and with
// ErrInvalidPayload when ClusterID is blank. A Limit outside 1..200 is replaced
// by 100. Per-node telemetry/heartbeat read errors are skipped; errors from
// listing nodes, route feedback, or leases abort the call.
func (s *Service) GetFabricServiceChannelAccessTelemetry(ctx context.Context, actorUserID string, input GetFabricServiceChannelAccessTelemetryInput) (FabricServiceChannelAccessTelemetry, error) {
	if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil {
		return FabricServiceChannelAccessTelemetry{}, err
	}
	input.ClusterID = strings.TrimSpace(input.ClusterID)
	if input.ClusterID == "" {
		return FabricServiceChannelAccessTelemetry{}, ErrInvalidPayload
	}
	if input.Limit <= 0 || input.Limit > 200 {
		input.Limit = 100
	}
	now := input.Now
	if now.IsZero() {
		now = s.now()
	}
	if now.IsZero() {
		now = time.Now().UTC()
	}
	nodes, err := s.store.ListClusterNodes(ctx, input.ClusterID)
	if err != nil {
		return FabricServiceChannelAccessTelemetry{}, err
	}
	out := FabricServiceChannelAccessTelemetry{
		SchemaVersion: "rap.fabric_service_channel_access_telemetry.v1",
		ClusterID: input.ClusterID,
		Status: "ready",
		Reason: "access_telemetry_ready",
		ObservedAt: now.UTC(),
		NodeCount: len(nodes),
		TrafficClassCounts: map[string]int{},
		RecommendedParallelWindows: map[string]int{},
	}
	for _, node := range nodes {
		// At most input.Limit node reports are collected.
		if len(out.Nodes) >= input.Limit {
			break
		}
		items, err := s.store.ListNodeTelemetry(ctx, input.ClusterID, node.ID, 5)
		if err != nil {
			continue
		}
		// Use the first telemetry item that carries an access report.
		report := map[string]any{}
		var observedAt time.Time
		for _, item := range items {
			payload := jsonObject(item.Payload)
			report = jsonMapPath(payload, "fabric_service_channel_access_report")
			if len(report) > 0 {
				observedAt = item.ObservedAt
				break
			}
		}
		// Otherwise look for the same report inside recent heartbeat metadata.
		if len(report) == 0 {
			heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 5)
			if err == nil {
				for _, heartbeat := range heartbeats {
					payload := jsonObject(heartbeat.Metadata)
					report = jsonMapPath(payload, "fabric_service_channel_access_report")
					if len(report) > 0 {
						observedAt = heartbeat.ObservedAt
						break
					}
				}
			}
		}
		if len(report) == 0 {
			continue
		}
		nodeReport := FabricServiceChannelAccessTelemetryNode{
			NodeID: node.ID,
			NodeName: node.Name,
			ObservedAt: observedAt,
TotalAccepted: jsonInt(report, "total"),
			SignedAccepted: jsonInt(report, "signed"),
			IntrospectionAccepted: jsonInt(report, "introspection"),
			LegacyUnsignedAccepted: jsonInt(report, "legacy_unsigned"),
			BackendFallbackCount: jsonInt(report, "backend_fallback"),
			BackendFallbackBlockedCount: jsonInt(report, "backend_fallback_blocked"),
			FabricRouteSendFailureCount: jsonInt(report, "fabric_route_send_failure"),
			DataPlaneContractCount: jsonInt(report, "data_plane_contract"),
			LastDataPlaneMode: jsonString(report, "last_data_plane_mode"),
			LastWorkingDataTransport: jsonString(report, "last_working_data_transport"),
			LastSteadyStateTransport: jsonString(report, "last_steady_state_transport"),
			LastBackendRelayPolicy: jsonString(report, "last_backend_relay_policy"),
			LastLogicalFlowMode: jsonString(report, "last_logical_flow_mode"),
			LastDataPlaneViolationStatus: jsonString(report, "last_data_plane_violation_status"),
			LastDataPlaneViolationReason: jsonString(report, "last_data_plane_violation_reason"),
		}
		// When a short counter key yielded zero, retry with the accepted_by_* key.
		if nodeReport.SignedAccepted == 0 {
			nodeReport.SignedAccepted = jsonInt(report, "accepted_by_signed")
		}
		if nodeReport.IntrospectionAccepted == 0 {
			nodeReport.IntrospectionAccepted = jsonInt(report, "accepted_by_introspection")
		}
		if nodeReport.LegacyUnsignedAccepted == 0 {
			nodeReport.LegacyUnsignedAccepted = jsonInt(report, "accepted_by_legacy_unsigned")
		}
		// An unparsable last_accepted_at is ignored; the newest parsed value
		// across nodes becomes the cluster-wide LatestAcceptedAt.
		if value := jsonString(report, "last_accepted_at"); value != "" {
			if parsed, err := time.Parse(time.RFC3339Nano, value); err == nil {
				nodeReport.LastAcceptedAt = &parsed
				if out.LatestAcceptedAt == nil || parsed.After(*out.LatestAcceptedAt) {
					// Copy so the cluster-level pointer does not alias the node-level one.
					latest := parsed
					out.LatestAcceptedAt = &latest
				}
			}
		}
		// Flow-scheduler figures come from the first heartbeat returned for a
		// limit of 1; a read error or empty result leaves them at zero values.
		if heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 1); err == nil && len(heartbeats) > 0 {
			flowScheduler := fabricServiceChannelFlowSchedulerFromHeartbeat(heartbeats[0])
			nodeReport.TrafficClassCounts = jsonStringIntMap(flowScheduler, "traffic_class_counts")
			nodeReport.FlowChannelCount = jsonInt(flowScheduler, "channel_count")
			nodeReport.FlowDropped = jsonInt(flowScheduler, "dropped")
			nodeReport.FlowHighWatermark = jsonInt(flowScheduler, "high_watermark")
			nodeReport.FlowMaxInFlight = jsonInt(flowScheduler, "max_in_flight")
			nodeReport.RecommendedParallelWindows = jsonStringIntMap(flowScheduler, "recommended_parallel_windows")
			nodeReport.AdaptiveBackpressureActive = jsonBool(flowScheduler, "adaptive_backpressure_active")
			nodeReport.AdaptiveBackpressureReason = jsonString(flowScheduler, "adaptive_backpressure_reason")
			nodeReport.AdaptivePolicyFingerprint = jsonString(flowScheduler, "adaptive_policy_fingerprint")
		}
		// Node-level flow health has no route-quality inputs, hence the zeros.
		nodeReport.FlowHealthStatus, nodeReport.FlowHealthReason, _ = fabricServiceChannelFlowHealth(
			nodeReport.TrafficClassCounts,
			nodeReport.FlowDropped,
			nodeReport.FlowHighWatermark,
			nodeReport.FlowMaxInFlight,
			nodeReport.BackendFallbackCount,
			0,
			0,
			0,
			0,
		)
		// Fold this node into the cluster totals.
		out.ReportingNodeCount++
		out.TotalAccepted += nodeReport.TotalAccepted
		out.SignedAccepted += nodeReport.SignedAccepted
		out.IntrospectionAccepted += nodeReport.IntrospectionAccepted
		out.LegacyUnsignedAccepted += nodeReport.LegacyUnsignedAccepted
		out.BackendFallbackCount += nodeReport.BackendFallbackCount
		out.BackendFallbackBlockedCount += nodeReport.BackendFallbackBlockedCount
		out.FabricRouteSendFailureCount += nodeReport.FabricRouteSendFailureCount
		out.DataPlaneContractCount += nodeReport.DataPlaneContractCount
		// Cluster-level "last_*" strings keep the first non-empty value seen
		// in node iteration order.
		if out.LastDataPlaneMode == "" {
			out.LastDataPlaneMode = nodeReport.LastDataPlaneMode
		}
		if out.LastWorkingDataTransport == "" {
			out.LastWorkingDataTransport = nodeReport.LastWorkingDataTransport
		}
		if out.LastSteadyStateTransport == "" {
			out.LastSteadyStateTransport = nodeReport.LastSteadyStateTransport
		}
		if out.LastBackendRelayPolicy == "" {
			out.LastBackendRelayPolicy = nodeReport.LastBackendRelayPolicy
		}
		if out.LastLogicalFlowMode == "" {
			out.LastLogicalFlowMode = nodeReport.LastLogicalFlowMode
		}
		if out.LastDataPlaneViolationStatus == "" {
			out.LastDataPlaneViolationStatus = nodeReport.LastDataPlaneViolationStatus
		}
		if out.LastDataPlaneViolationReason == "" {
			out.LastDataPlaneViolationReason = nodeReport.LastDataPlaneViolationReason
		}
		mergeStringIntMap(out.TrafficClassCounts, nodeReport.TrafficClassCounts)
		mergeMinStringIntMap(out.RecommendedParallelWindows, nodeReport.RecommendedParallelWindows)
		if nodeReport.AdaptiveBackpressureActive {
			out.AdaptiveBackpressureActive = true
			if out.AdaptiveBackpressureReason == "" {
				out.AdaptiveBackpressureReason = nodeReport.AdaptiveBackpressureReason
			}
		}
		if out.AdaptivePolicyFingerprint == "" {
			out.AdaptivePolicyFingerprint = nodeReport.AdaptivePolicyFingerprint
		}
		// Channel count and drops are summed; watermark and in-flight keep the maximum.
		out.FlowChannelCount += nodeReport.FlowChannelCount
		out.FlowDropped += nodeReport.FlowDropped
		if nodeReport.FlowHighWatermark > out.FlowHighWatermark {
			out.FlowHighWatermark = nodeReport.FlowHighWatermark
		}
		if nodeReport.FlowMaxInFlight > out.FlowMaxInFlight {
			out.FlowMaxInFlight = nodeReport.FlowMaxInFlight
		}
		out.Nodes = append(out.Nodes, nodeReport)
	}
	// Empty aggregate maps are replaced by nil.
	if len(out.TrafficClassCounts) == 0 {
		out.TrafficClassCounts = nil
	}
	if len(out.RecommendedParallelWindows) == 0 {
		out.RecommendedParallelWindows = nil
	}
	// Index node reports by node ID for the per-channel correlation below.
	nodeReportsByID := map[string]FabricServiceChannelAccessTelemetryNode{}
	for _, node := range out.Nodes {
		nodeReportsByID[node.NodeID] = node
	}
	// Collect route-manager state from each node's latest heartbeat. This
	// walks all cluster nodes, not only those that produced an access report.
	routeManagerByNodeID := map[string]map[string]any{}
	routeManagerTransitionByNodeID := map[string]map[string]any{}
	for _, node := range nodes {
		heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 1)
		if err != nil || len(heartbeats) == 0 {
			continue
		}
		metadata := jsonObject(heartbeats[0].Metadata)
		runtime := jsonMapPath(metadata, "fabric_service_channel_runtime_report")
		ingress := jsonMapPath(runtime, "ingress")
		routeManager := jsonMapPath(ingress, "route_manager")
		if len(routeManager) > 0 {
			routeManagerByNodeID[node.ID] = routeManager
		}
		transition := jsonMapPath(ingress, "route_manager_transition")
		if len(transition) > 0 {
			routeManagerTransitionByNodeID[node.ID] = transition
		}
	}
	// Route feedback is requested for the VPN-packets service class only,
	// with expired observations excluded.
	feedbackItems, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
		ClusterID: input.ClusterID,
		ServiceClass: FabricServiceClassVPNPackets,
		Now: now.UTC(),
		IncludeExpired: false,
	})
	if err != nil {
		return FabricServiceChannelAccessTelemetry{}, err
	}
	// Keep only the most recent observation per route ID.
	feedbackByRouteID := map[string]FabricServiceChannelRouteFeedbackObservation{}
	for _, item := range feedbackItems {
		if strings.TrimSpace(item.RouteID) == "" {
			continue
		}
		current, ok := feedbackByRouteID[item.RouteID]
		if !ok || item.ObservedAt.After(current.ObservedAt) {
			feedbackByRouteID[item.RouteID] = item
		}
	}
	leaseRecords, err := s.store.ListFabricServiceChannelLeases(ctx, ListFabricServiceChannelLeasesInput{
		ClusterID: input.ClusterID,
		IncludeExpired: false,
		Limit: input.Limit,
		Now: now.UTC(),
	})
	if err != nil {
		return FabricServiceChannelAccessTelemetry{}, err
	}
	// Build one channel entry per listed lease.
	for _, record := range leaseRecords {
		summary := fabricServiceChannelLeaseSummaryFromRecord(record, now)
		channel := FabricServiceChannelAccessTelemetryChannel{
			ChannelID: summary.ChannelID,
			ResourceID: summary.ResourceID,
			ServiceClass: summary.ServiceClass,
			Status: summary.Status,
			SelectedEntryNodeID: summary.SelectedEntryNodeID,
			SelectedExitNodeID: summary.SelectedExitNodeID,
			PrimaryRouteID: summary.PrimaryRouteID,
			PrimaryRouteStatus: summary.PrimaryRouteStatus,
			ForceBackendFallback: summary.ForceBackendFallback,
			DataPlane: summary.DataPlane,
			ExpiresAt: summary.ExpiresAt,
		}
		if record.Lease.PoolPolicy != nil {
			channel.PoolPolicyFingerprint = record.Lease.PoolPolicy.Fingerprint
		}
		// Copy the selected entry node's figures onto the channel when that
		// node produced a report.
		if entryReport, ok := nodeReportsByID[channel.SelectedEntryNodeID]; ok {
			channel.EntryNodeTotalAccepted = entryReport.TotalAccepted
			channel.EntryNodeIntrospectionAccepted = entryReport.IntrospectionAccepted
			channel.EntryNodeBackendFallbackCount = entryReport.BackendFallbackCount
			channel.EntryNodeBackendFallbackBlockedCount = entryReport.BackendFallbackBlockedCount
			channel.EntryNodeFabricRouteSendFailureCount = entryReport.FabricRouteSendFailureCount
			channel.EntryNodeDataPlaneContractCount = entryReport.DataPlaneContractCount
			channel.EntryNodeLastDataPlaneMode = entryReport.LastDataPlaneMode
			channel.EntryNodeLastWorkingDataTransport = entryReport.LastWorkingDataTransport
			channel.EntryNodeLastSteadyStateTransport = entryReport.LastSteadyStateTransport
			channel.EntryNodeLastBackendRelayPolicy = entryReport.LastBackendRelayPolicy
			channel.EntryNodeLastLogicalFlowMode = entryReport.LastLogicalFlowMode
			channel.EntryNodeLastDataPlaneViolationStatus = entryReport.LastDataPlaneViolationStatus
			channel.EntryNodeLastDataPlaneViolationReason = entryReport.LastDataPlaneViolationReason
			channel.EntryNodeTrafficClassCounts = copyStringIntMap(entryReport.TrafficClassCounts)
			channel.EntryNodeFlowChannelCount = entryReport.FlowChannelCount
			channel.EntryNodeFlowDropped = entryReport.FlowDropped
			channel.EntryNodeFlowHighWatermark = entryReport.FlowHighWatermark
			channel.EntryNodeFlowMaxInFlight = entryReport.FlowMaxInFlight
			channel.EntryNodeFlowHealthStatus = entryReport.FlowHealthStatus
			channel.EntryNodeFlowHealthReason = entryReport.FlowHealthReason
			channel.EntryNodeRecommendedParallelWindows = copyStringIntMap(entryReport.RecommendedParallelWindows)
			channel.EntryNodeAdaptiveBackpressureActive = entryReport.AdaptiveBackpressureActive
			channel.EntryNodeAdaptiveBackpressureReason = entryReport.AdaptiveBackpressureReason
			channel.EntryNodeAdaptivePolicyFingerprint = entryReport.AdaptivePolicyFingerprint
		}
		// Correlate with the latest feedback recorded for the primary route.
		if feedback, ok := feedbackByRouteID[channel.PrimaryRouteID]; ok {
			observedAt := feedback.ObservedAt
			channel.RouteFeedbackStatus = feedback.FeedbackStatus
			channel.RouteFeedbackObservedAt = &observedAt
			channel.RouteFeedbackScoreAdjustment = feedback.ScoreAdjustment
			channel.RouteFeedbackEffectiveScoreAdjustment = feedback.EffectiveScoreAdjustment
			channel.RouteFeedbackReasons = append([]string{}, feedback.Reasons...)
			channel.RouteQualityWindowSampleCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_sample_count")
			channel.RouteQualityWindowFailureCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_failure_count")
			channel.RouteQualityWindowDropCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_drop_count")
			channel.RouteQualityWindowSlowCount = fabricServiceChannelFeedbackPayloadInt(feedback.Payload, "quality_window_slow_count")
			channel.LastSendDurationMs = feedback.LastSendDurationMs
			// Recompute the channel's flow health now that route-quality
			// inputs are available; this overwrites the value copied from
			// the entry node above.
			channel.EntryNodeFlowHealthStatus, channel.EntryNodeFlowHealthReason, _ = fabricServiceChannelFlowHealth(
				channel.EntryNodeTrafficClassCounts,
				channel.EntryNodeFlowDropped,
				channel.EntryNodeFlowHighWatermark,
				channel.EntryNodeFlowMaxInFlight,
				channel.EntryNodeBackendFallbackCount,
				channel.LastSendDurationMs,
				channel.RouteQualityWindowFailureCount,
				channel.RouteQualityWindowDropCount,
				channel.RouteQualityWindowSlowCount,
			)
			out.CorrelatedRouteCount++
			if feedback.FeedbackStatus == "degraded" || feedback.FeedbackStatus == "fenced" || feedback.EffectiveScoreAdjustment < 0 || feedback.ScoreAdjustment < 0 {
				out.DegradedRouteCount++
			}
		}
		// Remediation pipeline; each step receives the channel produced by
		// the previous one, so the order of these calls matters.
		channel = fabricServiceChannelAccessRemediation(channel, record.Lease, now)
		channel = fabricServiceChannelAccessRouteDecisionTelemetry(channel, routeManagerByNodeID[channel.SelectedEntryNodeID], routeManagerTransitionByNodeID[channel.SelectedEntryNodeID])
		channel = fabricServiceChannelAccessRemediationExecution(channel, routeManagerByNodeID[channel.SelectedEntryNodeID], routeManagerTransitionByNodeID[channel.SelectedEntryNodeID], now)
		channel = s.fabricServiceChannelAccessRemediationLedgerExecution(ctx, input.ClusterID, channel)
		fabricServiceChannelAccumulateRouteDecisionTelemetry(&out, channel)
		if channel.ForceBackendFallback {
			out.DegradedFallbackChannelCount++
		}
		out.ActiveChannels = append(out.ActiveChannels, channel)
	}
	out.ActiveChannelCount = len(out.ActiveChannels)
	// Nodes: highest TotalAccepted first, ties broken by node name.
	sort.Slice(out.Nodes, func(i, j int) bool {
		if out.Nodes[i].TotalAccepted != out.Nodes[j].TotalAccepted {
			return out.Nodes[i].TotalAccepted > out.Nodes[j].TotalAccepted
		}
		return out.Nodes[i].NodeName < out.Nodes[j].NodeName
	})
	// Channels: forced-fallback channels first, then descending feedback
	// status string, then earliest expiry.
	sort.Slice(out.ActiveChannels, func(i, j int) bool {
		if out.ActiveChannels[i].ForceBackendFallback != out.ActiveChannels[j].ForceBackendFallback {
			return out.ActiveChannels[i].ForceBackendFallback
		}
		if out.ActiveChannels[i].RouteFeedbackStatus != out.ActiveChannels[j].RouteFeedbackStatus {
			return out.ActiveChannels[i].RouteFeedbackStatus > out.ActiveChannels[j].RouteFeedbackStatus
		}
		return out.ActiveChannels[i].ExpiresAt.Before(out.ActiveChannels[j].ExpiresAt)
	})
	// Overall status: the first matching condition wins.
	if out.NoSafeRecoveryDecisionCount > 0 {
		out.Status = "degraded"
		out.Reason = "active_channels_no_safe_recovery"
		out.RecommendedOperatorAction = "Inspect active service-channel route decisions; at least one channel has no safe recovery route."
	} else if out.ReportingNodeCount == 0 {
		out.Status = "degraded"
		out.Reason = "no_access_telemetry_reported"
		out.RecommendedOperatorAction = "Wait for node telemetry or verify fabric_service_channel_access_telemetry capability on node-agent."
	} else if out.DegradedFallbackChannelCount > 0 || out.DegradedRouteCount > 0 {
		out.Status = "degraded"
		out.Reason = "active_channels_degraded"
		out.RecommendedOperatorAction = "Inspect active service-channel routes with backend fallback or degraded route-quality feedback."
} out.FlowHealthStatus, out.FlowHealthReason, _ = fabricServiceChannelFlowHealth( out.TrafficClassCounts, out.FlowDropped, out.FlowHighWatermark, out.FlowMaxInFlight, out.BackendFallbackCount, 0, 0, 0, 0, ) for _, channel := range out.ActiveChannels { out.FlowHealthStatus, out.FlowHealthReason = fabricServiceChannelWorseFlowHealth(out.FlowHealthStatus, out.FlowHealthReason, channel.EntryNodeFlowHealthStatus, channel.EntryNodeFlowHealthReason) } if out.FlowHealthStatus == "critical" || out.FlowHealthStatus == "degraded" { out.Status = "degraded" if out.Reason == "access_telemetry_ready" { out.Reason = "flow_health_degraded" } if out.RecommendedOperatorAction == "" { out.RecommendedOperatorAction = fabricServiceChannelFlowHealthAction(out.FlowHealthStatus, out.FlowHealthReason) } } else if out.FlowHealthStatus == "watch" && out.RecommendedOperatorAction == "" { out.RecommendedOperatorAction = fabricServiceChannelFlowHealthAction(out.FlowHealthStatus, out.FlowHealthReason) } return out, nil } func fabricServiceChannelFlowHealth(trafficClassCounts map[string]int, flowDropped, flowHighWatermark, flowMaxInFlight, backendFallbackCount int, lastSendDurationMs int64, routeFailureCount, routeDropCount, routeSlowCount int) (string, string, string) { switch { case flowDropped > 0: return "critical", "flow_drops_reported", fabricServiceChannelFlowHealthAction("critical", "flow_drops_reported") case routeDropCount > 0: return "critical", "route_quality_window_drops_reported", fabricServiceChannelFlowHealthAction("critical", "route_quality_window_drops_reported") case backendFallbackCount > 0: return "degraded", "backend_fallback_observed", fabricServiceChannelFlowHealthAction("degraded", "backend_fallback_observed") case routeFailureCount > 0: return "degraded", "route_quality_window_failures_reported", fabricServiceChannelFlowHealthAction("degraded", "route_quality_window_failures_reported") case routeSlowCount > 0: return "degraded", 
"route_quality_window_slow_samples_reported", fabricServiceChannelFlowHealthAction("degraded", "route_quality_window_slow_samples_reported") case lastSendDurationMs >= 1000: return "degraded", "route_send_latency_high", fabricServiceChannelFlowHealthAction("degraded", "route_send_latency_high") } bulk := trafficClassCounts["bulk"] interactive := trafficClassCounts["interactive"] + trafficClassCounts["control"] switch { case flowHighWatermark >= 64 || flowMaxInFlight >= 16: return "degraded", "flow_queue_pressure_high", fabricServiceChannelFlowHealthAction("degraded", "flow_queue_pressure_high") case bulk >= 16 && interactive > 0: return "watch", "bulk_pressure_with_interactive_qos_observed", fabricServiceChannelFlowHealthAction("watch", "bulk_pressure_with_interactive_qos_observed") case bulk >= 16: return "watch", "bulk_pressure_observed", fabricServiceChannelFlowHealthAction("watch", "bulk_pressure_observed") case flowHighWatermark >= 16 || flowMaxInFlight >= 4: return "watch", "flow_queue_pressure_observed", fabricServiceChannelFlowHealthAction("watch", "flow_queue_pressure_observed") default: return "healthy", "flow_health_ready", fabricServiceChannelFlowHealthAction("healthy", "flow_health_ready") } } func fabricServiceChannelWorseFlowHealth(currentStatus, currentReason, candidateStatus, candidateReason string) (string, string) { if candidateStatus == "" { return currentStatus, currentReason } if fabricServiceChannelFlowHealthRank(candidateStatus) > fabricServiceChannelFlowHealthRank(currentStatus) { return candidateStatus, candidateReason } return currentStatus, currentReason } func fabricServiceChannelFlowHealthRank(status string) int { switch status { case "critical": return 4 case "degraded": return 3 case "watch": return 2 case "healthy": return 1 default: return 0 } } func fabricServiceChannelFlowHealthAction(status, reason string) string { switch status { case "critical": return "Reduce or reroute service-channel pressure immediately; inspect flow 
drops, route drops, and backend fallback before adding user traffic." case "degraded": return "Inspect service-channel route quality and active entry-node pressure; prefer alternate route or rebuild when degraded evidence persists." case "watch": if reason == "bulk_pressure_with_interactive_qos_observed" { return "Bulk pressure is active while interactive/control remains observable; keep watching latency and drops before increasing load." } return "Bulk or queue pressure is visible; monitor interactive/control traffic before increasing production load." default: return "Flow health is within the current service-channel guard policy." } } func fabricServiceChannelAccessRemediation(channel FabricServiceChannelAccessTelemetryChannel, lease FabricServiceChannelLease, now time.Time) FabricServiceChannelAccessTelemetryChannel { if channel.ForceBackendFallback { channel.RemediationAction = "use_backend_fallback" channel.RemediationReason = "explicit_backend_fallback_active" channel.RecommendedOperatorAction = "Inspect missing/fenced fabric route and keep backend fallback visible until a normal route is available." channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now) return channel } degraded := channel.RouteFeedbackStatus == "degraded" || channel.RouteFeedbackStatus == "fenced" || channel.RouteFeedbackScoreAdjustment < 0 || channel.RouteFeedbackEffectiveScoreAdjustment < 0 if !degraded { channel.RemediationAction = "none" channel.RemediationReason = "active_route_quality_acceptable" channel.RecommendedOperatorAction = "No route remediation required." return channel } if containsString(channel.RouteFeedbackReasons, "service_channel_degraded_fallback_recommended") { channel.RemediationAction = "use_backend_fallback" channel.RemediationReason = "route_feedback_recommends_degraded_fallback" channel.RecommendedOperatorAction = "Use explicit degraded backend fallback while route rebuild catches up." 
channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
		return channel
	}
	// An authorized alternate exists: prefer it unless the lease pool policy
	// guard does not report it as "allowed".
	if alternate, ok := fabricServiceChannelFirstAuthorizedAlternate(lease.AlternateRoutes, channel.PrimaryRouteID); ok {
		guardStatus, guardReason := fabricServiceChannelRouteAllowedByLeasePool(lease, alternate)
		if guardStatus != "allowed" {
			channel.RemediationAction = "rebuild_route"
			channel.RemediationReason = "alternate_route_rejected_by_pool_policy"
			channel.RemediationRouteID = alternate.RouteID
			channel.RemediationRouteStatus = alternate.Status
			channel.RemediationGuardStatus = guardStatus
			channel.RemediationGuardReason = guardReason
			channel.RecommendedOperatorAction = "Reject the alternate route and rebuild within the signed entry/exit pool policy."
			channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
			return channel
		}
		channel.RemediationAction = "prefer_alternate_route"
		channel.RemediationReason = "authorized_alternate_route_available"
		channel.RemediationRouteID = alternate.RouteID
		channel.RemediationRouteStatus = alternate.Status
		channel.RemediationGuardStatus = guardStatus
		channel.RemediationGuardReason = guardReason
		channel.RecommendedOperatorAction = "Prefer the authorized alternate route for this active service channel."
		channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
		return channel
	}
	if containsString(channel.RouteFeedbackReasons, "service_channel_route_rebuild_recommended") || channel.RouteFeedbackStatus == "fenced" {
		channel.RemediationAction = "rebuild_route"
		channel.RemediationReason = "route_feedback_recommends_rebuild"
		channel.RecommendedOperatorAction = "Trigger or wait for route rebuild; keep this distinct from backend fallback."
		channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
		return channel
	}
	// Degraded, but with neither a replacement nor a rebuild recommendation.
	channel.RemediationAction = "inspect_route_quality"
	channel.RemediationReason = "degraded_route_quality_without_replacement"
	channel.RecommendedOperatorAction = "Inspect rolling route quality counters and route feedback provenance."
	channel.RemediationCommand = fabricServiceChannelAccessRemediationCommand(channel, lease, now)
	return channel
}

// fabricServiceChannelAccessRemediationCommand builds the remediation command
// for the channel's current RemediationAction, or returns nil when the action
// is empty or "none".
//
// The command is issued at now (wall clock when now is zero) and expires 60
// seconds later, or at the channel's own expiry when that comes first. The
// command ID combines the channel ID, the action, and the first non-empty of
// the remediation route ID, the primary route ID, and "no-route".
func fabricServiceChannelAccessRemediationCommand(channel FabricServiceChannelAccessTelemetryChannel, lease FabricServiceChannelLease, now time.Time) *FabricServiceChannelAccessRemediationCommand {
	action := strings.TrimSpace(channel.RemediationAction)
	switch action {
	case "", "none":
		return nil
	}

	issuedAt := now
	if issuedAt.IsZero() {
		issuedAt = time.Now()
	}
	issuedAt = issuedAt.UTC()

	// Never let the command outlive the channel it targets.
	deadline := issuedAt.Add(time.Minute)
	if !channel.ExpiresAt.IsZero() && channel.ExpiresAt.Before(deadline) {
		deadline = channel.ExpiresAt.UTC()
	}

	routeComponent := firstNonEmptyString(channel.RemediationRouteID, channel.PrimaryRouteID, "no-route")
	commandID := strings.Join([]string{"fsc-remediation", channel.ChannelID, action, routeComponent}, ":")
	guardStatus := firstNonEmptyString(channel.RemediationGuardStatus, "allowed")
	guardReason := firstNonEmptyString(channel.RemediationGuardReason, "lease_pool_policy_allows_route")

	return &FabricServiceChannelAccessRemediationCommand{
		SchemaVersion:          "rap.fabric_service_channel_access_remediation_command.v1",
		CommandID:              commandID,
		Action:                 action,
		ClusterID:              lease.ClusterID,
		ChannelID:              channel.ChannelID,
		ResourceID:             channel.ResourceID,
		ServiceClass:           channel.ServiceClass,
		EntryNodeID:            channel.SelectedEntryNodeID,
		ExitNodeID:             channel.SelectedExitNodeID,
		PrimaryRouteID:         channel.PrimaryRouteID,
		ReplacementRouteID:     channel.RemediationRouteID,
		ReplacementRouteStatus: channel.RemediationRouteStatus,
		PoolPolicyFingerprint:  channel.PoolPolicyFingerprint,
		GuardStatus:            guardStatus,
		GuardReason:            guardReason,
		ExecutionStatus:        channel.RemediationExecutionStatus,
		ExecutionReason:        channel.RemediationExecutionReason,
		ExecutionGeneration:    channel.RemediationExecutionGeneration,
		ExecutionObservedAt:    channel.RemediationExecutionObservedAt,
		Reason:                 channel.RemediationReason,
		OperatorAction:         channel.RecommendedOperatorAction,
		IssuedAt:               issuedAt,
		ExpiresAt:              deadline,
	}
}

// fabricServiceChannelAccessRemediationExecution derives the execution status
// of the channel's remediation command from its TTL, the policy guard, and the
// route-manager state reported by the entry node. Channels without a command
// are returned unchanged; every other path ends with
// fabricServiceChannelSyncRemediationCommandExecution.
func fabricServiceChannelAccessRemediationExecution(channel FabricServiceChannelAccessTelemetryChannel, routeManager map[string]any, transition map[string]any, now time.Time) FabricServiceChannelAccessTelemetryChannel {
	command := channel.RemediationCommand
	if command == nil {
		return channel
	}

	// observeDecision records what the route manager reported for this
	// command, or the pending status/reason when nothing matches. The
	// execution generation is only written when a decision is found.
	observeDecision := func(defaultStatus, defaultReason, pendingStatus, pendingReason string) {
		decision, found := fabricServiceChannelRouteManagerDecisionForCommand(routeManager, *command)
		if !found {
			channel.RemediationExecutionStatus = pendingStatus
			channel.RemediationExecutionReason = pendingReason
			channel.RemediationExecutionObservedAt = jsonString(transition, "observed_at")
			return
		}
		channel.RemediationExecutionStatus = firstNonEmptyString(jsonString(decision, "rebuild_status"), defaultStatus)
		channel.RemediationExecutionReason = firstNonEmptyString(jsonString(decision, "rebuild_reason"), jsonString(decision, "decision_source"), defaultReason)
		channel.RemediationExecutionGeneration = jsonString(decision, "generation")
		channel.RemediationExecutionObservedAt = firstNonEmptyString(jsonString(routeManager, "last_applied_at"), jsonString(transition, "observed_at"))
	}

	switch {
	case !command.ExpiresAt.IsZero() && !now.IsZero() && !command.ExpiresAt.After(now.UTC()):
		channel.RemediationExecutionStatus = "expired"
		channel.RemediationExecutionReason = "remediation_command_ttl_expired"
	case channel.RemediationGuardStatus == "rejected" || command.GuardStatus == "rejected":
		channel.RemediationExecutionStatus = "rejected_by_policy_guard"
		channel.RemediationExecutionReason = firstNonEmptyString(channel.RemediationGuardReason, command.GuardReason, "remediation_guard_rejected")
	case command.Action == "prefer_alternate_route":
		observeDecision("observed", "route_manager_decision_observed", "waiting_node_apply", "route_manager_has_not_reported_command")
	case command.Action == "rebuild_route":
		observeDecision("pending_rebuild_request", "route_manager_rebuild_decision_observed", "pending_rebuild_request", "bounded_rebuild_route_command_visible")
	case command.Action == "use_backend_fallback":
		channel.RemediationExecutionStatus = "degraded_fallback_visible"
		channel.RemediationExecutionReason = "backend_fallback_command_visible"
	default:
		channel.RemediationExecutionStatus = "visible"
		channel.RemediationExecutionReason = "remediation_command_visible"
	}
	return fabricServiceChannelSyncRemediationCommandExecution(channel)
}

// fabricServiceChannelAccessRouteDecisionTelemetry copies the best-matching
// route-manager decision for the channel into its RouteDecision* fields and,
// when that decision signals that no safe recovery route exists, overrides the
// remediation fields accordingly. The channel is returned unchanged when no
// decision matches.
func fabricServiceChannelAccessRouteDecisionTelemetry(channel FabricServiceChannelAccessTelemetryChannel, routeManager map[string]any, transition map[string]any) FabricServiceChannelAccessTelemetryChannel {
	decision, ok := fabricServiceChannelRouteManagerDecisionForChannel(routeManager, channel)
	if !ok {
		return channel
	}
	channel.RouteDecisionSource = jsonString(decision, "decision_source")
channel.RouteDecisionRouteID = jsonString(decision, "route_id")
	channel.RouteDecisionReplacementRouteID = jsonString(decision, "replacement_route_id")
	channel.RouteDecisionRebuildStatus = jsonString(decision, "rebuild_status")
	channel.RouteDecisionRebuildReason = jsonString(decision, "rebuild_reason")
	channel.RouteDecisionGeneration = firstNonEmptyString(jsonString(decision, "generation"), jsonString(decision, "rebuild_request_id"))
	channel.RouteDecisionScoreReasons = jsonStringArray(decision, "score_reasons")
	if channel.RemediationExecutionObservedAt == "" {
		channel.RemediationExecutionObservedAt = firstNonEmptyString(jsonString(routeManager, "last_applied_at"), jsonString(transition, "observed_at"))
	}
	// "No safe recovery": force a backend-fallback remediation (replacing an
	// empty or "none" action) and surface the decision's reason/generation.
	if channel.RouteDecisionSource == "service_channel_feedback_no_alternate" || channel.RouteDecisionRebuildStatus == "pending_degraded_fallback" || containsString(channel.RouteDecisionScoreReasons, "no_unfenced_alternate_route") {
		channel.RemediationAction = firstNonEmptyString(channel.RemediationAction, "use_backend_fallback")
		if channel.RemediationAction == "none" {
			channel.RemediationAction = "use_backend_fallback"
		}
		channel.RemediationReason = "route_decision_no_safe_recovery"
		channel.RemediationExecutionStatus = "route_rebuild_no_safe_recovery"
		channel.RemediationExecutionReason = firstNonEmptyString(channel.RouteDecisionRebuildReason, "no_unfenced_alternate_route")
		channel.RemediationExecutionGeneration = channel.RouteDecisionGeneration
		channel.RecommendedOperatorAction = "No safe recovery route is available; keep degraded fallback visible and rebuild the route pool."
	}
	return channel
}

// fabricServiceChannelRouteManagerDecisionForChannel picks, from the route
// manager's "decisions" array, the decision that matches the channel and has
// the highest telemetry rank (the earliest one wins on ties). It returns
// (nil, false) when nothing matches.
func fabricServiceChannelRouteManagerDecisionForChannel(routeManager map[string]any, channel FabricServiceChannelAccessTelemetryChannel) (map[string]any, bool) {
	var best map[string]any
	bestRank := 0
	for _, entry := range jsonArray(routeManager, "decisions") {
		candidate, isObject := entry.(map[string]any)
		if !isObject || !fabricServiceChannelRouteManagerDecisionMatchesChannel(candidate, channel) {
			continue
		}
		if rank := fabricServiceChannelRouteManagerDecisionTelemetryRank(candidate); rank > bestRank {
			best, bestRank = candidate, rank
		}
	}
	return best, best != nil
}

// fabricServiceChannelRouteManagerDecisionMatchesChannel reports whether a
// route-manager decision concerns the channel: either its route ID or its
// replacement route ID equals the channel's primary route, or its source and
// destination nodes equal the channel's selected entry and exit nodes (with
// local_node_id, when present, equal to the entry node).
func fabricServiceChannelRouteManagerDecisionMatchesChannel(decision map[string]any, channel FabricServiceChannelAccessTelemetryChannel) bool {
	for _, key := range []string{"route_id", "replacement_route_id"} {
		if id := jsonString(decision, key); id != "" && id == channel.PrimaryRouteID {
			return true
		}
	}

	source := jsonString(decision, "source_node_id")
	destination := jsonString(decision, "destination_node_id")
	local := jsonString(decision, "local_node_id")
	if source == "" || destination == "" {
		return false
	}
	if source != channel.SelectedEntryNodeID || destination != channel.SelectedExitNodeID {
		return false
	}
	return local == "" || local == channel.SelectedEntryNodeID
}

// fabricServiceChannelRouteManagerDecisionTelemetryRank scores a decision for
// telemetry selection: 50 for "no safe recovery" signals, 40 for an applied
// rebuild, 30 when the decision source mentions "replacement", 20 for any
// other non-empty rebuild status, and 10 otherwise.
func fabricServiceChannelRouteManagerDecisionTelemetryRank(decision map[string]any) int {
	source := jsonString(decision, "decision_source")
	status := jsonString(decision, "rebuild_status")
	reasons := jsonStringArray(decision, "score_reasons")

	if source == "service_channel_feedback_no_alternate" || status == "pending_degraded_fallback" || containsString(reasons, "no_unfenced_alternate_route") {
		return 50
	}
	if status == "applied" || containsString(reasons, "service_channel_rebuild_applied") {
		return 40
	}
	if strings.Contains(source, "replacement") {
		return 30
	}
	if status != "" {
		return 20
	}
	return 10
}

// fabricServiceChannelAccumulateRouteDecisionTelemetry adds one channel's
// route decision to the report counters. It does nothing when out is nil or
// the channel carries no route decision source.
func fabricServiceChannelAccumulateRouteDecisionTelemetry(out *FabricServiceChannelAccessTelemetry, channel FabricServiceChannelAccessTelemetryChannel) {
	if out == nil || channel.RouteDecisionSource == "" {
		return
	}
	rebuildApplied := channel.RouteDecisionRebuildStatus == "applied" ||
		containsString(channel.RouteDecisionScoreReasons, "service_channel_rebuild_applied")

	out.RouteDecisionChannelCount++
	if fabricServiceChannelRouteDecisionIsReplacement(channel) {
		out.ReplacementDecisionCount++
	}
	if rebuildApplied {
		out.AppliedRebuildDecisionCount++
	}
	if fabricServiceChannelRouteDecisionIsRecovery(channel) {
		out.RecoveryDecisionCount++
	}
	if fabricServiceChannelRouteDecisionIsNoSafeRecovery(channel) {
		out.NoSafeRecoveryDecisionCount++
	}
}

// fabricServiceChannelRouteDecisionIsReplacement reports whether the decision
// source mentions "replacement" or a replacement route ID is set.
func fabricServiceChannelRouteDecisionIsReplacement(channel FabricServiceChannelAccessTelemetryChannel) bool {
	if strings.Contains(channel.RouteDecisionSource, "replacement") {
		return true
	}
	return strings.TrimSpace(channel.RouteDecisionReplacementRouteID) != ""
}

// fabricServiceChannelRouteDecisionIsRecovery reports whether the decision
// carries a recovery score reason or a rebuild reason mentioning "recovery".
func fabricServiceChannelRouteDecisionIsRecovery(channel FabricServiceChannelAccessTelemetryChannel) bool {
	for _, marker := range []string{"service_channel_recovery_promoted", "service_channel_recovery_hysteresis"} {
		if containsString(channel.RouteDecisionScoreReasons, marker) {
			return true
		}
	}
	return strings.Contains(channel.RouteDecisionRebuildReason, "recovery")
}

// fabricServiceChannelRouteDecisionIsNoSafeRecovery reports whether the
// decision signals that no unfenced alternate route was available.
func fabricServiceChannelRouteDecisionIsNoSafeRecovery(channel FabricServiceChannelAccessTelemetryChannel) bool {
	switch {
	case channel.RouteDecisionSource == "service_channel_feedback_no_alternate":
		return true
	case channel.RouteDecisionRebuildStatus == "pending_degraded_fallback":
		return true
	default:
		return containsString(channel.RouteDecisionScoreReasons, "no_unfenced_alternate_route")
	}
}

func fabricServiceChannelSyncRemediationCommandExecution(channel FabricServiceChannelAccessTelemetryChannel) FabricServiceChannelAccessTelemetryChannel {
	if
channel.RemediationCommand == nil {
		return channel
	}
	// NOTE(review): RemediationCommand is written through the field of a
	// by-value channel copy; if it is a pointer (the nil check suggests so),
	// the caller's command is mutated too — confirm this is intended.
	channel.RemediationCommand.ExecutionStatus = channel.RemediationExecutionStatus
	channel.RemediationCommand.ExecutionReason = channel.RemediationExecutionReason
	channel.RemediationCommand.ExecutionGeneration = channel.RemediationExecutionGeneration
	channel.RemediationCommand.ExecutionObservedAt = channel.RemediationExecutionObservedAt
	return channel
}

// fabricServiceChannelRouteManagerDecisionForCommand returns the first
// route-manager decision that belongs to command: either its
// rebuild_request_id equals a non-empty CommandID, or its route_id,
// replacement_route_id and decision_source all match the command.
func fabricServiceChannelRouteManagerDecisionForCommand(routeManager map[string]any, command FabricServiceChannelAccessRemediationCommand) (map[string]any, bool) {
	decisionsRaw, ok := routeManager["decisions"].([]any)
	if !ok {
		return nil, false
	}
	for _, raw := range decisionsRaw {
		decision, ok := raw.(map[string]any)
		if !ok {
			continue
		}
		if command.CommandID != "" && jsonString(decision, "rebuild_request_id") == command.CommandID {
			return decision, true
		}
		if jsonString(decision, "route_id") == command.PrimaryRouteID && jsonString(decision, "replacement_route_id") == command.ReplacementRouteID && jsonString(decision, "decision_source") == "service_channel_remediation_command" {
			return decision, true
		}
	}
	return nil, false
}

// fabricServiceChannelAccessRemediationLedgerExecution overlays the stored
// rebuild attempt for the channel's rebuild_route command onto the channel's
// RemediationExecution* fields. It asks the store for one attempt
// (Limit: 1) and maps its RebuildStatus to a "rebuild_request_*" status.
// Channels without a rebuild_route command are returned unchanged.
func (s *Service) fabricServiceChannelAccessRemediationLedgerExecution(ctx context.Context, clusterID string, channel FabricServiceChannelAccessTelemetryChannel) FabricServiceChannelAccessTelemetryChannel {
	if channel.RemediationCommand == nil || channel.RemediationCommand.Action != "rebuild_route" {
		return channel
	}
	attempts, err := s.store.ListFabricServiceChannelRouteRebuildAttempts(ctx, ListFabricServiceChannelRouteRebuildAttemptsInput{
		ClusterID: clusterID,
		ReporterNodeID: channel.SelectedEntryNodeID,
		RouteID: channel.PrimaryRouteID,
		ServiceClass: channel.ServiceClass,
		RebuildRequestID: channel.RemediationCommand.CommandID,
		Limit: 1,
	})
	// Store errors are swallowed: on failure the channel is returned as it
	// was, without the ledger overlay.
	if err != nil || len(attempts) == 0 {
		return channel
	}
	attempt := attempts[0]
	switch attempt.RebuildStatus {
	case "requested":
		if channel.RemediationExecutionStatus == "pending_degraded_fallback" {
			channel.RemediationExecutionStatus = "rebuild_request_recorded_node_pending"
			channel.RemediationExecutionReason = firstNonEmptyString(channel.RemediationExecutionReason, attempt.RebuildReason, "durable_rebuild_route_request_recorded_and_node_pending")
		} else {
			channel.RemediationExecutionStatus = "rebuild_request_recorded"
			channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_request_recorded")
		}
	case "rejected":
		channel.RemediationExecutionStatus = "rebuild_request_rejected"
		channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_request_rejected")
	case "applied":
		channel.RemediationExecutionStatus = "rebuild_request_applied"
		channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_request_applied")
	case "no_alternate":
		channel.RemediationExecutionStatus = "rebuild_request_no_alternate"
		channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_no_alternate")
	case "deferred_by_policy":
		channel.RemediationExecutionStatus = "rebuild_request_deferred_by_policy"
		channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_deferred_by_policy")
	case "expired":
		channel.RemediationExecutionStatus = "rebuild_request_expired"
		channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, "durable_rebuild_route_expired")
	default:
		// Unknown status: pass it through, keeping existing values as fallback.
		channel.RemediationExecutionStatus = firstNonEmptyString(attempt.RebuildStatus, channel.RemediationExecutionStatus)
		channel.RemediationExecutionReason = firstNonEmptyString(attempt.RebuildReason, channel.RemediationExecutionReason)
	}
	channel.RemediationExecutionGeneration = firstNonEmptyString(attempt.Generation, channel.RemediationExecutionGeneration)
	if !attempt.UpdatedAt.IsZero() {
		channel.RemediationExecutionObservedAt = attempt.UpdatedAt.UTC().Format(time.RFC3339Nano)
	}
	return fabricServiceChannelSyncRemediationCommandExecution(channel)
}

// fabricServiceChannelRemediationCommandsForNode builds the remediation
// commands for the non-expired VPN-packet leases whose entry node is nodeID
// (at most 100 leases are requested). Each lease is turned into a telemetry
// channel, enriched with the feedback entry for its primary route, and passed
// to fabricServiceChannelAccessRemediation; any resulting command is
// collected. The result is sorted by Action, then CommandID.
func (s *Service) fabricServiceChannelRemediationCommandsForNode(ctx context.Context, clusterID string, nodeID string, feedback map[string]fabricServiceChannelRouteFeedback, now time.Time) ([]FabricServiceChannelAccessRemediationCommand, error) {
	records, err := s.store.ListFabricServiceChannelLeases(ctx, ListFabricServiceChannelLeasesInput{
		ClusterID: clusterID,
		EntryNodeID: nodeID,
		ServiceClass: FabricServiceClassVPNPackets,
		IncludeExpired: false,
		Limit: 100,
		Now: now.UTC(),
	})
	if err != nil {
		return nil, err
	}
	commands := make([]FabricServiceChannelAccessRemediationCommand, 0, len(records))
	for _, record := range records {
		summary := fabricServiceChannelLeaseSummaryFromRecord(record, now)
		// Leases that are expired or have no primary route are skipped.
		if summary.Expired || strings.TrimSpace(summary.PrimaryRouteID) == "" {
			continue
		}
		channel := FabricServiceChannelAccessTelemetryChannel{
			ChannelID: summary.ChannelID,
			ResourceID: summary.ResourceID,
			ServiceClass: summary.ServiceClass,
			Status: summary.Status,
			SelectedEntryNodeID: summary.SelectedEntryNodeID,
			SelectedExitNodeID: summary.SelectedExitNodeID,
			PrimaryRouteID: summary.PrimaryRouteID,
			PrimaryRouteStatus: summary.PrimaryRouteStatus,
			ForceBackendFallback: summary.ForceBackendFallback,
			ExpiresAt: summary.ExpiresAt,
		}
		if record.Lease.PoolPolicy != nil {
			channel.PoolPolicyFingerprint = record.Lease.PoolPolicy.Fingerprint
		}
		if item, ok := feedback[channel.PrimaryRouteID]; ok {
			// Copy before taking the address so the pointer does not alias
			// the map-lookup variable.
			observedAt := item.ObservedAt
			channel.RouteFeedbackObservedAt = &observedAt
			if item.Fenced {
				channel.RouteFeedbackStatus = "fenced"
			} else if item.ScoreAdjustment < 0 {
				channel.RouteFeedbackStatus = "degraded"
			} else if item.RouteID != "" {
				channel.RouteFeedbackStatus = "healthy"
			}
			channel.RouteFeedbackScoreAdjustment = item.ScoreAdjustment
			channel.RouteFeedbackEffectiveScoreAdjustment = item.ScoreAdjustment
			channel.RouteFeedbackReasons = append([]string{}, item.Reasons...)
channel.RouteQualityWindowSampleCount = item.QualityWindowSampleCount
			channel.RouteQualityWindowFailureCount = item.QualityWindowFailureCount
			channel.RouteQualityWindowDropCount = item.QualityWindowDropCount
			channel.RouteQualityWindowSlowCount = item.QualityWindowSlowCount
			channel.LastSendDurationMs = item.LastSendDurationMs
		}
		channel = fabricServiceChannelAccessRemediation(channel, record.Lease, now)
		if channel.RemediationCommand != nil {
			commands = append(commands, *channel.RemediationCommand)
		}
	}
	// Stable ordering: by Action first, then by CommandID.
	sort.SliceStable(commands, func(i, j int) bool {
		if commands[i].Action != commands[j].Action {
			return commands[i].Action < commands[j].Action
		}
		return commands[i].CommandID < commands[j].CommandID
	})
	return commands, nil
}

// recordFabricServiceChannelRemediationRebuildIntents records one rebuild
// attempt in the store for every rebuild_route command that has a non-blank
// CommandID and PrimaryRouteID. Commands whose guard status is "rejected" are
// recorded as rejected; all others as requested. The first store error aborts
// the loop and is returned.
func (s *Service) recordFabricServiceChannelRemediationRebuildIntents(ctx context.Context, clusterID string, nodeID string, commands []FabricServiceChannelAccessRemediationCommand, now time.Time) error {
	if len(commands) == 0 {
		return nil
	}
	// NOTE(review): the zero-time fallback uses the wall clock rather than
	// the injectable s.now used elsewhere in this file — confirm intended.
	if now.IsZero() {
		now = time.Now().UTC()
	}
	for _, command := range commands {
		if command.Action != "rebuild_route" || strings.TrimSpace(command.CommandID) == "" || strings.TrimSpace(command.PrimaryRouteID) == "" {
			continue
		}
		rebuildStatus := "requested"
		outcome := "rebuild_requested"
		if command.GuardStatus == "rejected" {
			rebuildStatus = "rejected"
			outcome = "policy_guard_rejected"
		}
		payload := mustJSONRaw(map[string]any{
			"schema_version": "c18z75.service_channel_remediation_rebuild_intent.v1",
			"command_id": command.CommandID,
			"channel_id": command.ChannelID,
			"resource_id": command.ResourceID,
			"entry_node_id": command.EntryNodeID,
			"exit_node_id": command.ExitNodeID,
			"pool_policy_fingerprint": command.PoolPolicyFingerprint,
			"guard_status": command.GuardStatus,
			"guard_reason": command.GuardReason,
			"command_expires_at": command.ExpiresAt.UTC().Format(time.RFC3339Nano),
			"recorded_at": now.UTC().Format(time.RFC3339Nano),
		})
		_, err := s.store.RecordFabricServiceChannelRouteRebuildAttempt(ctx,
RecordFabricServiceChannelRouteRebuildAttemptInput{
			ClusterID: clusterID,
			ReporterNodeID: nodeID,
			ServiceClass: firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
			RouteID: command.PrimaryRouteID,
			ReplacementRouteID: command.ReplacementRouteID,
			RebuildRequestID: command.CommandID,
			RebuildStatus: rebuildStatus,
			RebuildReason: firstNonEmptyString(command.Reason, command.GuardReason, "service_channel_remediation_rebuild_route_requested"),
			DecisionSource: "service_channel_remediation_command",
			Outcome: outcome,
			Generation: command.ExecutionGeneration,
			PolicyFingerprint: command.PoolPolicyFingerprint,
			ObservedPolicyFingerprint: command.PoolPolicyFingerprint,
			FeedbackReasons: []string{firstNonEmptyString(command.Reason, command.GuardReason, "service_channel_remediation_rebuild_route_requested")},
			OldHops: []string{},
			ReplacementHops: []string{},
			Payload: payload,
		})
		if err != nil {
			return err
		}
	}
	return nil
}

// resolveFabricServiceChannelRemediationRebuildIntents resolves each
// rebuild_route command to an outcome, records that outcome as a rebuild
// attempt in the store, and returns a RoutePathDecision for every command
// that was resolved to "applied".
//
// Resolution order per command:
//  1. guard status "rejected"             -> deferred_by_policy
//  2. command TTL at or before now        -> expired
//  3. no active lease found               -> deferred_by_policy
//  4. primary route unavailable / no replacement selected -> no_alternate
//  5. replacement outside the lease pools -> deferred_by_policy
//  6. otherwise                           -> applied
//
// Any store error aborts processing and is returned.
func (s *Service) resolveFabricServiceChannelRemediationRebuildIntents(ctx context.Context, input GetNodeSyntheticMeshConfigInput, commands []FabricServiceChannelAccessRemediationCommand, intents []MeshRouteIntent, feedback map[string]fabricServiceChannelRouteFeedback, generation string, now time.Time) ([]RoutePathDecision, error) {
	if len(commands) == 0 {
		return nil, nil
	}
	// NOTE(review): falls back to the wall clock rather than s.now — confirm.
	if now.IsZero() {
		now = time.Now().UTC()
	}
	decisions := []RoutePathDecision{}
	for _, command := range commands {
		if command.Action != "rebuild_route" || strings.TrimSpace(command.CommandID) == "" || strings.TrimSpace(command.PrimaryRouteID) == "" {
			continue
		}
		lease, leaseOK, err := s.fabricServiceChannelLeaseForRemediationCommand(ctx, input.ClusterID, input.NodeID, command, now)
		if err != nil {
			return nil, err
		}
		// Default outcome; overwritten by the branches below.
		status := "no_alternate"
		outcome := "no_alternate"
		reason := "no_unfenced_alternate_route"
		var primary SyntheticMeshRouteConfig
		var replacement SyntheticMeshRouteConfig
		if command.GuardStatus == "rejected" {
			status = "deferred_by_policy"
			outcome = "deferred_by_policy"
			reason = firstNonEmptyString(command.GuardReason, "remediation_guard_rejected")
		} else if !command.ExpiresAt.IsZero() && !command.ExpiresAt.After(now.UTC()) {
			status = "expired"
			outcome = "expired"
			reason = "remediation_command_ttl_expired"
		} else if !leaseOK {
			status = "deferred_by_policy"
			outcome = "deferred_by_policy"
			reason = "active_lease_not_found_for_rebuild_resolution"
		} else {
			var ok bool
			primary, ok = s.syntheticRouteByID(input, intents, command.PrimaryRouteID)
			if !ok {
				// Status stays "no_alternate"; only the reason changes.
				reason = "primary_route_not_available_for_rebuild"
			} else if selected, _, ok := s.selectServiceChannelRouteReplacement(input, primary, intents, feedback); ok {
				// The selected replacement must still fit the lease's pools.
				if guardStatus, guardReason := fabricServiceChannelRouteAllowedByLeasePool(lease, FabricServiceChannelRoute{
					RouteID: selected.RouteID,
					ClusterID: selected.ClusterID,
					ServiceClass: firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
					SourceNodeID: selected.SourceNodeID,
					DestinationNodeID: selected.DestinationNodeID,
					Status: "authorized",
				}); guardStatus != "allowed" {
					status = "deferred_by_policy"
					outcome = "deferred_by_policy"
					reason = guardReason
				} else {
					replacement = selected
					status = "applied"
					outcome = "replacement_selected"
					reason = "remediation_rebuild_applied_to_alternate"
				}
			}
		}
		// A missing map entry yields the zero value, leaving feedbackStatus "".
		feedbackItem := feedback[command.PrimaryRouteID]
		feedbackStatus := ""
		if feedbackItem.Fenced {
			feedbackStatus = "fenced"
		} else if feedbackItem.ScoreAdjustment < 0 {
			feedbackStatus = "degraded"
		} else if feedbackItem.RouteID != "" {
			feedbackStatus = "healthy"
		}
		payload := mustJSONRaw(map[string]any{
			"schema_version": "c18z77.service_channel_remediation_rebuild_resolution.v1",
			"command_id": command.CommandID,
			"channel_id": command.ChannelID,
			"resource_id": command.ResourceID,
			"entry_node_id": command.EntryNodeID,
			"exit_node_id": command.ExitNodeID,
			"pool_policy_fingerprint": command.PoolPolicyFingerprint,
			"guard_status": command.GuardStatus,
			"guard_reason": command.GuardReason,
			"resolution_status": status,
			"resolution_outcome": outcome,
			"resolution_reason": reason,
			"resolved_at": now.UTC().Format(time.RFC3339Nano),
		})
		// The attempt is recorded for every resolution, not only "applied".
		_, err = s.store.RecordFabricServiceChannelRouteRebuildAttempt(ctx, RecordFabricServiceChannelRouteRebuildAttemptInput{
			ClusterID: input.ClusterID,
			ReporterNodeID: input.NodeID,
			ServiceClass: firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
			RouteID: command.PrimaryRouteID,
			ReplacementRouteID: replacement.RouteID,
			RebuildRequestID: command.CommandID,
			RebuildStatus: status,
			RebuildReason: reason,
			DecisionSource: "service_channel_remediation_command",
			Outcome: outcome,
			Generation: firstNonEmptyString(generation, command.ExecutionGeneration, command.CommandID),
			PolicyFingerprint: command.PoolPolicyFingerprint,
			ObservedPolicyFingerprint: command.PoolPolicyFingerprint,
			FeedbackStatus: feedbackStatus,
			FeedbackScoreAdjustment: feedbackItem.ScoreAdjustment,
			FeedbackEffectiveScoreAdjustment: feedbackItem.ScoreAdjustment,
			FeedbackReasons: append([]string{reason}, feedbackItem.Reasons...),
			LastError: feedbackItem.LastError,
			ConsecutiveFailures: feedbackItem.ConsecutiveFailures,
			StallCount: feedbackItem.StallCount,
			LastSendDurationMs: feedbackItem.LastSendDurationMs,
			QualityWindowSampleCount: feedbackItem.QualityWindowSampleCount,
			QualityWindowFailureCount: feedbackItem.QualityWindowFailureCount,
			QualityWindowDropCount: feedbackItem.QualityWindowDropCount,
			QualityWindowSlowCount: feedbackItem.QualityWindowSlowCount,
			OldHops: append([]string{}, primary.Hops...),
			ReplacementHops: append([]string{}, replacement.Hops...),
			Payload: payload,
		})
		if err != nil {
			return nil, err
		}
		if status != "applied" {
			continue
		}
		decision := RoutePathDecision{
			DecisionID: command.PrimaryRouteID + "-path-" + input.NodeID + "-service-channel-remediation",
			RouteID: command.PrimaryRouteID,
			ReplacementRouteID: replacement.RouteID,
			RebuildRequestID: command.CommandID,
			RebuildStatus: "applied",
			RebuildReason: reason,
			ClusterID: input.ClusterID,
			LocalNodeID: input.NodeID,
			SourceNodeID: primary.SourceNodeID,
			DestinationNodeID: primary.DestinationNodeID,
			OriginalHops: append([]string{}, primary.Hops...),
			EffectiveHops: append([]string{}, replacement.Hops...),
			DecisionSource: "service_channel_remediation_command",
			Generation: firstNonEmptyString(generation, command.CommandID),
			PathScore: serviceChannelReplacementRouteScore(replacement),
			ScoreReasons: []string{"service_channel_remediation_rebuild_route", "selected_unfenced_alternate_route", "service_channel_rebuild_applied"},
			ControlPlaneOnly: true,
			ProductionForwarding: false,
			// Earliest non-zero expiry among both routes, the command, and
			// now+60s.
			ExpiresAt: minNonZeroTime(primary.ExpiresAt, replacement.ExpiresAt, command.ExpiresAt, now.Add(60*time.Second)).UTC(),
		}
		decision.PreviousHopID, decision.NextHopID, decision.LocalRole = routePathLocalPosition(decision.EffectiveHops, input.NodeID, "", "")
		decisions = append(decisions, decision)
	}
	return decisions, nil
}

// fabricServiceChannelLeaseForRemediationCommand looks up the non-expired
// lease for the command's resource on the given entry node and returns the
// one whose ChannelID matches the command (whitespace-trimmed). The boolean
// is false when no such lease exists.
func (s *Service) fabricServiceChannelLeaseForRemediationCommand(ctx context.Context, clusterID string, nodeID string, command FabricServiceChannelAccessRemediationCommand, now time.Time) (FabricServiceChannelLease, bool, error) {
	records, err := s.store.ListFabricServiceChannelLeases(ctx, ListFabricServiceChannelLeasesInput{
		ClusterID: clusterID,
		ServiceClass: firstNonEmptyString(command.ServiceClass, FabricServiceClassVPNPackets),
		EntryNodeID: nodeID,
		ResourceID: command.ResourceID,
		IncludeExpired: false,
		Limit: 100,
		Now: now.UTC(),
	})
	if err != nil {
		return FabricServiceChannelLease{}, false, err
	}
	for _, record := range records {
		if strings.TrimSpace(record.ChannelID) == strings.TrimSpace(command.ChannelID) {
			return record.Lease, true, nil
		}
	}
	return FabricServiceChannelLease{}, false, nil
}

// syntheticRouteByID derives a synthetic route from each intent and returns
// the first one whose RouteID equals the trimmed routeID. A blank routeID
// never matches.
func (s *Service) syntheticRouteByID(input GetNodeSyntheticMeshConfigInput, intents []MeshRouteIntent, routeID string) (SyntheticMeshRouteConfig, bool) {
	routeID = strings.TrimSpace(routeID)
	if routeID == "" {
		return SyntheticMeshRouteConfig{}, false
	}
	for _, intent := range intents {
		route, _, _, _, _, ok :=
s.syntheticRouteFromIntent(input, intent, endpointPerspective{})
		if ok && route.RouteID == routeID {
			return route, true
		}
	}
	return SyntheticMeshRouteConfig{}, false
}

// minNonZeroTime returns the earliest non-zero time among items, or the zero
// time when every item is zero (or no items are given).
func minNonZeroTime(items ...time.Time) time.Time {
	var out time.Time
	for _, item := range items {
		if item.IsZero() {
			continue
		}
		if out.IsZero() || item.Before(out) {
			out = item
		}
	}
	return out
}

// fabricServiceChannelFirstAuthorizedAlternate returns the first route, in
// slice order, that has a non-blank ID different from primaryRouteID and
// status "authorized".
func fabricServiceChannelFirstAuthorizedAlternate(routes []FabricServiceChannelRoute, primaryRouteID string) (FabricServiceChannelRoute, bool) {
	for _, route := range routes {
		if strings.TrimSpace(route.RouteID) == "" || route.RouteID == primaryRouteID {
			continue
		}
		if route.Status == "authorized" {
			return route, true
		}
	}
	return FabricServiceChannelRoute{}, false
}

// fabricServiceChannelRouteAllowedByLeasePool checks that route's source and
// destination nodes are members of the lease's entry and exit pools. It
// returns a (status, reason) pair: ("allowed", ...) or ("rejected", ...).
// An empty pool places no restriction on that side.
func fabricServiceChannelRouteAllowedByLeasePool(lease FabricServiceChannelLease, route FabricServiceChannelRoute) (string, string) {
	if strings.TrimSpace(route.RouteID) == "" {
		return "rejected", "replacement_route_missing"
	}
	entryAllowed := len(lease.EntryPool) == 0
	for _, candidate := range lease.EntryPool {
		if candidate.NodeID == route.SourceNodeID {
			entryAllowed = true
			break
		}
	}
	if !entryAllowed {
		return "rejected", "replacement_entry_outside_signed_pool_policy"
	}
	exitAllowed := len(lease.ExitPool) == 0
	for _, candidate := range lease.ExitPool {
		if candidate.NodeID == route.DestinationNodeID {
			exitAllowed = true
			break
		}
	}
	if !exitAllowed {
		return "rejected", "replacement_exit_outside_signed_pool_policy"
	}
	return "allowed", "lease_pool_policy_allows_route"
}

// fabricServiceChannelLeaseSummaryFromRecord flattens a stored lease record
// into a FabricServiceChannelLeaseSummary. Record-level fields take
// precedence over the embedded lease where both are consulted. A zero now is
// replaced with the current UTC wall-clock time.
func fabricServiceChannelLeaseSummaryFromRecord(record FabricServiceChannelLeaseRecord, now time.Time) FabricServiceChannelLeaseSummary {
	if now.IsZero() {
		now = time.Now().UTC()
	}
	lease := record.Lease
	summary := FabricServiceChannelLeaseSummary{
		ClusterID: record.ClusterID,
		ChannelID: record.ChannelID,
		ResourceID: firstNonEmptyString(record.ResourceID, lease.ResourceID),
		ServiceClass: firstNonEmptyString(record.ServiceClass, lease.ServiceClass),
		Status: lease.Status,
		SelectedEntryNodeID:
firstNonEmptyString(record.SelectedEntryNodeID, lease.SelectedEntryNodeID),
		SelectedExitNodeID: lease.SelectedExitNodeID,
		AllowedChannels: append([]string{}, lease.AllowedChannels...),
		PrimaryRouteID: strings.TrimSpace(lease.PrimaryRoute.RouteID),
		PrimaryRouteStatus: strings.TrimSpace(lease.PrimaryRoute.Status),
		DataPlane: lease.DataPlane,
		// Fallback is forced for degraded-fallback leases and for leases
		// whose primary route has no route intent.
		ForceBackendFallback: lease.Status == FabricServiceChannelStatusDegradedFallback || lease.PrimaryRoute.Status == "missing_route_intent",
		IssuedAt: lease.IssuedAt,
		ExpiresAt: record.ExpiresAt,
		CreatedAt: record.CreatedAt,
		UpdatedAt: record.UpdatedAt,
	}
	// Fall back to the lease's own expiry when the record carries none.
	if summary.ExpiresAt.IsZero() {
		summary.ExpiresAt = lease.ExpiresAt
	}
	// A lease with no expiry at all is never considered expired; an expiry
	// exactly equal to now counts as expired.
	summary.Expired = !summary.ExpiresAt.IsZero() && !summary.ExpiresAt.After(now.UTC())
	return summary
}

// fabricServiceChannelLeaseCacheKey builds the "<clusterID>/<channelID>" key
// from the whitespace-trimmed identifiers.
func fabricServiceChannelLeaseCacheKey(clusterID string, channelID string) string {
	return strings.TrimSpace(clusterID) + "/" + strings.TrimSpace(channelID)
}

// signFabricServiceChannelLease signs the lease with the cluster authority
// key and returns a copy carrying the raw signed payload and its signature.
// The payload contains a hash of the lease token rather than the token
// itself. On error the input lease is returned unchanged with the error.
func (s *Service) signFabricServiceChannelLease(ctx context.Context, lease FabricServiceChannelLease) (FabricServiceChannelLease, error) {
	authorityKey, err := s.ensureClusterAuthority(ctx, lease.ClusterID, nil)
	if err != nil {
		return lease, err
	}
	payload := FabricServiceChannelLeaseAuthorityPayload{
		SchemaVersion: "rap.fabric_service_channel_lease_authority.v1",
		ChannelID: lease.ChannelID,
		ClusterID: lease.ClusterID,
		OrganizationID: lease.OrganizationID,
		UserID: lease.UserID,
		ResourceID: lease.ResourceID,
		ServiceClass: lease.ServiceClass,
		Status: lease.Status,
		SelectedEntryNodeID: lease.SelectedEntryNodeID,
		SelectedExitNodeID: lease.SelectedExitNodeID,
		EntryPool: append([]FabricServiceChannelNodeCandidate{}, lease.EntryPool...),
		ExitPool: append([]FabricServiceChannelNodeCandidate{}, lease.ExitPool...),
		AllowedChannels: append([]string{}, lease.AllowedChannels...),
		PrimaryRoute: lease.PrimaryRoute,
		RecoveryPolicy: lease.RecoveryPolicy,
		PoolPolicy: lease.PoolPolicy,
		DataPlane: lease.DataPlane,
		RouteGeneration: lease.RouteGeneration,
		FencingEpoch:
lease.FencingEpoch,
		TokenHash: fabricServiceChannelTokenHash(lease.Token.Token),
		IssuedAt: lease.IssuedAt,
		ExpiresAt: lease.ExpiresAt,
	}
	rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, s.now())
	if err != nil {
		return lease, err
	}
	lease.AuthorityPayload = rawPayload
	lease.AuthoritySignature = &signature
	return lease, nil
}

// fabricServiceChannelTokenHash returns the lowercase hex SHA-256 digest of
// the whitespace-trimmed token.
func fabricServiceChannelTokenHash(token string) string {
	sum := sha256.Sum256([]byte(strings.TrimSpace(token)))
	return hex.EncodeToString(sum[:])
}

// normalizeFabricServiceClass lower-cases value and trims surrounding
// whitespace.
func normalizeFabricServiceClass(value string) string {
	return strings.TrimSpace(strings.ToLower(value))
}

// isAllowedFabricServiceClass reports whether value is one of the known
// FabricServiceClass* constants. The comparison is exact, so callers must
// normalize first.
func isAllowedFabricServiceClass(value string) bool {
	switch value {
	case FabricServiceClassVPNPackets, FabricServiceClassRemoteWorkspace, FabricServiceClassFileTransfer, FabricServiceClassVideo, FabricServiceClassPlatformAdmin, FabricServiceClassClusterAdmin, FabricServiceClassOrganization, FabricServiceClassUserPortal:
		return true
	default:
		return false
	}
}

// normalizeFabricServiceChannels returns the de-duplicated channels when any
// are given; otherwise it returns the default channel set for serviceClass.
func normalizeFabricServiceChannels(channels []string, serviceClass string) []string {
	channels = dedupeStrings(channels)
	if len(channels) > 0 {
		return channels
	}
	switch serviceClass {
	case FabricServiceClassVPNPackets:
		return []string{FabricChannelControl, FabricChannelBulk, "vpn_packet"}
	case FabricServiceClassRemoteWorkspace:
		return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelReliable, FabricChannelDroppable}
	case FabricServiceClassVideo:
		return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelDroppable}
	case FabricServiceClassFileTransfer:
		return []string{FabricChannelControl, FabricChannelReliable, FabricChannelBulk}
	case FabricServiceClassPlatformAdmin, FabricServiceClassClusterAdmin, FabricServiceClassOrganization, FabricServiceClassUserPortal:
		return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelReliable}
	default:
		return []string{FabricChannelControl, FabricChannelReliable}
	}
}

// normalizeFabricRequiredRoles returns the de-duplicated roles when any are
// given; otherwise it returns the default node roles for serviceClass.
func normalizeFabricRequiredRoles(roles
[]string, serviceClass string) []string {
	roles = dedupeStrings(roles)
	if len(roles) > 0 {
		return roles
	}
	switch serviceClass {
	case FabricServiceClassVPNPackets:
		return []string{"entry-node", "vpn-exit", "ipv4-egress"}
	case FabricServiceClassRemoteWorkspace:
		return []string{"entry-node", "rdp-worker"}
	case FabricServiceClassVideo:
		return []string{"entry-node", "video-relay"}
	case FabricServiceClassFileTransfer:
		return []string{"entry-node", "file-storage-cache"}
	case FabricServiceClassPlatformAdmin:
		return []string{"admin-ingress", "global-admin-runtime", "identity-runtime", "policy-authority", "audit-sink"}
	case FabricServiceClassClusterAdmin:
		return []string{"admin-ingress", "cluster-admin-runtime", "identity-runtime", "policy-authority", "audit-sink"}
	case FabricServiceClassOrganization:
		return []string{"public-ingress", "organization-portal-runtime", "identity-runtime", "policy-authority", "audit-sink"}
	case FabricServiceClassUserPortal:
		return []string{"public-ingress", "user-portal-runtime", "identity-runtime", "policy-authority", "audit-sink"}
	default:
		return []string{"entry-node"}
	}
}

// selectFabricServiceChannelPreferredNode returns the trimmed preferred node
// when it is present in nodeIDs, otherwise the first node ID (trimmed), or ""
// when nodeIDs is empty.
func selectFabricServiceChannelPreferredNode(nodeIDs []string, preferred string) string {
	preferred = strings.TrimSpace(preferred)
	if preferred != "" && containsString(nodeIDs, preferred) {
		return preferred
	}
	if len(nodeIDs) == 0 {
		return ""
	}
	return strings.TrimSpace(nodeIDs[0])
}

// fabricServiceChannelEffectivePool intersects the requested node IDs with
// the policy node IDs, preserving the requested order. If either list is
// empty after de-duplication, the other list is returned as-is.
func fabricServiceChannelEffectivePool(requested []string, policy []string) []string {
	requested = dedupeStrings(requested)
	policy = dedupeStrings(policy)
	if len(policy) == 0 {
		return requested
	}
	if len(requested) == 0 {
		return policy
	}
	out := []string{}
	for _, nodeID := range requested {
		if containsString(policy, nodeID) {
			out = append(out, nodeID)
		}
	}
	return dedupeStrings(out)
}

// fabricServiceFailoverFromPoolPolicy serializes the failover-related fields
// of the (normalized) pool policy to a JSON string. If marshalling fails it
// returns defaultFabricServiceFailover().
func fabricServiceFailoverFromPoolPolicy(policy FabricServiceChannelPoolPolicy) string {
	policy = normalizeFabricServiceChannelPoolPolicy(policy, defaultFabricServiceChannelPoolPolicy())
	raw, err :=
json.Marshal(map[string]any{
		"route_rebuild": policy.RouteRebuild,
		"entry_failover": policy.EntryFailover,
		"exit_failover": policy.ExitFailover,
		"sticky_session": policy.StickySession,
		"backend_fallback_allowed": policy.BackendFallbackAllowed,
		"selection_strategy": policy.SelectionStrategy,
		"pool_policy_fingerprint": policy.Fingerprint,
	})
	if err != nil {
		return defaultFabricServiceFailover()
	}
	return string(raw)
}

// fabricServiceChannelNodePool converts node IDs into pool candidates with
// the given role. Priority is the 1-based position in nodeIDs; the node equal
// to selected gets status "selected", all others "candidate".
func fabricServiceChannelNodePool(nodeIDs []string, role string, selected string) []FabricServiceChannelNodeCandidate {
	out := make([]FabricServiceChannelNodeCandidate, 0, len(nodeIDs))
	for index, nodeID := range nodeIDs {
		status := "candidate"
		if nodeID == selected {
			status = "selected"
		}
		out = append(out, FabricServiceChannelNodeCandidate{
			NodeID: nodeID,
			Role: role,
			Priority: index + 1,
			Status: status,
			Metadata: json.RawMessage(`{}`),
		})
	}
	return out
}

// fabricServiceChannelRouteFeedback is the in-memory feedback record kept per
// route. In this file the feedback maps are indexed by a command's or
// channel's primary route ID.
type fabricServiceChannelRouteFeedback struct {
	RouteID string
	ObservationID string
	Source string
	ChannelID string
	ResourceID string
	ViolationStatus string
	ViolationReason string
	Fenced bool
	ManualRetry bool
	StalePolicy bool
	StaleGeneration bool
	ProvenanceMissing bool
	StaleReason string
	ScoreAdjustment int
	Reasons []string
	LastError string
	ConsecutiveFailures int
	StallCount int
	LastSendDurationMs int64
	DegradedFallbackRecommended bool
	RouteRebuildRecommended bool
	QualityWindowSampleCount int
	QualityWindowSuccessCount int
	QualityWindowFailureCount int
	QualityWindowSlowCount int
	QualityWindowDropCount int
	ObservedAt time.Time
	ExpiresAt time.Time
	RetryCooldownUntil *time.Time
}

// fabricServiceChannelRouteProvenance carries the version identifiers derived
// for one route intent.
type fabricServiceChannelRouteProvenance struct {
	RouteID string
	RouteVersion string
	PolicyVersion string
	RouteGeneration string
}

// fabricServiceChannelRouteProvenanceFromIntents builds a provenance entry
// per intent, keyed by intent ID. Intents with a blank ID are skipped.
func fabricServiceChannelRouteProvenanceFromIntents(intents []MeshRouteIntent) map[string]fabricServiceChannelRouteProvenance {
	out := map[string]fabricServiceChannelRouteProvenance{}
	for _, intent := range intents {
		if strings.TrimSpace(intent.ID) == "" {
			continue
		}
		var policy syntheticRoutePolicy
		// Decode errors are deliberately ignored: an undecodable policy
		// leaves the zero value, which triggers the fallbacks below.
		_ =
json.Unmarshal(intent.Policy, &policy)
		routeVersion := strings.TrimSpace(policy.RouteVersion)
		// Without an explicit route version, the intent's update time (UTC,
		// RFC 3339) is used.
		if routeVersion == "" {
			routeVersion = intent.UpdatedAt.UTC().Format(time.RFC3339)
		}
		policyVersion := strings.TrimSpace(policy.PolicyVersion)
		if policyVersion == "" {
			policyVersion = routeVersion
		}
		// NOTE(review): RouteGeneration is set to the policy version here —
		// confirm that the two are meant to be identical.
		out[intent.ID] = fabricServiceChannelRouteProvenance{
			RouteID: intent.ID,
			RouteVersion: routeVersion,
			PolicyVersion: policyVersion,
			RouteGeneration: policyVersion,
		}
	}
	return out
}

// fabricServiceChannelRouteFeedback collects VPN-packet route feedback for
// the given entry nodes into a single map. For each distinct, non-blank node
// it merges, in order:
//  1. feedback derived from the node's current observations,
//  2. manual-retry feedback derived from observations including expired ones,
//  3. only when the node has no current observations, feedback derived from
//     its latest heartbeat, provided that heartbeat is no older than
//     fabricServiceChannelFeedbackMaxAge.
//
// The first store error aborts the collection and is returned.
func (s *Service) fabricServiceChannelRouteFeedback(ctx context.Context, clusterID string, entryNodeIDs []string, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) (map[string]fabricServiceChannelRouteFeedback, error) {
	out := map[string]fabricServiceChannelRouteFeedback{}
	policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
	for _, nodeID := range dedupeStrings(entryNodeIDs) {
		if strings.TrimSpace(nodeID) == "" {
			continue
		}
		observations, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
			ClusterID: clusterID,
			ReporterNodeID: nodeID,
			ServiceClass: FabricServiceClassVPNPackets,
			Now: now,
		})
		if err != nil {
			return nil, err
		}
		mergeFabricServiceChannelRouteFeedback(out, fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(observations, now, policy, routeProvenance))
		// Second query includes expired observations; it feeds only the
		// manual-retry derivation.
		expiredObservations, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
			ClusterID: clusterID,
			ReporterNodeID: nodeID,
			ServiceClass: FabricServiceClassVPNPackets,
			IncludeExpired: true,
			Now: now,
		})
		if err != nil {
			return nil, err
		}
		mergeFabricServiceChannelRouteFeedback(out, fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(expiredObservations, now, policy, routeProvenance))
		// Current observations exist: the heartbeat fallback is not needed.
		if len(observations) > 0 {
			continue
		}
		heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1)
		if err
!= nil {
			return nil, err
		}
		// Ignore missing or stale heartbeats.
		if len(heartbeats) == 0 || now.Sub(heartbeats[0].ObservedAt.UTC()) > fabricServiceChannelFeedbackMaxAge {
			continue
		}
		mergeFabricServiceChannelRouteFeedback(out, fabricServiceChannelRouteFeedbackFromHeartbeatWithProvenance(heartbeats[0], now, policy, routeProvenance))
	}
	return out, nil
}

// fabricServiceChannelRecoveryPolicy returns the recovery policy derived from
// the cluster record, or the default policy when the cluster cannot be
// loaded.
func (s *Service) fabricServiceChannelRecoveryPolicy(ctx context.Context, clusterID string) FabricServiceChannelRecoveryPolicy {
	cluster, err := s.store.GetCluster(ctx, strings.TrimSpace(clusterID))
	if err != nil {
		return defaultFabricServiceChannelRecoveryPolicy()
	}
	return fabricServiceChannelRecoveryPolicyFromCluster(cluster)
}

// recordFabricServiceChannelRouteFeedback persists the VPN-packet route
// feedback carried by a node heartbeat: first the inputs derived from the
// heartbeat itself, then the inputs derived from its access report. Every
// record expires fabricServiceChannelFeedbackMaxAge after the heartbeat's
// observation time (or after the service clock's current time when the
// heartbeat has none). Heartbeats without a cluster or node ID are ignored.
// The first store error is returned.
func (s *Service) recordFabricServiceChannelRouteFeedback(ctx context.Context, heartbeat NodeHeartbeat) error {
	if strings.TrimSpace(heartbeat.ClusterID) == "" || strings.TrimSpace(heartbeat.NodeID) == "" {
		return nil
	}
	observedAt := heartbeat.ObservedAt.UTC()
	if observedAt.IsZero() {
		observedAt = s.now().UTC()
	}
	expiresAt := observedAt.Add(fabricServiceChannelFeedbackMaxAge)
	for _, input := range fabricServiceChannelRouteFeedbackInputsFromHeartbeat(heartbeat, FabricServiceClassVPNPackets, expiresAt) {
		if _, err := s.store.RecordFabricServiceChannelRouteFeedback(ctx, input); err != nil {
			return err
		}
	}
	for _, input := range s.fabricServiceChannelRouteFeedbackInputsFromAccessReport(ctx, heartbeat, FabricServiceClassVPNPackets, expiresAt) {
		if _, err := s.store.RecordFabricServiceChannelRouteFeedback(ctx, input); err != nil {
			return err
		}
	}
	return nil
}

// fabricServiceChannelRouteFeedbackInputsFromAccessReport turns a heartbeat's
// "fabric_service_channel_access_report" into "fenced" route-feedback inputs.
//
// Inputs are produced only when the report shows at least one fabric route
// send failure and its last violation status is
// "fabric_route_send_failed_backend_fallback_blocked". One input is emitted
// for each active lease of the reporting entry node (at most 100 are
// requested) that has a primary route, is not already forced onto the backend
// fallback, and has no active access-report feedback for that route.
//
// It returns nil when the metadata is missing or invalid, the report does not
// qualify, or the lease lookup fails (best-effort: the error is not
// propagated).
func (s *Service) fabricServiceChannelRouteFeedbackInputsFromAccessReport(ctx context.Context, heartbeat NodeHeartbeat, serviceClass string, expiresAt time.Time) []RecordFabricServiceChannelRouteFeedbackInput {
	if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) {
		return nil
	}
	report := jsonMapPath(jsonObject(heartbeat.Metadata), "fabric_service_channel_access_report")
	if len(report) == 0 {
		return nil
	}
	if jsonInt(report, "fabric_route_send_failure") <= 0 {
		return nil
	}
	status := jsonString(report, "last_data_plane_violation_status")
	if status != "fabric_route_send_failed_backend_fallback_blocked" {
		return nil
	}
	observedAt := heartbeat.ObservedAt.UTC()
	if observedAt.IsZero() {
		// Use the service clock, as the caller does when it derives
		// expiresAt for the same heartbeat; mixing in the wall clock here
		// would put ObservedAt and ExpiresAt on different time sources.
		observedAt = s.now().UTC()
	}
	records, err := s.store.ListFabricServiceChannelLeases(ctx, ListFabricServiceChannelLeasesInput{
		ClusterID: heartbeat.ClusterID,
		EntryNodeID: heartbeat.NodeID,
		ServiceClass: serviceClass,
		IncludeExpired: false,
		Limit: 100,
		Now: observedAt,
	})
	if err != nil || len(records) == 0 {
		return nil
	}
	reason := firstNonEmptyString(jsonString(report, "last_data_plane_violation_reason"), "fabric_route_send_failed_backend_fallback_blocked")
	out := make([]RecordFabricServiceChannelRouteFeedbackInput, 0, len(records))
	for _, record := range records {
		summary := fabricServiceChannelLeaseSummaryFromRecord(record, observedAt)
		routeID := strings.TrimSpace(summary.PrimaryRouteID)
		if summary.Expired || routeID == "" || summary.ForceBackendFallback {
			continue
		}
		// Do not stack a second access-report observation on a route that
		// already has an active one.
		if s.fabricServiceChannelHasActiveAccessReportRouteFeedback(ctx, heartbeat.ClusterID, heartbeat.NodeID, routeID, serviceClass, observedAt) {
			continue
		}
		out = append(out, RecordFabricServiceChannelRouteFeedbackInput{
			ClusterID: heartbeat.ClusterID,
			ReporterNodeID: heartbeat.NodeID,
			RouteID: routeID,
			ServiceClass: serviceClass,
			FeedbackStatus: "fenced",
			ScoreAdjustment: -1030,
			Reasons: []string{"service_channel_route_rebuild_recommended", "data_plane_fabric_route_send_failed", "backend_fallback_blocked_by_policy"},
			LastError: reason,
			ConsecutiveFailures: maxInt(1, jsonInt(report, "fabric_route_send_failure")),
			Payload: mustJSONRaw(map[string]any{
				"source": "fabric_service_channel_access_report",
				"channel_id": summary.ChannelID,
				"resource_id": summary.ResourceID,
				"last_data_plane_violation_status": status,
				"last_data_plane_violation_reason": reason,
				"backend_fallback_blocked": jsonInt(report, "backend_fallback_blocked"),
				"fabric_route_send_failure": jsonInt(report, "fabric_route_send_failure"),
				"last_backend_relay_policy": jsonString(report, "last_backend_relay_policy"),
				"last_working_data_transport": jsonString(report, "last_working_data_transport"),
				"last_steady_state_transport": jsonString(report, "last_steady_state_transport"),
			}),
			ObservedAt: observedAt,
			ExpiresAt: expiresAt,
		})
	}
	return out
}

// fabricServiceChannelHasActiveAccessReportRouteFeedback reports whether the
// store already holds a non-expired "fenced" or "degraded" observation for
// the route that originated from an access report (identified by its reason
// list or payload source). Store errors are treated as "no".
func (s *Service) fabricServiceChannelHasActiveAccessReportRouteFeedback(ctx context.Context, clusterID, reporterNodeID, routeID, serviceClass string, observedAt time.Time) bool {
	observations, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{
		ClusterID: clusterID,
		ReporterNodeID: reporterNodeID,
		RouteID: routeID,
		ServiceClass: serviceClass,
		IncludeExpired: false,
		Now: observedAt,
	})
	if err != nil {
		return false
	}
	for _, observation := range observations {
		if observation.FeedbackStatus != "fenced" && observation.FeedbackStatus != "degraded" {
			continue
		}
		if containsString(observation.Reasons, "data_plane_fabric_route_send_failed") || jsonString(jsonObject(observation.Payload), "source") == "fabric_service_channel_access_report" {
			return true
		}
	}
	return false
}

// fabricServiceChannelRuntimeHeartbeat is the JSON shape used to decode the
// per-channel flow-scheduler statistics from a runtime heartbeat.
type fabricServiceChannelRuntimeHeartbeat struct {
	SchemaVersion string `json:"schema_version"`
	ConfigVersion string `json:"config_version"`
	Ingress struct {
		FlowScheduler struct {
			ChannelStats map[string]fabricServiceChannelRuntimeChannelStat `json:"channel_stats"`
		} `json:"flow_scheduler"`
	} `json:"ingress"`
}

type fabricServiceChannelRuntimeChannelStat struct {
	LastRouteID string `json:"last_route_id"`
	RoutePolicyVersion string `json:"route_policy_version,omitempty"`
	RouteGeneration string `json:"route_generation,omitempty"`
	RecoveryPolicyFingerprint string `json:"recovery_policy_fingerprint,omitempty"`
	LastFailedRouteID string `json:"last_failed_route_id"`
	LastFailedRoutePolicyVersion string `json:"last_failed_route_policy_version,omitempty"`
	LastFailedRouteGeneration string `json:"last_failed_route_generation,omitempty"`
LastError string `json:"last_error"` ConsecutiveFailures int `json:"consecutive_failures"` StallCount int `json:"stall_count"` LastSendDurationMillis int64 `json:"last_send_duration_ms"` RouteRebuildRecommended bool `json:"route_rebuild_recommended"` DegradedFallbackRecommended bool `json:"degraded_fallback_recommended"` QualityWindowSampleCount int `json:"quality_window_sample_count"` QualityWindowSuccessCount int `json:"quality_window_success_count"` QualityWindowFailureCount int `json:"quality_window_failure_count"` QualityWindowSlowCount int `json:"quality_window_slow_count"` QualityWindowDropCount int `json:"quality_window_drop_count"` QualityWindowAvgLatencyMs int64 `json:"quality_window_avg_latency_ms"` QualityWindowLastUpdatedAt string `json:"quality_window_last_updated_at"` } func fabricServiceChannelRouteFeedbackFromHeartbeat(heartbeat NodeHeartbeat, now time.Time) map[string]fabricServiceChannelRouteFeedback { return fabricServiceChannelRouteFeedbackFromHeartbeatWithProvenance(heartbeat, now, defaultFabricServiceChannelRecoveryPolicy(), nil) } func fabricServiceChannelRouteFeedbackFromHeartbeatWithProvenance(heartbeat NodeHeartbeat, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) map[string]fabricServiceChannelRouteFeedback { out := map[string]fabricServiceChannelRouteFeedback{} for _, input := range fabricServiceChannelRouteFeedbackInputsFromHeartbeat(heartbeat, FabricServiceClassVPNPackets, now.Add(fabricServiceChannelFeedbackMaxAge)) { observation := fabricServiceChannelAnnotateFeedbackProvenance(FabricServiceChannelRouteFeedbackObservation{ ClusterID: input.ClusterID, ReporterNodeID: input.ReporterNodeID, RouteID: input.RouteID, ServiceClass: input.ServiceClass, FeedbackStatus: input.FeedbackStatus, ScoreAdjustment: input.ScoreAdjustment, Reasons: append([]string{}, input.Reasons...), LastError: input.LastError, ConsecutiveFailures: input.ConsecutiveFailures, StallCount: 
input.StallCount, LastSendDurationMs: input.LastSendDurationMs, Payload: input.Payload, ObservedAt: input.ObservedAt, ExpiresAt: input.ExpiresAt, }, policy, routeProvenance) scoreAdjustment := input.ScoreAdjustment fenced := input.FeedbackStatus == "fenced" routeRebuildRecommended := containsString(input.Reasons, "service_channel_route_rebuild_recommended") degradedFallbackRecommended := containsString(input.Reasons, "service_channel_degraded_fallback_recommended") if observation.StalePolicy || observation.StaleGeneration { scoreAdjustment = fabricServiceChannelConservativeStaleScore(scoreAdjustment) fenced = false routeRebuildRecommended = false degradedFallbackRecommended = false } item := fabricServiceChannelRouteFeedback{ RouteID: input.RouteID, Fenced: fenced, StalePolicy: observation.StalePolicy, StaleGeneration: observation.StaleGeneration, ProvenanceMissing: observation.ProvenanceMissing, StaleReason: observation.StaleReason, ScoreAdjustment: scoreAdjustment, Reasons: observation.Reasons, LastError: input.LastError, ConsecutiveFailures: input.ConsecutiveFailures, StallCount: input.StallCount, LastSendDurationMs: input.LastSendDurationMs, DegradedFallbackRecommended: degradedFallbackRecommended, RouteRebuildRecommended: routeRebuildRecommended, QualityWindowSampleCount: fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_sample_count"), QualityWindowSuccessCount: fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_success_count"), QualityWindowFailureCount: fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_failure_count"), QualityWindowSlowCount: fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_slow_count"), QualityWindowDropCount: fabricServiceChannelFeedbackPayloadInt(input.Payload, "quality_window_drop_count"), ObservedAt: input.ObservedAt, } out[input.RouteID] = item } return out } func fabricServiceChannelRouteFeedbackInputsFromHeartbeat(heartbeat NodeHeartbeat, serviceClass 
string, expiresAt time.Time) []RecordFabricServiceChannelRouteFeedbackInput { if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) { return nil } var metadata struct { Report fabricServiceChannelRuntimeHeartbeat `json:"fabric_service_channel_runtime_report"` } if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil { return nil } if metadata.Report.SchemaVersion == "" || len(metadata.Report.Ingress.FlowScheduler.ChannelStats) == 0 { return nil } observedAt := heartbeat.ObservedAt.UTC() if observedAt.IsZero() { observedAt = time.Now().UTC() } var out []RecordFabricServiceChannelRouteFeedbackInput for _, stat := range metadata.Report.Ingress.FlowScheduler.ChannelStats { failedRouteID := strings.TrimSpace(stat.LastFailedRouteID) rollingFailureCount := fabricServiceChannelRollingFailureCount(stat) rollingStallCount := fabricServiceChannelRollingStallCount(stat) rollingLatencyMs := fabricServiceChannelRollingLatencyMs(stat) rollingWindowActive := stat.QualityWindowSampleCount > 0 freshFailureActive := failedRouteID != "" && (!rollingWindowActive || rollingFailureCount > 0) if freshFailureActive { scoreAdjustment := -30 reasons := []string{"service_channel_recent_route_failure"} if rollingWindowActive { reasons = append(reasons, "service_channel_rolling_quality_window") } status := "degraded" if stat.RouteRebuildRecommended || stat.DegradedFallbackRecommended || rollingFailureCount >= 2 { status = "fenced" scoreAdjustment -= 1000 reasons = append(reasons, "service_channel_route_rebuild_recommended") if stat.DegradedFallbackRecommended { reasons = append(reasons, "service_channel_degraded_fallback_recommended") } } out = append(out, RecordFabricServiceChannelRouteFeedbackInput{ ClusterID: heartbeat.ClusterID, ReporterNodeID: heartbeat.NodeID, RouteID: failedRouteID, ServiceClass: serviceClass, FeedbackStatus: status, ScoreAdjustment: scoreAdjustment, Reasons: dedupeStrings(reasons), LastError: strings.TrimSpace(stat.LastError), 
ConsecutiveFailures: rollingFailureCount, StallCount: rollingStallCount, LastSendDurationMs: rollingLatencyMs, Payload: fabricServiceChannelFeedbackPayload(stat, metadata.Report.ConfigVersion), ObservedAt: observedAt, ExpiresAt: expiresAt, }) } successRouteID := strings.TrimSpace(stat.LastRouteID) if successRouteID != "" && (!freshFailureActive || successRouteID != failedRouteID) && fabricServiceChannelStatHasFreshSuccess(stat) { qualityAdjustment, qualityReasons := fabricServiceChannelRouteQualityScore(rollingLatencyMs, rollingFailureCount, rollingStallCount) reasons := append([]string{"service_channel_recent_success"}, qualityReasons...) if rollingWindowActive { reasons = append(reasons, "service_channel_rolling_quality_window") } out = append(out, RecordFabricServiceChannelRouteFeedbackInput{ ClusterID: heartbeat.ClusterID, ReporterNodeID: heartbeat.NodeID, RouteID: successRouteID, ServiceClass: serviceClass, FeedbackStatus: "healthy", ScoreAdjustment: 10 + qualityAdjustment, Reasons: dedupeStrings(reasons), ConsecutiveFailures: rollingFailureCount, StallCount: rollingStallCount, LastSendDurationMs: rollingLatencyMs, Payload: fabricServiceChannelFeedbackPayload(stat, metadata.Report.ConfigVersion), ObservedAt: observedAt, ExpiresAt: expiresAt, }) } } return out } func fabricServiceChannelFeedbackPayload(stat fabricServiceChannelRuntimeChannelStat, configVersion string) json.RawMessage { payload := map[string]any{} rawStat, err := json.Marshal(stat) if err == nil { _ = json.Unmarshal(rawStat, &payload) } if strings.TrimSpace(configVersion) != "" { payload["observed_config_version"] = strings.TrimSpace(configVersion) } raw, err := json.Marshal(payload) if err != nil { return json.RawMessage(`{}`) } return raw } func fabricServiceChannelStatHasFreshSuccess(stat fabricServiceChannelRuntimeChannelStat) bool { if stat.QualityWindowSampleCount <= 0 { return !stat.RouteRebuildRecommended && !stat.DegradedFallbackRecommended } return stat.QualityWindowSuccessCount > 0 && 
stat.QualityWindowFailureCount == 0 && stat.QualityWindowDropCount == 0 } func fabricServiceChannelFlowSchedulerFromHeartbeat(heartbeat NodeHeartbeat) map[string]any { if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) { return map[string]any{} } metadata := jsonObject(heartbeat.Metadata) return jsonMapPath(metadata, "fabric_service_channel_runtime_report", "ingress", "flow_scheduler") } func fabricServiceChannelRollingFailureCount(stat fabricServiceChannelRuntimeChannelStat) int { if stat.QualityWindowSampleCount <= 0 { return stat.ConsecutiveFailures } return stat.QualityWindowFailureCount + stat.QualityWindowDropCount } func fabricServiceChannelRollingStallCount(stat fabricServiceChannelRuntimeChannelStat) int { if stat.QualityWindowSampleCount <= 0 { return stat.StallCount } return stat.QualityWindowSlowCount } func fabricServiceChannelRollingLatencyMs(stat fabricServiceChannelRuntimeChannelStat) int64 { if stat.QualityWindowSampleCount > 0 && stat.QualityWindowAvgLatencyMs > 0 { return stat.QualityWindowAvgLatencyMs } return stat.LastSendDurationMillis } func fabricServiceChannelRouteQualityScore(lastSendDurationMs int64, consecutiveFailures int, stallCount int) (int, []string) { score := 0 reasons := []string{} switch { case lastSendDurationMs <= 0: case lastSendDurationMs <= 10: score += 80 reasons = append(reasons, "service_channel_quality_latency_le_10ms") case lastSendDurationMs <= 25: score += 60 reasons = append(reasons, "service_channel_quality_latency_le_25ms") case lastSendDurationMs <= 50: score += 40 reasons = append(reasons, "service_channel_quality_latency_le_50ms") case lastSendDurationMs <= 100: score += 20 reasons = append(reasons, "service_channel_quality_latency_le_100ms") case lastSendDurationMs <= 250: score += 5 reasons = append(reasons, "service_channel_quality_latency_le_250ms") case lastSendDurationMs <= 500: score -= 10 reasons = append(reasons, "service_channel_quality_latency_slow") case lastSendDurationMs <= 1000: 
// decodeFabricServiceChannelFeedbackPayloadObject decodes payload as a JSON
// object. It reports false when the payload is empty, is not valid JSON, or is
// valid JSON of a non-object type. A JSON null decodes to a nil map, which is
// still safe to read from.
func decodeFabricServiceChannelFeedbackPayloadObject(payload json.RawMessage) (map[string]any, bool) {
	if len(payload) == 0 {
		return nil, false
	}
	var fields map[string]any
	// json.Unmarshal validates the input itself, so a separate json.Valid pass
	// would only parse the payload twice.
	if err := json.Unmarshal(payload, &fields); err != nil {
		return nil, false
	}
	return fields, true
}

// fabricServiceChannelRetryCooldownUntil extracts the
// "operator_retry_cooldown_until" timestamp from a feedback payload.
//
// It returns the RFC 3339 timestamp converted to UTC, or nil when the payload
// cannot be decoded as an object, or the key is missing, not a string, blank,
// or not parseable.
func fabricServiceChannelRetryCooldownUntil(payload json.RawMessage) *time.Time {
	fields, ok := decodeFabricServiceChannelFeedbackPayloadObject(payload)
	if !ok {
		return nil
	}
	value, _ := fields["operator_retry_cooldown_until"].(string)
	value = strings.TrimSpace(value)
	if value == "" {
		return nil
	}
	parsed, err := time.Parse(time.RFC3339Nano, value)
	if err != nil {
		return nil
	}
	parsed = parsed.UTC()
	return &parsed
}

// fabricServiceChannelFeedbackPayloadBool reports whether key holds the JSON
// boolean true in the payload object. Any other value, a missing key, or an
// undecodable payload yields false.
func fabricServiceChannelFeedbackPayloadBool(payload json.RawMessage, key string) bool {
	fields, ok := decodeFabricServiceChannelFeedbackPayloadObject(payload)
	if !ok {
		return false
	}
	value, isBool := fields[key].(bool)
	return isBool && value
}

// fabricServiceChannelFeedbackPayloadInt returns the JSON number stored under
// key, truncated toward zero. A missing key, a non-numeric value, or an
// undecodable payload yields 0.
//
// Numbers outside the int range saturate at the nearest bound. Go leaves such
// float-to-int conversions implementation-dependent, so without the clamp an
// oversized count could come back negative on some platforms.
func fabricServiceChannelFeedbackPayloadInt(payload json.RawMessage, key string) int {
	fields, ok := decodeFabricServiceChannelFeedbackPayloadObject(payload)
	if !ok {
		return 0
	}
	// Decoding into map[string]any always represents JSON numbers as float64.
	value, isNumber := fields[key].(float64)
	if !isNumber {
		return 0
	}
	const (
		maxIntValue = int(^uint(0) >> 1)
		minIntValue = -maxIntValue - 1
	)
	switch {
	case value >= float64(maxIntValue):
		return maxIntValue
	case value <= float64(minIntValue):
		return minIntValue
	default:
		return int(value)
	}
}
for _, key := range keys { if value, ok := raw[key].(string); ok && strings.TrimSpace(value) != "" { return strings.TrimSpace(value) } } if nested, ok := raw["recovery_policy"].(map[string]any); ok { for _, key := range keys { if value, ok := nested[key].(string); ok && strings.TrimSpace(value) != "" { return strings.TrimSpace(value) } } } return "" } func fabricServiceChannelAnnotateFeedbackProvenance(observation FabricServiceChannelRouteFeedbackObservation, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) FabricServiceChannelRouteFeedbackObservation { policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy()) observation.EffectivePolicyFingerprint = policy.Fingerprint observation.ObservedPolicyFingerprint = fabricServiceChannelFeedbackPayloadString(observation.Payload, "recovery_policy_fingerprint", "policy_fingerprint", "fingerprint") provenance := routeProvenance[observation.RouteID] observation.EffectiveRouteGeneration = provenance.RouteGeneration observation.ObservedRouteGeneration = fabricServiceChannelFeedbackPayloadString(observation.Payload, "route_generation", "route_policy_version", "policy_version") missingPolicy := observation.ObservedPolicyFingerprint == "" missingGeneration := observation.ObservedRouteGeneration == "" && observation.EffectiveRouteGeneration != "" observation.ProvenanceMissing = missingPolicy || missingGeneration if observation.ObservedPolicyFingerprint != "" && policy.Fingerprint != "" && observation.ObservedPolicyFingerprint != policy.Fingerprint { observation.StalePolicy = true } if observation.ObservedRouteGeneration != "" && observation.EffectiveRouteGeneration != "" && observation.ObservedRouteGeneration != observation.EffectiveRouteGeneration { observation.StaleGeneration = true } switch { case observation.StalePolicy && observation.StaleGeneration: observation.StaleReason = 
"service_channel_feedback_stale_policy_and_generation" case observation.StalePolicy: observation.StaleReason = "service_channel_feedback_stale_policy" case observation.StaleGeneration: observation.StaleReason = "service_channel_feedback_stale_generation" case observation.ProvenanceMissing: observation.StaleReason = "service_channel_feedback_provenance_missing" } if observation.StaleReason != "" { observation.Reasons = dedupeStrings(append(observation.Reasons, observation.StaleReason)) } return observation } func fabricServiceChannelConservativeStaleScore(score int) int { if score > 0 { return 0 } if score < -10 { return -10 } return score } func fabricServiceChannelFeedbackSuppressedByOperatorCooldown(input RecordFabricServiceChannelRouteFeedbackInput, cooldownUntil, observedAt time.Time) RecordFabricServiceChannelRouteFeedbackInput { originalStatus := input.FeedbackStatus originalScore := input.ScoreAdjustment payload := map[string]any{} if len(input.Payload) > 0 && json.Valid(input.Payload) { _ = json.Unmarshal(input.Payload, &payload) } payload["operator_feedback_suppressed"] = true payload["operator_suppressed_feedback_status"] = originalStatus payload["operator_suppressed_score_adjustment"] = originalScore payload["operator_retry_cooldown_until"] = cooldownUntil.UTC().Format(time.RFC3339Nano) payload["operator_suppressed_at"] = observedAt.UTC().Format(time.RFC3339Nano) raw, err := json.Marshal(payload) if err != nil { raw = []byte(`{}`) } input.FeedbackStatus = "operator_retry_cooldown" input.ScoreAdjustment = 0 input.Reasons = dedupeStrings(append(input.Reasons, "operator_expired_feedback_retry", "manual_feedback_expired_retry_cooldown", "service_channel_feedback_suppressed_by_operator_expire")) input.Payload = raw input.ExpiresAt = cooldownUntil.UTC() return input } func fabricServiceChannelRouteFeedbackFromObservations(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time) map[string]fabricServiceChannelRouteFeedback { return 
fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(observations, now, defaultFabricServiceChannelRecoveryPolicy(), nil) } func fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) map[string]fabricServiceChannelRouteFeedback { out := map[string]fabricServiceChannelRouteFeedback{} for _, observation := range observations { observation = fabricServiceChannelAnnotateFeedbackProvenance(observation, policy, routeProvenance) if strings.TrimSpace(observation.RouteID) == "" || (!observation.ExpiresAt.IsZero() && !observation.ExpiresAt.After(now.UTC())) { continue } item := out[observation.RouteID] item.RouteID = observation.RouteID stale := observation.StalePolicy || observation.StaleGeneration item.StalePolicy = item.StalePolicy || observation.StalePolicy item.StaleGeneration = item.StaleGeneration || observation.StaleGeneration item.ProvenanceMissing = item.ProvenanceMissing || observation.ProvenanceMissing if observation.StaleReason != "" { item.StaleReason = observation.StaleReason } item.Fenced = item.Fenced || (!stale && observation.FeedbackStatus == "fenced") if observation.RetryCooldownUntil != nil && observation.RetryCooldownUntil.After(now.UTC()) { item.ManualRetry = true } scoreAdjustment, ageDecayReasons := fabricServiceChannelFeedbackScoreWithAgeDecay(observation, now) if stale { scoreAdjustment = fabricServiceChannelConservativeStaleScore(scoreAdjustment) } item.ScoreAdjustment += scoreAdjustment item.Reasons = append(item.Reasons, observation.Reasons...) item.Reasons = append(item.Reasons, ageDecayReasons...) 
if observation.LastSendDurationMs > 0 && (item.LastSendDurationMs == 0 || observation.LastSendDurationMs < item.LastSendDurationMs) { item.LastSendDurationMs = observation.LastSendDurationMs } if observation.ConsecutiveFailures > item.ConsecutiveFailures { item.ConsecutiveFailures = observation.ConsecutiveFailures } if observation.StallCount > item.StallCount { item.StallCount = observation.StallCount } item.DegradedFallbackRecommended = item.DegradedFallbackRecommended || (!stale && (containsString(observation.Reasons, "service_channel_degraded_fallback_recommended") || fabricServiceChannelFeedbackPayloadBool(observation.Payload, "degraded_fallback_recommended"))) item.RouteRebuildRecommended = item.RouteRebuildRecommended || (!stale && (containsString(observation.Reasons, "service_channel_route_rebuild_recommended") || fabricServiceChannelFeedbackPayloadBool(observation.Payload, "route_rebuild_recommended"))) if sampleCount := fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_sample_count"); sampleCount > item.QualityWindowSampleCount { item.QualityWindowSampleCount = sampleCount } if successCount := fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_success_count"); successCount > item.QualityWindowSuccessCount { item.QualityWindowSuccessCount = successCount } if failureCount := fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_failure_count"); failureCount > item.QualityWindowFailureCount { item.QualityWindowFailureCount = failureCount } if slowCount := fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_slow_count"); slowCount > item.QualityWindowSlowCount { item.QualityWindowSlowCount = slowCount } if dropCount := fabricServiceChannelFeedbackPayloadInt(observation.Payload, "quality_window_drop_count"); dropCount > item.QualityWindowDropCount { item.QualityWindowDropCount = dropCount } if observation.LastError != "" { item.LastError = observation.LastError } if 
observation.ObservedAt.After(item.ObservedAt) { item.ObservedAt = observation.ObservedAt item.ExpiresAt = observation.ExpiresAt item.ObservationID = observation.ID item.Source = jsonString(jsonObject(observation.Payload), "source") item.ChannelID = jsonString(jsonObject(observation.Payload), "channel_id") item.ResourceID = jsonString(jsonObject(observation.Payload), "resource_id") item.ViolationStatus = jsonString(jsonObject(observation.Payload), "last_data_plane_violation_status") item.ViolationReason = jsonString(jsonObject(observation.Payload), "last_data_plane_violation_reason") } if observation.RetryCooldownUntil != nil && (item.RetryCooldownUntil == nil || observation.RetryCooldownUntil.After(*item.RetryCooldownUntil)) { cooldown := observation.RetryCooldownUntil.UTC() item.RetryCooldownUntil = &cooldown } out[observation.RouteID] = item } for routeID, item := range out { item.Reasons = dedupeStrings(item.Reasons) out[routeID] = item } return out } func fabricServiceChannelManualRetryFeedbackFromObservations(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time) map[string]fabricServiceChannelRouteFeedback { return fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(observations, now, defaultFabricServiceChannelRecoveryPolicy(), nil) } func fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) map[string]fabricServiceChannelRouteFeedback { out := map[string]fabricServiceChannelRouteFeedback{} now = now.UTC() for _, observation := range observations { observation = fabricServiceChannelAnnotateFeedbackProvenance(observation, policy, routeProvenance) if strings.TrimSpace(observation.RouteID) == "" || observation.RetryCooldownUntil == nil || !observation.RetryCooldownUntil.After(now) { continue } if observation.FeedbackStatus == 
"healthy" { continue } item := out[observation.RouteID] item.RouteID = observation.RouteID item.ManualRetry = true item.StalePolicy = item.StalePolicy || observation.StalePolicy item.StaleGeneration = item.StaleGeneration || observation.StaleGeneration item.ProvenanceMissing = item.ProvenanceMissing || observation.ProvenanceMissing if observation.StaleReason != "" { item.StaleReason = observation.StaleReason } item.ScoreAdjustment += 0 item.Reasons = append(item.Reasons, "operator_expired_feedback_retry", "manual_feedback_expired_retry_cooldown") if observation.LastError != "" { item.LastError = observation.LastError } if observation.ObservedAt.After(item.ObservedAt) { item.ObservedAt = observation.ObservedAt } cooldown := observation.RetryCooldownUntil.UTC() if item.RetryCooldownUntil == nil || cooldown.After(*item.RetryCooldownUntil) { item.RetryCooldownUntil = &cooldown } out[observation.RouteID] = item } for routeID, item := range out { item.Reasons = dedupeStrings(item.Reasons) out[routeID] = item } return out } func fabricServiceChannelFeedbackScoreWithAgeDecay(observation FabricServiceChannelRouteFeedbackObservation, now time.Time) (int, []string) { score := observation.ScoreAdjustment if score <= 0 || observation.FeedbackStatus != "healthy" || observation.ObservedAt.IsZero() { return score, nil } observedAt := observation.ObservedAt.UTC() now = now.UTC() if !now.After(observedAt) { return score, nil } maxAge := fabricServiceChannelFeedbackMaxAge if !observation.ExpiresAt.IsZero() && observation.ExpiresAt.After(observedAt) { maxAge = observation.ExpiresAt.Sub(observedAt) } if maxAge <= 0 { return 0, []string{"service_channel_feedback_age_decay_expired"} } age := now.Sub(observedAt) if age <= 0 { return score, nil } if age >= maxAge { return 0, []string{"service_channel_feedback_age_decay_expired"} } remaining := maxAge - age decayed := int((int64(score)*int64(remaining) + int64(maxAge) - 1) / int64(maxAge)) if decayed < 1 { decayed = 1 } if decayed == score 
{ return score, nil } return decayed, []string{"service_channel_feedback_age_decay"} } func mergeFabricServiceChannelRouteFeedback(dst map[string]fabricServiceChannelRouteFeedback, src map[string]fabricServiceChannelRouteFeedback) { for routeID, incoming := range src { existing := dst[routeID] existing.RouteID = routeID existing.Fenced = existing.Fenced || incoming.Fenced existing.ManualRetry = existing.ManualRetry || incoming.ManualRetry existing.StalePolicy = existing.StalePolicy || incoming.StalePolicy existing.StaleGeneration = existing.StaleGeneration || incoming.StaleGeneration existing.ProvenanceMissing = existing.ProvenanceMissing || incoming.ProvenanceMissing if incoming.StaleReason != "" { existing.StaleReason = incoming.StaleReason } existing.ScoreAdjustment += incoming.ScoreAdjustment existing.Reasons = dedupeStrings(append(existing.Reasons, incoming.Reasons...)) if incoming.ConsecutiveFailures > existing.ConsecutiveFailures { existing.ConsecutiveFailures = incoming.ConsecutiveFailures } if incoming.StallCount > existing.StallCount { existing.StallCount = incoming.StallCount } if incoming.LastSendDurationMs > 0 && (existing.LastSendDurationMs == 0 || incoming.LastSendDurationMs < existing.LastSendDurationMs) { existing.LastSendDurationMs = incoming.LastSendDurationMs } existing.DegradedFallbackRecommended = existing.DegradedFallbackRecommended || incoming.DegradedFallbackRecommended existing.RouteRebuildRecommended = existing.RouteRebuildRecommended || incoming.RouteRebuildRecommended if incoming.QualityWindowSampleCount > existing.QualityWindowSampleCount { existing.QualityWindowSampleCount = incoming.QualityWindowSampleCount } if incoming.QualityWindowSuccessCount > existing.QualityWindowSuccessCount { existing.QualityWindowSuccessCount = incoming.QualityWindowSuccessCount } if incoming.QualityWindowFailureCount > existing.QualityWindowFailureCount { existing.QualityWindowFailureCount = incoming.QualityWindowFailureCount } if 
incoming.QualityWindowSlowCount > existing.QualityWindowSlowCount { existing.QualityWindowSlowCount = incoming.QualityWindowSlowCount } if incoming.QualityWindowDropCount > existing.QualityWindowDropCount { existing.QualityWindowDropCount = incoming.QualityWindowDropCount } if incoming.LastError != "" { existing.LastError = incoming.LastError } if incoming.ObservedAt.After(existing.ObservedAt) { existing.ObservedAt = incoming.ObservedAt } if incoming.RetryCooldownUntil != nil && (existing.RetryCooldownUntil == nil || incoming.RetryCooldownUntil.After(*existing.RetryCooldownUntil)) { cooldown := incoming.RetryCooldownUntil.UTC() existing.RetryCooldownUntil = &cooldown } dst[routeID] = existing } } func serviceChannelRouteFeedbackReport(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time) *FabricServiceChannelRouteFeedbackReport { return serviceChannelRouteFeedbackReportWithPolicy(observations, now, defaultFabricServiceChannelRecoveryPolicy()) } func serviceChannelRouteFeedbackReportWithPolicy(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy) *FabricServiceChannelRouteFeedbackReport { return serviceChannelRouteFeedbackReportWithPolicyAndProvenance(observations, now, policy, nil) } func serviceChannelRouteFeedbackReportWithPolicyAndProvenance(observations []FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy, routeProvenance map[string]fabricServiceChannelRouteProvenance) *FabricServiceChannelRouteFeedbackReport { policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy()) reportObservations := make([]FabricServiceChannelRouteFeedbackObservation, 0, len(observations)) for _, observation := range observations { observation = fabricServiceChannelAnnotateFeedbackProvenance(observation, policy, routeProvenance) effectiveScore, ageDecayReasons := 
fabricServiceChannelFeedbackScoreWithAgeDecay(observation, now) if observation.StalePolicy || observation.StaleGeneration { effectiveScore = fabricServiceChannelConservativeStaleScore(effectiveScore) } observation.EffectiveScoreAdjustment = effectiveScore observation.Reasons = dedupeStrings(append(observation.Reasons, ageDecayReasons...)) observation.RecoveryState = fabricServiceChannelFeedbackObservationRecoveryState(observation, now) observation.RecoveryPromoted = fabricServiceChannelFeedbackObservationRecoveryPromoted(observation, now, policy) if observation.RecoveryPromoted { observation.RecoveryState = "healthy" } observation.RecoveryDemoted, observation.RecoveryReason = fabricServiceChannelFeedbackObservationRecoveryDemotion(observation, now, policy) observation.RecoveryHysteresisActive = observation.RecoveryState == "recovered" if observation.RecoveryHysteresisActive { observation.RecoveryHysteresisPenalty = policy.HysteresisPenalty } reportObservations = append(reportObservations, observation) } report := &FabricServiceChannelRouteFeedbackReport{ SchemaVersion: "rap.fabric_service_channel_route_feedback_report.v1", GeneratedAt: now.UTC(), FeedbackMaxAgeSeconds: int(fabricServiceChannelFeedbackMaxAge.Seconds()), RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(policy), ObservationCount: len(observations), Observations: reportObservations, } for _, observation := range reportObservations { switch strings.ToLower(strings.TrimSpace(observation.FeedbackStatus)) { case "fenced": report.FencedRouteCount++ case "degraded": report.DegradedRouteCount++ case "healthy": report.HealthyRouteCount++ } if observation.RecoveryState == "recovered" { report.RecoveredRouteCount++ } if observation.RecoveryHysteresisActive { report.RecoveryHysteresisCount++ } if observation.RecoveryPromoted { report.RecoveryPromotedCount++ } if observation.RecoveryDemoted { report.RecoveryDemotedCount++ } if observation.ProvenanceMissing { report.MissingProvenanceCount++ } if 
observation.StalePolicy { report.StalePolicyCount++ } if observation.StaleGeneration { report.StaleGenerationCount++ } } return report }

// fabricServiceChannelFeedbackObservationRecoveryState maps an observation to
// its recovery state label as of now.
//
// "fenced" and "degraded" feedback statuses map to themselves. A "healthy"
// status yields "recovered" while the retry cooldown is still running and the
// reasons include the rolling quality window marker, otherwise "healthy". Any
// other status yields "cooldown" while the retry cooldown is running and the
// empty string otherwise. The status is compared trimmed and lower-cased.
func fabricServiceChannelFeedbackObservationRecoveryState(observation FabricServiceChannelRouteFeedbackObservation, now time.Time) string {
	status := strings.ToLower(strings.TrimSpace(observation.FeedbackStatus))
	if status == "fenced" || status == "degraded" {
		return status
	}
	coolingDown := observation.RetryCooldownUntil != nil && observation.RetryCooldownUntil.After(now.UTC())
	if status != "healthy" {
		if coolingDown {
			return "cooldown"
		}
		return ""
	}
	if coolingDown && containsString(observation.Reasons, "service_channel_rolling_quality_window") {
		return "recovered"
	}
	return "healthy"
}

// fabricServiceChannelFeedbackObservationRecoveryPromoted reports whether an
// observation qualifies for recovery promotion: its retry cooldown must still
// be running, its status must be "healthy", its reasons must include the
// rolling quality window marker, and the quality window counters read from the
// observation payload must pass fabricServiceChannelFeedbackCleanRollingSamples
// for the given policy.
func fabricServiceChannelFeedbackObservationRecoveryPromoted(observation FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy) bool {
	cooldownUntil := observation.RetryCooldownUntil
	if cooldownUntil == nil || !cooldownUntil.After(now.UTC()) {
		return false
	}
	if strings.ToLower(strings.TrimSpace(observation.FeedbackStatus)) != "healthy" {
		return false
	}
	if !containsString(observation.Reasons, "service_channel_rolling_quality_window") {
		return false
	}
	window := func(key string) int {
		return fabricServiceChannelFeedbackPayloadInt(observation.Payload, key)
	}
	return fabricServiceChannelFeedbackCleanRollingSamples(
		window("quality_window_sample_count"),
		window("quality_window_success_count"),
		window("quality_window_failure_count"),
		window("quality_window_slow_count"),
		window("quality_window_drop_count"),
		policy,
	)
}

// fabricServiceChannelFeedbackObservationRecoveryDemotion reports whether an
// observation is demoted during recovery and, if so, the reason code.
//
// Nothing is demoted when the retry cooldown is absent or already elapsed, or
// when the observation has been promoted. Otherwise the first matching rule
// wins, in this order: fenced status (if enabled by policy), rebuild
// recommendation (if enabled by policy), failure/drop threshold, slow
// threshold, degraded status.
func fabricServiceChannelFeedbackObservationRecoveryDemotion(observation FabricServiceChannelRouteFeedbackObservation, now time.Time, policy FabricServiceChannelRecoveryPolicy) (bool, string) {
	coolingDown := observation.RetryCooldownUntil != nil && observation.RetryCooldownUntil.After(now.UTC())
	if !coolingDown || observation.RecoveryPromoted {
		return false, ""
	}
	status := strings.ToLower(strings.TrimSpace(observation.FeedbackStatus))
	window := func(key string) int {
		return fabricServiceChannelFeedbackPayloadInt(observation.Payload, key)
	}
	switch {
	case policy.DemotionFencedEnabled && status == "fenced":
		return true, "service_channel_recovery_demoted_fenced"
	case policy.DemotionRebuildEnabled &&
		(containsString(observation.Reasons, "service_channel_route_rebuild_recommended") ||
			fabricServiceChannelFeedbackPayloadBool(observation.Payload, "route_rebuild_recommended")):
		return true, "service_channel_recovery_demoted_rebuild"
	case window("quality_window_failure_count") >= policy.DemotionFailureThreshold,
		window("quality_window_drop_count") >= policy.DemotionDropThreshold:
		return true, "service_channel_recovery_demoted_failure"
	case window("quality_window_slow_count") >= policy.DemotionSlowThreshold:
		return true, "service_channel_recovery_demoted_slow"
	case status == "degraded":
		return true, "service_channel_recovery_demoted_degraded"
	}
	return false, ""
}

// fabricServiceChannelRoutesFromIntents converts every usable route intent
// into a service-channel route and returns them ordered best-first.
//
// Intents rejected by fabricServiceChannelRouteFromIntent are dropped. The
// result is stably sorted so that "authorized" routes come first, then by
// higher path score, then by fewer hops, then by route ID. The returned slice
// is never nil.
func fabricServiceChannelRoutesFromIntents(intents []MeshRouteIntent, serviceClass string, entryPool, exitPool, allowedChannels []string, generation string, now, defaultExpiresAt time.Time, feedback map[string]fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) []FabricServiceChannelRoute {
	policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy())
	routes := make([]FabricServiceChannelRoute, 0)
	for _, intent := range intents {
		if route, ok := fabricServiceChannelRouteFromIntent(intent, serviceClass, entryPool, exitPool, allowedChannels, generation, now, defaultExpiresAt, feedback, policy); ok {
			routes = append(routes, route)
		}
	}
	sort.SliceStable(routes, func(a, b int) bool {
		left, right := &routes[a], &routes[b]
		switch {
		case left.Status != right.Status:
			return left.Status == "authorized"
		case left.PathScore != right.PathScore:
			return left.PathScore > right.PathScore
		case len(left.Hops) != len(right.Hops):
			return len(left.Hops) < len(right.Hops)
		default:
			return left.RouteID < right.RouteID
		}
	})
	return routes
}

// fabricServiceChannelRouteFromIntent converts one route intent into a
// service-channel route. The boolean result is false when the intent is not
// active, belongs to another service class, has an undecodable or expired
// policy, resolves to fewer than two hops, has endpoints outside the entry or
// exit pool, or shares no channel with requestedChannels. The path score
// starts at 100 - 5*hops + priority and is then adjusted by any feedback
// recorded for the intent; fenced feedback forces the fenced status with a
// score of 0.
func fabricServiceChannelRouteFromIntent(intent MeshRouteIntent, serviceClass string, entryPool, exitPool, requestedChannels []string, generation string, now, defaultExpiresAt time.Time, feedback map[string]fabricServiceChannelRouteFeedback, recoveryPolicy FabricServiceChannelRecoveryPolicy) (FabricServiceChannelRoute, bool) { recoveryPolicy = normalizeFabricServiceChannelRecoveryPolicy(recoveryPolicy, defaultFabricServiceChannelRecoveryPolicy()) if intent.Status != "active" || strings.TrimSpace(intent.ServiceClass) != serviceClass { return FabricServiceChannelRoute{}, false } var policy syntheticRoutePolicy if err := json.Unmarshal(intent.Policy, &policy); err != nil { return FabricServiceChannelRoute{}, false } if policy.ExpiresAt != nil && !policy.ExpiresAt.After(now.UTC()) { return FabricServiceChannelRoute{}, false } var source nodeSelector var destination nodeSelector _ = json.Unmarshal(intent.SourceSelector, &source) _ = json.Unmarshal(intent.DestinationSelector, &destination) sourceNodeID := firstNodeID(source) destinationNodeID := firstNodeID(destination) hops := append([]string{}, policy.Hops...) 
if len(hops) == 0 && sourceNodeID != "" && destinationNodeID != "" { hops = []string{sourceNodeID, destinationNodeID} } if len(hops) < 2 { return FabricServiceChannelRoute{}, false } if sourceNodeID == "" { sourceNodeID = hops[0] } if destinationNodeID == "" { destinationNodeID = hops[len(hops)-1] } if !containsString(entryPool, sourceNodeID) || !containsString(exitPool, destinationNodeID) { return FabricServiceChannelRoute{}, false } allowedChannels := policy.AllowedChannels if len(allowedChannels) == 0 { allowedChannels = requestedChannels } if !fabricChannelsIntersect(allowedChannels, requestedChannels) { return FabricServiceChannelRoute{}, false } expiresAt := defaultExpiresAt if policy.ExpiresAt != nil { expiresAt = policy.ExpiresAt.UTC() } routeVersion := policy.RouteVersion if routeVersion == "" { routeVersion = intent.UpdatedAt.UTC().Format(time.RFC3339) } policyVersion := policy.PolicyVersion if policyVersion == "" { policyVersion = routeVersion } score := 100 - len(hops)*5 + intent.Priority if score < 1 { score = 1 } status := "authorized" recoveryState := "" recoveryPenalty := 0 recoveryPromoted := false recoveryDemoted := false recoveryReason := "" scoreReasons := []string{"active_route_intent", "entry_exit_pool_match"} if item, ok := feedback[intent.ID]; ok { score += item.ScoreAdjustment scoreReasons = append(scoreReasons, item.Reasons...) 
if item.StalePolicy || item.StaleGeneration { recoveryReason = item.StaleReason if recoveryReason == "" { recoveryReason = "service_channel_feedback_stale" } scoreReasons = append(scoreReasons, "service_channel_feedback_stale", recoveryReason) } if fabricServiceChannelFeedbackRecoveryDemoted(item, recoveryPolicy) { recoveryDemoted = true recoveryReason = fabricServiceChannelFeedbackRecoveryDemotionReason(item, recoveryPolicy) scoreReasons = append(scoreReasons, "service_channel_recovery_demoted", recoveryReason) } if item.Fenced { status = "fenced_by_service_channel_feedback" recoveryState = "fenced" score = 0 } else if score < 1 { score = 1 } if status == "authorized" && fabricServiceChannelFeedbackRecoveryPromoted(item, recoveryPolicy) { recoveryState = "healthy" recoveryPromoted = true scoreReasons = append(scoreReasons, "service_channel_recovery_promoted") } else if status == "authorized" && fabricServiceChannelFeedbackRecoveryHysteresisActive(item, recoveryPolicy) { recoveryState = "recovered" recoveryPenalty = recoveryPolicy.HysteresisPenalty score -= recoveryPenalty if score < 1 { score = 1 } scoreReasons = append(scoreReasons, "service_channel_recovery_hysteresis") } else if status == "authorized" && item.ScoreAdjustment > 0 { recoveryState = "healthy" } } return FabricServiceChannelRoute{ RouteID: intent.ID, ClusterID: intent.ClusterID, ServiceClass: serviceClass, SourceNodeID: sourceNodeID, DestinationNodeID: destinationNodeID, Hops: hops, AllowedChannels: allowedChannels, RouteVersion: routeVersion, PolicyVersion: policyVersion, Generation: generation, Status: status, RecoveryState: recoveryState, RecoveryPenalty: recoveryPenalty, RecoveryPromoted: recoveryPromoted, RecoveryDemoted: recoveryDemoted, RecoveryReason: recoveryReason, RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(recoveryPolicy), PathScore: score, ScoreReasons: dedupeStrings(scoreReasons), ExpiresAt: expiresAt, }, true } const fabricServiceChannelRecoveryHysteresisPenalty = 150 const 
fabricServiceChannelRecoveryPromotionMinSamples = 64

// fabricServiceChannelFeedbackRecoveryHysteresisActive reports whether a
// manually retried route is still in the recovery hysteresis window: feedback
// is not stale, the route is not fenced, its score adjustment is positive, its
// reasons include the rolling quality window marker, and it has not yet been
// promoted under the given policy.
func fabricServiceChannelFeedbackRecoveryHysteresisActive(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) bool {
	stale := item.StalePolicy || item.StaleGeneration
	eligible := item.ManualRetry && !item.Fenced && item.ScoreAdjustment > 0
	if stale || !eligible {
		return false
	}
	if !containsString(item.Reasons, "service_channel_rolling_quality_window") {
		return false
	}
	return !fabricServiceChannelFeedbackRecoveryPromoted(item, policy)
}

// fabricServiceChannelFeedbackRecoveryPromoted reports whether a manually
// retried route has earned promotion: feedback is not stale, the route is not
// fenced, its score adjustment is positive, its reasons include the rolling
// quality window marker, and its quality window counters pass
// fabricServiceChannelFeedbackCleanRollingSamples for the given policy.
func fabricServiceChannelFeedbackRecoveryPromoted(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) bool {
	stale := item.StalePolicy || item.StaleGeneration
	eligible := item.ManualRetry && !item.Fenced && item.ScoreAdjustment > 0
	if stale || !eligible {
		return false
	}
	if !containsString(item.Reasons, "service_channel_rolling_quality_window") {
		return false
	}
	return fabricServiceChannelFeedbackCleanRollingSamples(
		item.QualityWindowSampleCount,
		item.QualityWindowSuccessCount,
		item.QualityWindowFailureCount,
		item.QualityWindowSlowCount,
		item.QualityWindowDropCount,
		policy,
	)
}

// fabricServiceChannelFeedbackRecoveryDemoted reports whether a manually
// retried route should be demoted. Stale feedback, routes without a manual
// retry, and promoted routes are never demoted; otherwise any one of the
// demotion signals (policy-enabled fencing or rebuild recommendation, degraded
// fallback recommendation, a failure/drop/slow counter at or above its policy
// threshold, or a negative score adjustment) is sufficient.
func fabricServiceChannelFeedbackRecoveryDemoted(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) bool {
	if item.StalePolicy || item.StaleGeneration || !item.ManualRetry {
		return false
	}
	if fabricServiceChannelFeedbackRecoveryPromoted(item, policy) {
		return false
	}
	switch {
	case policy.DemotionFencedEnabled && item.Fenced,
		policy.DemotionRebuildEnabled && item.RouteRebuildRecommended,
		item.DegradedFallbackRecommended,
		item.QualityWindowFailureCount >= policy.DemotionFailureThreshold,
		item.QualityWindowDropCount >= policy.DemotionDropThreshold,
		item.QualityWindowSlowCount >= policy.DemotionSlowThreshold,
		item.ScoreAdjustment < 0:
		return true
	}
	return false
}

// fabricServiceChannelFeedbackRecoveryDemotionReason returns the reason code
// for a demotion. The first matching signal wins, in this order: fenced,
// rebuild, failure/drop threshold, slow threshold, degraded fallback, negative
// score adjustment; the generic code is returned when none matches.
func fabricServiceChannelFeedbackRecoveryDemotionReason(item fabricServiceChannelRouteFeedback, policy FabricServiceChannelRecoveryPolicy) string {
	switch {
	case policy.DemotionFencedEnabled && item.Fenced:
		return "service_channel_recovery_demoted_fenced"
	case policy.DemotionRebuildEnabled && item.RouteRebuildRecommended:
		return "service_channel_recovery_demoted_rebuild"
	case item.QualityWindowFailureCount >= policy.DemotionFailureThreshold,
		item.QualityWindowDropCount >= policy.DemotionDropThreshold:
		return "service_channel_recovery_demoted_failure"
	case item.QualityWindowSlowCount >= policy.DemotionSlowThreshold:
		return "service_channel_recovery_demoted_slow"
	case item.DegradedFallbackRecommended:
		return "service_channel_recovery_demoted_degraded_fallback"
	case item.ScoreAdjustment < 0:
		return "service_channel_recovery_demoted_degraded"
	default:
		return "service_channel_recovery_demoted"
	}
}

// fabricServiceChannelFeedbackCleanRollingSamples reports whether a rolling
// quality window is clean: no failures, slow samples or drops, and both the
// sample count and the success count reach policy.PromotionMinSamples.
func fabricServiceChannelFeedbackCleanRollingSamples(sampleCount, successCount, failureCount, slowCount, dropCount int, policy FabricServiceChannelRecoveryPolicy) bool {
	if failureCount != 0 || slowCount != 0 || dropCount != 0 {
		return false
	}
	minSamples := policy.PromotionMinSamples
	return sampleCount >= minSamples && successCount >= minSamples
}

// fabricChannelsIntersect reports whether any element of a is also in b.
func fabricChannelsIntersect(a, b []string) bool {
	for i := range a {
		if containsString(b, a[i]) {
			return true
		}
	}
	return false
}

// selectFabricServicePrimaryRoute picks the primary route and its alternates
// from routes, considering only routes whose status is "authorized".
//
// The first authorized route matching the selected entry and exit nodes
// becomes the primary, with every other authorized route (by differing route
// ID) as an alternate. Without such a match the first authorized route is the
// primary and the remaining authorized routes are alternates. An empty input
// yields a zero route and nil alternates; if nothing is authorized the primary
// is the zero route.
func selectFabricServicePrimaryRoute(routes []FabricServiceChannelRoute, selectedEntry, selectedExit string) (FabricServiceChannelRoute, []FabricServiceChannelRoute) {
	if len(routes) == 0 {
		return FabricServiceChannelRoute{}, nil
	}
	const authorized = "authorized"
	alternates := make([]FabricServiceChannelRoute, 0, len(routes)-1)

	// Preferred outcome: an authorized route for exactly the selected pair.
	for i := range routes {
		candidate := routes[i]
		if candidate.Status != authorized || candidate.SourceNodeID != selectedEntry || candidate.DestinationNodeID != selectedExit {
			continue
		}
		for j := range routes {
			if other := routes[j]; other.Status == authorized && other.RouteID != candidate.RouteID {
				alternates = append(alternates, other)
			}
		}
		return candidate, alternates
	}

	// Fallback: the first authorized route, the rest become alternates. The
	// primary is considered unset for as long as its RouteID is empty.
	var primary FabricServiceChannelRoute
	for i := range routes {
		route := routes[i]
		if route.Status != authorized {
			continue
		}
		if primary.RouteID == "" {
			primary = route
		} else {
			alternates = append(alternates, route)
		}
	}
	return primary, alternates
}

type 
fabricServiceChannelRouteIntentReplacementScope struct { EntryPoolKey string ExitPoolKey string ResourceKey string } func fabricServiceChannelRouteIntentMetadataKey(intent MeshRouteIntent, keys []string) string { if len(intent.Policy) == 0 || !json.Valid(intent.Policy) { return "" } var policy syntheticRoutePolicy if err := json.Unmarshal(intent.Policy, &policy); err != nil { return "" } for _, key := range keys { value, ok := policy.Metadata[key] if !ok { continue } switch typed := value.(type) { case string: if trimmed := strings.TrimSpace(typed); trimmed != "" { return key + ":" + trimmed } case fmt.Stringer: if trimmed := strings.TrimSpace(typed.String()); trimmed != "" { return key + ":" + trimmed } } } return "" } func fabricServiceChannelRouteIntentReplacementScopes(intents []MeshRouteIntent) map[string]fabricServiceChannelRouteIntentReplacementScope { out := map[string]fabricServiceChannelRouteIntentReplacementScope{} for _, intent := range intents { if routeID := strings.TrimSpace(intent.ID); routeID != "" { out[routeID] = fabricServiceChannelRouteIntentReplacementScope{ EntryPoolKey: fabricServiceChannelRouteIntentMetadataKey(intent, []string{"entry_pool_id", "service_entry_pool_id", "fabric_entry_pool_id"}), ExitPoolKey: fabricServiceChannelRouteIntentMetadataKey(intent, []string{"exit_pool_id", "service_exit_pool_id", "fabric_exit_pool_id"}), ResourceKey: fabricServiceChannelRouteIntentMetadataKey(intent, []string{"service_resource_id", "resource_id", "fabric_service_resource_id"}), } } } return out } func fabricServiceChannelRoutesShareReplacementScope(fencedRoute, candidateRoute SyntheticMeshRouteConfig, scopes map[string]fabricServiceChannelRouteIntentReplacementScope) bool { if fencedRoute.SourceNodeID == candidateRoute.SourceNodeID && fencedRoute.DestinationNodeID == candidateRoute.DestinationNodeID { return true } fencedScope := scopes[fencedRoute.RouteID] candidateScope := scopes[candidateRoute.RouteID] sameResource := 
strings.TrimSpace(fencedScope.ResourceKey) != "" && fencedScope.ResourceKey == strings.TrimSpace(candidateScope.ResourceKey) if fencedRoute.SourceNodeID == candidateRoute.SourceNodeID { return sameResource || (strings.TrimSpace(fencedScope.ExitPoolKey) != "" && fencedScope.ExitPoolKey == strings.TrimSpace(candidateScope.ExitPoolKey)) } if fencedRoute.DestinationNodeID == candidateRoute.DestinationNodeID { return sameResource || (strings.TrimSpace(fencedScope.EntryPoolKey) != "" && fencedScope.EntryPoolKey == strings.TrimSpace(candidateScope.EntryPoolKey)) } if sameResource && strings.TrimSpace(fencedScope.EntryPoolKey) != "" && fencedScope.EntryPoolKey == strings.TrimSpace(candidateScope.EntryPoolKey) && strings.TrimSpace(fencedScope.ExitPoolKey) != "" && fencedScope.ExitPoolKey == strings.TrimSpace(candidateScope.ExitPoolKey) { return true } return false } func fabricServiceRoutesFencedForSelectedPair(routes []FabricServiceChannelRoute, selectedEntry, selectedExit string) bool { for _, route := range routes { if route.SourceNodeID == selectedEntry && route.DestinationNodeID == selectedExit && route.Status == "fenced_by_service_channel_feedback" { return true } } return false } func fabricServiceRoutesFencedForPool(routes []FabricServiceChannelRoute) bool { for _, route := range routes { if route.Status == "fenced_by_service_channel_feedback" { return true } } return false } func defaultFabricServiceQoS(serviceClass string) string { switch serviceClass { case FabricServiceClassVPNPackets: return `{"priority":"bulk","interactive":false,"bulk_limit_mbps":0}` case FabricServiceClassRemoteWorkspace: return `{"priority":"interactive","interactive":true,"bulk_limit_mbps":0}` case FabricServiceClassVideo: return `{"priority":"interactive","interactive":true,"adaptive":true}` case FabricServiceClassPlatformAdmin, FabricServiceClassClusterAdmin, FabricServiceClassOrganization, FabricServiceClassUserPortal: return 
`{"priority":"control","interactive":true,"bulk_limit_mbps":0,"requires_step_up_for_high_risk":true}` default: return `{"priority":"normal","interactive":false,"bulk_limit_mbps":0}` } } func fabricServiceChannelHTTPIngress(serviceClass string) FabricServiceChannelHTTPIngress { return FabricServiceChannelHTTPIngress{ Type: "fabric_quic_only", SupportedMethods: []string{}, } } func fabricServiceChannelDataPlaneContract(serviceClass string, poolPolicy FabricServiceChannelPoolPolicy, fallback FabricServiceChannelFallback) FabricServiceChannelDataPlaneContract { entryFailover := firstNonEmptyString(poolPolicy.EntryFailover, "automatic") exitFailover := firstNonEmptyString(poolPolicy.ExitFailover, "automatic") routeRebuild := firstNonEmptyString(poolPolicy.RouteRebuild, "automatic") return FabricServiceChannelDataPlaneContract{ SchemaVersion: "rap.fabric_service_channel_data_plane.v1", Mode: "fabric_quic_only", ControlPlaneTransport: "fabric_control_quic", WorkingDataTransport: "fabric_quic_route", SteadyStateTransport: "fabric_route", BackendRelayPolicy: "disabled", ProductionForwardingRequired: true, ServiceNeutral: true, ProtocolAgnostic: true, LogicalFlowMode: "multi_flow_isolated", RequiredFlowIsolationClasses: fabricServiceChannelFlowIsolationClasses(serviceClass), RouteSelectionStrategy: firstNonEmptyString(poolPolicy.SelectionStrategy, "fastest_healthy"), EntryFailoverMode: entryFailover, ExitFailoverMode: exitFailover, RouteRebuildMode: routeRebuild, FailureDetectionSource: "route_quality_feedback_and_runtime_heartbeats", DegradedFallbackVisibility: "explicit_access_telemetry_and_rebuild_health", StableContractForServiceClass: serviceClass, } } func fabricServiceChannelFlowIsolationClasses(serviceClass string) []string { switch serviceClass { case FabricServiceClassVPNPackets: return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelReliable, FabricChannelBulk, FabricChannelDroppable, "vpn_packet"} case FabricServiceClassRemoteWorkspace: 
return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelReliable, FabricChannelBulk, FabricChannelDroppable} case FabricServiceClassVideo: return []string{FabricChannelControl, FabricChannelInteractive, FabricChannelDroppable} case FabricServiceClassFileTransfer: return []string{FabricChannelControl, FabricChannelReliable, FabricChannelBulk} default: return []string{FabricChannelControl, FabricChannelReliable} } } func defaultFabricServiceFailover() string { return `{"route_rebuild":"automatic","exit_failover":"automatic","sticky_session":true}` } func (s *Service) GetNodeSyntheticMeshConfig(ctx context.Context, input GetNodeSyntheticMeshConfigInput) (NodeSyntheticMeshConfig, error) { input.ClusterID = strings.TrimSpace(input.ClusterID) input.NodeID = strings.TrimSpace(input.NodeID) if input.ClusterID == "" || input.NodeID == "" { return NodeSyntheticMeshConfig{}, ErrInvalidPayload } cfg := NodeSyntheticMeshConfig{ Enabled: false, SchemaVersion: "c17z18.synthetic.v1", ClusterID: input.ClusterID, LocalNodeID: input.NodeID, AuthorityRequired: true, ConfigVersion: "disabled", PeerDirectoryVersion: "disabled", PolicyVersion: "disabled", PeerEndpoints: map[string]string{}, PeerEndpointCandidates: map[string][]PeerEndpointCandidate{}, PeerDirectory: []PeerDirectoryEntry{}, RecoverySeeds: []PeerRecoverySeed{}, RendezvousLeases: []PeerRendezvousLease{}, Routes: []SyntheticMeshRouteConfig{}, ProductionForwarding: false, } listenerConfig, err := s.nodeMeshListenerConfig(ctx, input) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh listener config: %w", err) } cfg.MeshListener = listenerConfig if listenerConfig != nil && listenerConfig.ProductionForwarding { cfg.ProductionForwarding = true } flags, err := s.store.GetEffectiveNodeTestingFlags(ctx, input.ClusterID, input.NodeID) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh testing flags: %w", err) } if !flags.Enabled || !flags.SyntheticLinksEnabled 
{ signed, err := s.signSyntheticMeshConfig(ctx, cfg) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh sign disabled config: %w", err) } return signed, nil } intents, err := s.store.ListRouteIntents(ctx, input.ClusterID) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh route intents: %w", err) } cfg.Enabled = true cfg.ConfigVersion = "c17z18-" + s.now().UTC().Format("20060102T150405Z") cfg.PeerDirectoryVersion = cfg.ConfigVersion cfg.PolicyVersion = cfg.ConfigVersion if cfg.MeshListener != nil && cfg.MeshListener.ConfigVersion == "" { cfg.MeshListener.ConfigVersion = cfg.ConfigVersion } meshLinks, err := s.store.ListMeshLinks(ctx, input.ClusterID) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh links: %w", err) } relayPolicy := newRendezvousRelayPolicy(input.NodeID, meshLinks, s.now()) recoveryPolicy := s.fabricServiceChannelRecoveryPolicy(ctx, input.ClusterID) cluster, err := s.store.GetCluster(ctx, input.ClusterID) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh cluster: %w", err) } adaptivePolicy := fabricServiceChannelAdaptivePolicyFromCluster(cluster) cfg.ServiceChannelAdaptivePolicy = &adaptivePolicy routeProvenance := fabricServiceChannelRouteProvenanceFromIntents(intents) serviceChannelFeedbackItems, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{ ClusterID: input.ClusterID, ReporterNodeID: input.NodeID, Now: s.now(), }) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh service channel feedback: %w", err) } cfg.ServiceChannelFeedback = serviceChannelRouteFeedbackReportWithPolicyAndProvenance(serviceChannelFeedbackItems, s.now(), recoveryPolicy, routeProvenance) serviceChannelFeedback := fabricServiceChannelRouteFeedbackFromObservationsWithProvenance(serviceChannelFeedbackItems, s.now(), recoveryPolicy, routeProvenance) cfg.ServiceChannelRemediationCommands, err = 
s.fabricServiceChannelRemediationCommandsForNode(ctx, input.ClusterID, input.NodeID, serviceChannelFeedback, s.now()) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh remediation commands: %w", err) } if err := s.recordFabricServiceChannelRemediationRebuildIntents(ctx, input.ClusterID, input.NodeID, cfg.ServiceChannelRemediationCommands, s.now()); err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh record remediation intents: %w", err) } remediationRoutePathDecisions, err := s.resolveFabricServiceChannelRemediationRebuildIntents(ctx, input, cfg.ServiceChannelRemediationCommands, intents, serviceChannelFeedback, cfg.ConfigVersion, s.now()) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh remediation decisions: %w", err) } serviceChannelExpiredFeedbackItems, err := s.store.ListFabricServiceChannelRouteFeedback(ctx, ListFabricServiceChannelRouteFeedbackInput{ ClusterID: input.ClusterID, ReporterNodeID: input.NodeID, IncludeExpired: true, Now: s.now(), }) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh expired service channel feedback: %w", err) } mergeFabricServiceChannelRouteFeedback(serviceChannelFeedback, fabricServiceChannelManualRetryFeedbackFromObservationsWithProvenance(serviceChannelExpiredFeedbackItems, s.now(), recoveryPolicy, routeProvenance)) localPerspective, err := s.localEndpointPerspective(ctx, input.ClusterID, input.NodeID) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh local endpoint perspective: %w", err) } peerDirectory := map[string]*PeerDirectoryEntry{} recoverySeeds := map[string]PeerRecoverySeed{} rendezvousLeases := map[string]PeerRendezvousLease{} routePathDecisions := append([]RoutePathDecision{}, remediationRoutePathDecisions...) 
for _, intent := range intents { route, peers, candidates, seeds, policyLeases, ok := s.syntheticRouteFromIntent(input, intent, localPerspective) if !ok { continue } if feedback, ok := serviceChannelFeedback[route.RouteID]; ok && feedback.Fenced { replacementDecision := s.serviceChannelRouteReplacementDecision(input, route, intents, serviceChannelFeedback, cfg.ConfigVersion) routePathDecisions = append(routePathDecisions, replacementDecision) if replacementDecision.DecisionSource != "service_channel_feedback_no_alternate_keep_primary" { continue } } reportedPeers, reportedCandidates, err := s.reportedEndpointConfig(ctx, input.ClusterID, input.NodeID, route.Hops, localPerspective) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh reported endpoint config: %w", err) } feedback, err := s.rendezvousRelayFeedback(ctx, input.ClusterID, route.Hops, s.now()) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh rendezvous relay feedback: %w", err) } relayPolicy.addFeedback(feedback) replacementHints, err := s.rendezvousRelayReplacementHints(ctx, input.ClusterID, route.Hops, s.now()) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh rendezvous replacement hints: %w", err) } relayPolicy.addReplacementHints(replacementHints) relayPolicy.addFeedback(replacementHintFeedback(replacementHints, s.now())) relayPolicy.addFeedback(rendezvousRelayRouteHealthFeedback(input.NodeID, route, meshLinks, s.now())) for nodeID, endpoint := range reportedPeers { peers[nodeID] = endpoint } for nodeID, items := range reportedCandidates { candidates[nodeID] = append(candidates[nodeID], items...) } if localRelayCandidates := publicDirectRelayCandidates(localPerspective.PeerEndpointCandidates); len(localRelayCandidates) > 0 { candidates[input.NodeID] = append(candidates[input.NodeID], enrichPeerEndpointCandidateCertPins(localRelayCandidates)...) 
if isUsableFabricControlEndpoint(localPerspective.PeerEndpoint) && !endpointPrivateForOffsite(localPerspective.PeerEndpoint) { peers[input.NodeID] = localPerspective.PeerEndpoint } } relayPeers, relayCandidates, err := s.reportedRouteRelayEndpointConfig(ctx, input.ClusterID, input.NodeID, route.Hops, localPerspective) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh reported relay endpoint config: %w", err) } for nodeID, endpoint := range relayPeers { if _, exists := peers[nodeID]; !exists { peers[nodeID] = endpoint } } for nodeID, items := range relayCandidates { if len(items) > 0 { candidates[nodeID] = append(candidates[nodeID], items...) } } routeLeases := scopedRendezvousLeases(policyLeases, route, input.NodeID, relayPolicy, s.now()) routeLeases = append(routeLeases, derivedRendezvousLeases(route, peers, candidates, input.NodeID, localPerspective, relayPolicy, s.now())...) cfg.Routes = append(cfg.Routes, route) routePathDecisions = append(routePathDecisions, routePathDecisionForRoute(route, input.NodeID, routeLeases, relayPolicy, cfg.ConfigVersion, serviceChannelFeedback[route.RouteID])) mergePeerDirectoryRoute(peerDirectory, route, input.NodeID) for nodeID, endpoint := range peers { if strings.TrimSpace(nodeID) != "" && strings.TrimSpace(endpoint) != "" { cfg.PeerEndpoints[nodeID] = endpoint peerDirectoryEntry(peerDirectory, nodeID).EndpointCount++ } } for nodeID, nodeCandidates := range candidates { if strings.TrimSpace(nodeID) == "" || len(nodeCandidates) == 0 { continue } cfg.PeerEndpointCandidates[nodeID] = append(cfg.PeerEndpointCandidates[nodeID], nodeCandidates...) 
mergePeerDirectoryCandidates(peerDirectory, nodeID, nodeCandidates) } mergeRecoverySeeds(recoverySeeds, seeds) mergeRendezvousLeases(rendezvousLeases, routeLeases) } if err := s.addCoreMeshBootstrapPeers(ctx, input, &cfg, peerDirectory, recoverySeeds, rendezvousLeases, localPerspective, relayPolicy); err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh core bootstrap peers: %w", err) } cfg.RecoverySeeds = sortedRecoverySeeds(recoverySeeds, maxScopedRecoverySeeds) cfg.RendezvousLeases = sortedRendezvousLeases(rendezvousLeases, maxScopedRendezvousLeases) cfg.RendezvousRelayPolicy = relayPolicy.report() cfg.RoutePathDecisions = routePathDecisionReportWithRecoveryPolicy(cfg.ConfigVersion, routePathDecisions, recoveryPolicy) _ = s.recordFabricServiceChannelRouteRebuildAttempts(ctx, input, cfg.RoutePathDecisions, cfg.ServiceChannelFeedback) markPeerDirectoryRecoverySeeds(peerDirectory, cfg.RecoverySeeds) markPeerDirectoryRendezvousLeases(peerDirectory, cfg.RendezvousLeases, input.NodeID) cfg.PeerDirectory = sortedPeerDirectory(peerDirectory) signed, err := s.signSyntheticMeshConfig(ctx, cfg) if err != nil { return NodeSyntheticMeshConfig{}, fmt.Errorf("synthetic mesh sign config: %w", err) } return signed, nil } func (s *Service) recordFabricServiceChannelRouteRebuildAttempts(ctx context.Context, input GetNodeSyntheticMeshConfigInput, report *RoutePathDecisionReport, feedbackReport *FabricServiceChannelRouteFeedbackReport) error { if report == nil || len(report.Decisions) == 0 { return nil } feedbackByRoute := map[string]FabricServiceChannelRouteFeedbackObservation{} if feedbackReport != nil { for _, item := range feedbackReport.Observations { if strings.TrimSpace(item.RouteID) != "" { feedbackByRoute[item.RouteID] = item } } } for _, decision := range report.Decisions { if strings.TrimSpace(decision.RebuildRequestID) == "" { continue } feedback := feedbackByRoute[decision.RouteID] serviceClass := firstNonEmptyString(feedback.ServiceClass, 
FabricServiceClassVPNPackets) outcome := "degraded_fallback" if strings.TrimSpace(decision.ReplacementRouteID) != "" { outcome = "replacement_selected" } else if decision.DecisionSource == "service_channel_feedback_no_alternate" { outcome = "no_alternate" } payload := mustJSONRaw(map[string]any{ "schema_version": "c18z98.route_rebuild_attempt_correlation.v1", "decision_id": decision.DecisionID, "score_reasons": decision.ScoreReasons, "path_score": decision.PathScore, "local_role": decision.LocalRole, "previous_hop_id": decision.PreviousHopID, "next_hop_id": decision.NextHopID, "control_plane_only": decision.ControlPlaneOnly, "production_forwarding": decision.ProductionForwarding, "decision_expires_at": decision.ExpiresAt.UTC().Format(time.RFC3339Nano), "feedback_observation_id": decision.FeedbackObservationID, "feedback_source": decision.FeedbackSource, "feedback_observed_at": formatOptionalTime(decision.FeedbackObservedAt), "feedback_expires_at": formatOptionalTime(decision.FeedbackExpiresAt), "feedback_channel_id": decision.FeedbackChannelID, "feedback_resource_id": decision.FeedbackResourceID, "feedback_violation_status": decision.FeedbackViolationStatus, "feedback_violation_reason": decision.FeedbackViolationReason, }) _, err := s.store.RecordFabricServiceChannelRouteRebuildAttempt(ctx, RecordFabricServiceChannelRouteRebuildAttemptInput{ ClusterID: input.ClusterID, ReporterNodeID: input.NodeID, ServiceClass: serviceClass, RouteID: decision.RouteID, ReplacementRouteID: decision.ReplacementRouteID, RebuildRequestID: decision.RebuildRequestID, RebuildStatus: decision.RebuildStatus, RebuildReason: decision.RebuildReason, RebuildAttempt: decision.RebuildAttempt, DecisionSource: decision.DecisionSource, Outcome: outcome, Generation: decision.Generation, PolicyFingerprint: feedback.EffectivePolicyFingerprint, ObservedPolicyFingerprint: feedback.ObservedPolicyFingerprint, ObservedRouteGeneration: feedback.ObservedRouteGeneration, EffectiveRouteGeneration: 
feedback.EffectiveRouteGeneration, FeedbackStatus: feedback.FeedbackStatus, FeedbackObservationID: decision.FeedbackObservationID, FeedbackSource: decision.FeedbackSource, FeedbackObservedAt: decision.FeedbackObservedAt, FeedbackExpiresAt: decision.FeedbackExpiresAt, FeedbackChannelID: decision.FeedbackChannelID, FeedbackResourceID: decision.FeedbackResourceID, FeedbackViolationStatus: decision.FeedbackViolationStatus, FeedbackViolationReason: decision.FeedbackViolationReason, FeedbackScoreAdjustment: feedback.ScoreAdjustment, FeedbackEffectiveScoreAdjustment: feedback.EffectiveScoreAdjustment, FeedbackReasons: append([]string{}, feedback.Reasons...), LastError: feedback.LastError, ConsecutiveFailures: feedback.ConsecutiveFailures, StallCount: feedback.StallCount, LastSendDurationMs: feedback.LastSendDurationMs, OldHops: append([]string{}, decision.OriginalHops...), ReplacementHops: append([]string{}, decision.EffectiveHops...), Payload: payload, }) if err != nil { return err } } return nil } func (s *Service) autoWarmFabricServiceChannelRouteRebuildAttemptSnapshot(ctx context.Context, clusterID string, attempt FabricServiceChannelRouteRebuildAttempt, now time.Time) (bool, error) { if fabricServiceChannelRouteRebuildHasCorrelationSnapshot(attempt) { return false, nil } nodeID := strings.TrimSpace(attempt.ReporterNodeID) if nodeID == "" { return false, ErrInvalidPayload } if now.IsZero() { now = time.Now().UTC() } heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 120) if err != nil { return false, err } attempt = enrichFabricServiceChannelRouteRebuildAttempt(attempt, heartbeats, now) if !attempt.NodeTransitionMatched && !attempt.NodeRouteGenerationMatched && attempt.PostRebuildSelectedRouteID == "" && attempt.PostRebuildSendPackets == 0 && attempt.PostRebuildSendFlowPackets == 0 { return false, nil } attempt.CorrelationSnapshotAt = &now if err := s.store.UpdateFabricServiceChannelRouteRebuildCorrelationSnapshot(ctx, 
// formatOptionalTime renders value as an RFC 3339 timestamp with nanosecond
// precision, converted to UTC. A nil pointer or a zero time yields the empty
// string.
func formatOptionalTime(value *time.Time) string {
	switch {
	case value == nil:
		return ""
	case value.IsZero():
		return ""
	default:
		return value.UTC().Format(time.RFC3339Nano)
	}
}
"c18z45.rebuild_snapshot_auto_warmup.v1", "trigger": "node_heartbeat", "reporter_node_id": nodeID, "heartbeat_id": heartbeat.ID, "scanned_count": len(attempts), "warmed_count": warmedCount, "already_fresh_count": freshCount, "error_count": errorCount, "warmed_attempt_ids": warmedAttemptIDs, "warmed_route_ids": warmedRouteIDs, "warmed_rebuild_ids": warmedRebuildRequestIDs, "warmed_generations": warmedGenerations, }), CreatedAt: now.UTC(), }) } func (s *Service) nodeMeshListenerConfig(ctx context.Context, input GetNodeSyntheticMeshConfigInput) (*NodeMeshListenerConfig, error) { workloads, err := s.store.ListDesiredWorkloads(ctx, input.ClusterID, input.NodeID) if err != nil { return nil, err } for _, workload := range workloads { if strings.TrimSpace(workload.ServiceType) != "mesh-listener" { continue } cfg, err := nodeMeshListenerConfigFromDesired(workload) if err != nil { return nil, err } return cfg, nil } return nil, nil } func (s *Service) desiredMeshListenerEndpointConfig(ctx context.Context, clusterID, nodeID string, priority int) (string, []PeerEndpointCandidate, error) { listener, err := s.nodeMeshListenerConfig(ctx, GetNodeSyntheticMeshConfigInput{ClusterID: clusterID, NodeID: nodeID}) if err != nil { return "", nil, err } if listener == nil || strings.TrimSpace(listener.DesiredState) != "enabled" || (strings.TrimSpace(listener.AdvertiseEndpoint) == "" && len(listener.EndpointCandidates) == 0 && len(listener.AdvertiseEndpoints) == 0) { return "", nil, nil } transport := firstNonEmptyString(listener.AdvertiseTransport, "direct_quic") connectivityMode := firstNonEmptyString(listener.ConnectivityMode, "direct") natType := firstNonEmptyString(listener.NATType, "unknown") rawCandidates := append([]PeerEndpointCandidate{}, listener.EndpointCandidates...) 
if len(rawCandidates) == 0 { for idx, endpoint := range listener.AdvertiseEndpoints { rawCandidates = append(rawCandidates, PeerEndpointCandidate{ EndpointID: fmt.Sprintf("%s-desired-mesh-listener-%d", nodeID, idx+1), Address: endpoint, Priority: idx + 1, }) } } if strings.TrimSpace(listener.AdvertiseEndpoint) != "" { rawCandidates = append([]PeerEndpointCandidate{{ EndpointID: nodeID + "-desired-mesh-listener", Address: listener.AdvertiseEndpoint, Priority: priority, }}, rawCandidates...) } candidates := make([]PeerEndpointCandidate, 0, len(rawCandidates)) seen := map[string]struct{}{} for idx, candidate := range rawCandidates { endpoint := strings.TrimRight(strings.TrimSpace(candidate.Address), "/") if endpoint == "" || isUnusableLocalPeerEndpoint(endpoint) { continue } if _, ok := seen[endpoint]; ok { continue } seen[endpoint] = struct{}{} candidate.Address = endpoint candidate.NodeID = nodeID candidate.EndpointID = firstNonEmptyString(strings.TrimSpace(candidate.EndpointID), fmt.Sprintf("%s-desired-mesh-listener-%d", nodeID, idx+1)) candidate.Transport = firstNonEmptyString(strings.TrimSpace(candidate.Transport), transport) candidate.ConnectivityMode = firstNonEmptyString(strings.TrimSpace(candidate.ConnectivityMode), connectivityMode) candidate.Reachability = firstNonEmptyString(strings.TrimSpace(candidate.Reachability), reachabilityFromConnectivityMode(candidate.ConnectivityMode)) candidate.NATType = firstNonEmptyString(strings.TrimSpace(candidate.NATType), natType) candidate.Region = firstNonEmptyString(strings.TrimSpace(candidate.Region), listener.Region) if candidate.Priority <= 0 { candidate.Priority = priority + idx } candidate.PolicyTags = appendUniqueStrings(candidate.PolicyTags, "operator-configured", "desired-mesh-listener") if len(candidate.Metadata) == 0 || !json.Valid(candidate.Metadata) { metadata, err := json.Marshal(map[string]any{ "source": "desired_workload.mesh-listener", "config_version": listener.ConfigVersion, "listen_addr": 
listener.ListenAddr, }) if err != nil { return "", nil, err } candidate.Metadata = metadata } candidates = append(candidates, candidate) } if len(candidates) == 0 { return "", nil, nil } if err := validatePeerEndpointCandidates(map[string][]PeerEndpointCandidate{nodeID: candidates}, []string{nodeID}); err != nil { return "", nil, err } return candidates[0].Address, candidates, nil } func nodeMeshListenerConfigFromDesired(workload NodeWorkloadDesiredState) (*NodeMeshListenerConfig, error) { var raw map[string]any if len(workload.Config) > 0 { if err := json.Unmarshal(workload.Config, &raw); err != nil { return nil, ErrInvalidPayload } } value := func(key string) string { if raw == nil { return "" } if text, ok := raw[key].(string); ok { return strings.TrimSpace(text) } return "" } intValue := func(key string) int { if raw == nil { return 0 } switch v := raw[key].(type) { case float64: return int(v) case int: return v } return 0 } boolValue := func(key string) bool { if raw == nil { return false } switch v := raw[key].(type) { case bool: return v case string: switch strings.ToLower(strings.TrimSpace(v)) { case "1", "true", "yes", "enabled": return true default: return false } } return false } stringSliceValue := func(key string) []string { if raw == nil { return nil } values, ok := raw[key].([]any) if !ok { return nil } out := make([]string, 0, len(values)) for _, value := range values { text := strings.TrimRight(strings.TrimSpace(fmt.Sprint(value)), "/") if text != "" { out = append(out, text) } } return out } endpointCandidatesValue := func(key string) ([]PeerEndpointCandidate, error) { if raw == nil || raw[key] == nil { return nil, nil } data, err := json.Marshal(raw[key]) if err != nil { return nil, ErrInvalidPayload } var candidates []PeerEndpointCandidate if err := json.Unmarshal(data, &candidates); err != nil { return nil, ErrInvalidPayload } return candidates, nil } mode := strings.ToLower(value("listen_port_mode")) if workload.DesiredState != "enabled" { 
mode = "disabled" } if mode == "" { mode = "manual" } switch mode { case "manual", "auto", "disabled": default: return nil, ErrInvalidPayload } listenAddr := value("listen_addr") start := intValue("auto_port_start") end := intValue("auto_port_end") if start <= 0 { start = 19131 } if end <= 0 { end = 19231 } if start > end { return nil, ErrInvalidPayload } endpointCandidates, err := endpointCandidatesValue("endpoint_candidates") if err != nil { return nil, err } productionForwarding := boolValue("production_forwarding") || boolValue("production_forwarding_enabled") return &NodeMeshListenerConfig{ SchemaVersion: "c17z23.mesh_listener_config.v1", Source: "desired_workload.mesh-listener", DesiredState: firstNonEmptyString(workload.DesiredState, "disabled"), ListenAddr: listenAddr, ListenPortMode: mode, AutoPortStart: start, AutoPortEnd: end, AdvertiseEndpoint: strings.TrimRight(value("advertise_endpoint"), "/"), AdvertiseEndpoints: stringSliceValue("advertise_endpoints"), EndpointCandidates: endpointCandidates, AdvertiseTransport: value("advertise_transport"), ConnectivityMode: value("connectivity_mode"), NATType: value("nat_type"), Region: value("region"), ConfigVersion: stringPtrValue(workload.Version), UpdatedByUserID: stringPtrValue(workload.UpdatedByUserID), UpdatedAt: workload.UpdatedAt.UTC().Format(time.RFC3339Nano), ControlPlaneOnly: !productionForwarding, ProductionForwarding: productionForwarding, }, nil } func (s *Service) addCoreMeshBootstrapPeers(ctx context.Context, input GetNodeSyntheticMeshConfigInput, cfg *NodeSyntheticMeshConfig, peerDirectory map[string]*PeerDirectoryEntry, recoverySeeds map[string]PeerRecoverySeed, rendezvousLeases map[string]PeerRendezvousLease, localPerspective endpointPerspective, relayPolicy *rendezvousRelayPolicy) error { roles, err := s.store.ListNodeRoleAssignments(ctx, input.ClusterID, input.NodeID) if err != nil { return err } if !hasActiveNodeRole(roles, "core-mesh") { return nil } localRelayCandidates := 
publicDirectRelayCandidates(localPerspective.PeerEndpointCandidates) if len(localRelayCandidates) > 0 { cfg.PeerEndpointCandidates[input.NodeID] = append(cfg.PeerEndpointCandidates[input.NodeID], localRelayCandidates...) mergePeerDirectoryCandidates(peerDirectory, input.NodeID, localRelayCandidates) if isUsableFabricControlEndpoint(localPerspective.PeerEndpoint) && !endpointPrivateForOffsite(localPerspective.PeerEndpoint) { cfg.PeerEndpoints[input.NodeID] = localPerspective.PeerEndpoint peerDirectoryEntry(peerDirectory, input.NodeID).EndpointCount++ } } nodes, err := s.store.ListClusterNodes(ctx, input.ClusterID) if err != nil { return err } sort.SliceStable(nodes, func(i, j int) bool { if nodes[i].HealthStatus != nodes[j].HealthStatus { return nodes[i].HealthStatus == "healthy" } iSeen := nodeLastSeen(nodes[i]) jSeen := nodeLastSeen(nodes[j]) if !iSeen.Equal(jSeen) { return iSeen.After(jSeen) } return nodes[i].CreatedAt.Before(nodes[j].CreatedAt) }) added := 0 for _, node := range nodes { if node.ID == input.NodeID || node.ID == "" || node.MembershipStatus != "active" || node.RegistrationStatus != NodeRegistrationActive || node.HealthStatus != "healthy" { continue } desiredEndpoint, desiredCandidates, err := s.desiredMeshListenerEndpointConfig(ctx, input.ClusterID, node.ID, added) if err != nil { return fmt.Errorf("desired mesh listener endpoint for node %s: %w", node.ID, err) } if added >= defaultCoreMeshBootstrapPeerTarget && !hasDirectUsableEndpointCandidate(desiredCandidates) { continue } heartbeats, err := s.store.ListNodeHeartbeats(ctx, input.ClusterID, node.ID, 1) if err != nil { return fmt.Errorf("list bootstrap peer heartbeat for node %s: %w", node.ID, err) } if len(heartbeats) == 0 && desiredEndpoint == "" && len(desiredCandidates) == 0 { continue } endpoint := desiredEndpoint candidates := append([]PeerEndpointCandidate{}, desiredCandidates...) 
if len(heartbeats) > 0 { reportedEndpoint, reportedCandidates, ok := endpointReportFromHeartbeat(heartbeats[0]) if ok { if endpoint == "" { endpoint = reportedEndpoint } candidates = append(candidates, reportedCandidates...) } } endpoint, candidates = scopeEndpointReportForLocal(localPerspective, endpoint, candidates) if endpoint != "" { cfg.PeerEndpoints[node.ID] = endpoint peerDirectoryEntry(peerDirectory, node.ID).EndpointCount++ } if len(candidates) > 0 { cfg.PeerEndpointCandidates[node.ID] = append(cfg.PeerEndpointCandidates[node.ID], candidates...) mergePeerDirectoryCandidates(peerDirectory, node.ID, candidates) } seed := recoverySeedFromEndpointReport(node.ID, endpoint, candidates, added) if seed.NodeID != "" && !endpointCandidateRequiresRendezvous(PeerEndpointCandidate{ Address: seed.Endpoint, Transport: seed.Transport, ConnectivityMode: seed.ConnectivityMode, Reachability: reachabilityFromConnectivityMode(seed.ConnectivityMode), }) { mergeRecoverySeeds(recoverySeeds, []PeerRecoverySeed{seed}) } added++ } mergeRendezvousLeases(rendezvousLeases, coreMeshBootstrapRendezvousLeases(input.ClusterID, input.NodeID, cfg.PeerEndpointCandidates, relayPolicy, s.now())) return nil } func publicDirectRelayCandidates(candidates []PeerEndpointCandidate) []PeerEndpointCandidate { out := []PeerEndpointCandidate{} for _, candidate := range candidates { if endpointCandidateRequiresRendezvous(candidate) || endpointCandidatePrivateForOffsite(candidate) { continue } if !isUsableFabricControlEndpoint(candidate.Address) { continue } if strings.ToLower(strings.TrimSpace(candidate.Reachability)) != "public" && strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode)) != "direct" { continue } out = append(out, candidate) } return out } func hasDirectUsableEndpointCandidate(candidates []PeerEndpointCandidate) bool { for _, candidate := range candidates { if strings.TrimSpace(candidate.Address) != "" && !endpointCandidatePrivateForOffsite(candidate) && 
!endpointCandidateRequiresRendezvous(candidate) { return true } } return false } func (s *Service) signSyntheticMeshConfig(ctx context.Context, cfg NodeSyntheticMeshConfig) (NodeSyntheticMeshConfig, error) { authorityKey, err := s.ensureClusterAuthority(ctx, cfg.ClusterID, nil) if err != nil { return NodeSyntheticMeshConfig{}, err } cfg.AuthorityRequired = true cfg.ClusterAuthority = authorityDescriptor(authorityKey) unsigned := cfg unsigned.AuthorityPayload = nil unsigned.AuthoritySignature = nil rawConfig, err := json.Marshal(unsigned) if err != nil { return NodeSyntheticMeshConfig{}, err } configHash, err := clusterauth.HashRaw(rawConfig) if err != nil { return NodeSyntheticMeshConfig{}, err } issuedAt := s.now().UTC() payload := clusterMeshConfigAuthorityPayload{ SchemaVersion: clusterMeshConfigAuthoritySchema, ClusterID: cfg.ClusterID, LocalNodeID: cfg.LocalNodeID, ConfigVersion: cfg.ConfigVersion, ConfigSHA256: configHash, IssuedAt: issuedAt, ExpiresAt: issuedAt.Add(5 * time.Minute), ControlPlaneOnly: !cfg.ProductionForwarding, ProductionForwarding: cfg.ProductionForwarding, } rawPayload, signature, err := clusterauth.SignPayload(authorityKey.PrivateKey, payload, issuedAt) if err != nil { return NodeSyntheticMeshConfig{}, err } cfg.AuthorityPayload = rawPayload cfg.AuthoritySignature = &signature return cfg, nil } func (s *Service) RecordNodeTelemetry(ctx context.Context, input RecordNodeTelemetryInput) (NodeTelemetryObservation, error) { if input.ClusterID == "" || input.NodeID == "" { return NodeTelemetryObservation{}, ErrInvalidPayload } input.Payload = defaultJSON(input.Payload, `{}`) if !json.Valid(input.Payload) { return NodeTelemetryObservation{}, errors.New("telemetry payload must be valid json") } if input.ObservedAt.IsZero() { input.ObservedAt = s.now() } return s.store.RecordNodeTelemetry(ctx, input) } func (s *Service) ListNodeTelemetry(ctx context.Context, actorUserID, clusterID, nodeID string, limit int) ([]NodeTelemetryObservation, error) { if 
err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListNodeTelemetry(ctx, clusterID, nodeID, limit) } func (s *Service) SetDesiredWorkload(ctx context.Context, input SetDesiredWorkloadInput) (NodeWorkloadDesiredState, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return NodeWorkloadDesiredState{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return NodeWorkloadDesiredState{}, err } input.ServiceType = strings.TrimSpace(input.ServiceType) if input.ClusterID == "" || input.NodeID == "" || input.ServiceType == "" { return NodeWorkloadDesiredState{}, ErrInvalidPayload } if input.DesiredState == "" { input.DesiredState = "disabled" } if input.RuntimeMode == "" { input.RuntimeMode = "container" } if !isSupportedWorkloadRuntimeMode(input.RuntimeMode) { return NodeWorkloadDesiredState{}, ErrInvalidPayload } input.Config = defaultJSON(input.Config, `{}`) input.Environment = defaultJSON(input.Environment, `{}`) if !json.Valid(input.Config) || !json.Valid(input.Environment) { return NodeWorkloadDesiredState{}, errors.New("config and environment must be valid json") } item, err := s.store.SetDesiredWorkload(ctx, input) if err != nil { return NodeWorkloadDesiredState{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "node_workload.desired_state_set", TargetType: "node", TargetID: &input.NodeID, Payload: json.RawMessage(`{"supervision_runtime":"stub_c5"}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) ListDesiredWorkloads(ctx context.Context, actorUserID, clusterID, nodeID string) ([]NodeWorkloadDesiredState, error) { actorUserID = strings.TrimSpace(actorUserID) if actorUserID != "" { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } } if clusterID == "" || nodeID == "" { return nil, ErrInvalidPayload } return 
// isSupportedWorkloadRuntimeMode reports whether mode, ignoring surrounding
// whitespace, is exactly "native" or "container". The comparison is
// case-sensitive.
func isSupportedWorkloadRuntimeMode(mode string) bool {
	trimmed := strings.TrimSpace(mode)
	return trimmed == "native" || trimmed == "container"
}
s.store.ListMeshLinks(ctx, clusterID) } func (s *Service) CreateRouteIntent(ctx context.Context, input CreateRouteIntentInput) (MeshRouteIntent, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return MeshRouteIntent{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return MeshRouteIntent{}, err } if input.ClusterID == "" || input.ServiceClass == "" { return MeshRouteIntent{}, ErrInvalidPayload } if input.Priority == 0 { input.Priority = 100 } input.SourceSelector = defaultJSON(input.SourceSelector, `{}`) input.DestinationSelector = defaultJSON(input.DestinationSelector, `{}`) input.Policy = defaultJSON(input.Policy, `{}`) if !json.Valid(input.SourceSelector) || !json.Valid(input.DestinationSelector) || !json.Valid(input.Policy) { return MeshRouteIntent{}, errors.New("source_selector, destination_selector, and policy must be valid json") } item, err := s.store.CreateRouteIntent(ctx, input) if err != nil { return MeshRouteIntent{}, err } item = routeIntentWithLifecycle(item, s.now()) _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "mesh.route_intent.created", TargetType: "mesh_route_intent", TargetID: &item.ID, Payload: json.RawMessage(`{"traffic_forwarding_enabled":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) ListRouteIntents(ctx context.Context, actorUserID, clusterID string) ([]MeshRouteIntent, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } items, err := s.store.ListRouteIntents(ctx, clusterID) if err != nil { return nil, err } return routeIntentsWithLifecycle(items, s.now()), nil } func (s *Service) ExpireRouteIntent(ctx context.Context, input RouteIntentLifecycleInput) (MeshRouteIntent, error) { input.ActorUserID = strings.TrimSpace(input.ActorUserID) input.ClusterID = strings.TrimSpace(input.ClusterID) input.RouteIntentID = 
strings.TrimSpace(input.RouteIntentID) input.Reason = strings.TrimSpace(input.Reason) if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return MeshRouteIntent{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return MeshRouteIntent{}, err } if input.ClusterID == "" || input.RouteIntentID == "" { return MeshRouteIntent{}, ErrInvalidPayload } if input.Reason == "" { input.Reason = "operator expired route intent" } expiresAt := s.now().UTC() item, err := s.store.ExpireRouteIntent(ctx, input, expiresAt) if err != nil { return MeshRouteIntent{}, err } item = routeIntentWithLifecycle(item, s.now()) _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "mesh.route_intent.expired", TargetType: "mesh_route_intent", TargetID: &item.ID, Payload: mustJSONRaw(map[string]any{"reason": input.Reason, "expires_at": expiresAt.Format(time.RFC3339Nano)}), CreatedAt: s.now(), }) return item, nil } func (s *Service) DisableRouteIntent(ctx context.Context, input RouteIntentLifecycleInput) (MeshRouteIntent, error) { input.ActorUserID = strings.TrimSpace(input.ActorUserID) input.ClusterID = strings.TrimSpace(input.ClusterID) input.RouteIntentID = strings.TrimSpace(input.RouteIntentID) input.Reason = strings.TrimSpace(input.Reason) if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return MeshRouteIntent{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return MeshRouteIntent{}, err } if input.ClusterID == "" || input.RouteIntentID == "" { return MeshRouteIntent{}, ErrInvalidPayload } if input.Reason == "" { input.Reason = "operator disabled route intent" } item, err := s.store.DisableRouteIntent(ctx, input) if err != nil { return MeshRouteIntent{}, err } item = routeIntentWithLifecycle(item, s.now()) _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: 
&input.ActorUserID, EventType: "mesh.route_intent.disabled", TargetType: "mesh_route_intent", TargetID: &item.ID, Payload: mustJSONRaw(map[string]any{"reason": input.Reason}), CreatedAt: s.now(), }) return item, nil } func routeIntentsWithLifecycle(items []MeshRouteIntent, now time.Time) []MeshRouteIntent { out := make([]MeshRouteIntent, 0, len(items)) for _, item := range items { out = append(out, routeIntentWithLifecycle(item, now)) } return out } func routeIntentWithLifecycle(item MeshRouteIntent, now time.Time) MeshRouteIntent { item.LifecycleStatus = strings.TrimSpace(item.Status) var policy syntheticRoutePolicy if err := json.Unmarshal(item.Policy, &policy); err == nil && policy.ExpiresAt != nil { expiresAt := policy.ExpiresAt.UTC() item.PolicyExpiresAt = &expiresAt if !expiresAt.After(now.UTC()) { item.IsExpired = true } } switch { case item.Status == "disabled": item.LifecycleStatus = "disabled" case item.IsExpired: item.LifecycleStatus = "expired" case item.LifecycleStatus == "": item.LifecycleStatus = "active" } return item } func (s *Service) ListQoSPolicies(ctx context.Context, actorUserID, clusterID string) ([]MeshQoSPolicy, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListQoSPolicies(ctx, clusterID) } func (s *Service) ListFabricEntryPoints(ctx context.Context, actorUserID, clusterID string) ([]FabricEntryPoint, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListFabricEntryPoints(ctx, clusterID) } func (s *Service) CreateFabricEntryPoint(ctx context.Context, input CreateFabricEntryPointInput) (FabricEntryPoint, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return FabricEntryPoint{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return FabricEntryPoint{}, err } input.Name = strings.TrimSpace(input.Name) input.Status = strings.TrimSpace(input.Status) 
input.EndpointType = strings.TrimSpace(input.EndpointType) if input.Status == "" { input.Status = "active" } if input.EndpointType == "" { input.EndpointType = "client_access" } if input.ClusterID == "" || input.Name == "" || !isFabricEndpointStatus(input.Status) || !isFabricEntryPointType(input.EndpointType) { return FabricEntryPoint{}, ErrInvalidPayload } if input.PublicEndpoint != nil { trimmed := strings.TrimSpace(*input.PublicEndpoint) if trimmed == "" { input.PublicEndpoint = nil } else { input.PublicEndpoint = &trimmed } } input.Policy = defaultJSON(input.Policy, `{}`) input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.Policy) || !json.Valid(input.Metadata) { return FabricEntryPoint{}, errors.New("entry point policy and metadata must be valid json") } item, err := s.store.CreateFabricEntryPoint(ctx, input) if err != nil { return FabricEntryPoint{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "fabric.entry_point.created", TargetType: "fabric_entry_point", TargetID: &item.ID, Payload: json.RawMessage(`{"runtime_routing_enabled":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) SetFabricEntryPointNode(ctx context.Context, input SetFabricEntryPointNodeInput) (FabricEntryPointNode, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return FabricEntryPointNode{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return FabricEntryPointNode{}, err } input.Status = strings.TrimSpace(input.Status) if input.Status == "" { input.Status = "active" } if input.Priority <= 0 { input.Priority = 100 } if input.ClusterID == "" || input.EntryPointID == "" || input.NodeID == "" || !isFabricEndpointStatus(input.Status) { return FabricEntryPointNode{}, ErrInvalidPayload } input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.Metadata) { return FabricEntryPointNode{}, 
errors.New("entry point node metadata must be valid json") } return s.store.SetFabricEntryPointNode(ctx, input) } func (s *Service) ListFabricEntryPointNodes(ctx context.Context, actorUserID, clusterID, entryPointID string) ([]FabricEntryPointNode, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } if clusterID == "" || entryPointID == "" { return nil, ErrInvalidPayload } return s.store.ListFabricEntryPointNodes(ctx, clusterID, entryPointID) } func (s *Service) ListFabricEgressPools(ctx context.Context, actorUserID, clusterID string) ([]FabricEgressPool, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListFabricEgressPools(ctx, clusterID) } func (s *Service) CreateFabricEgressPool(ctx context.Context, input CreateFabricEgressPoolInput) (FabricEgressPool, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return FabricEgressPool{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return FabricEgressPool{}, err } input.Name = strings.TrimSpace(input.Name) input.Status = strings.TrimSpace(input.Status) if input.Status == "" { input.Status = "active" } if input.ClusterID == "" || input.Name == "" || !isFabricEndpointStatus(input.Status) { return FabricEgressPool{}, ErrInvalidPayload } if input.Description != nil { trimmed := strings.TrimSpace(*input.Description) if trimmed == "" { input.Description = nil } else { input.Description = &trimmed } } input.RouteScope = defaultJSON(input.RouteScope, `{}`) input.Policy = defaultJSON(input.Policy, `{}`) input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.RouteScope) || !json.Valid(input.Policy) || !json.Valid(input.Metadata) { return FabricEgressPool{}, errors.New("egress pool route_scope, policy, and metadata must be valid json") } item, err := s.store.CreateFabricEgressPool(ctx, input) if err != nil { return FabricEgressPool{}, err } _ = 
s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "fabric.egress_pool.created", TargetType: "fabric_egress_pool", TargetID: &item.ID, Payload: json.RawMessage(`{"runtime_routing_enabled":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) SetFabricEgressPoolNode(ctx context.Context, input SetFabricEgressPoolNodeInput) (FabricEgressPoolNode, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return FabricEgressPoolNode{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return FabricEgressPoolNode{}, err } input.Status = strings.TrimSpace(input.Status) if input.Status == "" { input.Status = "active" } if input.Priority <= 0 { input.Priority = 100 } if input.ClusterID == "" || input.EgressPoolID == "" || input.NodeID == "" || !isFabricEndpointStatus(input.Status) { return FabricEgressPoolNode{}, ErrInvalidPayload } input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.Metadata) { return FabricEgressPoolNode{}, errors.New("egress pool node metadata must be valid json") } return s.store.SetFabricEgressPoolNode(ctx, input) } func (s *Service) ListFabricEgressPoolNodes(ctx context.Context, actorUserID, clusterID, egressPoolID string) ([]FabricEgressPoolNode, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } if clusterID == "" || egressPoolID == "" { return nil, ErrInvalidPayload } return s.store.ListFabricEgressPoolNodes(ctx, clusterID, egressPoolID) } func (s *Service) GetClusterAuthorityState(ctx context.Context, actorUserID, clusterID string) (ClusterAuthorityState, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return ClusterAuthorityState{}, err } return s.store.GetClusterAuthorityState(ctx, clusterID) } func (s *Service) UpdateClusterAuthorityState(ctx context.Context, input UpdateClusterAuthorityInput) (ClusterAuthorityState, 
error) { role, err := s.store.GetPlatformRole(ctx, strings.TrimSpace(input.ActorUserID)) if err != nil { return ClusterAuthorityState{}, err } if !isPlatformAdminRole(role) { return ClusterAuthorityState{}, ErrAccessDenied } if input.MutationMode == "recovery_override" && role != PlatformRoleRecoveryAdmin { return ClusterAuthorityState{}, ErrAccessDenied } if input.AuthorityState == "" { input.AuthorityState = "authoritative" } if input.MutationMode == "" { input.MutationMode = "normal" } item, err := s.store.UpdateClusterAuthorityState(ctx, input) if err != nil { return ClusterAuthorityState{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "cluster_authority.updated", TargetType: "cluster", TargetID: &input.ClusterID, Payload: json.RawMessage(`{"split_brain_guard":true}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) ListClusterAdminSummaries(ctx context.Context, actorUserID string) ([]ClusterAdminSummary, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListClusterAdminSummaries(ctx) } func (s *Service) CreateVPNConnection(ctx context.Context, input CreateVPNConnectionInput) (VPNConnection, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return VPNConnection{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return VPNConnection{}, err } input.Name = strings.TrimSpace(input.Name) input.ProtocolFamily = strings.TrimSpace(input.ProtocolFamily) if input.ProtocolFamily == "" { input.ProtocolFamily = "generic" } input.Mode = strings.TrimSpace(input.Mode) if input.Mode == "" { input.Mode = VPNConnectionModeSingleActive } input.DesiredState = strings.TrimSpace(input.DesiredState) if input.DesiredState == "" { input.DesiredState = VPNConnectionDesiredDisabled } if input.ClusterID == "" || input.OrganizationID == "" || input.Name == "" { return 
VPNConnection{}, ErrInvalidPayload } if input.Mode != VPNConnectionModeSingleActive { return VPNConnection{}, errors.New("vpn connection mode must be single_active") } if !isAllowedVPNDesiredState(input.DesiredState) { return VPNConnection{}, errors.New("vpn connection desired_state must be enabled or disabled") } input.TargetEndpoint = defaultJSON(input.TargetEndpoint, `{}`) input.AllowedNodePolicy = defaultJSON(input.AllowedNodePolicy, `{"mode":"explicit","node_ids":[]}`) input.RoutingUsage = defaultJSON(input.RoutingUsage, `[]`) input.RoutePolicy = defaultJSON(input.RoutePolicy, `{}`) input.QoSPolicy = defaultJSON(input.QoSPolicy, `{}`) input.PlacementPolicy = defaultJSON(input.PlacementPolicy, `{}`) input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.TargetEndpoint) || !json.Valid(input.AllowedNodePolicy) || !json.Valid(input.RoutingUsage) || !json.Valid(input.RoutePolicy) || !json.Valid(input.QoSPolicy) || !json.Valid(input.PlacementPolicy) || !json.Valid(input.Metadata) { return VPNConnection{}, errors.New("vpn connection json fields must be valid json") } item, err := s.store.CreateVPNConnection(ctx, input) if err != nil { return VPNConnection{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "vpn_connection.created", TargetType: "vpn_connection", TargetID: &item.ID, Payload: json.RawMessage(`{"runtime_created":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) ListVPNConnections(ctx context.Context, actorUserID, clusterID string) ([]VPNConnection, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListVPNConnections(ctx, clusterID) } func (s *Service) GetVPNConnection(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) (VPNConnection, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return VPNConnection{}, err } item, err := 
s.store.GetVPNConnection(ctx, clusterID, vpnConnectionID) if errors.Is(err, pgx.ErrNoRows) { return VPNConnection{}, ErrInvalidVPNConnection } return item, err } func (s *Service) UpdateVPNConnectionDesiredState(ctx context.Context, input UpdateVPNConnectionDesiredStateInput) (VPNConnection, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return VPNConnection{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return VPNConnection{}, err } input.DesiredState = strings.TrimSpace(input.DesiredState) if !isAllowedVPNDesiredState(input.DesiredState) { return VPNConnection{}, errors.New("vpn connection desired_state must be enabled or disabled") } item, err := s.store.UpdateVPNConnectionDesiredState(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return VPNConnection{}, ErrInvalidVPNConnection } if err != nil { return VPNConnection{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "vpn_connection.desired_state_changed", TargetType: "vpn_connection", TargetID: &input.VPNConnectionID, Payload: json.RawMessage(`{"runtime_executed":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) UpsertVPNConnectionRoutePolicy(ctx context.Context, input UpsertVPNConnectionRoutePolicyInput) (VPNConnectionRoutePolicy, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return VPNConnectionRoutePolicy{}, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return VPNConnectionRoutePolicy{}, err } input.RouteType = strings.TrimSpace(input.RouteType) input.Destination = strings.TrimSpace(input.Destination) input.Action = strings.TrimSpace(input.Action) input.Status = strings.TrimSpace(input.Status) if input.Action == "" { input.Action = "allow" } if input.Status == "" { input.Status = "active" } if input.Priority == 0 { input.Priority = 100 } if 
input.ClusterID == "" || input.VPNConnectionID == "" || input.RouteType == "" || input.Destination == "" { return VPNConnectionRoutePolicy{}, ErrInvalidPayload } if !isAllowedVPNRouteType(input.RouteType) || !isAllowedVPNRouteAction(input.Action) || !isAllowedVPNPolicyStatus(input.Status) { return VPNConnectionRoutePolicy{}, ErrInvalidPayload } input.Policy = defaultJSON(input.Policy, `{}`) if !json.Valid(input.Policy) { return VPNConnectionRoutePolicy{}, errors.New("vpn route policy json must be valid json") } item, err := s.store.UpsertVPNConnectionRoutePolicy(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return VPNConnectionRoutePolicy{}, ErrInvalidVPNConnection } if err != nil { return VPNConnectionRoutePolicy{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "vpn_connection.route_policy_changed", TargetType: "vpn_connection", TargetID: &input.VPNConnectionID, Payload: json.RawMessage(`{"routing_runtime_changed":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) ListVPNConnectionRoutePolicies(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) ([]VPNConnectionRoutePolicy, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListVPNConnectionRoutePolicies(ctx, clusterID, vpnConnectionID) } func (s *Service) SetVPNConnectionAllowedNodes(ctx context.Context, input SetVPNConnectionAllowedNodesInput) ([]VPNConnectionAllowedNode, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return nil, err } if err := s.ensureClusterMutable(ctx, input.ActorUserID, input.ClusterID); err != nil { return nil, err } input.RolePreference = strings.TrimSpace(input.RolePreference) if input.RolePreference == "" { input.RolePreference = "candidate" } if input.ClusterID == "" || input.VPNConnectionID == "" { return nil, ErrInvalidPayload } if !isAllowedVPNNodePreference(input.RolePreference) { 
return nil, ErrInvalidPayload } input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.Metadata) { return nil, errors.New("allowed node metadata must be valid json") } nodes := make([]string, 0, len(input.NodeIDs)) seen := map[string]struct{}{} for _, nodeID := range input.NodeIDs { nodeID = strings.TrimSpace(nodeID) if nodeID == "" { continue } if _, ok := seen[nodeID]; ok { continue } seen[nodeID] = struct{}{} nodes = append(nodes, nodeID) } input.NodeIDs = nodes items, err := s.store.SetVPNConnectionAllowedNodes(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return nil, ErrInvalidVPNConnection } if err != nil { return nil, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "vpn_connection.allowed_nodes_changed", TargetType: "vpn_connection", TargetID: &input.VPNConnectionID, Payload: json.RawMessage(`{"node_runtime_changed":false}`), CreatedAt: s.now(), }) return items, nil } func (s *Service) ListVPNConnectionAllowedNodes(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) ([]VPNConnectionAllowedNode, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } return s.store.ListVPNConnectionAllowedNodes(ctx, clusterID, vpnConnectionID) } func (s *Service) AcquireVPNConnectionLease(ctx context.Context, input AcquireVPNConnectionLeaseInput) (VPNConnectionLease, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return VPNConnectionLease{}, err } if input.ClusterID == "" || input.VPNConnectionID == "" || input.OwnerNodeID == "" { return VPNConnectionLease{}, ErrInvalidPayload } conn, err := s.store.GetVPNConnection(ctx, input.ClusterID, input.VPNConnectionID) if errors.Is(err, pgx.ErrNoRows) { return VPNConnectionLease{}, ErrInvalidVPNConnection } if err != nil { return VPNConnectionLease{}, err } if conn.Mode != VPNConnectionModeSingleActive || conn.DesiredState != VPNConnectionDesiredEnabled { 
return VPNConnectionLease{}, errors.New("vpn connection must be enabled single_active before lease acquisition") } if err := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); err != nil { return VPNConnectionLease{}, err } if input.TTL <= 0 { input.TTL = 30 * time.Second } input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.Metadata) { return VPNConnectionLease{}, errors.New("lease metadata must be valid json") } token, err := generateFencingToken() if err != nil { return VPNConnectionLease{}, err } item, err := s.store.AcquireVPNConnectionLease(ctx, input, s.now().Add(input.TTL), token) if errors.Is(err, pgx.ErrNoRows) { return VPNConnectionLease{}, ErrInvalidVPNLease } if errors.Is(err, ErrVPNLeaseAlreadyActive) { return VPNConnectionLease{}, ErrVPNLeaseAlreadyActive } if err != nil { return VPNConnectionLease{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "vpn_connection.lease_acquired", TargetType: "vpn_connection", TargetID: &input.VPNConnectionID, Payload: json.RawMessage(`{"vpn_runtime_started":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) RenewVPNConnectionLease(ctx context.Context, input RenewVPNConnectionLeaseInput) (VPNConnectionLease, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return VPNConnectionLease{}, err } if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" || input.OwnerNodeID == "" || input.FencingToken == "" { return VPNConnectionLease{}, ErrInvalidPayload } if input.TTL <= 0 { input.TTL = 30 * time.Second } if err := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); err != nil { return VPNConnectionLease{}, err } item, err := s.store.RenewVPNConnectionLease(ctx, input, s.now().Add(input.TTL)) if errors.Is(err, pgx.ErrNoRows) { return VPNConnectionLease{}, 
ErrInvalidVPNLease } if err != nil { return VPNConnectionLease{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "vpn_connection.lease_renewed", TargetType: "vpn_connection", TargetID: &input.VPNConnectionID, Payload: json.RawMessage(`{"vpn_runtime_changed":false}`), CreatedAt: s.now(), }) return item, err } func (s *Service) RenewNodeVPNAssignmentLease(ctx context.Context, input RenewNodeVPNAssignmentLeaseInput) (VPNConnectionLease, error) { input.ClusterID = strings.TrimSpace(input.ClusterID) input.VPNConnectionID = strings.TrimSpace(input.VPNConnectionID) input.LeaseID = strings.TrimSpace(input.LeaseID) input.OwnerNodeID = strings.TrimSpace(input.OwnerNodeID) if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" || input.OwnerNodeID == "" { return VPNConnectionLease{}, ErrInvalidPayload } if input.TTL <= 0 { input.TTL = 2 * time.Minute } if err := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); err != nil { return VPNConnectionLease{}, err } assignments, err := s.store.ListNodeVPNAssignments(ctx, input.ClusterID, input.OwnerNodeID) if err != nil { return VPNConnectionLease{}, err } ownsVisibleLease := false for _, assignment := range assignments { if assignment.VPNConnectionID == input.VPNConnectionID && assignment.AssignmentReason == "active_owner" && assignment.ActiveLease != nil && assignment.ActiveLease.LeaseID == input.LeaseID && assignment.ActiveLease.OwnerNodeID == input.OwnerNodeID { ownsVisibleLease = true break } } if !ownsVisibleLease { return VPNConnectionLease{}, ErrVPNLeaseOwnerNotAllowed } item, err := s.store.RenewNodeVPNAssignmentLease(ctx, input, s.now().Add(input.TTL)) if errors.Is(err, pgx.ErrNoRows) { return VPNConnectionLease{}, ErrInvalidVPNLease } if err != nil { return VPNConnectionLease{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, EventType: 
"vpn_connection.lease_renewed_by_node", TargetType: "vpn_connection", TargetID: &input.VPNConnectionID, Payload: json.RawMessage(`{"node_agent_runtime_executed":true}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) AcquireNodeVPNAssignmentLease(ctx context.Context, input AcquireNodeVPNAssignmentLeaseInput) (VPNConnectionLease, error) { input.ClusterID = strings.TrimSpace(input.ClusterID) input.VPNConnectionID = strings.TrimSpace(input.VPNConnectionID) input.OwnerNodeID = strings.TrimSpace(input.OwnerNodeID) if input.ClusterID == "" || input.VPNConnectionID == "" || input.OwnerNodeID == "" { return VPNConnectionLease{}, ErrInvalidPayload } conn, err := s.store.GetVPNConnection(ctx, input.ClusterID, input.VPNConnectionID) if errors.Is(err, pgx.ErrNoRows) { return VPNConnectionLease{}, ErrInvalidVPNConnection } if err != nil { return VPNConnectionLease{}, err } if conn.Mode != VPNConnectionModeSingleActive || conn.DesiredState != VPNConnectionDesiredEnabled { return VPNConnectionLease{}, errors.New("vpn connection must be enabled single_active before lease acquisition") } if err := s.ensureVPNLeaseOwnerEligible(ctx, input.ClusterID, input.VPNConnectionID, input.OwnerNodeID); err != nil { return VPNConnectionLease{}, err } assignments, err := s.store.ListNodeVPNAssignments(ctx, input.ClusterID, input.OwnerNodeID) if err != nil { return VPNConnectionLease{}, err } visibleCandidate := false for _, assignment := range assignments { if assignment.VPNConnectionID != input.VPNConnectionID { continue } if assignment.DesiredState != "" && assignment.DesiredState != VPNConnectionDesiredEnabled { return VPNConnectionLease{}, ErrVPNLeaseOwnerNotAllowed } if assignment.AssignmentReason == "active_owner" && assignment.ActiveLease != nil && assignment.ActiveLease.OwnerNodeID == input.OwnerNodeID { return VPNConnectionLease{ ID: assignment.ActiveLease.LeaseID, VPNConnectionID: assignment.VPNConnectionID, ClusterID: assignment.ClusterID, OwnerNodeID: 
assignment.ActiveLease.OwnerNodeID, LeaseGeneration: assignment.ActiveLease.LeaseGeneration, Status: assignment.ActiveLease.Status, RenewedAt: assignment.ActiveLease.RenewedAt, ExpiresAt: assignment.ActiveLease.ExpiresAt, }, nil } if assignment.AssignmentReason == "eligible_candidate" { visibleCandidate = true break } } if !visibleCandidate { return VPNConnectionLease{}, ErrVPNLeaseOwnerNotAllowed } if input.TTL <= 0 { input.TTL = 2 * time.Minute } input.Metadata = defaultJSON(input.Metadata, `{}`) if !json.Valid(input.Metadata) { return VPNConnectionLease{}, errors.New("lease metadata must be valid json") } token, err := generateFencingToken() if err != nil { return VPNConnectionLease{}, err } item, err := s.store.AcquireVPNConnectionLease(ctx, AcquireVPNConnectionLeaseInput{ ClusterID: input.ClusterID, VPNConnectionID: input.VPNConnectionID, OwnerNodeID: input.OwnerNodeID, TTL: input.TTL, Metadata: input.Metadata, }, s.now().Add(input.TTL), token) if errors.Is(err, pgx.ErrNoRows) { return VPNConnectionLease{}, ErrInvalidVPNLease } if errors.Is(err, ErrVPNLeaseAlreadyActive) { return VPNConnectionLease{}, ErrVPNLeaseAlreadyActive } if err != nil { return VPNConnectionLease{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, EventType: "vpn_connection.lease_acquired_by_node", TargetType: "vpn_connection", TargetID: &input.VPNConnectionID, Payload: json.RawMessage(`{"node_agent_runtime_requested":true}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) ReleaseVPNConnectionLease(ctx context.Context, input ReleaseVPNConnectionLeaseInput) (VPNConnectionLease, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return VPNConnectionLease{}, err } if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" || input.OwnerNodeID == "" || input.FencingToken == "" { return VPNConnectionLease{}, ErrInvalidPayload } item, err := s.store.ReleaseVPNConnectionLease(ctx, input) if 
errors.Is(err, pgx.ErrNoRows) { return VPNConnectionLease{}, ErrInvalidVPNLease } if err != nil { return VPNConnectionLease{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "vpn_connection.lease_released", TargetType: "vpn_connection", TargetID: &input.VPNConnectionID, Payload: json.RawMessage(`{"vpn_runtime_stopped":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) FenceVPNConnectionLease(ctx context.Context, input FenceVPNConnectionLeaseInput) (VPNConnectionLease, error) { if err := s.ensurePlatformRecoveryAdmin(ctx, input.ActorUserID); err != nil { return VPNConnectionLease{}, err } input.Reason = strings.TrimSpace(input.Reason) if input.Reason == "" { input.Reason = "fenced by platform recovery administrator" } if input.ClusterID == "" || input.VPNConnectionID == "" || input.LeaseID == "" { return VPNConnectionLease{}, ErrInvalidPayload } item, err := s.store.FenceVPNConnectionLease(ctx, input) if errors.Is(err, pgx.ErrNoRows) { return VPNConnectionLease{}, ErrInvalidVPNLease } if err != nil { return VPNConnectionLease{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "vpn_connection.owner_fenced", TargetType: "vpn_connection", TargetID: &input.VPNConnectionID, Payload: json.RawMessage(`{"split_brain_guard":true}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) GetActiveVPNConnectionLease(ctx context.Context, actorUserID, clusterID, vpnConnectionID string) (VPNConnectionLease, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return VPNConnectionLease{}, err } item, err := s.store.GetActiveVPNConnectionLease(ctx, clusterID, vpnConnectionID) if errors.Is(err, pgx.ErrNoRows) { return VPNConnectionLease{}, ErrInvalidVPNLease } return item, err } func (s *Service) ExpireStaleVPNConnectionLeases(ctx context.Context, input 
ExpireStaleVPNConnectionLeasesInput) ([]VPNConnectionLease, error) { if err := s.ensurePlatformAdmin(ctx, input.ActorUserID); err != nil { return nil, err } if input.ClusterID == "" { return nil, ErrInvalidPayload } items, err := s.store.ExpireStaleVPNConnectionLeases(ctx, input.ClusterID, s.now()) if err != nil { return nil, err } for _, item := range items { vpnConnectionID := item.VPNConnectionID _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, ActorUserID: &input.ActorUserID, EventType: "vpn_connection.lease_expired", TargetType: "vpn_connection", TargetID: &vpnConnectionID, Payload: json.RawMessage(`{"stale_reclamation":true,"vpn_runtime_changed":false}`), CreatedAt: s.now(), }) } return items, nil } func (s *Service) ListNodeVPNAssignments(ctx context.Context, clusterID, nodeID string) ([]NodeVPNAssignment, error) { clusterID = strings.TrimSpace(clusterID) nodeID = strings.TrimSpace(nodeID) if clusterID == "" || nodeID == "" { return nil, ErrInvalidPayload } return s.store.ListNodeVPNAssignments(ctx, clusterID, nodeID) } func (s *Service) ReportNodeVPNAssignmentStatus(ctx context.Context, input ReportNodeVPNAssignmentStatusInput) (NodeVPNAssignmentStatus, error) { input.ClusterID = strings.TrimSpace(input.ClusterID) input.NodeID = strings.TrimSpace(input.NodeID) input.VPNConnectionID = strings.TrimSpace(input.VPNConnectionID) input.ObservedStatus = strings.TrimSpace(input.ObservedStatus) if input.ClusterID == "" || input.NodeID == "" || input.VPNConnectionID == "" { return NodeVPNAssignmentStatus{}, ErrInvalidPayload } if input.ObservedStatus == "" { input.ObservedStatus = VPNAssignmentStatusUnknown } if !isAllowedVPNAssignmentStatus(input.ObservedStatus) { return NodeVPNAssignmentStatus{}, ErrInvalidPayload } input.StatusPayload = defaultJSON(input.StatusPayload, `{}`) if !json.Valid(input.StatusPayload) { return NodeVPNAssignmentStatus{}, errors.New("status_payload must be valid json") } if input.ObservedAt.IsZero() { 
input.ObservedAt = s.now() } assignments, err := s.store.ListNodeVPNAssignments(ctx, input.ClusterID, input.NodeID) if err != nil { return NodeVPNAssignmentStatus{}, err } visible := false for _, assignment := range assignments { if assignment.VPNConnectionID == input.VPNConnectionID { visible = true break } } if !visible { return NodeVPNAssignmentStatus{}, ErrVPNLeaseOwnerNotAllowed } item, err := s.store.ReportNodeVPNAssignmentStatus(ctx, input) if err != nil { return NodeVPNAssignmentStatus{}, err } _ = s.store.RecordAudit(ctx, ClusterAuditEvent{ ClusterID: &input.ClusterID, EventType: "vpn_connection.assignment_status_reported", TargetType: "vpn_connection", TargetID: &input.VPNConnectionID, Payload: json.RawMessage(`{"node_agent_runtime_executed":false}`), CreatedAt: s.now(), }) return item, nil } func (s *Service) GetVPNClientProfile( ctx context.Context, clusterID, organizationID, userID string, preferredEntryNodeID ...string, ) (VPNClientProfile, error) { clusterID = strings.TrimSpace(clusterID) organizationID = strings.TrimSpace(organizationID) userID = strings.TrimSpace(userID) if clusterID == "" || organizationID == "" || userID == "" { return VPNClientProfile{}, ErrInvalidPayload } preferredEntry := "" if len(preferredEntryNodeID) > 0 { preferredEntry = strings.TrimSpace(preferredEntryNodeID[0]) } preferredExit := "" if len(preferredEntryNodeID) > 1 { preferredExit = strings.TrimSpace(preferredEntryNodeID[1]) } profile, err := s.store.GetVPNClientProfile(ctx, clusterID, organizationID, userID, preferredEntry, preferredExit, s.now().UTC()) if err != nil { return VPNClientProfile{}, err } if profile.ClusterID == "" { profile.ClusterID = clusterID } if profile.OrganizationID == "" { profile.OrganizationID = organizationID } if profile.UserID == "" { profile.UserID = userID } profile = attachVPNDataplaneSessions(profile, s.now().UTC()) if err := s.ensureVPNFabricRouteIntents(ctx, clusterID, profile); err != nil { return VPNClientProfile{}, err } profile = 
// attachVPNMeshNodeRouteContract returns raw with the mesh-node route contract
// merged into its top-level JSON object.
//
// raw is decoded as a JSON object; when it does not decode, or decodes to
// null, an empty object is used instead. The result sets
// "fabric_service_channel_status" to "mesh_node_route_required", sets
// "fabric_service_channel_lease" to null, and writes a
// "vpn_client_node_contract" object whose service binding copies
// "listen_tcp_ports", "listen_udp_ports" and "exit_pool_id" from the decoded
// config. If re-encoding fails, raw is returned unchanged.
func attachVPNMeshNodeRouteContract(raw json.RawMessage) json.RawMessage {
	settings := map[string]any{}
	var decoded map[string]any
	if json.Unmarshal(raw, &decoded) == nil && decoded != nil {
		settings = decoded
	}

	binding := map[string]any{
		"type":                     "local_ipv4_ingress",
		"accepts_from":             []string{"android_vpnservice_tun", "linux_tun", "host_service_port"},
		"listen_tcp_ports":         settings["listen_tcp_ports"],
		"listen_udp_ports":         settings["listen_udp_ports"],
		"exit_selection":           "pool",
		"preferred_exit_pool_id":   settings["exit_pool_id"],
		"packet_service_class":     "vpn_packets",
		"legacy_protocol_listener": false,
	}
	contract := map[string]any{
		"schema_version":             "rap.vpn_client_node_route.v1",
		"node_role":                  "vpn-client",
		"route_authority":            "fabric_farm",
		"entry_node_required":        false,
		"exit_selection":             "pool",
		"transport":                  "quic_fabric_mesh",
		"legacy_protocol_supported":  false,
		"backend_packet_relay":       false,
		"android_runtime_packaging":  "node_agent_required",
		"standalone_vpnservice_only": false,
		"service_binding":            binding,
	}

	settings["fabric_service_channel_status"] = "mesh_node_route_required"
	settings["fabric_service_channel_lease"] = nil
	settings["vpn_client_node_contract"] = contract

	encoded, err := json.Marshal(settings)
	if err != nil {
		return raw
	}
	return encoded
}
// enrichVPNDataplaneSession returns connection.ClientConfig with a freshly
// issued "vpn_dataplane_session" object added to its top-level JSON object.
//
// The client config is decoded as a JSON object; when it does not decode, or
// decodes to null, an empty object is used. The session is stamped with now as
// its issue time and expires one minute later. If the enriched config cannot
// be re-encoded, the original connection.ClientConfig is returned unchanged.
func enrichVPNDataplaneSession(profile VPNClientProfile, connection VPNClientConnection, now time.Time) json.RawMessage {
	var cfg map[string]any
	if err := json.Unmarshal(connection.ClientConfig, &cfg); err != nil || cfg == nil {
		cfg = map[string]any{}
	}
	route := vpnFabricRouteFromClientConfig(connection.ClientConfig)
	expiresAt := now.Add(time.Minute)
	sessionID := uuidLikeRandom()
	if sessionID == "" {
		// Fall back to a timestamp-derived identifier when no random id was
		// produced.
		sessionID = "vpn-session-" + now.UTC().Format("20060102T150405.000000000Z")
	}
	entryCandidates := vpnDataplaneEntryCandidates(route, connection, cfg)
	exitCandidates := vpnConcreteExitCandidatesFromClientConfig(cfg)
	serviceChannelRequest := vpnFabricServiceChannelRequest(profile, connection, route, cfg, sessionID, now)
	routeBundle := vpnFabricRouteBundle(route, entryCandidates, exitCandidates, now)
	transportCandidates := vpnDataplaneTransportCandidates(route, entryCandidates, exitCandidates)
	// Status depends on which ends of a planned route are selected: both ends
	// selected, or only the exit selected. Anything else keeps the default.
	status := "waiting_for_entry_endpoint"
	if route.Status == "planned" && route.SelectedEntryNodeID != "" && route.SelectedExitNodeID != "" {
		status = "ready_for_entry_listener"
	} else if route.Status == "planned" && route.SelectedEntryNodeID == "" && route.SelectedExitNodeID != "" {
		status = "ready_for_mesh_node_route"
	}
	preferredTransport := "fabric_mesh_node_route_v1"
	nodeValidation := "vpn_client_node_identity_and_policy"
	// NOTE(review): the bearer token under "auth" below is "rap_vpn_dps_" plus
	// the same sessionID that is emitted as "session_id", so the token can be
	// derived from the session id in this document; confirm that is intended.
	cfg["vpn_dataplane_session"] = map[string]any{
		"schema_version":        "rap.vpn_dataplane_session.v1",
		"session_id":            sessionID,
		"status":                status,
		"issued_at":             now,
		"expires_at":            expiresAt,
		"cluster_id":            profile.ClusterID,
		"organization_id":       profile.OrganizationID,
		"user_id":               profile.UserID,
		"vpn_connection_id":     connection.ID,
		"entry_node_id":         route.SelectedEntryNodeID,
		"exit_node_id":          route.SelectedExitNodeID,
		"preferred_transport":   preferredTransport,
		"fallback_transport":    "none",
		"route_authority":       "fabric_farm",
		"backend_relay_allowed": false,
		"packet_contract": map[string]any{
			"tunnel_type":                   "universal_ip_packet",
			"application_protocol_agnostic": true,
			"all_ip_traffic":                true,
			"protocol_specific_routing":     false,
		},
		"client_node_service_binding": map[string]any{
			"type":                     "local_ipv4_ingress",
			"accepts_from":             []string{"android_vpnservice_tun", "linux_tun", "host_service_port"},
			"listen_tcp_ports":         cfg["listen_tcp_ports"],
			"listen_udp_ports":         cfg["listen_udp_ports"],
			"exit_selection":           "pool",
			"preferred_exit_pool_id":   cfg["exit_pool_id"],
			"selected_exit_node_id":    route.SelectedExitNodeID,
			"packet_service_class":     "vpn_packets",
			"legacy_protocol_listener": false,
		},
		"auth": map[string]any{
			"type":               "control_plane_issued_bearer",
			"token":              "rap_vpn_dps_" + sessionID,
			"token_ttl_seconds":  int(expiresAt.Sub(now).Seconds()),
			"node_validation":    nodeValidation,
			"introspection_path": "/api/v1/clusters/{cluster_id}/vpn/dataplane-sessions/{session_id}/introspect",
		},
		"entry_candidates":               entryCandidates,
		"exit_candidates":                exitCandidates,
		"fabric_service_channel_request": serviceChannelRequest,
		"fabric_route_bundle":            routeBundle,
		"transport_candidates":           transportCandidates,
	}
	out, err := json.Marshal(cfg)
	if err != nil {
		return connection.ClientConfig
	}
	return out
}
"organization_id": profile.OrganizationID, "user_id": profile.UserID, "resource_id": connection.ID, "source_role": "vpn-client", "service_class": "vpn_packets", "target": map[string]any{ "kind": "pool", "pool_ids": targetPoolIDs, "node_ids": route.ExitPoolNodeIDs, "selected_node_id": route.SelectedExitNodeID, "service_role": "ipv4-egress", "selection_policy": "latency_and_load_aware", "single_member_pool": len(route.ExitPoolNodeIDs) <= 1, }, "traffic": map[string]any{ "mode": "duplex", "channel_class": "vpn_packet", "application_protocol_agnostic": true, "tunnel_type": "universal_ip_packet", "flow_distribution": "latency_and_load_aware", "service_adapter_owns_protocol": false, "fabric_owns_route_and_failover": true, }, "resilience": map[string]any{ "min_active_paths": 1, "warm_standby_paths": warmStandby, "failover": "pool_member_or_next_authorized_pool", "reroute_on": []string{"route_failure", "latency_regression", "loss_regression", "backpressure", "pool_member_failure"}, }, "adapter_contract": map[string]any{ "adapter": "vpn-client", "adapter_role": "tun_packet_adapter", "adapter_may_select_endpoint": false, "adapter_may_use_legacy_relay": false, }, "issued_at": now, } } func vpnConnectionExitPoolIDs(connection VPNClientConnection) []string { var target struct { ExitPoolIDs []string `json:"exit_pool_ids"` ExitPoolID string `json:"exit_pool_id"` } _ = json.Unmarshal(connection.TargetEndpoint, &target) out := dedupeStrings(target.ExitPoolIDs) if len(out) == 0 && strings.TrimSpace(target.ExitPoolID) != "" { out = []string{strings.TrimSpace(target.ExitPoolID)} } var placement struct { ExitPoolIDs []string `json:"exit_pool_ids"` ExitPools []struct { PoolID string `json:"pool_id"` } `json:"exit_pools"` } _ = json.Unmarshal(connection.PlacementPolicy, &placement) out = append(out, placement.ExitPoolIDs...) 
// vpnFabricRouteLeasePath builds a single route-lease path entry for pathID.
//
// A candidate is kept when no target node is selected, when the candidate has
// no non-empty string "node_id", or when its "node_id" equals selectedNodeID.
// If that filter keeps nothing, the unfiltered candidates slice is used
// instead. The returned map carries "path_id", "target_node_id", a fixed
// "status" of "ready", and the chosen "endpoint_candidates".
func vpnFabricRouteLeasePath(pathID string, selectedNodeID string, candidates []map[string]any) map[string]any {
	matching := make([]map[string]any, 0, len(candidates))
	for i := range candidates {
		id, _ := candidates[i]["node_id"].(string)
		if selectedNodeID == "" || id == "" || id == selectedNodeID {
			matching = append(matching, candidates[i])
		}
	}

	// Nothing matched: hand back the caller's slice unchanged (it may be nil).
	endpoints := candidates
	if len(matching) > 0 {
		endpoints = matching
	}

	path := make(map[string]any, 4)
	path["path_id"] = pathID
	path["target_node_id"] = selectedNodeID
	path["status"] = "ready"
	path["endpoint_candidates"] = endpoints
	return path
}
// vpnConcreteEntryCandidatesFromClientConfig extracts the
// "vpn_entry_endpoint_candidates" value from cfg as a slice of generic JSON
// objects.
//
// The value is round-tripped through encoding/json, so any value that
// marshals to a JSON array of objects is accepted. It returns nil when the key
// is absent, when the value cannot be marshalled, or when it does not decode
// into a list of objects.
func vpnConcreteEntryCandidatesFromClientConfig(cfg map[string]any) []map[string]any {
	const key = "vpn_entry_endpoint_candidates"

	value, present := cfg[key]
	if !present {
		return nil
	}

	encoded, marshalErr := json.Marshal(value)
	if marshalErr != nil {
		return nil
	}

	var candidates []map[string]any
	if json.Unmarshal(encoded, &candidates) != nil {
		// Discard anything decoded before the failure.
		return nil
	}
	return candidates
}
[16]byte if _, err := rand.Read(raw[:]); err != nil { return "" } raw[6] = (raw[6] & 0x0f) | 0x40 raw[8] = (raw[8] & 0x3f) | 0x80 encoded := hex.EncodeToString(raw[:]) return encoded[0:8] + "-" + encoded[8:12] + "-" + encoded[12:16] + "-" + encoded[16:20] + "-" + encoded[20:32] } func (s *Service) ensureVPNFabricRouteIntents(ctx context.Context, clusterID string, profile VPNClientProfile) error { intents, err := s.store.ListRouteIntents(ctx, clusterID) if err != nil { return err } existing := map[string]bool{} for _, intent := range intents { source, destination, ok := activeVPNPacketRouteIntent(intent, s.now()) if !ok { continue } existing[source+"->"+destination] = true } for _, connection := range profile.Connections { route := vpnFabricRouteFromClientConfig(connection.ClientConfig) if route.Status != "planned" || route.SelectedEntryNodeID == "" || route.SelectedExitNodeID == "" || route.SelectedEntryNodeID == route.SelectedExitNodeID { continue } pairs := [][2]string{ {route.SelectedEntryNodeID, route.SelectedExitNodeID}, {route.SelectedExitNodeID, route.SelectedEntryNodeID}, } for _, pair := range pairs { key := pair[0] + "->" + pair[1] if existing[key] { continue } if _, err := s.store.CreateRouteIntent(ctx, CreateRouteIntentInput{ ClusterID: clusterID, SourceSelector: mustJSONRaw(map[string]any{"node_id": pair[0]}), DestinationSelector: mustJSONRaw(map[string]any{"node_id": pair[1]}), ServiceClass: "vpn_packets", Priority: 10, Policy: mustJSONRaw(vpnFabricRouteIntentPolicy(pair[0], pair[1], s.now().UTC().Add(30*24*time.Hour))), }); err != nil { return err } existing[key] = true } } return nil } type vpnClientFabricRoute struct { Status string `json:"status"` SelectedEntryNodeID string `json:"selected_entry_node_id"` SelectedExitNodeID string `json:"selected_exit_node_id"` EntryPoolNodeIDs []string `json:"entry_pool_node_ids"` ExitPoolNodeIDs []string `json:"exit_pool_node_ids"` } func vpnFabricRouteFromClientConfig(raw json.RawMessage) vpnClientFabricRoute { 
// mustJSONRaw marshals value to JSON and returns it as a json.RawMessage.
//
// Despite the "must" prefix it never panics: if marshalling fails, the empty
// JSON object `{}` is returned instead.
func mustJSONRaw(value any) json.RawMessage {
	if encoded, err := json.Marshal(value); err == nil {
		return encoded
	}
	return json.RawMessage(`{}`)
}
actorUserID string, input ListAuditEventsInput) ([]ClusterAuditEvent, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return nil, err } input.ClusterID = strings.TrimSpace(input.ClusterID) input.EventTypes = trimStringSlice(input.EventTypes) input.TargetTypes = trimStringSlice(input.TargetTypes) input.Correlation = strings.TrimSpace(input.Correlation) events, err := s.store.ListAuditEvents(ctx, input) if err != nil { return nil, err } if input.Correlation == "fabric_diagnostics" { events = s.withFabricDiagnosticsAuditCorrelation(ctx, actorUserID, input.ClusterID, events) } return events, nil } func (s *Service) ListFabricServiceChannelRebuildInvestigationBreadcrumbs(ctx context.Context, actorUserID string, input ListFabricServiceChannelRebuildInvestigationBreadcrumbsInput) (FabricServiceChannelRebuildInvestigationBreadcrumbs, error) { if err := s.ensurePlatformAdmin(ctx, actorUserID); err != nil { return FabricServiceChannelRebuildInvestigationBreadcrumbs{}, err } input.ClusterID = strings.TrimSpace(input.ClusterID) if input.ClusterID == "" { return FabricServiceChannelRebuildInvestigationBreadcrumbs{}, ErrInvalidPayload } if input.Limit <= 0 || input.Limit > 100 { input.Limit = 20 } cluster, err := s.store.GetCluster(ctx, input.ClusterID) if errors.Is(err, pgx.ErrNoRows) { return FabricServiceChannelRebuildInvestigationBreadcrumbs{}, ErrInvalidCluster } if err != nil { return FabricServiceChannelRebuildInvestigationBreadcrumbs{}, err } windowPolicy := fabricServiceChannelBreadcrumbWindowPolicyFromCluster(cluster) if input.CurrentWindowSeconds <= 0 { input.CurrentWindowSeconds = windowPolicy.CurrentWindowSeconds } if input.HistoryWindowSeconds <= 0 { input.HistoryWindowSeconds = windowPolicy.HistoryWindowSeconds } if input.HistoryWindowSeconds < input.CurrentWindowSeconds { input.HistoryWindowSeconds = input.CurrentWindowSeconds } events, err := s.ListAuditEvents(ctx, actorUserID, ListAuditEventsInput{ ClusterID: input.ClusterID, 
EventTypes: []string{ "fabric.service_channel_rebuild_feedback_breakdown.investigation_opened", "fabric.service_channel_rebuild_incident.investigation_opened", }, Correlation: "fabric_diagnostics", Limit: input.Limit, }) if err != nil { return FabricServiceChannelRebuildInvestigationBreadcrumbs{}, err } events = withFabricDiagnosticsBreadcrumbFreshness(events, s.now(), input.CurrentWindowSeconds, input.HistoryWindowSeconds) summary := summarizeClusterAuditEvents(events) return FabricServiceChannelRebuildInvestigationBreadcrumbs{ ClusterID: input.ClusterID, Events: events, Summary: summary, CurrentWindowSeconds: input.CurrentWindowSeconds, HistoryWindowSeconds: input.HistoryWindowSeconds, CurrentCount: summary.CountsByBreadcrumbStatus["current"], StaleCount: summary.CountsByBreadcrumbStatus["stale"], ExpiredCount: summary.CountsByBreadcrumbStatus["expired"], }, nil } func withFabricDiagnosticsBreadcrumbFreshness(events []ClusterAuditEvent, now time.Time, currentWindowSeconds, historyWindowSeconds int64) []ClusterAuditEvent { if len(events) == 0 { return events } if now.IsZero() { now = time.Now().UTC() } for index := range events { if events[index].CorrelationHints == nil { events[index].CorrelationHints = &ClusterAuditCorrelationHints{Scope: "fabric_diagnostics"} } ageSeconds := int64(0) if !events[index].CreatedAt.IsZero() { ageSeconds = int64(now.Sub(events[index].CreatedAt).Seconds()) if ageSeconds < 0 { ageSeconds = 0 } } status := "current" if ageSeconds > historyWindowSeconds { status = "expired" } else if ageSeconds > currentWindowSeconds { status = "stale" } events[index].CorrelationHints.BreadcrumbStatus = status events[index].CorrelationHints.BreadcrumbAgeSeconds = ageSeconds events[index].CorrelationHints.BreadcrumbCurrentWindow = currentWindowSeconds events[index].CorrelationHints.BreadcrumbHistoryWindow = historyWindowSeconds } return events } func (s *Service) withFabricDiagnosticsAuditCorrelation(ctx context.Context, actorUserID, clusterID string, 
events []ClusterAuditEvent) []ClusterAuditEvent { if len(events) == 0 { return events } health, healthErr := s.GetFabricServiceChannelRouteRebuildHealthSummary(ctx, actorUserID, GetFabricServiceChannelRouteRebuildHealthSummaryInput{ ClusterID: clusterID, Limit: 5, }) incidents, incidentsErr := s.ListFabricServiceChannelRouteRebuildIncidents(ctx, actorUserID, ListFabricServiceChannelRouteRebuildIncidentsInput{ ClusterID: clusterID, Limit: 20, }) for index := range events { hints := ClusterAuditCorrelationHints{ Scope: "fabric_diagnostics", CurrentDiagnosticStatus: "not_visible", } if healthErr == nil { if breakdown := fabricAuditMatchingFeedbackBreakdown(events[index], health.FeedbackBreakdowns); breakdown != nil { hints.CurrentDiagnosticStatus = "breakdown_active" hints.FeedbackBreakdown = breakdown hints.RecommendedAction = "open_filtered_rebuild_ledger" } } if hints.FeedbackBreakdown == nil && incidentsErr == nil { if incident := fabricAuditMatchingRebuildIncident(events[index], incidents); incident != nil { hints.CurrentDiagnosticStatus = "incident_visible" hints.RebuildIncident = incident hints.RecommendedAction = "open_deep_rebuild_ledger" } } events[index].CorrelationHints = &hints } return events } func fabricAuditMatchingFeedbackBreakdown(event ClusterAuditEvent, breakdowns []FabricServiceChannelRouteRebuildFeedbackHealthBreakdown) *FabricServiceChannelRouteRebuildFeedbackHealthBreakdown { payload := jsonObject(event.Payload) feedbackSource := jsonString(payload, "feedback_source") feedbackChannelID := jsonString(payload, "feedback_channel_id") feedbackViolationStatus := jsonString(payload, "feedback_violation_status") reporterNodeID := jsonString(payload, "reporter_node_id") routeID := jsonString(payload, "route_id") if feedbackSource == "" && feedbackChannelID == "" && feedbackViolationStatus == "" { return nil } for index := range breakdowns { item := breakdowns[index] if feedbackSource != "" && item.FeedbackSource != feedbackSource { continue } if 
feedbackChannelID != "" && item.FeedbackChannelID != feedbackChannelID { continue } if feedbackViolationStatus != "" && item.FeedbackViolationStatus != feedbackViolationStatus { continue } if reporterNodeID != "" && !containsString(item.AffectedReporterNodeIDs, reporterNodeID) { continue } if routeID != "" && !containsString(item.AffectedRouteIDs, routeID) { continue } return &item } return nil } func fabricAuditMatchingRebuildIncident(event ClusterAuditEvent, incidents []FabricServiceChannelRouteRebuildIncident) *FabricServiceChannelRouteRebuildIncident { payload := jsonObject(event.Payload) reporterNodeID := jsonString(payload, "reporter_node_id") routeID := jsonString(payload, "route_id") if routeID == "" && event.TargetType == "fabric_service_channel_route_rebuild_incident" && event.TargetID != nil { routeID = *event.TargetID } serviceClass := jsonString(payload, "service_class") generation := jsonString(payload, "generation") guardStatus := jsonString(payload, "guard_status") for index := range incidents { item := incidents[index] if reporterNodeID != "" && item.ReporterNodeID != reporterNodeID { continue } if routeID != "" && item.RouteID != routeID { continue } if serviceClass != "" && item.ServiceClass != serviceClass { continue } if generation != "" && item.Generation != generation { continue } if guardStatus != "" && item.GuardStatus != guardStatus { continue } if reporterNodeID == "" && routeID == "" && serviceClass == "" && generation == "" && guardStatus == "" { continue } return &item } return nil } func summarizeClusterAuditEvents(events []ClusterAuditEvent) ClusterAuditSummary { summary := ClusterAuditSummary{ TotalCount: len(events), CountsByEventType: map[string]int{}, CountsByTargetType: map[string]int{}, CountsByCurrentDiagnosticStatus: map[string]int{}, CountsByFeedbackSource: map[string]int{}, CountsByFeedbackViolationStatus: map[string]int{}, CountsByBreadcrumbStatus: map[string]int{}, } for _, event := range events { if event.EventType != "" 
{ summary.CountsByEventType[event.EventType]++ } if event.TargetType != "" { summary.CountsByTargetType[event.TargetType]++ } if event.CreatedAt.After(summary.LatestAt) { summary.LatestAt = event.CreatedAt.UTC() } payload := jsonObject(event.Payload) if source := jsonString(payload, "feedback_source"); source != "" { summary.CountsByFeedbackSource[source]++ } if status := jsonString(payload, "feedback_violation_status"); status != "" { summary.CountsByFeedbackViolationStatus[status]++ } if event.CorrelationHints == nil { continue } if breadcrumbStatus := strings.TrimSpace(event.CorrelationHints.BreadcrumbStatus); breadcrumbStatus != "" { summary.CountsByBreadcrumbStatus[breadcrumbStatus]++ } status := firstNonEmptyString(event.CorrelationHints.CurrentDiagnosticStatus, "unknown") summary.CountsByCurrentDiagnosticStatus[status]++ if status == "not_visible" { summary.NotVisibleCount++ } else { summary.CorrelatedCount++ } } return summary } func (s *Service) ensurePlatformAdmin(ctx context.Context, userID string) error { userID = strings.TrimSpace(userID) if userID == "" { return ErrAccessDenied } role, err := s.store.GetPlatformRole(ctx, userID) if err != nil { if errors.Is(err, pgx.ErrNoRows) || isInvalidUUIDTextError(err) { return ErrAccessDenied } return err } if !isPlatformAdminRole(role) { return ErrAccessDenied } return nil } func (s *Service) ensurePlatformRecoveryAdmin(ctx context.Context, userID string) error { userID = strings.TrimSpace(userID) if userID == "" { return ErrAccessDenied } role, err := s.store.GetPlatformRole(ctx, userID) if err != nil { if errors.Is(err, pgx.ErrNoRows) || isInvalidUUIDTextError(err) { return ErrAccessDenied } return err } if role != PlatformRoleRecoveryAdmin { return ErrAccessDenied } return nil } func (s *Service) ensureClusterMutable(ctx context.Context, actorUserID, clusterID string) error { actorUserID = strings.TrimSpace(actorUserID) if actorUserID == "" { return ErrAccessDenied } role, err := s.store.GetPlatformRole(ctx, 
// isAllowedVPNRouteType reports whether routeType is one of the accepted VPN
// route types: "cidr", "dns_suffix", "service" or "resource". The comparison
// is exact (case-sensitive, no trimming).
func isAllowedVPNRouteType(routeType string) bool {
	for _, allowed := range [...]string{"cidr", "dns_suffix", "service", "resource"} {
		if routeType == allowed {
			return true
		}
	}
	return false
}
// isAllowedVPNNodePreference reports whether preference is one of the accepted
// VPN node preferences: "candidate", "standby" or "preferred". The comparison
// is exact (case-sensitive, no trimming).
func isAllowedVPNNodePreference(preference string) bool {
	return preference == "candidate" ||
		preference == "standby" ||
		preference == "preferred"
}
Roles []string `json:"roles"` NodeName string `json:"node_name"` NodeGroupID string `json:"node_group_id"` Image string `json:"image"` ContainerName string `json:"container_name"` StateDir string `json:"state_dir"` InstallDir string `json:"install_dir"` StartupMode string `json:"startup_mode"` Network string `json:"network"` RestartPolicy string `json:"restart_policy"` PullImage *bool `json:"pull_image"` Replace *bool `json:"replace"` DockerVPNGatewayEnabled *bool `json:"docker_vpn_gateway_enabled"` WorkloadSupervisionEnabled *bool `json:"workload_supervision_enabled"` MeshSyntheticRuntimeEnabled *bool `json:"mesh_synthetic_runtime_enabled"` MeshProductionForwardingEnabled *bool `json:"mesh_production_forwarding_enabled"` MeshListenAddr string `json:"mesh_listen_addr"` MeshListenPortMode string `json:"mesh_listen_port_mode"` MeshListenAutoPortStart int `json:"mesh_listen_auto_port_start"` MeshListenAutoPortEnd int `json:"mesh_listen_auto_port_end"` MeshAdvertiseEndpoint string `json:"mesh_advertise_endpoint"` MeshAdvertiseEndpointsJSON json.RawMessage `json:"mesh_advertise_endpoints_json"` MeshAdvertiseTransport string `json:"mesh_advertise_transport"` MeshConnectivityMode string `json:"mesh_connectivity_mode"` MeshNATType string `json:"mesh_nat_type"` MeshRegion string `json:"mesh_region"` HeartbeatIntervalSeconds int `json:"heartbeat_interval_seconds"` EnrollmentPollIntervalSeconds int `json:"enrollment_poll_interval_seconds"` EnrollmentPollTimeoutSeconds int `json:"enrollment_poll_timeout_seconds"` ProductionObservationSinkCapacity int `json:"production_observation_sink_capacity"` } func dockerInstallProfileFromScope(input DockerInstallProfileRequest, scopeRaw json.RawMessage) (DockerInstallProfile, error) { var scope dockerInstallProfileScope if len(scopeRaw) > 0 { if !json.Valid(scopeRaw) { return DockerInstallProfile{}, ErrInvalidPayload } if err := json.Unmarshal(scopeRaw, &scope); err != nil { return DockerInstallProfile{}, ErrInvalidPayload } } nodeName := 
firstNonEmptyString(strings.TrimSpace(input.NodeName), scope.NodeName) if nodeName == "" { nodeName = "docker-node" } containerName := firstNonEmptyString(scope.ContainerName, "rap-node-agent-"+safeInstallProfileSlug(nodeName)) roles := trimStringSlice(scope.Roles) profile := DockerInstallProfile{ SchemaVersion: "rap.docker_install_profile.v1", BackendURL: strings.TrimRight(strings.TrimSpace(scope.BackendURL), "/"), ControlPlaneEndpoints: trimStringSlice(scope.ControlPlaneEndpoints), ArtifactEndpoints: trimEndpointSlice(scope.ArtifactEndpoints), FabricRegistryRecords: cloneRawJSON(scope.FabricRegistryRecords), Roles: roles, NodeName: nodeName, Image: firstNonEmptyString(scope.Image, "rap-node-agent:latest"), ContainerName: containerName, StateDir: firstNonEmptyString(scope.StateDir, "/var/lib/rap/nodes/"+safeInstallProfileSlug(nodeName)), Network: firstNonEmptyString(scope.Network, "host"), RestartPolicy: firstNonEmptyString(scope.RestartPolicy, "unless-stopped"), PullImage: boolPtrValue(scope.PullImage, false), Replace: boolPtrValue(scope.Replace, true), DockerVPNGatewayEnabled: boolPtrValue(scope.DockerVPNGatewayEnabled, containsString(roles, "vpn-exit")), WorkloadSupervisionEnabled: boolPtrValue(scope.WorkloadSupervisionEnabled, false), MeshSyntheticRuntimeEnabled: boolPtrValue(scope.MeshSyntheticRuntimeEnabled, false), MeshProductionForwardingEnabled: boolPtrValue(scope.MeshProductionForwardingEnabled, false), MeshListenAddr: strings.TrimSpace(scope.MeshListenAddr), MeshListenPortMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.MeshListenPortMode)), "auto"), MeshListenAutoPortStart: positiveOrDefault(scope.MeshListenAutoPortStart, 19131), MeshListenAutoPortEnd: positiveOrDefault(scope.MeshListenAutoPortEnd, 19231), MeshAdvertiseEndpoint: strings.TrimRight(strings.TrimSpace(scope.MeshAdvertiseEndpoint), "/"), MeshAdvertiseEndpointsJSON: scope.MeshAdvertiseEndpointsJSON, MeshAdvertiseTransport: strings.TrimSpace(scope.MeshAdvertiseTransport), 
MeshConnectivityMode: strings.TrimSpace(scope.MeshConnectivityMode), MeshNATType: strings.TrimSpace(scope.MeshNATType), MeshRegion: strings.TrimSpace(scope.MeshRegion), HeartbeatIntervalSeconds: positiveOrDefault(scope.HeartbeatIntervalSeconds, 15), EnrollmentPollIntervalSeconds: positiveOrDefault(scope.EnrollmentPollIntervalSeconds, 5), EnrollmentPollTimeoutSeconds: nonNegativeOrDefault(scope.EnrollmentPollTimeoutSeconds, 0), ProductionObservationSinkCapacity: scope.ProductionObservationSinkCapacity, } profile.DockerImageArtifact = dockerImageArtifactFromScope(profile.Image, profile.ArtifactEndpoints, scope) if profile.BackendURL == "" && len(profile.ControlPlaneEndpoints) > 0 { profile.BackendURL = profile.ControlPlaneEndpoints[0] } if profile.BackendURL == "" { return DockerInstallProfile{}, ErrInvalidPayload } if len(profile.ArtifactEndpoints) == 0 { if endpoint := defaultArtifactEndpointFromBackendURL(profile.BackendURL); endpoint != "" { profile.ArtifactEndpoints = []string{endpoint} profile.DockerImageArtifact = dockerImageArtifactFromScope(profile.Image, profile.ArtifactEndpoints, scope) } } if len(profile.MeshAdvertiseEndpointsJSON) > 0 && !json.Valid(profile.MeshAdvertiseEndpointsJSON) { return DockerInstallProfile{}, ErrInvalidPayload } if !isOptionalJSONArray(profile.FabricRegistryRecords) { return DockerInstallProfile{}, ErrInvalidPayload } switch profile.MeshListenPortMode { case "manual", "auto", "disabled": default: return DockerInstallProfile{}, ErrInvalidPayload } if profile.MeshListenAutoPortStart > profile.MeshListenAutoPortEnd { return DockerInstallProfile{}, ErrInvalidPayload } return profile, nil } func windowsInstallProfileFromScope(input DockerInstallProfileRequest, scopeRaw json.RawMessage) (WindowsInstallProfile, error) { var scope dockerInstallProfileScope if len(scopeRaw) > 0 { if !json.Valid(scopeRaw) { return WindowsInstallProfile{}, ErrInvalidPayload } if err := json.Unmarshal(scopeRaw, &scope); err != nil { return 
WindowsInstallProfile{}, ErrInvalidPayload } } nodeName := firstNonEmptyString(strings.TrimSpace(input.NodeName), scope.NodeName) if nodeName == "" { nodeName = "windows-node" } profile := WindowsInstallProfile{ SchemaVersion: "rap.windows_install_profile.v1", BackendURL: strings.TrimRight(strings.TrimSpace(scope.BackendURL), "/"), ControlPlaneEndpoints: trimStringSlice(scope.ControlPlaneEndpoints), ArtifactEndpoints: trimEndpointSlice(scope.ArtifactEndpoints), FabricRegistryRecords: cloneRawJSON(scope.FabricRegistryRecords), Roles: trimStringSlice(scope.Roles), NodeName: nodeName, StateDir: firstNonEmptyString(scope.StateDir, `C:\ProgramData\RAP\nodes\`+safeInstallProfileSlug(nodeName)), InstallDir: firstNonEmptyString(scope.InstallDir, `C:\Program Files\RAP\`+safeInstallProfileSlug(nodeName)), StartupMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.StartupMode)), "auto"), WorkloadSupervisionEnabled: boolPtrValue(scope.WorkloadSupervisionEnabled, false), MeshSyntheticRuntimeEnabled: boolPtrValue(scope.MeshSyntheticRuntimeEnabled, false), MeshProductionForwardingEnabled: boolPtrValue(scope.MeshProductionForwardingEnabled, false), MeshListenAddr: strings.TrimSpace(scope.MeshListenAddr), MeshListenPortMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.MeshListenPortMode)), "auto"), MeshListenAutoPortStart: positiveOrDefault(scope.MeshListenAutoPortStart, 19131), MeshListenAutoPortEnd: positiveOrDefault(scope.MeshListenAutoPortEnd, 19231), MeshAdvertiseEndpoint: strings.TrimRight(strings.TrimSpace(scope.MeshAdvertiseEndpoint), "/"), MeshAdvertiseEndpointsJSON: scope.MeshAdvertiseEndpointsJSON, MeshAdvertiseTransport: strings.TrimSpace(scope.MeshAdvertiseTransport), MeshConnectivityMode: firstNonEmptyString(strings.TrimSpace(scope.MeshConnectivityMode), "outbound_only"), MeshNATType: firstNonEmptyString(strings.TrimSpace(scope.MeshNATType), "unknown"), MeshRegion: firstNonEmptyString(strings.TrimSpace(scope.MeshRegion), "windows"), 
HeartbeatIntervalSeconds: positiveOrDefault(scope.HeartbeatIntervalSeconds, 15), EnrollmentPollIntervalSeconds: positiveOrDefault(scope.EnrollmentPollIntervalSeconds, 5), EnrollmentPollTimeoutSeconds: nonNegativeOrDefault(scope.EnrollmentPollTimeoutSeconds, 0), ProductionObservationSinkCapacity: scope.ProductionObservationSinkCapacity, } profile.NodeAgentArtifact = windowsNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope) if profile.BackendURL == "" && len(profile.ControlPlaneEndpoints) > 0 { profile.BackendURL = profile.ControlPlaneEndpoints[0] } if profile.BackendURL == "" { return WindowsInstallProfile{}, ErrInvalidPayload } if len(profile.ArtifactEndpoints) == 0 { if endpoint := defaultArtifactEndpointFromBackendURL(profile.BackendURL); endpoint != "" { profile.ArtifactEndpoints = []string{endpoint} profile.NodeAgentArtifact = windowsNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope) } } if len(profile.MeshAdvertiseEndpointsJSON) > 0 && !json.Valid(profile.MeshAdvertiseEndpointsJSON) { return WindowsInstallProfile{}, ErrInvalidPayload } if !isOptionalJSONArray(profile.FabricRegistryRecords) { return WindowsInstallProfile{}, ErrInvalidPayload } switch profile.MeshListenPortMode { case "manual", "auto", "disabled": default: return WindowsInstallProfile{}, ErrInvalidPayload } switch profile.StartupMode { case "auto", "system-task", "user-task", "none": default: return WindowsInstallProfile{}, ErrInvalidPayload } if profile.MeshListenAutoPortStart > profile.MeshListenAutoPortEnd { return WindowsInstallProfile{}, ErrInvalidPayload } return profile, nil } func linuxInstallProfileFromScope(input DockerInstallProfileRequest, scopeRaw json.RawMessage) (LinuxInstallProfile, error) { var scope dockerInstallProfileScope if len(scopeRaw) > 0 { if !json.Valid(scopeRaw) { return LinuxInstallProfile{}, ErrInvalidPayload } if err := json.Unmarshal(scopeRaw, &scope); err != nil { return LinuxInstallProfile{}, ErrInvalidPayload } } nodeName := 
firstNonEmptyString(strings.TrimSpace(input.NodeName), scope.NodeName) if nodeName == "" { nodeName = "linux-node" } slug := safeInstallProfileSlug(nodeName) profile := LinuxInstallProfile{ SchemaVersion: "rap.linux_install_profile.v1", BackendURL: strings.TrimRight(strings.TrimSpace(scope.BackendURL), "/"), ControlPlaneEndpoints: trimStringSlice(scope.ControlPlaneEndpoints), ArtifactEndpoints: trimEndpointSlice(scope.ArtifactEndpoints), FabricRegistryRecords: cloneRawJSON(scope.FabricRegistryRecords), Roles: trimStringSlice(scope.Roles), NodeName: nodeName, StateDir: firstNonEmptyString(scope.StateDir, "/var/lib/rap/nodes/"+slug), InstallDir: firstNonEmptyString(scope.InstallDir, "/opt/rap/"+slug), StartupMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.StartupMode)), "systemd"), WorkloadSupervisionEnabled: boolPtrValue(scope.WorkloadSupervisionEnabled, false), MeshSyntheticRuntimeEnabled: boolPtrValue(scope.MeshSyntheticRuntimeEnabled, false), MeshProductionForwardingEnabled: boolPtrValue(scope.MeshProductionForwardingEnabled, false), MeshListenAddr: strings.TrimSpace(scope.MeshListenAddr), MeshListenPortMode: firstNonEmptyString(strings.ToLower(strings.TrimSpace(scope.MeshListenPortMode)), "auto"), MeshListenAutoPortStart: positiveOrDefault(scope.MeshListenAutoPortStart, 19131), MeshListenAutoPortEnd: positiveOrDefault(scope.MeshListenAutoPortEnd, 19231), MeshAdvertiseEndpoint: strings.TrimRight(strings.TrimSpace(scope.MeshAdvertiseEndpoint), "/"), MeshAdvertiseEndpointsJSON: scope.MeshAdvertiseEndpointsJSON, MeshAdvertiseTransport: firstNonEmptyString(strings.TrimSpace(scope.MeshAdvertiseTransport), "direct_quic"), MeshConnectivityMode: firstNonEmptyString(strings.TrimSpace(scope.MeshConnectivityMode), "outbound_only"), MeshNATType: firstNonEmptyString(strings.TrimSpace(scope.MeshNATType), "unknown"), MeshRegion: firstNonEmptyString(strings.TrimSpace(scope.MeshRegion), "linux"), HeartbeatIntervalSeconds: 
positiveOrDefault(scope.HeartbeatIntervalSeconds, 15), EnrollmentPollIntervalSeconds: positiveOrDefault(scope.EnrollmentPollIntervalSeconds, 5), EnrollmentPollTimeoutSeconds: nonNegativeOrDefault(scope.EnrollmentPollTimeoutSeconds, 0), ProductionObservationSinkCapacity: scope.ProductionObservationSinkCapacity, } profile.NodeAgentArtifact = linuxNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope) if profile.BackendURL == "" && len(profile.ControlPlaneEndpoints) > 0 { profile.BackendURL = profile.ControlPlaneEndpoints[0] } if profile.BackendURL == "" { return LinuxInstallProfile{}, ErrInvalidPayload } if len(profile.ArtifactEndpoints) == 0 { if endpoint := defaultArtifactEndpointFromBackendURL(profile.BackendURL); endpoint != "" { profile.ArtifactEndpoints = []string{endpoint} profile.NodeAgentArtifact = linuxNodeAgentArtifactFromScope(profile.ArtifactEndpoints, scope) } } if len(profile.MeshAdvertiseEndpointsJSON) > 0 && !json.Valid(profile.MeshAdvertiseEndpointsJSON) { return LinuxInstallProfile{}, ErrInvalidPayload } if !isOptionalJSONArray(profile.FabricRegistryRecords) { return LinuxInstallProfile{}, ErrInvalidPayload } switch profile.MeshListenPortMode { case "manual", "auto", "disabled": default: return LinuxInstallProfile{}, ErrInvalidPayload } switch profile.StartupMode { case "auto", "systemd", "none": default: return LinuxInstallProfile{}, ErrInvalidPayload } if profile.MeshListenAutoPortStart > profile.MeshListenAutoPortEnd { return LinuxInstallProfile{}, ErrInvalidPayload } return profile, nil } func linuxNodeAgentArtifactFromScope(artifactEndpoints []string, scope dockerInstallProfileScope) *DockerArtifact { urls := trimEndpointSlice(scope.NodeAgentArtifactURLs) if len(urls) == 0 { for _, endpoint := range artifactEndpoints { urls = append(urls, strings.TrimRight(endpoint, "/")+"/rap-node-agent-linux-amd64") } } sha256 := strings.TrimSpace(scope.NodeAgentArtifactSHA256) sizeBytes := scope.NodeAgentArtifactSizeBytes if len(urls) == 0 && sha256 
// defaultArtifactEndpointFromBackendURL derives the default artifact download
// endpoint from a backend URL: surrounding whitespace and trailing slashes are
// dropped, one trailing "/api/v1" or "/api" segment is removed, and
// "/downloads" is appended. An empty or whitespace-only input yields "".
//
// The API suffix is only ever removed from the part of the value that follows
// "scheme://". Matching against the whole string would treat the authority of
// a URL such as "http://api" (a host literally named "api") as an "/api" path
// suffix and produce the broken result "http:/downloads".
func defaultArtifactEndpointFromBackendURL(backendURL string) string {
	value := strings.TrimRight(strings.TrimSpace(backendURL), "/")
	if value == "" {
		return ""
	}

	// Earliest index at which a strippable suffix may begin. Values without a
	// scheme keep the previous behaviour (the suffix may start at index 0).
	suffixFloor := 0
	if i := strings.Index(value, "://"); i >= 0 {
		suffixFloor = i + len("://")
	}

	for _, suffix := range []string{"/api/v1", "/api"} {
		cut := len(value) - len(suffix)
		if cut >= suffixFloor && strings.HasSuffix(value, suffix) {
			value = value[:cut]
			break
		}
	}
	return strings.TrimRight(value, "/") + "/downloads"
}
// meshRouteHealthObservationMetadata is the JSON-tagged shape of the metadata
// attached to a mesh route-health observation. Every field maps to the
// snake_case key named in its tag; none is marked omitempty, so all keys are
// emitted when the struct is marshalled.
type meshRouteHealthObservationMetadata struct {
	ObservationType string `json:"observation_type"`
	RouteID         string `json:"route_id"`

	// Route path decision details: whether a decision was applied and which
	// relay, stale relay, rendezvous peer and lease it referred to.
	RoutePathDecisionApplied               bool   `json:"route_path_decision_applied"`
	RoutePathDecisionSelectedRelayID       string `json:"route_path_decision_selected_relay_id"`
	RoutePathDecisionStaleRelayNodeID      string `json:"route_path_decision_stale_relay_node_id"`
	RoutePathDecisionRendezvousPeerNodeID  string `json:"route_path_decision_rendezvous_peer_node_id"`
	RoutePathDecisionRendezvousLeaseID     string `json:"route_path_decision_rendezvous_lease_id"`
	RoutePathDecisionRendezvousLeaseReason string `json:"route_path_decision_rendezvous_lease_reason"`
	RoutePathDecisionSource                string `json:"route_path_decision_source"`

	// Expected versus observed path, and the drift flag reported with them.
	ExpectedEffectiveHops  []string `json:"expected_effective_hops"`
	ObservedAckPath        []string `json:"observed_ack_path"`
	RoutePathDriftDetected bool     `json:"route_path_drift_detected"`
	FailureReason          string   `json:"failure_reason"`

	// Forwarding-scope flags carried with the observation.
	// NOTE(review): presumably these assert that the observation carried no
	// production payload traffic; confirm against the producer.
	ControlPlaneOnly                       bool `json:"control_plane_only"`
	ProductionForwarding                   bool `json:"production_forwarding"`
	ProductionPayloadForwarding            bool `json:"production_payload_forwarding"`
	RouteHealthProductionPayloadForwarding bool `json:"route_health_production_payload_forwarding"`
	RouteHealthServicePayloadForwarding    bool `json:"route_health_service_payload_forwarding"`
}
// syntheticRouteFromIntent turns one mesh route intent into the synthetic
// route configuration served to the node identified by input.NodeID.
//
// It returns, in order: the route config, the peer endpoints and peer endpoint
// candidates scoped to the route's hops, the policy's recovery seeds (returned
// as stored in the policy), the normalized rendezvous leases, and an ok flag.
//
// ok is false (with zero/nil values for everything else) when the intent is
// not "active", its policy cannot be decoded or has SyntheticEnabled unset,
// fewer than two hops can be determined, any of the policy's endpoint
// candidates, recovery seeds or rendezvous leases fails validation, the
// requesting node cannot be placed on the route path, or the route has already
// expired.
func (s *Service) syntheticRouteFromIntent(input GetNodeSyntheticMeshConfigInput, intent MeshRouteIntent, localPerspective endpointPerspective) (SyntheticMeshRouteConfig, map[string]string, map[string][]PeerEndpointCandidate, []PeerRecoverySeed, []PeerRendezvousLease, bool) {
	if intent.Status != "active" {
		return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
	}
	var policy syntheticRoutePolicy
	if err := json.Unmarshal(intent.Policy, &policy); err != nil {
		return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
	}
	if !policy.SyntheticEnabled {
		return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
	}
	var source nodeSelector
	var destination nodeSelector
	// Selector decode errors are deliberately ignored: a selector that does not
	// decode leaves the zero-value nodeSelector, and the hop list below then
	// has to come from policy.Hops.
	_ = json.Unmarshal(intent.SourceSelector, &source)
	_ = json.Unmarshal(intent.DestinationSelector, &destination)
	sourceNodeID := firstNodeID(source)
	destinationNodeID := firstNodeID(destination)
	// Copy the policy hops so later changes never alias the decoded policy.
	hops := append([]string{}, policy.Hops...)
	// With no explicit hops, fall back to a direct source -> destination path.
	if len(hops) == 0 && sourceNodeID != "" && destinationNodeID != "" {
		hops = []string{sourceNodeID, destinationNodeID}
	}
	if len(hops) < 2 {
		return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
	}
	if err := validatePeerEndpointCandidates(policy.PeerEndpointCandidates, hops); err != nil {
		return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
	}
	if err := validatePeerRecoverySeeds(policy.RecoverySeeds); err != nil {
		return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
	}
	if err := validatePeerRendezvousLeases(policy.RendezvousLeases, hops, s.now()); err != nil {
		return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
	}
	scopedHops := append([]string{}, hops...)
	// The requesting node is not one of the route's hops: it only receives this
	// route if it can be spliced into the path as a rendezvous relay.
	if !containsString(scopedHops, input.NodeID) {
		lease, ok := policyRendezvousLeaseForRelayNode(policy.RendezvousLeases, hops, input.NodeID, intent.ID)
		if !ok {
			// No lease names this node as relay; it may still act as one when
			// its own reported endpoints allow it.
			if !localPerspectiveCanServeRendezvousRelay(localPerspective) {
				return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
			}
			// Relay in front of the destination when it is a hop, otherwise in
			// front of the last hop.
			peerNodeID := destinationNodeID
			if peerNodeID == "" || !containsString(hops, peerNodeID) {
				peerNodeID = hops[len(hops)-1]
			}
			scopedHops = effectiveRoutePathWithReplacement(hops, peerNodeID, "", input.NodeID)
		} else {
			scopedHops = effectiveRoutePathWithReplacement(hops, lease.PeerNodeID, "", lease.RelayNodeID)
		}
		// The replacement may not have placed this node on the path; in that
		// case the route is not served to it.
		if !containsString(scopedHops, input.NodeID) {
			return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
		}
	}
	// Missing selector IDs default to the ends of the original (unscoped) path.
	if sourceNodeID == "" {
		sourceNodeID = hops[0]
	}
	if destinationNodeID == "" {
		destinationNodeID = hops[len(hops)-1]
	}
	// Routes without an explicit expiry live for five minutes from now.
	expiresAt := s.now().UTC().Add(5 * time.Minute)
	if policy.ExpiresAt != nil {
		expiresAt = policy.ExpiresAt.UTC()
	}
	if !expiresAt.After(s.now().UTC()) {
		return SyntheticMeshRouteConfig{}, nil, nil, nil, nil, false
	}
	allowedChannels := policy.AllowedChannels
	if len(allowedChannels) == 0 {
		allowedChannels = []string{"fabric_control", "route_control"}
	}
	maxTTL := policy.MaxTTL
	if maxTTL <= 0 {
		maxTTL = 8
	}
	maxHops := policy.MaxHops
	if maxHops <= 0 {
		maxHops = 8
	}
	// The route version defaults to the intent's update time; the policy and
	// peer-directory versions default to the route version.
	routeVersion := policy.RouteVersion
	if routeVersion == "" {
		routeVersion = intent.UpdatedAt.UTC().Format(time.RFC3339)
	}
	policyVersion := policy.PolicyVersion
	if policyVersion == "" {
		policyVersion = routeVersion
	}
	peerDirectoryVersion := policy.PeerDirectoryVersion
	if peerDirectoryVersion == "" {
		peerDirectoryVersion = routeVersion
	}
	route := SyntheticMeshRouteConfig{
		RouteID:              intent.ID,
		ClusterID:            input.ClusterID,
		SourceNodeID:         sourceNodeID,
		DestinationNodeID:    destinationNodeID,
		Hops:                 scopedHops,
		AllowedChannels:      allowedChannels,
		ExpiresAt:            expiresAt,
		MaxTTL:               maxTTL,
		MaxHops:              maxHops,
		RouteVersion:         routeVersion,
		PolicyVersion:        policyVersion,
		PeerDirectoryVersion: peerDirectoryVersion,
	}
	return route, scopedPeerEndpoints(policy.PeerEndpoints, scopedHops), scopedPeerEndpointCandidates(policy.PeerEndpointCandidates, scopedHops), policy.RecoverySeeds, normalizeRendezvousLeases(policy.RendezvousLeases, route, s.now()), true
}
s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1) if err != nil { return nil, nil, err } if len(heartbeats) == 0 && desiredEndpoint == "" && len(desiredCandidates) == 0 { continue } peerEndpoint := desiredEndpoint nodeCandidates := append([]PeerEndpointCandidate{}, desiredCandidates...) if len(heartbeats) > 0 { reportedEndpoint, reportedCandidates, ok := endpointReportFromHeartbeat(heartbeats[0]) if ok { if peerEndpoint == "" { peerEndpoint = reportedEndpoint } nodeCandidates = append(nodeCandidates, reportedCandidates...) } } peerEndpoint, nodeCandidates = scopeEndpointReportForLocal(localPerspective, peerEndpoint, nodeCandidates) nodeCandidates = enrichPeerEndpointCandidateCertPins(nodeCandidates) if peerEndpoint != "" { peers[nodeID] = peerEndpoint } if len(nodeCandidates) > 0 { candidates[nodeID] = append(candidates[nodeID], nodeCandidates...) } } return peers, candidates, nil } func (s *Service) reportedRouteRelayEndpointConfig(ctx context.Context, clusterID string, localNodeID string, routePath []string, localPerspective endpointPerspective) (map[string]string, map[string][]PeerEndpointCandidate, error) { nodes, err := s.store.ListClusterNodes(ctx, clusterID) if err != nil { return nil, nil, err } sort.SliceStable(nodes, func(i, j int) bool { if nodes[i].HealthStatus != nodes[j].HealthStatus { return nodes[i].HealthStatus == "healthy" } iSeen := nodeLastSeen(nodes[i]) jSeen := nodeLastSeen(nodes[j]) if !iSeen.Equal(jSeen) { return iSeen.After(jSeen) } return nodes[i].CreatedAt.Before(nodes[j].CreatedAt) }) routeNodes := map[string]struct{}{} for _, nodeID := range routePath { if nodeID = strings.TrimSpace(nodeID); nodeID != "" { routeNodes[nodeID] = struct{}{} } } peers := map[string]string{} candidates := map[string][]PeerEndpointCandidate{} added := 0 for _, node := range nodes { if node.ID == "" || node.ID == localNodeID || node.MembershipStatus != "active" || node.RegistrationStatus != NodeRegistrationActive || node.HealthStatus != "healthy" { 
continue } if _, inRoute := routeNodes[node.ID]; inRoute { continue } desiredEndpoint, desiredCandidates, err := s.desiredMeshListenerEndpointConfig(ctx, clusterID, node.ID, added) if err != nil { return nil, nil, fmt.Errorf("desired mesh listener endpoint for relay node %s: %w", node.ID, err) } heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, node.ID, 1) if err != nil { return nil, nil, fmt.Errorf("list relay peer heartbeat for node %s: %w", node.ID, err) } endpoint := desiredEndpoint nodeCandidates := append([]PeerEndpointCandidate{}, desiredCandidates...) if len(heartbeats) > 0 { reportedEndpoint, reportedCandidates, ok := endpointReportFromHeartbeat(heartbeats[0]) if ok { if endpoint == "" { endpoint = reportedEndpoint } nodeCandidates = append(nodeCandidates, reportedCandidates...) } } endpoint, nodeCandidates = scopeEndpointReportForLocal(localPerspective, endpoint, nodeCandidates) nodeCandidates = publicDirectRelayCandidates(enrichPeerEndpointCandidateCertPins(nodeCandidates)) if len(nodeCandidates) == 0 { continue } if endpoint != "" && !endpointPrivateForOffsite(endpoint) { peers[node.ID] = endpoint } candidates[node.ID] = append(candidates[node.ID], nodeCandidates...) added++ if added >= defaultCoreMeshBootstrapPeerTarget { break } } return peers, candidates, nil } func enrichPeerEndpointCandidateCertPins(candidates []PeerEndpointCandidate) []PeerEndpointCandidate { if len(candidates) == 0 { return candidates } certByEndpoint := map[string]string{} for _, candidate := range candidates { endpoint := strings.TrimRight(strings.TrimSpace(candidate.Address), "/") if endpoint == "" { continue } if certSHA256 := peerEndpointCandidateTLSCertSHA256(candidate); certSHA256 != "" { certByEndpoint[endpoint] = certSHA256 if hostPort := peerEndpointHostPort(endpoint); hostPort != "" { certByEndpoint[hostPort] = certSHA256 } } } if len(certByEndpoint) == 0 { return candidates } out := append([]PeerEndpointCandidate{}, candidates...) 
// peerEndpointCandidateMetadataWithCert returns candidate metadata with
// "tls_cert_sha256" set to certSHA256, unless the metadata already carries a
// non-blank "tls_cert_sha256" or "peer_cert_sha256" pin.
//
// A blank certSHA256 returns raw untouched. Otherwise the result is always a
// re-marshalled JSON object: metadata that is empty, invalid, or not a JSON
// object is replaced by a fresh object. If marshalling fails, raw is returned.
func peerEndpointCandidateMetadataWithCert(raw json.RawMessage, certSHA256 string) json.RawMessage {
	certSHA256 = strings.TrimSpace(certSHA256)
	if certSHA256 == "" {
		return raw
	}
	values := map[string]any{}
	if len(raw) > 0 && json.Valid(raw) {
		// Best effort: a non-object document fails to decode and leaves the
		// empty map in place.
		_ = json.Unmarshal(raw, &values)
	}
	// A JSON null decodes successfully but resets the map to nil; writing the
	// pin into a nil map would panic, so start over with an empty object.
	if values == nil {
		values = map[string]any{}
	}
	tlsCert, _ := values["tls_cert_sha256"].(string)
	peerCert, _ := values["peer_cert_sha256"].(string)
	if strings.TrimSpace(tlsCert) == "" && strings.TrimSpace(peerCert) == "" {
		values["tls_cert_sha256"] = certSHA256
	}
	payload, err := json.Marshal(values)
	if err != nil {
		return raw
	}
	return payload
}
// controlPlaneRelayEndpointFromURL reduces a control-plane URL to the base
// endpoint used for relaying: scheme, authority, and the path with one
// trailing "/api/v1" or "/api" segment removed. Query, fragment and trailing
// slashes are dropped.
//
// It returns "" when raw is blank, does not parse, or lacks a scheme or host.
func controlPlaneRelayEndpointFromURL(raw string) string {
	raw = strings.TrimRight(strings.TrimSpace(raw), "/")
	if raw == "" {
		return ""
	}
	parsed, err := url.Parse(raw)
	if err != nil || parsed.Scheme == "" || parsed.Host == "" {
		return ""
	}
	path := strings.TrimRight(parsed.Path, "/")
	for _, suffix := range []string{"/api/v1", "/api"} {
		if strings.HasSuffix(path, suffix) {
			path = strings.TrimRight(strings.TrimSuffix(path, suffix), "/")
			break
		}
	}
	parsed.Path = path
	parsed.RawPath = ""
	parsed.RawQuery = ""
	// url.Parse records a bare trailing "?" as ForceQuery; without clearing it
	// String() would still append "?" to the endpoint.
	parsed.ForceQuery = false
	parsed.Fragment = ""
	parsed.RawFragment = ""
	return strings.TrimRight(parsed.String(), "/")
}
// privateIPsShareLikelyLAN reports whether two addresses fall in the same
// network prefix: the first three octets when both are IPv4, otherwise the
// first eight bytes of their 16-byte forms. Addresses that cannot be expressed
// in 16-byte form never match.
func privateIPsShareLikelyLAN(left, right net.IP) bool {
	a, b, prefixBytes := left.To16(), right.To16(), 8
	if a4, b4 := left.To4(), right.To4(); a4 != nil && b4 != nil {
		a, b, prefixBytes = a4, b4, 3
	}
	if a == nil || b == nil {
		return false
	}
	for i, octet := range a[:prefixBytes] {
		if octet != b[i] {
			return false
		}
	}
	return true
}
rawPeerEndpoint if isUnusableLocalPeerEndpoint(peerEndpoint) { peerEndpoint = "" } out := make([]PeerEndpointCandidate, 0, len(report.EndpointCandidates)) for _, candidate := range report.EndpointCandidates { if candidate.NodeID == "" { candidate.NodeID = nodeID } if candidate.EndpointID == "" { candidate.EndpointID = nodeID + "-reported" } if candidate.Address == "" { candidate.Address = rawPeerEndpoint } if isUnusableLocalPeerEndpoint(candidate.Address) { continue } if candidate.Transport == "" { candidate.Transport = report.Transport } if candidate.ConnectivityMode == "" { candidate.ConnectivityMode = report.ConnectivityMode } if candidate.NATType == "" { candidate.NATType = report.NATType } if candidate.Region == "" { candidate.Region = report.Region } if candidate.Reachability == "" { candidate.Reachability = reachabilityFromConnectivityMode(candidate.ConnectivityMode) } if candidate.Metadata == nil { candidate.Metadata = json.RawMessage(`{}`) } if candidate.NodeID != nodeID { return "", nil, false } out = append(out, candidate) } if len(out) > 0 { if err := validatePeerEndpointCandidates(map[string][]PeerEndpointCandidate{nodeID: out}, []string{nodeID}); err != nil { return "", nil, false } } return peerEndpoint, out, peerEndpoint != "" || len(out) > 0 } func hasActiveNodeRole(roles []NodeRoleAssignment, role string) bool { for _, item := range roles { if item.Role == role && item.Status == "active" { return true } } return false } func nodeLastSeen(node ClusterNode) time.Time { if node.LastSeenAt == nil { return time.Time{} } return node.LastSeenAt.UTC() } func recoverySeedFromEndpointReport(nodeID, endpoint string, candidates []PeerEndpointCandidate, index int) PeerRecoverySeed { nodeID = strings.TrimSpace(nodeID) endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/") seed := PeerRecoverySeed{ NodeID: nodeID, Endpoint: endpoint, Transport: "direct_quic", Priority: 10 + index, Metadata: json.RawMessage(`{"source":"core_mesh_bootstrap"}`), } for _, 
// firstNonEmptyString returns the first argument that is non-empty after
// trimming surrounding whitespace, in its trimmed form. It returns "" when
// every argument is blank or no arguments are given.
func firstNonEmptyString(values ...string) string {
	for i := 0; i < len(values); i++ {
		candidate := strings.TrimSpace(values[i])
		if candidate == "" {
			continue
		}
		return candidate
	}
	return ""
}
{ return StaleNodeRiskNode{}, err } for _, product := range products { policy, err := s.store.GetNodeUpdatePolicy(ctx, clusterID, node.ID, product) if errors.Is(err, pgx.ErrNoRows) { continue } if err != nil { return StaleNodeRiskNode{}, err } productRisk, err := s.evaluateStaleNodeRiskProduct(ctx, clusterID, node, nodeNeedsRecoveryHold(node, item.HeartbeatStale), product, policy, statuses, releaseCache) if err != nil { return StaleNodeRiskNode{}, err } if len(productRisk.Risks) > 0 { item.Risks = append(item.Risks, productRisk.Risks...) } if productRisk.RecoveryBridgeRequired { item.RecoveryBridgeRequired = true item.RecoveryBridgeActions = append(item.RecoveryBridgeActions, "preserve_compatibility_overlap") item.RecoveryBridgeActions = append(item.RecoveryBridgeActions, "preserve_install_type_aliases") } if productRisk.RecoveryBridgeReplayReady { item.RecoveryBridgeReplayReady = true item.RecoveryBridgeActions = append(item.RecoveryBridgeActions, "replay_legacy_update_plan_"+product) } item.Products = append(item.Products, productRisk) } item.RecoveryBridgeActions = trimStringSlice(item.RecoveryBridgeActions) item.Alerts = trimStringSlice(item.Alerts) item.Risks = trimStringSlice(item.Risks) return item, nil } func directPeerRecoveryFromHeartbeat(heartbeat NodeHeartbeat) (readyCount int, targetCount int, deficit int, alert bool) { if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) { return 0, 3, 3, true } var metadata struct { MeshEndpointReport heartbeatMeshEndpointReport `json:"mesh_endpoint_report"` MeshPeerRecoveryReport heartbeatMeshPeerRecoveryReport `json:"mesh_peer_recovery_report"` MeshPeerConnectionIntentReport heartbeatMeshPeerConnectionIntentReport `json:"mesh_peer_connection_intent_report"` MeshPeerConnectionManagerReport heartbeatMeshPeerConnectionManagerReport `json:"mesh_peer_connection_manager_report"` } if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil { return 0, 3, 3, true } readyCount = 
directPeerReadyCountFromManager(metadata.MeshPeerConnectionManagerReport) if readyCount <= 0 { readyCount = metadata.MeshPeerConnectionIntentReport.DirectCount } if readyCount <= 0 { readyCount = intFromCandidatesDirectCount(metadata.MeshEndpointReport.EndpointCandidates) } if readyCount <= 0 { readyCount = metadata.MeshEndpointReport.PeerRecoveryReady } targetCount = metadata.MeshPeerRecoveryReport.TargetReadyPeers if targetCount <= 0 { targetCount = 3 } deficit = metadata.MeshEndpointReport.PeerRecoveryDeficit if deficit <= 0 && readyCount < targetCount { deficit = targetCount - readyCount } if readyCount < 0 { readyCount = 0 } if deficit < 0 { deficit = 0 } alert = readyCount < targetCount return readyCount, targetCount, deficit, alert } func intFromCandidatesDirectCount(candidates []PeerEndpointCandidate) int { count := 0 for _, candidate := range candidates { if strings.Contains(strings.ToLower(strings.TrimSpace(candidate.Transport)), "quic") && isDirectConnectivityMode(candidate.ConnectivityMode) { count++ } } return count } func directPeerReadyCountFromManager(report heartbeatMeshPeerConnectionManagerReport) int { if report.PeerConnectionReady > 0 { return report.PeerConnectionReady } if len(report.ProbeResults) == 0 { return 0 } ready := map[string]struct{}{} for _, probe := range report.ProbeResults { nodeID := strings.TrimSpace(probe.NodeID) if nodeID == "" || !strings.EqualFold(strings.TrimSpace(probe.LinkStatus), "reachable") { continue } if isDirectTransportMode(probe.TransportMode) { ready[nodeID] = struct{}{} continue } for _, candidate := range probe.CandidateResults { if !strings.EqualFold(strings.TrimSpace(candidate.LinkStatus), "reachable") { continue } if isDirectTransportMode(candidate.Transport) { ready[nodeID] = struct{}{} break } } } return len(ready) } func isDirectConnectivityMode(mode string) bool { switch strings.ToLower(strings.TrimSpace(mode)) { case "direct", "private_lan", "corp_lan", "corporate_lan": return true default: return 
// isDirectTransportMode reports whether mode names one of the transport modes
// this package treats as direct: "direct", "direct_quic", "private_lan" or
// "reverse_quic". The comparison ignores case and surrounding whitespace.
func isDirectTransportMode(mode string) bool {
	normalized := strings.ToLower(strings.TrimSpace(mode))
	return normalized == "direct" ||
		normalized == "direct_quic" ||
		normalized == "private_lan" ||
		normalized == "reverse_quic"
}
true item.MatchingReleaseVersion = release.Version _ = artifact } else { item.Risks = append(item.Risks, "stale_node_no_compatible_"+product+"_artifact") } if currentVersion == "" { item.Risks = append(item.Risks, "stale_node_unknown_"+product+"_version") } if latestStatus == nil { item.Risks = append(item.Risks, "stale_node_no_"+product+"_update_status") } else if item.CompatibleArtifactFound && strings.EqualFold(strings.TrimSpace(item.LastStatusReason), "no_matching_artifact") { item.Risks = append(item.Risks, "stale_node_legacy_recovery_contract_"+product) item.RecoveryBridgeRequired = true item.RecoveryBridgeReplayReady = true item.RecoveryBridgeMode = "legacy_contract_overlap" } item.Risks = trimStringSlice(item.Risks) return item, nil } func isNodeRecoveryRiskRelevant(node ClusterNode) bool { return node.RegistrationStatus == NodeRegistrationActive } func nodeNeedsRecoveryHold(node ClusterNode, heartbeatStale bool) bool { if heartbeatStale { return true } return !strings.EqualFold(strings.TrimSpace(node.HealthStatus), "healthy") } func currentVersionFromNodeAndStatuses(node ClusterNode, product string, statuses []NodeUpdateStatus) (string, *NodeUpdateStatus) { if product == "rap-node-agent" && node.ReportedVersion != nil { if version := strings.TrimSpace(*node.ReportedVersion); version != "" { if status := latestNodeUpdateStatusForProduct(statuses, product); status != nil { return version, status } return version, nil } } status := latestNodeUpdateStatusForProduct(statuses, product) if status == nil { return "", nil } return strings.TrimSpace(status.CurrentVersion), status } func latestNodeUpdateStatusForProduct(statuses []NodeUpdateStatus, product string) *NodeUpdateStatus { var latest *NodeUpdateStatus for i := range statuses { if statuses[i].Product != product { continue } if latest == nil || statuses[i].ObservedAt.After(latest.ObservedAt) { latest = &statuses[i] } } return latest } func inferUpdateProfile(product string, statuses []NodeUpdateStatus) 
(osValue string, arch string, installType string, known bool) { arch = "amd64" windowsObserved := false for _, status := range statuses { if nodeUpdateStatusLooksWindows(status) { windowsObserved = true break } } if windowsObserved { if product == "rap-host-agent" { return "windows", arch, "windows_binary", true } return "windows", arch, "windows_service", true } if product == "rap-host-agent" { return "linux", arch, "linux_binary", true } return "", arch, "", false } func nodeUpdateStatusReason(status NodeUpdateStatus) string { var payload map[string]any if len(status.Payload) == 0 || json.Unmarshal(status.Payload, &payload) != nil { return "" } return strings.TrimSpace(stringFromAny(payload["reason"])) } func containsAnyRiskWithPrefix(risks []string, prefix string) bool { for _, risk := range risks { if strings.HasPrefix(strings.TrimSpace(risk), prefix) { return true } } return false } func containsAnyRiskWithSuffix(risks []string, suffix string) bool { for _, risk := range risks { if strings.HasSuffix(strings.TrimSpace(risk), suffix) { return true } } return false } func releaseRequestsLegacyRemoval(raw json.RawMessage) bool { if len(raw) == 0 || !json.Valid(raw) { return false } var payload map[string]any if err := json.Unmarshal(raw, &payload); err != nil { return false } return truthyCompatibilityFlag(payload, "legacy_removal") || truthyCompatibilityFlag(payload, "remove_legacy_formats") || truthyCompatibilityFlag(payload, "remove_compatibility_formats") || truthyCompatibilityFlag(payload, "breaking_compatibility") } func truthyCompatibilityFlag(payload map[string]any, key string) bool { value, ok := payload[key] if !ok { return false } flag, ok := value.(bool) return ok && flag } func targetedReleaseRequestsLegacyRemoval(releases []ReleaseVersion, targetVersion string) bool { targetVersion = strings.TrimSpace(targetVersion) if targetVersion == "" { return false } for _, release := range releases { if strings.TrimSpace(release.Version) != targetVersion { 
continue } return releaseRequestsLegacyRemoval(release.Compatibility) } return false } func hasTargetedReleaseVersion(releases []ReleaseVersion, targetVersion string) bool { targetVersion = strings.TrimSpace(targetVersion) if targetVersion == "" { return false } for _, release := range releases { if strings.TrimSpace(release.Version) == targetVersion { return true } } return false } func (s *Service) recordLegacyRemovalBlockedAudit( ctx context.Context, clusterID string, actorUserID string, targetType string, targetID string, blockedOperation string, report StaleNodeRiskReport, ) { clusterID = strings.TrimSpace(clusterID) actorUserID = strings.TrimSpace(actorUserID) targetID = strings.TrimSpace(targetID) if clusterID == "" || actorUserID == "" { return } payload, err := json.Marshal(map[string]any{ "blocked_operation": blockedOperation, "blocked_operations": report.BlockedOperations, "bridge_hold_required": report.BridgeHoldRequired, "bridge_hold_reasons": report.BridgeHoldReasons, "bridge_hold_node_ids": report.BridgeHoldNodeIDs, "stale_nodes": report.Summary.StaleNodes, "blocked_nodes": report.Summary.BlockedNodes, "artifact_gap_nodes": report.Summary.ArtifactGapNodes, "unknown_profile_nodes": report.Summary.UnknownProfileNodes, "waiting_update_status_nodes": report.Summary.WaitingUpdateStatusNodes, "unknown_version_nodes": report.Summary.UnknownVersionNodes, "legacy_recovery_contract_nodes": report.Summary.LegacyRecoveryContractNodes, "recovery_bridge_required_nodes": report.Summary.RecoveryBridgeRequiredNodes, "recovery_bridge_replay_ready_nodes": report.Summary.RecoveryBridgeReplayReadyNodes, "waiting_recovery_heartbeat_nodes": report.Summary.WaitingRecoveryHeartbeatNodes, "legacy_removal_allowed": report.LegacyRemovalAllowed, "production_forwarding": false, }) if err != nil { return } clusterIDCopy := clusterID actorUserIDCopy := actorUserID var targetIDPtr *string if targetID != "" { targetIDCopy := targetID targetIDPtr = &targetIDCopy } _ = 
// trimStringSlice returns the values with surrounding whitespace removed,
// blank entries dropped and duplicates removed, keeping the first occurrence
// of each value in its original position.
//
// The result is always a non-nil slice, so it encodes as a JSON array rather
// than null even when nothing survives. Duplicates are tracked in a set so the
// cost stays linear in len(values) instead of rescanning the output for every
// element.
func trimStringSlice(values []string) []string {
	out := make([]string, 0, len(values))
	seen := make(map[string]struct{}, len(values))
	for _, value := range values {
		trimmed := strings.TrimSpace(value)
		if trimmed == "" {
			continue
		}
		if _, duplicate := seen[trimmed]; duplicate {
			continue
		}
		seen[trimmed] = struct{}{}
		out = append(out, trimmed)
	}
	return out
}
// canonicalArtifactOrigin reduces value to a "scheme://host[:port]" origin.
//
// Surrounding whitespace and trailing slashes are stripped before parsing.
// The result is "" when the input is blank, fails to parse, or lacks a scheme
// or host. Any path, query or fragment is discarded. As a special case, the
// hosts "94.141.118.222" and "vpn.cin.su" (host compared case-insensitively)
// on port "19191" are rewritten to "https://vpn.cin.su".
func canonicalArtifactOrigin(value string) string {
	cleaned := strings.TrimRight(strings.TrimSpace(value), "/")
	if cleaned == "" {
		return ""
	}

	parsed, err := url.Parse(cleaned)
	if err != nil {
		return ""
	}
	if parsed.Scheme == "" || parsed.Host == "" {
		return ""
	}

	if strings.TrimSpace(parsed.Port()) == "19191" {
		switch strings.ToLower(strings.TrimSpace(parsed.Hostname())) {
		case "94.141.118.222", "vpn.cin.su":
			return "https://vpn.cin.su"
		}
	}
	return parsed.Scheme + "://" + parsed.Host
}
len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) { return "" } if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil { return "" } return normalizeArtifactOrigin(metadata.MeshOutboundSessionReport.ControlPlaneURL) } func preferredNodeArtifactOrigin(current, fallback string) string { current = normalizeArtifactOrigin(current) fallback = normalizeArtifactOrigin(fallback) if fallback == "" { return current } if current == "" { return fallback } parsedCurrent, errCurrent := url.Parse(current) if errCurrent != nil { return current } host := strings.ToLower(strings.TrimSpace(parsedCurrent.Hostname())) switch host { case "", "127.0.0.1", "localhost", "192.168.200.61": return fallback } return current } func absolutizeReleaseArtifact(artifact ReleaseArtifact, origin string) ReleaseArtifact { if origin == "" { return artifact } artifact.URL = absolutizeArtifactURL(artifact.URL, origin) for i, raw := range artifact.URLs { artifact.URLs[i] = absolutizeArtifactURL(raw, origin) } return artifact } func absolutizeArtifactURL(raw, origin string) string { raw = strings.TrimSpace(raw) if raw == "" || origin == "" { return canonicalArtifactURL(raw) } parsed, err := url.Parse(raw) if err == nil && parsed.IsAbs() { if !artifactURLLooksLocal(parsed) { return canonicalArtifactURL(raw) } base, baseErr := url.Parse(origin) if baseErr != nil || base.Scheme == "" || base.Host == "" { return canonicalArtifactURL(raw) } rewritten := &url.URL{ Scheme: base.Scheme, Host: base.Host, Path: parsed.Path, RawPath: parsed.RawPath, RawQuery: parsed.RawQuery, Fragment: parsed.Fragment, } return canonicalArtifactURL(rewritten.String()) } if strings.HasPrefix(raw, "/") { return canonicalArtifactURL(origin + raw) } return canonicalArtifactURL(raw) } func canonicalArtifactURL(raw string) string { raw = strings.TrimSpace(raw) if raw == "" { return "" } parsed, err := url.Parse(raw) if err != nil || !parsed.IsAbs() { return raw } host := 
// artifactURLLooksLocal reports whether parsed points at a host that is only
// meaningful locally: an empty host, "localhost", or an IP literal that is a
// loopback, private or link-local unicast address. A nil URL and any other
// host name yield false; host names are not resolved.
func artifactURLLooksLocal(parsed *url.URL) bool {
	if parsed == nil {
		return false
	}
	host := strings.ToLower(strings.TrimSpace(parsed.Hostname()))
	if host == "" || host == "localhost" {
		return true
	}
	addr := net.ParseIP(host)
	return addr != nil && (addr.IsLoopback() || addr.IsPrivate() || addr.IsLinkLocalUnicast())
}
// safeArtifactFileName turns value into a lower-case file name restricted to
// the characters a-z, 0-9, '.', '_' and '-'.
//
// The input is trimmed and lower-cased. Each run of other characters is
// replaced by a single '-', and leading and trailing '-' are removed. When
// nothing usable remains, the default name "rap-node-agent" is returned.
//
// Names consisting only of dots (such as "." and "..") are also replaced by
// the default: they pass the character filter but are directory references,
// not file names, and must not be returned from a sanitiser.
func safeArtifactFileName(value string) string {
	const fallback = "rap-node-agent"

	value = strings.ToLower(strings.TrimSpace(value))
	var b strings.Builder
	b.Grow(len(value))
	lastDash := false
	for _, r := range value {
		allowed := (r >= 'a' && r <= 'z') || (r >= '0' && r <= '9') || r == '.' || r == '_' || r == '-'
		if allowed {
			b.WriteRune(r)
			lastDash = false
			continue
		}
		// Collapse each run of disallowed characters into one dash.
		if !lastDash {
			b.WriteByte('-')
			lastDash = true
		}
	}

	out := strings.Trim(b.String(), "-")
	// Covers both the empty result and dot-only names like "." or "..".
	if strings.Trim(out, ".") == "" {
		return fallback
	}
	return out
}
out = append(out, rendezvousRelayFeedbackEntry{ ReporterNodeID: heartbeat.NodeID, RouteIDs: append([]string{}, lease.RouteIDs...), LeaseID: strings.TrimSpace(lease.LeaseID), PeerNodeID: strings.TrimSpace(lease.PeerNodeID), RelayNodeID: strings.TrimSpace(lease.RelayNodeID), ConnectionState: strings.TrimSpace(lease.ConnectionState), Reason: strings.TrimSpace(lease.Reason), WithdrawalNeeded: lease.WithdrawalNeeded, ReselectionNeeded: lease.ReselectionNeeded, ObservedAt: heartbeat.ObservedAt.UTC(), }) } return out } func (s *Service) rendezvousRelayReplacementHints(ctx context.Context, clusterID string, routePath []string, now time.Time) ([]RendezvousRelayPolicyDecision, error) { out := []RendezvousRelayPolicyDecision{} seenNodes := map[string]struct{}{} for _, nodeID := range routePath { nodeID = strings.TrimSpace(nodeID) if nodeID == "" { continue } if _, duplicate := seenNodes[nodeID]; duplicate { continue } seenNodes[nodeID] = struct{}{} heartbeats, err := s.store.ListNodeHeartbeats(ctx, clusterID, nodeID, 1) if err != nil { return nil, err } if len(heartbeats) == 0 { continue } out = append(out, rendezvousRelayReplacementHintsFromHeartbeat(heartbeats[0], now)...) 
} return out, nil } func rendezvousRelayReplacementHintsFromHeartbeat(heartbeat NodeHeartbeat, now time.Time) []RendezvousRelayPolicyDecision { if len(heartbeat.Metadata) == 0 || !json.Valid(heartbeat.Metadata) { return nil } if now.IsZero() { now = time.Now().UTC() } else { now = now.UTC() } if heartbeat.ObservedAt.IsZero() || heartbeat.ObservedAt.After(now.Add(time.Minute)) || now.Sub(heartbeat.ObservedAt.UTC()) > rendezvousRelayFeedbackMaxAge { return nil } var metadata struct { MeshRoutePathDecisionReport struct { ClusterID string `json:"cluster_id"` NodeID string `json:"node_id"` Decisions []RoutePathDecision `json:"decisions"` } `json:"mesh_route_path_decision_report"` } if err := json.Unmarshal(heartbeat.Metadata, &metadata); err != nil { return nil } report := metadata.MeshRoutePathDecisionReport if report.NodeID != "" && report.NodeID != heartbeat.NodeID { return nil } if report.ClusterID != "" && report.ClusterID != heartbeat.ClusterID { return nil } out := []RendezvousRelayPolicyDecision{} for _, decision := range report.Decisions { if strings.TrimSpace(decision.RouteID) == "" || decision.DecisionSource != "stale_relay_replacement" || strings.TrimSpace(decision.SelectedRelayID) == "" || strings.TrimSpace(decision.StaleRelayNodeID) == "" || decision.ProductionForwarding || !decision.ControlPlaneOnly || (!decision.ExpiresAt.IsZero() && !decision.ExpiresAt.After(now)) { continue } peerNodeID := strings.TrimSpace(decision.RendezvousPeerNodeID) if peerNodeID == "" { peerNodeID = replacementPeerNodeIDFromDecision(decision) } if peerNodeID == "" { continue } out = append(out, RendezvousRelayPolicyDecision{ RouteID: strings.TrimSpace(decision.RouteID), PeerNodeID: peerNodeID, StaleRelayNodeID: strings.TrimSpace(decision.StaleRelayNodeID), SelectedRelayID: strings.TrimSpace(decision.SelectedRelayID), SelectedEndpoint: strings.TrimRight(strings.TrimSpace(decision.SelectedRelayEndpoint), "/"), Score: decision.PathScore, Reason: "stale_relay_replacement", 
ScoreReasons: append([]string{}, decision.ScoreReasons...), ReporterNodeID: heartbeat.NodeID, }) } return out } func replacementPeerNodeIDFromDecision(decision RoutePathDecision) string { effectiveHops := cleanRouteNodePath(decision.EffectiveHops) selectedRelayID := strings.TrimSpace(decision.SelectedRelayID) for index, nodeID := range effectiveHops { if nodeID == selectedRelayID && index+1 < len(effectiveHops) { return effectiveHops[index+1] } } return strings.TrimSpace(decision.DestinationNodeID) } func replacementHintFeedback(hints []RendezvousRelayPolicyDecision, now time.Time) []rendezvousRelayFeedbackEntry { if len(hints) == 0 { return nil } if now.IsZero() { now = time.Now().UTC() } else { now = now.UTC() } out := make([]rendezvousRelayFeedbackEntry, 0, len(hints)) for _, hint := range hints { if strings.TrimSpace(hint.RouteID) == "" || strings.TrimSpace(hint.PeerNodeID) == "" || strings.TrimSpace(hint.StaleRelayNodeID) == "" || strings.TrimSpace(hint.SelectedRelayID) == "" { continue } out = append(out, rendezvousRelayFeedbackEntry{ ReporterNodeID: strings.TrimSpace(hint.ReporterNodeID), RouteIDs: []string{strings.TrimSpace(hint.RouteID)}, PeerNodeID: strings.TrimSpace(hint.PeerNodeID), RelayNodeID: strings.TrimSpace(hint.StaleRelayNodeID), ConnectionState: "replacement_hint", Reason: "stale_relay_replacement_hint", WithdrawalNeeded: true, ReselectionNeeded: true, ObservedAt: now, }) } return out } func rendezvousRelayRouteHealthFeedback(localNodeID string, route SyntheticMeshRouteConfig, links []MeshLinkObservation, now time.Time) []rendezvousRelayFeedbackEntry { out := []rendezvousRelayFeedbackEntry{} for _, link := range links { item, ok := rendezvousRelayRouteHealthFeedbackFromLink(localNodeID, route, link, now) if ok { out = append(out, item) } } return out } func rendezvousRelayRouteHealthFeedbackFromLink(localNodeID string, route SyntheticMeshRouteConfig, link MeshLinkObservation, now time.Time) (rendezvousRelayFeedbackEntry, bool) { localNodeID = 
strings.TrimSpace(localNodeID) if localNodeID == "" || link.SourceNodeID != localNodeID || strings.TrimSpace(route.RouteID) == "" { return rendezvousRelayFeedbackEntry{}, false } if !meshLinkObservationFresh(link, now) { return rendezvousRelayFeedbackEntry{}, false } metadata, ok := routeHealthMetadataFromLink(link) if !ok || metadata.ObservationType != "synthetic_route_health" || strings.TrimSpace(metadata.RouteID) != route.RouteID || !metadata.RoutePathDecisionApplied || metadata.ProductionForwarding || metadata.ProductionPayloadForwarding || metadata.RouteHealthProductionPayloadForwarding || metadata.RouteHealthServicePayloadForwarding { return rendezvousRelayFeedbackEntry{}, false } selectedRelayID := strings.TrimSpace(metadata.RoutePathDecisionSelectedRelayID) if selectedRelayID == "" { return rendezvousRelayFeedbackEntry{}, false } reason := "" switch { case metadata.RoutePathDriftDetected: reason = "synthetic_route_health_drift" case link.LinkStatus == "unreachable": reason = "synthetic_route_health_unreachable" case strings.TrimSpace(metadata.FailureReason) != "": reason = "synthetic_route_health_failure" default: return rendezvousRelayFeedbackEntry{}, false } peerNodeID := routeHealthPeerNodeID(metadata, route, link.TargetNodeID) if peerNodeID == "" { return rendezvousRelayFeedbackEntry{}, false } return rendezvousRelayFeedbackEntry{ ReporterNodeID: link.SourceNodeID, RouteIDs: []string{route.RouteID}, LeaseID: strings.TrimSpace(metadata.RoutePathDecisionRendezvousLeaseID), PeerNodeID: peerNodeID, RelayNodeID: selectedRelayID, ConnectionState: reason, Reason: reason, WithdrawalNeeded: true, ReselectionNeeded: true, ObservedAt: link.ObservedAt.UTC(), }, true } func routeHealthMetadataFromLink(link MeshLinkObservation) (meshRouteHealthObservationMetadata, bool) { if len(link.Metadata) == 0 || !json.Valid(link.Metadata) { return meshRouteHealthObservationMetadata{}, false } var metadata meshRouteHealthObservationMetadata if err := 
// nodeAfterInPath returns the element that immediately follows the first
// occurrence of nodeID in path that has a successor. nodeID is trimmed before
// comparison; path elements are compared as given. It returns "" when nodeID
// is blank, is absent from path, or occurs only as the final element.
func nodeAfterInPath(path []string, nodeID string) string {
	wanted := strings.TrimSpace(nodeID)
	if wanted == "" {
		return ""
	}
	// Stop one short of the end: the last element has no successor.
	for i := 0; i+1 < len(path); i++ {
		if path[i] == wanted {
			return path[i+1]
		}
	}
	return ""
}
items...) } func (p *rendezvousRelayPolicy) staleForLease(routeID string, lease PeerRendezvousLease) (rendezvousRelayFeedbackEntry, bool) { if p == nil { return rendezvousRelayFeedbackEntry{}, false } for _, item := range p.feedback { if !rendezvousFeedbackAppliesToRoute(item, routeID) { continue } if item.LeaseID != "" && lease.LeaseID != "" && item.LeaseID == lease.LeaseID { return item, true } if item.PeerNodeID == lease.PeerNodeID && item.RelayNodeID == lease.RelayNodeID { return item, true } } return rendezvousRelayFeedbackEntry{}, false } func (p *rendezvousRelayPolicy) relayStale(routeID string, peerNodeID string, relayNodeID string) (rendezvousRelayFeedbackEntry, bool) { if p == nil { return rendezvousRelayFeedbackEntry{}, false } for _, item := range p.feedback { if item.PeerNodeID == peerNodeID && item.RelayNodeID == relayNodeID && rendezvousFeedbackAppliesToRoute(item, routeID) { return item, true } } return rendezvousRelayFeedbackEntry{}, false } func (p *rendezvousRelayPolicy) hasStalePeer(routeID string, peerNodeID string) (rendezvousRelayFeedbackEntry, bool) { if p == nil { return rendezvousRelayFeedbackEntry{}, false } for _, item := range p.feedback { if item.PeerNodeID == peerNodeID && rendezvousFeedbackAppliesToRoute(item, routeID) { return item, true } } return rendezvousRelayFeedbackEntry{}, false } func (p *rendezvousRelayPolicy) recordWithdrawal(route SyntheticMeshRouteConfig, lease PeerRendezvousLease, feedback rendezvousRelayFeedbackEntry) { if p == nil { return } key := route.RouteID + "\x00" + lease.LeaseID + "\x00" + lease.RelayNodeID p.withdrawn[key] = RendezvousRelayPolicyDecision{ RouteID: route.RouteID, PeerNodeID: lease.PeerNodeID, WithdrawnLeaseID: lease.LeaseID, StaleRelayNodeID: lease.RelayNodeID, Reason: "stale_relay_withdrawn", ReporterNodeID: feedback.ReporterNodeID, } } func (p *rendezvousRelayPolicy) recordReplacement(route SyntheticMeshRouteConfig, peerNodeID string, feedback rendezvousRelayFeedbackEntry, selection 
rendezvousRelaySelection) { if p == nil || selection.RelayNodeID == "" { return } key := rendezvousRelayReplacementKey(route.RouteID, peerNodeID, feedback.RelayNodeID, selection.RelayNodeID) p.replacements[key] = RendezvousRelayPolicyDecision{ RouteID: route.RouteID, PeerNodeID: peerNodeID, StaleRelayNodeID: feedback.RelayNodeID, SelectedRelayID: selection.RelayNodeID, SelectedEndpoint: selection.Endpoint, Score: selection.Score, Reason: "stale_relay_replacement", ScoreReasons: append([]string{}, selection.Reasons...), ReporterNodeID: feedback.ReporterNodeID, } } func (p *rendezvousRelayPolicy) addReplacementHints(hints []RendezvousRelayPolicyDecision) { if p == nil { return } for _, hint := range hints { hint.RouteID = strings.TrimSpace(hint.RouteID) hint.PeerNodeID = strings.TrimSpace(hint.PeerNodeID) hint.StaleRelayNodeID = strings.TrimSpace(hint.StaleRelayNodeID) hint.SelectedRelayID = strings.TrimSpace(hint.SelectedRelayID) hint.SelectedEndpoint = strings.TrimRight(strings.TrimSpace(hint.SelectedEndpoint), "/") if hint.RouteID == "" || hint.PeerNodeID == "" || hint.StaleRelayNodeID == "" || hint.SelectedRelayID == "" { continue } if hint.Reason == "" { hint.Reason = "stale_relay_replacement" } if len(hint.ScoreReasons) == 0 { hint.ScoreReasons = []string{"route_path_decision_hint"} } key := rendezvousRelayReplacementKey(hint.RouteID, hint.PeerNodeID, hint.StaleRelayNodeID, hint.SelectedRelayID) existing, exists := p.replacements[key] if !exists || hint.Score > existing.Score { p.replacements[key] = hint } } } func (p *rendezvousRelayPolicy) report() *RendezvousRelayPolicyReport { if p == nil || (len(p.feedback) == 0 && len(p.withdrawn) == 0 && len(p.replacements) == 0) { return nil } decisions := make([]RendezvousRelayPolicyDecision, 0, len(p.withdrawn)+len(p.replacements)) for _, decision := range p.withdrawn { decisions = append(decisions, decision) } for _, decision := range p.replacements { decisions = append(decisions, decision) } 
// rendezvousRelayReplacementKey builds the map key for a relay replacement
// decision: the four identifiers, each trimmed of surrounding whitespace,
// joined with a NUL byte.
func rendezvousRelayReplacementKey(routeID string, peerNodeID string, staleRelayNodeID string, selectedRelayID string) string {
	parts := [...]string{routeID, peerNodeID, staleRelayNodeID, selectedRelayID}
	for i, part := range parts {
		parts[i] = strings.TrimSpace(part)
	}
	return strings.Join(parts[:], "\x00")
}
{ return nil } policy = normalizeFabricServiceChannelRecoveryPolicy(policy, defaultFabricServiceChannelRecoveryPolicy()) out := append([]RoutePathDecision{}, decisions...) sort.SliceStable(out, func(i, j int) bool { if out[i].RouteID != out[j].RouteID { return out[i].RouteID < out[j].RouteID } return out[i].DecisionID < out[j].DecisionID }) replacements := 0 degraded := 0 rebuildRequests := 0 rebuildApplied := 0 recoveryHysteresis := 0 recoveryPromoted := 0 recoveryDemoted := 0 for _, decision := range out { if decision.DecisionSource == "stale_relay_replacement" || decision.DecisionSource == "service_channel_feedback_replacement" || decision.DecisionSource == "service_channel_feedback_exit_pool_replacement" || decision.DecisionSource == "service_channel_feedback_entry_pool_replacement" || decision.DecisionSource == "service_channel_feedback_entry_exit_pool_replacement" || (decision.DecisionSource == "service_channel_remediation_command" && strings.TrimSpace(decision.ReplacementRouteID) != "") { replacements++ } if containsString(decision.ScoreReasons, "service_channel_recovery_hysteresis") { recoveryHysteresis++ } if containsString(decision.ScoreReasons, "service_channel_recovery_promoted") { recoveryPromoted++ } if containsString(decision.ScoreReasons, "service_channel_recovery_demoted") { recoveryDemoted++ } if decision.DecisionSource == "service_channel_feedback_no_alternate" || decision.RebuildStatus == "no_alternate" { degraded++ } switch decision.RebuildStatus { case "requested", "pending_degraded_fallback", "no_alternate", "deferred_by_policy", "expired": rebuildRequests++ case "applied": rebuildRequests++ rebuildApplied++ } } return &RoutePathDecisionReport{ SchemaVersion: "c17z18.route_path_decisions.v1", DecisionMode: "control_plane_effective_path_from_relay_policy_and_service_channel_feedback", Generation: generation, RecoveryPolicy: fabricServiceChannelRecoveryPolicyRef(policy), DecisionCount: len(out), ReplacementDecisionCount: replacements, 
// serviceChannelRebuildRequestID derives a rebuild request identifier of the
// form "<route>[-<reporter>][-<generation>]-rebuild". Every component is
// trimmed; an empty route falls back to the literal "route", while an empty
// reporter or generation is omitted.
func serviceChannelRebuildRequestID(routeID, reporterNodeID, generation string) string {
	segments := make([]string, 0, 4)
	if route := strings.TrimSpace(routeID); route != "" {
		segments = append(segments, route)
	} else {
		segments = append(segments, "route")
	}
	for _, optional := range []string{reporterNodeID, generation} {
		if trimmed := strings.TrimSpace(optional); trimmed != "" {
			segments = append(segments, trimmed)
		}
	}
	segments = append(segments, "rebuild")
	return strings.Join(segments, "-")
}
"primary_route_retained_until_rebuild"}, ControlPlaneOnly: true, ProductionForwarding: false, ExpiresAt: fencedRoute.ExpiresAt.UTC(), } applyServiceChannelFeedbackCorrelationToDecision(&decision, routeFeedback) if serviceChannelFeedbackRequestsRebuild(routeFeedback) { decision.RebuildRequestID = serviceChannelRebuildRequestID(fencedRoute.RouteID, input.NodeID, generation) decision.RebuildStatus = "requested" decision.RebuildReason = "service_channel_feedback_rebuild_requested" decision.RebuildAttempt = routeFeedback.ConsecutiveFailures decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_rebuild_requested") if routeFeedback.DegradedFallbackRecommended { decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_degraded_fallback_recommended") } } replacement, replacementFeedback, ok := s.selectServiceChannelRouteReplacement(input, fencedRoute, intents, feedback) if ok { decision.ReplacementRouteID = replacement.RouteID decision.EffectiveHops = append([]string{}, replacement.Hops...) 
decision.DecisionSource = "service_channel_feedback_replacement" decision.PathScore = serviceChannelReplacementRouteScore(replacement) decision.ScoreReasons = []string{"service_channel_fenced_route", "selected_unfenced_alternate_route"} if replacement.SourceNodeID != fencedRoute.SourceNodeID { decision.DecisionSource = "service_channel_feedback_entry_pool_replacement" decision.ScoreReasons = append(decision.ScoreReasons, "selected_unfenced_entry_pool_route") } if replacement.DestinationNodeID != fencedRoute.DestinationNodeID { decision.DecisionSource = "service_channel_feedback_exit_pool_replacement" decision.ScoreReasons = append(decision.ScoreReasons, "selected_unfenced_exit_pool_route") } if replacement.SourceNodeID != fencedRoute.SourceNodeID && replacement.DestinationNodeID != fencedRoute.DestinationNodeID { decision.DecisionSource = "service_channel_feedback_entry_exit_pool_replacement" decision.ScoreReasons = append(decision.ScoreReasons, "selected_unfenced_entry_exit_pool_route") } if decision.RebuildRequestID != "" { decision.RebuildStatus = "applied" decision.RebuildReason = "service_channel_feedback_rebuild_applied_to_alternate" decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_rebuild_applied") } if replacementFeedback.RouteID != "" && !replacementFeedback.Fenced { decision.PathScore += 10000 decision.ScoreReasons = append(decision.ScoreReasons, "active_healthy_feedback_dampening_window") decision.ScoreReasons = append(decision.ScoreReasons, replacementFeedback.Reasons...) 
} decision.ScoreReasons = dedupeStrings(decision.ScoreReasons) if replacement.ExpiresAt.Before(decision.ExpiresAt) { decision.ExpiresAt = replacement.ExpiresAt.UTC() } } decision.PreviousHopID, decision.NextHopID, decision.LocalRole = routePathLocalPosition(decision.EffectiveHops, input.NodeID, "", "") return decision } func applyServiceChannelFeedbackCorrelationToDecision(decision *RoutePathDecision, feedback fabricServiceChannelRouteFeedback) { if decision == nil || feedback.RouteID == "" { return } decision.FeedbackObservationID = feedback.ObservationID decision.FeedbackSource = feedback.Source if !feedback.ObservedAt.IsZero() { observedAt := feedback.ObservedAt.UTC() decision.FeedbackObservedAt = &observedAt } if !feedback.ExpiresAt.IsZero() { expiresAt := feedback.ExpiresAt.UTC() decision.FeedbackExpiresAt = &expiresAt } decision.FeedbackChannelID = feedback.ChannelID decision.FeedbackResourceID = feedback.ResourceID decision.FeedbackViolationStatus = feedback.ViolationStatus decision.FeedbackViolationReason = feedback.ViolationReason } func (s *Service) selectServiceChannelRouteReplacement(input GetNodeSyntheticMeshConfigInput, fencedRoute SyntheticMeshRouteConfig, intents []MeshRouteIntent, feedback map[string]fabricServiceChannelRouteFeedback) (SyntheticMeshRouteConfig, fabricServiceChannelRouteFeedback, bool) { var selected SyntheticMeshRouteConfig var selectedFeedback fabricServiceChannelRouteFeedback selectedScore := -1 scopes := fabricServiceChannelRouteIntentReplacementScopes(intents) for _, intent := range intents { route, _, _, _, _, ok := s.syntheticRouteFromIntent(input, intent, endpointPerspective{}) if !ok || route.RouteID == fencedRoute.RouteID { continue } if !fabricServiceChannelRoutesShareReplacementScope(fencedRoute, route, scopes) { continue } if !fabricChannelsIntersect(route.AllowedChannels, fencedRoute.AllowedChannels) { continue } if item, ok := feedback[route.RouteID]; ok && item.Fenced { continue } routeFeedback := 
feedback[route.RouteID] score := serviceChannelReplacementRouteScore(route) + intent.Priority if routeFeedback.RouteID != "" { score += 10000 } if route.DestinationNodeID != fencedRoute.DestinationNodeID { score -= 5 } if route.SourceNodeID != fencedRoute.SourceNodeID { score -= 10 } if score > selectedScore || (score == selectedScore && route.RouteID < selected.RouteID) { selected = route selectedFeedback = routeFeedback selectedScore = score } } return selected, selectedFeedback, selected.RouteID != "" } func serviceChannelReplacementRouteScore(route SyntheticMeshRouteConfig) int { score := 1000 - len(route.Hops)*10 if score < 1 { return 1 } return score } func routePathDecisionForRoute(route SyntheticMeshRouteConfig, localNodeID string, leases []PeerRendezvousLease, relayPolicy *rendezvousRelayPolicy, generation string, serviceFeedback fabricServiceChannelRouteFeedback) RoutePathDecision { decision := RoutePathDecision{ DecisionID: route.RouteID + "-path-" + localNodeID, RouteID: route.RouteID, ClusterID: route.ClusterID, LocalNodeID: localNodeID, SourceNodeID: route.SourceNodeID, DestinationNodeID: route.DestinationNodeID, OriginalHops: append([]string{}, route.Hops...), EffectiveHops: append([]string{}, route.Hops...), DecisionSource: "route_intent", Generation: generation, PathScore: 1000, ScoreReasons: []string{"route_intent_hops"}, ControlPlaneOnly: true, ProductionForwarding: false, ExpiresAt: route.ExpiresAt.UTC(), } if serviceFeedback.ManualRetry { decision.ScoreReasons = append(decision.ScoreReasons, "service_channel_route_retry_after_operator_expire") decision.ScoreReasons = append(decision.ScoreReasons, serviceFeedback.Reasons...) 
decision.ScoreReasons = dedupeStrings(decision.ScoreReasons) if serviceFeedback.RetryCooldownUntil != nil && serviceFeedback.RetryCooldownUntil.Before(decision.ExpiresAt) { decision.ExpiresAt = serviceFeedback.RetryCooldownUntil.UTC() } } var replacementLease PeerRendezvousLease var replacementDecision RendezvousRelayPolicyDecision replacementFound := false for _, lease := range leases { if !containsString(lease.RouteIDs, route.RouteID) { continue } relayDecision, ok := relayPolicy.replacementDecision(route.RouteID, lease.PeerNodeID, lease.RelayNodeID) if !ok && lease.Reason != "stale_relay_replacement" { continue } if !ok { relayDecision = RendezvousRelayPolicyDecision{ RouteID: route.RouteID, PeerNodeID: lease.PeerNodeID, SelectedRelayID: lease.RelayNodeID, SelectedEndpoint: lease.RelayEndpoint, Reason: "stale_relay_replacement", } } if !replacementFound || relayDecision.Score > replacementDecision.Score { replacementFound = true replacementLease = lease replacementDecision = relayDecision } } if replacementFound { decision.DecisionID = route.RouteID + "-path-" + localNodeID + "-via-" + replacementLease.RelayNodeID decision.EffectiveHops = effectiveRoutePathWithReplacement(route.Hops, replacementLease.PeerNodeID, replacementDecision.StaleRelayNodeID, replacementLease.RelayNodeID) decision.SelectedRelayID = replacementLease.RelayNodeID decision.SelectedRelayEndpoint = replacementLease.RelayEndpoint decision.StaleRelayNodeID = replacementDecision.StaleRelayNodeID decision.RendezvousPeerNodeID = replacementLease.PeerNodeID decision.RendezvousLeaseID = replacementLease.LeaseID decision.RendezvousLeaseReason = replacementLease.Reason decision.DecisionSource = "stale_relay_replacement" decision.PathScore = replacementDecision.Score if decision.PathScore == 0 { decision.PathScore = 1000 } decision.ScoreReasons = append([]string{}, replacementDecision.ScoreReasons...) 
if len(decision.ScoreReasons) == 0 { decision.ScoreReasons = []string{"relay_replacement_policy"} } } else if lease, ok := routePathRendezvousLeaseForRoute(route, leases); ok { decision.DecisionID = route.RouteID + "-path-" + localNodeID + "-via-" + lease.RelayNodeID decision.EffectiveHops = effectiveRoutePathWithReplacement(route.Hops, lease.PeerNodeID, "", lease.RelayNodeID) decision.SelectedRelayID = lease.RelayNodeID decision.SelectedRelayEndpoint = lease.RelayEndpoint decision.RendezvousPeerNodeID = lease.PeerNodeID decision.RendezvousLeaseID = lease.LeaseID decision.RendezvousLeaseReason = lease.Reason decision.DecisionSource = "rendezvous_relay_required" decision.PathScore = 900 if lease.Priority > 0 { decision.PathScore = 1000 - lease.Priority if decision.PathScore < 1 { decision.PathScore = 1 } } decision.ScoreReasons = []string{"rendezvous_relay_required", "passive_nat_reverse_path"} } decision.PreviousHopID, decision.NextHopID, decision.LocalRole = routePathLocalPosition(decision.EffectiveHops, localNodeID, decision.SelectedRelayID, decision.StaleRelayNodeID) return decision } func routePathRendezvousLeaseForRoute(route SyntheticMeshRouteConfig, leases []PeerRendezvousLease) (PeerRendezvousLease, bool) { var selected PeerRendezvousLease found := false for _, lease := range leases { if strings.TrimSpace(lease.RelayNodeID) == "" || strings.TrimSpace(lease.PeerNodeID) == "" || !containsString(lease.RouteIDs, route.RouteID) || !containsString(route.Hops, lease.PeerNodeID) { continue } if !found || rendezvousLeaseBetterForRoutePath(lease, selected) { selected = lease found = true } } return selected, found } func rendezvousLeaseBetterForRoutePath(candidate PeerRendezvousLease, current PeerRendezvousLease) bool { if candidate.Priority != current.Priority { if current.Priority <= 0 { return true } if candidate.Priority <= 0 { return false } return candidate.Priority < current.Priority } if candidate.RelayNodeID != current.RelayNodeID { return 
// routePathLocalPosition locates localNodeID in path and returns its previous
// hop, next hop and role.
//
// When the node is not on the path, both hops are empty and the role is
// "withdrawn_relay" if the node is the (non-empty) stale relay, otherwise
// "not_on_effective_path". When it is on the path, the first occurrence is
// used and the role is "entry" for the first position, "exit" for the last,
// "selected_relay" for an interior node equal to the (non-empty) selected
// relay, and "transit" otherwise.
func routePathLocalPosition(path []string, localNodeID string, selectedRelayID string, staleRelayNodeID string) (string, string, string) {
	position := -1
	for i := range path {
		if path[i] == localNodeID {
			position = i
			break
		}
	}

	if position == -1 {
		role := "not_on_effective_path"
		if staleRelayNodeID != "" && staleRelayNodeID == localNodeID {
			role = "withdrawn_relay"
		}
		return "", "", role
	}

	last := len(path) - 1
	var before, after string
	if position > 0 {
		before = path[position-1]
	}
	if position < last {
		after = path[position+1]
	}

	// Entry and exit take precedence over the relay role.
	if position == 0 {
		return before, after, "entry"
	}
	if position == last {
		return before, after, "exit"
	}
	if selectedRelayID != "" && selectedRelayID == localNodeID {
		return before, after, "selected_relay"
	}
	return before, after, "transit"
}
return "private" case "direct": return "public" default: return "unknown" } } func validatePeerRecoverySeeds(seeds []PeerRecoverySeed) error { if len(seeds) > maxScopedRecoverySeeds { return ErrInvalidPayload } seen := map[string]struct{}{} for _, seed := range seeds { key := strings.TrimSpace(seed.NodeID) + "\x00" + strings.TrimSpace(seed.Endpoint) if strings.TrimSpace(seed.NodeID) == "" || strings.TrimSpace(seed.Endpoint) == "" || !isPeerEndpointTransport(seed.Transport) || (seed.ConnectivityMode != "" && !isPeerEndpointConnectivityMode(seed.ConnectivityMode)) || (len(seed.Metadata) > 0 && !json.Valid(seed.Metadata)) { return ErrInvalidPayload } if _, duplicate := seen[key]; duplicate { return ErrInvalidPayload } seen[key] = struct{}{} } return nil } func validatePeerRendezvousLeases(leases []PeerRendezvousLease, routePath []string, now time.Time) error { if len(leases) > maxScopedRendezvousLeases { return ErrInvalidPayload } now = now.UTC() seen := map[string]struct{}{} for _, lease := range leases { peerNodeID := strings.TrimSpace(lease.PeerNodeID) relayNodeID := strings.TrimSpace(lease.RelayNodeID) relayEndpoint := strings.TrimSpace(lease.RelayEndpoint) transport := strings.TrimSpace(lease.Transport) if peerNodeID == "" || relayNodeID == "" || relayEndpoint == "" || peerNodeID == relayNodeID || !containsString(routePath, peerNodeID) || (transport != "" && !isPeerRendezvousTransport(transport)) || (!lease.ExpiresAt.IsZero() && !lease.ExpiresAt.After(now)) || (len(lease.Metadata) > 0 && !json.Valid(lease.Metadata)) { return ErrInvalidPayload } if strings.TrimSpace(lease.LeaseID) == "" { continue } if _, duplicate := seen[lease.LeaseID]; duplicate { return ErrInvalidPayload } seen[lease.LeaseID] = struct{}{} } return nil } func normalizeRendezvousLeases(leases []PeerRendezvousLease, route SyntheticMeshRouteConfig, now time.Time) []PeerRendezvousLease { out := make([]PeerRendezvousLease, 0, len(leases)) now = now.UTC() for _, lease := range leases { 
lease.PeerNodeID = strings.TrimSpace(lease.PeerNodeID) lease.RelayNodeID = strings.TrimSpace(lease.RelayNodeID) lease.RelayEndpoint = strings.TrimRight(strings.TrimSpace(lease.RelayEndpoint), "/") if lease.LeaseID == "" { lease.LeaseID = route.RouteID + "-rv-" + lease.PeerNodeID + "-via-" + lease.RelayNodeID } if lease.Transport == "" { lease.Transport = "relay_control" } if lease.ConnectivityMode == "" { lease.ConnectivityMode = "relay_required" } if lease.Priority <= 0 { lease.Priority = 100 } if len(lease.RouteIDs) == 0 { lease.RouteIDs = []string{route.RouteID} } else if !containsString(lease.RouteIDs, route.RouteID) { lease.RouteIDs = append(append([]string{}, lease.RouteIDs...), route.RouteID) } lease.AllowedChannels = controlPlaneAllowedChannels(firstNonEmptyStringSlice(lease.AllowedChannels, route.AllowedChannels)) if len(lease.AllowedChannels) == 0 { lease.AllowedChannels = []string{"fabric_control", "route_control"} } lease.ControlPlaneOnly = true if lease.IssuedAt.IsZero() { lease.IssuedAt = now } else { lease.IssuedAt = lease.IssuedAt.UTC() } if lease.ExpiresAt.IsZero() || (!route.ExpiresAt.IsZero() && lease.ExpiresAt.After(route.ExpiresAt)) { lease.ExpiresAt = route.ExpiresAt.UTC() } else { lease.ExpiresAt = lease.ExpiresAt.UTC() } if lease.Reason == "" { lease.Reason = "policy_rendezvous_lease" } if lease.Metadata == nil { lease.Metadata = json.RawMessage(`{}`) } if !lease.ExpiresAt.IsZero() && lease.ExpiresAt.After(now) { out = append(out, lease) } } return out } func scopedRendezvousLeases(leases []PeerRendezvousLease, route SyntheticMeshRouteConfig, localNodeID string, relayPolicy *rendezvousRelayPolicy, now time.Time) []PeerRendezvousLease { if !containsString(route.Hops, localNodeID) { return nil } normalized := normalizeRendezvousLeases(leases, route, now) out := make([]PeerRendezvousLease, 0, len(normalized)) for _, lease := range normalized { if feedback, stale := relayPolicy.staleForLease(route.RouteID, lease); stale { if 
rendezvousLeaseStaleFeedbackCanWithdraw(lease) || relayPolicy.hasReplacementForLease(route.RouteID, lease) { relayPolicy.recordWithdrawal(route, lease, feedback) continue } } if containsString(route.Hops, lease.PeerNodeID) { out = append(out, lease) } } return out } func (p *rendezvousRelayPolicy) hasReplacementForLease(routeID string, lease PeerRendezvousLease) bool { if p == nil { return false } for _, decision := range p.replacements { if decision.RouteID == routeID && decision.PeerNodeID == lease.PeerNodeID && decision.StaleRelayNodeID == lease.RelayNodeID && decision.SelectedRelayID != "" && decision.SelectedRelayID != lease.RelayNodeID { return true } } return false } func rendezvousLeaseStaleFeedbackCanWithdraw(lease PeerRendezvousLease) bool { reason := strings.ToLower(strings.TrimSpace(lease.Reason)) if strings.Contains(reason, "operator") || strings.Contains(reason, "manual") { return false } return true } func derivedRendezvousLeases(route SyntheticMeshRouteConfig, peers map[string]string, candidates map[string][]PeerEndpointCandidate, localNodeID string, localPerspective endpointPerspective, relayPolicy *rendezvousRelayPolicy, now time.Time) []PeerRendezvousLease { if !containsString(route.Hops, localNodeID) { return nil } out := []PeerRendezvousLease{} for peerNodeID, items := range candidates { peerNodeID = strings.TrimSpace(peerNodeID) if peerNodeID == "" || !containsString(route.Hops, peerNodeID) || !peerEndpointCandidatesRequireRendezvousForLocal(items, localPerspective) { continue } selection := selectRendezvousRelay(route, peerNodeID, localNodeID, peers, candidates, relayPolicy) if selection.RelayNodeID == "" || selection.Endpoint == "" { continue } _, replacement := relayPolicy.hasStalePeer(route.RouteID, peerNodeID) reason := rendezvousLeaseReason(items) if replacement { reason = "stale_relay_replacement" } lease := PeerRendezvousLease{ LeaseID: route.RouteID + "-rv-" + peerNodeID + "-via-" + selection.RelayNodeID, PeerNodeID: peerNodeID, 
// selectRendezvousRelay chooses the relay node through which peerNodeID should
// be reached for route, returning the zero rendezvousRelaySelection when no
// usable relay exists.
//
// Candidate relays are gathered in this order: the hop before the peer on the
// route, the hop after it, every hop of the route, and finally every
// non-blank key of candidates in sorted order. Blank IDs, the peer itself and
// repeats are skipped, as are relays that relayPolicy reports as stale for
// this route/peer and nodes for which relayControlEndpointForNode yields no
// endpoint. The survivors are scored with rendezvousRelayCandidateScore; the
// highest score wins and ties go to the lexicographically smaller relay ID.
func selectRendezvousRelay(route SyntheticMeshRouteConfig, peerNodeID string, localNodeID string, peers map[string]string, candidates map[string][]PeerEndpointCandidate, relayPolicy *rendezvousRelayPolicy) rendezvousRelaySelection {
	routePath := route.Hops
	// Position of the peer on the route; stays -1 when the peer is off-path.
	peerIndex := -1
	for index, nodeID := range routePath {
		if nodeID == peerNodeID {
			peerIndex = index
			break
		}
	}
	// Neighbours of the peer go first, then the rest of the route.
	preferred := []string{}
	if peerIndex > 0 {
		preferred = append(preferred, routePath[peerIndex-1])
	}
	if peerIndex >= 0 && peerIndex < len(routePath)-1 {
		preferred = append(preferred, routePath[peerIndex+1])
	}
	preferred = append(preferred, routePath...)
	// Map iteration order is random, so the remaining candidate IDs are
	// sorted before being appended.
	extraCandidates := make([]string, 0, len(candidates))
	for nodeID := range candidates {
		nodeID = strings.TrimSpace(nodeID)
		if nodeID != "" {
			extraCandidates = append(extraCandidates, nodeID)
		}
	}
	sort.Strings(extraCandidates)
	preferred = append(preferred, extraCandidates...)
	// NOTE(review): the final sort below orders by score and relay ID only, so
	// the preference order appears to affect just the order in which the
	// helpers are invoked — confirm rendezvousRelayCandidateScore is stateless.
	seen := map[string]struct{}{}
	relayCandidates := []rendezvousRelaySelection{}
	for _, relayNodeID := range preferred {
		relayNodeID = strings.TrimSpace(relayNodeID)
		if relayNodeID == "" || relayNodeID == peerNodeID {
			continue
		}
		if _, duplicate := seen[relayNodeID]; duplicate {
			continue
		}
		// Marked as seen before the stale/endpoint checks, so a rejected relay
		// is not re-evaluated when it reappears later in the list.
		seen[relayNodeID] = struct{}{}
		if _, stale := relayPolicy.relayStale(route.RouteID, peerNodeID, relayNodeID); stale {
			continue
		}
		endpoint, peerCertSHA256, endpointScore, endpointReasons := relayControlEndpointForNode(relayNodeID, peers, candidates)
		if endpoint == "" {
			continue
		}
		score, scoreReasons := rendezvousRelayCandidateScore(route.RouteID, routePath, peerIndex, relayNodeID, localNodeID, endpointScore, endpointReasons, relayPolicy)
		relayCandidates = append(relayCandidates, rendezvousRelaySelection{
			RelayNodeID:    relayNodeID,
			Endpoint:       endpoint,
			PeerCertSHA256: peerCertSHA256,
			Score:          score,
			Reasons:        scoreReasons,
		})
	}
	if len(relayCandidates) == 0 {
		return rendezvousRelaySelection{}
	}
	// Highest score first; equal scores fall back to ascending relay ID.
	sort.SliceStable(relayCandidates, func(i, j int) bool {
		if relayCandidates[i].Score != relayCandidates[j].Score {
			return relayCandidates[i].Score > relayCandidates[j].Score
		}
		return relayCandidates[i].RelayNodeID < relayCandidates[j].RelayNodeID
	})
	return relayCandidates[0]
}
sort.SliceStable(items, func(i, j int) bool { if items[i].Priority != items[j].Priority { return items[i].Priority < items[j].Priority } return items[i].EndpointID < items[j].EndpointID }) for _, candidate := range items { if endpointCandidateRequiresRendezvous(candidate) { continue } endpoint := strings.TrimRight(strings.TrimSpace(candidate.Address), "/") if isUsableFabricControlEndpoint(endpoint) { peerCertSHA256 := peerEndpointCandidateTLSCertSHA256(candidate) if peerCertSHA256 == "" { peerCertSHA256 = peerEndpointCandidateTLSCertSHA256ForEndpoint(items, endpoint) } score := 70 reasons := []string{"endpoint_candidate"} if candidate.Priority > 0 { score += maxInt(0, 50-candidate.Priority) } if hasPolicyTag(candidate.PolicyTags, "fast-path") { score += 25 reasons = append(reasons, "fast_path") } if hasPolicyTag(candidate.PolicyTags, "same-site") || hasPolicyTag(candidate.PolicyTags, "corp-lan") || hasPolicyTag(candidate.PolicyTags, "private-lan") { score += 20 reasons = append(reasons, "same_site") } if strings.EqualFold(candidate.ConnectivityMode, "direct") { score += 10 reasons = append(reasons, "direct") } if peerCertSHA256 != "" { score += 15 reasons = append(reasons, "pinned_relay_cert") } return endpoint, peerCertSHA256, score, reasons } } if endpoint := strings.TrimRight(strings.TrimSpace(peers[nodeID]), "/"); isUsableFabricControlEndpoint(endpoint) { return endpoint, "", 80, []string{"reported_peer_endpoint"} } return "", "", 0, nil } func coreMeshBootstrapRendezvousLeases(clusterID, localNodeID string, candidates map[string][]PeerEndpointCandidate, relayPolicy *rendezvousRelayPolicy, now time.Time) []PeerRendezvousLease { now = now.UTC() nodeIDs := make([]string, 0, len(candidates)) for nodeID := range candidates { nodeID = strings.TrimSpace(nodeID) if nodeID != "" { nodeIDs = append(nodeIDs, nodeID) } } sort.Strings(nodeIDs) out := []PeerRendezvousLease{} for _, peerNodeID := range nodeIDs { items := candidates[peerNodeID] if 
!peerEndpointCandidatesRequireRendezvous(items) { continue } selection := selectCoreMeshBootstrapRelay(peerNodeID, localNodeID, candidates, relayPolicy) if selection.RelayNodeID == "" || selection.Endpoint == "" { continue } lease := PeerRendezvousLease{ LeaseID: "core-mesh-bootstrap-rv-" + peerNodeID + "-via-" + selection.RelayNodeID, PeerNodeID: peerNodeID, RelayNodeID: selection.RelayNodeID, RelayEndpoint: selection.Endpoint, Transport: "relay_control", ConnectivityMode: "relay_required", RouteIDs: []string{"core-mesh-bootstrap"}, AllowedChannels: []string{"fabric_control", "route_control"}, Priority: rendezvousLeasePriority(items), ControlPlaneOnly: true, IssuedAt: now, ExpiresAt: now.Add(5 * time.Minute), Reason: "farm_mesh_bootstrap_relay", Metadata: coreMeshBootstrapRendezvousLeaseMetadata(clusterID, selection), } if lease.Priority <= 0 { lease.Priority = 90 } out = append(out, lease) } return out } func selectCoreMeshBootstrapRelay(peerNodeID, localNodeID string, candidates map[string][]PeerEndpointCandidate, relayPolicy *rendezvousRelayPolicy) rendezvousRelaySelection { relayNodeIDs := make([]string, 0, len(candidates)) for relayNodeID := range candidates { relayNodeID = strings.TrimSpace(relayNodeID) if relayNodeID != "" && relayNodeID != peerNodeID { relayNodeIDs = append(relayNodeIDs, relayNodeID) } } sort.Strings(relayNodeIDs) selections := []rendezvousRelaySelection{} for _, relayNodeID := range relayNodeIDs { endpoint, peerCertSHA256, endpointScore, endpointReasons := relayControlEndpointForNode(relayNodeID, nil, candidates) if endpoint == "" { continue } score := 500 + endpointScore reasons := append([]string{"farm_bootstrap_relay"}, endpointReasons...) if relayNodeID == localNodeID { score += 40 reasons = append(reasons, "local_entry_relay") } linkScore, linkReasons := rendezvousRelayLinkScore(relayNodeID, relayPolicy) score += linkScore reasons = append(reasons, linkReasons...) 
selections = append(selections, rendezvousRelaySelection{ RelayNodeID: relayNodeID, Endpoint: endpoint, PeerCertSHA256: peerCertSHA256, Score: score, Reasons: reasons, }) } if len(selections) == 0 { return rendezvousRelaySelection{} } sort.SliceStable(selections, func(i, j int) bool { if selections[i].Score != selections[j].Score { return selections[i].Score > selections[j].Score } return selections[i].RelayNodeID < selections[j].RelayNodeID }) return selections[0] } func coreMeshBootstrapRendezvousLeaseMetadata(clusterID string, selection rendezvousRelaySelection) json.RawMessage { payload := map[string]any{ "cluster_id": strings.TrimSpace(clusterID), "source": "farm_mesh_bootstrap", "selected_relay_score": selection.Score, "selected_relay_score_reasons": selection.Reasons, "service_workload_traffic": false, "production_forwarding": false, "control_service_relay": false, } if selection.PeerCertSHA256 != "" { payload["peer_cert_sha256"] = selection.PeerCertSHA256 payload["tls_cert_sha256"] = selection.PeerCertSHA256 } raw, err := json.Marshal(payload) if err != nil { return json.RawMessage(`{"source":"farm_mesh_bootstrap","control_service_relay":false}`) } return raw } func rendezvousRelayCandidateScore(routeID string, routePath []string, peerIndex int, relayNodeID string, localNodeID string, endpointScore int, endpointReasons []string, relayPolicy *rendezvousRelayPolicy) (int, []string) { score := 500 + endpointScore reasons := append([]string{}, endpointReasons...) 
relayIndex := -1 for index, nodeID := range routePath { if nodeID == relayNodeID { relayIndex = index break } } if peerIndex >= 0 && relayIndex >= 0 { distance := absInt(peerIndex - relayIndex) switch { case distance == 1: score += 180 reasons = append(reasons, "adjacent_to_peer") case distance == 2: score += 120 reasons = append(reasons, "near_peer") default: score += maxInt(0, 80-distance*10) reasons = append(reasons, "route_path_candidate") } } if relayIndex == 0 && len(routePath) > 2 { score -= 120 reasons = append(reasons, "entry_relay_fallback") } if relayNodeID == localNodeID { score += 40 reasons = append(reasons, "local_entry_relay") } linkScore, linkReasons := rendezvousRelayLinkScore(relayNodeID, relayPolicy) score += linkScore reasons = append(reasons, linkReasons...) routeHealthScore, routeHealthReasons := rendezvousRelayRouteHealthScore(routeID, relayNodeID, relayPolicy) score += routeHealthScore reasons = append(reasons, routeHealthReasons...) return score, reasons } func rendezvousRelayLinkScore(relayNodeID string, relayPolicy *rendezvousRelayPolicy) (int, []string) { if relayPolicy == nil || relayPolicy.localNodeID == "" { return 0, nil } var latest *MeshLinkObservation for i := range relayPolicy.links { link := &relayPolicy.links[i] if link.SourceNodeID != relayPolicy.localNodeID || link.TargetNodeID != relayNodeID { continue } if !link.ObservedAt.IsZero() && relayPolicy.now.Sub(link.ObservedAt.UTC()) > rendezvousRelayFeedbackMaxAge { continue } if latest == nil || link.ObservedAt.After(latest.ObservedAt) { latest = link } } if latest == nil { return 0, nil } switch latest.LinkStatus { case "reachable": score := 60 reasons := []string{"mesh_link_reachable"} if latest.QualityScore != nil { score += *latest.QualityScore reasons = append(reasons, "mesh_link_quality") } if latest.LatencyMs != nil { score += maxInt(0, 80-*latest.LatencyMs) reasons = append(reasons, "mesh_link_latency") } return score, reasons case "unreachable": return -250, 
[]string{"mesh_link_unreachable"} default: return 0, nil } } func rendezvousRelayRouteHealthScore(routeID string, relayNodeID string, relayPolicy *rendezvousRelayPolicy) (int, []string) { if relayPolicy == nil || relayPolicy.localNodeID == "" { return 0, nil } routeID = strings.TrimSpace(routeID) relayNodeID = strings.TrimSpace(relayNodeID) if routeID == "" || relayNodeID == "" { return 0, nil } var latest *MeshLinkObservation var latestMetadata meshRouteHealthObservationMetadata for i := range relayPolicy.links { link := &relayPolicy.links[i] if link.SourceNodeID != relayPolicy.localNodeID || !meshLinkObservationFresh(*link, relayPolicy.now) { continue } metadata, ok := routeHealthMetadataFromLink(*link) if !ok || metadata.ObservationType != "synthetic_route_health" || strings.TrimSpace(metadata.RouteID) != routeID || strings.TrimSpace(metadata.RoutePathDecisionSelectedRelayID) != relayNodeID || metadata.ProductionForwarding || metadata.ProductionPayloadForwarding || metadata.RouteHealthProductionPayloadForwarding || metadata.RouteHealthServicePayloadForwarding { continue } if latest == nil || link.ObservedAt.After(latest.ObservedAt) { latest = link latestMetadata = metadata } } if latest == nil { return 0, nil } if latestMetadata.RoutePathDriftDetected { return -360, []string{"route_health_drift"} } if latest.LinkStatus == "unreachable" || strings.TrimSpace(latestMetadata.FailureReason) != "" { return -320, []string{"route_health_unreachable"} } if latest.LinkStatus != "reachable" { return 0, nil } score := 90 reasons := []string{"route_health_reachable", "route_health_no_drift"} if latest.QualityScore != nil { score += *latest.QualityScore reasons = append(reasons, "route_health_quality") } if latest.LatencyMs != nil { score += maxInt(0, 100-*latest.LatencyMs) reasons = append(reasons, "route_health_latency") } return score, reasons } func rendezvousRelayLeaseMetadata(selection rendezvousRelaySelection, replacement bool) json.RawMessage { payload := 
map[string]any{ "source": "control-plane", "derived_from": "endpoint_candidate", "lease_refresh_contract": "node_scoped_synthetic_config_get", "relay_replacement_contract": "stale_relay_feedback_policy", "relay_selection_score": selection.Score, "relay_selection_score_reasons": selection.Reasons, "production_payload_forwarding": false, } if selection.PeerCertSHA256 != "" { payload["peer_cert_sha256"] = selection.PeerCertSHA256 payload["tls_cert_sha256"] = selection.PeerCertSHA256 } if replacement { payload["replacement_for_stale_relay"] = true } raw, err := json.Marshal(payload) if err != nil { return json.RawMessage(`{"source":"control-plane","derived_from":"endpoint_candidate","lease_refresh_contract":"node_scoped_synthetic_config_get","relay_replacement_contract":"stale_relay_feedback_policy","production_payload_forwarding":false}`) } return raw } func peerEndpointCandidateTLSCertSHA256(candidate PeerEndpointCandidate) string { var metadata struct { PeerCertSHA256 string `json:"peer_cert_sha256,omitempty"` TLSCertSHA256 string `json:"tls_cert_sha256,omitempty"` } if len(candidate.Metadata) == 0 || !json.Valid(candidate.Metadata) { return "" } if err := json.Unmarshal(candidate.Metadata, &metadata); err != nil { return "" } return firstNonEmptyString(metadata.PeerCertSHA256, metadata.TLSCertSHA256) } func peerEndpointCandidateTLSCertSHA256ForEndpoint(candidates []PeerEndpointCandidate, endpoint string) string { endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/") if endpoint == "" { return "" } for _, candidate := range candidates { if strings.TrimRight(strings.TrimSpace(candidate.Address), "/") != endpoint { continue } if certSHA256 := peerEndpointCandidateTLSCertSHA256(candidate); certSHA256 != "" { return certSHA256 } } return "" } func hasPolicyTag(tags []string, want string) bool { want = strings.ToLower(strings.TrimSpace(want)) for _, tag := range tags { if strings.ToLower(strings.TrimSpace(tag)) == want { return true } } return false } func 
maxInt(a int, b int) int { if a > b { return a } return b } func minInt(a int, b int) int { if a < b { return a } return b } func absInt(value int) int { if value < 0 { return -value } return value } func peerEndpointCandidatesRequireRendezvous(candidates []PeerEndpointCandidate) bool { return peerEndpointCandidatesRequireRendezvousForLocal(candidates, endpointPerspective{}) } func peerEndpointCandidatesRequireRendezvousForLocal(candidates []PeerEndpointCandidate, local endpointPerspective) bool { for _, candidate := range candidates { if endpointCandidateRequiresRendezvous(candidate) || endpointCandidatePrivateForLocalOffsite(local, candidate) { return true } } return false } func endpointCandidateRequiresRendezvous(candidate PeerEndpointCandidate) bool { transport := strings.ToLower(strings.TrimSpace(candidate.Transport)) reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability)) connectivityMode := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode)) return strings.Contains(transport, "relay") || strings.Contains(transport, "outbound") || reachability == "relay" || reachability == "outbound_only" || connectivityMode == "relay_required" || connectivityMode == "outbound_only" } func rendezvousLeasePriority(candidates []PeerEndpointCandidate) int { priority := 0 for _, candidate := range candidates { if !endpointCandidateRequiresRendezvous(candidate) { continue } if priority == 0 || (candidate.Priority > 0 && candidate.Priority < priority) { priority = candidate.Priority } } return priority } func rendezvousLeaseReason(candidates []PeerEndpointCandidate) string { for _, candidate := range candidates { connectivityMode := strings.ToLower(strings.TrimSpace(candidate.ConnectivityMode)) reachability := strings.ToLower(strings.TrimSpace(candidate.Reachability)) if connectivityMode == "outbound_only" || reachability == "outbound_only" { return "auto_outbound_only" } if connectivityMode == "relay_required" || reachability == "relay" { return 
"auto_relay_required" } } return "auto_rendezvous_required" } func mergeRendezvousLeases(out map[string]PeerRendezvousLease, leases []PeerRendezvousLease) { for _, lease := range leases { if lease.Metadata == nil { lease.Metadata = json.RawMessage(`{}`) } key := strings.TrimSpace(lease.LeaseID) if key == "" { key = lease.PeerNodeID + "\x00" + lease.RelayNodeID + "\x00" + lease.RelayEndpoint } existing, ok := out[key] if !ok || lease.Priority < existing.Priority || existing.ExpiresAt.Before(lease.ExpiresAt) { out[key] = lease } } } func sortedRendezvousLeases(items map[string]PeerRendezvousLease, limit int) []PeerRendezvousLease { out := make([]PeerRendezvousLease, 0, len(items)) for _, item := range items { out = append(out, item) } sort.SliceStable(out, func(i, j int) bool { if out[i].Priority != out[j].Priority { return out[i].Priority < out[j].Priority } if out[i].PeerNodeID != out[j].PeerNodeID { return out[i].PeerNodeID < out[j].PeerNodeID } if out[i].RelayNodeID != out[j].RelayNodeID { return out[i].RelayNodeID < out[j].RelayNodeID } return out[i].LeaseID < out[j].LeaseID }) if len(out) > limit { out = out[:limit] } return out } func markPeerDirectoryRendezvousLeases(directory map[string]*PeerDirectoryEntry, leases []PeerRendezvousLease, localNodeID string) { for _, lease := range leases { if lease.PeerNodeID != "" && lease.PeerNodeID != localNodeID { entry := peerDirectoryEntry(directory, lease.PeerNodeID) entry.CandidateCount++ if !containsString(entry.ConnectivityModes, "relay_required") { entry.ConnectivityModes = append(entry.ConnectivityModes, "relay_required") } } if lease.RelayNodeID != "" && lease.RelayNodeID != localNodeID { entry := peerDirectoryEntry(directory, lease.RelayNodeID) entry.EndpointCount++ if !containsString(entry.ConnectivityModes, "relay_control") { entry.ConnectivityModes = append(entry.ConnectivityModes, "relay_control") } } } } func mergePeerDirectoryRoute(directory map[string]*PeerDirectoryEntry, route SyntheticMeshRouteConfig, 
localNodeID string) { for _, nodeID := range route.Hops { nodeID = strings.TrimSpace(nodeID) if nodeID == "" || nodeID == localNodeID { continue } entry := peerDirectoryEntry(directory, nodeID) if !containsString(entry.RouteIDs, route.RouteID) { entry.RouteIDs = append(entry.RouteIDs, route.RouteID) } } } func mergePeerDirectoryCandidates(directory map[string]*PeerDirectoryEntry, nodeID string, candidates []PeerEndpointCandidate) { entry := peerDirectoryEntry(directory, nodeID) entry.CandidateCount += len(candidates) for _, candidate := range candidates { if strings.TrimSpace(candidate.ConnectivityMode) != "" && !containsString(entry.ConnectivityModes, candidate.ConnectivityMode) { entry.ConnectivityModes = append(entry.ConnectivityModes, candidate.ConnectivityMode) } } } func peerDirectoryEntry(directory map[string]*PeerDirectoryEntry, nodeID string) *PeerDirectoryEntry { if entry, ok := directory[nodeID]; ok { return entry } entry := &PeerDirectoryEntry{NodeID: nodeID} directory[nodeID] = entry return entry } func mergeRecoverySeeds(out map[string]PeerRecoverySeed, seeds []PeerRecoverySeed) { for _, seed := range seeds { if seed.Metadata == nil { seed.Metadata = json.RawMessage(`{}`) } key := seed.NodeID + "\x00" + seed.Endpoint existing, ok := out[key] if !ok || seed.Priority < existing.Priority { out[key] = seed } } } func sortedRecoverySeeds(items map[string]PeerRecoverySeed, limit int) []PeerRecoverySeed { out := make([]PeerRecoverySeed, 0, len(items)) for _, item := range items { out = append(out, item) } sort.SliceStable(out, func(i, j int) bool { if out[i].Priority != out[j].Priority { return out[i].Priority < out[j].Priority } if out[i].NodeID != out[j].NodeID { return out[i].NodeID < out[j].NodeID } return out[i].Endpoint < out[j].Endpoint }) if len(out) > limit { out = out[:limit] } return out } func markPeerDirectoryRecoverySeeds(directory map[string]*PeerDirectoryEntry, seeds []PeerRecoverySeed) { for _, seed := range seeds { entry := 
peerDirectoryEntry(directory, seed.NodeID) entry.RecoverySeed = true if strings.TrimSpace(seed.ConnectivityMode) != "" && !containsString(entry.ConnectivityModes, seed.ConnectivityMode) { entry.ConnectivityModes = append(entry.ConnectivityModes, seed.ConnectivityMode) } } } func sortedPeerDirectory(items map[string]*PeerDirectoryEntry) []PeerDirectoryEntry { out := make([]PeerDirectoryEntry, 0, len(items)) for _, entry := range items { sort.Strings(entry.RouteIDs) sort.Strings(entry.ConnectivityModes) if entry.NodeID != "" { out = append(out, *entry) } } sort.SliceStable(out, func(i, j int) bool { return out[i].NodeID < out[j].NodeID }) return out } func validatePeerEndpointCandidates(candidates map[string][]PeerEndpointCandidate, routePath []string) error { if len(candidates) == 0 { return nil } for nodeID, items := range candidates { if strings.TrimSpace(nodeID) == "" || !containsString(routePath, nodeID) { return ErrInvalidPayload } for _, candidate := range items { if strings.TrimSpace(candidate.EndpointID) == "" || strings.TrimSpace(candidate.NodeID) == "" || candidate.NodeID != nodeID || strings.TrimSpace(candidate.Address) == "" || !isPeerEndpointTransport(candidate.Transport) || !isPeerEndpointReachability(candidate.Reachability) || !isPeerEndpointConnectivityMode(candidate.ConnectivityMode) || (candidate.NATType != "" && !isPeerEndpointNATType(candidate.NATType)) { return ErrInvalidPayload } if len(candidate.Metadata) > 0 && !json.Valid(candidate.Metadata) { return ErrInvalidPayload } } } return nil } func scopedPeerEndpoints(peers map[string]string, routePath []string) map[string]string { out := map[string]string{} for nodeID, endpoint := range peers { endpoint = strings.TrimSpace(endpoint) if containsString(routePath, nodeID) && endpoint != "" && !isUnusableLocalPeerEndpoint(endpoint) { out[nodeID] = endpoint } } return out } func scopedPeerEndpointCandidates(candidates map[string][]PeerEndpointCandidate, routePath []string) 
map[string][]PeerEndpointCandidate { out := map[string][]PeerEndpointCandidate{} for nodeID, items := range candidates { if !containsString(routePath, nodeID) { continue } for _, candidate := range items { if isUnusableLocalPeerEndpoint(candidate.Address) { continue } if candidate.Metadata == nil { candidate.Metadata = json.RawMessage(`{}`) } out[nodeID] = append(out[nodeID], candidate) } } return out } func isPeerEndpointTransport(value string) bool { switch value { case "direct_quic", "relay_quic", "direct_http", "direct_tcp_tls", "wss", "relay", "outbound_reverse": return true default: return false } } func isPeerRendezvousTransport(value string) bool { switch value { case "relay_control", "relay_quic", "relay", "direct_quic", "wss", "direct_tcp_tls": return true default: return false } } func isPeerEndpointReachability(value string) bool { switch value { case "public", "private", "relay", "outbound_only", "unknown": return true default: return false } } func isPeerEndpointConnectivityMode(value string) bool { switch value { case "direct", "private_lan", "relay_required", "outbound_only", "unknown": return true default: return false } } func isPeerEndpointNATType(value string) bool { switch value { case "unknown", "none", "full_cone", "restricted", "port_restricted", "symmetric", "blocked": return true default: return false } } func appendUniqueStrings(values []string, additions ...string) []string { seen := make(map[string]struct{}, len(values)+len(additions)) out := make([]string, 0, len(values)+len(additions)) for _, value := range values { value = strings.TrimSpace(value) if value == "" { continue } if _, ok := seen[value]; ok { continue } seen[value] = struct{}{} out = append(out, value) } for _, value := range additions { value = strings.TrimSpace(value) if value == "" { continue } if _, ok := seen[value]; ok { continue } seen[value] = struct{}{} out = append(out, value) } return out } func controlPlaneAllowedChannels(channels []string) []string { out := 
[]string{} for _, channel := range channels { channel = strings.TrimSpace(channel) switch channel { case "fabric_control", "route_control": if !containsString(out, channel) { out = append(out, channel) } } } return out } func isHTTPControlEndpoint(endpoint string) bool { endpoint = strings.ToLower(strings.TrimSpace(endpoint)) return strings.HasPrefix(endpoint, "http://") || strings.HasPrefix(endpoint, "https://") } func isQUICControlEndpoint(endpoint string) bool { return strings.HasPrefix(strings.ToLower(strings.TrimSpace(endpoint)), "quic://") } func isUsableFabricControlEndpoint(endpoint string) bool { return (isQUICControlEndpoint(endpoint) || isHTTPControlEndpoint(endpoint)) && !isUnusableLocalPeerEndpoint(endpoint) } func isUsableHTTPControlEndpoint(endpoint string) bool { return isHTTPControlEndpoint(endpoint) && !isUnusableLocalPeerEndpoint(endpoint) } func isUnusableLocalPeerEndpoint(endpoint string) bool { host := peerEndpointHost(endpoint) if host == "" { return false } if strings.EqualFold(host, "localhost") { return true } ip := net.ParseIP(host) return ip != nil && (ip.IsLoopback() || ip.IsUnspecified()) } func peerEndpointHost(endpoint string) string { endpoint = strings.TrimRight(strings.TrimSpace(endpoint), "/") if endpoint == "" { return "" } if host, _, err := net.SplitHostPort(endpoint); err == nil { return strings.Trim(host, "[]") } if parsed, err := url.Parse(endpoint); err == nil && parsed.Host != "" { if host, _, err := net.SplitHostPort(parsed.Host); err == nil { return strings.Trim(host, "[]") } return strings.Trim(parsed.Host, "[]") } return strings.Trim(endpoint, "[]") } func firstNodeID(selector nodeSelector) string { if strings.TrimSpace(selector.NodeID) != "" { return strings.TrimSpace(selector.NodeID) } for _, nodeID := range selector.NodeIDs { if strings.TrimSpace(nodeID) != "" { return strings.TrimSpace(nodeID) } } return "" } func cleanRouteNodePath(values []string) []string { out := make([]string, 0, len(values)) for _, value := 
range values { value = strings.TrimSpace(value) if value != "" { out = append(out, value) } } return out } func containsString(values []string, needle string) bool { needle = strings.TrimSpace(needle) if needle == "" { return false } for _, value := range values { if strings.TrimSpace(value) == needle { return true } } return false } func appendMissingString(values []string, value string) []string { if containsString(values, value) { return values } return append(values, value) } func generateFencingToken() (string, error) { buf := make([]byte, 32) if _, err := rand.Read(buf); err != nil { return "", err } return "rap_vpn_fence_" + hex.EncodeToString(buf), nil }